diff options
Diffstat (limited to 'arch/x86/kernel')
78 files changed, 31659 insertions, 0 deletions
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore new file mode 100644 index 000000000000..40836ad9079c --- /dev/null +++ b/arch/x86/kernel/.gitignore | |||
@@ -0,0 +1 @@ | |||
vsyscall.lds | |||
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile new file mode 100644 index 000000000000..577d08f4b8bb --- /dev/null +++ b/arch/x86/kernel/Makefile | |||
@@ -0,0 +1,5 @@ | |||
# Dispatch to the per-word-size kernel Makefile: everything that is not a
# 32-bit build uses the 64-bit Makefile, which still lives under arch/x86_64.
ifneq ($(CONFIG_X86_32),y)
include ${srctree}/arch/x86_64/kernel/Makefile_64
else
include ${srctree}/arch/x86/kernel/Makefile_32
endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 new file mode 100644 index 000000000000..5096f486d389 --- /dev/null +++ b/arch/x86/kernel/Makefile_32 | |||
@@ -0,0 +1,88 @@ | |||
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
# extra-y: files that are built here; by kbuild convention they are not
# folded into this directory's built-in.o.
5 | extra-y := head_32.o init_task_32.o vmlinux.lds | ||
6 | |||
# Objects listed unconditionally under obj-y.
7 | obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \ | ||
8 | ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \ | ||
9 | pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\ | ||
10 | quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o | ||
11 | |||
# From here on most objects are keyed on a Kconfig symbol; the cpu/ and acpi/
# entries are subdirectories reached through a ../../x86/kernel path.
12 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | ||
13 | obj-y += ../../x86/kernel/cpu/ | ||
14 | obj-y += ../../x86/kernel/acpi/ | ||
15 | obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o | ||
16 | obj-$(CONFIG_MCA) += mca_32.o | ||
17 | obj-$(CONFIG_X86_MSR) += msr.o | ||
18 | obj-$(CONFIG_X86_CPUID) += cpuid.o | ||
19 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
20 | obj-$(CONFIG_APM) += apm_32.o | ||
21 | obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o | ||
22 | obj-$(CONFIG_SMP) += smpcommon_32.o | ||
23 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o | ||
24 | obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o | ||
25 | obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o | ||
26 | obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o | ||
27 | obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o | ||
28 | obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash_32.o | ||
29 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o | ||
30 | obj-$(CONFIG_X86_NUMAQ) += numaq_32.o | ||
31 | obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o | ||
32 | obj-$(CONFIG_KPROBES) += kprobes_32.o | ||
33 | obj-$(CONFIG_MODULES) += module_32.o | ||
34 | obj-y += sysenter_32.o vsyscall_32.o | ||
35 | obj-$(CONFIG_ACPI_SRAT) += srat_32.o | ||
36 | obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o | ||
37 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o | ||
38 | obj-$(CONFIG_VM86) += vm86_32.o | ||
39 | obj-$(CONFIG_EARLY_PRINTK) += early_printk.o | ||
40 | obj-$(CONFIG_HPET_TIMER) += hpet_32.o | ||
41 | obj-$(CONFIG_K8_NB) += k8.o | ||
42 | obj-$(CONFIG_MGEODE_LX) += geode_32.o | ||
43 | |||
44 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o | ||
45 | obj-$(CONFIG_PARAVIRT) += paravirt_32.o | ||
46 | obj-y += pcspeaker.o | ||
47 | |||
48 | obj-$(CONFIG_SCx200) += scx200_32.o | ||
49 | |||
# vsyscall_32.o contains the vsyscall DSO images as __initdata.
# We must build both images before we can assemble it.
# Note: kbuild does not track this dependency due to usage of .incbin
$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so

# Register the generated files under the names they are really built as
# (with the _32 suffix). if_changed only finds the saved command line of a
# file that is listed in targets; with the unsuffixed names the DSO images
# were considered out of date, and relinked, on every build.
targets += $(foreach F,int80 sysenter,vsyscall-$(F)_32.o vsyscall-$(F)_32.so)
targets += vsyscall-note_32.o vsyscall_32.lds

# The DSO images are built using a special linker script.
quiet_cmd_syscall = SYSCALL $@
      cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
		          -Wl,-T,$(filter-out FORCE,$^) -o $@

export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)

# Both DSO flavours are linked with the same flags; SYSCFLAGS_<file> is
# looked up by cmd_syscall through $(@F).
vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
		 $(call ld-option, -Wl$(comma)--hash-style=sysv)
SYSCFLAGS_vsyscall-sysenter_32.so	= $(vsyscall-flags)
SYSCFLAGS_vsyscall-int80_32.so	= $(vsyscall-flags)

# Static pattern rule: the stem (int80_32 / sysenter_32) selects the object.
$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
		      $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
	$(call if_changed,syscall)

# We also create a special relocatable object that should mirror the symbol
# table and layout of the linked DSO.  With ld -R we can then refer to
# these symbols in the kernel code rather than hand-coded addresses.
extra-y += vsyscall-syms.o
$(obj)/built-in.o: $(obj)/vsyscall-syms.o
$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o

SYSCFLAGS_vsyscall-syms.o = -r
$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
			$(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
	$(call if_changed,syscall)

# k8.o and stacktrace.o are composite objects whose only member is the
# implementation under arch/x86_64.
k8-y                      += ../../x86_64/kernel/k8.o
stacktrace-y		  += ../../x86_64/kernel/stacktrace.o
88 | |||
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c new file mode 100644 index 000000000000..bd72d94e713e --- /dev/null +++ b/arch/x86/kernel/alternative.c | |||
@@ -0,0 +1,450 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/spinlock.h> | ||
4 | #include <linux/list.h> | ||
5 | #include <linux/kprobes.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/vmalloc.h> | ||
8 | #include <asm/alternative.h> | ||
9 | #include <asm/sections.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/mce.h> | ||
12 | #include <asm/nmi.h> | ||
13 | |||
/* Size of the on-stack scratch buffers used while patching. */
#define MAX_PATCH_LEN (255 - 1)

#ifndef CONFIG_HOTPLUG_CPU
/* Without CPU hotplug the SMP locks are only ever patched once, at boot. */
#define smp_alt_once 1
#else
/*
 * Non-zero means "patch the SMP lock prefixes once at boot and never again".
 * Requested with the "smp-alt-boot" command line option.
 */
static int smp_alt_once;

/* Handler for "smp-alt-boot"; returns 1 to mark the option as consumed. */
static int __init bootonly(char *str)
{
	smp_alt_once = 1;

	return 1;
}
__setup("smp-alt-boot", bootonly);
#endif
28 | |||
29 | static int debug_alternative; | ||
30 | |||
31 | static int __init debug_alt(char *str) | ||
32 | { | ||
33 | debug_alternative = 1; | ||
34 | return 1; | ||
35 | } | ||
36 | __setup("debug-alternative", debug_alt); | ||
37 | |||
38 | static int noreplace_smp; | ||
39 | |||
40 | static int __init setup_noreplace_smp(char *str) | ||
41 | { | ||
42 | noreplace_smp = 1; | ||
43 | return 1; | ||
44 | } | ||
45 | __setup("noreplace-smp", setup_noreplace_smp); | ||
46 | |||
#ifdef CONFIG_PARAVIRT
/*
 * When set (by the "noreplace-paravirt" command line option)
 * apply_paravirt() leaves every patch site untouched.
 */
static int noreplace_paravirt;

/* Handler for "noreplace-paravirt"; returns 1 to mark the option as consumed. */
static int __init setup_noreplace_paravirt(char *str)
{
	noreplace_paravirt = 1;

	return 1;
}
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
57 | |||
/*
 * Debug printk that only fires when "debug-alternative" was given on the
 * command line.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement: a bare "if" would capture the "else" of a caller written as
 *	if (x) DPRINTK(...); else ...
 * The ## before args drops the trailing comma when only a format is passed.
 */
#define DPRINTK(fmt, args...)					\
	do {							\
		if (debug_alternative)				\
			printk(KERN_DEBUG fmt, ##args);		\
	} while (0)
60 | |||
#ifdef GENERIC_NOP1
/*
 * The NOP encodings are only available as inline-assembly strings in the
 * include files, so they are emitted back to back into .data from assembly
 * and addressed from C through the table below.  Slot n holds the start of
 * the n-th sequence in that blob (the offsets are the running sums
 * 1, 1+2, 1+2+3, ...); slot 0 is unused.
 */
asm("\t.data\nintelnops: "
	GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4
	GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8);
extern unsigned char intelnops[];
static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
	[0] = NULL,
	[1] = intelnops,
	[2] = intelnops + 1,
	[3] = intelnops + 3,
	[4] = intelnops + 6,
	[5] = intelnops + 10,
	[6] = intelnops + 15,
	[7] = intelnops + 21,
	[8] = intelnops + 28,
};
#endif
81 | |||
#ifdef K8_NOP1
/*
 * K8 flavour of the NOP blob: K8_NOP1..K8_NOP8 emitted back to back into
 * .data, with slot n of the table holding the start of the n-th sequence
 * (offsets are the running sums 1, 1+2, 1+2+3, ...); slot 0 is unused.
 */
asm("\t.data\nk8nops: "
	K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4
	K8_NOP5 K8_NOP6 K8_NOP7 K8_NOP8);
extern unsigned char k8nops[];
static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
	[0] = NULL,
	[1] = k8nops,
	[2] = k8nops + 1,
	[3] = k8nops + 3,
	[4] = k8nops + 6,
	[5] = k8nops + 10,
	[6] = k8nops + 15,
	[7] = k8nops + 21,
	[8] = k8nops + 28,
};
#endif
99 | |||
#ifdef K7_NOP1
/*
 * K7 flavour of the NOP blob: K7_NOP1..K7_NOP8 emitted back to back into
 * .data, with slot n of the table holding the start of the n-th sequence
 * (offsets are the running sums 1, 1+2, 1+2+3, ...); slot 0 is unused.
 */
asm("\t.data\nk7nops: "
	K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4
	K7_NOP5 K7_NOP6 K7_NOP7 K7_NOP8);
extern unsigned char k7nops[];
static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
	[0] = NULL,
	[1] = k7nops,
	[2] = k7nops + 1,
	[3] = k7nops + 3,
	[4] = k7nops + 6,
	[5] = k7nops + 10,
	[6] = k7nops + 15,
	[7] = k7nops + 21,
	[8] = k7nops + 28,
};
#endif
117 | |||
118 | #ifdef CONFIG_X86_64 | ||
119 | |||
120 | extern char __vsyscall_0; | ||
121 | static inline unsigned char** find_nop_table(void) | ||
122 | { | ||
123 | return k8_nops; | ||
124 | } | ||
125 | |||
126 | #else /* CONFIG_X86_64 */ | ||
127 | |||
128 | static struct nop { | ||
129 | int cpuid; | ||
130 | unsigned char **noptable; | ||
131 | } noptypes[] = { | ||
132 | { X86_FEATURE_K8, k8_nops }, | ||
133 | { X86_FEATURE_K7, k7_nops }, | ||
134 | { -1, NULL } | ||
135 | }; | ||
136 | |||
137 | static unsigned char** find_nop_table(void) | ||
138 | { | ||
139 | unsigned char **noptable = intel_nops; | ||
140 | int i; | ||
141 | |||
142 | for (i = 0; noptypes[i].cpuid >= 0; i++) { | ||
143 | if (boot_cpu_has(noptypes[i].cpuid)) { | ||
144 | noptable = noptypes[i].noptable; | ||
145 | break; | ||
146 | } | ||
147 | } | ||
148 | return noptable; | ||
149 | } | ||
150 | |||
151 | #endif /* CONFIG_X86_64 */ | ||
152 | |||
153 | /* Use this to add nops to a buffer, then text_poke the whole buffer. */ | ||
154 | static void add_nops(void *insns, unsigned int len) | ||
155 | { | ||
156 | unsigned char **noptable = find_nop_table(); | ||
157 | |||
158 | while (len > 0) { | ||
159 | unsigned int noplen = len; | ||
160 | if (noplen > ASM_NOP_MAX) | ||
161 | noplen = ASM_NOP_MAX; | ||
162 | memcpy(insns, noptable[noplen], noplen); | ||
163 | insns += noplen; | ||
164 | len -= noplen; | ||
165 | } | ||
166 | } | ||
167 | |||
168 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | ||
169 | extern u8 *__smp_locks[], *__smp_locks_end[]; | ||
170 | |||
171 | /* Replace instructions with better alternatives for this CPU type. | ||
172 | This runs before SMP is initialized to avoid SMP problems with | ||
173 | self modifying code. This implies that assymetric systems where | ||
174 | APs have less capabilities than the boot processor are not handled. | ||
175 | Tough. Make sure you disable such features by hand. */ | ||
176 | |||
177 | void apply_alternatives(struct alt_instr *start, struct alt_instr *end) | ||
178 | { | ||
179 | struct alt_instr *a; | ||
180 | char insnbuf[MAX_PATCH_LEN]; | ||
181 | |||
182 | DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); | ||
183 | for (a = start; a < end; a++) { | ||
184 | u8 *instr = a->instr; | ||
185 | BUG_ON(a->replacementlen > a->instrlen); | ||
186 | BUG_ON(a->instrlen > sizeof(insnbuf)); | ||
187 | if (!boot_cpu_has(a->cpuid)) | ||
188 | continue; | ||
189 | #ifdef CONFIG_X86_64 | ||
190 | /* vsyscall code is not mapped yet. resolve it manually. */ | ||
191 | if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { | ||
192 | instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); | ||
193 | DPRINTK("%s: vsyscall fixup: %p => %p\n", | ||
194 | __FUNCTION__, a->instr, instr); | ||
195 | } | ||
196 | #endif | ||
197 | memcpy(insnbuf, a->replacement, a->replacementlen); | ||
198 | add_nops(insnbuf + a->replacementlen, | ||
199 | a->instrlen - a->replacementlen); | ||
200 | text_poke(instr, insnbuf, a->instrlen); | ||
201 | } | ||
202 | } | ||
203 | |||
204 | #ifdef CONFIG_SMP | ||
205 | |||
206 | static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) | ||
207 | { | ||
208 | u8 **ptr; | ||
209 | |||
210 | for (ptr = start; ptr < end; ptr++) { | ||
211 | if (*ptr < text) | ||
212 | continue; | ||
213 | if (*ptr > text_end) | ||
214 | continue; | ||
215 | text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ | ||
216 | }; | ||
217 | } | ||
218 | |||
219 | static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) | ||
220 | { | ||
221 | u8 **ptr; | ||
222 | char insn[1]; | ||
223 | |||
224 | if (noreplace_smp) | ||
225 | return; | ||
226 | |||
227 | add_nops(insn, 1); | ||
228 | for (ptr = start; ptr < end; ptr++) { | ||
229 | if (*ptr < text) | ||
230 | continue; | ||
231 | if (*ptr > text_end) | ||
232 | continue; | ||
233 | text_poke(*ptr, insn, 1); | ||
234 | }; | ||
235 | } | ||
236 | |||
237 | struct smp_alt_module { | ||
238 | /* what is this ??? */ | ||
239 | struct module *mod; | ||
240 | char *name; | ||
241 | |||
242 | /* ptrs to lock prefixes */ | ||
243 | u8 **locks; | ||
244 | u8 **locks_end; | ||
245 | |||
246 | /* .text segment, needed to avoid patching init code ;) */ | ||
247 | u8 *text; | ||
248 | u8 *text_end; | ||
249 | |||
250 | struct list_head next; | ||
251 | }; | ||
252 | static LIST_HEAD(smp_alt_modules); | ||
253 | static DEFINE_SPINLOCK(smp_alt); | ||
254 | |||
255 | void alternatives_smp_module_add(struct module *mod, char *name, | ||
256 | void *locks, void *locks_end, | ||
257 | void *text, void *text_end) | ||
258 | { | ||
259 | struct smp_alt_module *smp; | ||
260 | unsigned long flags; | ||
261 | |||
262 | if (noreplace_smp) | ||
263 | return; | ||
264 | |||
265 | if (smp_alt_once) { | ||
266 | if (boot_cpu_has(X86_FEATURE_UP)) | ||
267 | alternatives_smp_unlock(locks, locks_end, | ||
268 | text, text_end); | ||
269 | return; | ||
270 | } | ||
271 | |||
272 | smp = kzalloc(sizeof(*smp), GFP_KERNEL); | ||
273 | if (NULL == smp) | ||
274 | return; /* we'll run the (safe but slow) SMP code then ... */ | ||
275 | |||
276 | smp->mod = mod; | ||
277 | smp->name = name; | ||
278 | smp->locks = locks; | ||
279 | smp->locks_end = locks_end; | ||
280 | smp->text = text; | ||
281 | smp->text_end = text_end; | ||
282 | DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", | ||
283 | __FUNCTION__, smp->locks, smp->locks_end, | ||
284 | smp->text, smp->text_end, smp->name); | ||
285 | |||
286 | spin_lock_irqsave(&smp_alt, flags); | ||
287 | list_add_tail(&smp->next, &smp_alt_modules); | ||
288 | if (boot_cpu_has(X86_FEATURE_UP)) | ||
289 | alternatives_smp_unlock(smp->locks, smp->locks_end, | ||
290 | smp->text, smp->text_end); | ||
291 | spin_unlock_irqrestore(&smp_alt, flags); | ||
292 | } | ||
293 | |||
294 | void alternatives_smp_module_del(struct module *mod) | ||
295 | { | ||
296 | struct smp_alt_module *item; | ||
297 | unsigned long flags; | ||
298 | |||
299 | if (smp_alt_once || noreplace_smp) | ||
300 | return; | ||
301 | |||
302 | spin_lock_irqsave(&smp_alt, flags); | ||
303 | list_for_each_entry(item, &smp_alt_modules, next) { | ||
304 | if (mod != item->mod) | ||
305 | continue; | ||
306 | list_del(&item->next); | ||
307 | spin_unlock_irqrestore(&smp_alt, flags); | ||
308 | DPRINTK("%s: %s\n", __FUNCTION__, item->name); | ||
309 | kfree(item); | ||
310 | return; | ||
311 | } | ||
312 | spin_unlock_irqrestore(&smp_alt, flags); | ||
313 | } | ||
314 | |||
315 | void alternatives_smp_switch(int smp) | ||
316 | { | ||
317 | struct smp_alt_module *mod; | ||
318 | unsigned long flags; | ||
319 | |||
320 | #ifdef CONFIG_LOCKDEP | ||
321 | /* | ||
322 | * A not yet fixed binutils section handling bug prevents | ||
323 | * alternatives-replacement from working reliably, so turn | ||
324 | * it off: | ||
325 | */ | ||
326 | printk("lockdep: not fixing up alternatives.\n"); | ||
327 | return; | ||
328 | #endif | ||
329 | |||
330 | if (noreplace_smp || smp_alt_once) | ||
331 | return; | ||
332 | BUG_ON(!smp && (num_online_cpus() > 1)); | ||
333 | |||
334 | spin_lock_irqsave(&smp_alt, flags); | ||
335 | if (smp) { | ||
336 | printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); | ||
337 | clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); | ||
338 | clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); | ||
339 | list_for_each_entry(mod, &smp_alt_modules, next) | ||
340 | alternatives_smp_lock(mod->locks, mod->locks_end, | ||
341 | mod->text, mod->text_end); | ||
342 | } else { | ||
343 | printk(KERN_INFO "SMP alternatives: switching to UP code\n"); | ||
344 | set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); | ||
345 | set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); | ||
346 | list_for_each_entry(mod, &smp_alt_modules, next) | ||
347 | alternatives_smp_unlock(mod->locks, mod->locks_end, | ||
348 | mod->text, mod->text_end); | ||
349 | } | ||
350 | spin_unlock_irqrestore(&smp_alt, flags); | ||
351 | } | ||
352 | |||
353 | #endif | ||
354 | |||
#ifdef CONFIG_PARAVIRT
/*
 * Let paravirt_ops.patch() rewrite every patch site in [start, end).
 *
 * Each site is copied into a scratch buffer, handed to the patch hook
 * (which reports how many bytes it used), padded with NOPs up to the
 * site's length and written back.  Skipped under "noreplace-paravirt".
 */
void apply_paravirt(struct paravirt_patch_site *start,
		    struct paravirt_patch_site *end)
{
	char insnbuf[MAX_PATCH_LEN];
	struct paravirt_patch_site *site = start;

	if (noreplace_paravirt)
		return;

	while (site < end) {
		unsigned int used;

		BUG_ON(site->len > MAX_PATCH_LEN);

		/* prep the buffer with the original instructions */
		memcpy(insnbuf, site->instr, site->len);
		used = paravirt_ops.patch(site->instrtype, site->clobbers,
					  insnbuf, (unsigned long)site->instr,
					  site->len);
		BUG_ON(used > site->len);

		/* Pad the rest with nops */
		add_nops(insnbuf + used, site->len - used);
		text_poke(site->instr, insnbuf, site->len);

		site++;
	}
}

/*
 * NOTE(review): nothing visible in this file references these two symbols;
 * alternative_instructions() passes __parainstructions and
 * __parainstructions_end instead -- confirm whether they are still needed.
 */
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif	/* CONFIG_PARAVIRT */
384 | |||
385 | void __init alternative_instructions(void) | ||
386 | { | ||
387 | unsigned long flags; | ||
388 | |||
389 | /* The patching is not fully atomic, so try to avoid local interruptions | ||
390 | that might execute the to be patched code. | ||
391 | Other CPUs are not running. */ | ||
392 | stop_nmi(); | ||
393 | #ifdef CONFIG_X86_MCE | ||
394 | stop_mce(); | ||
395 | #endif | ||
396 | |||
397 | local_irq_save(flags); | ||
398 | apply_alternatives(__alt_instructions, __alt_instructions_end); | ||
399 | |||
400 | /* switch to patch-once-at-boottime-only mode and free the | ||
401 | * tables in case we know the number of CPUs will never ever | ||
402 | * change */ | ||
403 | #ifdef CONFIG_HOTPLUG_CPU | ||
404 | if (num_possible_cpus() < 2) | ||
405 | smp_alt_once = 1; | ||
406 | #endif | ||
407 | |||
408 | #ifdef CONFIG_SMP | ||
409 | if (smp_alt_once) { | ||
410 | if (1 == num_possible_cpus()) { | ||
411 | printk(KERN_INFO "SMP alternatives: switching to UP code\n"); | ||
412 | set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); | ||
413 | set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); | ||
414 | alternatives_smp_unlock(__smp_locks, __smp_locks_end, | ||
415 | _text, _etext); | ||
416 | } | ||
417 | free_init_pages("SMP alternatives", | ||
418 | (unsigned long)__smp_locks, | ||
419 | (unsigned long)__smp_locks_end); | ||
420 | } else { | ||
421 | alternatives_smp_module_add(NULL, "core kernel", | ||
422 | __smp_locks, __smp_locks_end, | ||
423 | _text, _etext); | ||
424 | alternatives_smp_switch(0); | ||
425 | } | ||
426 | #endif | ||
427 | apply_paravirt(__parainstructions, __parainstructions_end); | ||
428 | local_irq_restore(flags); | ||
429 | |||
430 | restart_nmi(); | ||
431 | #ifdef CONFIG_X86_MCE | ||
432 | restart_mce(); | ||
433 | #endif | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * Warning: | ||
438 | * When you use this code to patch more than one byte of an instruction | ||
439 | * you need to make sure that other CPUs cannot execute this code in parallel. | ||
440 | * Also no thread must be currently preempted in the middle of these instructions. | ||
441 | * And on the local CPU you need to be protected again NMI or MCE handlers | ||
442 | * seeing an inconsistent instruction while you patch. | ||
443 | */ | ||
444 | void __kprobes text_poke(void *addr, unsigned char *opcode, int len) | ||
445 | { | ||
446 | memcpy(addr, opcode, len); | ||
447 | sync_core(); | ||
448 | /* Could also do a CLFLUSH here to speed up CPU recovery; but | ||
449 | that causes hangs on some VIA CPUs. */ | ||
450 | } | ||
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c new file mode 100644 index 000000000000..3d67ae18d762 --- /dev/null +++ b/arch/x86/kernel/apic_32.c | |||
@@ -0,0 +1,1566 @@ | |||
1 | /* | ||
2 | * Local APIC handling, local APIC timers | ||
3 | * | ||
4 | * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes | ||
7 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
8 | * thanks to Eric Gilmore | ||
9 | * and Rolf G. Tews | ||
10 | * for testing these extensively. | ||
11 | * Maciej W. Rozycki : Various updates and fixes. | ||
12 | * Mikael Pettersson : Power Management for UP-APIC. | ||
13 | * Pavel Machek and | ||
14 | * Mikael Pettersson : PM converted to driver model. | ||
15 | */ | ||
16 | |||
17 | #include <linux/init.h> | ||
18 | |||
19 | #include <linux/mm.h> | ||
20 | #include <linux/delay.h> | ||
21 | #include <linux/bootmem.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | #include <linux/mc146818rtc.h> | ||
24 | #include <linux/kernel_stat.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/cpu.h> | ||
27 | #include <linux/clockchips.h> | ||
28 | #include <linux/acpi_pmtmr.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/dmi.h> | ||
31 | |||
32 | #include <asm/atomic.h> | ||
33 | #include <asm/smp.h> | ||
34 | #include <asm/mtrr.h> | ||
35 | #include <asm/mpspec.h> | ||
36 | #include <asm/desc.h> | ||
37 | #include <asm/arch_hooks.h> | ||
38 | #include <asm/hpet.h> | ||
39 | #include <asm/i8253.h> | ||
40 | #include <asm/nmi.h> | ||
41 | |||
42 | #include <mach_apic.h> | ||
43 | #include <mach_apicdef.h> | ||
44 | #include <mach_ipi.h> | ||
45 | |||
46 | #include "io_ports.h" | ||
47 | |||
48 | /* | ||
49 | * Sanity check | ||
50 | */ | ||
51 | #if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F | ||
52 | # error SPURIOUS_APIC_VECTOR definition error | ||
53 | #endif | ||
54 | |||
55 | /* | ||
56 | * Knob to control our willingness to enable the local APIC. | ||
57 | * | ||
58 | * -1=force-disable, +1=force-enable | ||
59 | */ | ||
60 | static int enable_local_apic __initdata = 0; | ||
61 | |||
62 | /* Local APIC timer verification ok */ | ||
63 | static int local_apic_timer_verify_ok; | ||
64 | /* Disable local APIC timer from the kernel commandline or via dmi quirk | ||
65 | or using CPU MSR check */ | ||
66 | int local_apic_timer_disabled; | ||
67 | /* Local APIC timer works in C2 */ | ||
68 | int local_apic_timer_c2_ok; | ||
69 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); | ||
70 | |||
71 | /* | ||
72 | * Debug level, exported for io_apic.c | ||
73 | */ | ||
74 | int apic_verbosity; | ||
75 | |||
76 | static unsigned int calibration_result; | ||
77 | |||
78 | static int lapic_next_event(unsigned long delta, | ||
79 | struct clock_event_device *evt); | ||
80 | static void lapic_timer_setup(enum clock_event_mode mode, | ||
81 | struct clock_event_device *evt); | ||
82 | static void lapic_timer_broadcast(cpumask_t mask); | ||
83 | static void apic_pm_activate(void); | ||
84 | |||
85 | /* | ||
86 | * The local apic timer can be used for any function which is CPU local. | ||
87 | */ | ||
88 | static struct clock_event_device lapic_clockevent = { | ||
89 | .name = "lapic", | ||
90 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | ||
91 | | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, | ||
92 | .shift = 32, | ||
93 | .set_mode = lapic_timer_setup, | ||
94 | .set_next_event = lapic_next_event, | ||
95 | .broadcast = lapic_timer_broadcast, | ||
96 | .rating = 100, | ||
97 | .irq = -1, | ||
98 | }; | ||
99 | static DEFINE_PER_CPU(struct clock_event_device, lapic_events); | ||
100 | |||
101 | /* Local APIC was disabled by the BIOS and enabled by the kernel */ | ||
102 | static int enabled_via_apicbase; | ||
103 | |||
104 | /* | ||
105 | * Get the LAPIC version | ||
106 | */ | ||
107 | static inline int lapic_get_version(void) | ||
108 | { | ||
109 | return GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
110 | } | ||
111 | |||
/*
 * Check whether the APIC is integrated or a separate chip.
 * Non-zero means integrated.
 */
static inline int lapic_is_integrated(void)
{
	int version = lapic_get_version();

	return APIC_INTEGRATED(version);
}
119 | |||
120 | /* | ||
121 | * Check, whether this is a modern or a first generation APIC | ||
122 | */ | ||
123 | static int modern_apic(void) | ||
124 | { | ||
125 | /* AMD systems use old APIC versions, so check the CPU */ | ||
126 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | ||
127 | boot_cpu_data.x86 >= 0xf) | ||
128 | return 1; | ||
129 | return lapic_get_version() >= 0x14; | ||
130 | } | ||
131 | |||
132 | void apic_wait_icr_idle(void) | ||
133 | { | ||
134 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) | ||
135 | cpu_relax(); | ||
136 | } | ||
137 | |||
138 | unsigned long safe_apic_wait_icr_idle(void) | ||
139 | { | ||
140 | unsigned long send_status; | ||
141 | int timeout; | ||
142 | |||
143 | timeout = 0; | ||
144 | do { | ||
145 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
146 | if (!send_status) | ||
147 | break; | ||
148 | udelay(100); | ||
149 | } while (timeout++ < 1000); | ||
150 | |||
151 | return send_status; | ||
152 | } | ||
153 | |||
154 | /** | ||
155 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 | ||
156 | */ | ||
157 | void enable_NMI_through_LVT0 (void * dummy) | ||
158 | { | ||
159 | unsigned int v = APIC_DM_NMI; | ||
160 | |||
161 | /* Level triggered for 82489DX */ | ||
162 | if (!lapic_is_integrated()) | ||
163 | v |= APIC_LVT_LEVEL_TRIGGER; | ||
164 | apic_write_around(APIC_LVT0, v); | ||
165 | } | ||
166 | |||
/**
 * get_physical_broadcast - Get number of physical broadcast IDs
 *
 * Returns 0xff when modern_apic() reports a modern APIC, 0xf otherwise.
 */
int get_physical_broadcast(void)
{
	if (modern_apic())
		return 0xff;

	return 0xf;
}
174 | |||
175 | /** | ||
176 | * lapic_get_maxlvt - get the maximum number of local vector table entries | ||
177 | */ | ||
178 | int lapic_get_maxlvt(void) | ||
179 | { | ||
180 | unsigned int v = apic_read(APIC_LVR); | ||
181 | |||
182 | /* 82489DXs do not report # of LVT entries. */ | ||
183 | return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; | ||
184 | } | ||
185 | |||
186 | /* | ||
187 | * Local APIC timer | ||
188 | */ | ||
189 | |||
190 | /* Clock divisor is set to 16 */ | ||
191 | #define APIC_DIVISOR 16 | ||
192 | |||
193 | /* | ||
194 | * This function sets up the local APIC timer, with a timeout of | ||
195 | * 'clocks' APIC bus clock. During calibration we actually call | ||
196 | * this function twice on the boot CPU, once with a bogus timeout | ||
197 | * value, second time for real. The other (noncalibrating) CPUs | ||
198 | * call this function only once, with the real, calibrated value. | ||
199 | * | ||
200 | * We do reads before writes even if unnecessary, to get around the | ||
201 | * P5 APIC double write bug. | ||
202 | */ | ||
203 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | ||
204 | { | ||
205 | unsigned int lvtt_value, tmp_value; | ||
206 | |||
207 | lvtt_value = LOCAL_TIMER_VECTOR; | ||
208 | if (!oneshot) | ||
209 | lvtt_value |= APIC_LVT_TIMER_PERIODIC; | ||
210 | if (!lapic_is_integrated()) | ||
211 | lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); | ||
212 | |||
213 | if (!irqen) | ||
214 | lvtt_value |= APIC_LVT_MASKED; | ||
215 | |||
216 | apic_write_around(APIC_LVTT, lvtt_value); | ||
217 | |||
218 | /* | ||
219 | * Divide PICLK by 16 | ||
220 | */ | ||
221 | tmp_value = apic_read(APIC_TDCR); | ||
222 | apic_write_around(APIC_TDCR, (tmp_value | ||
223 | & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | ||
224 | | APIC_TDR_DIV_16); | ||
225 | |||
226 | if (!oneshot) | ||
227 | apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * Program the next event, relative to now | ||
232 | */ | ||
233 | static int lapic_next_event(unsigned long delta, | ||
234 | struct clock_event_device *evt) | ||
235 | { | ||
236 | apic_write_around(APIC_TMICT, delta); | ||
237 | return 0; | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * Setup the lapic timer in periodic or oneshot mode | ||
242 | */ | ||
243 | static void lapic_timer_setup(enum clock_event_mode mode, | ||
244 | struct clock_event_device *evt) | ||
245 | { | ||
246 | unsigned long flags; | ||
247 | unsigned int v; | ||
248 | |||
249 | /* Lapic used for broadcast ? */ | ||
250 | if (!local_apic_timer_verify_ok) | ||
251 | return; | ||
252 | |||
253 | local_irq_save(flags); | ||
254 | |||
255 | switch (mode) { | ||
256 | case CLOCK_EVT_MODE_PERIODIC: | ||
257 | case CLOCK_EVT_MODE_ONESHOT: | ||
258 | __setup_APIC_LVTT(calibration_result, | ||
259 | mode != CLOCK_EVT_MODE_PERIODIC, 1); | ||
260 | break; | ||
261 | case CLOCK_EVT_MODE_UNUSED: | ||
262 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
263 | v = apic_read(APIC_LVTT); | ||
264 | v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | ||
265 | apic_write_around(APIC_LVTT, v); | ||
266 | break; | ||
267 | case CLOCK_EVT_MODE_RESUME: | ||
268 | /* Nothing to do here */ | ||
269 | break; | ||
270 | } | ||
271 | |||
272 | local_irq_restore(flags); | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * Local APIC timer broadcast function | ||
277 | */ | ||
278 | static void lapic_timer_broadcast(cpumask_t mask) | ||
279 | { | ||
280 | #ifdef CONFIG_SMP | ||
281 | send_IPI_mask(mask, LOCAL_TIMER_VECTOR); | ||
282 | #endif | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * Setup the local APIC timer for this CPU. Copy the initilized values | ||
287 | * of the boot CPU and register the clock event in the framework. | ||
288 | */ | ||
289 | static void __devinit setup_APIC_timer(void) | ||
290 | { | ||
291 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); | ||
292 | |||
293 | memcpy(levt, &lapic_clockevent, sizeof(*levt)); | ||
294 | levt->cpumask = cpumask_of_cpu(smp_processor_id()); | ||
295 | |||
296 | clockevents_register_device(levt); | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * In this functions we calibrate APIC bus clocks to the external timer. | ||
301 | * | ||
302 | * We want to do the calibration only once since we want to have local timer | ||
303 | * irqs synchronous. CPUs connected by the same APIC bus have the very same bus | ||
304 | * frequency. | ||
305 | * | ||
306 | * This was previously done by reading the PIT/HPET and waiting for a wrap | ||
307 | * around to find out, that a tick has elapsed. I have a box, where the PIT | ||
308 | * readout is broken, so it never gets out of the wait loop again. This was | ||
309 | * also reported by others. | ||
310 | * | ||
311 | * Monitoring the jiffies value is inaccurate and the clockevents | ||
312 | * infrastructure allows us to do a simple substitution of the interrupt | ||
313 | * handler. | ||
314 | * | ||
315 | * The calibration routine also uses the pm_timer when possible, as the PIT | ||
316 | * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes | ||
317 | * back to normal later in the boot process). | ||
318 | */ | ||
319 | |||
320 | #define LAPIC_CAL_LOOPS (HZ/10) | ||
321 | |||
322 | static __initdata int lapic_cal_loops = -1; | ||
323 | static __initdata long lapic_cal_t1, lapic_cal_t2; | ||
324 | static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; | ||
325 | static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; | ||
326 | static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; | ||
327 | |||
328 | /* | ||
329 | * Temporary interrupt handler. | ||
330 | */ | ||
331 | static void __init lapic_cal_handler(struct clock_event_device *dev) | ||
332 | { | ||
333 | unsigned long long tsc = 0; | ||
334 | long tapic = apic_read(APIC_TMCCT); | ||
335 | unsigned long pm = acpi_pm_read_early(); | ||
336 | |||
337 | if (cpu_has_tsc) | ||
338 | rdtscll(tsc); | ||
339 | |||
340 | switch (lapic_cal_loops++) { | ||
341 | case 0: | ||
342 | lapic_cal_t1 = tapic; | ||
343 | lapic_cal_tsc1 = tsc; | ||
344 | lapic_cal_pm1 = pm; | ||
345 | lapic_cal_j1 = jiffies; | ||
346 | break; | ||
347 | |||
348 | case LAPIC_CAL_LOOPS: | ||
349 | lapic_cal_t2 = tapic; | ||
350 | lapic_cal_tsc2 = tsc; | ||
351 | if (pm < lapic_cal_pm1) | ||
352 | pm += ACPI_PM_OVRRUN; | ||
353 | lapic_cal_pm2 = pm; | ||
354 | lapic_cal_j2 = jiffies; | ||
355 | break; | ||
356 | } | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | * Setup the boot APIC | ||
361 | * | ||
362 | * Calibrate and verify the result. | ||
363 | */ | ||
364 | void __init setup_boot_APIC_clock(void) | ||
365 | { | ||
366 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); | ||
367 | const long pm_100ms = PMTMR_TICKS_PER_SEC/10; | ||
368 | const long pm_thresh = pm_100ms/100; | ||
369 | void (*real_handler)(struct clock_event_device *dev); | ||
370 | unsigned long deltaj; | ||
371 | long delta, deltapm; | ||
372 | int pm_referenced = 0; | ||
373 | |||
374 | /* | ||
375 | * The local apic timer can be disabled via the kernel | ||
376 | * commandline or from the CPU detection code. Register the lapic | ||
377 | * timer as a dummy clock event source on SMP systems, so the | ||
378 | * broadcast mechanism is used. On UP systems simply ignore it. | ||
379 | */ | ||
380 | if (local_apic_timer_disabled) { | ||
381 | /* No broadcast on UP ! */ | ||
382 | if (num_possible_cpus() > 1) | ||
383 | setup_APIC_timer(); | ||
384 | return; | ||
385 | } | ||
386 | |||
387 | apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" | ||
388 | "calibrating APIC timer ...\n"); | ||
389 | |||
390 | local_irq_disable(); | ||
391 | |||
392 | /* Replace the global interrupt handler */ | ||
393 | real_handler = global_clock_event->event_handler; | ||
394 | global_clock_event->event_handler = lapic_cal_handler; | ||
395 | |||
396 | /* | ||
397 | * Setup the APIC counter to 1e9. There is no way the lapic | ||
398 | * can underflow in the 100ms detection time frame | ||
399 | */ | ||
400 | __setup_APIC_LVTT(1000000000, 0, 0); | ||
401 | |||
402 | /* Let the interrupts run */ | ||
403 | local_irq_enable(); | ||
404 | |||
405 | while (lapic_cal_loops <= LAPIC_CAL_LOOPS) | ||
406 | cpu_relax(); | ||
407 | |||
408 | local_irq_disable(); | ||
409 | |||
410 | /* Restore the real event handler */ | ||
411 | global_clock_event->event_handler = real_handler; | ||
412 | |||
413 | /* Build delta t1-t2 as apic timer counts down */ | ||
414 | delta = lapic_cal_t1 - lapic_cal_t2; | ||
415 | apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); | ||
416 | |||
417 | /* Check, if the PM timer is available */ | ||
418 | deltapm = lapic_cal_pm2 - lapic_cal_pm1; | ||
419 | apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); | ||
420 | |||
421 | if (deltapm) { | ||
422 | unsigned long mult; | ||
423 | u64 res; | ||
424 | |||
425 | mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); | ||
426 | |||
427 | if (deltapm > (pm_100ms - pm_thresh) && | ||
428 | deltapm < (pm_100ms + pm_thresh)) { | ||
429 | apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); | ||
430 | } else { | ||
431 | res = (((u64) deltapm) * mult) >> 22; | ||
432 | do_div(res, 1000000); | ||
433 | printk(KERN_WARNING "APIC calibration not consistent " | ||
434 | "with PM Timer: %ldms instead of 100ms\n", | ||
435 | (long)res); | ||
436 | /* Correct the lapic counter value */ | ||
437 | res = (((u64) delta ) * pm_100ms); | ||
438 | do_div(res, deltapm); | ||
439 | printk(KERN_INFO "APIC delta adjusted to PM-Timer: " | ||
440 | "%lu (%ld)\n", (unsigned long) res, delta); | ||
441 | delta = (long) res; | ||
442 | } | ||
443 | pm_referenced = 1; | ||
444 | } | ||
445 | |||
446 | /* Calculate the scaled math multiplication factor */ | ||
447 | lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); | ||
448 | lapic_clockevent.max_delta_ns = | ||
449 | clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); | ||
450 | lapic_clockevent.min_delta_ns = | ||
451 | clockevent_delta2ns(0xF, &lapic_clockevent); | ||
452 | |||
453 | calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; | ||
454 | |||
455 | apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); | ||
456 | apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); | ||
457 | apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", | ||
458 | calibration_result); | ||
459 | |||
460 | if (cpu_has_tsc) { | ||
461 | delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); | ||
462 | apic_printk(APIC_VERBOSE, "..... CPU clock speed is " | ||
463 | "%ld.%04ld MHz.\n", | ||
464 | (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), | ||
465 | (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); | ||
466 | } | ||
467 | |||
468 | apic_printk(APIC_VERBOSE, "..... host bus clock speed is " | ||
469 | "%u.%04u MHz.\n", | ||
470 | calibration_result / (1000000 / HZ), | ||
471 | calibration_result % (1000000 / HZ)); | ||
472 | |||
473 | local_apic_timer_verify_ok = 1; | ||
474 | |||
475 | /* We trust the pm timer based calibration */ | ||
476 | if (!pm_referenced) { | ||
477 | apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); | ||
478 | |||
479 | /* | ||
480 | * Setup the apic timer manually | ||
481 | */ | ||
482 | levt->event_handler = lapic_cal_handler; | ||
483 | lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); | ||
484 | lapic_cal_loops = -1; | ||
485 | |||
486 | /* Let the interrupts run */ | ||
487 | local_irq_enable(); | ||
488 | |||
489 | while (lapic_cal_loops <= LAPIC_CAL_LOOPS) | ||
490 | cpu_relax(); | ||
491 | |||
492 | local_irq_disable(); | ||
493 | |||
494 | /* Stop the lapic timer */ | ||
495 | lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); | ||
496 | |||
497 | local_irq_enable(); | ||
498 | |||
499 | /* Jiffies delta */ | ||
500 | deltaj = lapic_cal_j2 - lapic_cal_j1; | ||
501 | apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); | ||
502 | |||
503 | /* Check, if the jiffies result is consistent */ | ||
504 | if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) | ||
505 | apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); | ||
506 | else | ||
507 | local_apic_timer_verify_ok = 0; | ||
508 | } else | ||
509 | local_irq_enable(); | ||
510 | |||
511 | if (!local_apic_timer_verify_ok) { | ||
512 | printk(KERN_WARNING | ||
513 | "APIC timer disabled due to verification failure.\n"); | ||
514 | /* No broadcast on UP ! */ | ||
515 | if (num_possible_cpus() == 1) | ||
516 | return; | ||
517 | } else { | ||
518 | /* | ||
519 | * If nmi_watchdog is set to IO_APIC, we need the | ||
520 | * PIT/HPET going. Otherwise register lapic as a dummy | ||
521 | * device. | ||
522 | */ | ||
523 | if (nmi_watchdog != NMI_IO_APIC) | ||
524 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; | ||
525 | else | ||
526 | printk(KERN_WARNING "APIC timer registered as dummy," | ||
527 | " due to nmi_watchdog=1!\n"); | ||
528 | } | ||
529 | |||
530 | /* Setup the lapic or request the broadcast */ | ||
531 | setup_APIC_timer(); | ||
532 | } | ||
533 | |||
534 | void __devinit setup_secondary_APIC_clock(void) | ||
535 | { | ||
536 | setup_APIC_timer(); | ||
537 | } | ||
538 | |||
539 | /* | ||
540 | * The guts of the apic timer interrupt | ||
541 | */ | ||
542 | static void local_apic_timer_interrupt(void) | ||
543 | { | ||
544 | int cpu = smp_processor_id(); | ||
545 | struct clock_event_device *evt = &per_cpu(lapic_events, cpu); | ||
546 | |||
547 | /* | ||
548 | * Normally we should not be here till LAPIC has been initialized but | ||
549 | * in some cases like kdump, its possible that there is a pending LAPIC | ||
550 | * timer interrupt from previous kernel's context and is delivered in | ||
551 | * new kernel the moment interrupts are enabled. | ||
552 | * | ||
553 | * Interrupts are enabled early and LAPIC is setup much later, hence | ||
554 | * its possible that when we get here evt->event_handler is NULL. | ||
555 | * Check for event_handler being NULL and discard the interrupt as | ||
556 | * spurious. | ||
557 | */ | ||
558 | if (!evt->event_handler) { | ||
559 | printk(KERN_WARNING | ||
560 | "Spurious LAPIC timer interrupt on cpu %d\n", cpu); | ||
561 | /* Switch it off */ | ||
562 | lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); | ||
563 | return; | ||
564 | } | ||
565 | |||
566 | per_cpu(irq_stat, cpu).apic_timer_irqs++; | ||
567 | |||
568 | evt->event_handler(evt); | ||
569 | } | ||
570 | |||
571 | /* | ||
572 | * Local APIC timer interrupt. This is the most natural way for doing | ||
573 | * local interrupts, but local timer interrupts can be emulated by | ||
574 | * broadcast interrupts too. [in case the hw doesn't support APIC timers] | ||
575 | * | ||
576 | * [ if a single-CPU system runs an SMP kernel then we call the local | ||
577 | * interrupt as well. Thus we cannot inline the local irq ... ] | ||
578 | */ | ||
579 | |||
580 | void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) | ||
581 | { | ||
582 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
583 | |||
584 | /* | ||
585 | * NOTE! We'd better ACK the irq immediately, | ||
586 | * because timer handling can be slow. | ||
587 | */ | ||
588 | ack_APIC_irq(); | ||
589 | /* | ||
590 | * update_process_times() expects us to have done irq_enter(). | ||
591 | * Besides, if we don't timer interrupts ignore the global | ||
592 | * interrupt lock, which is the WrongThing (tm) to do. | ||
593 | */ | ||
594 | irq_enter(); | ||
595 | local_apic_timer_interrupt(); | ||
596 | irq_exit(); | ||
597 | |||
598 | set_irq_regs(old_regs); | ||
599 | } | ||
600 | |||
int setup_profiling_timer(unsigned int multiplier)
{
	/* Always rejected with -EINVAL; @multiplier is not looked at. */
	(void)multiplier;

	return -EINVAL;
}
605 | |||
606 | /* | ||
607 | * Local APIC start and shutdown | ||
608 | */ | ||
609 | |||
610 | /** | ||
611 | * clear_local_APIC - shutdown the local APIC | ||
612 | * | ||
613 | * This is called, when a CPU is disabled and before rebooting, so the state of | ||
614 | * the local APIC has no dangling leftovers. Also used to cleanout any BIOS | ||
615 | * leftovers during boot. | ||
616 | */ | ||
617 | void clear_local_APIC(void) | ||
618 | { | ||
619 | int maxlvt = lapic_get_maxlvt(); | ||
620 | unsigned long v; | ||
621 | |||
622 | /* | ||
623 | * Masking an LVT entry can trigger a local APIC error | ||
624 | * if the vector is zero. Mask LVTERR first to prevent this. | ||
625 | */ | ||
626 | if (maxlvt >= 3) { | ||
627 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ | ||
628 | apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); | ||
629 | } | ||
630 | /* | ||
631 | * Careful: we have to set masks only first to deassert | ||
632 | * any level-triggered sources. | ||
633 | */ | ||
634 | v = apic_read(APIC_LVTT); | ||
635 | apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | ||
636 | v = apic_read(APIC_LVT0); | ||
637 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | ||
638 | v = apic_read(APIC_LVT1); | ||
639 | apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); | ||
640 | if (maxlvt >= 4) { | ||
641 | v = apic_read(APIC_LVTPC); | ||
642 | apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); | ||
643 | } | ||
644 | |||
645 | /* lets not touch this if we didn't frob it */ | ||
646 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
647 | if (maxlvt >= 5) { | ||
648 | v = apic_read(APIC_LVTTHMR); | ||
649 | apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); | ||
650 | } | ||
651 | #endif | ||
652 | /* | ||
653 | * Clean APIC state for other OSs: | ||
654 | */ | ||
655 | apic_write_around(APIC_LVTT, APIC_LVT_MASKED); | ||
656 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED); | ||
657 | apic_write_around(APIC_LVT1, APIC_LVT_MASKED); | ||
658 | if (maxlvt >= 3) | ||
659 | apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); | ||
660 | if (maxlvt >= 4) | ||
661 | apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); | ||
662 | |||
663 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
664 | if (maxlvt >= 5) | ||
665 | apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); | ||
666 | #endif | ||
667 | /* Integrated APIC (!82489DX) ? */ | ||
668 | if (lapic_is_integrated()) { | ||
669 | if (maxlvt > 3) | ||
670 | /* Clear ESR due to Pentium errata 3AP and 11AP */ | ||
671 | apic_write(APIC_ESR, 0); | ||
672 | apic_read(APIC_ESR); | ||
673 | } | ||
674 | } | ||
675 | |||
676 | /** | ||
677 | * disable_local_APIC - clear and disable the local APIC | ||
678 | */ | ||
679 | void disable_local_APIC(void) | ||
680 | { | ||
681 | unsigned long value; | ||
682 | |||
683 | clear_local_APIC(); | ||
684 | |||
685 | /* | ||
686 | * Disable APIC (implies clearing of registers | ||
687 | * for 82489DX!). | ||
688 | */ | ||
689 | value = apic_read(APIC_SPIV); | ||
690 | value &= ~APIC_SPIV_APIC_ENABLED; | ||
691 | apic_write_around(APIC_SPIV, value); | ||
692 | |||
693 | /* | ||
694 | * When LAPIC was disabled by the BIOS and enabled by the kernel, | ||
695 | * restore the disabled state. | ||
696 | */ | ||
697 | if (enabled_via_apicbase) { | ||
698 | unsigned int l, h; | ||
699 | |||
700 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
701 | l &= ~MSR_IA32_APICBASE_ENABLE; | ||
702 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
703 | } | ||
704 | } | ||
705 | |||
706 | /* | ||
707 | * If Linux enabled the LAPIC against the BIOS default disable it down before | ||
708 | * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and | ||
709 | * not power-off. Additionally clear all LVT entries before disable_local_APIC | ||
710 | * for the case where Linux didn't enable the LAPIC. | ||
711 | */ | ||
712 | void lapic_shutdown(void) | ||
713 | { | ||
714 | unsigned long flags; | ||
715 | |||
716 | if (!cpu_has_apic) | ||
717 | return; | ||
718 | |||
719 | local_irq_save(flags); | ||
720 | clear_local_APIC(); | ||
721 | |||
722 | if (enabled_via_apicbase) | ||
723 | disable_local_APIC(); | ||
724 | |||
725 | local_irq_restore(flags); | ||
726 | } | ||
727 | |||
728 | /* | ||
729 | * This is to verify that we're looking at a real local APIC. | ||
730 | * Check these against your board if the CPUs aren't getting | ||
731 | * started for no apparent reason. | ||
732 | */ | ||
733 | int __init verify_local_APIC(void) | ||
734 | { | ||
735 | unsigned int reg0, reg1; | ||
736 | |||
737 | /* | ||
738 | * The version register is read-only in a real APIC. | ||
739 | */ | ||
740 | reg0 = apic_read(APIC_LVR); | ||
741 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); | ||
742 | apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); | ||
743 | reg1 = apic_read(APIC_LVR); | ||
744 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); | ||
745 | |||
746 | /* | ||
747 | * The two version reads above should print the same | ||
748 | * numbers. If the second one is different, then we | ||
749 | * poke at a non-APIC. | ||
750 | */ | ||
751 | if (reg1 != reg0) | ||
752 | return 0; | ||
753 | |||
754 | /* | ||
755 | * Check if the version looks reasonably. | ||
756 | */ | ||
757 | reg1 = GET_APIC_VERSION(reg0); | ||
758 | if (reg1 == 0x00 || reg1 == 0xff) | ||
759 | return 0; | ||
760 | reg1 = lapic_get_maxlvt(); | ||
761 | if (reg1 < 0x02 || reg1 == 0xff) | ||
762 | return 0; | ||
763 | |||
764 | /* | ||
765 | * The ID register is read/write in a real APIC. | ||
766 | */ | ||
767 | reg0 = apic_read(APIC_ID); | ||
768 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | ||
769 | |||
770 | /* | ||
771 | * The next two are just to see if we have sane values. | ||
772 | * They're only really relevant if we're in Virtual Wire | ||
773 | * compatibility mode, but most boxes are anymore. | ||
774 | */ | ||
775 | reg0 = apic_read(APIC_LVT0); | ||
776 | apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); | ||
777 | reg1 = apic_read(APIC_LVT1); | ||
778 | apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); | ||
779 | |||
780 | return 1; | ||
781 | } | ||
782 | |||
783 | /** | ||
784 | * sync_Arb_IDs - synchronize APIC bus arbitration IDs | ||
785 | */ | ||
786 | void __init sync_Arb_IDs(void) | ||
787 | { | ||
788 | /* | ||
789 | * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not | ||
790 | * needed on AMD. | ||
791 | */ | ||
792 | if (modern_apic()) | ||
793 | return; | ||
794 | /* | ||
795 | * Wait for idle. | ||
796 | */ | ||
797 | apic_wait_icr_idle(); | ||
798 | |||
799 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | ||
800 | apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | ||
801 | | APIC_DM_INIT); | ||
802 | } | ||
803 | |||
804 | /* | ||
805 | * An initial setup of the virtual wire mode. | ||
806 | */ | ||
807 | void __init init_bsp_APIC(void) | ||
808 | { | ||
809 | unsigned long value; | ||
810 | |||
811 | /* | ||
812 | * Don't do the setup now if we have a SMP BIOS as the | ||
813 | * through-I/O-APIC virtual wire mode might be active. | ||
814 | */ | ||
815 | if (smp_found_config || !cpu_has_apic) | ||
816 | return; | ||
817 | |||
818 | /* | ||
819 | * Do not trust the local APIC being empty at bootup. | ||
820 | */ | ||
821 | clear_local_APIC(); | ||
822 | |||
823 | /* | ||
824 | * Enable APIC. | ||
825 | */ | ||
826 | value = apic_read(APIC_SPIV); | ||
827 | value &= ~APIC_VECTOR_MASK; | ||
828 | value |= APIC_SPIV_APIC_ENABLED; | ||
829 | |||
830 | /* This bit is reserved on P4/Xeon and should be cleared */ | ||
831 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && | ||
832 | (boot_cpu_data.x86 == 15)) | ||
833 | value &= ~APIC_SPIV_FOCUS_DISABLED; | ||
834 | else | ||
835 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
836 | value |= SPURIOUS_APIC_VECTOR; | ||
837 | apic_write_around(APIC_SPIV, value); | ||
838 | |||
839 | /* | ||
840 | * Set up the virtual wire mode. | ||
841 | */ | ||
842 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | ||
843 | value = APIC_DM_NMI; | ||
844 | if (!lapic_is_integrated()) /* 82489DX */ | ||
845 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
846 | apic_write_around(APIC_LVT1, value); | ||
847 | } | ||
848 | |||
849 | /** | ||
850 | * setup_local_APIC - setup the local APIC | ||
851 | */ | ||
852 | void __devinit setup_local_APIC(void) | ||
853 | { | ||
854 | unsigned long oldvalue, value, maxlvt, integrated; | ||
855 | int i, j; | ||
856 | |||
857 | /* Pound the ESR really hard over the head with a big hammer - mbligh */ | ||
858 | if (esr_disable) { | ||
859 | apic_write(APIC_ESR, 0); | ||
860 | apic_write(APIC_ESR, 0); | ||
861 | apic_write(APIC_ESR, 0); | ||
862 | apic_write(APIC_ESR, 0); | ||
863 | } | ||
864 | |||
865 | integrated = lapic_is_integrated(); | ||
866 | |||
867 | /* | ||
868 | * Double-check whether this APIC is really registered. | ||
869 | */ | ||
870 | if (!apic_id_registered()) | ||
871 | BUG(); | ||
872 | |||
873 | /* | ||
874 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
875 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
876 | * document number 292116). So here it goes... | ||
877 | */ | ||
878 | init_apic_ldr(); | ||
879 | |||
880 | /* | ||
881 | * Set Task Priority to 'accept all'. We never change this | ||
882 | * later on. | ||
883 | */ | ||
884 | value = apic_read(APIC_TASKPRI); | ||
885 | value &= ~APIC_TPRI_MASK; | ||
886 | apic_write_around(APIC_TASKPRI, value); | ||
887 | |||
888 | /* | ||
889 | * After a crash, we no longer service the interrupts and a pending | ||
890 | * interrupt from previous kernel might still have ISR bit set. | ||
891 | * | ||
892 | * Most probably by now CPU has serviced that pending interrupt and | ||
893 | * it might not have done the ack_APIC_irq() because it thought, | ||
894 | * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it | ||
895 | * does not clear the ISR bit and cpu thinks it has already serivced | ||
896 | * the interrupt. Hence a vector might get locked. It was noticed | ||
897 | * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. | ||
898 | */ | ||
899 | for (i = APIC_ISR_NR - 1; i >= 0; i--) { | ||
900 | value = apic_read(APIC_ISR + i*0x10); | ||
901 | for (j = 31; j >= 0; j--) { | ||
902 | if (value & (1<<j)) | ||
903 | ack_APIC_irq(); | ||
904 | } | ||
905 | } | ||
906 | |||
907 | /* | ||
908 | * Now that we are all set up, enable the APIC | ||
909 | */ | ||
910 | value = apic_read(APIC_SPIV); | ||
911 | value &= ~APIC_VECTOR_MASK; | ||
912 | /* | ||
913 | * Enable APIC | ||
914 | */ | ||
915 | value |= APIC_SPIV_APIC_ENABLED; | ||
916 | |||
917 | /* | ||
918 | * Some unknown Intel IO/APIC (or APIC) errata is biting us with | ||
919 | * certain networking cards. If high frequency interrupts are | ||
920 | * happening on a particular IOAPIC pin, plus the IOAPIC routing | ||
921 | * entry is masked/unmasked at a high rate as well then sooner or | ||
922 | * later IOAPIC line gets 'stuck', no more interrupts are received | ||
923 | * from the device. If focus CPU is disabled then the hang goes | ||
924 | * away, oh well :-( | ||
925 | * | ||
926 | * [ This bug can be reproduced easily with a level-triggered | ||
927 | * PCI Ne2000 networking cards and PII/PIII processors, dual | ||
928 | * BX chipset. ] | ||
929 | */ | ||
930 | /* | ||
931 | * Actually disabling the focus CPU check just makes the hang less | ||
932 | * frequent as it makes the interrupt distributon model be more | ||
933 | * like LRU than MRU (the short-term load is more even across CPUs). | ||
934 | * See also the comment in end_level_ioapic_irq(). --macro | ||
935 | */ | ||
936 | |||
937 | /* Enable focus processor (bit==0) */ | ||
938 | value &= ~APIC_SPIV_FOCUS_DISABLED; | ||
939 | |||
940 | /* | ||
941 | * Set spurious IRQ vector | ||
942 | */ | ||
943 | value |= SPURIOUS_APIC_VECTOR; | ||
944 | apic_write_around(APIC_SPIV, value); | ||
945 | |||
946 | /* | ||
947 | * Set up LVT0, LVT1: | ||
948 | * | ||
949 | * set up through-local-APIC on the BP's LINT0. This is not | ||
950 | * strictly necessery in pure symmetric-IO mode, but sometimes | ||
951 | * we delegate interrupts to the 8259A. | ||
952 | */ | ||
953 | /* | ||
954 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro | ||
955 | */ | ||
956 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; | ||
957 | if (!smp_processor_id() && (pic_mode || !value)) { | ||
958 | value = APIC_DM_EXTINT; | ||
959 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", | ||
960 | smp_processor_id()); | ||
961 | } else { | ||
962 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; | ||
963 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", | ||
964 | smp_processor_id()); | ||
965 | } | ||
966 | apic_write_around(APIC_LVT0, value); | ||
967 | |||
968 | /* | ||
969 | * only the BP should see the LINT1 NMI signal, obviously. | ||
970 | */ | ||
971 | if (!smp_processor_id()) | ||
972 | value = APIC_DM_NMI; | ||
973 | else | ||
974 | value = APIC_DM_NMI | APIC_LVT_MASKED; | ||
975 | if (!integrated) /* 82489DX */ | ||
976 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
977 | apic_write_around(APIC_LVT1, value); | ||
978 | |||
979 | if (integrated && !esr_disable) { /* !82489DX */ | ||
980 | maxlvt = lapic_get_maxlvt(); | ||
981 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
982 | apic_write(APIC_ESR, 0); | ||
983 | oldvalue = apic_read(APIC_ESR); | ||
984 | |||
985 | /* enables sending errors */ | ||
986 | value = ERROR_APIC_VECTOR; | ||
987 | apic_write_around(APIC_LVTERR, value); | ||
988 | /* | ||
989 | * spec says clear errors after enabling vector. | ||
990 | */ | ||
991 | if (maxlvt > 3) | ||
992 | apic_write(APIC_ESR, 0); | ||
993 | value = apic_read(APIC_ESR); | ||
994 | if (value != oldvalue) | ||
995 | apic_printk(APIC_VERBOSE, "ESR value before enabling " | ||
996 | "vector: 0x%08lx after: 0x%08lx\n", | ||
997 | oldvalue, value); | ||
998 | } else { | ||
999 | if (esr_disable) | ||
1000 | /* | ||
1001 | * Something untraceble is creating bad interrupts on | ||
1002 | * secondary quads ... for the moment, just leave the | ||
1003 | * ESR disabled - we can't do anything useful with the | ||
1004 | * errors anyway - mbligh | ||
1005 | */ | ||
1006 | printk(KERN_INFO "Leaving ESR disabled.\n"); | ||
1007 | else | ||
1008 | printk(KERN_INFO "No ESR for 82489DX.\n"); | ||
1009 | } | ||
1010 | |||
1011 | /* Disable the local apic timer */ | ||
1012 | value = apic_read(APIC_LVTT); | ||
1013 | value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | ||
1014 | apic_write_around(APIC_LVTT, value); | ||
1015 | |||
1016 | setup_apic_nmi_watchdog(NULL); | ||
1017 | apic_pm_activate(); | ||
1018 | } | ||
1019 | |||
1020 | /* | ||
1021 | * Detect and initialize APIC | ||
1022 | */ | ||
1023 | static int __init detect_init_APIC (void) | ||
1024 | { | ||
1025 | u32 h, l, features; | ||
1026 | |||
1027 | /* Disabled by kernel option? */ | ||
1028 | if (enable_local_apic < 0) | ||
1029 | return -1; | ||
1030 | |||
1031 | switch (boot_cpu_data.x86_vendor) { | ||
1032 | case X86_VENDOR_AMD: | ||
1033 | if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || | ||
1034 | (boot_cpu_data.x86 == 15)) | ||
1035 | break; | ||
1036 | goto no_apic; | ||
1037 | case X86_VENDOR_INTEL: | ||
1038 | if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || | ||
1039 | (boot_cpu_data.x86 == 5 && cpu_has_apic)) | ||
1040 | break; | ||
1041 | goto no_apic; | ||
1042 | default: | ||
1043 | goto no_apic; | ||
1044 | } | ||
1045 | |||
1046 | if (!cpu_has_apic) { | ||
1047 | /* | ||
1048 | * Over-ride BIOS and try to enable the local APIC only if | ||
1049 | * "lapic" specified. | ||
1050 | */ | ||
1051 | if (enable_local_apic <= 0) { | ||
1052 | printk(KERN_INFO "Local APIC disabled by BIOS -- " | ||
1053 | "you can enable it with \"lapic\"\n"); | ||
1054 | return -1; | ||
1055 | } | ||
1056 | /* | ||
1057 | * Some BIOSes disable the local APIC in the APIC_BASE | ||
1058 | * MSR. This can only be done in software for Intel P6 or later | ||
1059 | * and AMD K7 (Model > 1) or later. | ||
1060 | */ | ||
1061 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1062 | if (!(l & MSR_IA32_APICBASE_ENABLE)) { | ||
1063 | printk(KERN_INFO | ||
1064 | "Local APIC disabled by BIOS -- reenabling.\n"); | ||
1065 | l &= ~MSR_IA32_APICBASE_BASE; | ||
1066 | l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; | ||
1067 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
1068 | enabled_via_apicbase = 1; | ||
1069 | } | ||
1070 | } | ||
1071 | /* | ||
1072 | * The APIC feature bit should now be enabled | ||
1073 | * in `cpuid' | ||
1074 | */ | ||
1075 | features = cpuid_edx(1); | ||
1076 | if (!(features & (1 << X86_FEATURE_APIC))) { | ||
1077 | printk(KERN_WARNING "Could not enable APIC!\n"); | ||
1078 | return -1; | ||
1079 | } | ||
1080 | set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); | ||
1081 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
1082 | |||
1083 | /* The BIOS may have set up the APIC at some other address */ | ||
1084 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1085 | if (l & MSR_IA32_APICBASE_ENABLE) | ||
1086 | mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; | ||
1087 | |||
1088 | if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED) | ||
1089 | nmi_watchdog = NMI_LOCAL_APIC; | ||
1090 | |||
1091 | printk(KERN_INFO "Found and enabled local APIC!\n"); | ||
1092 | |||
1093 | apic_pm_activate(); | ||
1094 | |||
1095 | return 0; | ||
1096 | |||
1097 | no_apic: | ||
1098 | printk(KERN_INFO "No local APIC present or hardware disabled\n"); | ||
1099 | return -1; | ||
1100 | } | ||
1101 | |||
1102 | /** | ||
1103 | * init_apic_mappings - initialize APIC mappings | ||
1104 | */ | ||
1105 | void __init init_apic_mappings(void) | ||
1106 | { | ||
1107 | unsigned long apic_phys; | ||
1108 | |||
1109 | /* | ||
1110 | * If no local APIC can be found then set up a fake all | ||
1111 | * zeroes page to simulate the local APIC and another | ||
1112 | * one for the IO-APIC. | ||
1113 | */ | ||
1114 | if (!smp_found_config && detect_init_APIC()) { | ||
1115 | apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
1116 | apic_phys = __pa(apic_phys); | ||
1117 | } else | ||
1118 | apic_phys = mp_lapic_addr; | ||
1119 | |||
1120 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | ||
1121 | printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, | ||
1122 | apic_phys); | ||
1123 | |||
1124 | /* | ||
1125 | * Fetch the APIC ID of the BSP in case we have a | ||
1126 | * default configuration (or the MP table is broken). | ||
1127 | */ | ||
1128 | if (boot_cpu_physical_apicid == -1U) | ||
1129 | boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); | ||
1130 | |||
1131 | #ifdef CONFIG_X86_IO_APIC | ||
1132 | { | ||
1133 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | ||
1134 | int i; | ||
1135 | |||
1136 | for (i = 0; i < nr_ioapics; i++) { | ||
1137 | if (smp_found_config) { | ||
1138 | ioapic_phys = mp_ioapics[i].mpc_apicaddr; | ||
1139 | if (!ioapic_phys) { | ||
1140 | printk(KERN_ERR | ||
1141 | "WARNING: bogus zero IO-APIC " | ||
1142 | "address found in MPTABLE, " | ||
1143 | "disabling IO/APIC support!\n"); | ||
1144 | smp_found_config = 0; | ||
1145 | skip_ioapic_setup = 1; | ||
1146 | goto fake_ioapic_page; | ||
1147 | } | ||
1148 | } else { | ||
1149 | fake_ioapic_page: | ||
1150 | ioapic_phys = (unsigned long) | ||
1151 | alloc_bootmem_pages(PAGE_SIZE); | ||
1152 | ioapic_phys = __pa(ioapic_phys); | ||
1153 | } | ||
1154 | set_fixmap_nocache(idx, ioapic_phys); | ||
1155 | printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", | ||
1156 | __fix_to_virt(idx), ioapic_phys); | ||
1157 | idx++; | ||
1158 | } | ||
1159 | } | ||
1160 | #endif | ||
1161 | } | ||
1162 | |||
1163 | /* | ||
1164 | * This initializes the IO-APIC and APIC hardware if this is | ||
1165 | * a UP kernel. | ||
1166 | */ | ||
1167 | int __init APIC_init_uniprocessor (void) | ||
1168 | { | ||
1169 | if (enable_local_apic < 0) | ||
1170 | clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); | ||
1171 | |||
1172 | if (!smp_found_config && !cpu_has_apic) | ||
1173 | return -1; | ||
1174 | |||
1175 | /* | ||
1176 | * Complain if the BIOS pretends there is one. | ||
1177 | */ | ||
1178 | if (!cpu_has_apic && | ||
1179 | APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { | ||
1180 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | ||
1181 | boot_cpu_physical_apicid); | ||
1182 | clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); | ||
1183 | return -1; | ||
1184 | } | ||
1185 | |||
1186 | verify_local_APIC(); | ||
1187 | |||
1188 | connect_bsp_APIC(); | ||
1189 | |||
1190 | /* | ||
1191 | * Hack: In case of kdump, after a crash, kernel might be booting | ||
1192 | * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid | ||
1193 | * might be zero if read from MP tables. Get it from LAPIC. | ||
1194 | */ | ||
1195 | #ifdef CONFIG_CRASH_DUMP | ||
1196 | boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); | ||
1197 | #endif | ||
1198 | phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); | ||
1199 | |||
1200 | setup_local_APIC(); | ||
1201 | |||
1202 | #ifdef CONFIG_X86_IO_APIC | ||
1203 | if (smp_found_config) | ||
1204 | if (!skip_ioapic_setup && nr_ioapics) | ||
1205 | setup_IO_APIC(); | ||
1206 | #endif | ||
1207 | setup_boot_clock(); | ||
1208 | |||
1209 | return 0; | ||
1210 | } | ||
1211 | |||
1212 | /* | ||
1213 | * APIC command line parameters | ||
1214 | */ | ||
1215 | static int __init parse_lapic(char *arg) | ||
1216 | { | ||
1217 | enable_local_apic = 1; | ||
1218 | return 0; | ||
1219 | } | ||
1220 | early_param("lapic", parse_lapic); | ||
1221 | |||
1222 | static int __init parse_nolapic(char *arg) | ||
1223 | { | ||
1224 | enable_local_apic = -1; | ||
1225 | clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); | ||
1226 | return 0; | ||
1227 | } | ||
1228 | early_param("nolapic", parse_nolapic); | ||
1229 | |||
1230 | static int __init parse_disable_lapic_timer(char *arg) | ||
1231 | { | ||
1232 | local_apic_timer_disabled = 1; | ||
1233 | return 0; | ||
1234 | } | ||
1235 | early_param("nolapic_timer", parse_disable_lapic_timer); | ||
1236 | |||
1237 | static int __init parse_lapic_timer_c2_ok(char *arg) | ||
1238 | { | ||
1239 | local_apic_timer_c2_ok = 1; | ||
1240 | return 0; | ||
1241 | } | ||
1242 | early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); | ||
1243 | |||
1244 | static int __init apic_set_verbosity(char *str) | ||
1245 | { | ||
1246 | if (strcmp("debug", str) == 0) | ||
1247 | apic_verbosity = APIC_DEBUG; | ||
1248 | else if (strcmp("verbose", str) == 0) | ||
1249 | apic_verbosity = APIC_VERBOSE; | ||
1250 | return 1; | ||
1251 | } | ||
1252 | |||
1253 | __setup("apic=", apic_set_verbosity); | ||
1254 | |||
1255 | |||
1256 | /* | ||
1257 | * Local APIC interrupts | ||
1258 | */ | ||
1259 | |||
1260 | /* | ||
1261 | * This interrupt should _never_ happen with our APIC/SMP architecture | ||
1262 | */ | ||
1263 | void smp_spurious_interrupt(struct pt_regs *regs) | ||
1264 | { | ||
1265 | unsigned long v; | ||
1266 | |||
1267 | irq_enter(); | ||
1268 | /* | ||
1269 | * Check if this really is a spurious interrupt and ACK it | ||
1270 | * if it is a vectored one. Just in case... | ||
1271 | * Spurious interrupts should not be ACKed. | ||
1272 | */ | ||
1273 | v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); | ||
1274 | if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) | ||
1275 | ack_APIC_irq(); | ||
1276 | |||
1277 | /* see sw-dev-man vol 3, chapter 7.4.13.5 */ | ||
1278 | printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " | ||
1279 | "should never happen.\n", smp_processor_id()); | ||
1280 | irq_exit(); | ||
1281 | } | ||
1282 | |||
1283 | /* | ||
1284 | * This interrupt should never happen with our APIC/SMP architecture | ||
1285 | */ | ||
1286 | void smp_error_interrupt(struct pt_regs *regs) | ||
1287 | { | ||
1288 | unsigned long v, v1; | ||
1289 | |||
1290 | irq_enter(); | ||
1291 | /* First tickle the hardware, only then report what went on. -- REW */ | ||
1292 | v = apic_read(APIC_ESR); | ||
1293 | apic_write(APIC_ESR, 0); | ||
1294 | v1 = apic_read(APIC_ESR); | ||
1295 | ack_APIC_irq(); | ||
1296 | atomic_inc(&irq_err_count); | ||
1297 | |||
1298 | /* Here is what the APIC error bits mean: | ||
1299 | 0: Send CS error | ||
1300 | 1: Receive CS error | ||
1301 | 2: Send accept error | ||
1302 | 3: Receive accept error | ||
1303 | 4: Reserved | ||
1304 | 5: Send illegal vector | ||
1305 | 6: Received illegal vector | ||
1306 | 7: Illegal register address | ||
1307 | */ | ||
1308 | printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", | ||
1309 | smp_processor_id(), v , v1); | ||
1310 | irq_exit(); | ||
1311 | } | ||
1312 | |||
1313 | /* | ||
1314 | * Initialize APIC interrupts | ||
1315 | */ | ||
1316 | void __init apic_intr_init(void) | ||
1317 | { | ||
1318 | #ifdef CONFIG_SMP | ||
1319 | smp_intr_init(); | ||
1320 | #endif | ||
1321 | /* self generated IPI for local APIC timer */ | ||
1322 | set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
1323 | |||
1324 | /* IPI vectors for APIC spurious and error interrupts */ | ||
1325 | set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
1326 | set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
1327 | |||
1328 | /* thermal monitor LVT interrupt */ | ||
1329 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
1330 | set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
1331 | #endif | ||
1332 | } | ||
1333 | |||
1334 | /** | ||
1335 | * connect_bsp_APIC - attach the APIC to the interrupt system | ||
1336 | */ | ||
1337 | void __init connect_bsp_APIC(void) | ||
1338 | { | ||
1339 | if (pic_mode) { | ||
1340 | /* | ||
1341 | * Do not trust the local APIC being empty at bootup. | ||
1342 | */ | ||
1343 | clear_local_APIC(); | ||
1344 | /* | ||
1345 | * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's | ||
1346 | * local APIC to INT and NMI lines. | ||
1347 | */ | ||
1348 | apic_printk(APIC_VERBOSE, "leaving PIC mode, " | ||
1349 | "enabling APIC mode.\n"); | ||
1350 | outb(0x70, 0x22); | ||
1351 | outb(0x01, 0x23); | ||
1352 | } | ||
1353 | enable_apic_mode(); | ||
1354 | } | ||
1355 | |||
1356 | /** | ||
1357 | * disconnect_bsp_APIC - detach the APIC from the interrupt system | ||
1358 | * @virt_wire_setup: indicates, whether virtual wire mode is selected | ||
1359 | * | ||
1360 | * Virtual wire mode is necessary to deliver legacy interrupts even when the | ||
1361 | * APIC is disabled. | ||
1362 | */ | ||
1363 | void disconnect_bsp_APIC(int virt_wire_setup) | ||
1364 | { | ||
1365 | if (pic_mode) { | ||
1366 | /* | ||
1367 | * Put the board back into PIC mode (has an effect only on | ||
1368 | * certain older boards). Note that APIC interrupts, including | ||
1369 | * IPIs, won't work beyond this point! The only exception are | ||
1370 | * INIT IPIs. | ||
1371 | */ | ||
1372 | apic_printk(APIC_VERBOSE, "disabling APIC mode, " | ||
1373 | "entering PIC mode.\n"); | ||
1374 | outb(0x70, 0x22); | ||
1375 | outb(0x00, 0x23); | ||
1376 | } else { | ||
1377 | /* Go back to Virtual Wire compatibility mode */ | ||
1378 | unsigned long value; | ||
1379 | |||
1380 | /* For the spurious interrupt use vector F, and enable it */ | ||
1381 | value = apic_read(APIC_SPIV); | ||
1382 | value &= ~APIC_VECTOR_MASK; | ||
1383 | value |= APIC_SPIV_APIC_ENABLED; | ||
1384 | value |= 0xf; | ||
1385 | apic_write_around(APIC_SPIV, value); | ||
1386 | |||
1387 | if (!virt_wire_setup) { | ||
1388 | /* | ||
1389 | * For LVT0 make it edge triggered, active high, | ||
1390 | * external and enabled | ||
1391 | */ | ||
1392 | value = apic_read(APIC_LVT0); | ||
1393 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
1394 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
1395 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); | ||
1396 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
1397 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); | ||
1398 | apic_write_around(APIC_LVT0, value); | ||
1399 | } else { | ||
1400 | /* Disable LVT0 */ | ||
1401 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED); | ||
1402 | } | ||
1403 | |||
1404 | /* | ||
1405 | * For LVT1 make it edge triggered, active high, nmi and | ||
1406 | * enabled | ||
1407 | */ | ||
1408 | value = apic_read(APIC_LVT1); | ||
1409 | value &= ~( | ||
1410 | APIC_MODE_MASK | APIC_SEND_PENDING | | ||
1411 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
1412 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
1413 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
1414 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | ||
1415 | apic_write_around(APIC_LVT1, value); | ||
1416 | } | ||
1417 | } | ||
1418 | |||
1419 | /* | ||
1420 | * Power management | ||
1421 | */ | ||
1422 | #ifdef CONFIG_PM | ||
1423 | |||
/*
 * Local APIC register values saved by lapic_suspend() and written back
 * by lapic_resume().
 */
static struct {
	/* Non-zero once apic_pm_activate() ran; suspend/resume do nothing otherwise. */
	int		active;

	/* r/w apic fields */
	unsigned int	apic_id;
	unsigned int	apic_taskpri;
	unsigned int	apic_ldr;
	unsigned int	apic_dfr;
	unsigned int	apic_spiv;
	unsigned int	apic_lvtt;
	unsigned int	apic_lvtpc;	/* only saved when maxlvt >= 4 */
	unsigned int	apic_lvt0;
	unsigned int	apic_lvt1;
	unsigned int	apic_lvterr;
	unsigned int	apic_tmict;
	unsigned int	apic_tdcr;
	unsigned int	apic_thmr;	/* only saved when maxlvt >= 5 (P4 thermal) */
} apic_pm_state;
1441 | |||
1442 | static int lapic_suspend(struct sys_device *dev, pm_message_t state) | ||
1443 | { | ||
1444 | unsigned long flags; | ||
1445 | int maxlvt; | ||
1446 | |||
1447 | if (!apic_pm_state.active) | ||
1448 | return 0; | ||
1449 | |||
1450 | maxlvt = lapic_get_maxlvt(); | ||
1451 | |||
1452 | apic_pm_state.apic_id = apic_read(APIC_ID); | ||
1453 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); | ||
1454 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); | ||
1455 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); | ||
1456 | apic_pm_state.apic_spiv = apic_read(APIC_SPIV); | ||
1457 | apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); | ||
1458 | if (maxlvt >= 4) | ||
1459 | apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); | ||
1460 | apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); | ||
1461 | apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); | ||
1462 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | ||
1463 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | ||
1464 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | ||
1465 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
1466 | if (maxlvt >= 5) | ||
1467 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | ||
1468 | #endif | ||
1469 | |||
1470 | local_irq_save(flags); | ||
1471 | disable_local_APIC(); | ||
1472 | local_irq_restore(flags); | ||
1473 | return 0; | ||
1474 | } | ||
1475 | |||
1476 | static int lapic_resume(struct sys_device *dev) | ||
1477 | { | ||
1478 | unsigned int l, h; | ||
1479 | unsigned long flags; | ||
1480 | int maxlvt; | ||
1481 | |||
1482 | if (!apic_pm_state.active) | ||
1483 | return 0; | ||
1484 | |||
1485 | maxlvt = lapic_get_maxlvt(); | ||
1486 | |||
1487 | local_irq_save(flags); | ||
1488 | |||
1489 | /* | ||
1490 | * Make sure the APICBASE points to the right address | ||
1491 | * | ||
1492 | * FIXME! This will be wrong if we ever support suspend on | ||
1493 | * SMP! We'll need to do this as part of the CPU restore! | ||
1494 | */ | ||
1495 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1496 | l &= ~MSR_IA32_APICBASE_BASE; | ||
1497 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | ||
1498 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
1499 | |||
1500 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | ||
1501 | apic_write(APIC_ID, apic_pm_state.apic_id); | ||
1502 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); | ||
1503 | apic_write(APIC_LDR, apic_pm_state.apic_ldr); | ||
1504 | apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); | ||
1505 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | ||
1506 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | ||
1507 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | ||
1508 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
1509 | if (maxlvt >= 5) | ||
1510 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | ||
1511 | #endif | ||
1512 | if (maxlvt >= 4) | ||
1513 | apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); | ||
1514 | apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); | ||
1515 | apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); | ||
1516 | apic_write(APIC_TMICT, apic_pm_state.apic_tmict); | ||
1517 | apic_write(APIC_ESR, 0); | ||
1518 | apic_read(APIC_ESR); | ||
1519 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | ||
1520 | apic_write(APIC_ESR, 0); | ||
1521 | apic_read(APIC_ESR); | ||
1522 | local_irq_restore(flags); | ||
1523 | return 0; | ||
1524 | } | ||
1525 | |||
1526 | /* | ||
1527 | * This device has no shutdown method - fully functioning local APICs | ||
1528 | * are needed on every CPU up until machine_halt/restart/poweroff. | ||
1529 | */ | ||
1530 | |||
1531 | static struct sysdev_class lapic_sysclass = { | ||
1532 | set_kset_name("lapic"), | ||
1533 | .resume = lapic_resume, | ||
1534 | .suspend = lapic_suspend, | ||
1535 | }; | ||
1536 | |||
1537 | static struct sys_device device_lapic = { | ||
1538 | .id = 0, | ||
1539 | .cls = &lapic_sysclass, | ||
1540 | }; | ||
1541 | |||
1542 | static void __devinit apic_pm_activate(void) | ||
1543 | { | ||
1544 | apic_pm_state.active = 1; | ||
1545 | } | ||
1546 | |||
1547 | static int __init init_lapic_sysfs(void) | ||
1548 | { | ||
1549 | int error; | ||
1550 | |||
1551 | if (!cpu_has_apic) | ||
1552 | return 0; | ||
1553 | /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ | ||
1554 | |||
1555 | error = sysdev_class_register(&lapic_sysclass); | ||
1556 | if (!error) | ||
1557 | error = sysdev_register(&device_lapic); | ||
1558 | return error; | ||
1559 | } | ||
1560 | device_initcall(init_lapic_sysfs); | ||
1561 | |||
1562 | #else /* CONFIG_PM */ | ||
1563 | |||
/* Without CONFIG_PM there is no saved state to arm: empty stub. */
static void apic_pm_activate(void)
{
}
1565 | |||
1566 | #endif /* CONFIG_PM */ | ||
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c new file mode 100644 index 000000000000..f02a8aca826b --- /dev/null +++ b/arch/x86/kernel/apm_32.c | |||
@@ -0,0 +1,2403 @@ | |||
1 | /* -*- linux-c -*- | ||
2 | * APM BIOS driver for Linux | ||
3 | * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au) | ||
4 | * | ||
5 | * Initial development of this driver was funded by NEC Australia P/L | ||
6 | * and NEC Corporation | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify it | ||
9 | * under the terms of the GNU General Public License as published by the | ||
10 | * Free Software Foundation; either version 2, or (at your option) any | ||
11 | * later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, but | ||
14 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * General Public License for more details. | ||
17 | * | ||
18 | * October 1995, Rik Faith (faith@cs.unc.edu): | ||
19 | * Minor enhancements and updates (to the patch set) for 1.3.x | ||
20 | * Documentation | ||
21 | * January 1996, Rik Faith (faith@cs.unc.edu): | ||
22 | * Make /proc/apm easy to format (bump driver version) | ||
23 | * March 1996, Rik Faith (faith@cs.unc.edu): | ||
24 | * Prohibit APM BIOS calls unless apm_enabled. | ||
25 | * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>) | ||
26 | * April 1996, Stephen Rothwell (sfr@canb.auug.org.au) | ||
27 | * Version 1.0 and 1.1 | ||
28 | * May 1996, Version 1.2 | ||
29 | * Feb 1998, Version 1.3 | ||
30 | * Feb 1998, Version 1.4 | ||
31 | * Aug 1998, Version 1.5 | ||
32 | * Sep 1998, Version 1.6 | ||
33 | * Nov 1998, Version 1.7 | ||
34 | * Jan 1999, Version 1.8 | ||
35 | * Jan 1999, Version 1.9 | ||
36 | * Oct 1999, Version 1.10 | ||
37 | * Nov 1999, Version 1.11 | ||
38 | * Jan 2000, Version 1.12 | ||
39 | * Feb 2000, Version 1.13 | ||
40 | * Nov 2000, Version 1.14 | ||
41 | * Oct 2001, Version 1.15 | ||
42 | * Jan 2002, Version 1.16 | ||
43 | * Oct 2002, Version 1.16ac | ||
44 | * | ||
45 | * History: | ||
46 | * 0.6b: first version in official kernel, Linux 1.3.46 | ||
47 | * 0.7: changed /proc/apm format, Linux 1.3.58 | ||
48 | * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59 | ||
49 | * 0.9: only call bios if bios is present, Linux 1.3.72 | ||
50 | * 1.0: use fixed device number, consolidate /proc/apm into this file, | ||
51 | * Linux 1.3.85 | ||
52 | * 1.1: support user-space standby and suspend, power off after system | ||
53 | * halted, Linux 1.3.98 | ||
54 | * 1.2: When resetting RTC after resume, take care so that the time | ||
55 | * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth | ||
56 | * <jtoth@princeton.edu>); improve interaction between | ||
57 | * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 | ||
58 | * 1.2a:Simple change to stop mysterious bug reports with SMP also added | ||
59 | * levels to the printk calls. APM is not defined for SMP machines. | ||
60 | * The new replacement for it is, but Linux doesn't yet support this. | ||
61 | * Alan Cox Linux 2.1.55 | ||
62 | * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's | ||
63 | * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by | ||
64 | * Dean Gaudet <dgaudet@arctic.org>. | ||
65 | * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87 | ||
66 | * 1.5: Fix segment register reloading (in case of bad segments saved | ||
67 | * across BIOS call). | ||
68 | * Stephen Rothwell | ||
69 | * 1.6: Cope with compiler/assembler differences. | ||
70 | * Only try to turn off the first display device. | ||
71 | * Fix OOPS at power off with no APM BIOS by Jan Echternach | ||
72 | * <echter@informatik.uni-rostock.de> | ||
73 | * Stephen Rothwell | ||
74 | * 1.7: Modify driver's cached copy of the disabled/disengaged flags | ||
75 | * to reflect current state of APM BIOS. | ||
76 | * Chris Rankin <rankinc@bellsouth.net> | ||
77 | * Reset interrupt 0 timer to 100Hz after suspend | ||
78 | * Chad Miller <cmiller@surfsouth.com> | ||
79 | * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE | ||
80 | * Richard Gooch <rgooch@atnf.csiro.au> | ||
81 | * Allow boot time disabling of APM | ||
82 | * Make boot messages far less verbose by default | ||
83 | * Make asm safer | ||
84 | * Stephen Rothwell | ||
85 | * 1.8: Add CONFIG_APM_RTC_IS_GMT | ||
86 | * Richard Gooch <rgooch@atnf.csiro.au> | ||
87 | * change APM_NOINTS to CONFIG_APM_ALLOW_INTS | ||
88 | * remove dependency on CONFIG_PROC_FS | ||
89 | * Stephen Rothwell | ||
90 | * 1.9: Fix small typo. <laslo@wodip.opole.pl> | ||
91 | * Try to cope with BIOS's that need to have all display | ||
92 | * devices blanked and not just the first one. | ||
93 | * Ross Paterson <ross@soi.city.ac.uk> | ||
94 | * Fix segment limit setting it has always been wrong as | ||
95 | * the segments needed to have byte granularity. | ||
96 | * Mark a few things __init. | ||
97 | * Add hack to allow power off of SMP systems by popular request. | ||
98 | * Use CONFIG_SMP instead of __SMP__ | ||
99 | * Ignore BOUNCES for three seconds. | ||
100 | * Stephen Rothwell | ||
101 | * 1.10: Fix for Thinkpad return code. | ||
102 | * Merge 2.2 and 2.3 drivers. | ||
103 | * Remove APM dependencies in arch/i386/kernel/process.c | ||
104 | * Remove APM dependencies in drivers/char/sysrq.c | ||
105 | * Reset time across standby. | ||
106 | * Allow more initialisation on SMP. | ||
107 | * Remove CONFIG_APM_POWER_OFF and make it boot time | ||
108 | * configurable (default on). | ||
109 | * Make debug only a boot time parameter (remove APM_DEBUG). | ||
110 | * Try to blank all devices on any error. | ||
111 | * 1.11: Remove APM dependencies in drivers/char/console.c | ||
112 | * Check nr_running to detect if we are idle (from | ||
113 | * Borislav Deianov <borislav@lix.polytechnique.fr>) | ||
114 | * Fix for bioses that don't zero the top part of the | ||
115 | * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>) | ||
116 | * (reported by Panos Katsaloulis <teras@writeme.com>). | ||
117 | * Real mode power off patch (Walter Hofmann | ||
118 | * <Walter.Hofmann@physik.stud.uni-erlangen.de>). | ||
119 | * 1.12: Remove CONFIG_SMP as the compiler will optimize | ||
120 | * the code away anyway (smp_num_cpus == 1 in UP) | ||
121 | * noted by Artur Skawina <skawina@geocities.com>. | ||
122 | * Make power off under SMP work again. | ||
123 | * Fix thinko with initial engaging of BIOS. | ||
124 | * Make sure power off only happens on CPU 0 | ||
125 | * (Paul "Rusty" Russell <rusty@rustcorp.com.au>). | ||
126 | * Do error notification to user mode if BIOS calls fail. | ||
127 | * Move entrypoint offset fix to ...boot/setup.S | ||
128 | * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>). | ||
129 | * Remove smp-power-off. SMP users must now specify | ||
130 | * "apm=power-off" on the kernel command line. Suggested | ||
131 | * by Jim Avera <jima@hal.com>, modified by Alan Cox | ||
132 | * <alan@lxorguk.ukuu.org.uk>. | ||
133 | * Register the /proc/apm entry even on SMP so that | ||
134 | * scripts that check for it before doing power off | ||
135 | * work (Jim Avera <jima@hal.com>). | ||
136 | * 1.13: Changes for new pm_ interfaces (Andy Henroid | ||
137 | * <andy_henroid@yahoo.com>). | ||
138 | * Modularize the code. | ||
139 | * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS | ||
140 | * is now the way life works). | ||
141 | * Fix thinko in suspend() (wrong return). | ||
142 | * Notify drivers on critical suspend. | ||
143 | * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> | ||
144 | * modified by sfr). | ||
145 | * Disable interrupts while we are suspended (Andy Henroid | ||
146 | * <andy_henroid@yahoo.com> fixed by sfr). | ||
147 | * Make power off work on SMP again (Tony Hoyle | ||
148 | * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr. | ||
149 | * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore | ||
150 | * interval is now configurable. | ||
151 | * 1.14: Make connection version persist across module unload/load. | ||
152 | * Enable and engage power management earlier. | ||
153 | * Disengage power management on module unload. | ||
154 | * Changed to use the sysrq-register hack for registering the | ||
155 | * power off function called by magic sysrq based upon discussions | ||
156 | * in irc://irc.openprojects.net/#kernelnewbies | ||
157 | * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>). | ||
158 | * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable. | ||
159 | * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr. | ||
160 | * Work around byte swap bug in one of the Vaio's BIOS's | ||
161 | * (Marc Boucher <marc@mbsi.ca>). | ||
162 | * Exposed the disable flag to dmi so that we can handle known | ||
163 | * broken APM (Alan Cox <alan@redhat.com>). | ||
164 | * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin | ||
165 | * calling it - instead idle. (Alan Cox <alan@redhat.com>) | ||
166 | * If an APM idle fails log it and idle sensibly | ||
167 | * 1.15: Don't queue events to clients who open the device O_WRONLY. | ||
168 | * Don't expect replies from clients who open the device O_RDONLY. | ||
169 | * (Idea from Thomas Hood) | ||
170 | * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>) | ||
171 | * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.) | ||
172 | * Notify listeners of standby or suspend events before notifying | ||
173 | * drivers. Return EBUSY to ioctl() if suspend is rejected. | ||
174 | * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood) | ||
175 | * Ignore first resume after we generate our own resume event | ||
176 | * after a suspend (Thomas Hood) | ||
177 | * Daemonize now gets rid of our controlling terminal (sfr). | ||
178 | * CONFIG_APM_CPU_IDLE now just affects the default value of | ||
179 | * idle_threshold (sfr). | ||
180 | * Change name of kernel apm daemon (as it no longer idles) (sfr). | ||
181 | * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we | ||
182 | * make _all_ APM calls on the CPU#0. Fix unsafe sign bug. | ||
183 | * TODO: determine if it's "boot CPU" or "CPU0" we want to lock to. | ||
184 | * | ||
185 | * APM 1.1 Reference: | ||
186 | * | ||
187 | * Intel Corporation, Microsoft Corporation. Advanced Power Management | ||
188 | * (APM) BIOS Interface Specification, Revision 1.1, September 1993. | ||
189 | * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. | ||
190 | * | ||
191 | * [This document is available free from Intel by calling 800.628.8686 (fax | ||
192 | * 916.356.6100) or 800.548.4725; or via anonymous ftp from | ||
193 | * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also | ||
194 | * available from Microsoft by calling 206.882.8080.] | ||
195 | * | ||
196 | * APM 1.2 Reference: | ||
197 | * Intel Corporation, Microsoft Corporation. Advanced Power Management | ||
198 | * (APM) BIOS Interface Specification, Revision 1.2, February 1996. | ||
199 | * | ||
200 | * [This document is available from Microsoft at: | ||
201 | * http://www.microsoft.com/whdc/archive/amp_12.mspx] | ||
202 | */ | ||
203 | |||
204 | #include <linux/module.h> | ||
205 | |||
206 | #include <linux/poll.h> | ||
207 | #include <linux/types.h> | ||
208 | #include <linux/stddef.h> | ||
209 | #include <linux/timer.h> | ||
210 | #include <linux/fcntl.h> | ||
211 | #include <linux/slab.h> | ||
212 | #include <linux/stat.h> | ||
213 | #include <linux/proc_fs.h> | ||
214 | #include <linux/seq_file.h> | ||
215 | #include <linux/miscdevice.h> | ||
216 | #include <linux/apm_bios.h> | ||
217 | #include <linux/init.h> | ||
218 | #include <linux/time.h> | ||
219 | #include <linux/sched.h> | ||
220 | #include <linux/pm.h> | ||
221 | #include <linux/pm_legacy.h> | ||
222 | #include <linux/capability.h> | ||
223 | #include <linux/device.h> | ||
224 | #include <linux/kernel.h> | ||
225 | #include <linux/freezer.h> | ||
226 | #include <linux/smp.h> | ||
227 | #include <linux/dmi.h> | ||
228 | #include <linux/suspend.h> | ||
229 | #include <linux/kthread.h> | ||
230 | |||
231 | #include <asm/system.h> | ||
232 | #include <asm/uaccess.h> | ||
233 | #include <asm/desc.h> | ||
234 | #include <asm/i8253.h> | ||
235 | #include <asm/paravirt.h> | ||
236 | #include <asm/reboot.h> | ||
237 | |||
238 | #include "io_ports.h" | ||
239 | |||
240 | #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) | ||
241 | extern int (*console_blank_hook)(int); | ||
242 | #endif | ||
243 | |||
244 | /* | ||
245 | * The apm_bios device is one of the misc char devices. | ||
246 | * This is its minor number. | ||
247 | */ | ||
248 | #define APM_MINOR_DEV 134 | ||
249 | |||
250 | /* | ||
251 | * See Documentation/Config.help for the configuration options. | ||
252 | * | ||
253 | * Various options can be changed at boot time as follows: | ||
254 | * (We allow underscores for compatibility with the modules code) | ||
255 | * apm=on/off enable/disable APM | ||
256 | * [no-]allow[-_]ints allow interrupts during BIOS calls | ||
257 | * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call | ||
258 | * [no-]realmode[-_]power[-_]off switch to real mode before | ||
259 | * powering off | ||
260 | * [no-]debug log some debugging messages | ||
261 | * [no-]power[-_]off power off on shutdown | ||
262 | * [no-]smp Use apm even on an SMP box | ||
263 | * bounce[-_]interval=<n> number of ticks to ignore suspend | ||
264 | * bounces | ||
265 | * idle[-_]threshold=<n> System idle percentage above which to | ||
266 | * make APM BIOS idle calls. Set it to | ||
267 | * 100 to disable. | ||
268 | * idle[-_]period=<n> Period (in 1/100s of a second) over | ||
269 | * which the idle percentage is | ||
270 | * calculated. | ||
271 | */ | ||
272 | |||
273 | /* KNOWN PROBLEM MACHINES: | ||
274 | * | ||
275 | * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant | ||
276 | * [Confirmed by TI representative] | ||
277 | * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification | ||
278 | * [Confirmed by BIOS disassembly] | ||
279 | * [This may work now ...] | ||
280 | * P: Toshiba 1950S: battery life information only gets updated after resume | ||
281 | * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking | ||
282 | * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>] | ||
283 | * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP | ||
284 | * Neale Banks <neale@lowendale.com.au> December 2000 | ||
285 | * | ||
286 | * Legend: U = unusable with APM patches | ||
287 | * P = partially usable with APM patches | ||
288 | */ | ||
289 | |||
290 | /* | ||
291 | * Define as 1 to make the driver always call the APM BIOS busy | ||
292 | * routine even if the clock was not reported as slowed by the | ||
293 | * idle routine. Otherwise, define as 0. | ||
294 | */ | ||
295 | #define ALWAYS_CALL_BUSY 1 | ||
296 | |||
297 | /* | ||
298 | * Define to make the APM BIOS calls zero all data segment registers (so | ||
299 | * that an incorrect BIOS implementation will cause a kernel panic if it | ||
300 | * tries to write to arbitrary memory). | ||
301 | */ | ||
302 | #define APM_ZERO_SEGS | ||
303 | |||
304 | #include "apm.h" | ||
305 | |||
306 | /* | ||
307 | * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. | ||
308 | * This patched by Chad Miller <cmiller@surfsouth.com>, original code by | ||
309 | * David Chen <chen@ctpa04.mit.edu> | ||
310 | */ | ||
311 | #undef INIT_TIMER_AFTER_SUSPEND | ||
312 | |||
313 | #ifdef INIT_TIMER_AFTER_SUSPEND | ||
314 | #include <linux/timex.h> | ||
315 | #include <asm/io.h> | ||
316 | #include <linux/delay.h> | ||
317 | #endif | ||
318 | |||
319 | /* | ||
320 | * Need to poll the APM BIOS every second | ||
321 | */ | ||
322 | #define APM_CHECK_TIMEOUT (HZ) | ||
323 | |||
324 | /* | ||
325 | * Ignore suspend events for this amount of time after a resume | ||
326 | */ | ||
327 | #define DEFAULT_BOUNCE_INTERVAL (3 * HZ) | ||
328 | |||
329 | /* | ||
330 | * Maximum number of events stored | ||
331 | */ | ||
332 | #define APM_MAX_EVENTS 20 | ||
333 | |||
334 | /* | ||
335 | * The per-file APM data | ||
336 | */ | ||
337 | struct apm_user { | ||
338 | int magic; | ||
339 | struct apm_user * next; | ||
340 | unsigned int suser: 1; | ||
341 | unsigned int writer: 1; | ||
342 | unsigned int reader: 1; | ||
343 | unsigned int suspend_wait: 1; | ||
344 | int suspend_result; | ||
345 | int suspends_pending; | ||
346 | int standbys_pending; | ||
347 | int suspends_read; | ||
348 | int standbys_read; | ||
349 | int event_head; | ||
350 | int event_tail; | ||
351 | apm_event_t events[APM_MAX_EVENTS]; | ||
352 | }; | ||
353 | |||
354 | /* | ||
355 | * The magic number in apm_user | ||
356 | */ | ||
357 | #define APM_BIOS_MAGIC 0x4101 | ||
358 | |||
359 | /* | ||
360 | * idle percentage above which bios idle calls are done | ||
361 | */ | ||
362 | #ifdef CONFIG_APM_CPU_IDLE | ||
363 | #define DEFAULT_IDLE_THRESHOLD 95 | ||
364 | #else | ||
365 | #define DEFAULT_IDLE_THRESHOLD 100 | ||
366 | #endif | ||
367 | #define DEFAULT_IDLE_PERIOD (100 / 3) | ||
368 | |||
369 | /* | ||
370 | * Local variables | ||
371 | */ | ||
372 | static struct { | ||
373 | unsigned long offset; | ||
374 | unsigned short segment; | ||
375 | } apm_bios_entry; | ||
376 | static int clock_slowed; | ||
377 | static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; | ||
378 | static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; | ||
379 | static int set_pm_idle; | ||
380 | static int suspends_pending; | ||
381 | static int standbys_pending; | ||
382 | static int ignore_sys_suspend; | ||
383 | static int ignore_normal_resume; | ||
384 | static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; | ||
385 | |||
386 | static int debug __read_mostly; | ||
387 | static int smp __read_mostly; | ||
388 | static int apm_disabled = -1; | ||
389 | #ifdef CONFIG_SMP | ||
390 | static int power_off; | ||
391 | #else | ||
392 | static int power_off = 1; | ||
393 | #endif | ||
394 | #ifdef CONFIG_APM_REAL_MODE_POWER_OFF | ||
395 | static int realmode_power_off = 1; | ||
396 | #else | ||
397 | static int realmode_power_off; | ||
398 | #endif | ||
399 | #ifdef CONFIG_APM_ALLOW_INTS | ||
400 | static int allow_ints = 1; | ||
401 | #else | ||
402 | static int allow_ints; | ||
403 | #endif | ||
404 | static int broken_psr; | ||
405 | |||
406 | static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); | ||
407 | static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); | ||
408 | static struct apm_user * user_list; | ||
409 | static DEFINE_SPINLOCK(user_list_lock); | ||
410 | static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; | ||
411 | |||
412 | static const char driver_version[] = "1.16ac"; /* no spaces */ | ||
413 | |||
414 | static struct task_struct *kapmd_task; | ||
415 | |||
/*
 * Human-readable names for APM events, taken from the APM 1.2
 * specification.  These are the message codes that the BIOS uses to
 * tell us about events.
 */
static const char * const apm_event_name[] = {
	[0]	= "system standby",
	[1]	= "system suspend",
	[2]	= "normal resume",
	[3]	= "critical resume",
	[4]	= "low battery",
	[5]	= "power status change",
	[6]	= "update time",
	[7]	= "critical suspend",
	[8]	= "user standby",
	[9]	= "user suspend",
	[10]	= "system standby resume",
	[11]	= "capabilities change"
};
#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
435 | |||
/* One key -> message pair of a lookup table. */
typedef struct lookup_t {
	int	key;
	char	*msg;
} lookup_t;
440 | |||
/*
 * The BIOS returns a set of standard error codes in AX when the
 * carry flag is set.
 *
 * apm_error() searches this table linearly for a matching key. Entries
 * marked N/A are deliberately left out of the table.
 */

static const lookup_t error_table[] = {
/* N/A	{ APM_SUCCESS,		"Operation succeeded" }, */
	{ APM_DISABLED,		"Power management disabled" },
	{ APM_CONNECTED,	"Real mode interface already connected" },
	{ APM_NOT_CONNECTED,	"Interface not connected" },
	{ APM_16_CONNECTED,	"16 bit interface already connected" },
/* N/A	{ APM_16_UNSUPPORTED,	"16 bit interface not supported" }, */
	{ APM_32_CONNECTED,	"32 bit interface already connected" },
	{ APM_32_UNSUPPORTED,	"32 bit interface not supported" },
	{ APM_BAD_DEVICE,	"Unrecognized device ID" },
	{ APM_BAD_PARAM,	"Parameter out of range" },
	{ APM_NOT_ENGAGED,	"Interface not engaged" },
	{ APM_BAD_FUNCTION,	"Function not supported" },
	{ APM_RESUME_DISABLED,	"Resume timer disabled" },
	{ APM_BAD_STATE,	"Unable to enter requested state" },
/* N/A	{ APM_NO_EVENTS,	"No events pending" }, */
	{ APM_NO_ERROR,		"BIOS did not set a return code" },
	{ APM_NOT_PRESENT,	"No APM present" }
};
/* Number of entries in error_table. */
#define ERROR_COUNT	ARRAY_SIZE(error_table)
466 | |||
467 | /** | ||
468 | * apm_error - display an APM error | ||
469 | * @str: information string | ||
470 | * @err: APM BIOS return code | ||
471 | * | ||
472 | * Write a meaningful log entry to the kernel log in the event of | ||
473 | * an APM error. | ||
474 | */ | ||
475 | |||
476 | static void apm_error(char *str, int err) | ||
477 | { | ||
478 | int i; | ||
479 | |||
480 | for (i = 0; i < ERROR_COUNT; i++) | ||
481 | if (error_table[i].key == err) break; | ||
482 | if (i < ERROR_COUNT) | ||
483 | printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); | ||
484 | else | ||
485 | printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", | ||
486 | str, err); | ||
487 | } | ||
488 | |||
/*
 * APM BIOS calls are made from physical CPU 0 only.
 */

#ifdef CONFIG_SMP

/*
 * Record the current task's allowed-CPU mask, restrict the task to CPU 0
 * and return the recorded mask so the caller can hand it back to
 * apm_restore_cpus().
 */
static cpumask_t apm_save_cpus(void)
{
	cpumask_t previous = current->cpus_allowed;

	/* Some bioses don't like being called from CPU != 0 */
	set_cpus_allowed(current, cpumask_of_cpu(0));
	BUG_ON(smp_processor_id() != 0);

	return previous;
}

/* Re-apply an affinity mask obtained from apm_save_cpus(). */
static inline void apm_restore_cpus(cpumask_t mask)
{
	set_cpus_allowed(current, mask);
}

#else

/*
 * Uniprocessor build: there is nothing to pin, so saving just reads the
 * mask and restoring evaluates its argument and discards it.
 */

#define apm_save_cpus()		(current->cpus_allowed)
#define apm_restore_cpus(x)	(void)(x)

#endif
519 | |||
520 | /* | ||
521 | * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and | ||
522 | * apm_info.allow_ints, we are being really paranoid here! Not only | ||
523 | * are interrupts disabled, but all the segment registers (except SS) | ||
524 | * are saved and zeroed; this means that if the BIOS tries to reference | ||
525 | * any data without explicitly loading the segment registers, the kernel | ||
526 | * will fault immediately rather than have some unforeseen circumstances | ||
527 | * for the rest of the kernel. And it will be very obvious! :-) Doing | ||
528 | * this depends on CS referring to the same physical memory as DS so that | ||
529 | * DS can be zeroed before the call. Unfortunately, we can't do anything | ||
530 | * about the stack segment/pointer. Also, we tell the compiler that | ||
531 | * everything could change. | ||
532 | * | ||
533 | * Also, we KNOW that for the non error case of apm_bios_call, there | ||
534 | * is no useful data returned in the low order 8 bits of eax. | ||
535 | */ | ||
536 | |||
537 | static inline unsigned long __apm_irq_save(void) | ||
538 | { | ||
539 | unsigned long flags; | ||
540 | local_save_flags(flags); | ||
541 | if (apm_info.allow_ints) { | ||
542 | if (irqs_disabled_flags(flags)) | ||
543 | local_irq_enable(); | ||
544 | } else | ||
545 | local_irq_disable(); | ||
546 | |||
547 | return flags; | ||
548 | } | ||
549 | |||
550 | #define apm_irq_save(flags) \ | ||
551 | do { flags = __apm_irq_save(); } while (0) | ||
552 | |||
/*
 * Return to the interrupt state described by @flags (as captured by
 * apm_irq_save()): disable interrupts if they were off at that time,
 * otherwise enable them if they are currently off.
 */
static inline void apm_irq_restore(unsigned long flags)
{
	if (irqs_disabled_flags(flags)) {
		local_irq_disable();
		return;
	}

	if (irqs_disabled())
		local_irq_enable();
}
560 | |||
/*
 * Helpers used by the BIOS call wrappers to preserve the %fs and %gs
 * selectors across a call. They expand to real code only when
 * APM_ZERO_SEGS is defined; otherwise all three are empty.
 *
 * APM_DECL_SEGS declares the locals, so it must appear among the
 * declarations of the calling function and is used without a trailing
 * semicolon.
 */
#ifdef APM_ZERO_SEGS
#	define APM_DECL_SEGS \
		unsigned int saved_fs; unsigned int saved_gs;
#	define APM_DO_SAVE_SEGS \
		savesegment(fs, saved_fs); savesegment(gs, saved_gs)
#	define APM_DO_RESTORE_SEGS \
		loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
#else
#	define APM_DECL_SEGS
#	define APM_DO_SAVE_SEGS
#	define APM_DO_RESTORE_SEGS
#endif
573 | |||
/**
 *	apm_bios_call	-	Make an APM BIOS 32bit call
 *	@func: APM function to execute
 *	@ebx_in: EBX register for call entry
 *	@ecx_in: ECX register for call entry
 *	@eax: EAX register return
 *	@ebx: EBX register return
 *	@ecx: ECX register return
 *	@edx: EDX register return
 *	@esi: ESI register return
 *
 *	Make an APM call using the 32bit protected mode interface. The
 *	caller is responsible for knowing if APM BIOS is configured and
 *	enabled. This call can disable interrupts for a long period of
 *	time on some laptops.  The return value is in AH and the carry
 *	flag is loaded into AL.  If there is an error, then the error
 *	code is returned in AH (bits 8-15 of eax) and this function
 *	returns non-zero.
 *
 *	The function's own return value is the low byte of *@eax.
 */

static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
	u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
{
	APM_DECL_SEGS
	unsigned long		flags;
	cpumask_t		cpus;
	int			cpu;
	struct desc_struct	save_desc_40;
	struct desc_struct	*gdt;

	/* Pin the task to CPU 0 (SMP builds) and remember the old mask. */
	cpus = apm_save_cpus();

	/*
	 * Swap GDT entry 0x40 / 8 of this CPU for bad_bios_desc while the
	 * BIOS runs; the original descriptor is put back afterwards.
	 */
	cpu = get_cpu();
	gdt = get_cpu_gdt_table(cpu);
	save_desc_40 = gdt[0x40 / 8];
	gdt[0x40 / 8] = bad_bios_desc;

	/* Interrupt state and segment registers are set up last ... */
	apm_irq_save(flags);
	APM_DO_SAVE_SEGS;
	apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
	/* ... and undone in the reverse order. */
	APM_DO_RESTORE_SEGS;
	apm_irq_restore(flags);
	gdt[0x40 / 8] = save_desc_40;
	put_cpu();
	apm_restore_cpus(cpus);

	return *eax & 0xff;
}
622 | |||
/**
 *	apm_bios_call_simple	-	make a simple APM BIOS 32bit call
 *	@func: APM function to invoke
 *	@ebx_in: EBX register value for BIOS call
 *	@ecx_in: ECX register value for BIOS call
 *	@eax: EAX register on return from the BIOS call
 *
 *	Make a BIOS call that returns one value only, or just status.
 *	If there is an error, then the error code is returned in AH
 *	(bits 8-15 of eax) and this function returns non-zero. This is
 *	used for simpler BIOS operations. This call may hold interrupts
 *	off for a long time on some laptops.
 *
 *	The return value is whatever apm_bios_call_simple_asm() reports.
 */

static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
{
	u8			error;
	APM_DECL_SEGS
	unsigned long		flags;
	cpumask_t		cpus;
	int			cpu;
	struct desc_struct	save_desc_40;
	struct desc_struct	*gdt;

	/* Pin the task to CPU 0 (SMP builds) and remember the old mask. */
	cpus = apm_save_cpus();

	/*
	 * Swap GDT entry 0x40 / 8 of this CPU for bad_bios_desc while the
	 * BIOS runs; the original descriptor is put back afterwards.
	 */
	cpu = get_cpu();
	gdt = get_cpu_gdt_table(cpu);
	save_desc_40 = gdt[0x40 / 8];
	gdt[0x40 / 8] = bad_bios_desc;

	/* Set up interrupt state and segments, call, then undo in reverse. */
	apm_irq_save(flags);
	APM_DO_SAVE_SEGS;
	error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
	APM_DO_RESTORE_SEGS;
	apm_irq_restore(flags);
	gdt[0x40 / 8] = save_desc_40;
	put_cpu();
	apm_restore_cpus(cpus);
	return error;
}
664 | |||
665 | /** | ||
666 | * apm_driver_version - APM driver version | ||
667 | * @val: loaded with the APM version on return | ||
668 | * | ||
669 | * Retrieve the APM version supported by the BIOS. This is only | ||
670 | * supported for APM 1.1 or higher. An error indicates APM 1.0 is | ||
671 | * probably present. | ||
672 | * | ||
673 | * On entry val should point to a value indicating the APM driver | ||
674 | * version with the high byte being the major and the low byte the | ||
675 | * minor number both in BCD | ||
676 | * | ||
677 | * On return it will hold the BIOS revision supported in the | ||
678 | * same format. | ||
679 | */ | ||
680 | |||
681 | static int apm_driver_version(u_short *val) | ||
682 | { | ||
683 | u32 eax; | ||
684 | |||
685 | if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) | ||
686 | return (eax >> 8) & 0xff; | ||
687 | *val = eax; | ||
688 | return APM_SUCCESS; | ||
689 | } | ||
690 | |||
691 | /** | ||
692 | * apm_get_event - get an APM event from the BIOS | ||
693 | * @event: pointer to the event | ||
694 | * @info: point to the event information | ||
695 | * | ||
696 | * The APM BIOS provides a polled information for event | ||
697 | * reporting. The BIOS expects to be polled at least every second | ||
698 | * when events are pending. When a message is found the caller should | ||
699 | * poll until no more messages are present. However, this causes | ||
700 | * problems on some laptops where a suspend event notification is | ||
701 | * not cleared until it is acknowledged. | ||
702 | * | ||
703 | * Additional information is returned in the info pointer, providing | ||
704 | * that APM 1.2 is in use. If no messges are pending the value 0x80 | ||
705 | * is returned (No power management events pending). | ||
706 | */ | ||
707 | |||
708 | static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) | ||
709 | { | ||
710 | u32 eax; | ||
711 | u32 ebx; | ||
712 | u32 ecx; | ||
713 | u32 dummy; | ||
714 | |||
715 | if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, | ||
716 | &dummy, &dummy)) | ||
717 | return (eax >> 8) & 0xff; | ||
718 | *event = ebx; | ||
719 | if (apm_info.connection_version < 0x0102) | ||
720 | *info = ~0; /* indicate info not valid */ | ||
721 | else | ||
722 | *info = ecx; | ||
723 | return APM_SUCCESS; | ||
724 | } | ||
725 | |||
726 | /** | ||
727 | * set_power_state - set the power management state | ||
728 | * @what: which items to transition | ||
729 | * @state: state to transition to | ||
730 | * | ||
731 | * Request an APM change of state for one or more system devices. The | ||
732 | * processor state must be transitioned last of all. what holds the | ||
733 | * class of device in the upper byte and the device number (0xFF for | ||
734 | * all) for the object to be transitioned. | ||
735 | * | ||
736 | * The state holds the state to transition to, which may in fact | ||
737 | * be an acceptance of a BIOS requested state change. | ||
738 | */ | ||
739 | |||
740 | static int set_power_state(u_short what, u_short state) | ||
741 | { | ||
742 | u32 eax; | ||
743 | |||
744 | if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) | ||
745 | return (eax >> 8) & 0xff; | ||
746 | return APM_SUCCESS; | ||
747 | } | ||
748 | |||
749 | /** | ||
750 | * set_system_power_state - set system wide power state | ||
751 | * @state: which state to enter | ||
752 | * | ||
753 | * Transition the entire system into a new APM power state. | ||
754 | */ | ||
755 | |||
756 | static int set_system_power_state(u_short state) | ||
757 | { | ||
758 | return set_power_state(APM_DEVICE_ALL, state); | ||
759 | } | ||
760 | |||
761 | /** | ||
762 | * apm_do_idle - perform power saving | ||
763 | * | ||
764 | * This function notifies the BIOS that the processor is (in the view | ||
765 | * of the OS) idle. It returns -1 in the event that the BIOS refuses | ||
766 | * to handle the idle request. On a success the function returns 1 | ||
767 | * if the BIOS did clock slowing or 0 otherwise. | ||
768 | */ | ||
769 | |||
770 | static int apm_do_idle(void) | ||
771 | { | ||
772 | u32 eax; | ||
773 | u8 ret = 0; | ||
774 | int idled = 0; | ||
775 | int polling; | ||
776 | |||
777 | polling = !!(current_thread_info()->status & TS_POLLING); | ||
778 | if (polling) { | ||
779 | current_thread_info()->status &= ~TS_POLLING; | ||
780 | /* | ||
781 | * TS_POLLING-cleared state must be visible before we | ||
782 | * test NEED_RESCHED: | ||
783 | */ | ||
784 | smp_mb(); | ||
785 | } | ||
786 | if (!need_resched()) { | ||
787 | idled = 1; | ||
788 | ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); | ||
789 | } | ||
790 | if (polling) | ||
791 | current_thread_info()->status |= TS_POLLING; | ||
792 | |||
793 | if (!idled) | ||
794 | return 0; | ||
795 | |||
796 | if (ret) { | ||
797 | static unsigned long t; | ||
798 | |||
799 | /* This always fails on some SMP boards running UP kernels. | ||
800 | * Only report the failure the first 5 times. | ||
801 | */ | ||
802 | if (++t < 5) | ||
803 | { | ||
804 | printk(KERN_DEBUG "apm_do_idle failed (%d)\n", | ||
805 | (eax >> 8) & 0xff); | ||
806 | t = jiffies; | ||
807 | } | ||
808 | return -1; | ||
809 | } | ||
810 | clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0; | ||
811 | return clock_slowed; | ||
812 | } | ||
813 | |||
814 | /** | ||
815 | * apm_do_busy - inform the BIOS the CPU is busy | ||
816 | * | ||
817 | * Request that the BIOS brings the CPU back to full performance. | ||
818 | */ | ||
819 | |||
820 | static void apm_do_busy(void) | ||
821 | { | ||
822 | u32 dummy; | ||
823 | |||
824 | if (clock_slowed || ALWAYS_CALL_BUSY) { | ||
825 | (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); | ||
826 | clock_slowed = 0; | ||
827 | } | ||
828 | } | ||
829 | |||
/*
 * If no process has really been interested in
 * the CPU for some time, we want to call BIOS
 * power management - we probably want
 * to conserve power.
 */
/*
 * If more than this many jiffies passed since the last idle statistics
 * check, apm_cpu_idle() discards the statistics and turns APM idling off.
 */
#define IDLE_CALC_LIMIT   (HZ * 100)
/* Initial / refill value of the retry bucket used in apm_cpu_idle(). */
#define IDLE_LEAKY_MAX    16

/*
 * Idle routine that apm_cpu_idle() chains to; when it is NULL,
 * default_idle() is used instead.
 */
static void (*original_pm_idle)(void) __read_mostly;
840 | |||
/**
 * apm_cpu_idle		-	cpu idling for APM capable Linux
 *
 * This is the idling function the kernel executes when APM is available. It
 * tries to do BIOS powermanagement based on the average system idle time.
 * Furthermore it calls the system default idle routine.
 *
 * The decision whether to use the BIOS idle call is kept in static state
 * and re-evaluated whenever more than idle_period jiffies have elapsed
 * since the previous evaluation.
 */

static void apm_cpu_idle(void)
{
	static int use_apm_idle; /* = 0 */
	static unsigned int last_jiffies; /* = 0 */
	static unsigned int last_stime; /* = 0 */

	int apm_idle_done = 0;
	unsigned int jiffies_since_last_check = jiffies - last_jiffies;
	unsigned int bucket;

recalc:
	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
		/* Statistics are too old: start over with APM idling off. */
		use_apm_idle = 0;
		last_jiffies = jiffies;
		last_stime = current->stime;
	} else if (jiffies_since_last_check > idle_period) {
		unsigned int idle_percentage;

		/*
		 * Share of the elapsed jiffies that was accounted to this
		 * task's stime, in percent.
		 */
		idle_percentage = current->stime - last_stime;
		idle_percentage *= 100;
		idle_percentage /= jiffies_since_last_check;
		use_apm_idle = (idle_percentage > idle_threshold);
		if (apm_info.forbid_idle)
			use_apm_idle = 0;
		last_jiffies = jiffies;
		last_stime = current->stime;
	}

	bucket = IDLE_LEAKY_MAX;

	while (!need_resched()) {
		if (use_apm_idle) {
			unsigned int t;

			t = jiffies;
			switch (apm_do_idle()) {
			case 0: apm_idle_done = 1;
				/*
				 * No clock slowing reported. Retry the BIOS
				 * idle call ('continue' restarts the while
				 * loop): refill the bucket if jiffies moved
				 * during the call, otherwise drain it by
				 * one. With an empty bucket fall through to
				 * the regular idle routine below.
				 */
				if (t != jiffies) {
					if (bucket) {
						bucket = IDLE_LEAKY_MAX;
						continue;
					}
				} else if (bucket) {
					bucket--;
					continue;
				}
				break;
			case 1: apm_idle_done = 1;
				break;
			default: /* BIOS refused */
				break;
			}
		}
		if (original_pm_idle)
			original_pm_idle();
		else
			default_idle();
		jiffies_since_last_check = jiffies - last_jiffies;
		if (jiffies_since_last_check > idle_period)
			goto recalc;
	}

	/* Tell the BIOS we are busy again if it was ever told we were idle. */
	if (apm_idle_done)
		apm_do_busy();
}
914 | |||
915 | /** | ||
916 | * apm_power_off - ask the BIOS to power off | ||
917 | * | ||
918 | * Handle the power off sequence. This is the one piece of code we | ||
919 | * will execute even on SMP machines. In order to deal with BIOS | ||
920 | * bugs we support real mode APM BIOS power off calls. We also make | ||
921 | * the SMP call on CPU0 as some systems will only honour this call | ||
922 | * on their first cpu. | ||
923 | */ | ||
924 | |||
925 | static void apm_power_off(void) | ||
926 | { | ||
927 | unsigned char po_bios_call[] = { | ||
928 | 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ | ||
929 | 0x8e, 0xd0, /* movw ax,ss */ | ||
930 | 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ | ||
931 | 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ | ||
932 | 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ | ||
933 | 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ | ||
934 | 0xcd, 0x15 /* int $0x15 */ | ||
935 | }; | ||
936 | |||
937 | /* Some bioses don't like being called from CPU != 0 */ | ||
938 | if (apm_info.realmode_power_off) | ||
939 | { | ||
940 | (void)apm_save_cpus(); | ||
941 | machine_real_restart(po_bios_call, sizeof(po_bios_call)); | ||
942 | } | ||
943 | else | ||
944 | (void) set_system_power_state(APM_STATE_OFF); | ||
945 | } | ||
946 | |||
#ifdef CONFIG_APM_DO_ENABLE

/**
 *	apm_enable_power_management - enable BIOS APM power management
 *	@enable: enable yes/no
 *
 *	Enable or disable the APM BIOS power services and mirror the new
 *	state in the APM_BIOS_DISABLED bit of apm_info.bios.flags.
 *	Disabling is refused with APM_NOT_ENGAGED while the BIOS is
 *	flagged as disengaged. Returns APM_SUCCESS or an APM error code.
 */

static int apm_enable_power_management(int enable)
{
	u32	eax;
	u8	failed;

	if (!enable && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
		return APM_NOT_ENGAGED;

	failed = apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
				      enable, &eax);
	if (failed)
		return (eax >> 8) & 0xff;

	if (!enable)
		apm_info.bios.flags |= APM_BIOS_DISABLED;
	else
		apm_info.bios.flags &= ~APM_BIOS_DISABLED;

	return APM_SUCCESS;
}
#endif
972 | |||
973 | /** | ||
974 | * apm_get_power_status - get current power state | ||
975 | * @status: returned status | ||
976 | * @bat: battery info | ||
977 | * @life: estimated life | ||
978 | * | ||
979 | * Obtain the current power status from the APM BIOS. We return a | ||
980 | * status which gives the rough battery status, and current power | ||
981 | * source. The bat value returned give an estimate as a percentage | ||
982 | * of life and a status value for the battery. The estimated life | ||
983 | * if reported is a lifetime in secodnds/minutes at current powwer | ||
984 | * consumption. | ||
985 | */ | ||
986 | |||
987 | static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) | ||
988 | { | ||
989 | u32 eax; | ||
990 | u32 ebx; | ||
991 | u32 ecx; | ||
992 | u32 edx; | ||
993 | u32 dummy; | ||
994 | |||
995 | if (apm_info.get_power_status_broken) | ||
996 | return APM_32_UNSUPPORTED; | ||
997 | if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, | ||
998 | &eax, &ebx, &ecx, &edx, &dummy)) | ||
999 | return (eax >> 8) & 0xff; | ||
1000 | *status = ebx; | ||
1001 | *bat = ecx; | ||
1002 | if (apm_info.get_power_status_swabinminutes) { | ||
1003 | *life = swab16((u16)edx); | ||
1004 | *life |= 0x8000; | ||
1005 | } else | ||
1006 | *life = edx; | ||
1007 | return APM_SUCCESS; | ||
1008 | } | ||
1009 | |||
/*
 * Compiled out (#if 0): per-battery variant of apm_get_power_status().
 * On connections older than APM 1.2 it pretends there is exactly one
 * battery and falls back to apm_get_power_status().
 */
#if 0
static int apm_get_battery_status(u_short which, u_short *status,
				  u_short *bat, u_short *life, u_short *nbat)
{
	u32	eax;
	u32	ebx;
	u32	ecx;
	u32	edx;
	u32	esi;

	if (apm_info.connection_version < 0x0102) {
		/* pretend we only have one battery. */
		if (which != 1)
			return APM_BAD_DEVICE;
		*nbat = 1;
		return apm_get_power_status(status, bat, life);
	}

	/* Device ID is 0x8000 | battery number. */
	if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
			&ebx, &ecx, &edx, &esi))
		return (eax >> 8) & 0xff;
	*status = ebx;
	*bat = ecx;
	*life = edx;
	*nbat = esi;
	return APM_SUCCESS;
}
#endif
1038 | |||
1039 | /** | ||
1040 | * apm_engage_power_management - enable PM on a device | ||
1041 | * @device: identity of device | ||
1042 | * @enable: on/off | ||
1043 | * | ||
1044 | * Activate or deactive power management on either a specific device | ||
1045 | * or the entire system (%APM_DEVICE_ALL). | ||
1046 | */ | ||
1047 | |||
1048 | static int apm_engage_power_management(u_short device, int enable) | ||
1049 | { | ||
1050 | u32 eax; | ||
1051 | |||
1052 | if ((enable == 0) && (device == APM_DEVICE_ALL) | ||
1053 | && (apm_info.bios.flags & APM_BIOS_DISABLED)) | ||
1054 | return APM_DISABLED; | ||
1055 | if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) | ||
1056 | return (eax >> 8) & 0xff; | ||
1057 | if (device == APM_DEVICE_ALL) { | ||
1058 | if (enable) | ||
1059 | apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; | ||
1060 | else | ||
1061 | apm_info.bios.flags |= APM_BIOS_DISENGAGED; | ||
1062 | } | ||
1063 | return APM_SUCCESS; | ||
1064 | } | ||
1065 | |||
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)

/**
 *	apm_console_blank	-	blank the display
 *	@blank: on/off
 *
 *	Attempt to blank the console, firstly by blanking just video device
 *	zero, and if that fails (some BIOSes don't support it) then it blanks
 *	all video devices. Typically the BIOS will do laptop backlight and
 *	monitor powerdown for us.
 *
 *	If the BIOS answers "interface not engaged", the interface is
 *	engaged once (first occurrence only) and the request is retried.
 *	Returns 1 on success and 0 on failure.
 */

static int apm_console_blank(int blank)
{
	static const u_short	dev[3] = { 0x100, 0x1FF, 0x101 };
	static int		tried;
	int			error = APM_NOT_ENGAGED; /* silence gcc */
	int			eng_error;
	int			i;
	u_short			state;

	if (blank)
		state = APM_STATE_STANDBY;
	else
		state = APM_STATE_READY;

	for (i = 0; i < ARRAY_SIZE(dev); i++) {
		error = set_power_state(dev[i], state);

		if (error == APM_SUCCESS || error == APM_NO_ERROR)
			return 1;

		if (error == APM_NOT_ENGAGED)
			break;
	}

	/* Anything but a first-time "not engaged" is a plain failure. */
	if (error != APM_NOT_ENGAGED || tried++ != 0) {
		apm_error("set display", error);
		return 0;
	}

	eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
	if (!eng_error)
		return apm_console_blank(blank);

	apm_error("set display", error);
	apm_error("engage interface", eng_error);
	return 0;
}
#endif
1114 | |||
1115 | static int queue_empty(struct apm_user *as) | ||
1116 | { | ||
1117 | return as->event_head == as->event_tail; | ||
1118 | } | ||
1119 | |||
1120 | static apm_event_t get_queued_event(struct apm_user *as) | ||
1121 | { | ||
1122 | if (++as->event_tail >= APM_MAX_EVENTS) | ||
1123 | as->event_tail = 0; | ||
1124 | return as->events[as->event_tail]; | ||
1125 | } | ||
1126 | |||
1127 | static void queue_event(apm_event_t event, struct apm_user *sender) | ||
1128 | { | ||
1129 | struct apm_user * as; | ||
1130 | |||
1131 | spin_lock(&user_list_lock); | ||
1132 | if (user_list == NULL) | ||
1133 | goto out; | ||
1134 | for (as = user_list; as != NULL; as = as->next) { | ||
1135 | if ((as == sender) || (!as->reader)) | ||
1136 | continue; | ||
1137 | if (++as->event_head >= APM_MAX_EVENTS) | ||
1138 | as->event_head = 0; | ||
1139 | |||
1140 | if (as->event_head == as->event_tail) { | ||
1141 | static int notified; | ||
1142 | |||
1143 | if (notified++ == 0) | ||
1144 | printk(KERN_ERR "apm: an event queue overflowed\n"); | ||
1145 | if (++as->event_tail >= APM_MAX_EVENTS) | ||
1146 | as->event_tail = 0; | ||
1147 | } | ||
1148 | as->events[as->event_head] = event; | ||
1149 | if ((!as->suser) || (!as->writer)) | ||
1150 | continue; | ||
1151 | switch (event) { | ||
1152 | case APM_SYS_SUSPEND: | ||
1153 | case APM_USER_SUSPEND: | ||
1154 | as->suspends_pending++; | ||
1155 | suspends_pending++; | ||
1156 | break; | ||
1157 | |||
1158 | case APM_SYS_STANDBY: | ||
1159 | case APM_USER_STANDBY: | ||
1160 | as->standbys_pending++; | ||
1161 | standbys_pending++; | ||
1162 | break; | ||
1163 | } | ||
1164 | } | ||
1165 | wake_up_interruptible(&apm_waitqueue); | ||
1166 | out: | ||
1167 | spin_unlock(&user_list_lock); | ||
1168 | } | ||
1169 | |||
/*
 * Reprogram the interval timer after a suspend. This is a no-op unless
 * INIT_TIMER_AFTER_SUSPEND is defined. The register writes are done with
 * i8253_lock held and interrupts saved/disabled.
 */
static void reinit_timer(void)
{
#ifdef INIT_TIMER_AFTER_SUSPEND
	unsigned long flags;

	spin_lock_irqsave(&i8253_lock, flags);
	/* set the clock to HZ */
	outb_p(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
	udelay(10);
	outb_p(LATCH & 0xff, PIT_CH0);	/* LSB */
	udelay(10);
	outb(LATCH >> 8, PIT_CH0);	/* MSB */
	udelay(10);
	spin_unlock_irqrestore(&i8253_lock, flags);
#endif
}
1186 | |||
/*
 * Suspend the system through the APM BIOS.
 *
 * @vetoable: when non-zero, a veto from pm_send_all(PM_SUSPEND) aborts the
 *            suspend with -EBUSY (the BIOS is sent APM_STATE_REJECT on
 *            connections newer than 1.0). When zero the veto is only
 *            logged and the suspend goes ahead.
 *
 * Returns 0 on success, -EBUSY when vetoed, or -EIO when the BIOS call
 * failed. In every case the result is stored in each user's
 * suspend_result, suspend_wait is cleared and apm_suspend_waitqueue is
 * woken.
 */
static int suspend(int vetoable)
{
	int err;
	struct apm_user	*as;

	if (pm_send_all(PM_SUSPEND, (void *)3)) {
		/* Vetoed */
		if (vetoable) {
			if (apm_info.connection_version > 0x100)
				set_system_power_state(APM_STATE_REJECT);
			err = -EBUSY;
			ignore_sys_suspend = 0;
			printk(KERN_WARNING "apm: suspend was vetoed.\n");
			goto out;
		}
		printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
	}

	device_suspend(PMSG_SUSPEND);
	local_irq_disable();
	device_power_down(PMSG_SUSPEND);

	local_irq_enable();

	save_processor_state();
	err = set_system_power_state(APM_STATE_SUSPEND);
	/* The resume is announced below; skip the BIOS's own notification. */
	ignore_normal_resume = 1;
	restore_processor_state();

	local_irq_disable();
	reinit_timer();

	/* "BIOS did not set a return code" is treated as success. */
	if (err == APM_NO_ERROR)
		err = APM_SUCCESS;
	if (err != APM_SUCCESS)
		apm_error("suspend", err);
	err = (err == APM_SUCCESS) ? 0 : -EIO;
	device_power_up();
	local_irq_enable();
	device_resume();
	pm_send_all(PM_RESUME, (void *)0);
	queue_event(APM_NORMAL_RESUME, NULL);
 out:
	/* Publish the outcome to every user and release the waiters. */
	spin_lock(&user_list_lock);
	for (as = user_list; as != NULL; as = as->next) {
		as->suspend_wait = 0;
		as->suspend_result = err;
	}
	spin_unlock(&user_list_lock);
	wake_up_interruptible(&apm_suspend_waitqueue);
	return err;
}
1239 | |||
1240 | static void standby(void) | ||
1241 | { | ||
1242 | int err; | ||
1243 | |||
1244 | local_irq_disable(); | ||
1245 | device_power_down(PMSG_SUSPEND); | ||
1246 | local_irq_enable(); | ||
1247 | |||
1248 | err = set_system_power_state(APM_STATE_STANDBY); | ||
1249 | if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) | ||
1250 | apm_error("standby", err); | ||
1251 | |||
1252 | local_irq_disable(); | ||
1253 | device_power_up(); | ||
1254 | local_irq_enable(); | ||
1255 | } | ||
1256 | |||
1257 | static apm_event_t get_event(void) | ||
1258 | { | ||
1259 | int error; | ||
1260 | apm_event_t event = APM_NO_EVENTS; /* silence gcc */ | ||
1261 | apm_eventinfo_t info; | ||
1262 | |||
1263 | static int notified; | ||
1264 | |||
1265 | /* we don't use the eventinfo */ | ||
1266 | error = apm_get_event(&event, &info); | ||
1267 | if (error == APM_SUCCESS) | ||
1268 | return event; | ||
1269 | |||
1270 | if ((error != APM_NO_EVENTS) && (notified++ == 0)) | ||
1271 | apm_error("get_event", error); | ||
1272 | |||
1273 | return 0; | ||
1274 | } | ||
1275 | |||
/*
 * Drain and dispatch all pending BIOS events.
 *
 * Events are fetched with get_event() until it returns 0. Standby and
 * suspend requests are queued to users and, when no privileged writer
 * still has one pending, carried out immediately. Resume events reset
 * the suspend bookkeeping and start the "bounce" window during which new
 * suspend requests are rejected.
 */
static void check_events(void)
{
	apm_event_t		event;
	static unsigned long	last_resume;
	static int		ignore_bounce;

	while ((event = get_event()) != 0) {
		if (debug) {
			/* event is non-zero here, so event - 1 is a valid index */
			if (event <= NR_APM_EVENT_NAME)
				printk(KERN_DEBUG "apm: received %s notify\n",
				       apm_event_name[event - 1]);
			else
				printk(KERN_DEBUG "apm: received unknown "
				       "event 0x%02x\n", event);
		}
		/* The bounce window ends bounce_interval jiffies after a resume. */
		if (ignore_bounce
		    && ((jiffies - last_resume) > bounce_interval))
			ignore_bounce = 0;

		switch (event) {
		case APM_SYS_STANDBY:
		case APM_USER_STANDBY:
			queue_event(event, NULL);
			if (standbys_pending <= 0)
				standby();
			break;

		case APM_USER_SUSPEND:
#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
			if (apm_info.connection_version > 0x100)
				set_system_power_state(APM_STATE_REJECT);
			break;
#endif
			/* otherwise falls through to APM_SYS_SUSPEND */
		case APM_SYS_SUSPEND:
			if (ignore_bounce) {
				if (apm_info.connection_version > 0x100)
					set_system_power_state(APM_STATE_REJECT);
				break;
			}
			/*
			 * If we are already processing a SUSPEND,
			 * then further SUSPEND events from the BIOS
			 * will be ignored.  We also return here to
			 * cope with the fact that the Thinkpads keep
			 * sending a SUSPEND event until something else
			 * happens!
			 */
			if (ignore_sys_suspend)
				return;
			ignore_sys_suspend = 1;
			queue_event(event, NULL);
			if (suspends_pending <= 0)
				(void) suspend(1);
			break;

		case APM_NORMAL_RESUME:
		case APM_CRITICAL_RESUME:
		case APM_STANDBY_RESUME:
			ignore_sys_suspend = 0;
			last_resume = jiffies;
			ignore_bounce = 1;
			/*
			 * A normal resume that suspend() already announced
			 * (ignore_normal_resume set) is not processed again.
			 */
			if ((event != APM_NORMAL_RESUME)
			    || (ignore_normal_resume == 0)) {
				device_resume();
				pm_send_all(PM_RESUME, (void *)0);
				queue_event(event, NULL);
			}
			ignore_normal_resume = 0;
			break;

		case APM_CAPABILITY_CHANGE:
		case APM_LOW_BATTERY:
		case APM_POWER_STATUS_CHANGE:
			queue_event(event, NULL);
			/* If needed, notify drivers here */
			break;

		case APM_UPDATE_TIME:
			break;

		case APM_CRITICAL_SUSPEND:
			/*
			 * We are not allowed to reject a critical suspend.
			 */
			(void) suspend(0);
			break;
		}
	}
}
1365 | |||
1366 | static void apm_event_handler(void) | ||
1367 | { | ||
1368 | static int pending_count = 4; | ||
1369 | int err; | ||
1370 | |||
1371 | if ((standbys_pending > 0) || (suspends_pending > 0)) { | ||
1372 | if ((apm_info.connection_version > 0x100) && | ||
1373 | (pending_count-- <= 0)) { | ||
1374 | pending_count = 4; | ||
1375 | if (debug) | ||
1376 | printk(KERN_DEBUG "apm: setting state busy\n"); | ||
1377 | err = set_system_power_state(APM_STATE_BUSY); | ||
1378 | if (err) | ||
1379 | apm_error("busy", err); | ||
1380 | } | ||
1381 | } else | ||
1382 | pending_count = 4; | ||
1383 | check_events(); | ||
1384 | } | ||
1385 | |||
1386 | /* | ||
1387 | * This is the APM thread main loop. | ||
1388 | */ | ||
1389 | |||
1390 | static void apm_mainloop(void) | ||
1391 | { | ||
1392 | DECLARE_WAITQUEUE(wait, current); | ||
1393 | |||
1394 | add_wait_queue(&apm_waitqueue, &wait); | ||
1395 | set_current_state(TASK_INTERRUPTIBLE); | ||
1396 | for (;;) { | ||
1397 | schedule_timeout(APM_CHECK_TIMEOUT); | ||
1398 | if (kthread_should_stop()) | ||
1399 | break; | ||
1400 | /* | ||
1401 | * Ok, check all events, check for idle (and mark us sleeping | ||
1402 | * so as not to count towards the load average).. | ||
1403 | */ | ||
1404 | set_current_state(TASK_INTERRUPTIBLE); | ||
1405 | apm_event_handler(); | ||
1406 | } | ||
1407 | remove_wait_queue(&apm_waitqueue, &wait); | ||
1408 | } | ||
1409 | |||
1410 | static int check_apm_user(struct apm_user *as, const char *func) | ||
1411 | { | ||
1412 | if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { | ||
1413 | printk(KERN_ERR "apm: %s passed bad filp\n", func); | ||
1414 | return 1; | ||
1415 | } | ||
1416 | return 0; | ||
1417 | } | ||
1418 | |||
1419 | static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) | ||
1420 | { | ||
1421 | struct apm_user * as; | ||
1422 | int i; | ||
1423 | apm_event_t event; | ||
1424 | |||
1425 | as = fp->private_data; | ||
1426 | if (check_apm_user(as, "read")) | ||
1427 | return -EIO; | ||
1428 | if ((int)count < sizeof(apm_event_t)) | ||
1429 | return -EINVAL; | ||
1430 | if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK)) | ||
1431 | return -EAGAIN; | ||
1432 | wait_event_interruptible(apm_waitqueue, !queue_empty(as)); | ||
1433 | i = count; | ||
1434 | while ((i >= sizeof(event)) && !queue_empty(as)) { | ||
1435 | event = get_queued_event(as); | ||
1436 | if (copy_to_user(buf, &event, sizeof(event))) { | ||
1437 | if (i < count) | ||
1438 | break; | ||
1439 | return -EFAULT; | ||
1440 | } | ||
1441 | switch (event) { | ||
1442 | case APM_SYS_SUSPEND: | ||
1443 | case APM_USER_SUSPEND: | ||
1444 | as->suspends_read++; | ||
1445 | break; | ||
1446 | |||
1447 | case APM_SYS_STANDBY: | ||
1448 | case APM_USER_STANDBY: | ||
1449 | as->standbys_read++; | ||
1450 | break; | ||
1451 | } | ||
1452 | buf += sizeof(event); | ||
1453 | i -= sizeof(event); | ||
1454 | } | ||
1455 | if (i < count) | ||
1456 | return count - i; | ||
1457 | if (signal_pending(current)) | ||
1458 | return -ERESTARTSYS; | ||
1459 | return 0; | ||
1460 | } | ||
1461 | |||
1462 | static unsigned int do_poll(struct file *fp, poll_table * wait) | ||
1463 | { | ||
1464 | struct apm_user * as; | ||
1465 | |||
1466 | as = fp->private_data; | ||
1467 | if (check_apm_user(as, "poll")) | ||
1468 | return 0; | ||
1469 | poll_wait(fp, &apm_waitqueue, wait); | ||
1470 | if (!queue_empty(as)) | ||
1471 | return POLLIN | POLLRDNORM; | ||
1472 | return 0; | ||
1473 | } | ||
1474 | |||
1475 | static int do_ioctl(struct inode * inode, struct file *filp, | ||
1476 | u_int cmd, u_long arg) | ||
1477 | { | ||
1478 | struct apm_user * as; | ||
1479 | |||
1480 | as = filp->private_data; | ||
1481 | if (check_apm_user(as, "ioctl")) | ||
1482 | return -EIO; | ||
1483 | if ((!as->suser) || (!as->writer)) | ||
1484 | return -EPERM; | ||
1485 | switch (cmd) { | ||
1486 | case APM_IOC_STANDBY: | ||
1487 | if (as->standbys_read > 0) { | ||
1488 | as->standbys_read--; | ||
1489 | as->standbys_pending--; | ||
1490 | standbys_pending--; | ||
1491 | } else | ||
1492 | queue_event(APM_USER_STANDBY, as); | ||
1493 | if (standbys_pending <= 0) | ||
1494 | standby(); | ||
1495 | break; | ||
1496 | case APM_IOC_SUSPEND: | ||
1497 | if (as->suspends_read > 0) { | ||
1498 | as->suspends_read--; | ||
1499 | as->suspends_pending--; | ||
1500 | suspends_pending--; | ||
1501 | } else | ||
1502 | queue_event(APM_USER_SUSPEND, as); | ||
1503 | if (suspends_pending <= 0) { | ||
1504 | return suspend(1); | ||
1505 | } else { | ||
1506 | as->suspend_wait = 1; | ||
1507 | wait_event_interruptible(apm_suspend_waitqueue, | ||
1508 | as->suspend_wait == 0); | ||
1509 | return as->suspend_result; | ||
1510 | } | ||
1511 | break; | ||
1512 | default: | ||
1513 | return -EINVAL; | ||
1514 | } | ||
1515 | return 0; | ||
1516 | } | ||
1517 | |||
1518 | static int do_release(struct inode * inode, struct file * filp) | ||
1519 | { | ||
1520 | struct apm_user * as; | ||
1521 | |||
1522 | as = filp->private_data; | ||
1523 | if (check_apm_user(as, "release")) | ||
1524 | return 0; | ||
1525 | filp->private_data = NULL; | ||
1526 | if (as->standbys_pending > 0) { | ||
1527 | standbys_pending -= as->standbys_pending; | ||
1528 | if (standbys_pending <= 0) | ||
1529 | standby(); | ||
1530 | } | ||
1531 | if (as->suspends_pending > 0) { | ||
1532 | suspends_pending -= as->suspends_pending; | ||
1533 | if (suspends_pending <= 0) | ||
1534 | (void) suspend(1); | ||
1535 | } | ||
1536 | spin_lock(&user_list_lock); | ||
1537 | if (user_list == as) | ||
1538 | user_list = as->next; | ||
1539 | else { | ||
1540 | struct apm_user * as1; | ||
1541 | |||
1542 | for (as1 = user_list; | ||
1543 | (as1 != NULL) && (as1->next != as); | ||
1544 | as1 = as1->next) | ||
1545 | ; | ||
1546 | if (as1 == NULL) | ||
1547 | printk(KERN_ERR "apm: filp not in user list\n"); | ||
1548 | else | ||
1549 | as1->next = as->next; | ||
1550 | } | ||
1551 | spin_unlock(&user_list_lock); | ||
1552 | kfree(as); | ||
1553 | return 0; | ||
1554 | } | ||
1555 | |||
1556 | static int do_open(struct inode * inode, struct file * filp) | ||
1557 | { | ||
1558 | struct apm_user * as; | ||
1559 | |||
1560 | as = kmalloc(sizeof(*as), GFP_KERNEL); | ||
1561 | if (as == NULL) { | ||
1562 | printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", | ||
1563 | sizeof(*as)); | ||
1564 | return -ENOMEM; | ||
1565 | } | ||
1566 | as->magic = APM_BIOS_MAGIC; | ||
1567 | as->event_tail = as->event_head = 0; | ||
1568 | as->suspends_pending = as->standbys_pending = 0; | ||
1569 | as->suspends_read = as->standbys_read = 0; | ||
1570 | /* | ||
1571 | * XXX - this is a tiny bit broken, when we consider BSD | ||
1572 | * process accounting. If the device is opened by root, we | ||
1573 | * instantly flag that we used superuser privs. Who knows, | ||
1574 | * we might close the device immediately without doing a | ||
1575 | * privileged operation -- cevans | ||
1576 | */ | ||
1577 | as->suser = capable(CAP_SYS_ADMIN); | ||
1578 | as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE; | ||
1579 | as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ; | ||
1580 | spin_lock(&user_list_lock); | ||
1581 | as->next = user_list; | ||
1582 | user_list = as; | ||
1583 | spin_unlock(&user_list_lock); | ||
1584 | filp->private_data = as; | ||
1585 | return 0; | ||
1586 | } | ||
1587 | |||
1588 | static int proc_apm_show(struct seq_file *m, void *v) | ||
1589 | { | ||
1590 | unsigned short bx; | ||
1591 | unsigned short cx; | ||
1592 | unsigned short dx; | ||
1593 | int error; | ||
1594 | unsigned short ac_line_status = 0xff; | ||
1595 | unsigned short battery_status = 0xff; | ||
1596 | unsigned short battery_flag = 0xff; | ||
1597 | int percentage = -1; | ||
1598 | int time_units = -1; | ||
1599 | char *units = "?"; | ||
1600 | |||
1601 | if ((num_online_cpus() == 1) && | ||
1602 | !(error = apm_get_power_status(&bx, &cx, &dx))) { | ||
1603 | ac_line_status = (bx >> 8) & 0xff; | ||
1604 | battery_status = bx & 0xff; | ||
1605 | if ((cx & 0xff) != 0xff) | ||
1606 | percentage = cx & 0xff; | ||
1607 | |||
1608 | if (apm_info.connection_version > 0x100) { | ||
1609 | battery_flag = (cx >> 8) & 0xff; | ||
1610 | if (dx != 0xffff) { | ||
1611 | units = (dx & 0x8000) ? "min" : "sec"; | ||
1612 | time_units = dx & 0x7fff; | ||
1613 | } | ||
1614 | } | ||
1615 | } | ||
1616 | /* Arguments, with symbols from linux/apm_bios.h. Information is | ||
1617 | from the Get Power Status (0x0a) call unless otherwise noted. | ||
1618 | |||
1619 | 0) Linux driver version (this will change if format changes) | ||
1620 | 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2. | ||
1621 | 2) APM flags from APM Installation Check (0x00): | ||
1622 | bit 0: APM_16_BIT_SUPPORT | ||
1623 | bit 1: APM_32_BIT_SUPPORT | ||
1624 | bit 2: APM_IDLE_SLOWS_CLOCK | ||
1625 | bit 3: APM_BIOS_DISABLED | ||
1626 | bit 4: APM_BIOS_DISENGAGED | ||
1627 | 3) AC line status | ||
1628 | 0x00: Off-line | ||
1629 | 0x01: On-line | ||
1630 | 0x02: On backup power (BIOS >= 1.1 only) | ||
1631 | 0xff: Unknown | ||
1632 | 4) Battery status | ||
1633 | 0x00: High | ||
1634 | 0x01: Low | ||
1635 | 0x02: Critical | ||
1636 | 0x03: Charging | ||
1637 | 0x04: Selected battery not present (BIOS >= 1.2 only) | ||
1638 | 0xff: Unknown | ||
1639 | 5) Battery flag | ||
1640 | bit 0: High | ||
1641 | bit 1: Low | ||
1642 | bit 2: Critical | ||
1643 | bit 3: Charging | ||
1644 | bit 7: No system battery | ||
1645 | 0xff: Unknown | ||
1646 | 6) Remaining battery life (percentage of charge): | ||
1647 | 0-100: valid | ||
1648 | -1: Unknown | ||
1649 | 7) Remaining battery life (time units): | ||
1650 | Number of remaining minutes or seconds | ||
1651 | -1: Unknown | ||
1652 | 8) min = minutes; sec = seconds */ | ||
1653 | |||
1654 | seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", | ||
1655 | driver_version, | ||
1656 | (apm_info.bios.version >> 8) & 0xff, | ||
1657 | apm_info.bios.version & 0xff, | ||
1658 | apm_info.bios.flags, | ||
1659 | ac_line_status, | ||
1660 | battery_status, | ||
1661 | battery_flag, | ||
1662 | percentage, | ||
1663 | time_units, | ||
1664 | units); | ||
1665 | return 0; | ||
1666 | } | ||
1667 | |||
1668 | static int proc_apm_open(struct inode *inode, struct file *file) | ||
1669 | { | ||
1670 | return single_open(file, proc_apm_show, NULL); | ||
1671 | } | ||
1672 | |||
1673 | static const struct file_operations apm_file_ops = { | ||
1674 | .owner = THIS_MODULE, | ||
1675 | .open = proc_apm_open, | ||
1676 | .read = seq_read, | ||
1677 | .llseek = seq_lseek, | ||
1678 | .release = single_release, | ||
1679 | }; | ||
1680 | |||
1681 | static int apm(void *unused) | ||
1682 | { | ||
1683 | unsigned short bx; | ||
1684 | unsigned short cx; | ||
1685 | unsigned short dx; | ||
1686 | int error; | ||
1687 | char * power_stat; | ||
1688 | char * bat_stat; | ||
1689 | |||
1690 | #ifdef CONFIG_SMP | ||
1691 | /* 2002/08/01 - WT | ||
1692 | * This is to avoid random crashes at boot time during initialization | ||
1693 | * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. | ||
1694 | * Some bioses don't like being called from CPU != 0. | ||
1695 | * Method suggested by Ingo Molnar. | ||
1696 | */ | ||
1697 | set_cpus_allowed(current, cpumask_of_cpu(0)); | ||
1698 | BUG_ON(smp_processor_id() != 0); | ||
1699 | #endif | ||
1700 | |||
1701 | if (apm_info.connection_version == 0) { | ||
1702 | apm_info.connection_version = apm_info.bios.version; | ||
1703 | if (apm_info.connection_version > 0x100) { | ||
1704 | /* | ||
1705 | * We only support BIOSs up to version 1.2 | ||
1706 | */ | ||
1707 | if (apm_info.connection_version > 0x0102) | ||
1708 | apm_info.connection_version = 0x0102; | ||
1709 | error = apm_driver_version(&apm_info.connection_version); | ||
1710 | if (error != APM_SUCCESS) { | ||
1711 | apm_error("driver version", error); | ||
1712 | /* Fall back to an APM 1.0 connection. */ | ||
1713 | apm_info.connection_version = 0x100; | ||
1714 | } | ||
1715 | } | ||
1716 | } | ||
1717 | |||
1718 | if (debug) | ||
1719 | printk(KERN_INFO "apm: Connection version %d.%d\n", | ||
1720 | (apm_info.connection_version >> 8) & 0xff, | ||
1721 | apm_info.connection_version & 0xff); | ||
1722 | |||
1723 | #ifdef CONFIG_APM_DO_ENABLE | ||
1724 | if (apm_info.bios.flags & APM_BIOS_DISABLED) { | ||
1725 | /* | ||
1726 | * This call causes my NEC UltraLite Versa 33/C to hang if it | ||
1727 | * is booted with PM disabled but not in the docking station. | ||
1728 | * Unfortunate ... | ||
1729 | */ | ||
1730 | error = apm_enable_power_management(1); | ||
1731 | if (error) { | ||
1732 | apm_error("enable power management", error); | ||
1733 | return -1; | ||
1734 | } | ||
1735 | } | ||
1736 | #endif | ||
1737 | |||
1738 | if ((apm_info.bios.flags & APM_BIOS_DISENGAGED) | ||
1739 | && (apm_info.connection_version > 0x0100)) { | ||
1740 | error = apm_engage_power_management(APM_DEVICE_ALL, 1); | ||
1741 | if (error) { | ||
1742 | apm_error("engage power management", error); | ||
1743 | return -1; | ||
1744 | } | ||
1745 | } | ||
1746 | |||
1747 | if (debug && (num_online_cpus() == 1 || smp )) { | ||
1748 | error = apm_get_power_status(&bx, &cx, &dx); | ||
1749 | if (error) | ||
1750 | printk(KERN_INFO "apm: power status not available\n"); | ||
1751 | else { | ||
1752 | switch ((bx >> 8) & 0xff) { | ||
1753 | case 0: power_stat = "off line"; break; | ||
1754 | case 1: power_stat = "on line"; break; | ||
1755 | case 2: power_stat = "on backup power"; break; | ||
1756 | default: power_stat = "unknown"; break; | ||
1757 | } | ||
1758 | switch (bx & 0xff) { | ||
1759 | case 0: bat_stat = "high"; break; | ||
1760 | case 1: bat_stat = "low"; break; | ||
1761 | case 2: bat_stat = "critical"; break; | ||
1762 | case 3: bat_stat = "charging"; break; | ||
1763 | default: bat_stat = "unknown"; break; | ||
1764 | } | ||
1765 | printk(KERN_INFO | ||
1766 | "apm: AC %s, battery status %s, battery life ", | ||
1767 | power_stat, bat_stat); | ||
1768 | if ((cx & 0xff) == 0xff) | ||
1769 | printk("unknown\n"); | ||
1770 | else | ||
1771 | printk("%d%%\n", cx & 0xff); | ||
1772 | if (apm_info.connection_version > 0x100) { | ||
1773 | printk(KERN_INFO | ||
1774 | "apm: battery flag 0x%02x, battery life ", | ||
1775 | (cx >> 8) & 0xff); | ||
1776 | if (dx == 0xffff) | ||
1777 | printk("unknown\n"); | ||
1778 | else | ||
1779 | printk("%d %s\n", dx & 0x7fff, | ||
1780 | (dx & 0x8000) ? | ||
1781 | "minutes" : "seconds"); | ||
1782 | } | ||
1783 | } | ||
1784 | } | ||
1785 | |||
1786 | /* Install our power off handler.. */ | ||
1787 | if (power_off) | ||
1788 | pm_power_off = apm_power_off; | ||
1789 | |||
1790 | if (num_online_cpus() == 1 || smp) { | ||
1791 | #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) | ||
1792 | console_blank_hook = apm_console_blank; | ||
1793 | #endif | ||
1794 | apm_mainloop(); | ||
1795 | #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) | ||
1796 | console_blank_hook = NULL; | ||
1797 | #endif | ||
1798 | } | ||
1799 | |||
1800 | return 0; | ||
1801 | } | ||
1802 | |||
1803 | #ifndef MODULE | ||
1804 | static int __init apm_setup(char *str) | ||
1805 | { | ||
1806 | int invert; | ||
1807 | |||
1808 | while ((str != NULL) && (*str != '\0')) { | ||
1809 | if (strncmp(str, "off", 3) == 0) | ||
1810 | apm_disabled = 1; | ||
1811 | if (strncmp(str, "on", 2) == 0) | ||
1812 | apm_disabled = 0; | ||
1813 | if ((strncmp(str, "bounce-interval=", 16) == 0) || | ||
1814 | (strncmp(str, "bounce_interval=", 16) == 0)) | ||
1815 | bounce_interval = simple_strtol(str + 16, NULL, 0); | ||
1816 | if ((strncmp(str, "idle-threshold=", 15) == 0) || | ||
1817 | (strncmp(str, "idle_threshold=", 15) == 0)) | ||
1818 | idle_threshold = simple_strtol(str + 15, NULL, 0); | ||
1819 | if ((strncmp(str, "idle-period=", 12) == 0) || | ||
1820 | (strncmp(str, "idle_period=", 12) == 0)) | ||
1821 | idle_period = simple_strtol(str + 12, NULL, 0); | ||
1822 | invert = (strncmp(str, "no-", 3) == 0) || | ||
1823 | (strncmp(str, "no_", 3) == 0); | ||
1824 | if (invert) | ||
1825 | str += 3; | ||
1826 | if (strncmp(str, "debug", 5) == 0) | ||
1827 | debug = !invert; | ||
1828 | if ((strncmp(str, "power-off", 9) == 0) || | ||
1829 | (strncmp(str, "power_off", 9) == 0)) | ||
1830 | power_off = !invert; | ||
1831 | if (strncmp(str, "smp", 3) == 0) | ||
1832 | { | ||
1833 | smp = !invert; | ||
1834 | idle_threshold = 100; | ||
1835 | } | ||
1836 | if ((strncmp(str, "allow-ints", 10) == 0) || | ||
1837 | (strncmp(str, "allow_ints", 10) == 0)) | ||
1838 | apm_info.allow_ints = !invert; | ||
1839 | if ((strncmp(str, "broken-psr", 10) == 0) || | ||
1840 | (strncmp(str, "broken_psr", 10) == 0)) | ||
1841 | apm_info.get_power_status_broken = !invert; | ||
1842 | if ((strncmp(str, "realmode-power-off", 18) == 0) || | ||
1843 | (strncmp(str, "realmode_power_off", 18) == 0)) | ||
1844 | apm_info.realmode_power_off = !invert; | ||
1845 | str = strchr(str, ','); | ||
1846 | if (str != NULL) | ||
1847 | str += strspn(str, ", \t"); | ||
1848 | } | ||
1849 | return 1; | ||
1850 | } | ||
1851 | |||
1852 | __setup("apm=", apm_setup); | ||
1853 | #endif | ||
1854 | |||
1855 | static const struct file_operations apm_bios_fops = { | ||
1856 | .owner = THIS_MODULE, | ||
1857 | .read = do_read, | ||
1858 | .poll = do_poll, | ||
1859 | .ioctl = do_ioctl, | ||
1860 | .open = do_open, | ||
1861 | .release = do_release, | ||
1862 | }; | ||
1863 | |||
1864 | static struct miscdevice apm_device = { | ||
1865 | APM_MINOR_DEV, | ||
1866 | "apm_bios", | ||
1867 | &apm_bios_fops | ||
1868 | }; | ||
1869 | |||
1870 | |||
1871 | /* Simple "print if true" callback */ | ||
1872 | static int __init print_if_true(struct dmi_system_id *d) | ||
1873 | { | ||
1874 | printk("%s\n", d->ident); | ||
1875 | return 0; | ||
1876 | } | ||
1877 | |||
1878 | /* | ||
1879 | * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was | ||
1880 | * disabled before the suspend. Linux used to get terribly confused by that. | ||
1881 | */ | ||
1882 | static int __init broken_ps2_resume(struct dmi_system_id *d) | ||
1883 | { | ||
1884 | printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); | ||
1885 | return 0; | ||
1886 | } | ||
1887 | |||
1888 | /* Some bioses have a broken protected mode poweroff and need to use realmode */ | ||
1889 | static int __init set_realmode_power_off(struct dmi_system_id *d) | ||
1890 | { | ||
1891 | if (apm_info.realmode_power_off == 0) { | ||
1892 | apm_info.realmode_power_off = 1; | ||
1893 | printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); | ||
1894 | } | ||
1895 | return 0; | ||
1896 | } | ||
1897 | |||
1898 | /* Some laptops require interrupts to be enabled during APM calls */ | ||
1899 | static int __init set_apm_ints(struct dmi_system_id *d) | ||
1900 | { | ||
1901 | if (apm_info.allow_ints == 0) { | ||
1902 | apm_info.allow_ints = 1; | ||
1903 | printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); | ||
1904 | } | ||
1905 | return 0; | ||
1906 | } | ||
1907 | |||
1908 | /* Some APM bioses corrupt memory or just plain do not work */ | ||
1909 | static int __init apm_is_horked(struct dmi_system_id *d) | ||
1910 | { | ||
1911 | if (apm_info.disabled == 0) { | ||
1912 | apm_info.disabled = 1; | ||
1913 | printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); | ||
1914 | } | ||
1915 | return 0; | ||
1916 | } | ||
1917 | |||
1918 | static int __init apm_is_horked_d850md(struct dmi_system_id *d) | ||
1919 | { | ||
1920 | if (apm_info.disabled == 0) { | ||
1921 | apm_info.disabled = 1; | ||
1922 | printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); | ||
1923 | printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); | ||
1924 | printk(KERN_INFO "download from support.intel.com \n"); | ||
1925 | } | ||
1926 | return 0; | ||
1927 | } | ||
1928 | |||
1929 | /* Some APM bioses hang on APM idle calls */ | ||
1930 | static int __init apm_likes_to_melt(struct dmi_system_id *d) | ||
1931 | { | ||
1932 | if (apm_info.forbid_idle == 0) { | ||
1933 | apm_info.forbid_idle = 1; | ||
1934 | printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); | ||
1935 | } | ||
1936 | return 0; | ||
1937 | } | ||
1938 | |||
1939 | /* | ||
1940 | * Check for clue free BIOS implementations who use | ||
1941 | * the following QA technique | ||
1942 | * | ||
1943 | * [ Write BIOS Code ]<------ | ||
1944 | * | ^ | ||
1945 | * < Does it Compile >----N-- | ||
1946 | * |Y ^ | ||
1947 | * < Does it Boot Win98 >-N-- | ||
1948 | * |Y | ||
1949 | * [Ship It] | ||
1950 | * | ||
1951 | * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e) | ||
1952 | * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000) | ||
1953 | */ | ||
1954 | static int __init broken_apm_power(struct dmi_system_id *d) | ||
1955 | { | ||
1956 | apm_info.get_power_status_broken = 1; | ||
1957 | printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); | ||
1958 | return 0; | ||
1959 | } | ||
1960 | |||
1961 | /* | ||
1962 | * This bios swaps the APM minute reporting bytes over (Many sony laptops | ||
1963 | * have this problem). | ||
1964 | */ | ||
1965 | static int __init swab_apm_power_in_minutes(struct dmi_system_id *d) | ||
1966 | { | ||
1967 | apm_info.get_power_status_swabinminutes = 1; | ||
1968 | printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); | ||
1969 | return 0; | ||
1970 | } | ||
1971 | |||
1972 | static struct dmi_system_id __initdata apm_dmi_table[] = { | ||
1973 | { | ||
1974 | print_if_true, | ||
1975 | KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", | ||
1976 | { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), | ||
1977 | DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), }, | ||
1978 | }, | ||
1979 | { /* Handle problems with APM on the C600 */ | ||
1980 | broken_ps2_resume, "Dell Latitude C600", | ||
1981 | { DMI_MATCH(DMI_SYS_VENDOR, "Dell"), | ||
1982 | DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), }, | ||
1983 | }, | ||
1984 | { /* Allow interrupts during suspend on Dell Latitude laptops*/ | ||
1985 | set_apm_ints, "Dell Latitude", | ||
1986 | { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), | ||
1987 | DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), } | ||
1988 | }, | ||
1989 | { /* APM crashes */ | ||
1990 | apm_is_horked, "Dell Inspiron 2500", | ||
1991 | { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), | ||
1992 | DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), | ||
1993 | DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), | ||
1994 | DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, | ||
1995 | }, | ||
1996 | { /* Allow interrupts during suspend on Dell Inspiron laptops*/ | ||
1997 | set_apm_ints, "Dell Inspiron", { | ||
1998 | DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), | ||
1999 | DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), }, | ||
2000 | }, | ||
2001 | { /* Handle problems with APM on Inspiron 5000e */ | ||
2002 | broken_apm_power, "Dell Inspiron 5000e", | ||
2003 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2004 | DMI_MATCH(DMI_BIOS_VERSION, "A04"), | ||
2005 | DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), }, | ||
2006 | }, | ||
2007 | { /* Handle problems with APM on Inspiron 2500 */ | ||
2008 | broken_apm_power, "Dell Inspiron 2500", | ||
2009 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2010 | DMI_MATCH(DMI_BIOS_VERSION, "A12"), | ||
2011 | DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), }, | ||
2012 | }, | ||
2013 | { /* APM crashes */ | ||
2014 | apm_is_horked, "Dell Dimension 4100", | ||
2015 | { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), | ||
2016 | DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), | ||
2017 | DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), | ||
2018 | DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, | ||
2019 | }, | ||
2020 | { /* Allow interrupts during suspend on Compaq Laptops*/ | ||
2021 | set_apm_ints, "Compaq 12XL125", | ||
2022 | { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), | ||
2023 | DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), | ||
2024 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2025 | DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, | ||
2026 | }, | ||
2027 | { /* Allow interrupts during APM or the clock goes slow */ | ||
2028 | set_apm_ints, "ASUSTeK", | ||
2029 | { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), | ||
2030 | DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), }, | ||
2031 | }, | ||
2032 | { /* APM blows on shutdown */ | ||
2033 | apm_is_horked, "ABIT KX7-333[R]", | ||
2034 | { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"), | ||
2035 | DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), }, | ||
2036 | }, | ||
2037 | { /* APM crashes */ | ||
2038 | apm_is_horked, "Trigem Delhi3", | ||
2039 | { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"), | ||
2040 | DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), }, | ||
2041 | }, | ||
2042 | { /* APM crashes */ | ||
2043 | apm_is_horked, "Fujitsu-Siemens", | ||
2044 | { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"), | ||
2045 | DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), }, | ||
2046 | }, | ||
2047 | { /* APM crashes */ | ||
2048 | apm_is_horked_d850md, "Intel D850MD", | ||
2049 | { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), | ||
2050 | DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), }, | ||
2051 | }, | ||
2052 | { /* APM crashes */ | ||
2053 | apm_is_horked, "Intel D810EMO", | ||
2054 | { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), | ||
2055 | DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), }, | ||
2056 | }, | ||
2057 | { /* APM crashes */ | ||
2058 | apm_is_horked, "Dell XPS-Z", | ||
2059 | { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), | ||
2060 | DMI_MATCH(DMI_BIOS_VERSION, "A11"), | ||
2061 | DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), }, | ||
2062 | }, | ||
2063 | { /* APM crashes */ | ||
2064 | apm_is_horked, "Sharp PC-PJ/AX", | ||
2065 | { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), | ||
2066 | DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), | ||
2067 | DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), | ||
2068 | DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, | ||
2069 | }, | ||
2070 | { /* APM crashes */ | ||
2071 | apm_is_horked, "Dell Inspiron 2500", | ||
2072 | { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), | ||
2073 | DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), | ||
2074 | DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), | ||
2075 | DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, | ||
2076 | }, | ||
2077 | { /* APM idle hangs */ | ||
2078 | apm_likes_to_melt, "Jabil AMD", | ||
2079 | { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), | ||
2080 | DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), }, | ||
2081 | }, | ||
2082 | { /* APM idle hangs */ | ||
2083 | apm_likes_to_melt, "AMI Bios", | ||
2084 | { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), | ||
2085 | DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), }, | ||
2086 | }, | ||
2087 | { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */ | ||
2088 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2089 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2090 | DMI_MATCH(DMI_BIOS_VERSION, "R0206H"), | ||
2091 | DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), }, | ||
2092 | }, | ||
2093 | { /* Handle problems with APM on Sony Vaio PCG-N505VX */ | ||
2094 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2095 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2096 | DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"), | ||
2097 | DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), }, | ||
2098 | }, | ||
2099 | { /* Handle problems with APM on Sony Vaio PCG-XG29 */ | ||
2100 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2101 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2102 | DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"), | ||
2103 | DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), }, | ||
2104 | }, | ||
2105 | { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ | ||
2106 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2107 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2108 | DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"), | ||
2109 | DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), }, | ||
2110 | }, | ||
2111 | { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ | ||
2112 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2113 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2114 | DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"), | ||
2115 | DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), }, | ||
2116 | }, | ||
2117 | { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */ | ||
2118 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2119 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2120 | DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"), | ||
2121 | DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), }, | ||
2122 | }, | ||
2123 | { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ | ||
2124 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2125 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2126 | DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"), | ||
2127 | DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), }, | ||
2128 | }, | ||
2129 | { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ | ||
2130 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2131 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2132 | DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"), | ||
2133 | DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), }, | ||
2134 | }, | ||
2135 | { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */ | ||
2136 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2137 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2138 | DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"), | ||
2139 | DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), }, | ||
2140 | }, | ||
2141 | { /* Handle problems with APM on Sony Vaio PCG-F104K */ | ||
2142 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2143 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2144 | DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"), | ||
2145 | DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), }, | ||
2146 | }, | ||
2147 | |||
2148 | { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */ | ||
2149 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2150 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2151 | DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"), | ||
2152 | DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), }, | ||
2153 | }, | ||
2154 | { /* Handle problems with APM on Sony Vaio PCG-C1VE */ | ||
2155 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2156 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2157 | DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"), | ||
2158 | DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), }, | ||
2159 | }, | ||
2160 | { /* Handle problems with APM on Sony Vaio PCG-C1VE */ | ||
2161 | swab_apm_power_in_minutes, "Sony VAIO", | ||
2162 | { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), | ||
2163 | DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"), | ||
2164 | DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), }, | ||
2165 | }, | ||
2166 | { /* broken PM poweroff bios */ | ||
2167 | set_realmode_power_off, "Award Software v4.60 PGMA", | ||
2168 | { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."), | ||
2169 | DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), | ||
2170 | DMI_MATCH(DMI_BIOS_DATE, "134526184"), }, | ||
2171 | }, | ||
2172 | |||
2173 | /* Generic per vendor APM settings */ | ||
2174 | |||
2175 | { /* Allow interrupts during suspend on IBM laptops */ | ||
2176 | set_apm_ints, "IBM", | ||
2177 | { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, | ||
2178 | }, | ||
2179 | |||
2180 | { } | ||
2181 | }; | ||
2182 | |||
2183 | /* | ||
2184 | * Just start the APM thread. We do NOT want to do APM BIOS | ||
2185 | * calls from anything but the APM thread, if for no other reason | ||
2186 | * than the fact that we don't trust the APM BIOS. This way, | ||
2187 | * most common APM BIOS problems that lead to protection errors | ||
2188 | * etc will have at least some level of being contained... | ||
2189 | * | ||
2190 | * In short, if something bad happens, at least we have a choice | ||
2191 | * of just killing the apm thread.. | ||
2192 | */ | ||
2193 | static int __init apm_init(void) | ||
2194 | { | ||
2195 | struct proc_dir_entry *apm_proc; | ||
2196 | struct desc_struct *gdt; | ||
2197 | int err; | ||
2198 | |||
2199 | dmi_check_system(apm_dmi_table); | ||
2200 | |||
2201 | if (apm_info.bios.version == 0 || paravirt_enabled()) { | ||
2202 | printk(KERN_INFO "apm: BIOS not found.\n"); | ||
2203 | return -ENODEV; | ||
2204 | } | ||
2205 | printk(KERN_INFO | ||
2206 | "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", | ||
2207 | ((apm_info.bios.version >> 8) & 0xff), | ||
2208 | (apm_info.bios.version & 0xff), | ||
2209 | apm_info.bios.flags, | ||
2210 | driver_version); | ||
2211 | if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { | ||
2212 | printk(KERN_INFO "apm: no 32 bit BIOS support\n"); | ||
2213 | return -ENODEV; | ||
2214 | } | ||
2215 | |||
2216 | if (allow_ints) | ||
2217 | apm_info.allow_ints = 1; | ||
2218 | if (broken_psr) | ||
2219 | apm_info.get_power_status_broken = 1; | ||
2220 | if (realmode_power_off) | ||
2221 | apm_info.realmode_power_off = 1; | ||
2222 | /* User can override, but default is to trust DMI */ | ||
2223 | if (apm_disabled != -1) | ||
2224 | apm_info.disabled = apm_disabled; | ||
2225 | |||
2226 | /* | ||
2227 | * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1 | ||
2228 | * but is reportedly a 1.0 BIOS. | ||
2229 | */ | ||
2230 | if (apm_info.bios.version == 0x001) | ||
2231 | apm_info.bios.version = 0x100; | ||
2232 | |||
2233 | /* BIOS < 1.2 doesn't set cseg_16_len */ | ||
2234 | if (apm_info.bios.version < 0x102) | ||
2235 | apm_info.bios.cseg_16_len = 0; /* 64k */ | ||
2236 | |||
2237 | if (debug) { | ||
2238 | printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x", | ||
2239 | apm_info.bios.cseg, apm_info.bios.offset, | ||
2240 | apm_info.bios.cseg_16, apm_info.bios.dseg); | ||
2241 | if (apm_info.bios.version > 0x100) | ||
2242 | printk(" cseg len %x, dseg len %x", | ||
2243 | apm_info.bios.cseg_len, | ||
2244 | apm_info.bios.dseg_len); | ||
2245 | if (apm_info.bios.version > 0x101) | ||
2246 | printk(" cseg16 len %x", apm_info.bios.cseg_16_len); | ||
2247 | printk("\n"); | ||
2248 | } | ||
2249 | |||
2250 | if (apm_info.disabled) { | ||
2251 | printk(KERN_NOTICE "apm: disabled on user request.\n"); | ||
2252 | return -ENODEV; | ||
2253 | } | ||
2254 | if ((num_online_cpus() > 1) && !power_off && !smp) { | ||
2255 | printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); | ||
2256 | apm_info.disabled = 1; | ||
2257 | return -ENODEV; | ||
2258 | } | ||
2259 | if (PM_IS_ACTIVE()) { | ||
2260 | printk(KERN_NOTICE "apm: overridden by ACPI.\n"); | ||
2261 | apm_info.disabled = 1; | ||
2262 | return -ENODEV; | ||
2263 | } | ||
2264 | #ifdef CONFIG_PM_LEGACY | ||
2265 | pm_active = 1; | ||
2266 | #endif | ||
2267 | |||
2268 | /* | ||
2269 | * Set up a segment that references the real mode segment 0x40 | ||
2270 | * that extends up to the end of page zero (that we have reserved). | ||
2271 | * This is for buggy BIOS's that refer to (real mode) segment 0x40 | ||
2272 | * even though they are called in protected mode. | ||
2273 | */ | ||
2274 | set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); | ||
2275 | _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); | ||
2276 | |||
2277 | /* | ||
2278 | * Set up the long jump entry point to the APM BIOS, which is called | ||
2279 | * from inline assembly. | ||
2280 | */ | ||
2281 | apm_bios_entry.offset = apm_info.bios.offset; | ||
2282 | apm_bios_entry.segment = APM_CS; | ||
2283 | |||
2284 | /* | ||
2285 | * The APM 1.1 BIOS is supposed to provide limit information that it | ||
2286 | * recognizes. Many machines do this correctly, but many others do | ||
2287 | * not restrict themselves to their claimed limit. When this happens, | ||
2288 | * they will cause a segmentation violation in the kernel at boot time. | ||
2289 | * Most BIOS's, however, will respect a 64k limit, so we use that. | ||
2290 | * | ||
2291 | * Note we only set APM segments on CPU zero, since we pin the APM | ||
2292 | * code to that CPU. | ||
2293 | */ | ||
2294 | gdt = get_cpu_gdt_table(0); | ||
2295 | set_base(gdt[APM_CS >> 3], | ||
2296 | __va((unsigned long)apm_info.bios.cseg << 4)); | ||
2297 | set_base(gdt[APM_CS_16 >> 3], | ||
2298 | __va((unsigned long)apm_info.bios.cseg_16 << 4)); | ||
2299 | set_base(gdt[APM_DS >> 3], | ||
2300 | __va((unsigned long)apm_info.bios.dseg << 4)); | ||
2301 | |||
2302 | apm_proc = create_proc_entry("apm", 0, NULL); | ||
2303 | if (apm_proc) | ||
2304 | apm_proc->proc_fops = &apm_file_ops; | ||
2305 | |||
2306 | kapmd_task = kthread_create(apm, NULL, "kapmd"); | ||
2307 | if (IS_ERR(kapmd_task)) { | ||
2308 | printk(KERN_ERR "apm: disabled - Unable to start kernel " | ||
2309 | "thread.\n"); | ||
2310 | err = PTR_ERR(kapmd_task); | ||
2311 | kapmd_task = NULL; | ||
2312 | remove_proc_entry("apm", NULL); | ||
2313 | return err; | ||
2314 | } | ||
2315 | wake_up_process(kapmd_task); | ||
2316 | |||
2317 | if (num_online_cpus() > 1 && !smp ) { | ||
2318 | printk(KERN_NOTICE | ||
2319 | "apm: disabled - APM is not SMP safe (power off active).\n"); | ||
2320 | return 0; | ||
2321 | } | ||
2322 | |||
2323 | /* | ||
2324 | * Note we don't actually care if the misc_device cannot be registered. | ||
2325 | * this driver can do its job without it, even if userspace can't | ||
2326 | * control it. just log the error | ||
2327 | */ | ||
2328 | if (misc_register(&apm_device)) | ||
2329 | printk(KERN_WARNING "apm: Could not register misc device.\n"); | ||
2330 | |||
2331 | if (HZ != 100) | ||
2332 | idle_period = (idle_period * HZ) / 100; | ||
2333 | if (idle_threshold < 100) { | ||
2334 | original_pm_idle = pm_idle; | ||
2335 | pm_idle = apm_cpu_idle; | ||
2336 | set_pm_idle = 1; | ||
2337 | } | ||
2338 | |||
2339 | return 0; | ||
2340 | } | ||
2341 | |||
2342 | static void __exit apm_exit(void) | ||
2343 | { | ||
2344 | int error; | ||
2345 | |||
2346 | if (set_pm_idle) { | ||
2347 | pm_idle = original_pm_idle; | ||
2348 | /* | ||
2349 | * We are about to unload the current idle thread pm callback | ||
2350 | * (pm_idle), Wait for all processors to update cached/local | ||
2351 | * copies of pm_idle before proceeding. | ||
2352 | */ | ||
2353 | cpu_idle_wait(); | ||
2354 | } | ||
2355 | if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) | ||
2356 | && (apm_info.connection_version > 0x0100)) { | ||
2357 | error = apm_engage_power_management(APM_DEVICE_ALL, 0); | ||
2358 | if (error) | ||
2359 | apm_error("disengage power management", error); | ||
2360 | } | ||
2361 | misc_deregister(&apm_device); | ||
2362 | remove_proc_entry("apm", NULL); | ||
2363 | if (power_off) | ||
2364 | pm_power_off = NULL; | ||
2365 | if (kapmd_task) { | ||
2366 | kthread_stop(kapmd_task); | ||
2367 | kapmd_task = NULL; | ||
2368 | } | ||
2369 | #ifdef CONFIG_PM_LEGACY | ||
2370 | pm_active = 0; | ||
2371 | #endif | ||
2372 | } | ||
2373 | |||
2374 | module_init(apm_init); | ||
2375 | module_exit(apm_exit); | ||
2376 | |||
2377 | MODULE_AUTHOR("Stephen Rothwell"); | ||
2378 | MODULE_DESCRIPTION("Advanced Power Management"); | ||
2379 | MODULE_LICENSE("GPL"); | ||
2380 | module_param(debug, bool, 0644); | ||
2381 | MODULE_PARM_DESC(debug, "Enable debug mode"); | ||
2382 | module_param(power_off, bool, 0444); | ||
2383 | MODULE_PARM_DESC(power_off, "Enable power off"); | ||
2384 | module_param(bounce_interval, int, 0444); | ||
2385 | MODULE_PARM_DESC(bounce_interval, | ||
2386 | "Set the number of ticks to ignore suspend bounces"); | ||
2387 | module_param(allow_ints, bool, 0444); | ||
2388 | MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls"); | ||
2389 | module_param(broken_psr, bool, 0444); | ||
2390 | MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call"); | ||
2391 | module_param(realmode_power_off, bool, 0444); | ||
2392 | MODULE_PARM_DESC(realmode_power_off, | ||
2393 | "Switch to real mode before powering off"); | ||
2394 | module_param(idle_threshold, int, 0444); | ||
2395 | MODULE_PARM_DESC(idle_threshold, | ||
2396 | "System idle percentage above which to make APM BIOS idle calls"); | ||
2397 | module_param(idle_period, int, 0444); | ||
2398 | MODULE_PARM_DESC(idle_period, | ||
2399 | "Period (in sec/100) over which to caculate the idle percentage"); | ||
2400 | module_param(smp, bool, 0444); | ||
2401 | MODULE_PARM_DESC(smp, | ||
2402 | "Set this to enable APM use on an SMP platform. Use with caution on older systems"); | ||
2403 | MODULE_ALIAS_MISCDEV(APM_MINOR_DEV); | ||
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c new file mode 100644 index 000000000000..cfa82c899f47 --- /dev/null +++ b/arch/x86/kernel/asm-offsets.c | |||
@@ -0,0 +1,5 @@ | |||
1 | #ifdef CONFIG_X86_32 | ||
2 | # include "asm-offsets_32.c" | ||
3 | #else | ||
4 | # include "asm-offsets_64.c" | ||
5 | #endif | ||
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c new file mode 100644 index 000000000000..8029742c0fc1 --- /dev/null +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -0,0 +1,147 @@ | |||
1 | /* | ||
2 | * Generate definitions needed by assembly language modules. | ||
3 | * This code generates raw asm output which is post-processed | ||
4 | * to extract and format the required data. | ||
5 | */ | ||
6 | |||
7 | #include <linux/crypto.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/signal.h> | ||
10 | #include <linux/personality.h> | ||
11 | #include <linux/suspend.h> | ||
12 | #include <asm/ucontext.h> | ||
13 | #include "sigframe_32.h" | ||
14 | #include <asm/pgtable.h> | ||
15 | #include <asm/fixmap.h> | ||
16 | #include <asm/processor.h> | ||
17 | #include <asm/thread_info.h> | ||
18 | #include <asm/elf.h> | ||
19 | |||
20 | #include <xen/interface/xen.h> | ||
21 | |||
22 | #ifdef CONFIG_LGUEST_GUEST | ||
23 | #include <linux/lguest.h> | ||
24 | #include "../../../drivers/lguest/lg.h" | ||
25 | #endif | ||
26 | |||
/*
 * Emit a "->SYM value expression" marker line into the generated assembly;
 * the build post-processes that output to extract the definitions.
 */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

/* Emit an empty marker line, used to separate groups of definitions. */
#define BLANK() asm volatile("\n->" : : )

/*
 * Define SYM as the byte offset of member MEM within "struct STR".
 * No trailing semicolon here: the caller supplies it, so the macro
 * behaves like a single statement in every context.
 */
#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))
34 | |||
35 | /* workaround for a warning with -Wmissing-prototypes */ | ||
36 | void foo(void); | ||
37 | |||
38 | void foo(void) | ||
39 | { | ||
40 | OFFSET(SIGCONTEXT_eax, sigcontext, eax); | ||
41 | OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); | ||
42 | OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); | ||
43 | OFFSET(SIGCONTEXT_edx, sigcontext, edx); | ||
44 | OFFSET(SIGCONTEXT_esi, sigcontext, esi); | ||
45 | OFFSET(SIGCONTEXT_edi, sigcontext, edi); | ||
46 | OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); | ||
47 | OFFSET(SIGCONTEXT_esp, sigcontext, esp); | ||
48 | OFFSET(SIGCONTEXT_eip, sigcontext, eip); | ||
49 | BLANK(); | ||
50 | |||
51 | OFFSET(CPUINFO_x86, cpuinfo_x86, x86); | ||
52 | OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); | ||
53 | OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); | ||
54 | OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); | ||
55 | OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math); | ||
56 | OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); | ||
57 | OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); | ||
58 | OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); | ||
59 | BLANK(); | ||
60 | |||
61 | OFFSET(TI_task, thread_info, task); | ||
62 | OFFSET(TI_exec_domain, thread_info, exec_domain); | ||
63 | OFFSET(TI_flags, thread_info, flags); | ||
64 | OFFSET(TI_status, thread_info, status); | ||
65 | OFFSET(TI_preempt_count, thread_info, preempt_count); | ||
66 | OFFSET(TI_addr_limit, thread_info, addr_limit); | ||
67 | OFFSET(TI_restart_block, thread_info, restart_block); | ||
68 | OFFSET(TI_sysenter_return, thread_info, sysenter_return); | ||
69 | OFFSET(TI_cpu, thread_info, cpu); | ||
70 | BLANK(); | ||
71 | |||
72 | OFFSET(GDS_size, Xgt_desc_struct, size); | ||
73 | OFFSET(GDS_address, Xgt_desc_struct, address); | ||
74 | OFFSET(GDS_pad, Xgt_desc_struct, pad); | ||
75 | BLANK(); | ||
76 | |||
77 | OFFSET(PT_EBX, pt_regs, ebx); | ||
78 | OFFSET(PT_ECX, pt_regs, ecx); | ||
79 | OFFSET(PT_EDX, pt_regs, edx); | ||
80 | OFFSET(PT_ESI, pt_regs, esi); | ||
81 | OFFSET(PT_EDI, pt_regs, edi); | ||
82 | OFFSET(PT_EBP, pt_regs, ebp); | ||
83 | OFFSET(PT_EAX, pt_regs, eax); | ||
84 | OFFSET(PT_DS, pt_regs, xds); | ||
85 | OFFSET(PT_ES, pt_regs, xes); | ||
86 | OFFSET(PT_FS, pt_regs, xfs); | ||
87 | OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); | ||
88 | OFFSET(PT_EIP, pt_regs, eip); | ||
89 | OFFSET(PT_CS, pt_regs, xcs); | ||
90 | OFFSET(PT_EFLAGS, pt_regs, eflags); | ||
91 | OFFSET(PT_OLDESP, pt_regs, esp); | ||
92 | OFFSET(PT_OLDSS, pt_regs, xss); | ||
93 | BLANK(); | ||
94 | |||
95 | OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); | ||
96 | OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); | ||
97 | BLANK(); | ||
98 | |||
99 | OFFSET(pbe_address, pbe, address); | ||
100 | OFFSET(pbe_orig_address, pbe, orig_address); | ||
101 | OFFSET(pbe_next, pbe, next); | ||
102 | |||
103 | /* Offset from the sysenter stack to tss.esp0 */ | ||
104 | DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - | ||
105 | sizeof(struct tss_struct)); | ||
106 | |||
107 | DEFINE(PAGE_SIZE_asm, PAGE_SIZE); | ||
108 | DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); | ||
109 | DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); | ||
110 | DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); | ||
111 | DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); | ||
112 | |||
113 | DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); | ||
114 | |||
115 | OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); | ||
116 | |||
117 | #ifdef CONFIG_PARAVIRT | ||
118 | BLANK(); | ||
119 | OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); | ||
120 | OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); | ||
121 | OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); | ||
122 | OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); | ||
123 | OFFSET(PARAVIRT_iret, paravirt_ops, iret); | ||
124 | OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); | ||
125 | #endif | ||
126 | |||
127 | #ifdef CONFIG_XEN | ||
128 | BLANK(); | ||
129 | OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); | ||
130 | OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); | ||
131 | #endif | ||
132 | |||
133 | #ifdef CONFIG_LGUEST_GUEST | ||
134 | BLANK(); | ||
135 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); | ||
136 | OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); | ||
137 | OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); | ||
138 | OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); | ||
139 | OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); | ||
140 | OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); | ||
141 | OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); | ||
142 | OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); | ||
143 | OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); | ||
144 | OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); | ||
145 | OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); | ||
146 | #endif | ||
147 | } | ||
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c new file mode 100644 index 000000000000..0b9860530a6b --- /dev/null +++ b/arch/x86/kernel/bootflag.c | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * Implement 'Simple Boot Flag Specification 2.0' | ||
3 | */ | ||
4 | |||
5 | |||
6 | #include <linux/types.h> | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/string.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <linux/spinlock.h> | ||
12 | #include <linux/acpi.h> | ||
13 | #include <asm/io.h> | ||
14 | |||
15 | #include <linux/mc146818rtc.h> | ||
16 | |||
17 | |||
18 | #define SBF_RESERVED (0x78) | ||
19 | #define SBF_PNPOS (1<<0) | ||
20 | #define SBF_BOOTING (1<<1) | ||
21 | #define SBF_DIAG (1<<2) | ||
22 | #define SBF_PARITY (1<<7) | ||
23 | |||
24 | |||
25 | int sbf_port __initdata = -1; /* set via acpi_boot_init() */ | ||
26 | |||
27 | |||
28 | static int __init parity(u8 v) | ||
29 | { | ||
30 | int x = 0; | ||
31 | int i; | ||
32 | |||
33 | for(i=0;i<8;i++) | ||
34 | { | ||
35 | x^=(v&1); | ||
36 | v>>=1; | ||
37 | } | ||
38 | return x; | ||
39 | } | ||
40 | |||
41 | static void __init sbf_write(u8 v) | ||
42 | { | ||
43 | unsigned long flags; | ||
44 | if(sbf_port != -1) | ||
45 | { | ||
46 | v &= ~SBF_PARITY; | ||
47 | if(!parity(v)) | ||
48 | v|=SBF_PARITY; | ||
49 | |||
50 | printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); | ||
51 | |||
52 | spin_lock_irqsave(&rtc_lock, flags); | ||
53 | CMOS_WRITE(v, sbf_port); | ||
54 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
55 | } | ||
56 | } | ||
57 | |||
58 | static u8 __init sbf_read(void) | ||
59 | { | ||
60 | u8 v; | ||
61 | unsigned long flags; | ||
62 | if(sbf_port == -1) | ||
63 | return 0; | ||
64 | spin_lock_irqsave(&rtc_lock, flags); | ||
65 | v = CMOS_READ(sbf_port); | ||
66 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
67 | return v; | ||
68 | } | ||
69 | |||
70 | static int __init sbf_value_valid(u8 v) | ||
71 | { | ||
72 | if(v&SBF_RESERVED) /* Reserved bits */ | ||
73 | return 0; | ||
74 | if(!parity(v)) | ||
75 | return 0; | ||
76 | return 1; | ||
77 | } | ||
78 | |||
79 | static int __init sbf_init(void) | ||
80 | { | ||
81 | u8 v; | ||
82 | if(sbf_port == -1) | ||
83 | return 0; | ||
84 | v = sbf_read(); | ||
85 | if(!sbf_value_valid(v)) | ||
86 | printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); | ||
87 | |||
88 | v &= ~SBF_RESERVED; | ||
89 | v &= ~SBF_BOOTING; | ||
90 | v &= ~SBF_DIAG; | ||
91 | #if defined(CONFIG_ISAPNP) | ||
92 | v |= SBF_PNPOS; | ||
93 | #endif | ||
94 | sbf_write(v); | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | module_init(sbf_init); | ||
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c new file mode 100644 index 000000000000..5c2faa10e9fa --- /dev/null +++ b/arch/x86/kernel/cpuid.c | |||
@@ -0,0 +1,242 @@ | |||
1 | /* ----------------------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2000 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, | ||
8 | * USA; either version 2 of the License, or (at your option) any later | ||
9 | * version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * cpuid.c | ||
15 | * | ||
16 | * x86 CPUID access device | ||
17 | * | ||
18 | * This device is accessed by lseek() to the appropriate CPUID level | ||
19 | * and then read in chunks of 16 bytes. A larger size means multiple | ||
20 | * reads of consecutive levels. | ||
21 | * | ||
22 | * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on | ||
23 | * an SMP box will direct the access to CPU %d. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <linux/types.h> | ||
29 | #include <linux/errno.h> | ||
30 | #include <linux/fcntl.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/poll.h> | ||
33 | #include <linux/smp.h> | ||
34 | #include <linux/major.h> | ||
35 | #include <linux/fs.h> | ||
36 | #include <linux/smp_lock.h> | ||
37 | #include <linux/device.h> | ||
38 | #include <linux/cpu.h> | ||
39 | #include <linux/notifier.h> | ||
40 | |||
41 | #include <asm/processor.h> | ||
42 | #include <asm/msr.h> | ||
43 | #include <asm/uaccess.h> | ||
44 | #include <asm/system.h> | ||
45 | |||
46 | static struct class *cpuid_class; | ||
47 | |||
48 | #ifdef CONFIG_SMP | ||
49 | |||
50 | struct cpuid_command { | ||
51 | u32 reg; | ||
52 | u32 *data; | ||
53 | }; | ||
54 | |||
55 | static void cpuid_smp_cpuid(void *cmd_block) | ||
56 | { | ||
57 | struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; | ||
58 | |||
59 | cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], | ||
60 | &cmd->data[3]); | ||
61 | } | ||
62 | |||
63 | static inline void do_cpuid(int cpu, u32 reg, u32 * data) | ||
64 | { | ||
65 | struct cpuid_command cmd; | ||
66 | |||
67 | preempt_disable(); | ||
68 | if (cpu == smp_processor_id()) { | ||
69 | cpuid(reg, &data[0], &data[1], &data[2], &data[3]); | ||
70 | } else { | ||
71 | cmd.reg = reg; | ||
72 | cmd.data = data; | ||
73 | |||
74 | smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); | ||
75 | } | ||
76 | preempt_enable(); | ||
77 | } | ||
78 | #else /* ! CONFIG_SMP */ | ||
79 | |||
80 | static inline void do_cpuid(int cpu, u32 reg, u32 * data) | ||
81 | { | ||
82 | cpuid(reg, &data[0], &data[1], &data[2], &data[3]); | ||
83 | } | ||
84 | |||
85 | #endif /* ! CONFIG_SMP */ | ||
86 | |||
87 | static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) | ||
88 | { | ||
89 | loff_t ret; | ||
90 | |||
91 | lock_kernel(); | ||
92 | |||
93 | switch (orig) { | ||
94 | case 0: | ||
95 | file->f_pos = offset; | ||
96 | ret = file->f_pos; | ||
97 | break; | ||
98 | case 1: | ||
99 | file->f_pos += offset; | ||
100 | ret = file->f_pos; | ||
101 | break; | ||
102 | default: | ||
103 | ret = -EINVAL; | ||
104 | } | ||
105 | |||
106 | unlock_kernel(); | ||
107 | return ret; | ||
108 | } | ||
109 | |||
110 | static ssize_t cpuid_read(struct file *file, char __user *buf, | ||
111 | size_t count, loff_t * ppos) | ||
112 | { | ||
113 | char __user *tmp = buf; | ||
114 | u32 data[4]; | ||
115 | u32 reg = *ppos; | ||
116 | int cpu = iminor(file->f_path.dentry->d_inode); | ||
117 | |||
118 | if (count % 16) | ||
119 | return -EINVAL; /* Invalid chunk size */ | ||
120 | |||
121 | for (; count; count -= 16) { | ||
122 | do_cpuid(cpu, reg, data); | ||
123 | if (copy_to_user(tmp, &data, 16)) | ||
124 | return -EFAULT; | ||
125 | tmp += 16; | ||
126 | *ppos = reg++; | ||
127 | } | ||
128 | |||
129 | return tmp - buf; | ||
130 | } | ||
131 | |||
132 | static int cpuid_open(struct inode *inode, struct file *file) | ||
133 | { | ||
134 | unsigned int cpu = iminor(file->f_path.dentry->d_inode); | ||
135 | struct cpuinfo_x86 *c = &(cpu_data)[cpu]; | ||
136 | |||
137 | if (cpu >= NR_CPUS || !cpu_online(cpu)) | ||
138 | return -ENXIO; /* No such CPU */ | ||
139 | if (c->cpuid_level < 0) | ||
140 | return -EIO; /* CPUID not supported */ | ||
141 | |||
142 | return 0; | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * File operations we support | ||
147 | */ | ||
148 | static const struct file_operations cpuid_fops = { | ||
149 | .owner = THIS_MODULE, | ||
150 | .llseek = cpuid_seek, | ||
151 | .read = cpuid_read, | ||
152 | .open = cpuid_open, | ||
153 | }; | ||
154 | |||
155 | static int cpuid_device_create(int i) | ||
156 | { | ||
157 | int err = 0; | ||
158 | struct device *dev; | ||
159 | |||
160 | dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i); | ||
161 | if (IS_ERR(dev)) | ||
162 | err = PTR_ERR(dev); | ||
163 | return err; | ||
164 | } | ||
165 | |||
166 | static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
167 | { | ||
168 | unsigned int cpu = (unsigned long)hcpu; | ||
169 | |||
170 | switch (action) { | ||
171 | case CPU_ONLINE: | ||
172 | case CPU_ONLINE_FROZEN: | ||
173 | cpuid_device_create(cpu); | ||
174 | break; | ||
175 | case CPU_DEAD: | ||
176 | case CPU_DEAD_FROZEN: | ||
177 | device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); | ||
178 | break; | ||
179 | } | ||
180 | return NOTIFY_OK; | ||
181 | } | ||
182 | |||
183 | static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = | ||
184 | { | ||
185 | .notifier_call = cpuid_class_cpu_callback, | ||
186 | }; | ||
187 | |||
188 | static int __init cpuid_init(void) | ||
189 | { | ||
190 | int i, err = 0; | ||
191 | i = 0; | ||
192 | |||
193 | if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { | ||
194 | printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", | ||
195 | CPUID_MAJOR); | ||
196 | err = -EBUSY; | ||
197 | goto out; | ||
198 | } | ||
199 | cpuid_class = class_create(THIS_MODULE, "cpuid"); | ||
200 | if (IS_ERR(cpuid_class)) { | ||
201 | err = PTR_ERR(cpuid_class); | ||
202 | goto out_chrdev; | ||
203 | } | ||
204 | for_each_online_cpu(i) { | ||
205 | err = cpuid_device_create(i); | ||
206 | if (err != 0) | ||
207 | goto out_class; | ||
208 | } | ||
209 | register_hotcpu_notifier(&cpuid_class_cpu_notifier); | ||
210 | |||
211 | err = 0; | ||
212 | goto out; | ||
213 | |||
214 | out_class: | ||
215 | i = 0; | ||
216 | for_each_online_cpu(i) { | ||
217 | device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); | ||
218 | } | ||
219 | class_destroy(cpuid_class); | ||
220 | out_chrdev: | ||
221 | unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); | ||
222 | out: | ||
223 | return err; | ||
224 | } | ||
225 | |||
226 | static void __exit cpuid_exit(void) | ||
227 | { | ||
228 | int cpu = 0; | ||
229 | |||
230 | for_each_online_cpu(cpu) | ||
231 | device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); | ||
232 | class_destroy(cpuid_class); | ||
233 | unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); | ||
234 | unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); | ||
235 | } | ||
236 | |||
237 | module_init(cpuid_init); | ||
238 | module_exit(cpuid_exit); | ||
239 | |||
240 | MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); | ||
241 | MODULE_DESCRIPTION("x86 generic CPUID driver"); | ||
242 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86/kernel/crash_32.c b/arch/x86/kernel/crash_32.c new file mode 100644 index 000000000000..53589d1b1a05 --- /dev/null +++ b/arch/x86/kernel/crash_32.c | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | * Architecture specific (i386) functions for kexec based crash dumps. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * | ||
6 | * Copyright (C) IBM Corporation, 2004. All rights reserved. | ||
7 | * | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/reboot.h> | ||
15 | #include <linux/kexec.h> | ||
16 | #include <linux/delay.h> | ||
17 | #include <linux/elf.h> | ||
18 | #include <linux/elfcore.h> | ||
19 | |||
20 | #include <asm/processor.h> | ||
21 | #include <asm/hardirq.h> | ||
22 | #include <asm/nmi.h> | ||
23 | #include <asm/hw_irq.h> | ||
24 | #include <asm/apic.h> | ||
25 | #include <linux/kdebug.h> | ||
26 | #include <asm/smp.h> | ||
27 | |||
28 | #include <mach_ipi.h> | ||
29 | |||
30 | |||
31 | /* This keeps a track of which one is crashing cpu. */ | ||
32 | static int crashing_cpu; | ||
33 | |||
34 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) | ||
35 | static atomic_t waiting_for_crash_ipi; | ||
36 | |||
37 | static int crash_nmi_callback(struct notifier_block *self, | ||
38 | unsigned long val, void *data) | ||
39 | { | ||
40 | struct pt_regs *regs; | ||
41 | struct pt_regs fixed_regs; | ||
42 | int cpu; | ||
43 | |||
44 | if (val != DIE_NMI_IPI) | ||
45 | return NOTIFY_OK; | ||
46 | |||
47 | regs = ((struct die_args *)data)->regs; | ||
48 | cpu = raw_smp_processor_id(); | ||
49 | |||
50 | /* Don't do anything if this handler is invoked on crashing cpu. | ||
51 | * Otherwise, system will completely hang. Crashing cpu can get | ||
52 | * an NMI if system was initially booted with nmi_watchdog parameter. | ||
53 | */ | ||
54 | if (cpu == crashing_cpu) | ||
55 | return NOTIFY_STOP; | ||
56 | local_irq_disable(); | ||
57 | |||
58 | if (!user_mode_vm(regs)) { | ||
59 | crash_fixup_ss_esp(&fixed_regs, regs); | ||
60 | regs = &fixed_regs; | ||
61 | } | ||
62 | crash_save_cpu(regs, cpu); | ||
63 | disable_local_APIC(); | ||
64 | atomic_dec(&waiting_for_crash_ipi); | ||
65 | /* Assume hlt works */ | ||
66 | halt(); | ||
67 | for (;;) | ||
68 | cpu_relax(); | ||
69 | |||
70 | return 1; | ||
71 | } | ||
72 | |||
73 | static void smp_send_nmi_allbutself(void) | ||
74 | { | ||
75 | cpumask_t mask = cpu_online_map; | ||
76 | cpu_clear(safe_smp_processor_id(), mask); | ||
77 | if (!cpus_empty(mask)) | ||
78 | send_IPI_mask(mask, NMI_VECTOR); | ||
79 | } | ||
80 | |||
81 | static struct notifier_block crash_nmi_nb = { | ||
82 | .notifier_call = crash_nmi_callback, | ||
83 | }; | ||
84 | |||
85 | static void nmi_shootdown_cpus(void) | ||
86 | { | ||
87 | unsigned long msecs; | ||
88 | |||
89 | atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); | ||
90 | /* Would it be better to replace the trap vector here? */ | ||
91 | if (register_die_notifier(&crash_nmi_nb)) | ||
92 | return; /* return what? */ | ||
93 | /* Ensure the new callback function is set before sending | ||
94 | * out the NMI | ||
95 | */ | ||
96 | wmb(); | ||
97 | |||
98 | smp_send_nmi_allbutself(); | ||
99 | |||
100 | msecs = 1000; /* Wait at most a second for the other cpus to stop */ | ||
101 | while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { | ||
102 | mdelay(1); | ||
103 | msecs--; | ||
104 | } | ||
105 | |||
106 | /* Leave the nmi callback set */ | ||
107 | disable_local_APIC(); | ||
108 | } | ||
109 | #else | ||
/* Built without SMP or without local APIC support: nothing to stop. */
static void nmi_shootdown_cpus(void)
{
	/* There are no cpus to shootdown */
}
114 | #endif | ||
115 | |||
116 | void machine_crash_shutdown(struct pt_regs *regs) | ||
117 | { | ||
118 | /* This function is only called after the system | ||
119 | * has panicked or is otherwise in a critical state. | ||
120 | * The minimum amount of code to allow a kexec'd kernel | ||
121 | * to run successfully needs to happen here. | ||
122 | * | ||
123 | * In practice this means shooting down the other cpus in | ||
124 | * an SMP system. | ||
125 | */ | ||
126 | /* The kernel is broken so disable interrupts */ | ||
127 | local_irq_disable(); | ||
128 | |||
129 | /* Make a note of crashing cpu. Will be used in NMI callback.*/ | ||
130 | crashing_cpu = safe_smp_processor_id(); | ||
131 | nmi_shootdown_cpus(); | ||
132 | lapic_shutdown(); | ||
133 | #if defined(CONFIG_X86_IO_APIC) | ||
134 | disable_IO_APIC(); | ||
135 | #endif | ||
136 | crash_save_cpu(regs, safe_smp_processor_id()); | ||
137 | } | ||
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c new file mode 100644 index 000000000000..3f532df488bc --- /dev/null +++ b/arch/x86/kernel/crash_dump_32.c | |||
@@ -0,0 +1,74 @@ | |||
1 | /* | ||
2 | * kernel/crash_dump.c - Memory preserving reboot related code. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * Copyright (C) IBM Corporation, 2004. All rights reserved | ||
6 | */ | ||
7 | |||
8 | #include <linux/errno.h> | ||
9 | #include <linux/highmem.h> | ||
10 | #include <linux/crash_dump.h> | ||
11 | |||
12 | #include <asm/uaccess.h> | ||
13 | |||
14 | static void *kdump_buf_page; | ||
15 | |||
16 | /** | ||
17 | * copy_oldmem_page - copy one page from "oldmem" | ||
18 | * @pfn: page frame number to be copied | ||
19 | * @buf: target memory address for the copy; this can be in kernel address | ||
20 | * space or user address space (see @userbuf) | ||
21 | * @csize: number of bytes to copy | ||
22 | * @offset: offset in bytes into the page (based on pfn) to begin the copy | ||
23 | * @userbuf: if set, @buf is in user address space, use copy_to_user(), | ||
24 | * otherwise @buf is in kernel address space, use memcpy(). | ||
25 | * | ||
26 | * Copy a page from "oldmem". For this page, there is no pte mapped | ||
27 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
28 | * | ||
29 | * Calling copy_to_user() in atomic context is not desirable. Hence first | ||
30 | * copying the data to a pre-allocated kernel page and then copying to user | ||
31 | * space in non-atomic context. | ||
32 | */ | ||
33 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | ||
34 | size_t csize, unsigned long offset, int userbuf) | ||
35 | { | ||
36 | void *vaddr; | ||
37 | |||
38 | if (!csize) | ||
39 | return 0; | ||
40 | |||
41 | vaddr = kmap_atomic_pfn(pfn, KM_PTE0); | ||
42 | |||
43 | if (!userbuf) { | ||
44 | memcpy(buf, (vaddr + offset), csize); | ||
45 | kunmap_atomic(vaddr, KM_PTE0); | ||
46 | } else { | ||
47 | if (!kdump_buf_page) { | ||
48 | printk(KERN_WARNING "Kdump: Kdump buffer page not" | ||
49 | " allocated\n"); | ||
50 | return -EFAULT; | ||
51 | } | ||
52 | copy_page(kdump_buf_page, vaddr); | ||
53 | kunmap_atomic(vaddr, KM_PTE0); | ||
54 | if (copy_to_user(buf, (kdump_buf_page + offset), csize)) | ||
55 | return -EFAULT; | ||
56 | } | ||
57 | |||
58 | return csize; | ||
59 | } | ||
60 | |||
61 | static int __init kdump_buf_page_init(void) | ||
62 | { | ||
63 | int ret = 0; | ||
64 | |||
65 | kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
66 | if (!kdump_buf_page) { | ||
67 | printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" | ||
68 | " page\n"); | ||
69 | ret = -ENOMEM; | ||
70 | } | ||
71 | |||
72 | return ret; | ||
73 | } | ||
74 | arch_initcall(kdump_buf_page_init); | ||
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c new file mode 100644 index 000000000000..40978af630e7 --- /dev/null +++ b/arch/x86/kernel/doublefault_32.c | |||
@@ -0,0 +1,70 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/init_task.h> | ||
5 | #include <linux/fs.h> | ||
6 | |||
7 | #include <asm/uaccess.h> | ||
8 | #include <asm/pgtable.h> | ||
9 | #include <asm/processor.h> | ||
10 | #include <asm/desc.h> | ||
11 | |||
/* Number of unsigned long slots in the stack used by doublefault_tss. */
#define DOUBLEFAULT_STACKSIZE (1024)
static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
/* Address just past the last element of doublefault_stack. */
#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)

/* True when x lies strictly between PAGE_OFFSET and PAGE_OFFSET + MAXMEM. */
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
17 | |||
18 | static void doublefault_fn(void) | ||
19 | { | ||
20 | struct Xgt_desc_struct gdt_desc = {0, 0}; | ||
21 | unsigned long gdt, tss; | ||
22 | |||
23 | store_gdt(&gdt_desc); | ||
24 | gdt = gdt_desc.address; | ||
25 | |||
26 | printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); | ||
27 | |||
28 | if (ptr_ok(gdt)) { | ||
29 | gdt += GDT_ENTRY_TSS << 3; | ||
30 | tss = *(u16 *)(gdt+2); | ||
31 | tss += *(u8 *)(gdt+4) << 16; | ||
32 | tss += *(u8 *)(gdt+7) << 24; | ||
33 | printk(KERN_EMERG "double fault, tss at %08lx\n", tss); | ||
34 | |||
35 | if (ptr_ok(tss)) { | ||
36 | struct i386_hw_tss *t = (struct i386_hw_tss *)tss; | ||
37 | |||
38 | printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); | ||
39 | |||
40 | printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", | ||
41 | t->eax, t->ebx, t->ecx, t->edx); | ||
42 | printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", | ||
43 | t->esi, t->edi); | ||
44 | } | ||
45 | } | ||
46 | |||
47 | for (;;) | ||
48 | cpu_relax(); | ||
49 | } | ||
50 | |||
/*
 * Statically initialised TSS whose entry point is doublefault_fn() and
 * whose stack pointers refer to the private doublefault_stack.
 * NOTE(review): presumably loaded through a task gate on a double
 * fault; that wiring is not visible in this file -- confirm.
 */
struct tss_struct doublefault_tss __cacheline_aligned = {
	.x86_tss = {
		/* both stack pointers start at the top of doublefault_stack */
		.esp0		= STACK_START,
		.ss0		= __KERNEL_DS,
		.ldt		= 0,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,

		/* execution begins in doublefault_fn() */
		.eip		= (unsigned long) doublefault_fn,
		/* 0x2 bit is always set */
		.eflags		= X86_EFLAGS_SF | 0x2,
		.esp		= STACK_START,
		.es		= __USER_DS,
		.cs		= __KERNEL_CS,
		.ss		= __KERNEL_DS,
		.ds		= __USER_DS,
		.fs		= __KERNEL_PERCPU,

		/* page tables: physical address of swapper_pg_dir */
		.__cr3		= __pa(swapper_pg_dir)
	}
};
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c new file mode 100644 index 000000000000..3c86b979a40a --- /dev/null +++ b/arch/x86/kernel/e820_32.c | |||
@@ -0,0 +1,944 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/bootmem.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <linux/kexec.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/efi.h> | ||
11 | #include <linux/pfn.h> | ||
12 | #include <linux/uaccess.h> | ||
13 | #include <linux/suspend.h> | ||
14 | |||
15 | #include <asm/pgtable.h> | ||
16 | #include <asm/page.h> | ||
17 | #include <asm/e820.h> | ||
18 | #include <asm/setup.h> | ||
19 | |||
#ifdef CONFIG_EFI
/* Non-zero selects the EFI code paths in this file instead of e820. */
int efi_enabled = 0;
EXPORT_SYMBOL(efi_enabled);
#endif

/* The memory map used by the e820 (non-EFI) code paths below. */
struct e820map e820;
/* One start or end address of a BIOS entry, used by sanitize_e820_map(). */
struct change_member {
	struct e820entry *pbios; /* pointer to original bios entry */
	unsigned long long addr; /* address for this change point */
};
/* Init-time scratch space for sanitize_e820_map(). */
static struct change_member change_point_list[2*E820MAX] __initdata;
static struct change_member *change_point[2*E820MAX] __initdata;
static struct e820entry *overlap_list[E820MAX] __initdata;
static struct e820entry new_bios[E820MAX] __initdata;
/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0x10000000;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif
extern int user_defined_memmap;
40 | struct resource data_resource = { | ||
41 | .name = "Kernel data", | ||
42 | .start = 0, | ||
43 | .end = 0, | ||
44 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
45 | }; | ||
46 | |||
47 | struct resource code_resource = { | ||
48 | .name = "Kernel code", | ||
49 | .start = 0, | ||
50 | .end = 0, | ||
51 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
52 | }; | ||
53 | |||
54 | static struct resource system_rom_resource = { | ||
55 | .name = "System ROM", | ||
56 | .start = 0xf0000, | ||
57 | .end = 0xfffff, | ||
58 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
59 | }; | ||
60 | |||
61 | static struct resource extension_rom_resource = { | ||
62 | .name = "Extension ROM", | ||
63 | .start = 0xe0000, | ||
64 | .end = 0xeffff, | ||
65 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
66 | }; | ||
67 | |||
68 | static struct resource adapter_rom_resources[] = { { | ||
69 | .name = "Adapter ROM", | ||
70 | .start = 0xc8000, | ||
71 | .end = 0, | ||
72 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
73 | }, { | ||
74 | .name = "Adapter ROM", | ||
75 | .start = 0, | ||
76 | .end = 0, | ||
77 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
78 | }, { | ||
79 | .name = "Adapter ROM", | ||
80 | .start = 0, | ||
81 | .end = 0, | ||
82 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
83 | }, { | ||
84 | .name = "Adapter ROM", | ||
85 | .start = 0, | ||
86 | .end = 0, | ||
87 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
88 | }, { | ||
89 | .name = "Adapter ROM", | ||
90 | .start = 0, | ||
91 | .end = 0, | ||
92 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
93 | }, { | ||
94 | .name = "Adapter ROM", | ||
95 | .start = 0, | ||
96 | .end = 0, | ||
97 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
98 | } }; | ||
99 | |||
100 | static struct resource video_rom_resource = { | ||
101 | .name = "Video ROM", | ||
102 | .start = 0xc0000, | ||
103 | .end = 0xc7fff, | ||
104 | .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM | ||
105 | }; | ||
106 | |||
107 | static struct resource video_ram_resource = { | ||
108 | .name = "Video RAM area", | ||
109 | .start = 0xa0000, | ||
110 | .end = 0xbffff, | ||
111 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
112 | }; | ||
113 | |||
114 | static struct resource standard_io_resources[] = { { | ||
115 | .name = "dma1", | ||
116 | .start = 0x0000, | ||
117 | .end = 0x001f, | ||
118 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
119 | }, { | ||
120 | .name = "pic1", | ||
121 | .start = 0x0020, | ||
122 | .end = 0x0021, | ||
123 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
124 | }, { | ||
125 | .name = "timer0", | ||
126 | .start = 0x0040, | ||
127 | .end = 0x0043, | ||
128 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
129 | }, { | ||
130 | .name = "timer1", | ||
131 | .start = 0x0050, | ||
132 | .end = 0x0053, | ||
133 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
134 | }, { | ||
135 | .name = "keyboard", | ||
136 | .start = 0x0060, | ||
137 | .end = 0x006f, | ||
138 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
139 | }, { | ||
140 | .name = "dma page reg", | ||
141 | .start = 0x0080, | ||
142 | .end = 0x008f, | ||
143 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
144 | }, { | ||
145 | .name = "pic2", | ||
146 | .start = 0x00a0, | ||
147 | .end = 0x00a1, | ||
148 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
149 | }, { | ||
150 | .name = "dma2", | ||
151 | .start = 0x00c0, | ||
152 | .end = 0x00df, | ||
153 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
154 | }, { | ||
155 | .name = "fpu", | ||
156 | .start = 0x00f0, | ||
157 | .end = 0x00ff, | ||
158 | .flags = IORESOURCE_BUSY | IORESOURCE_IO | ||
159 | } }; | ||
160 | |||
161 | #define ROMSIGNATURE 0xaa55 | ||
162 | |||
163 | static int __init romsignature(const unsigned char *rom) | ||
164 | { | ||
165 | const unsigned short * const ptr = (const unsigned short *)rom; | ||
166 | unsigned short sig; | ||
167 | |||
168 | return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; | ||
169 | } | ||
170 | |||
171 | static int __init romchecksum(const unsigned char *rom, unsigned long length) | ||
172 | { | ||
173 | unsigned char sum, c; | ||
174 | |||
175 | for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) | ||
176 | sum += c; | ||
177 | return !length && !sum; | ||
178 | } | ||
179 | |||
/*
 * Scan the legacy ROM area on 2k boundaries and register what is found
 * in iomem_resource: the video ROM, the system ROM, an optional
 * extension ROM and up to ARRAY_SIZE(adapter_rom_resources) adapter ROMs.
 */
static void __init probe_roms(void)
{
	const unsigned char *rom;
	unsigned long start, length, upper;
	unsigned char c;
	int i;

	/* video rom */
	upper = adapter_rom_resources[0].start;
	for (start = video_rom_resource.start; start < upper; start += 2048) {
		rom = isa_bus_to_virt(start);
		if (!romsignature(rom))
			continue;

		/* signature found: record where the video ROM really starts */
		video_rom_resource.start = start;

		/* byte 2 holds the length in 512-byte units */
		if (probe_kernel_address(rom + 2, c) != 0)
			continue;

		/* 0 < length <= 0x7f * 512, historically */
		length = c * 512;

		/* if checksum okay, trust length byte */
		if (length && romchecksum(rom, length))
			video_rom_resource.end = start + length - 1;

		request_resource(&iomem_resource, &video_rom_resource);
		break;
	}

	/* adapter scan begins at the first 2k boundary past the video ROM,
	 * but never below the preset start of the first adapter slot */
	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
	if (start < upper)
		start = upper;

	/* system rom */
	request_resource(&iomem_resource, &system_rom_resource);
	upper = system_rom_resource.start;

	/* check for extension rom (ignore length byte!) */
	rom = isa_bus_to_virt(extension_rom_resource.start);
	if (romsignature(rom)) {
		length = extension_rom_resource.end - extension_rom_resource.start + 1;
		if (romchecksum(rom, length)) {
			request_resource(&iomem_resource, &extension_rom_resource);
			/* adapter ROMs must now end below the extension ROM */
			upper = extension_rom_resource.start;
		}
	}

	/* check for adapter roms on 2k boundaries */
	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
		rom = isa_bus_to_virt(start);
		if (!romsignature(rom))
			continue;

		if (probe_kernel_address(rom + 2, c) != 0)
			continue;

		/* 0 < length <= 0x7f * 512, historically */
		length = c * 512;

		/* but accept any length that fits if checksum okay */
		if (!length || start + length > upper || !romchecksum(rom, length))
			continue;

		adapter_rom_resources[i].start = start;
		adapter_rom_resources[i].end = start + length - 1;
		request_resource(&iomem_resource, &adapter_rom_resources[i]);

		/* round down to the 2k block holding the ROM's last byte; the
		 * loop's += 2048 then moves to the next boundary after it */
		start = adapter_rom_resources[i++].end & ~2047UL;
	}
}
251 | |||
252 | /* | ||
253 | * Request address space for all standard RAM and ROM resources | ||
254 | * and also for regions reported as reserved by the e820. | ||
255 | */ | ||
256 | static void __init | ||
257 | legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) | ||
258 | { | ||
259 | int i; | ||
260 | |||
261 | probe_roms(); | ||
262 | for (i = 0; i < e820.nr_map; i++) { | ||
263 | struct resource *res; | ||
264 | #ifndef CONFIG_RESOURCES_64BIT | ||
265 | if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) | ||
266 | continue; | ||
267 | #endif | ||
268 | res = kzalloc(sizeof(struct resource), GFP_ATOMIC); | ||
269 | switch (e820.map[i].type) { | ||
270 | case E820_RAM: res->name = "System RAM"; break; | ||
271 | case E820_ACPI: res->name = "ACPI Tables"; break; | ||
272 | case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | ||
273 | default: res->name = "reserved"; | ||
274 | } | ||
275 | res->start = e820.map[i].addr; | ||
276 | res->end = res->start + e820.map[i].size - 1; | ||
277 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
278 | if (request_resource(&iomem_resource, res)) { | ||
279 | kfree(res); | ||
280 | continue; | ||
281 | } | ||
282 | if (e820.map[i].type == E820_RAM) { | ||
283 | /* | ||
284 | * We don't know which RAM region contains kernel data, | ||
285 | * so we try it repeatedly and let the resource manager | ||
286 | * test it. | ||
287 | */ | ||
288 | request_resource(res, code_resource); | ||
289 | request_resource(res, data_resource); | ||
290 | #ifdef CONFIG_KEXEC | ||
291 | request_resource(res, &crashk_res); | ||
292 | #endif | ||
293 | } | ||
294 | } | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Request address space for all standard resources | ||
299 | * | ||
300 | * This is called just before pcibios_init(), which is also a | ||
301 | * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). | ||
302 | */ | ||
303 | static int __init request_standard_resources(void) | ||
304 | { | ||
305 | int i; | ||
306 | |||
307 | printk("Setting up standard PCI resources\n"); | ||
308 | if (efi_enabled) | ||
309 | efi_initialize_iomem_resources(&code_resource, &data_resource); | ||
310 | else | ||
311 | legacy_init_iomem_resources(&code_resource, &data_resource); | ||
312 | |||
313 | /* EFI systems may still have VGA */ | ||
314 | request_resource(&iomem_resource, &video_ram_resource); | ||
315 | |||
316 | /* request I/O space for devices used on all i[345]86 PCs */ | ||
317 | for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) | ||
318 | request_resource(&ioport_resource, &standard_io_resources[i]); | ||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | subsys_initcall(request_standard_resources); | ||
323 | |||
#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
/**
 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
 * correspond to e820 RAM areas and mark the corresponding pages as nosave for
 * hibernation.
 *
 * The e820 map must be sorted and free of overlapping entries, and its
 * first entry is assumed to be RAM. Holes between entries and every
 * non-RAM entry are registered; scanning stops once an entry ends at or
 * above max_low_pfn.
 */
void __init e820_mark_nosave_regions(void)
{
	unsigned long pfn;
	int i = 1;

	/* end of the first entry: where the first possible hole begins */
	pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
	while (i < e820.nr_map) {
		struct e820entry *region = &e820.map[i++];

		/* hole between the previous entry and this one */
		if (PFN_UP(region->addr) > pfn)
			register_nosave_region(pfn, PFN_UP(region->addr));

		pfn = PFN_DOWN(region->addr + region->size);
		/* the entry itself, when it is not RAM */
		if (region->type != E820_RAM)
			register_nosave_region(PFN_UP(region->addr), pfn);

		if (pfn >= max_low_pfn)
			break;
	}
}
#endif
354 | |||
355 | void __init add_memory_region(unsigned long long start, | ||
356 | unsigned long long size, int type) | ||
357 | { | ||
358 | int x; | ||
359 | |||
360 | if (!efi_enabled) { | ||
361 | x = e820.nr_map; | ||
362 | |||
363 | if (x == E820MAX) { | ||
364 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
365 | return; | ||
366 | } | ||
367 | |||
368 | e820.map[x].addr = start; | ||
369 | e820.map[x].size = size; | ||
370 | e820.map[x].type = type; | ||
371 | e820.nr_map++; | ||
372 | } | ||
373 | } /* add_memory_region */ | ||
374 | |||
/*
 * Sanitize the BIOS e820 map.
 *
 * Some e820 responses include overlapping entries.  The following
 * replaces the original e820 map with a new one, removing overlaps.
 *
 * @biosmap: the map to sanitize; overwritten in place on success
 * @pnr_map: in: number of entries in @biosmap; out: number of entries
 *           after sanitizing
 *
 * Returns 0 on success, or -1 (leaving @biosmap untouched) when there
 * are fewer than two entries or an entry's addr + size wraps around.
 */
int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
{
	struct change_member *change_tmp;
	unsigned long current_type, last_type;
	unsigned long long last_addr;
	int chgidx, still_changing;
	int overlap_entries;
	int new_bios_entry;
	int old_nr, new_nr, chg_nr;
	int i;

	/*
		Visually we're performing the following (1,2,3,4 = memory types)...

		Sample memory map (w/overlaps):
		   ____22__________________
		   ______________________4_
		   ____1111________________
		   _44_____________________
		   11111111________________
		   ____________________33__
		   ___________44___________
		   __________33333_________
		   ______________22________
		   ___________________2222_
		   _________111111111______
		   _____________________11_
		   _________________4______

		Sanitized equivalent (no overlap):
		   1_______________________
		   _44_____________________
		   ___1____________________
		   ____22__________________
		   ______11________________
		   _________1______________
		   __________3_____________
		   ___________44___________
		   _____________33_________
		   _______________2________
		   ________________1_______
		   _________________4______
		   ___________________2____
		   ____________________33__
		   ______________________4_
	*/
	/* if there's only one memory region, don't bother */
	if (*pnr_map < 2) {
		return -1;
	}

	old_nr = *pnr_map;

	/* bail out if we find any unreasonable addresses in bios map */
	for (i=0; i<old_nr; i++)
		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
			return -1;
		}

	/* create pointers for initial change-point information (for sorting) */
	for (i=0; i < 2*old_nr; i++)
		change_point[i] = &change_point_list[i];

	/* record all known change-points (starting and ending addresses),
	   omitting those that are for empty memory regions */
	chgidx = 0;
	for (i=0; i < old_nr; i++)	{
		if (biosmap[i].size != 0) {
			change_point[chgidx]->addr = biosmap[i].addr;
			change_point[chgidx++]->pbios = &biosmap[i];
			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
			change_point[chgidx++]->pbios = &biosmap[i];
		}
	}
	chg_nr = chgidx;    	/* true number of change-points */

	/* sort change-point list by memory addresses (low -> high) */
	/* (repeated adjacent-swap passes until a pass makes no swap) */
	still_changing = 1;
	while (still_changing)	{
		still_changing = 0;
		for (i=1; i < chg_nr; i++)  {
			/* if <current_addr> > <last_addr>, swap */
			/* or, if current=<start_addr> & last=<end_addr>, swap */
			if ((change_point[i]->addr < change_point[i-1]->addr) ||
				((change_point[i]->addr == change_point[i-1]->addr) &&
				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
			   )
			{
				change_tmp = change_point[i];
				change_point[i] = change_point[i-1];
				change_point[i-1] = change_tmp;
				still_changing=1;
			}
		}
	}

	/* create a new bios memory map, removing overlaps */
	overlap_entries=0;	 /* number of entries in the overlap table */
	new_bios_entry=0;	 /* index for creating new bios map entries */
	last_type = 0;		 /* start with undefined memory type */
	last_addr = 0;		 /* start with 0 as last starting address */
	/* loop through change-points, determining effect on the new bios map */
	for (chgidx=0; chgidx < chg_nr; chgidx++)
	{
		/* keep track of all overlapping bios entries */
		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
		{
			/* add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
		}
		else
		{
			/* remove entry from list (order independent, so swap with last) */
			for (i=0; i<overlap_entries; i++)
			{
				if (overlap_list[i] == change_point[chgidx]->pbios)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/* if there are overlapping entries, decide which "type" to use */
		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
		current_type = 0;
		for (i=0; i<overlap_entries; i++)
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		/* continue building up new bios map based on this information */
		if (current_type != last_type)	{
			/* close the entry that was open up to this change-point */
			if (last_type != 0)	 {
				new_bios[new_bios_entry].size =
					change_point[chgidx]->addr - last_addr;
				/* move forward only if the new size was non-zero */
				if (new_bios[new_bios_entry].size != 0)
					if (++new_bios_entry >= E820MAX)
						break; 	/* no more space left for new bios entries */
			}
			/* open a new entry starting at this change-point */
			if (current_type != 0)	{
				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
				new_bios[new_bios_entry].type = current_type;
				last_addr=change_point[chgidx]->addr;
			}
			last_type = current_type;
		}
	}
	new_nr = new_bios_entry;   /* retain count for new bios entries */

	/* copy new bios mapping into original location */
	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
	*pnr_map = new_nr;

	return 0;
}
535 | |||
536 | /* | ||
537 | * Copy the BIOS e820 map into a safe place. | ||
538 | * | ||
539 | * Sanity-check it while we're at it.. | ||
540 | * | ||
541 | * If we're lucky and live on a modern system, the setup code | ||
542 | * will have given us a memory map that we can use to properly | ||
543 | * set up memory. If we aren't, we'll fake a memory map. | ||
544 | * | ||
545 | * We check to see that the memory map contains at least 2 elements | ||
546 | * before we'll use it, because the detection code in setup.S may | ||
547 | * not be perfect and most every PC known to man has two memory | ||
548 | * regions: one from 0 to 640k, and one from 1mb up. (The IBM | ||
549 | * thinkpad 560x, for example, does not cooperate with the memory | ||
550 | * detection code.) | ||
551 | */ | ||
552 | int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | ||
553 | { | ||
554 | /* Only one memory region (or negative)? Ignore it */ | ||
555 | if (nr_map < 2) | ||
556 | return -1; | ||
557 | |||
558 | do { | ||
559 | unsigned long long start = biosmap->addr; | ||
560 | unsigned long long size = biosmap->size; | ||
561 | unsigned long long end = start + size; | ||
562 | unsigned long type = biosmap->type; | ||
563 | |||
564 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
565 | if (start > end) | ||
566 | return -1; | ||
567 | |||
568 | /* | ||
569 | * Some BIOSes claim RAM in the 640k - 1M region. | ||
570 | * Not right. Fix it up. | ||
571 | */ | ||
572 | if (type == E820_RAM) { | ||
573 | if (start < 0x100000ULL && end > 0xA0000ULL) { | ||
574 | if (start < 0xA0000ULL) | ||
575 | add_memory_region(start, 0xA0000ULL-start, type); | ||
576 | if (end <= 0x100000ULL) | ||
577 | continue; | ||
578 | start = 0x100000ULL; | ||
579 | size = end - start; | ||
580 | } | ||
581 | } | ||
582 | add_memory_region(start, size, type); | ||
583 | } while (biosmap++,--nr_map); | ||
584 | return 0; | ||
585 | } | ||
586 | |||
587 | /* | ||
588 | * Callback for efi_memory_walk. | ||
589 | */ | ||
590 | static int __init | ||
591 | efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) | ||
592 | { | ||
593 | unsigned long *max_pfn = arg, pfn; | ||
594 | |||
595 | if (start < end) { | ||
596 | pfn = PFN_UP(end -1); | ||
597 | if (pfn > *max_pfn) | ||
598 | *max_pfn = pfn; | ||
599 | } | ||
600 | return 0; | ||
601 | } | ||
602 | |||
603 | static int __init | ||
604 | efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) | ||
605 | { | ||
606 | memory_present(0, PFN_UP(start), PFN_DOWN(end)); | ||
607 | return 0; | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Find the highest page frame number we have available | ||
612 | */ | ||
613 | void __init find_max_pfn(void) | ||
614 | { | ||
615 | int i; | ||
616 | |||
617 | max_pfn = 0; | ||
618 | if (efi_enabled) { | ||
619 | efi_memmap_walk(efi_find_max_pfn, &max_pfn); | ||
620 | efi_memmap_walk(efi_memory_present_wrapper, NULL); | ||
621 | return; | ||
622 | } | ||
623 | |||
624 | for (i = 0; i < e820.nr_map; i++) { | ||
625 | unsigned long start, end; | ||
626 | /* RAM? */ | ||
627 | if (e820.map[i].type != E820_RAM) | ||
628 | continue; | ||
629 | start = PFN_UP(e820.map[i].addr); | ||
630 | end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); | ||
631 | if (start >= end) | ||
632 | continue; | ||
633 | if (end > max_pfn) | ||
634 | max_pfn = end; | ||
635 | memory_present(0, start, end); | ||
636 | } | ||
637 | } | ||
638 | |||
639 | /* | ||
640 | * Free all available memory for boot time allocation. Used | ||
641 | * as a callback function by efi_memory_walk() | ||
642 | */ | ||
643 | |||
644 | static int __init | ||
645 | free_available_memory(unsigned long start, unsigned long end, void *arg) | ||
646 | { | ||
647 | /* check max_low_pfn */ | ||
648 | if (start >= (max_low_pfn << PAGE_SHIFT)) | ||
649 | return 0; | ||
650 | if (end >= (max_low_pfn << PAGE_SHIFT)) | ||
651 | end = max_low_pfn << PAGE_SHIFT; | ||
652 | if (start < end) | ||
653 | free_bootmem(start, end - start); | ||
654 | |||
655 | return 0; | ||
656 | } | ||
657 | /* | ||
658 | * Register fully available low RAM pages with the bootmem allocator. | ||
659 | */ | ||
660 | void __init register_bootmem_low_pages(unsigned long max_low_pfn) | ||
661 | { | ||
662 | int i; | ||
663 | |||
664 | if (efi_enabled) { | ||
665 | efi_memmap_walk(free_available_memory, NULL); | ||
666 | return; | ||
667 | } | ||
668 | for (i = 0; i < e820.nr_map; i++) { | ||
669 | unsigned long curr_pfn, last_pfn, size; | ||
670 | /* | ||
671 | * Reserve usable low memory | ||
672 | */ | ||
673 | if (e820.map[i].type != E820_RAM) | ||
674 | continue; | ||
675 | /* | ||
676 | * We are rounding up the start address of usable memory: | ||
677 | */ | ||
678 | curr_pfn = PFN_UP(e820.map[i].addr); | ||
679 | if (curr_pfn >= max_low_pfn) | ||
680 | continue; | ||
681 | /* | ||
682 | * ... and at the end of the usable range downwards: | ||
683 | */ | ||
684 | last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); | ||
685 | |||
686 | if (last_pfn > max_low_pfn) | ||
687 | last_pfn = max_low_pfn; | ||
688 | |||
689 | /* | ||
690 | * .. finally, did all the rounding and playing | ||
691 | * around just make the area go away? | ||
692 | */ | ||
693 | if (last_pfn <= curr_pfn) | ||
694 | continue; | ||
695 | |||
696 | size = last_pfn - curr_pfn; | ||
697 | free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); | ||
698 | } | ||
699 | } | ||
700 | |||
701 | void __init e820_register_memory(void) | ||
702 | { | ||
703 | unsigned long gapstart, gapsize, round; | ||
704 | unsigned long long last; | ||
705 | int i; | ||
706 | |||
707 | /* | ||
708 | * Search for the bigest gap in the low 32 bits of the e820 | ||
709 | * memory space. | ||
710 | */ | ||
711 | last = 0x100000000ull; | ||
712 | gapstart = 0x10000000; | ||
713 | gapsize = 0x400000; | ||
714 | i = e820.nr_map; | ||
715 | while (--i >= 0) { | ||
716 | unsigned long long start = e820.map[i].addr; | ||
717 | unsigned long long end = start + e820.map[i].size; | ||
718 | |||
719 | /* | ||
720 | * Since "last" is at most 4GB, we know we'll | ||
721 | * fit in 32 bits if this condition is true | ||
722 | */ | ||
723 | if (last > end) { | ||
724 | unsigned long gap = last - end; | ||
725 | |||
726 | if (gap > gapsize) { | ||
727 | gapsize = gap; | ||
728 | gapstart = end; | ||
729 | } | ||
730 | } | ||
731 | if (start < last) | ||
732 | last = start; | ||
733 | } | ||
734 | |||
735 | /* | ||
736 | * See how much we want to round up: start off with | ||
737 | * rounding to the next 1MB area. | ||
738 | */ | ||
739 | round = 0x100000; | ||
740 | while ((gapsize >> 4) > round) | ||
741 | round += round; | ||
742 | /* Fun with two's complement */ | ||
743 | pci_mem_start = (gapstart + round) & -round; | ||
744 | |||
745 | printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", | ||
746 | pci_mem_start, gapstart, gapsize); | ||
747 | } | ||
748 | |||
749 | void __init print_memory_map(char *who) | ||
750 | { | ||
751 | int i; | ||
752 | |||
753 | for (i = 0; i < e820.nr_map; i++) { | ||
754 | printk(" %s: %016Lx - %016Lx ", who, | ||
755 | e820.map[i].addr, | ||
756 | e820.map[i].addr + e820.map[i].size); | ||
757 | switch (e820.map[i].type) { | ||
758 | case E820_RAM: printk("(usable)\n"); | ||
759 | break; | ||
760 | case E820_RESERVED: | ||
761 | printk("(reserved)\n"); | ||
762 | break; | ||
763 | case E820_ACPI: | ||
764 | printk("(ACPI data)\n"); | ||
765 | break; | ||
766 | case E820_NVS: | ||
767 | printk("(ACPI NVS)\n"); | ||
768 | break; | ||
769 | default: printk("type %u\n", e820.map[i].type); | ||
770 | break; | ||
771 | } | ||
772 | } | ||
773 | } | ||
774 | |||
775 | static __init __always_inline void efi_limit_regions(unsigned long long size) | ||
776 | { | ||
777 | unsigned long long current_addr = 0; | ||
778 | efi_memory_desc_t *md, *next_md; | ||
779 | void *p, *p1; | ||
780 | int i, j; | ||
781 | |||
782 | j = 0; | ||
783 | p1 = memmap.map; | ||
784 | for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { | ||
785 | md = p; | ||
786 | next_md = p1; | ||
787 | current_addr = md->phys_addr + | ||
788 | PFN_PHYS(md->num_pages); | ||
789 | if (is_available_memory(md)) { | ||
790 | if (md->phys_addr >= size) continue; | ||
791 | memcpy(next_md, md, memmap.desc_size); | ||
792 | if (current_addr >= size) { | ||
793 | next_md->num_pages -= | ||
794 | PFN_UP(current_addr-size); | ||
795 | } | ||
796 | p1 += memmap.desc_size; | ||
797 | next_md = p1; | ||
798 | j++; | ||
799 | } else if ((md->attribute & EFI_MEMORY_RUNTIME) == | ||
800 | EFI_MEMORY_RUNTIME) { | ||
801 | /* In order to make runtime services | ||
802 | * available we have to include runtime | ||
803 | * memory regions in memory map */ | ||
804 | memcpy(next_md, md, memmap.desc_size); | ||
805 | p1 += memmap.desc_size; | ||
806 | next_md = p1; | ||
807 | j++; | ||
808 | } | ||
809 | } | ||
810 | memmap.nr_map = j; | ||
811 | memmap.map_end = memmap.map + | ||
812 | (memmap.nr_map * memmap.desc_size); | ||
813 | } | ||
814 | |||
815 | void __init limit_regions(unsigned long long size) | ||
816 | { | ||
817 | unsigned long long current_addr; | ||
818 | int i; | ||
819 | |||
820 | print_memory_map("limit_regions start"); | ||
821 | if (efi_enabled) { | ||
822 | efi_limit_regions(size); | ||
823 | return; | ||
824 | } | ||
825 | for (i = 0; i < e820.nr_map; i++) { | ||
826 | current_addr = e820.map[i].addr + e820.map[i].size; | ||
827 | if (current_addr < size) | ||
828 | continue; | ||
829 | |||
830 | if (e820.map[i].type != E820_RAM) | ||
831 | continue; | ||
832 | |||
833 | if (e820.map[i].addr >= size) { | ||
834 | /* | ||
835 | * This region starts past the end of the | ||
836 | * requested size, skip it completely. | ||
837 | */ | ||
838 | e820.nr_map = i; | ||
839 | } else { | ||
840 | e820.nr_map = i + 1; | ||
841 | e820.map[i].size -= current_addr - size; | ||
842 | } | ||
843 | print_memory_map("limit_regions endfor"); | ||
844 | return; | ||
845 | } | ||
846 | print_memory_map("limit_regions endfunc"); | ||
847 | } | ||
848 | |||
849 | /* | ||
850 | * This function checks if any part of the range <start,end> is mapped | ||
851 | * with type. | ||
852 | */ | ||
853 | int | ||
854 | e820_any_mapped(u64 start, u64 end, unsigned type) | ||
855 | { | ||
856 | int i; | ||
857 | for (i = 0; i < e820.nr_map; i++) { | ||
858 | const struct e820entry *ei = &e820.map[i]; | ||
859 | if (type && ei->type != type) | ||
860 | continue; | ||
861 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
862 | continue; | ||
863 | return 1; | ||
864 | } | ||
865 | return 0; | ||
866 | } | ||
867 | EXPORT_SYMBOL_GPL(e820_any_mapped); | ||
868 | |||
869 | /* | ||
870 | * This function checks if the entire range <start,end> is mapped with type. | ||
871 | * | ||
872 | * Note: this function only works correct if the e820 table is sorted and | ||
873 | * not-overlapping, which is the case | ||
874 | */ | ||
875 | int __init | ||
876 | e820_all_mapped(unsigned long s, unsigned long e, unsigned type) | ||
877 | { | ||
878 | u64 start = s; | ||
879 | u64 end = e; | ||
880 | int i; | ||
881 | for (i = 0; i < e820.nr_map; i++) { | ||
882 | struct e820entry *ei = &e820.map[i]; | ||
883 | if (type && ei->type != type) | ||
884 | continue; | ||
885 | /* is the region (part) in overlap with the current region ?*/ | ||
886 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
887 | continue; | ||
888 | /* if the region is at the beginning of <start,end> we move | ||
889 | * start to the end of the region since it's ok until there | ||
890 | */ | ||
891 | if (ei->addr <= start) | ||
892 | start = ei->addr + ei->size; | ||
893 | /* if start is now at or beyond end, we're done, full | ||
894 | * coverage */ | ||
895 | if (start >= end) | ||
896 | return 1; /* we're done */ | ||
897 | } | ||
898 | return 0; | ||
899 | } | ||
900 | |||
901 | static int __init parse_memmap(char *arg) | ||
902 | { | ||
903 | if (!arg) | ||
904 | return -EINVAL; | ||
905 | |||
906 | if (strcmp(arg, "exactmap") == 0) { | ||
907 | #ifdef CONFIG_CRASH_DUMP | ||
908 | /* If we are doing a crash dump, we | ||
909 | * still need to know the real mem | ||
910 | * size before original memory map is | ||
911 | * reset. | ||
912 | */ | ||
913 | find_max_pfn(); | ||
914 | saved_max_pfn = max_pfn; | ||
915 | #endif | ||
916 | e820.nr_map = 0; | ||
917 | user_defined_memmap = 1; | ||
918 | } else { | ||
919 | /* If the user specifies memory size, we | ||
920 | * limit the BIOS-provided memory map to | ||
921 | * that size. exactmap can be used to specify | ||
922 | * the exact map. mem=number can be used to | ||
923 | * trim the existing memory map. | ||
924 | */ | ||
925 | unsigned long long start_at, mem_size; | ||
926 | |||
927 | mem_size = memparse(arg, &arg); | ||
928 | if (*arg == '@') { | ||
929 | start_at = memparse(arg+1, &arg); | ||
930 | add_memory_region(start_at, mem_size, E820_RAM); | ||
931 | } else if (*arg == '#') { | ||
932 | start_at = memparse(arg+1, &arg); | ||
933 | add_memory_region(start_at, mem_size, E820_ACPI); | ||
934 | } else if (*arg == '$') { | ||
935 | start_at = memparse(arg+1, &arg); | ||
936 | add_memory_region(start_at, mem_size, E820_RESERVED); | ||
937 | } else { | ||
938 | limit_regions(mem_size); | ||
939 | user_defined_memmap = 1; | ||
940 | } | ||
941 | } | ||
942 | return 0; | ||
943 | } | ||
944 | early_param("memmap", parse_memmap); | ||
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c new file mode 100644 index 000000000000..92f812ba275c --- /dev/null +++ b/arch/x86/kernel/early_printk.c | |||
@@ -0,0 +1,2 @@ | |||
1 | |||
2 | #include "../../x86_64/kernel/early_printk.c" | ||
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c new file mode 100644 index 000000000000..2452c6fbe992 --- /dev/null +++ b/arch/x86/kernel/efi_32.c | |||
@@ -0,0 +1,712 @@ | |||
1 | /* | ||
2 | * Extensible Firmware Interface | ||
3 | * | ||
4 | * Based on Extensible Firmware Interface Specification version 1.0 | ||
5 | * | ||
6 | * Copyright (C) 1999 VA Linux Systems | ||
7 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
8 | * Copyright (C) 1999-2002 Hewlett-Packard Co. | ||
9 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
10 | * Stephane Eranian <eranian@hpl.hp.com> | ||
11 | * | ||
12 | * All EFI Runtime Services are not implemented yet as EFI only | ||
13 | * supports physical mode addressing on SoftSDV. This is to be fixed | ||
14 | * in a future version. --drummond 1999-07-20 | ||
15 | * | ||
16 | * Implemented EFI runtime services and virtual mode calls. --davidm | ||
17 | * | ||
18 | * Goutham Rao: <goutham.rao@intel.com> | ||
19 | * Skip non-WB memory and ignore empty memory ranges. | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/types.h> | ||
26 | #include <linux/time.h> | ||
27 | #include <linux/spinlock.h> | ||
28 | #include <linux/bootmem.h> | ||
29 | #include <linux/ioport.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/efi.h> | ||
32 | #include <linux/kexec.h> | ||
33 | |||
34 | #include <asm/setup.h> | ||
35 | #include <asm/io.h> | ||
36 | #include <asm/page.h> | ||
37 | #include <asm/pgtable.h> | ||
38 | #include <asm/processor.h> | ||
39 | #include <asm/desc.h> | ||
40 | #include <asm/tlbflush.h> | ||
41 | |||
42 | #define EFI_DEBUG 0 | ||
43 | #define PFX "EFI: " | ||
44 | |||
45 | extern efi_status_t asmlinkage efi_call_phys(void *, ...); | ||
46 | |||
47 | struct efi efi; | ||
48 | EXPORT_SYMBOL(efi); | ||
49 | static struct efi efi_phys; | ||
50 | struct efi_memory_map memmap; | ||
51 | |||
52 | /* | ||
53 | * We require an early boot_ioremap mapping mechanism initially | ||
54 | */ | ||
55 | extern void * boot_ioremap(unsigned long, unsigned long); | ||
56 | |||
57 | /* | ||
58 | * To make EFI call EFI runtime service in physical addressing mode we need | ||
59 | * prelog/epilog before/after the invocation to disable interrupt, to | ||
60 | * claim EFI runtime service handler exclusively and to duplicate a memory in | ||
61 | * low memory space say 0 - 3G. | ||
62 | */ | ||
63 | |||
64 | static unsigned long efi_rt_eflags; | ||
65 | static DEFINE_SPINLOCK(efi_rt_lock); | ||
66 | static pgd_t efi_bak_pg_dir_pointer[2]; | ||
67 | |||
68 | static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) | ||
69 | { | ||
70 | unsigned long cr4; | ||
71 | unsigned long temp; | ||
72 | struct Xgt_desc_struct gdt_descr; | ||
73 | |||
74 | spin_lock(&efi_rt_lock); | ||
75 | local_irq_save(efi_rt_eflags); | ||
76 | |||
77 | /* | ||
78 | * If I don't have PSE, I should just duplicate two entries in page | ||
79 | * directory. If I have PSE, I just need to duplicate one entry in | ||
80 | * page directory. | ||
81 | */ | ||
82 | cr4 = read_cr4(); | ||
83 | |||
84 | if (cr4 & X86_CR4_PSE) { | ||
85 | efi_bak_pg_dir_pointer[0].pgd = | ||
86 | swapper_pg_dir[pgd_index(0)].pgd; | ||
87 | swapper_pg_dir[0].pgd = | ||
88 | swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; | ||
89 | } else { | ||
90 | efi_bak_pg_dir_pointer[0].pgd = | ||
91 | swapper_pg_dir[pgd_index(0)].pgd; | ||
92 | efi_bak_pg_dir_pointer[1].pgd = | ||
93 | swapper_pg_dir[pgd_index(0x400000)].pgd; | ||
94 | swapper_pg_dir[pgd_index(0)].pgd = | ||
95 | swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; | ||
96 | temp = PAGE_OFFSET + 0x400000; | ||
97 | swapper_pg_dir[pgd_index(0x400000)].pgd = | ||
98 | swapper_pg_dir[pgd_index(temp)].pgd; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * After the lock is released, the original page table is restored. | ||
103 | */ | ||
104 | local_flush_tlb(); | ||
105 | |||
106 | gdt_descr.address = __pa(get_cpu_gdt_table(0)); | ||
107 | gdt_descr.size = GDT_SIZE - 1; | ||
108 | load_gdt(&gdt_descr); | ||
109 | } | ||
110 | |||
111 | static void efi_call_phys_epilog(void) __releases(efi_rt_lock) | ||
112 | { | ||
113 | unsigned long cr4; | ||
114 | struct Xgt_desc_struct gdt_descr; | ||
115 | |||
116 | gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); | ||
117 | gdt_descr.size = GDT_SIZE - 1; | ||
118 | load_gdt(&gdt_descr); | ||
119 | |||
120 | cr4 = read_cr4(); | ||
121 | |||
122 | if (cr4 & X86_CR4_PSE) { | ||
123 | swapper_pg_dir[pgd_index(0)].pgd = | ||
124 | efi_bak_pg_dir_pointer[0].pgd; | ||
125 | } else { | ||
126 | swapper_pg_dir[pgd_index(0)].pgd = | ||
127 | efi_bak_pg_dir_pointer[0].pgd; | ||
128 | swapper_pg_dir[pgd_index(0x400000)].pgd = | ||
129 | efi_bak_pg_dir_pointer[1].pgd; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * After the lock is released, the original page table is restored. | ||
134 | */ | ||
135 | local_flush_tlb(); | ||
136 | |||
137 | local_irq_restore(efi_rt_eflags); | ||
138 | spin_unlock(&efi_rt_lock); | ||
139 | } | ||
140 | |||
141 | static efi_status_t | ||
142 | phys_efi_set_virtual_address_map(unsigned long memory_map_size, | ||
143 | unsigned long descriptor_size, | ||
144 | u32 descriptor_version, | ||
145 | efi_memory_desc_t *virtual_map) | ||
146 | { | ||
147 | efi_status_t status; | ||
148 | |||
149 | efi_call_phys_prelog(); | ||
150 | status = efi_call_phys(efi_phys.set_virtual_address_map, | ||
151 | memory_map_size, descriptor_size, | ||
152 | descriptor_version, virtual_map); | ||
153 | efi_call_phys_epilog(); | ||
154 | return status; | ||
155 | } | ||
156 | |||
157 | static efi_status_t | ||
158 | phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) | ||
159 | { | ||
160 | efi_status_t status; | ||
161 | |||
162 | efi_call_phys_prelog(); | ||
163 | status = efi_call_phys(efi_phys.get_time, tm, tc); | ||
164 | efi_call_phys_epilog(); | ||
165 | return status; | ||
166 | } | ||
167 | |||
168 | inline int efi_set_rtc_mmss(unsigned long nowtime) | ||
169 | { | ||
170 | int real_seconds, real_minutes; | ||
171 | efi_status_t status; | ||
172 | efi_time_t eft; | ||
173 | efi_time_cap_t cap; | ||
174 | |||
175 | spin_lock(&efi_rt_lock); | ||
176 | status = efi.get_time(&eft, &cap); | ||
177 | spin_unlock(&efi_rt_lock); | ||
178 | if (status != EFI_SUCCESS) | ||
179 | panic("Ooops, efitime: can't read time!\n"); | ||
180 | real_seconds = nowtime % 60; | ||
181 | real_minutes = nowtime / 60; | ||
182 | |||
183 | if (((abs(real_minutes - eft.minute) + 15)/30) & 1) | ||
184 | real_minutes += 30; | ||
185 | real_minutes %= 60; | ||
186 | |||
187 | eft.minute = real_minutes; | ||
188 | eft.second = real_seconds; | ||
189 | |||
190 | if (status != EFI_SUCCESS) { | ||
191 | printk("Ooops: efitime: can't read time!\n"); | ||
192 | return -1; | ||
193 | } | ||
194 | return 0; | ||
195 | } | ||
196 | /* | ||
197 | * This is used during kernel init before runtime | ||
198 | * services have been remapped and also during suspend, therefore, | ||
199 | * we'll need to call both in physical and virtual modes. | ||
200 | */ | ||
201 | inline unsigned long efi_get_time(void) | ||
202 | { | ||
203 | efi_status_t status; | ||
204 | efi_time_t eft; | ||
205 | efi_time_cap_t cap; | ||
206 | |||
207 | if (efi.get_time) { | ||
208 | /* if we are in virtual mode use remapped function */ | ||
209 | status = efi.get_time(&eft, &cap); | ||
210 | } else { | ||
211 | /* we are in physical mode */ | ||
212 | status = phys_efi_get_time(&eft, &cap); | ||
213 | } | ||
214 | |||
215 | if (status != EFI_SUCCESS) | ||
216 | printk("Oops: efitime: can't read time status: 0x%lx\n",status); | ||
217 | |||
218 | return mktime(eft.year, eft.month, eft.day, eft.hour, | ||
219 | eft.minute, eft.second); | ||
220 | } | ||
221 | |||
222 | int is_available_memory(efi_memory_desc_t * md) | ||
223 | { | ||
224 | if (!(md->attribute & EFI_MEMORY_WB)) | ||
225 | return 0; | ||
226 | |||
227 | switch (md->type) { | ||
228 | case EFI_LOADER_CODE: | ||
229 | case EFI_LOADER_DATA: | ||
230 | case EFI_BOOT_SERVICES_CODE: | ||
231 | case EFI_BOOT_SERVICES_DATA: | ||
232 | case EFI_CONVENTIONAL_MEMORY: | ||
233 | return 1; | ||
234 | } | ||
235 | return 0; | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * We need to map the EFI memory map again after paging_init(). | ||
240 | */ | ||
241 | void __init efi_map_memmap(void) | ||
242 | { | ||
243 | memmap.map = NULL; | ||
244 | |||
245 | memmap.map = bt_ioremap((unsigned long) memmap.phys_map, | ||
246 | (memmap.nr_map * memmap.desc_size)); | ||
247 | if (memmap.map == NULL) | ||
248 | printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); | ||
249 | |||
250 | memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); | ||
251 | } | ||
252 | |||
253 | #if EFI_DEBUG | ||
254 | static void __init print_efi_memmap(void) | ||
255 | { | ||
256 | efi_memory_desc_t *md; | ||
257 | void *p; | ||
258 | int i; | ||
259 | |||
260 | for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { | ||
261 | md = p; | ||
262 | printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " | ||
263 | "range=[0x%016llx-0x%016llx) (%lluMB)\n", | ||
264 | i, md->type, md->attribute, md->phys_addr, | ||
265 | md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), | ||
266 | (md->num_pages >> (20 - EFI_PAGE_SHIFT))); | ||
267 | } | ||
268 | } | ||
269 | #endif /* EFI_DEBUG */ | ||
270 | |||
271 | /* | ||
272 | * Walks the EFI memory map and calls CALLBACK once for each EFI | ||
273 | * memory descriptor that has memory that is available for kernel use. | ||
274 | */ | ||
275 | void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) | ||
276 | { | ||
277 | int prev_valid = 0; | ||
278 | struct range { | ||
279 | unsigned long start; | ||
280 | unsigned long end; | ||
281 | } uninitialized_var(prev), curr; | ||
282 | efi_memory_desc_t *md; | ||
283 | unsigned long start, end; | ||
284 | void *p; | ||
285 | |||
286 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
287 | md = p; | ||
288 | |||
289 | if ((md->num_pages == 0) || (!is_available_memory(md))) | ||
290 | continue; | ||
291 | |||
292 | curr.start = md->phys_addr; | ||
293 | curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); | ||
294 | |||
295 | if (!prev_valid) { | ||
296 | prev = curr; | ||
297 | prev_valid = 1; | ||
298 | } else { | ||
299 | if (curr.start < prev.start) | ||
300 | printk(KERN_INFO PFX "Unordered memory map\n"); | ||
301 | if (prev.end == curr.start) | ||
302 | prev.end = curr.end; | ||
303 | else { | ||
304 | start = | ||
305 | (unsigned long) (PAGE_ALIGN(prev.start)); | ||
306 | end = (unsigned long) (prev.end & PAGE_MASK); | ||
307 | if ((end > start) | ||
308 | && (*callback) (start, end, arg) < 0) | ||
309 | return; | ||
310 | prev = curr; | ||
311 | } | ||
312 | } | ||
313 | } | ||
314 | if (prev_valid) { | ||
315 | start = (unsigned long) PAGE_ALIGN(prev.start); | ||
316 | end = (unsigned long) (prev.end & PAGE_MASK); | ||
317 | if (end > start) | ||
318 | (*callback) (start, end, arg); | ||
319 | } | ||
320 | } | ||
321 | |||
322 | void __init efi_init(void) | ||
323 | { | ||
324 | efi_config_table_t *config_tables; | ||
325 | efi_runtime_services_t *runtime; | ||
326 | efi_char16_t *c16; | ||
327 | char vendor[100] = "unknown"; | ||
328 | unsigned long num_config_tables; | ||
329 | int i = 0; | ||
330 | |||
331 | memset(&efi, 0, sizeof(efi) ); | ||
332 | memset(&efi_phys, 0, sizeof(efi_phys)); | ||
333 | |||
334 | efi_phys.systab = EFI_SYSTAB; | ||
335 | memmap.phys_map = EFI_MEMMAP; | ||
336 | memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE; | ||
337 | memmap.desc_version = EFI_MEMDESC_VERSION; | ||
338 | memmap.desc_size = EFI_MEMDESC_SIZE; | ||
339 | |||
340 | efi.systab = (efi_system_table_t *) | ||
341 | boot_ioremap((unsigned long) efi_phys.systab, | ||
342 | sizeof(efi_system_table_t)); | ||
343 | /* | ||
344 | * Verify the EFI Table | ||
345 | */ | ||
346 | if (efi.systab == NULL) | ||
347 | printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n"); | ||
348 | if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) | ||
349 | printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n"); | ||
350 | if ((efi.systab->hdr.revision >> 16) == 0) | ||
351 | printk(KERN_ERR PFX "Warning: EFI system table version " | ||
352 | "%d.%02d, expected 1.00 or greater\n", | ||
353 | efi.systab->hdr.revision >> 16, | ||
354 | efi.systab->hdr.revision & 0xffff); | ||
355 | |||
356 | /* | ||
357 | * Grab some details from the system table | ||
358 | */ | ||
359 | num_config_tables = efi.systab->nr_tables; | ||
360 | config_tables = (efi_config_table_t *)efi.systab->tables; | ||
361 | runtime = efi.systab->runtime; | ||
362 | |||
363 | /* | ||
364 | * Show what we know for posterity | ||
365 | */ | ||
366 | c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2); | ||
367 | if (c16) { | ||
368 | for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i) | ||
369 | vendor[i] = *c16++; | ||
370 | vendor[i] = '\0'; | ||
371 | } else | ||
372 | printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); | ||
373 | |||
374 | printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n", | ||
375 | efi.systab->hdr.revision >> 16, | ||
376 | efi.systab->hdr.revision & 0xffff, vendor); | ||
377 | |||
378 | /* | ||
379 | * Let's see what config tables the firmware passed to us. | ||
380 | */ | ||
381 | config_tables = (efi_config_table_t *) | ||
382 | boot_ioremap((unsigned long) config_tables, | ||
383 | num_config_tables * sizeof(efi_config_table_t)); | ||
384 | |||
385 | if (config_tables == NULL) | ||
386 | printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n"); | ||
387 | |||
388 | efi.mps = EFI_INVALID_TABLE_ADDR; | ||
389 | efi.acpi = EFI_INVALID_TABLE_ADDR; | ||
390 | efi.acpi20 = EFI_INVALID_TABLE_ADDR; | ||
391 | efi.smbios = EFI_INVALID_TABLE_ADDR; | ||
392 | efi.sal_systab = EFI_INVALID_TABLE_ADDR; | ||
393 | efi.boot_info = EFI_INVALID_TABLE_ADDR; | ||
394 | efi.hcdp = EFI_INVALID_TABLE_ADDR; | ||
395 | efi.uga = EFI_INVALID_TABLE_ADDR; | ||
396 | |||
397 | for (i = 0; i < num_config_tables; i++) { | ||
398 | if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { | ||
399 | efi.mps = config_tables[i].table; | ||
400 | printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table); | ||
401 | } else | ||
402 | if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { | ||
403 | efi.acpi20 = config_tables[i].table; | ||
404 | printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table); | ||
405 | } else | ||
406 | if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { | ||
407 | efi.acpi = config_tables[i].table; | ||
408 | printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table); | ||
409 | } else | ||
410 | if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { | ||
411 | efi.smbios = config_tables[i].table; | ||
412 | printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table); | ||
413 | } else | ||
414 | if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { | ||
415 | efi.hcdp = config_tables[i].table; | ||
416 | printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table); | ||
417 | } else | ||
418 | if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) { | ||
419 | efi.uga = config_tables[i].table; | ||
420 | printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table); | ||
421 | } | ||
422 | } | ||
423 | printk("\n"); | ||
424 | |||
425 | /* | ||
426 | * Check out the runtime services table. We need to map | ||
427 | * the runtime services table so that we can grab the physical | ||
428 | * address of several of the EFI runtime functions, needed to | ||
429 | * set the firmware into virtual mode. | ||
430 | */ | ||
431 | |||
432 | runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long) | ||
433 | runtime, | ||
434 | sizeof(efi_runtime_services_t)); | ||
435 | if (runtime != NULL) { | ||
436 | /* | ||
437 | * We will only need *early* access to the following | ||
438 | * two EFI runtime services before set_virtual_address_map | ||
439 | * is invoked. | ||
440 | */ | ||
441 | efi_phys.get_time = (efi_get_time_t *) runtime->get_time; | ||
442 | efi_phys.set_virtual_address_map = | ||
443 | (efi_set_virtual_address_map_t *) | ||
444 | runtime->set_virtual_address_map; | ||
445 | } else | ||
446 | printk(KERN_ERR PFX "Could not map the runtime service table!\n"); | ||
447 | |||
448 | /* Map the EFI memory map for use until paging_init() */ | ||
449 | memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); | ||
450 | if (memmap.map == NULL) | ||
451 | printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); | ||
452 | |||
453 | memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); | ||
454 | |||
455 | #if EFI_DEBUG | ||
456 | print_efi_memmap(); | ||
457 | #endif | ||
458 | } | ||
459 | |||
460 | static inline void __init check_range_for_systab(efi_memory_desc_t *md) | ||
461 | { | ||
462 | if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) && | ||
463 | ((unsigned long)efi_phys.systab < md->phys_addr + | ||
464 | ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) { | ||
465 | unsigned long addr; | ||
466 | |||
467 | addr = md->virt_addr - md->phys_addr + | ||
468 | (unsigned long)efi_phys.systab; | ||
469 | efi.systab = (efi_system_table_t *)addr; | ||
470 | } | ||
471 | } | ||
472 | |||
473 | /* | ||
474 | * Wrap all the virtual calls in a way that forces the parameters on the stack. | ||
475 | */ | ||
476 | |||
477 | #define efi_call_virt(f, args...) \ | ||
478 | ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) | ||
479 | |||
480 | static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) | ||
481 | { | ||
482 | return efi_call_virt(get_time, tm, tc); | ||
483 | } | ||
484 | |||
485 | static efi_status_t virt_efi_set_time (efi_time_t *tm) | ||
486 | { | ||
487 | return efi_call_virt(set_time, tm); | ||
488 | } | ||
489 | |||
490 | static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled, | ||
491 | efi_bool_t *pending, | ||
492 | efi_time_t *tm) | ||
493 | { | ||
494 | return efi_call_virt(get_wakeup_time, enabled, pending, tm); | ||
495 | } | ||
496 | |||
497 | static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled, | ||
498 | efi_time_t *tm) | ||
499 | { | ||
500 | return efi_call_virt(set_wakeup_time, enabled, tm); | ||
501 | } | ||
502 | |||
503 | static efi_status_t virt_efi_get_variable (efi_char16_t *name, | ||
504 | efi_guid_t *vendor, u32 *attr, | ||
505 | unsigned long *data_size, void *data) | ||
506 | { | ||
507 | return efi_call_virt(get_variable, name, vendor, attr, data_size, data); | ||
508 | } | ||
509 | |||
510 | static efi_status_t virt_efi_get_next_variable (unsigned long *name_size, | ||
511 | efi_char16_t *name, | ||
512 | efi_guid_t *vendor) | ||
513 | { | ||
514 | return efi_call_virt(get_next_variable, name_size, name, vendor); | ||
515 | } | ||
516 | |||
517 | static efi_status_t virt_efi_set_variable (efi_char16_t *name, | ||
518 | efi_guid_t *vendor, | ||
519 | unsigned long attr, | ||
520 | unsigned long data_size, void *data) | ||
521 | { | ||
522 | return efi_call_virt(set_variable, name, vendor, attr, data_size, data); | ||
523 | } | ||
524 | |||
525 | static efi_status_t virt_efi_get_next_high_mono_count (u32 *count) | ||
526 | { | ||
527 | return efi_call_virt(get_next_high_mono_count, count); | ||
528 | } | ||
529 | |||
530 | static void virt_efi_reset_system (int reset_type, efi_status_t status, | ||
531 | unsigned long data_size, | ||
532 | efi_char16_t *data) | ||
533 | { | ||
534 | efi_call_virt(reset_system, reset_type, status, data_size, data); | ||
535 | } | ||
536 | |||
537 | /* | ||
538 | * This function will switch the EFI runtime services to virtual mode. | ||
539 | * Essentially, look through the EFI memmap and map every region that | ||
540 | * has the runtime attribute bit set in its memory descriptor and update | ||
541 | * that memory descriptor with the virtual address obtained from ioremap(). | ||
542 | * This enables the runtime services to be called without having to | ||
543 | * thunk back into physical mode for every invocation. | ||
544 | */ | ||
545 | |||
546 | void __init efi_enter_virtual_mode(void) | ||
547 | { | ||
548 | efi_memory_desc_t *md; | ||
549 | efi_status_t status; | ||
550 | void *p; | ||
551 | |||
552 | efi.systab = NULL; | ||
553 | |||
554 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
555 | md = p; | ||
556 | |||
557 | if (!(md->attribute & EFI_MEMORY_RUNTIME)) | ||
558 | continue; | ||
559 | |||
560 | md->virt_addr = (unsigned long)ioremap(md->phys_addr, | ||
561 | md->num_pages << EFI_PAGE_SHIFT); | ||
562 | if (!(unsigned long)md->virt_addr) { | ||
563 | printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", | ||
564 | (unsigned long)md->phys_addr); | ||
565 | } | ||
566 | /* update the virtual address of the EFI system table */ | ||
567 | check_range_for_systab(md); | ||
568 | } | ||
569 | |||
570 | BUG_ON(!efi.systab); | ||
571 | |||
572 | status = phys_efi_set_virtual_address_map( | ||
573 | memmap.desc_size * memmap.nr_map, | ||
574 | memmap.desc_size, | ||
575 | memmap.desc_version, | ||
576 | memmap.phys_map); | ||
577 | |||
578 | if (status != EFI_SUCCESS) { | ||
579 | printk (KERN_ALERT "You are screwed! " | ||
580 | "Unable to switch EFI into virtual mode " | ||
581 | "(status=%lx)\n", status); | ||
582 | panic("EFI call to SetVirtualAddressMap() failed!"); | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Now that EFI is in virtual mode, update the function | ||
587 | * pointers in the runtime service table to the new virtual addresses. | ||
588 | */ | ||
589 | |||
590 | efi.get_time = virt_efi_get_time; | ||
591 | efi.set_time = virt_efi_set_time; | ||
592 | efi.get_wakeup_time = virt_efi_get_wakeup_time; | ||
593 | efi.set_wakeup_time = virt_efi_set_wakeup_time; | ||
594 | efi.get_variable = virt_efi_get_variable; | ||
595 | efi.get_next_variable = virt_efi_get_next_variable; | ||
596 | efi.set_variable = virt_efi_set_variable; | ||
597 | efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; | ||
598 | efi.reset_system = virt_efi_reset_system; | ||
599 | } | ||
600 | |||
601 | void __init | ||
602 | efi_initialize_iomem_resources(struct resource *code_resource, | ||
603 | struct resource *data_resource) | ||
604 | { | ||
605 | struct resource *res; | ||
606 | efi_memory_desc_t *md; | ||
607 | void *p; | ||
608 | |||
609 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
610 | md = p; | ||
611 | |||
612 | if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > | ||
613 | 0x100000000ULL) | ||
614 | continue; | ||
615 | res = kzalloc(sizeof(struct resource), GFP_ATOMIC); | ||
616 | switch (md->type) { | ||
617 | case EFI_RESERVED_TYPE: | ||
618 | res->name = "Reserved Memory"; | ||
619 | break; | ||
620 | case EFI_LOADER_CODE: | ||
621 | res->name = "Loader Code"; | ||
622 | break; | ||
623 | case EFI_LOADER_DATA: | ||
624 | res->name = "Loader Data"; | ||
625 | break; | ||
626 | case EFI_BOOT_SERVICES_DATA: | ||
627 | res->name = "BootServices Data"; | ||
628 | break; | ||
629 | case EFI_BOOT_SERVICES_CODE: | ||
630 | res->name = "BootServices Code"; | ||
631 | break; | ||
632 | case EFI_RUNTIME_SERVICES_CODE: | ||
633 | res->name = "Runtime Service Code"; | ||
634 | break; | ||
635 | case EFI_RUNTIME_SERVICES_DATA: | ||
636 | res->name = "Runtime Service Data"; | ||
637 | break; | ||
638 | case EFI_CONVENTIONAL_MEMORY: | ||
639 | res->name = "Conventional Memory"; | ||
640 | break; | ||
641 | case EFI_UNUSABLE_MEMORY: | ||
642 | res->name = "Unusable Memory"; | ||
643 | break; | ||
644 | case EFI_ACPI_RECLAIM_MEMORY: | ||
645 | res->name = "ACPI Reclaim"; | ||
646 | break; | ||
647 | case EFI_ACPI_MEMORY_NVS: | ||
648 | res->name = "ACPI NVS"; | ||
649 | break; | ||
650 | case EFI_MEMORY_MAPPED_IO: | ||
651 | res->name = "Memory Mapped IO"; | ||
652 | break; | ||
653 | case EFI_MEMORY_MAPPED_IO_PORT_SPACE: | ||
654 | res->name = "Memory Mapped IO Port Space"; | ||
655 | break; | ||
656 | default: | ||
657 | res->name = "Reserved"; | ||
658 | break; | ||
659 | } | ||
660 | res->start = md->phys_addr; | ||
661 | res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1); | ||
662 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
663 | if (request_resource(&iomem_resource, res) < 0) | ||
664 | printk(KERN_ERR PFX "Failed to allocate res %s : " | ||
665 | "0x%llx-0x%llx\n", res->name, | ||
666 | (unsigned long long)res->start, | ||
667 | (unsigned long long)res->end); | ||
668 | /* | ||
669 | * We don't know which region contains kernel data so we try | ||
670 | * it repeatedly and let the resource manager test it. | ||
671 | */ | ||
672 | if (md->type == EFI_CONVENTIONAL_MEMORY) { | ||
673 | request_resource(res, code_resource); | ||
674 | request_resource(res, data_resource); | ||
675 | #ifdef CONFIG_KEXEC | ||
676 | request_resource(res, &crashk_res); | ||
677 | #endif | ||
678 | } | ||
679 | } | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * Convenience functions to obtain memory types and attributes | ||
684 | */ | ||
685 | |||
686 | u32 efi_mem_type(unsigned long phys_addr) | ||
687 | { | ||
688 | efi_memory_desc_t *md; | ||
689 | void *p; | ||
690 | |||
691 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
692 | md = p; | ||
693 | if ((md->phys_addr <= phys_addr) && (phys_addr < | ||
694 | (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) | ||
695 | return md->type; | ||
696 | } | ||
697 | return 0; | ||
698 | } | ||
699 | |||
700 | u64 efi_mem_attributes(unsigned long phys_addr) | ||
701 | { | ||
702 | efi_memory_desc_t *md; | ||
703 | void *p; | ||
704 | |||
705 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
706 | md = p; | ||
707 | if ((md->phys_addr <= phys_addr) && (phys_addr < | ||
708 | (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) | ||
709 | return md->attribute; | ||
710 | } | ||
711 | return 0; | ||
712 | } | ||
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S new file mode 100644 index 000000000000..ef00bb77d7e4 --- /dev/null +++ b/arch/x86/kernel/efi_stub_32.S | |||
@@ -0,0 +1,122 @@ | |||
1 | /* | ||
2 | * EFI call stub for IA32. | ||
3 | * | ||
4 | * This stub allows us to make EFI calls in physical mode with interrupts | ||
5 | * turned off. | ||
6 | */ | ||
7 | |||
8 | #include <linux/linkage.h> | ||
9 | #include <asm/page.h> | ||
10 | |||
11 | /* | ||
12 | * efi_call_phys(void *, ...) is a function with variable parameters. | ||
13 | * All the callers of this function assure that all the parameters are 4-bytes. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. | ||
18 | * So we'd better save all of them at the beginning of this function and restore | ||
19 | * at the end no matter how many we use, because we can not assure EFI runtime | ||
20 | * service functions will comply with gcc calling convention, too. | ||
21 | */ | ||
22 | |||
23 | .text | ||
24 | ENTRY(efi_call_phys) | ||
25 | /* | ||
26 | * 0. This function is only called from within the Linux kernel, so CS | ||
27 | * is already 0x0010 and DS and SS are 0x0018. The original author found | ||
28 | * that EFI uses the same selector values with identical GDT entries, | ||
29 | * so the segment registers and GDT contents are left alone here; only | ||
30 | * the GDT base register is changed, and that happens in prelog and epilog. | ||
31 | */ | ||
32 | | ||
33 | /* | ||
34 | * 1. We are running with EIP = <physical address> + PAGE_OFFSET. | ||
35 | * Jump to the physical alias of the next instruction so that execution | ||
36 | * switches smoothly from virtual mode to flat mode. The mapping of | ||
37 | * lower virtual memory needed for this has been created in prelog and epilog. | ||
38 | */ | ||
39 | movl $1f, %edx | ||
40 | subl $__PAGE_OFFSET, %edx | ||
41 | jmp *%edx | ||
42 | 1: | ||
43 | | ||
44 | /* | ||
45 | * 2. The top of the stack now holds the return address into the | ||
46 | * caller of efi_call_phys(), followed by parameter 1, parameter 2, | ||
47 | * ..., parameter n. To make things easy, the return address is saved | ||
48 | * in a global variable (saved_return_addr). | ||
49 | */ | ||
50 | popl %edx | ||
51 | movl %edx, saved_return_addr | ||
52 | /* pop parameter 1 (the EFI function pointer) into ECX and save it; the physical address of label 2 is pushed in its place as the callee's return address */ | ||
53 | popl %ecx | ||
54 | movl %ecx, efi_rt_function_ptr | ||
55 | movl $2f, %edx | ||
56 | subl $__PAGE_OFFSET, %edx | ||
57 | pushl %edx | ||
58 | | ||
59 | /* | ||
60 | * 3. Clear the PG bit (bit 31) in %CR0. | ||
61 | */ | ||
62 | movl %cr0, %edx | ||
63 | andl $0x7fffffff, %edx | ||
64 | movl %edx, %cr0 | ||
65 | jmp 1f | ||
66 | 1: | ||
67 | | ||
68 | /* | ||
69 | * 4. Convert the stack pointer from a virtual to a physical address. | ||
70 | */ | ||
71 | subl $__PAGE_OFFSET, %esp | ||
72 | | ||
73 | /* | ||
74 | * 5. Jump to the physical function; it returns to label 2 via the address pushed in step 2. | ||
75 | */ | ||
76 | jmp *%ecx | ||
77 | | ||
78 | 2: | ||
79 | /* | ||
80 | * 6. When the EFI runtime service returns, control arrives at the | ||
81 | * following instruction. Convert the stack pointer back to virtual first. | ||
82 | */ | ||
83 | addl $__PAGE_OFFSET, %esp | ||
84 | | ||
85 | /* | ||
86 | * 7. Set the PG bit (bit 31) in %CR0 again. | ||
87 | */ | ||
88 | movl %cr0, %edx | ||
89 | orl $0x80000000, %edx | ||
90 | movl %edx, %cr0 | ||
91 | jmp 1f | ||
92 | 1: | ||
93 | /* | ||
94 | * 8. Switch back from flat mode to virtual mode by jumping to the | ||
95 | * virtual (PAGE_OFFSET-based) address of the next instruction. | ||
96 | */ | ||
97 | movl $1f, %edx | ||
98 | jmp *%edx | ||
99 | 1: | ||
100 | | ||
101 | /* | ||
102 | * 9. Balance the stack: push back the function pointer that was popped | ||
103 | * in step 2. EAX contains the return value, so it is not touched here. | ||
104 | */ | ||
105 | leal efi_rt_function_ptr, %edx | ||
106 | movl (%edx), %ecx | ||
107 | pushl %ecx | ||
108 | | ||
109 | /* | ||
110 | * 10. Push the saved return address onto the stack and return. | ||
111 | */ | ||
112 | leal saved_return_addr, %edx | ||
113 | movl (%edx), %ecx | ||
114 | pushl %ecx | ||
115 | ret | ||
116 | .previous | ||
117 | | ||
118 | .data | ||
119 | saved_return_addr: /* return address into the caller, stored in step 2 */ | ||
120 | .long 0 | ||
121 | efi_rt_function_ptr: /* EFI function pointer (parameter 1), stored in step 2 */ | ||
122 | .long 0 | ||
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S new file mode 100644 index 000000000000..290b7bc82da3 --- /dev/null +++ b/arch/x86/kernel/entry_32.S | |||
@@ -0,0 +1,1112 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/entry.S | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * entry.S contains the system-call and fault low-level handling routines. | ||
9 | * This also contains the timer-interrupt handler, as well as all interrupts | ||
10 | * and faults that can result in a task-switch. | ||
11 | * | ||
12 | * NOTE: This code handles signal-recognition, which happens every time | ||
13 | * after a timer-interrupt and after each system call. | ||
14 | * | ||
15 | * I changed all the .align's to 4 (16 byte alignment), as that's faster | ||
16 | * on a 486. | ||
17 | * | ||
18 | * Stack layout in 'syscall_exit': | ||
19 | * ptrace needs to have all regs on the stack. | ||
20 | * if the order here is changed, it needs to be | ||
21 | * updated in fork.c:copy_process, signal.c:do_signal, | ||
22 | * ptrace.c and ptrace.h | ||
23 | * | ||
24 | * 0(%esp) - %ebx | ||
25 | * 4(%esp) - %ecx | ||
26 | * 8(%esp) - %edx | ||
27 | * C(%esp) - %esi | ||
28 | * 10(%esp) - %edi | ||
29 | * 14(%esp) - %ebp | ||
30 | * 18(%esp) - %eax | ||
31 | * 1C(%esp) - %ds | ||
32 | * 20(%esp) - %es | ||
33 | * 24(%esp) - %fs | ||
34 | * 28(%esp) - orig_eax | ||
35 | * 2C(%esp) - %eip | ||
36 | * 30(%esp) - %cs | ||
37 | * 34(%esp) - %eflags | ||
38 | * 38(%esp) - %oldesp | ||
39 | * 3C(%esp) - %oldss | ||
40 | * | ||
41 | * "current" is in register %ebx during any slow entries. | ||
42 | */ | ||
43 | |||
44 | #include <linux/linkage.h> | ||
45 | #include <asm/thread_info.h> | ||
46 | #include <asm/irqflags.h> | ||
47 | #include <asm/errno.h> | ||
48 | #include <asm/segment.h> | ||
49 | #include <asm/smp.h> | ||
50 | #include <asm/page.h> | ||
51 | #include <asm/desc.h> | ||
52 | #include <asm/percpu.h> | ||
53 | #include <asm/dwarf2.h> | ||
54 | #include "irq_vectors.h" | ||
55 | |||
56 | /* | ||
57 | * We use macros for low-level operations which need to be overridden | ||
58 | * for paravirtualization. The following will never clobber any registers: | ||
59 | * INTERRUPT_RETURN (aka. "iret") | ||
60 | * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") | ||
61 | * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). | ||
62 | * | ||
63 | * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must | ||
64 | * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). | ||
65 | * Allowing a register to be clobbered can shrink the paravirt replacement | ||
66 | * enough to patch inline, increasing performance. | ||
67 | */ | ||
68 | |||
69 | #define nr_syscalls ((syscall_table_size)/4) | ||
70 | |||
71 | CF_MASK = 0x00000001 | ||
72 | TF_MASK = 0x00000100 | ||
73 | IF_MASK = 0x00000200 | ||
74 | DF_MASK = 0x00000400 | ||
75 | NT_MASK = 0x00004000 | ||
76 | VM_MASK = 0x00020000 | ||
77 | |||
78 | #ifdef CONFIG_PREEMPT | ||
79 | #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF | ||
80 | #else | ||
81 | #define preempt_stop(clobbers) | ||
82 | #define resume_kernel restore_nocheck | ||
83 | #endif | ||
84 | |||
85 | .macro TRACE_IRQS_IRET | ||
86 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
87 | testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? | ||
88 | jz 1f | ||
89 | TRACE_IRQS_ON | ||
90 | 1: | ||
91 | #endif | ||
92 | .endm | ||
93 | |||
94 | #ifdef CONFIG_VM86 | ||
95 | #define resume_userspace_sig check_userspace | ||
96 | #else | ||
97 | #define resume_userspace_sig resume_userspace | ||
98 | #endif | ||
99 | |||
100 | #define SAVE_ALL \ | ||
101 | cld; \ | ||
102 | pushl %fs; \ | ||
103 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
104 | /*CFI_REL_OFFSET fs, 0;*/\ | ||
105 | pushl %es; \ | ||
106 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
107 | /*CFI_REL_OFFSET es, 0;*/\ | ||
108 | pushl %ds; \ | ||
109 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
110 | /*CFI_REL_OFFSET ds, 0;*/\ | ||
111 | pushl %eax; \ | ||
112 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
113 | CFI_REL_OFFSET eax, 0;\ | ||
114 | pushl %ebp; \ | ||
115 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
116 | CFI_REL_OFFSET ebp, 0;\ | ||
117 | pushl %edi; \ | ||
118 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
119 | CFI_REL_OFFSET edi, 0;\ | ||
120 | pushl %esi; \ | ||
121 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
122 | CFI_REL_OFFSET esi, 0;\ | ||
123 | pushl %edx; \ | ||
124 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
125 | CFI_REL_OFFSET edx, 0;\ | ||
126 | pushl %ecx; \ | ||
127 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
128 | CFI_REL_OFFSET ecx, 0;\ | ||
129 | pushl %ebx; \ | ||
130 | CFI_ADJUST_CFA_OFFSET 4;\ | ||
131 | CFI_REL_OFFSET ebx, 0;\ | ||
132 | movl $(__USER_DS), %edx; \ | ||
133 | movl %edx, %ds; \ | ||
134 | movl %edx, %es; \ | ||
135 | movl $(__KERNEL_PERCPU), %edx; \ | ||
136 | movl %edx, %fs | ||
137 | |||
138 | #define RESTORE_INT_REGS \ | ||
139 | popl %ebx; \ | ||
140 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
141 | CFI_RESTORE ebx;\ | ||
142 | popl %ecx; \ | ||
143 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
144 | CFI_RESTORE ecx;\ | ||
145 | popl %edx; \ | ||
146 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
147 | CFI_RESTORE edx;\ | ||
148 | popl %esi; \ | ||
149 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
150 | CFI_RESTORE esi;\ | ||
151 | popl %edi; \ | ||
152 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
153 | CFI_RESTORE edi;\ | ||
154 | popl %ebp; \ | ||
155 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
156 | CFI_RESTORE ebp;\ | ||
157 | popl %eax; \ | ||
158 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
159 | CFI_RESTORE eax | ||
160 | |||
161 | #define RESTORE_REGS \ | ||
162 | RESTORE_INT_REGS; \ | ||
163 | 1: popl %ds; \ | ||
164 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
165 | /*CFI_RESTORE ds;*/\ | ||
166 | 2: popl %es; \ | ||
167 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
168 | /*CFI_RESTORE es;*/\ | ||
169 | 3: popl %fs; \ | ||
170 | CFI_ADJUST_CFA_OFFSET -4;\ | ||
171 | /*CFI_RESTORE fs;*/\ | ||
172 | .pushsection .fixup,"ax"; \ | ||
173 | 4: movl $0,(%esp); \ | ||
174 | jmp 1b; \ | ||
175 | 5: movl $0,(%esp); \ | ||
176 | jmp 2b; \ | ||
177 | 6: movl $0,(%esp); \ | ||
178 | jmp 3b; \ | ||
179 | .section __ex_table,"a";\ | ||
180 | .align 4; \ | ||
181 | .long 1b,4b; \ | ||
182 | .long 2b,5b; \ | ||
183 | .long 3b,6b; \ | ||
184 | .popsection | ||
185 | |||
186 | #define RING0_INT_FRAME \ | ||
187 | CFI_STARTPROC simple;\ | ||
188 | CFI_SIGNAL_FRAME;\ | ||
189 | CFI_DEF_CFA esp, 3*4;\ | ||
190 | /*CFI_OFFSET cs, -2*4;*/\ | ||
191 | CFI_OFFSET eip, -3*4 | ||
192 | |||
193 | #define RING0_EC_FRAME \ | ||
194 | CFI_STARTPROC simple;\ | ||
195 | CFI_SIGNAL_FRAME;\ | ||
196 | CFI_DEF_CFA esp, 4*4;\ | ||
197 | /*CFI_OFFSET cs, -2*4;*/\ | ||
198 | CFI_OFFSET eip, -3*4 | ||
199 | |||
200 | #define RING0_PTREGS_FRAME \ | ||
201 | CFI_STARTPROC simple;\ | ||
202 | CFI_SIGNAL_FRAME;\ | ||
203 | CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ | ||
204 | /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ | ||
205 | CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ | ||
206 | /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ | ||
207 | /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ | ||
208 | CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ | ||
209 | CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ | ||
210 | CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ | ||
211 | CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ | ||
212 | CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ | ||
213 | CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ | ||
214 | CFI_OFFSET ebx, PT_EBX-PT_OLDESP | ||
215 | |||
216 | ENTRY(ret_from_fork) | ||
217 | CFI_STARTPROC | ||
218 | pushl %eax # save %eax around the call; popped again below | ||
219 | CFI_ADJUST_CFA_OFFSET 4 | ||
220 | call schedule_tail | ||
221 | GET_THREAD_INFO(%ebp) # syscall_exit reads TI_flags(%ebp) | ||
222 | popl %eax | ||
223 | CFI_ADJUST_CFA_OFFSET -4 | ||
224 | pushl $0x0202 # Reset kernel eflags (value includes IF_MASK, 0x200) | ||
225 | CFI_ADJUST_CFA_OFFSET 4 | ||
226 | popfl | ||
227 | CFI_ADJUST_CFA_OFFSET -4 | ||
228 | jmp syscall_exit # continue on the normal system call exit path | ||
229 | CFI_ENDPROC | ||
230 | END(ret_from_fork) | ||
231 | |||
232 | /* | ||
233 | * Return to user mode is not as complex as all this looks, | ||
234 | * but we want the default path for a system call return to | ||
235 | * go as quickly as possible which is why some of this is | ||
236 | * less clear than it otherwise should be. | ||
237 | */ | ||
238 | |||
239 | # userspace resumption stub bypassing syscall exit tracing | ||
240 | ALIGN | ||
241 | RING0_PTREGS_FRAME | ||
242 | ret_from_exception: | ||
243 | preempt_stop(CLBR_ANY) | ||
244 | ret_from_intr: | ||
245 | GET_THREAD_INFO(%ebp) | ||
246 | check_userspace: | ||
247 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS | ||
248 | movb PT_CS(%esp), %al | ||
249 | andl $(VM_MASK | SEGMENT_RPL_MASK), %eax | ||
250 | cmpl $USER_RPL, %eax | ||
251 | jb resume_kernel # not returning to v8086 or userspace | ||
252 | |||
253 | ENTRY(resume_userspace) | ||
254 | DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt | ||
255 | # setting need_resched or sigpending | ||
256 | # between sampling and the iret | ||
257 | movl TI_flags(%ebp), %ecx | ||
258 | andl $_TIF_WORK_MASK, %ecx # is there any work to be done on | ||
259 | # int/exception return? | ||
260 | jne work_pending | ||
261 | jmp restore_all | ||
262 | END(ret_from_exception) | ||
263 | |||
264 | #ifdef CONFIG_PREEMPT | ||
265 | ENTRY(resume_kernel) | ||
266 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
267 | cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? | ||
268 | jnz restore_nocheck | ||
269 | need_resched: | ||
270 | movl TI_flags(%ebp), %ecx # need_resched set ? | ||
271 | testb $_TIF_NEED_RESCHED, %cl | ||
272 | jz restore_all | ||
273 | testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? | ||
274 | jz restore_all | ||
275 | call preempt_schedule_irq | ||
276 | jmp need_resched | ||
277 | END(resume_kernel) | ||
278 | #endif | ||
279 | CFI_ENDPROC | ||
280 | |||
281 | /* SYSENTER_RETURN points to after the "sysenter" instruction in | ||
282 | the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ | ||
283 | |||
284 | # sysenter call handler stub | ||
285 | ENTRY(sysenter_entry) | ||
286 | CFI_STARTPROC simple | ||
287 | CFI_SIGNAL_FRAME | ||
288 | CFI_DEF_CFA esp, 0 | ||
289 | CFI_REGISTER esp, ebp | ||
290 | movl TSS_sysenter_esp0(%esp),%esp | ||
291 | sysenter_past_esp: | ||
292 | /* | ||
293 | * No need to follow this irqs on/off section: the syscall | ||
294 | * disabled irqs and here we enable it straight after entry: | ||
295 | */ | ||
296 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
297 | pushl $(__USER_DS) | ||
298 | CFI_ADJUST_CFA_OFFSET 4 | ||
299 | /*CFI_REL_OFFSET ss, 0*/ | ||
300 | pushl %ebp | ||
301 | CFI_ADJUST_CFA_OFFSET 4 | ||
302 | CFI_REL_OFFSET esp, 0 | ||
303 | pushfl | ||
304 | CFI_ADJUST_CFA_OFFSET 4 | ||
305 | pushl $(__USER_CS) | ||
306 | CFI_ADJUST_CFA_OFFSET 4 | ||
307 | /*CFI_REL_OFFSET cs, 0*/ | ||
308 | /* | ||
309 | * Push current_thread_info()->sysenter_return to the stack. | ||
310 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words | ||
311 | * pushed above; +8 corresponds to copy_thread's esp0 setting. | ||
312 | */ | ||
313 | pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) | ||
314 | CFI_ADJUST_CFA_OFFSET 4 | ||
315 | CFI_REL_OFFSET eip, 0 | ||
316 | |||
317 | /* | ||
318 | * Load the potential sixth argument from user stack. | ||
319 | * Careful about security. | ||
320 | */ | ||
321 | cmpl $__PAGE_OFFSET-3,%ebp | ||
322 | jae syscall_fault | ||
323 | 1: movl (%ebp),%ebp | ||
324 | .section __ex_table,"a" | ||
325 | .align 4 | ||
326 | .long 1b,syscall_fault | ||
327 | .previous | ||
328 | |||
329 | pushl %eax | ||
330 | CFI_ADJUST_CFA_OFFSET 4 | ||
331 | SAVE_ALL | ||
332 | GET_THREAD_INFO(%ebp) | ||
333 | |||
334 | /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | ||
335 | testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | ||
336 | jnz syscall_trace_entry | ||
337 | cmpl $(nr_syscalls), %eax | ||
338 | jae syscall_badsys | ||
339 | call *sys_call_table(,%eax,4) | ||
340 | movl %eax,PT_EAX(%esp) | ||
341 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
342 | TRACE_IRQS_OFF | ||
343 | movl TI_flags(%ebp), %ecx | ||
344 | testw $_TIF_ALLWORK_MASK, %cx | ||
345 | jne syscall_exit_work | ||
346 | /* if something modifies registers it must also disable sysexit */ | ||
347 | movl PT_EIP(%esp), %edx | ||
348 | movl PT_OLDESP(%esp), %ecx | ||
349 | xorl %ebp,%ebp | ||
350 | TRACE_IRQS_ON | ||
351 | 1: mov PT_FS(%esp), %fs | ||
352 | ENABLE_INTERRUPTS_SYSEXIT | ||
353 | CFI_ENDPROC | ||
354 | .pushsection .fixup,"ax" | ||
355 | 2: movl $0,PT_FS(%esp) | ||
356 | jmp 1b | ||
357 | .section __ex_table,"a" | ||
358 | .align 4 | ||
359 | .long 1b,2b | ||
360 | .popsection | ||
361 | ENDPROC(sysenter_entry) | ||
362 | |||
363 | # system call handler stub | ||
364 | ENTRY(system_call) | ||
365 | RING0_INT_FRAME # can't unwind into user space anyway | ||
366 | pushl %eax # save orig_eax | ||
367 | CFI_ADJUST_CFA_OFFSET 4 | ||
368 | SAVE_ALL | ||
369 | GET_THREAD_INFO(%ebp) | ||
370 | # system call tracing in operation / emulation | ||
371 | /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ | ||
372 | testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) | ||
373 | jnz syscall_trace_entry | ||
374 | cmpl $(nr_syscalls), %eax | ||
375 | jae syscall_badsys | ||
376 | syscall_call: | ||
377 | call *sys_call_table(,%eax,4) | ||
378 | movl %eax,PT_EAX(%esp) # store the return value | ||
379 | syscall_exit: | ||
380 | DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt | ||
381 | # setting need_resched or sigpending | ||
382 | # between sampling and the iret | ||
383 | TRACE_IRQS_OFF | ||
384 | testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit | ||
385 | jz no_singlestep | ||
386 | orl $_TIF_SINGLESTEP,TI_flags(%ebp) | ||
387 | no_singlestep: | ||
388 | movl TI_flags(%ebp), %ecx | ||
389 | testw $_TIF_ALLWORK_MASK, %cx # current->work | ||
390 | jne syscall_exit_work | ||
391 | |||
392 | restore_all: | ||
393 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS | ||
394 | # Warning: PT_OLDSS(%esp) contains the wrong/random values if we | ||
395 | # are returning to the kernel. | ||
396 | # See comments in process.c:copy_thread() for details. | ||
397 | movb PT_OLDSS(%esp), %ah | ||
398 | movb PT_CS(%esp), %al | ||
399 | andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax | ||
400 | cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax | ||
401 | CFI_REMEMBER_STATE | ||
402 | je ldt_ss # returning to user-space with LDT SS | ||
403 | restore_nocheck: | ||
404 | TRACE_IRQS_IRET | ||
405 | restore_nocheck_notrace: | ||
406 | RESTORE_REGS | ||
407 | addl $4, %esp # skip orig_eax/error_code | ||
408 | CFI_ADJUST_CFA_OFFSET -4 | ||
409 | 1: INTERRUPT_RETURN | ||
410 | .section .fixup,"ax" | ||
411 | iret_exc: | ||
412 | pushl $0 # no error code | ||
413 | pushl $do_iret_error | ||
414 | jmp error_code | ||
415 | .previous | ||
416 | .section __ex_table,"a" | ||
417 | .align 4 | ||
418 | .long 1b,iret_exc | ||
419 | .previous | ||
420 | |||
421 | CFI_RESTORE_STATE | ||
422 | ldt_ss: | ||
423 | larl PT_OLDSS(%esp), %eax | ||
424 | jnz restore_nocheck | ||
425 | testl $0x00400000, %eax # returning to 32bit stack? | ||
426 | jnz restore_nocheck # all right, normal return | ||
427 | |||
428 | #ifdef CONFIG_PARAVIRT | ||
429 | /* | ||
430 | * The kernel can't run on a non-flat stack if paravirt mode | ||
431 | * is active. Rather than try to fixup the high bits of | ||
432 | * ESP, bypass this code entirely. This may break DOSemu | ||
433 | * and/or Wine support in a paravirt VM, although the option | ||
434 | * is still available to implement the setting of the high | ||
435 | * 16-bits in the INTERRUPT_RETURN paravirt-op. | ||
436 | */ | ||
437 | cmpl $0, paravirt_ops+PARAVIRT_enabled | ||
438 | jne restore_nocheck | ||
439 | #endif | ||
440 | |||
441 | /* If returning to userspace with 16bit stack, | ||
442 | * try to fix the higher word of ESP, as the CPU | ||
443 | * won't restore it. | ||
444 | * This is an "official" bug of all the x86-compatible | ||
445 | * CPUs, which we can try to work around to make | ||
446 | * dosemu and wine happy. */ | ||
447 | movl PT_OLDESP(%esp), %eax | ||
448 | movl %esp, %edx | ||
449 | call patch_espfix_desc | ||
450 | pushl $__ESPFIX_SS | ||
451 | CFI_ADJUST_CFA_OFFSET 4 | ||
452 | pushl %eax | ||
453 | CFI_ADJUST_CFA_OFFSET 4 | ||
454 | DISABLE_INTERRUPTS(CLBR_EAX) | ||
455 | TRACE_IRQS_OFF | ||
456 | lss (%esp), %esp | ||
457 | CFI_ADJUST_CFA_OFFSET -8 | ||
458 | jmp restore_nocheck | ||
459 | CFI_ENDPROC | ||
460 | ENDPROC(system_call) | ||
461 | |||
462 | # perform work that needs to be done immediately before resumption | ||
463 | ALIGN | ||
464 | RING0_PTREGS_FRAME # can't unwind into user space anyway | ||
465 | work_pending: | ||
466 | testb $_TIF_NEED_RESCHED, %cl | ||
467 | jz work_notifysig | ||
468 | work_resched: | ||
469 | call schedule | ||
470 | DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt | ||
471 | # setting need_resched or sigpending | ||
472 | # between sampling and the iret | ||
473 | TRACE_IRQS_OFF | ||
474 | movl TI_flags(%ebp), %ecx | ||
475 | andl $_TIF_WORK_MASK, %ecx # is there any work to be done other | ||
476 | # than syscall tracing? | ||
477 | jz restore_all | ||
478 | testb $_TIF_NEED_RESCHED, %cl | ||
479 | jnz work_resched | ||
480 | |||
481 | work_notifysig: # deal with pending signals and | ||
482 | # notify-resume requests | ||
483 | #ifdef CONFIG_VM86 | ||
484 | testl $VM_MASK, PT_EFLAGS(%esp) | ||
485 | movl %esp, %eax | ||
486 | jne work_notifysig_v86 # returning to kernel-space or | ||
487 | # vm86-space | ||
488 | xorl %edx, %edx | ||
489 | call do_notify_resume | ||
490 | jmp resume_userspace_sig | ||
491 | |||
492 | ALIGN | ||
493 | work_notifysig_v86: | ||
494 | pushl %ecx # save ti_flags for do_notify_resume | ||
495 | CFI_ADJUST_CFA_OFFSET 4 | ||
496 | call save_v86_state # %eax contains pt_regs pointer | ||
497 | popl %ecx | ||
498 | CFI_ADJUST_CFA_OFFSET -4 | ||
499 | movl %eax, %esp | ||
500 | #else | ||
501 | movl %esp, %eax | ||
502 | #endif | ||
503 | xorl %edx, %edx | ||
504 | call do_notify_resume | ||
505 | jmp resume_userspace_sig | ||
506 | END(work_pending) | ||
507 | |||
508 | # perform syscall entry tracing | ||
509 | ALIGN | ||
510 | syscall_trace_entry: | ||
511 | movl $-ENOSYS,PT_EAX(%esp) # return value seen if the syscall is not run | ||
512 | movl %esp, %eax # pt_regs pointer | ||
513 | xorl %edx,%edx # second argument 0 (the exit path passes 1) | ||
514 | call do_syscall_trace | ||
515 | cmpl $0, %eax | ||
516 | jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, | ||
517 | # so must skip actual syscall | ||
518 | movl PT_ORIG_EAX(%esp), %eax # reload the syscall number saved as orig_eax | ||
519 | cmpl $(nr_syscalls), %eax | ||
520 | jnae syscall_call # number in range: perform the call | ||
521 | jmp syscall_exit # out of range: -ENOSYS stays in PT_EAX | ||
522 | END(syscall_trace_entry) | ||
523 | |||
524 | # perform syscall exit tracing | ||
525 | ALIGN | ||
526 | syscall_exit_work: | ||
527 | testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl # %ecx = TI_flags, loaded before jumping here | ||
528 | jz work_pending # none of these flags set: go to work_pending | ||
529 | TRACE_IRQS_ON | ||
530 | ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call | ||
531 | # schedule() instead | ||
532 | movl %esp, %eax # pt_regs pointer | ||
533 | movl $1, %edx # second argument 1 (the entry path passes 0) | ||
534 | call do_syscall_trace | ||
535 | jmp resume_userspace | ||
536 | END(syscall_exit_work) | ||
537 | CFI_ENDPROC | ||
538 | |||
539 | RING0_INT_FRAME # can't unwind into user space anyway | ||
540 | syscall_fault: | ||
541 | pushl %eax # save orig_eax | ||
542 | CFI_ADJUST_CFA_OFFSET 4 | ||
543 | SAVE_ALL | ||
544 | GET_THREAD_INFO(%ebp) | ||
545 | movl $-EFAULT,PT_EAX(%esp) | ||
546 | jmp resume_userspace | ||
547 | END(syscall_fault) | ||
548 | |||
549 | syscall_badsys: | ||
550 | movl $-ENOSYS,PT_EAX(%esp) | ||
551 | jmp resume_userspace | ||
552 | END(syscall_badsys) | ||
553 | CFI_ENDPROC | ||
554 | |||
555 | #define FIXUP_ESPFIX_STACK \ | ||
556 | /* since we are on a wrong stack, we cant make it a C code :( */ \ | ||
557 | PER_CPU(gdt_page, %ebx); \ | ||
558 | GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ | ||
559 | addl %esp, %eax; \ | ||
560 | pushl $__KERNEL_DS; \ | ||
561 | CFI_ADJUST_CFA_OFFSET 4; \ | ||
562 | pushl %eax; \ | ||
563 | CFI_ADJUST_CFA_OFFSET 4; \ | ||
564 | lss (%esp), %esp; \ | ||
565 | CFI_ADJUST_CFA_OFFSET -8; | ||
566 | #define UNWIND_ESPFIX_STACK \ | ||
567 | movl %ss, %eax; \ | ||
568 | /* see if on espfix stack */ \ | ||
569 | cmpw $__ESPFIX_SS, %ax; \ | ||
570 | jne 27f; \ | ||
571 | movl $__KERNEL_DS, %eax; \ | ||
572 | movl %eax, %ds; \ | ||
573 | movl %eax, %es; \ | ||
574 | /* switch to normal stack */ \ | ||
575 | FIXUP_ESPFIX_STACK; \ | ||
576 | 27:; | ||
577 | |||
578 | /* | ||
579 | * Build the entry stubs and pointer table with | ||
580 | * some assembler magic. | ||
581 | */ | ||
582 | .data | ||
583 | ENTRY(interrupt) | ||
584 | .text | ||
585 | |||
586 | ENTRY(irq_entries_start) | ||
587 | RING0_INT_FRAME | ||
588 | vector=0 | ||
589 | .rept NR_IRQS | ||
590 | ALIGN | ||
591 | .if vector | ||
592 | CFI_ADJUST_CFA_OFFSET -4 | ||
593 | .endif | ||
594 | 1: pushl $~(vector) | ||
595 | CFI_ADJUST_CFA_OFFSET 4 | ||
596 | jmp common_interrupt | ||
597 | .previous | ||
598 | .long 1b | ||
599 | .text | ||
600 | vector=vector+1 | ||
601 | .endr | ||
602 | END(irq_entries_start) | ||
603 | |||
604 | .previous | ||
605 | END(interrupt) | ||
606 | .previous | ||
607 | |||
608 | /* | ||
609 | * the CPU automatically disables interrupts when executing an IRQ vector, | ||
610 | * so IRQ-flags tracing has to follow that: | ||
611 | */ | ||
612 | ALIGN | ||
613 | common_interrupt: | ||
614 | SAVE_ALL | ||
615 | TRACE_IRQS_OFF | ||
616 | movl %esp,%eax | ||
617 | call do_IRQ | ||
618 | jmp ret_from_intr | ||
619 | ENDPROC(common_interrupt) | ||
620 | CFI_ENDPROC | ||
621 | |||
622 | #define BUILD_INTERRUPT(name, nr) \ | ||
623 | ENTRY(name) \ | ||
624 | RING0_INT_FRAME; \ | ||
625 | pushl $~(nr); \ | ||
626 | CFI_ADJUST_CFA_OFFSET 4; \ | ||
627 | SAVE_ALL; \ | ||
628 | TRACE_IRQS_OFF \ | ||
629 | movl %esp,%eax; \ | ||
630 | call smp_##name; \ | ||
631 | jmp ret_from_intr; \ | ||
632 | CFI_ENDPROC; \ | ||
633 | ENDPROC(name) | ||
634 | |||
635 | /* The include is where all of the SMP etc. interrupts come from */ | ||
636 | #include "entry_arch.h" | ||
637 | |||
638 | KPROBE_ENTRY(page_fault) | ||
639 | RING0_EC_FRAME | ||
640 | pushl $do_page_fault | ||
641 | CFI_ADJUST_CFA_OFFSET 4 | ||
642 | ALIGN | ||
643 | error_code: | ||
644 | /* the function address is in %fs's slot on the stack */ | ||
645 | pushl %es | ||
646 | CFI_ADJUST_CFA_OFFSET 4 | ||
647 | /*CFI_REL_OFFSET es, 0*/ | ||
648 | pushl %ds | ||
649 | CFI_ADJUST_CFA_OFFSET 4 | ||
650 | /*CFI_REL_OFFSET ds, 0*/ | ||
651 | pushl %eax | ||
652 | CFI_ADJUST_CFA_OFFSET 4 | ||
653 | CFI_REL_OFFSET eax, 0 | ||
654 | pushl %ebp | ||
655 | CFI_ADJUST_CFA_OFFSET 4 | ||
656 | CFI_REL_OFFSET ebp, 0 | ||
657 | pushl %edi | ||
658 | CFI_ADJUST_CFA_OFFSET 4 | ||
659 | CFI_REL_OFFSET edi, 0 | ||
660 | pushl %esi | ||
661 | CFI_ADJUST_CFA_OFFSET 4 | ||
662 | CFI_REL_OFFSET esi, 0 | ||
663 | pushl %edx | ||
664 | CFI_ADJUST_CFA_OFFSET 4 | ||
665 | CFI_REL_OFFSET edx, 0 | ||
666 | pushl %ecx | ||
667 | CFI_ADJUST_CFA_OFFSET 4 | ||
668 | CFI_REL_OFFSET ecx, 0 | ||
669 | pushl %ebx | ||
670 | CFI_ADJUST_CFA_OFFSET 4 | ||
671 | CFI_REL_OFFSET ebx, 0 | ||
672 | cld | ||
673 | pushl %fs | ||
674 | CFI_ADJUST_CFA_OFFSET 4 | ||
675 | /*CFI_REL_OFFSET fs, 0*/ | ||
676 | movl $(__KERNEL_PERCPU), %ecx | ||
677 | movl %ecx, %fs | ||
678 | UNWIND_ESPFIX_STACK | ||
679 | popl %ecx | ||
680 | CFI_ADJUST_CFA_OFFSET -4 | ||
681 | /*CFI_REGISTER es, ecx*/ | ||
682 | movl PT_FS(%esp), %edi # get the function address | ||
683 | movl PT_ORIG_EAX(%esp), %edx # get the error code | ||
684 | movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart | ||
685 | mov %ecx, PT_FS(%esp) | ||
686 | /*CFI_REL_OFFSET fs, ES*/ | ||
687 | movl $(__USER_DS), %ecx | ||
688 | movl %ecx, %ds | ||
689 | movl %ecx, %es | ||
690 | movl %esp,%eax # pt_regs pointer | ||
691 | call *%edi | ||
692 | jmp ret_from_exception | ||
693 | CFI_ENDPROC | ||
694 | KPROBE_END(page_fault) | ||
695 | |||
696 | ENTRY(coprocessor_error) | ||
697 | RING0_INT_FRAME | ||
698 | pushl $0 # no error code | ||
699 | CFI_ADJUST_CFA_OFFSET 4 | ||
700 | pushl $do_coprocessor_error # handler address; error_code reads it from the %fs slot | ||
701 | CFI_ADJUST_CFA_OFFSET 4 | ||
702 | jmp error_code | ||
703 | CFI_ENDPROC | ||
704 | END(coprocessor_error) | ||
705 | |||
706 | ENTRY(simd_coprocessor_error) | ||
707 | RING0_INT_FRAME | ||
708 | pushl $0 # no error code | ||
709 | CFI_ADJUST_CFA_OFFSET 4 | ||
710 | pushl $do_simd_coprocessor_error # handler address; error_code reads it from the %fs slot | ||
711 | CFI_ADJUST_CFA_OFFSET 4 | ||
712 | jmp error_code | ||
713 | CFI_ENDPROC | ||
714 | END(simd_coprocessor_error) | ||
715 | |||
716 | ENTRY(device_not_available) | ||
717 | RING0_INT_FRAME | ||
718 | pushl $-1 # mark this as an int | ||
719 | CFI_ADJUST_CFA_OFFSET 4 | ||
720 | SAVE_ALL | ||
721 | GET_CR0_INTO_EAX | ||
722 | testl $0x4, %eax # EM (math emulation bit) | ||
723 | jne device_not_available_emulate | ||
724 | preempt_stop(CLBR_ANY) | ||
725 | call math_state_restore | ||
726 | jmp ret_from_exception | ||
727 | device_not_available_emulate: | ||
728 | pushl $0 # temporary storage for ORIG_EIP | ||
729 | CFI_ADJUST_CFA_OFFSET 4 | ||
730 | call math_emulate | ||
731 | addl $4, %esp | ||
732 | CFI_ADJUST_CFA_OFFSET -4 | ||
733 | jmp ret_from_exception | ||
734 | CFI_ENDPROC | ||
735 | END(device_not_available) | ||
736 | |||
737 | /* | ||
738 | * Debug traps and NMI can happen at the one SYSENTER instruction | ||
739 | * that sets up the real kernel stack. Check here, since we can't | ||
740 | * allow the wrong stack to be used. | ||
741 | * | ||
742 | * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have | ||
743 | * already pushed 3 words if it hits on the sysenter instruction: | ||
744 | * eflags, cs and eip. | ||
745 | * | ||
746 | * We just load the right stack, and push the three (known) values | ||
747 | * by hand onto the new stack - while updating the return eip past | ||
748 | * the instruction that would have done it for sysenter. | ||
749 | */ | ||
750 | #define FIX_STACK(offset, ok, label) \ | ||
751 | cmpw $__KERNEL_CS,4(%esp); \ | ||
752 | jne ok; \ | ||
753 | label: \ | ||
754 | movl TSS_sysenter_esp0+offset(%esp),%esp; \ | ||
755 | CFI_DEF_CFA esp, 0; \ | ||
756 | CFI_UNDEFINED eip; \ | ||
757 | pushfl; \ | ||
758 | CFI_ADJUST_CFA_OFFSET 4; \ | ||
759 | pushl $__KERNEL_CS; \ | ||
760 | CFI_ADJUST_CFA_OFFSET 4; \ | ||
761 | pushl $sysenter_past_esp; \ | ||
762 | CFI_ADJUST_CFA_OFFSET 4; \ | ||
763 | CFI_REL_OFFSET eip, 0 | ||
764 | |||
765 | KPROBE_ENTRY(debug) | ||
766 | RING0_INT_FRAME | ||
767 | cmpl $sysenter_entry,(%esp) # did the trap hit the first sysenter_entry instruction? | ||
768 | jne debug_stack_correct # no: skip the stack fix | ||
769 | FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) | ||
770 | debug_stack_correct: | ||
771 | pushl $-1 # mark this as an int | ||
772 | CFI_ADJUST_CFA_OFFSET 4 | ||
773 | SAVE_ALL | ||
774 | xorl %edx,%edx # error code 0 | ||
775 | movl %esp,%eax # pt_regs pointer | ||
776 | call do_debug | ||
777 | jmp ret_from_exception | ||
778 | CFI_ENDPROC | ||
779 | KPROBE_END(debug) | ||
780 | |||
781 | /* | ||
782 | * NMI is doubly nasty. It can happen _while_ we're handling | ||
783 | * a debug fault, and the debug fault hasn't yet been able to | ||
784 | * clear up the stack. So we first check whether we got an | ||
785 | * NMI on the sysenter entry path, but after that we need to | ||
786 | * check whether we got an NMI on the debug path where the debug | ||
787 | * fault happened on the sysenter path. | ||
788 | */ | ||
789 | KPROBE_ENTRY(nmi) /* NMI entry: choose the stack fixup that applies, then call do_nmi */ | ||
790 | RING0_INT_FRAME | ||
791 | pushl %eax | ||
792 | CFI_ADJUST_CFA_OFFSET 4 | ||
793 | movl %ss, %eax | ||
794 | cmpw $__ESPFIX_SS, %ax /* are we running on the espfix stack segment? */ | ||
795 | popl %eax /* popl does not alter flags, so the cmpw result reaches the je */ | ||
796 | CFI_ADJUST_CFA_OFFSET -4 | ||
797 | je nmi_espfix_stack | ||
798 | cmpl $sysenter_entry,(%esp) /* is the interrupted EIP the sysenter entry point? */ | ||
799 | je nmi_stack_fixup | ||
800 | pushl %eax | ||
801 | CFI_ADJUST_CFA_OFFSET 4 | ||
802 | movl %esp,%eax | ||
803 | /* Do not access memory above the end of our stack page, | ||
804 | * it might not exist. | ||
805 | */ | ||
806 | andl $(THREAD_SIZE-1),%eax /* offset of %esp inside the THREAD_SIZE-sized stack area */ | ||
807 | cmpl $(THREAD_SIZE-20),%eax | ||
808 | popl %eax | ||
809 | CFI_ADJUST_CFA_OFFSET -4 | ||
810 | jae nmi_stack_correct /* too close to the top: skip the 12(%esp) probe below */ | ||
811 | cmpl $sysenter_entry,12(%esp) /* is there a second frame whose EIP is the sysenter entry point? */ | ||
812 | je nmi_debug_stack_check | ||
813 | nmi_stack_correct: | ||
814 | /* We have a RING0_INT_FRAME here */ | ||
815 | pushl %eax /* occupies the slot where debug/int3 push -1 */ | ||
816 | CFI_ADJUST_CFA_OFFSET 4 | ||
817 | SAVE_ALL | ||
818 | xorl %edx,%edx # zero error code | ||
819 | movl %esp,%eax # pt_regs pointer | ||
820 | call do_nmi | ||
821 | jmp restore_nocheck_notrace | ||
822 | CFI_ENDPROC | ||
823 | |||
824 | nmi_stack_fixup: | ||
825 | RING0_INT_FRAME | ||
826 | FIX_STACK(12,nmi_stack_correct, 1) /* NMI hit the sysenter entry point itself */ | ||
827 | jmp nmi_stack_correct | ||
828 | |||
829 | nmi_debug_stack_check: | ||
830 | /* We have a RING0_INT_FRAME here */ | ||
831 | cmpw $__KERNEL_CS,16(%esp) | ||
832 | jne nmi_stack_correct | ||
833 | cmpl $debug,(%esp) /* only fix up if the interrupted EIP lies in debug..debug_esp_fix_insn */ | ||
834 | jb nmi_stack_correct | ||
835 | cmpl $debug_esp_fix_insn,(%esp) | ||
836 | ja nmi_stack_correct | ||
837 | FIX_STACK(24,nmi_stack_correct, 1) /* offset 24 instead of 12: two frames are stacked here */ | ||
838 | jmp nmi_stack_correct | ||
839 | |||
840 | nmi_espfix_stack: | ||
841 | /* We have a RING0_INT_FRAME here. | ||
842 | * | ||
843 | * create the pointer to lss back | ||
844 | */ | ||
845 | pushl %ss | ||
846 | CFI_ADJUST_CFA_OFFSET 4 | ||
847 | pushl %esp | ||
848 | CFI_ADJUST_CFA_OFFSET 4 | ||
849 | addw $4, (%esp) /* adjust the saved %esp so the ss:esp pair is what lss reloads below */ | ||
850 | /* copy the iret frame of 12 bytes */ | ||
851 | .rept 3 | ||
852 | pushl 16(%esp) | ||
853 | CFI_ADJUST_CFA_OFFSET 4 | ||
854 | .endr | ||
855 | pushl %eax | ||
856 | CFI_ADJUST_CFA_OFFSET 4 | ||
857 | SAVE_ALL | ||
858 | FIXUP_ESPFIX_STACK # %eax == %esp | ||
859 | xorl %edx,%edx # zero error code | ||
860 | call do_nmi | ||
861 | RESTORE_REGS | ||
862 | lss 12+4(%esp), %esp # back to espfix stack | ||
863 | CFI_ADJUST_CFA_OFFSET -24 | ||
864 | 1: INTERRUPT_RETURN | ||
865 | CFI_ENDPROC | ||
866 | .section __ex_table,"a" | ||
867 | .align 4 | ||
868 | .long 1b,iret_exc /* exception-table pair: instruction at 1: -> iret_exc */ | ||
869 | .previous | ||
870 | KPROBE_END(nmi) | ||
871 | |||
872 | #ifdef CONFIG_PARAVIRT | ||
873 | ENTRY(native_iret) /* bare iret, built only with CONFIG_PARAVIRT, with an exception-table entry */ | ||
874 | 1: iret | ||
875 | .section __ex_table,"a" | ||
876 | .align 4 | ||
877 | .long 1b,iret_exc /* exception-table pair: the iret at 1: -> iret_exc */ | ||
878 | .previous | ||
879 | END(native_iret) | ||
880 | |||
881 | ENTRY(native_irq_enable_sysexit) /* enable interrupts, then leave via sysexit */ | ||
882 | sti | ||
883 | sysexit | ||
884 | END(native_irq_enable_sysexit) | ||
885 | #endif | ||
886 | |||
887 | KPROBE_ENTRY(int3) /* breakpoint entry: build a register frame and call do_int3 */ | ||
888 | RING0_INT_FRAME | ||
889 | pushl $-1 # mark this as an int | ||
890 | CFI_ADJUST_CFA_OFFSET 4 | ||
891 | SAVE_ALL | ||
892 | xorl %edx,%edx # zero error code | ||
893 | movl %esp,%eax # pt_regs pointer | ||
894 | call do_int3 | ||
895 | jmp ret_from_exception /* same exit path as the debug handler */ | ||
896 | CFI_ENDPROC | ||
897 | KPROBE_END(int3) | ||
898 | |||
899 | ENTRY(overflow) /* stub: push a zero error code and do_overflow, then share error_code */ | ||
900 | RING0_INT_FRAME | ||
901 | pushl $0 /* placeholder error code */ | ||
902 | CFI_ADJUST_CFA_OFFSET 4 | ||
903 | pushl $do_overflow /* handler address picked up by error_code */ | ||
904 | CFI_ADJUST_CFA_OFFSET 4 | ||
905 | jmp error_code | ||
906 | CFI_ENDPROC | ||
907 | END(overflow) | ||
908 | |||
909 | ENTRY(bounds) /* stub: push a zero error code and do_bounds, then share error_code */ | ||
910 | RING0_INT_FRAME | ||
911 | pushl $0 /* placeholder error code */ | ||
912 | CFI_ADJUST_CFA_OFFSET 4 | ||
913 | pushl $do_bounds /* handler address picked up by error_code */ | ||
914 | CFI_ADJUST_CFA_OFFSET 4 | ||
915 | jmp error_code | ||
916 | CFI_ENDPROC | ||
917 | END(bounds) | ||
918 | |||
919 | ENTRY(invalid_op) /* stub: push a zero error code and do_invalid_op, then share error_code */ | ||
920 | RING0_INT_FRAME | ||
921 | pushl $0 /* placeholder error code */ | ||
922 | CFI_ADJUST_CFA_OFFSET 4 | ||
923 | pushl $do_invalid_op /* handler address picked up by error_code */ | ||
924 | CFI_ADJUST_CFA_OFFSET 4 | ||
925 | jmp error_code | ||
926 | CFI_ENDPROC | ||
927 | END(invalid_op) | ||
928 | |||
929 | ENTRY(coprocessor_segment_overrun) /* stub: push a zero error code and the C handler, then share error_code */ | ||
930 | RING0_INT_FRAME | ||
931 | pushl $0 /* placeholder error code */ | ||
932 | CFI_ADJUST_CFA_OFFSET 4 | ||
933 | pushl $do_coprocessor_segment_overrun /* handler address picked up by error_code */ | ||
934 | CFI_ADJUST_CFA_OFFSET 4 | ||
935 | jmp error_code | ||
936 | CFI_ENDPROC | ||
937 | END(coprocessor_segment_overrun) | ||
938 | |||
939 | ENTRY(invalid_TSS) /* stub: push do_invalid_TSS and share error_code */ | ||
940 | RING0_EC_FRAME /* error-code frame: no placeholder error code is pushed here */ | ||
941 | pushl $do_invalid_TSS /* handler address picked up by error_code */ | ||
942 | CFI_ADJUST_CFA_OFFSET 4 | ||
943 | jmp error_code | ||
944 | CFI_ENDPROC | ||
945 | END(invalid_TSS) | ||
946 | |||
947 | ENTRY(segment_not_present) /* stub: push do_segment_not_present and share error_code */ | ||
948 | RING0_EC_FRAME /* error-code frame: no placeholder error code is pushed here */ | ||
949 | pushl $do_segment_not_present /* handler address picked up by error_code */ | ||
950 | CFI_ADJUST_CFA_OFFSET 4 | ||
951 | jmp error_code | ||
952 | CFI_ENDPROC | ||
953 | END(segment_not_present) | ||
954 | |||
955 | ENTRY(stack_segment) /* stub: push do_stack_segment and share error_code */ | ||
956 | RING0_EC_FRAME /* error-code frame: no placeholder error code is pushed here */ | ||
957 | pushl $do_stack_segment /* handler address picked up by error_code */ | ||
958 | CFI_ADJUST_CFA_OFFSET 4 | ||
959 | jmp error_code | ||
960 | CFI_ENDPROC | ||
961 | END(stack_segment) | ||
962 | |||
963 | KPROBE_ENTRY(general_protection) /* stub: push do_general_protection and share error_code */ | ||
964 | RING0_EC_FRAME /* error-code frame: no placeholder error code is pushed here */ | ||
965 | pushl $do_general_protection /* handler address picked up by error_code */ | ||
966 | CFI_ADJUST_CFA_OFFSET 4 | ||
967 | jmp error_code | ||
968 | CFI_ENDPROC | ||
969 | KPROBE_END(general_protection) | ||
970 | |||
971 | ENTRY(alignment_check) /* stub: push do_alignment_check and share error_code */ | ||
972 | RING0_EC_FRAME /* error-code frame: no placeholder error code is pushed here */ | ||
973 | pushl $do_alignment_check /* handler address picked up by error_code */ | ||
974 | CFI_ADJUST_CFA_OFFSET 4 | ||
975 | jmp error_code | ||
976 | CFI_ENDPROC | ||
977 | END(alignment_check) | ||
978 | |||
979 | ENTRY(divide_error) /* stub: push a zero error code and do_divide_error, then share error_code */ | ||
980 | RING0_INT_FRAME | ||
981 | pushl $0 # no error code | ||
982 | CFI_ADJUST_CFA_OFFSET 4 | ||
983 | pushl $do_divide_error /* handler address picked up by error_code */ | ||
984 | CFI_ADJUST_CFA_OFFSET 4 | ||
985 | jmp error_code | ||
986 | CFI_ENDPROC | ||
987 | END(divide_error) | ||
988 | |||
989 | #ifdef CONFIG_X86_MCE | ||
990 | ENTRY(machine_check) /* stub: push a zero error code and the current machine-check handler, then share error_code */ | ||
991 | RING0_INT_FRAME | ||
992 | pushl $0 /* placeholder error code */ | ||
993 | CFI_ADJUST_CFA_OFFSET 4 | ||
994 | pushl machine_check_vector /* no '$': pushes the value stored in the variable, read at run time */ | ||
995 | CFI_ADJUST_CFA_OFFSET 4 | ||
996 | jmp error_code | ||
997 | CFI_ENDPROC | ||
998 | END(machine_check) | ||
999 | #endif | ||
1000 | |||
1001 | ENTRY(spurious_interrupt_bug) /* stub: push a zero error code and the C handler, then share error_code */ | ||
1002 | RING0_INT_FRAME | ||
1003 | pushl $0 /* placeholder error code */ | ||
1004 | CFI_ADJUST_CFA_OFFSET 4 | ||
1005 | pushl $do_spurious_interrupt_bug /* handler address picked up by error_code */ | ||
1006 | CFI_ADJUST_CFA_OFFSET 4 | ||
1007 | jmp error_code | ||
1008 | CFI_ENDPROC | ||
1009 | END(spurious_interrupt_bug) | ||
1010 | |||
1011 | ENTRY(kernel_thread_helper) /* call the function in %ebx with the argument from %edx, then hand its result to do_exit */ | ||
1012 | pushl $0 # fake return address for unwinder | ||
1013 | CFI_STARTPROC | ||
1014 | movl %edx,%eax /* argument is supplied both in %eax and on the stack */ | ||
1015 | push %edx | ||
1016 | CFI_ADJUST_CFA_OFFSET 4 | ||
1017 | call *%ebx /* indirect call through the pointer held in %ebx */ | ||
1018 | push %eax /* return value of the called function */ | ||
1019 | CFI_ADJUST_CFA_OFFSET 4 | ||
1020 | call do_exit /* nothing follows this call in the helper */ | ||
1021 | CFI_ENDPROC | ||
1022 | ENDPROC(kernel_thread_helper) | ||
1023 | |||
1024 | #ifdef CONFIG_XEN | ||
1025 | ENTRY(xen_hypervisor_callback) /* Xen event upcall: build a register frame and call xen_evtchn_do_upcall */ | ||
1026 | CFI_STARTPROC | ||
1027 | pushl $0 | ||
1028 | CFI_ADJUST_CFA_OFFSET 4 | ||
1029 | SAVE_ALL | ||
1030 | TRACE_IRQS_OFF | ||
1031 | |||
1032 | /* Check to see if we got the event in the critical | ||
1033 | region in xen_iret_direct, after we've reenabled | ||
1034 | events and checked for pending events. This simulates | ||
1035 | iret instruction's behaviour where it delivers a | ||
1036 | pending interrupt when enabling interrupts. */ | ||
1037 | movl PT_EIP(%esp),%eax /* saved EIP of the interrupted code */ | ||
1038 | cmpl $xen_iret_start_crit,%eax | ||
1039 | jb 1f /* below the critical region: no fixup */ | ||
1040 | cmpl $xen_iret_end_crit,%eax | ||
1041 | jae 1f /* at or past its end: no fixup */ | ||
1042 | |||
1043 | call xen_iret_crit_fixup /* EIP in [xen_iret_start_crit, xen_iret_end_crit) */ | ||
1044 | |||
1045 | 1: mov %esp, %eax /* frame pointer passed in %eax */ | ||
1046 | call xen_evtchn_do_upcall | ||
1047 | jmp ret_from_intr | ||
1048 | CFI_ENDPROC | ||
1049 | ENDPROC(xen_hypervisor_callback) | ||
1050 | |||
1051 | # Hypervisor uses this for application faults while it executes. | ||
1052 | # We get here for two reasons: | ||
1053 | # 1. Fault while reloading DS, ES, FS or GS | ||
1054 | # 2. Fault while executing IRET | ||
1055 | # Category 1 we fix up by reattempting the load, and zeroing the segment | ||
1056 | # register if the load fails. | ||
1057 | # Category 2 we fix up by jumping to do_iret_error. We cannot use the | ||
1058 | # normal Linux return path in this case because if we use the IRET hypercall | ||
1059 | # to pop the stack frame we end up in an infinite loop of failsafe callbacks. | ||
1060 | # We distinguish between categories by maintaining a status value in EAX. | ||
1061 | ENTRY(xen_failsafe_callback) /* reload the four saved data segments, then take the bad-segment or bad-IRET path */ | ||
1062 | CFI_STARTPROC | ||
1063 | pushl %eax | ||
1064 | CFI_ADJUST_CFA_OFFSET 4 | ||
1065 | movl $1,%eax /* status stays 1 unless a fixup below zeroes it */ | ||
1066 | 1: mov 4(%esp),%ds | ||
1067 | 2: mov 8(%esp),%es | ||
1068 | 3: mov 12(%esp),%fs | ||
1069 | 4: mov 16(%esp),%gs | ||
1070 | testl %eax,%eax /* ZF set iff a segment load had to be fixed up */ | ||
1071 | popl %eax /* popl and lea leave the flags alone, so ZF survives to the jz */ | ||
1072 | CFI_ADJUST_CFA_OFFSET -4 | ||
1073 | lea 16(%esp),%esp /* drop the four saved selector slots */ | ||
1074 | CFI_ADJUST_CFA_OFFSET -16 | ||
1075 | jz 5f | ||
1076 | addl $16,%esp | ||
1077 | jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) | ||
1078 | 5: pushl $0 # EAX == 0 => Category 1 (Bad segment) | ||
1079 | CFI_ADJUST_CFA_OFFSET 4 | ||
1080 | SAVE_ALL | ||
1081 | jmp ret_from_exception | ||
1082 | CFI_ENDPROC | ||
1083 | |||
1084 | .section .fixup,"ax" | ||
1085 | 6: xorl %eax,%eax /* fixup for 1: clear the status and the saved %ds slot, then retry */ | ||
1086 | movl %eax,4(%esp) | ||
1087 | jmp 1b | ||
1088 | 7: xorl %eax,%eax /* fixup for 2: same, for the %es slot */ | ||
1089 | movl %eax,8(%esp) | ||
1090 | jmp 2b | ||
1091 | 8: xorl %eax,%eax /* fixup for 3: same, for the %fs slot */ | ||
1092 | movl %eax,12(%esp) | ||
1093 | jmp 3b | ||
1094 | 9: xorl %eax,%eax /* fixup for 4: same, for the %gs slot */ | ||
1095 | movl %eax,16(%esp) | ||
1096 | jmp 4b | ||
1097 | .previous | ||
1098 | .section __ex_table,"a" | ||
1099 | .align 4 | ||
1100 | .long 1b,6b /* each pair: faulting load -> its fixup label */ | ||
1101 | .long 2b,7b | ||
1102 | .long 3b,8b | ||
1103 | .long 4b,9b | ||
1104 | .previous | ||
1105 | ENDPROC(xen_failsafe_callback) | ||
1106 | |||
1107 | #endif /* CONFIG_XEN */ | ||
1108 | |||
1109 | .section .rodata,"a" | ||
1110 | #include "syscall_table_32.S" | ||
1111 | |||
1112 | syscall_table_size=(.-sys_call_table) | ||
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c new file mode 100644 index 000000000000..41e8aec4c61d --- /dev/null +++ b/arch/x86/kernel/geode_32.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * AMD Geode southbridge support code | ||
3 | * Copyright (C) 2006, Advanced Micro Devices, Inc. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of version 2 of the GNU General Public License | ||
7 | * as published by the Free Software Foundation. | ||
8 | */ | ||
9 | |||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/ioport.h> | ||
13 | #include <linux/io.h> | ||
14 | #include <asm/msr.h> | ||
15 | #include <asm/geode.h> | ||
16 | |||
17 | static struct { | ||
18 | char *name; | ||
19 | u32 msr; | ||
20 | int size; | ||
21 | u32 base; | ||
22 | } lbars[] = { | ||
23 | { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, | ||
24 | { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, | ||
25 | { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, | ||
26 | { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } | ||
27 | }; | ||
28 | |||
29 | static void __init init_lbars(void) | ||
30 | { | ||
31 | u32 lo, hi; | ||
32 | int i; | ||
33 | |||
34 | for (i = 0; i < ARRAY_SIZE(lbars); i++) { | ||
35 | rdmsr(lbars[i].msr, lo, hi); | ||
36 | if (hi & 0x01) | ||
37 | lbars[i].base = lo & 0x0000ffff; | ||
38 | |||
39 | if (lbars[i].base == 0) | ||
40 | printk(KERN_ERR "geode: Couldn't initialize '%s'\n", | ||
41 | lbars[i].name); | ||
42 | } | ||
43 | } | ||
44 | |||
45 | int geode_get_dev_base(unsigned int dev) | ||
46 | { | ||
47 | BUG_ON(dev >= ARRAY_SIZE(lbars)); | ||
48 | return lbars[dev].base; | ||
49 | } | ||
50 | EXPORT_SYMBOL_GPL(geode_get_dev_base); | ||
51 | |||
52 | /* === GPIO API === */ | ||
53 | |||
54 | void geode_gpio_set(unsigned int gpio, unsigned int reg) | ||
55 | { | ||
56 | u32 base = geode_get_dev_base(GEODE_DEV_GPIO); | ||
57 | |||
58 | if (!base) | ||
59 | return; | ||
60 | |||
61 | if (gpio < 16) | ||
62 | outl(1 << gpio, base + reg); | ||
63 | else | ||
64 | outl(1 << (gpio - 16), base + 0x80 + reg); | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(geode_gpio_set); | ||
67 | |||
68 | void geode_gpio_clear(unsigned int gpio, unsigned int reg) | ||
69 | { | ||
70 | u32 base = geode_get_dev_base(GEODE_DEV_GPIO); | ||
71 | |||
72 | if (!base) | ||
73 | return; | ||
74 | |||
75 | if (gpio < 16) | ||
76 | outl(1 << (gpio + 16), base + reg); | ||
77 | else | ||
78 | outl(1 << gpio, base + 0x80 + reg); | ||
79 | } | ||
80 | EXPORT_SYMBOL_GPL(geode_gpio_clear); | ||
81 | |||
82 | int geode_gpio_isset(unsigned int gpio, unsigned int reg) | ||
83 | { | ||
84 | u32 base = geode_get_dev_base(GEODE_DEV_GPIO); | ||
85 | |||
86 | if (!base) | ||
87 | return 0; | ||
88 | |||
89 | if (gpio < 16) | ||
90 | return (inl(base + reg) & (1 << gpio)) ? 1 : 0; | ||
91 | else | ||
92 | return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; | ||
93 | } | ||
94 | EXPORT_SYMBOL_GPL(geode_gpio_isset); | ||
95 | |||
96 | void geode_gpio_set_irq(unsigned int group, unsigned int irq) | ||
97 | { | ||
98 | u32 lo, hi; | ||
99 | |||
100 | if (group > 7 || irq > 15) | ||
101 | return; | ||
102 | |||
103 | rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); | ||
104 | |||
105 | lo &= ~(0xF << (group * 4)); | ||
106 | lo |= (irq & 0xF) << (group * 4); | ||
107 | |||
108 | wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); | ||
109 | } | ||
110 | EXPORT_SYMBOL_GPL(geode_gpio_set_irq); | ||
111 | |||
112 | void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) | ||
113 | { | ||
114 | u32 base = geode_get_dev_base(GEODE_DEV_GPIO); | ||
115 | u32 offset, shift, val; | ||
116 | |||
117 | if (gpio >= 24) | ||
118 | offset = GPIO_MAP_W; | ||
119 | else if (gpio >= 16) | ||
120 | offset = GPIO_MAP_Z; | ||
121 | else if (gpio >= 8) | ||
122 | offset = GPIO_MAP_Y; | ||
123 | else | ||
124 | offset = GPIO_MAP_X; | ||
125 | |||
126 | shift = (gpio % 8) * 4; | ||
127 | |||
128 | val = inl(base + offset); | ||
129 | |||
130 | /* Clear whatever was there before */ | ||
131 | val &= ~(0xF << shift); | ||
132 | |||
133 | /* And set the new value */ | ||
134 | |||
135 | val |= ((pair & 7) << shift); | ||
136 | |||
137 | /* Set the PME bit if this is a PME event */ | ||
138 | |||
139 | if (pme) | ||
140 | val |= (1 << (shift + 3)); | ||
141 | |||
142 | outl(val, base + offset); | ||
143 | } | ||
144 | EXPORT_SYMBOL_GPL(geode_gpio_setup_event); | ||
145 | |||
146 | static int __init geode_southbridge_init(void) | ||
147 | { | ||
148 | if (!is_geode()) | ||
149 | return -ENODEV; | ||
150 | |||
151 | init_lbars(); | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | postcore_initcall(geode_southbridge_init); | ||
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S new file mode 100644 index 000000000000..9150ca9b5f80 --- /dev/null +++ b/arch/x86/kernel/head_32.S | |||
@@ -0,0 +1,578 @@ | |||
1 | /* | ||
2 | * linux/arch/x86/kernel/head_32.S -- the 32-bit startup code. | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * Enhanced CPU detection and feature setting code by Mike Jagdis | ||
7 | * and Martin Mares, November 1997. | ||
8 | */ | ||
9 | |||
10 | .text | ||
11 | #include <linux/threads.h> | ||
12 | #include <linux/linkage.h> | ||
13 | #include <asm/segment.h> | ||
14 | #include <asm/page.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | #include <asm/desc.h> | ||
17 | #include <asm/cache.h> | ||
18 | #include <asm/thread_info.h> | ||
19 | #include <asm/asm-offsets.h> | ||
20 | #include <asm/setup.h> | ||
21 | |||
22 | /* | ||
23 | * References to members of the new_cpu_data structure. | ||
24 | */ | ||
25 | |||
26 | #define X86 new_cpu_data+CPUINFO_x86 | ||
27 | #define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor | ||
28 | #define X86_MODEL new_cpu_data+CPUINFO_x86_model | ||
29 | #define X86_MASK new_cpu_data+CPUINFO_x86_mask | ||
30 | #define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math | ||
31 | #define X86_CPUID new_cpu_data+CPUINFO_cpuid_level | ||
32 | #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability | ||
33 | #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id | ||
34 | |||
35 | /* | ||
36 | * This is how much memory *in addition to the memory covered up to | ||
37 | * and including _end* we need mapped initially. | ||
38 | * We need: | ||
39 | * - one bit for each possible page, but only in low memory, which means | ||
40 | * 2^32/4096/8 = 128K worst case (4G/4G split.) | ||
41 | * - enough space to map all low memory, which means | ||
42 | * (2^32/4096) / 1024 pages (worst case, non PAE) | ||
43 | * (2^32/4096) / 512 + 4 pages (worst case for PAE) | ||
44 | * - a few pages for allocator use before the kernel pagetable has | ||
45 | * been set up | ||
46 | * | ||
47 | * Modulo rounding, each megabyte assigned here requires a kilobyte of | ||
48 | * memory, which is currently unreclaimed. | ||
49 | * | ||
50 | * This should be a multiple of a page. | ||
51 | */ | ||
52 | LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) | ||
53 | |||
54 | #if PTRS_PER_PMD > 1 | ||
55 | PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD | ||
56 | #else | ||
57 | PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) | ||
58 | #endif | ||
59 | BOOTBITMAP_SIZE = LOW_PAGES / 8 | ||
60 | ALLOCATOR_SLOP = 4 | ||
61 | |||
62 | INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm | ||
63 | |||
64 | /* | ||
65 | * 32-bit kernel entrypoint; only used by the boot CPU. On entry, | ||
66 | * %esi points to the real-mode code as a 32-bit pointer. | ||
67 | * CS and DS must be 4 GB flat segments, but we don't depend on | ||
68 | * any particular GDT layout, because we load our own as soon as we | ||
69 | * can. | ||
70 | */ | ||
71 | .section .text.head,"ax",@progbits | ||
ENTRY(startup_32)

/*
 * Set segments to known values.
 */
	cld
	lgdt boot_gdt_descr - __PAGE_OFFSET
	movl $(__BOOT_DS),%eax
	movl %eax,%ds
	movl %eax,%es
	movl %eax,%fs
	movl %eax,%gs

/*
 * Clear BSS first so that there are no surprises...
 * No need to cld as DF is already clear from cld above...
 */
	xorl %eax,%eax
	movl $__bss_start - __PAGE_OFFSET,%edi
	movl $__bss_stop - __PAGE_OFFSET,%ecx
	subl %edi,%ecx
	shrl $2,%ecx
	rep ; stosl
/*
 * Copy bootup parameters out of the way.
 * Note: %esi still has the pointer to the real-mode data.
 * With the kexec as boot loader, parameter segment might be loaded beyond
 * kernel image and might not even be addressable by early boot page tables.
 * (kexec on panic case). Hence copy out the parameters before initializing
 * page tables.
 */
	movl $(boot_params - __PAGE_OFFSET),%edi
	movl $(PARAM_SIZE/4),%ecx
	cld
	rep
	movsl
	movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
	andl %esi,%esi
	jnz 2f			# New command line protocol
	cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
	jne 1f
	movzwl OLD_CL_OFFSET,%esi
	addl $(OLD_CL_BASE_ADDR),%esi
2:
	movl $(boot_command_line - __PAGE_OFFSET),%edi
	movl $(COMMAND_LINE_SIZE/4),%ecx
	rep
	movsl
1:

/*
 * Initialize page tables.  This creates a PDE and a set of page
 * tables, which are located immediately beyond _end.  The variable
 * init_pg_tables_end is set up to point to the first "safe" location.
 * Mappings are created both at virtual address 0 (identity mapping)
 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
 *
 * Warning: don't use %esi or the stack in this code.  However, %esp
 * can be used as a GPR if you really need it...
 */
page_pde_offset = (__PAGE_OFFSET >> 20);

	movl $(pg0 - __PAGE_OFFSET), %edi
	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
	movl $0x007, %eax			/* 0x007 = PRESENT+RW+USER */
10:
	leal 0x007(%edi),%ecx			/* Create PDE entry */
	movl %ecx,(%edx)			/* Store identity PDE entry */
	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
	addl $4,%edx
	movl $1024, %ecx
11:
	stosl
	addl $0x1000,%eax
	loop 11b
	/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
	/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
	leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
	cmpl %ebp,%eax
	jb 10b
	movl %edi,(init_pg_tables_end - __PAGE_OFFSET)

	/*
	 * Do an early initialization of the fixmap area: point the last
	 * page-directory slot (byte offset 4092) at swapper_pg_pmd.
	 *
	 * This must sit before the unconditional "jmp 3f" below.  Placed
	 * after the jump (and before startup_32_smp) it has no label and
	 * is never executed.  It uses only %eax and %edx, so the
	 * "no %esi, no stack" rule above still holds.
	 */
	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
	movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
	addl $0x007, %eax			/* 0x007 = PRESENT+RW+USER */
	movl %eax, 4092(%edx)

	xorl %ebx,%ebx				/* This is the boot CPU (BSP) */
	jmp 3f
/*
 * Non-boot CPU entry point; entered from trampoline.S
 * We can't lgdt here, because lgdt itself uses a data segment, but
 * we know the trampoline has already loaded the boot_gdt for us.
 *
 * If cpu hotplug is not supported then this code can go in init section
 * which will be freed later
 */

#ifndef CONFIG_HOTPLUG_CPU
.section .init.text,"ax",@progbits
#endif

175 | #ifdef CONFIG_SMP | ||
176 | ENTRY(startup_32_smp) /* secondary-CPU entry: load segments, apply cr4/EFER features, continue at 3: with %ebx = 1 */ | ||
177 | cld | ||
178 | movl $(__BOOT_DS),%eax | ||
179 | movl %eax,%ds | ||
180 | movl %eax,%es | ||
181 | movl %eax,%fs | ||
182 | movl %eax,%gs | ||
183 | |||
184 | /* | ||
185 | * New page tables may be in 4Mbyte page mode and may | ||
186 | * be using the global pages. | ||
187 | * | ||
188 | * NOTE! If we are on a 486 we may have no cr4 at all! | ||
189 | * So we do not try to touch it unless we really have | ||
190 | * some bits in it to set. This won't work if the BSP | ||
191 | * implements cr4 but this AP does not -- very unlikely | ||
192 | * but be warned! The same applies to the pse feature | ||
193 | * if not equally supported. --macro | ||
194 | * | ||
195 | * NOTE! We have to correct for the fact that we're | ||
196 | * not yet offset PAGE_OFFSET.. | ||
197 | */ | ||
198 | #define cr4_bits mmu_cr4_features-__PAGE_OFFSET | ||
199 | movl cr4_bits,%edx | ||
200 | andl %edx,%edx | ||
201 | jz 6f /* no feature bits recorded: leave %cr4 untouched */ | ||
202 | movl %cr4,%eax # Turn on paging options (PSE,PAE,..) | ||
203 | orl %edx,%eax | ||
204 | movl %eax,%cr4 | ||
205 | |||
206 | btl $5, %eax # check if PAE is enabled | ||
207 | jnc 6f | ||
208 | |||
209 | /* Check if extended functions are implemented */ | ||
210 | movl $0x80000000, %eax | ||
211 | cpuid | ||
212 | cmpl $0x80000000, %eax | ||
213 | jbe 6f /* reported maximum is not above 0x80000000: skip the 0x80000001 query */ | ||
214 | mov $0x80000001, %eax | ||
215 | cpuid | ||
216 | /* Execute Disable bit supported? */ | ||
217 | btl $20, %edx | ||
218 | jnc 6f | ||
219 | |||
220 | /* Setup EFER (Extended Feature Enable Register) */ | ||
221 | movl $0xc0000080, %ecx | ||
222 | rdmsr | ||
223 | |||
224 | btsl $11, %eax /* set bit 11 of the low EFER word -- presumably the execute-disable enable, given the check above */ | ||
225 | /* Make changes effective */ | ||
226 | wrmsr | ||
227 | |||
228 | 6: | ||
229 | /* This is a secondary processor (AP) */ | ||
230 | xorl %ebx,%ebx | ||
231 | incl %ebx /* %ebx = 1; tested after paging is enabled to skip setup_idt */ | ||
232 | |||
233 | #endif /* CONFIG_SMP */ | ||
234 | 3: /* common path: %ebx is 0 on the boot CPU and 1 on a secondary CPU */ | ||
235 | |||
236 | /* | ||
237 | * Enable paging | ||
238 | */ | ||
239 | movl $swapper_pg_dir-__PAGE_OFFSET,%eax | ||
240 | movl %eax,%cr3 /* set the page table pointer.. */ | ||
241 | movl %cr0,%eax | ||
242 | orl $0x80000000,%eax | ||
243 | movl %eax,%cr0 /* ..and set paging (PG) bit */ | ||
244 | ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ | ||
245 | 1: | ||
246 | /* Set up the stack pointer */ | ||
247 | lss stack_start,%esp /* loads %esp and %ss from the stack_start pair in .data */ | ||
248 | |||
249 | /* | ||
250 | * Initialize eflags. Some BIOS's leave bits like NT set. This would | ||
251 | * confuse the debugger if this code is traced. | ||
252 | * XXX - best to initialize before switching to protected mode. | ||
253 | */ | ||
254 | pushl $0 | ||
255 | popfl /* load EFLAGS from the zero just pushed */ | ||
256 | |||
257 | #ifdef CONFIG_SMP | ||
258 | andl %ebx,%ebx | ||
259 | jz 1f /* Initial CPU cleans BSS */ | ||
260 | jmp checkCPUtype /* secondary CPU: skip setup_idt */ | ||
261 | 1: | ||
262 | #endif /* CONFIG_SMP */ | ||
263 | |||
264 | /* | ||
265 | * start system 32-bit setup. We need to re-do some of the things done | ||
266 | * in 16-bit mode for the "real" operations. | ||
267 | */ | ||
268 | call setup_idt /* fills idt_table; the IDT itself is loaded later with lidt */ | ||
269 | |||
270 | checkCPUtype: /* identify the CPU into new_cpu_data, set up %cr0, load the final GDT/IDT and segments, jump into C */ | ||
271 | |||
272 | movl $-1,X86_CPUID # -1 for no CPUID initially | ||
273 | |||
274 | /* check if it is 486 or 386. */ | ||
275 | /* | ||
276 | * XXX - this does a lot of unnecessary setup. Alignment checks don't | ||
277 | * apply at our cpl of 0 and the stack ought to be aligned already, and | ||
278 | * we don't need to preserve eflags. | ||
279 | */ | ||
280 | |||
281 | movb $3,X86 # at least 386 | ||
282 | pushfl # push EFLAGS | ||
283 | popl %eax # get EFLAGS | ||
284 | movl %eax,%ecx # save original EFLAGS | ||
285 | xorl $0x240000,%eax # flip AC and ID bits in EFLAGS | ||
286 | pushl %eax # copy to EFLAGS | ||
287 | popfl # set EFLAGS | ||
288 | pushfl # get new EFLAGS | ||
289 | popl %eax # put it in eax | ||
290 | xorl %ecx,%eax # change in flags | ||
291 | pushl %ecx # restore original EFLAGS | ||
292 | popfl | ||
293 | testl $0x40000,%eax # check if AC bit changed | ||
294 | je is386 | ||
295 | |||
296 | movb $4,X86 # at least 486 | ||
297 | testl $0x200000,%eax # check if ID bit changed | ||
298 | je is486 | ||
299 | |||
300 | /* get vendor info */ | ||
301 | xorl %eax,%eax # call CPUID with 0 -> return vendor ID | ||
302 | cpuid | ||
303 | movl %eax,X86_CPUID # save CPUID level | ||
304 | movl %ebx,X86_VENDOR_ID # lo 4 chars | ||
305 | movl %edx,X86_VENDOR_ID+4 # next 4 chars | ||
306 | movl %ecx,X86_VENDOR_ID+8 # last 4 chars | ||
307 | |||
308 | orl %eax,%eax # do we have processor info as well? | ||
309 | je is486 | ||
310 | |||
311 | movl $1,%eax # Use the CPUID instruction to get CPU type | ||
312 | cpuid | ||
313 | movb %al,%cl # save reg for future use | ||
314 | andb $0x0f,%ah # mask processor family | ||
315 | movb %ah,X86 | ||
316 | andb $0xf0,%al # mask model | ||
317 | shrb $4,%al | ||
318 | movb %al,X86_MODEL | ||
319 | andb $0x0f,%cl # mask mask revision | ||
320 | movb %cl,X86_MASK | ||
321 | movl %edx,X86_CAPABILITY | ||
322 | |||
323 | is486: movl $0x50022,%ecx # set AM, WP, NE and MP | ||
324 | jmp 2f | ||
325 | |||
326 | is386: movl $2,%ecx # set MP | ||
327 | 2: movl %cr0,%eax | ||
328 | andl $0x80000011,%eax # Save PG,PE,ET | ||
329 | orl %ecx,%eax | ||
330 | movl %eax,%cr0 | ||
331 | |||
332 | call check_x87 | ||
333 | lgdt early_gdt_descr | ||
334 | lidt idt_descr | ||
335 | ljmp $(__KERNEL_CS),$1f /* far jump reloads %cs with __KERNEL_CS */ | ||
336 | 1: movl $(__KERNEL_DS),%eax # reload all the segment registers | ||
337 | movl %eax,%ss # after changing gdt. | ||
338 | movl %eax,%fs # gets reset once there's real percpu | ||
339 | |||
340 | movl $(__USER_DS),%eax # DS/ES contains default USER segment | ||
341 | movl %eax,%ds | ||
342 | movl %eax,%es | ||
343 | |||
344 | xorl %eax,%eax # Clear GS and LDT | ||
345 | movl %eax,%gs | ||
346 | lldt %ax | ||
347 | |||
348 | cld # gcc2 wants the direction flag cleared at all times | ||
349 | pushl $0 # fake return address for unwinder | ||
350 | #ifdef CONFIG_SMP | ||
351 | movb ready, %cl | ||
352 | movb $1, ready /* CPUs arriving later read 1 and take the initialize_secondary path */ | ||
353 | cmpb $0,%cl # the first CPU calls start_kernel | ||
354 | je 1f | ||
355 | movl $(__KERNEL_PERCPU), %eax | ||
356 | movl %eax,%fs # set this cpu's percpu | ||
357 | jmp initialize_secondary # all other CPUs call initialize_secondary | ||
358 | 1: | ||
359 | #endif /* CONFIG_SMP */ | ||
360 | jmp start_kernel | ||
361 | |||
362 | /* | ||
363 | * We depend on ET to be correct. This checks for 287/387. | ||
364 | */ | ||
365 | check_x87: /* probe for an FPU with fninit/fstsw and record the result in X86_HARD_MATH */ | ||
366 | movb $0,X86_HARD_MATH | ||
367 | clts | ||
368 | fninit | ||
369 | fstsw %ax | ||
370 | cmpb $0,%al /* a zero low status byte after fninit is taken as "FPU present" */ | ||
371 | je 1f | ||
372 | movl %cr0,%eax /* no coprocessor: have to set bits */ | ||
373 | xorl $4,%eax /* set EM */ | ||
374 | movl %eax,%cr0 | ||
375 | ret /* X86_HARD_MATH stays 0 on this path */ | ||
376 | ALIGN | ||
377 | 1: movb $1,X86_HARD_MATH | ||
378 | .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ | ||
379 | ret | ||
380 | |||
381 | /* | ||
382 | * setup_idt | ||
383 | * | ||
384 | * sets up a idt with 256 entries pointing to | ||
385 | * ignore_int, interrupt gates. It doesn't actually load | ||
386 | * idt - that can be done only after paging has been enabled | ||
387 | * and the kernel moved to PAGE_OFFSET. Interrupts | ||
388 | * are enabled elsewhere, when we can be relatively | ||
389 | * sure everything is ok. | ||
390 | * | ||
391 | * Warning: %esi is live across this function. | ||
392 | */ | ||
393 | setup_idt: /* fill all 256 idt_table slots with ignore_int gates, then install four early fault handlers */ | ||
394 | lea ignore_int,%edx | ||
395 | movl $(__KERNEL_CS << 16),%eax | ||
396 | movw %dx,%ax /* selector = 0x0010 = cs */ | ||
397 | movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ | ||
398 | |||
399 | lea idt_table,%edi | ||
400 | mov $256,%ecx | ||
401 | rp_sidt: | ||
402 | movl %eax,(%edi) /* low dword: __KERNEL_CS in the upper half, handler address bits 15..0 below */ | ||
403 | movl %edx,4(%edi) /* high dword: handler address bits 31..16 above, 0x8E00 in the lower half */ | ||
404 | addl $8,%edi /* each IDT entry is 8 bytes */ | ||
405 | dec %ecx | ||
406 | jne rp_sidt | ||
407 | |||
408 | .macro set_early_handler handler,trapno | ||
409 | lea \handler,%edx | ||
410 | movl $(__KERNEL_CS << 16),%eax | ||
411 | movw %dx,%ax | ||
412 | movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ | ||
413 | lea idt_table,%edi | ||
414 | movl %eax,8*\trapno(%edi) /* overwrite entry number \trapno, same layout as above */ | ||
415 | movl %edx,8*\trapno+4(%edi) | ||
416 | .endm | ||
417 | |||
418 | set_early_handler handler=early_divide_err,trapno=0 | ||
419 | set_early_handler handler=early_illegal_opcode,trapno=6 | ||
420 | set_early_handler handler=early_protection_fault,trapno=13 | ||
421 | set_early_handler handler=early_page_fault,trapno=14 | ||
422 | |||
423 | ret | ||
424 | |||
425 | early_divide_err: /* early trap 0 stub: %edx = 0, then join early_fault */ | ||
426 | xor %edx,%edx /* trap number 0 */ | ||
427 | pushl $0 /* fake errcode */ | ||
428 | jmp early_fault | ||
429 | |||
430 | early_illegal_opcode: /* early trap 6 stub */ | ||
431 | movl $6,%edx | ||
432 | pushl $0 /* fake errcode */ | ||
433 | jmp early_fault | ||
434 | |||
435 | early_protection_fault: /* early trap 13 stub: no fake errcode is pushed here */ | ||
436 | movl $13,%edx | ||
437 | jmp early_fault | ||
438 | |||
439 | early_page_fault: /* early trap 14 stub: no fake errcode is pushed here */ | ||
440 | movl $14,%edx | ||
441 | jmp early_fault | ||
442 | |||
443 | early_fault: /* common tail of the early stubs: print fault_msg (when CONFIG_PRINTK) and halt; %edx = trap number */ | ||
444 | cld | ||
445 | #ifdef CONFIG_PRINTK | ||
446 | movl $(__KERNEL_DS),%eax | ||
447 | movl %eax,%ds | ||
448 | movl %eax,%es | ||
449 | cmpl $2,early_recursion_flag | ||
450 | je hlt_loop /* the flag has already reached 2: halt without printing again */ | ||
451 | incl early_recursion_flag | ||
452 | movl %cr2,%eax | ||
453 | pushl %eax /* value for the "CR2 %p" field of fault_msg */ | ||
454 | pushl %edx /* trapno */ | ||
455 | pushl $fault_msg /* format string; its remaining fields take whatever already lies above on the stack */ | ||
456 | #ifdef CONFIG_EARLY_PRINTK | ||
457 | call early_printk | ||
458 | #else | ||
459 | call printk | ||
460 | #endif | ||
461 | #endif | ||
462 | hlt_loop: | ||
463 | hlt | ||
464 | jmp hlt_loop /* halt again whenever hlt falls through */ | ||
465 | |||
466 | /* This is the default interrupt "handler" :-) */ | ||
467 | ALIGN | ||
468 | ignore_int: /* default IDT handler: print int_msg (when CONFIG_PRINTK), then iret */ | ||
469 | cld | ||
470 | #ifdef CONFIG_PRINTK | ||
471 | pushl %eax | ||
472 | pushl %ecx | ||
473 | pushl %edx | ||
474 | pushl %es | ||
475 | pushl %ds | ||
476 | movl $(__KERNEL_DS),%eax | ||
477 | movl %eax,%ds | ||
478 | movl %eax,%es | ||
479 | cmpl $2,early_recursion_flag | ||
480 | je hlt_loop /* the flag has already reached 2: give up and halt */ | ||
481 | incl early_recursion_flag | ||
482 | pushl 16(%esp) /* four stack words copied as arguments for the message */ | ||
483 | pushl 24(%esp) | ||
484 | pushl 32(%esp) | ||
485 | pushl 40(%esp) | ||
486 | pushl $int_msg | ||
487 | #ifdef CONFIG_EARLY_PRINTK | ||
488 | call early_printk | ||
489 | #else | ||
490 | call printk | ||
491 | #endif | ||
492 | addl $(5*4),%esp /* drop the five words pushed for the call */ | ||
493 | popl %ds | ||
494 | popl %es | ||
495 | popl %edx | ||
496 | popl %ecx | ||
497 | popl %eax | ||
498 | #endif | ||
499 | iret | ||
500 | |||
501 | .section .text | ||
502 | /* | ||
503 | * Real beginning of normal "text" segment | ||
504 | */ | ||
505 | ENTRY(stext) | ||
506 | ENTRY(_stext) | ||
507 | |||
508 | /* | ||
509 | * BSS section | ||
510 | */ | ||
511 | .section ".bss.page_aligned","wa" | ||
512 | .align PAGE_SIZE_asm | ||
513 | ENTRY(swapper_pg_dir) | ||
514 | .fill 1024,4,0 | ||
515 | ENTRY(swapper_pg_pmd) | ||
516 | .fill 1024,4,0 | ||
517 | ENTRY(empty_zero_page) | ||
518 | .fill 4096,1,0 | ||
519 | |||
520 | /* | ||
521 | * This starts the data section. | ||
522 | */ | ||
523 | .data | ||
524 | ENTRY(stack_start) | ||
525 | .long init_thread_union+THREAD_SIZE | ||
526 | .long __BOOT_DS | ||
527 | |||
528 | ready: .byte 0 | ||
529 | |||
530 | early_recursion_flag: | ||
531 | .long 0 | ||
532 | |||
533 | int_msg: | ||
534 | .asciz "Unknown interrupt or fault at EIP %p %p %p\n" | ||
535 | |||
536 | fault_msg: | ||
537 | .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" | ||
538 | .asciz "Stack: %p %p %p %p %p %p %p %p\n" | ||
539 | |||
540 | #include "../../x86/xen/xen-head.S" | ||
541 | |||
542 | /* | ||
543 | * The IDT and GDT 'descriptors' are a strange 48-bit object | ||
544 | * only used by the lidt and lgdt instructions. They are not | ||
545 | * like usual segment descriptors - they consist of a 16-bit | ||
546 | * segment size, and 32-bit linear address value: | ||
547 | */ | ||
548 | |||
549 | .globl boot_gdt_descr | ||
550 | .globl idt_descr | ||
551 | |||
552 | ALIGN | ||
553 | # early boot GDT descriptor (must use 1:1 address mapping) | ||
554 | .word 0 # 32 bit align gdt_desc.address | ||
555 | boot_gdt_descr: | ||
556 | .word __BOOT_DS+7 | ||
557 | .long boot_gdt - __PAGE_OFFSET | ||
558 | |||
559 | .word 0 # 32-bit align idt_desc.address | ||
560 | idt_descr: | ||
561 | .word IDT_ENTRIES*8-1 # idt contains 256 entries | ||
562 | .long idt_table | ||
563 | |||
564 | # boot GDT descriptor (later on used by CPU#0): | ||
565 | .word 0 # 32 bit align gdt_desc.address | ||
566 | ENTRY(early_gdt_descr) | ||
567 | .word GDT_ENTRIES*8-1 | ||
568 | .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ | ||
569 | |||
570 | /* | ||
571 | * The boot_gdt must mirror the equivalent in setup.S and is | ||
572 | * used only for booting. | ||
573 | */ | ||
574 | .align L1_CACHE_BYTES | ||
575 | ENTRY(boot_gdt) | ||
576 | .fill GDT_ENTRY_BOOT_CS,8,0 | ||
577 | .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ | ||
578 | .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ | ||
diff --git a/arch/x86/kernel/hpet_32.c b/arch/x86/kernel/hpet_32.c new file mode 100644 index 000000000000..533d4932bc79 --- /dev/null +++ b/arch/x86/kernel/hpet_32.c | |||
@@ -0,0 +1,553 @@ | |||
1 | #include <linux/clocksource.h> | ||
2 | #include <linux/clockchips.h> | ||
3 | #include <linux/errno.h> | ||
4 | #include <linux/hpet.h> | ||
5 | #include <linux/init.h> | ||
6 | #include <linux/sysdev.h> | ||
7 | #include <linux/pm.h> | ||
8 | #include <linux/delay.h> | ||
9 | |||
10 | #include <asm/hpet.h> | ||
11 | #include <asm/io.h> | ||
12 | |||
13 | extern struct clock_event_device *global_clock_event; | ||
14 | |||
15 | #define HPET_MASK CLOCKSOURCE_MASK(32) | ||
16 | #define HPET_SHIFT 22 | ||
17 | |||
18 | /* FSEC = 10^-15 NSEC = 10^-9 */ | ||
19 | #define FSEC_PER_NSEC 1000000 | ||
20 | |||
21 | /* | ||
22 | * HPET address is set in acpi/boot.c, when an ACPI entry exists | ||
23 | */ | ||
24 | unsigned long hpet_address; | ||
25 | static void __iomem * hpet_virt_address; | ||
26 | |||
27 | static inline unsigned long hpet_readl(unsigned long a) | ||
28 | { | ||
29 | return readl(hpet_virt_address + a); | ||
30 | } | ||
31 | |||
32 | static inline void hpet_writel(unsigned long d, unsigned long a) | ||
33 | { | ||
34 | writel(d, hpet_virt_address + a); | ||
35 | } | ||
36 | |||
37 | /* | ||
38 | * HPET command line enable / disable | ||
39 | */ | ||
40 | static int boot_hpet_disable; | ||
41 | |||
42 | static int __init hpet_setup(char* str) | ||
43 | { | ||
44 | if (str) { | ||
45 | if (!strncmp("disable", str, 7)) | ||
46 | boot_hpet_disable = 1; | ||
47 | } | ||
48 | return 1; | ||
49 | } | ||
50 | __setup("hpet=", hpet_setup); | ||
51 | |||
52 | static inline int is_hpet_capable(void) | ||
53 | { | ||
54 | return (!boot_hpet_disable && hpet_address); | ||
55 | } | ||
56 | |||
57 | /* | ||
58 | * HPET timer interrupt enable / disable | ||
59 | */ | ||
60 | static int hpet_legacy_int_enabled; | ||
61 | |||
62 | /** | ||
63 | * is_hpet_enabled - check whether the hpet timer interrupt is enabled | ||
64 | */ | ||
65 | int is_hpet_enabled(void) | ||
66 | { | ||
67 | return is_hpet_capable() && hpet_legacy_int_enabled; | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * When the hpet driver (/dev/hpet) is enabled, we need to reserve | ||
72 | * timer 0 and timer 1 in case of RTC emulation. | ||
73 | */ | ||
#ifdef CONFIG_HPET
/*
 * Describe the HPET block to the hpet driver: fill in a struct hpet_data,
 * reserve timer 0 (and timer 1 when RTC emulation is configured), record
 * the interrupt of every timer and hand the result to hpet_alloc().
 *
 * @id: value of the HPET ID register
 */
static void hpet_reserve_platform_timers(unsigned long id)
{
	struct hpet __iomem *regs = hpet_virt_address;
	struct hpet_data hd;
	unsigned int nrtimers, n;

	/* The timer count is the ID register's number field plus one. */
	nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;

	memset(&hd, 0, sizeof(hd));
	hd.hd_phys_address = hpet_address;
	hd.hd_address = hpet_virt_address;
	hd.hd_nirqs = nrtimers;
	hd.hd_flags = HPET_DATA_PLATFORM;

	hpet_reserve_timer(&hd, 0);
#ifdef CONFIG_HPET_EMULATE_RTC
	hpet_reserve_timer(&hd, 1);
#endif

	/* Timers 0 and 1 get the fixed legacy interrupt values. */
	hd.hd_irq[0] = HPET_LEGACY_8254;
	hd.hd_irq[1] = HPET_LEGACY_RTC;

	/* The remaining timers report their routing in their config word. */
	for (n = 2; n < nrtimers; n++) {
		struct hpet_timer __iomem *timer = &regs->hpet_timers[n];

		hd.hd_irq[n] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
			Tn_INT_ROUTE_CNF_SHIFT;
	}

	hpet_alloc(&hd);
}
#else
/* Nothing to reserve when the hpet driver is not configured. */
static void hpet_reserve_platform_timers(unsigned long id) { }
#endif
108 | |||
109 | /* | ||
110 | * Common hpet info | ||
111 | */ | ||
112 | static unsigned long hpet_period; | ||
113 | |||
114 | static void hpet_set_mode(enum clock_event_mode mode, | ||
115 | struct clock_event_device *evt); | ||
116 | static int hpet_next_event(unsigned long delta, | ||
117 | struct clock_event_device *evt); | ||
118 | |||
119 | /* | ||
120 | * The hpet clock event device | ||
121 | */ | ||
122 | static struct clock_event_device hpet_clockevent = { | ||
123 | .name = "hpet", | ||
124 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, | ||
125 | .set_mode = hpet_set_mode, | ||
126 | .set_next_event = hpet_next_event, | ||
127 | .shift = 32, | ||
128 | .irq = 0, | ||
129 | }; | ||
130 | |||
131 | static void hpet_start_counter(void) | ||
132 | { | ||
133 | unsigned long cfg = hpet_readl(HPET_CFG); | ||
134 | |||
135 | cfg &= ~HPET_CFG_ENABLE; | ||
136 | hpet_writel(cfg, HPET_CFG); | ||
137 | hpet_writel(0, HPET_COUNTER); | ||
138 | hpet_writel(0, HPET_COUNTER + 4); | ||
139 | cfg |= HPET_CFG_ENABLE; | ||
140 | hpet_writel(cfg, HPET_CFG); | ||
141 | } | ||
142 | |||
143 | static void hpet_enable_int(void) | ||
144 | { | ||
145 | unsigned long cfg = hpet_readl(HPET_CFG); | ||
146 | |||
147 | cfg |= HPET_CFG_LEGACY; | ||
148 | hpet_writel(cfg, HPET_CFG); | ||
149 | hpet_legacy_int_enabled = 1; | ||
150 | } | ||
151 | |||
152 | static void hpet_set_mode(enum clock_event_mode mode, | ||
153 | struct clock_event_device *evt) | ||
154 | { | ||
155 | unsigned long cfg, cmp, now; | ||
156 | uint64_t delta; | ||
157 | |||
158 | switch(mode) { | ||
159 | case CLOCK_EVT_MODE_PERIODIC: | ||
160 | delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; | ||
161 | delta >>= hpet_clockevent.shift; | ||
162 | now = hpet_readl(HPET_COUNTER); | ||
163 | cmp = now + (unsigned long) delta; | ||
164 | cfg = hpet_readl(HPET_T0_CFG); | ||
165 | cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | | ||
166 | HPET_TN_SETVAL | HPET_TN_32BIT; | ||
167 | hpet_writel(cfg, HPET_T0_CFG); | ||
168 | /* | ||
169 | * The first write after writing TN_SETVAL to the | ||
170 | * config register sets the counter value, the second | ||
171 | * write sets the period. | ||
172 | */ | ||
173 | hpet_writel(cmp, HPET_T0_CMP); | ||
174 | udelay(1); | ||
175 | hpet_writel((unsigned long) delta, HPET_T0_CMP); | ||
176 | break; | ||
177 | |||
178 | case CLOCK_EVT_MODE_ONESHOT: | ||
179 | cfg = hpet_readl(HPET_T0_CFG); | ||
180 | cfg &= ~HPET_TN_PERIODIC; | ||
181 | cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; | ||
182 | hpet_writel(cfg, HPET_T0_CFG); | ||
183 | break; | ||
184 | |||
185 | case CLOCK_EVT_MODE_UNUSED: | ||
186 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
187 | cfg = hpet_readl(HPET_T0_CFG); | ||
188 | cfg &= ~HPET_TN_ENABLE; | ||
189 | hpet_writel(cfg, HPET_T0_CFG); | ||
190 | break; | ||
191 | |||
192 | case CLOCK_EVT_MODE_RESUME: | ||
193 | hpet_enable_int(); | ||
194 | break; | ||
195 | } | ||
196 | } | ||
197 | |||
198 | static int hpet_next_event(unsigned long delta, | ||
199 | struct clock_event_device *evt) | ||
200 | { | ||
201 | unsigned long cnt; | ||
202 | |||
203 | cnt = hpet_readl(HPET_COUNTER); | ||
204 | cnt += delta; | ||
205 | hpet_writel(cnt, HPET_T0_CMP); | ||
206 | |||
207 | return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Clock source related code | ||
212 | */ | ||
213 | static cycle_t read_hpet(void) | ||
214 | { | ||
215 | return (cycle_t)hpet_readl(HPET_COUNTER); | ||
216 | } | ||
217 | |||
218 | static struct clocksource clocksource_hpet = { | ||
219 | .name = "hpet", | ||
220 | .rating = 250, | ||
221 | .read = read_hpet, | ||
222 | .mask = HPET_MASK, | ||
223 | .shift = HPET_SHIFT, | ||
224 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
225 | .resume = hpet_start_counter, | ||
226 | }; | ||
227 | |||
228 | /* | ||
229 | * Try to setup the HPET timer | ||
230 | */ | ||
231 | int __init hpet_enable(void) | ||
232 | { | ||
233 | unsigned long id; | ||
234 | uint64_t hpet_freq; | ||
235 | u64 tmp, start, now; | ||
236 | cycle_t t1; | ||
237 | |||
238 | if (!is_hpet_capable()) | ||
239 | return 0; | ||
240 | |||
241 | hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); | ||
242 | |||
243 | /* | ||
244 | * Read the period and check for a sane value: | ||
245 | */ | ||
246 | hpet_period = hpet_readl(HPET_PERIOD); | ||
247 | if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) | ||
248 | goto out_nohpet; | ||
249 | |||
250 | /* | ||
251 | * The period is a femto seconds value. We need to calculate the | ||
252 | * scaled math multiplication factor for nanosecond to hpet tick | ||
253 | * conversion. | ||
254 | */ | ||
255 | hpet_freq = 1000000000000000ULL; | ||
256 | do_div(hpet_freq, hpet_period); | ||
257 | hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, | ||
258 | NSEC_PER_SEC, 32); | ||
259 | /* Calculate the min / max delta */ | ||
260 | hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, | ||
261 | &hpet_clockevent); | ||
262 | hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, | ||
263 | &hpet_clockevent); | ||
264 | |||
265 | /* | ||
266 | * Read the HPET ID register to retrieve the IRQ routing | ||
267 | * information and the number of channels | ||
268 | */ | ||
269 | id = hpet_readl(HPET_ID); | ||
270 | |||
271 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
272 | /* | ||
273 | * The legacy routing mode needs at least two channels, tick timer | ||
274 | * and the rtc emulation channel. | ||
275 | */ | ||
276 | if (!(id & HPET_ID_NUMBER)) | ||
277 | goto out_nohpet; | ||
278 | #endif | ||
279 | |||
280 | /* Start the counter */ | ||
281 | hpet_start_counter(); | ||
282 | |||
283 | /* Verify whether hpet counter works */ | ||
284 | t1 = read_hpet(); | ||
285 | rdtscll(start); | ||
286 | |||
287 | /* | ||
288 | * We don't know the TSC frequency yet, but waiting for | ||
289 | * 200000 TSC cycles is safe: | ||
290 | * 4 GHz == 50us | ||
291 | * 1 GHz == 200us | ||
292 | */ | ||
293 | do { | ||
294 | rep_nop(); | ||
295 | rdtscll(now); | ||
296 | } while ((now - start) < 200000UL); | ||
297 | |||
298 | if (t1 == read_hpet()) { | ||
299 | printk(KERN_WARNING | ||
300 | "HPET counter not counting. HPET disabled\n"); | ||
301 | goto out_nohpet; | ||
302 | } | ||
303 | |||
304 | /* Initialize and register HPET clocksource | ||
305 | * | ||
306 | * hpet period is in femto seconds per cycle | ||
307 | * so we need to convert this to ns/cyc units | ||
308 | * aproximated by mult/2^shift | ||
309 | * | ||
310 | * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift | ||
311 | * fsec/cyc * 1ns/1000000fsec * 2^shift = mult | ||
312 | * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult | ||
313 | * (fsec/cyc << shift)/1000000 = mult | ||
314 | * (hpet_period << shift)/FSEC_PER_NSEC = mult | ||
315 | */ | ||
316 | tmp = (u64)hpet_period << HPET_SHIFT; | ||
317 | do_div(tmp, FSEC_PER_NSEC); | ||
318 | clocksource_hpet.mult = (u32)tmp; | ||
319 | |||
320 | clocksource_register(&clocksource_hpet); | ||
321 | |||
322 | if (id & HPET_ID_LEGSUP) { | ||
323 | hpet_enable_int(); | ||
324 | hpet_reserve_platform_timers(id); | ||
325 | /* | ||
326 | * Start hpet with the boot cpu mask and make it | ||
327 | * global after the IO_APIC has been initialized. | ||
328 | */ | ||
329 | hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); | ||
330 | clockevents_register_device(&hpet_clockevent); | ||
331 | global_clock_event = &hpet_clockevent; | ||
332 | return 1; | ||
333 | } | ||
334 | return 0; | ||
335 | |||
336 | out_nohpet: | ||
337 | iounmap(hpet_virt_address); | ||
338 | hpet_virt_address = NULL; | ||
339 | boot_hpet_disable = 1; | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | |||
344 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
345 | |||
346 | /* HPET in LegacyReplacement Mode eats up RTC interrupt line. When HPET | ||
347 | * is enabled, we support RTC interrupt functionality in software. | ||
348 | * RTC has 3 kinds of interrupts: | ||
349 | * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock | ||
350 | * is updated | ||
351 | * 2) Alarm Interrupt - generate an interrupt at a specific time of day | ||
352 | * 3) Periodic Interrupt - generate periodic interrupt, with frequencies | ||
353 | * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) | ||
354 | * (1) and (2) above are implemented using polling at a frequency of | ||
355 | * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt | ||
356 | * overhead. (DEFAULT_RTC_INT_FREQ) | ||
357 | * For (3), we use interrupts at 64Hz or user specified periodic | ||
358 | * frequency, whichever is higher. | ||
359 | */ | ||
360 | #include <linux/mc146818rtc.h> | ||
361 | #include <linux/rtc.h> | ||
362 | |||
363 | #define DEFAULT_RTC_INT_FREQ 64 | ||
364 | #define DEFAULT_RTC_SHIFT 6 | ||
365 | #define RTC_NUM_INTS 1 | ||
366 | |||
367 | static unsigned long hpet_rtc_flags; | ||
368 | static unsigned long hpet_prev_update_sec; | ||
369 | static struct rtc_time hpet_alarm_time; | ||
370 | static unsigned long hpet_pie_count; | ||
371 | static unsigned long hpet_t1_cmp; | ||
372 | static unsigned long hpet_default_delta; | ||
373 | static unsigned long hpet_pie_delta; | ||
374 | static unsigned long hpet_pie_limit; | ||
375 | |||
376 | /* | ||
377 | * Timer 1 for RTC emulation. We use one shot mode, as periodic mode | ||
378 | * is not supported by all HPET implementations for timer 1. | ||
379 | * | ||
380 | * hpet_rtc_timer_init() is called when the rtc is initialized. | ||
381 | */ | ||
382 | int hpet_rtc_timer_init(void) | ||
383 | { | ||
384 | unsigned long cfg, cnt, delta, flags; | ||
385 | |||
386 | if (!is_hpet_enabled()) | ||
387 | return 0; | ||
388 | |||
389 | if (!hpet_default_delta) { | ||
390 | uint64_t clc; | ||
391 | |||
392 | clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; | ||
393 | clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; | ||
394 | hpet_default_delta = (unsigned long) clc; | ||
395 | } | ||
396 | |||
397 | if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) | ||
398 | delta = hpet_default_delta; | ||
399 | else | ||
400 | delta = hpet_pie_delta; | ||
401 | |||
402 | local_irq_save(flags); | ||
403 | |||
404 | cnt = delta + hpet_readl(HPET_COUNTER); | ||
405 | hpet_writel(cnt, HPET_T1_CMP); | ||
406 | hpet_t1_cmp = cnt; | ||
407 | |||
408 | cfg = hpet_readl(HPET_T1_CFG); | ||
409 | cfg &= ~HPET_TN_PERIODIC; | ||
410 | cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; | ||
411 | hpet_writel(cfg, HPET_T1_CFG); | ||
412 | |||
413 | local_irq_restore(flags); | ||
414 | |||
415 | return 1; | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * The functions below are called from rtc driver. | ||
420 | * Return 0 if HPET is not being used. | ||
421 | * Otherwise do the necessary changes and return 1. | ||
422 | */ | ||
423 | int hpet_mask_rtc_irq_bit(unsigned long bit_mask) | ||
424 | { | ||
425 | if (!is_hpet_enabled()) | ||
426 | return 0; | ||
427 | |||
428 | hpet_rtc_flags &= ~bit_mask; | ||
429 | return 1; | ||
430 | } | ||
431 | |||
432 | int hpet_set_rtc_irq_bit(unsigned long bit_mask) | ||
433 | { | ||
434 | unsigned long oldbits = hpet_rtc_flags; | ||
435 | |||
436 | if (!is_hpet_enabled()) | ||
437 | return 0; | ||
438 | |||
439 | hpet_rtc_flags |= bit_mask; | ||
440 | |||
441 | if (!oldbits) | ||
442 | hpet_rtc_timer_init(); | ||
443 | |||
444 | return 1; | ||
445 | } | ||
446 | |||
447 | int hpet_set_alarm_time(unsigned char hrs, unsigned char min, | ||
448 | unsigned char sec) | ||
449 | { | ||
450 | if (!is_hpet_enabled()) | ||
451 | return 0; | ||
452 | |||
453 | hpet_alarm_time.tm_hour = hrs; | ||
454 | hpet_alarm_time.tm_min = min; | ||
455 | hpet_alarm_time.tm_sec = sec; | ||
456 | |||
457 | return 1; | ||
458 | } | ||
459 | |||
460 | int hpet_set_periodic_freq(unsigned long freq) | ||
461 | { | ||
462 | uint64_t clc; | ||
463 | |||
464 | if (!is_hpet_enabled()) | ||
465 | return 0; | ||
466 | |||
467 | if (freq <= DEFAULT_RTC_INT_FREQ) | ||
468 | hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; | ||
469 | else { | ||
470 | clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; | ||
471 | do_div(clc, freq); | ||
472 | clc >>= hpet_clockevent.shift; | ||
473 | hpet_pie_delta = (unsigned long) clc; | ||
474 | } | ||
475 | return 1; | ||
476 | } | ||
477 | |||
/* Returns the result of is_hpet_enabled(): 0 if HPET is not being used. */
int hpet_rtc_dropped_irq(void)
{
	int in_use = is_hpet_enabled();

	return in_use;
}
482 | |||
483 | static void hpet_rtc_timer_reinit(void) | ||
484 | { | ||
485 | unsigned long cfg, delta; | ||
486 | int lost_ints = -1; | ||
487 | |||
488 | if (unlikely(!hpet_rtc_flags)) { | ||
489 | cfg = hpet_readl(HPET_T1_CFG); | ||
490 | cfg &= ~HPET_TN_ENABLE; | ||
491 | hpet_writel(cfg, HPET_T1_CFG); | ||
492 | return; | ||
493 | } | ||
494 | |||
495 | if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) | ||
496 | delta = hpet_default_delta; | ||
497 | else | ||
498 | delta = hpet_pie_delta; | ||
499 | |||
500 | /* | ||
501 | * Increment the comparator value until we are ahead of the | ||
502 | * current count. | ||
503 | */ | ||
504 | do { | ||
505 | hpet_t1_cmp += delta; | ||
506 | hpet_writel(hpet_t1_cmp, HPET_T1_CMP); | ||
507 | lost_ints++; | ||
508 | } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); | ||
509 | |||
510 | if (lost_ints) { | ||
511 | if (hpet_rtc_flags & RTC_PIE) | ||
512 | hpet_pie_count += lost_ints; | ||
513 | if (printk_ratelimit()) | ||
514 | printk(KERN_WARNING "rtc: lost %d interrupts\n", | ||
515 | lost_ints); | ||
516 | } | ||
517 | } | ||
518 | |||
519 | irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) | ||
520 | { | ||
521 | struct rtc_time curr_time; | ||
522 | unsigned long rtc_int_flag = 0; | ||
523 | |||
524 | hpet_rtc_timer_reinit(); | ||
525 | |||
526 | if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) | ||
527 | rtc_get_rtc_time(&curr_time); | ||
528 | |||
529 | if (hpet_rtc_flags & RTC_UIE && | ||
530 | curr_time.tm_sec != hpet_prev_update_sec) { | ||
531 | rtc_int_flag = RTC_UF; | ||
532 | hpet_prev_update_sec = curr_time.tm_sec; | ||
533 | } | ||
534 | |||
535 | if (hpet_rtc_flags & RTC_PIE && | ||
536 | ++hpet_pie_count >= hpet_pie_limit) { | ||
537 | rtc_int_flag |= RTC_PF; | ||
538 | hpet_pie_count = 0; | ||
539 | } | ||
540 | |||
541 | if (hpet_rtc_flags & RTC_PIE && | ||
542 | (curr_time.tm_sec == hpet_alarm_time.tm_sec) && | ||
543 | (curr_time.tm_min == hpet_alarm_time.tm_min) && | ||
544 | (curr_time.tm_hour == hpet_alarm_time.tm_hour)) | ||
545 | rtc_int_flag |= RTC_AF; | ||
546 | |||
547 | if (rtc_int_flag) { | ||
548 | rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); | ||
549 | rtc_interrupt(rtc_int_flag, dev_id); | ||
550 | } | ||
551 | return IRQ_HANDLED; | ||
552 | } | ||
553 | #endif | ||
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c new file mode 100644 index 000000000000..e3d4b73bfdb0 --- /dev/null +++ b/arch/x86/kernel/i386_ksyms_32.c | |||
@@ -0,0 +1,30 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <asm/checksum.h> | ||
3 | #include <asm/desc.h> | ||
4 | |||
5 | EXPORT_SYMBOL(__down_failed); /* NOTE(review): this and the next three look like semaphore slow-path helpers - confirm */ | ||
6 | EXPORT_SYMBOL(__down_failed_interruptible); | ||
7 | EXPORT_SYMBOL(__down_failed_trylock); | ||
8 | EXPORT_SYMBOL(__up_wakeup); | ||
9 | /* Networking helper routines. */ | ||
10 | EXPORT_SYMBOL(csum_partial_copy_generic); | ||
11 | |||
12 | EXPORT_SYMBOL(__get_user_1); /* one export per access size: 1, 2 and 4 */ | ||
13 | EXPORT_SYMBOL(__get_user_2); | ||
14 | EXPORT_SYMBOL(__get_user_4); | ||
15 | |||
16 | EXPORT_SYMBOL(__put_user_1); /* one export per access size: 1, 2, 4 and 8 */ | ||
17 | EXPORT_SYMBOL(__put_user_2); | ||
18 | EXPORT_SYMBOL(__put_user_4); | ||
19 | EXPORT_SYMBOL(__put_user_8); | ||
20 | |||
21 | EXPORT_SYMBOL(strstr); | ||
22 | |||
23 | #ifdef CONFIG_SMP | ||
24 | extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); /* declared locally; presumably no included header provides these two - confirm */ | ||
25 | extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); | ||
26 | EXPORT_SYMBOL(__write_lock_failed); | ||
27 | EXPORT_SYMBOL(__read_lock_failed); | ||
28 | #endif | ||
29 | |||
30 | EXPORT_SYMBOL(csum_partial); | ||
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c new file mode 100644 index 000000000000..665847281ed2 --- /dev/null +++ b/arch/x86/kernel/i387_32.c | |||
@@ -0,0 +1,546 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/i387.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * | ||
6 | * Pentium III FXSR, SSE support | ||
7 | * General FPU state handling cleanups | ||
8 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
9 | */ | ||
10 | |||
11 | #include <linux/sched.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <asm/processor.h> | ||
14 | #include <asm/i387.h> | ||
15 | #include <asm/math_emu.h> | ||
16 | #include <asm/sigcontext.h> | ||
17 | #include <asm/user.h> | ||
18 | #include <asm/ptrace.h> | ||
19 | #include <asm/uaccess.h> | ||
20 | |||
21 | #ifdef CONFIG_MATH_EMULATION | ||
22 | #define HAVE_HWFP (boot_cpu_data.hard_math) | ||
23 | #else | ||
24 | #define HAVE_HWFP 1 | ||
25 | #endif | ||
26 | |||
27 | static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff; | ||
28 | |||
29 | void mxcsr_feature_mask_init(void) | ||
30 | { | ||
31 | unsigned long mask = 0; | ||
32 | clts(); | ||
33 | if (cpu_has_fxsr) { | ||
34 | memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
35 | asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); | ||
36 | mask = current->thread.i387.fxsave.mxcsr_mask; | ||
37 | if (mask == 0) mask = 0x0000ffbf; | ||
38 | } | ||
39 | mxcsr_feature_mask &= mask; | ||
40 | stts(); | ||
41 | } | ||
42 | |||
43 | /* | ||
44 | * The _current_ task is using the FPU for the first time | ||
45 | * so initialize it and set the mxcsr to its default | ||
46 | * value at reset if we support XMM instructions and then | ||
47 | * remeber the current task has used the FPU. | ||
48 | */ | ||
49 | void init_fpu(struct task_struct *tsk) | ||
50 | { | ||
51 | if (cpu_has_fxsr) { | ||
52 | memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
53 | tsk->thread.i387.fxsave.cwd = 0x37f; | ||
54 | if (cpu_has_xmm) | ||
55 | tsk->thread.i387.fxsave.mxcsr = 0x1f80; | ||
56 | } else { | ||
57 | memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct)); | ||
58 | tsk->thread.i387.fsave.cwd = 0xffff037fu; | ||
59 | tsk->thread.i387.fsave.swd = 0xffff0000u; | ||
60 | tsk->thread.i387.fsave.twd = 0xffffffffu; | ||
61 | tsk->thread.i387.fsave.fos = 0xffff0000u; | ||
62 | } | ||
63 | /* only the device not available exception or ptrace can call init_fpu */ | ||
64 | set_stopped_child_used_math(tsk); | ||
65 | } | ||
66 | |||
67 | /* | ||
68 | * FPU lazy state save handling. | ||
69 | */ | ||
70 | |||
71 | void kernel_fpu_begin(void) | ||
72 | { | ||
73 | struct thread_info *thread = current_thread_info(); | ||
74 | |||
75 | preempt_disable(); | ||
76 | if (thread->status & TS_USEDFPU) { | ||
77 | __save_init_fpu(thread->task); | ||
78 | return; | ||
79 | } | ||
80 | clts(); | ||
81 | } | ||
82 | EXPORT_SYMBOL_GPL(kernel_fpu_begin); | ||
83 | |||
84 | /* | ||
85 | * FPU tag word conversions. | ||
86 | */ | ||
87 | |||
/*
 * Convert a tag word with two bits per register (@twd) into a tag byte
 * with one bit per register: bit n of the result is set unless the tag
 * pair for register n is 11 (empty).
 */
static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
{
	unsigned int packed = 0;	/* to avoid 16 bit prefixes in the code */
	int reg;

	for (reg = 0; reg < 8; reg++) {
		unsigned int tag = (twd >> (2 * reg)) & 0x3;

		if (tag != 0x3)
			packed |= 1u << reg;
	}
	return packed;
}
101 | |||
102 | static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) | ||
103 | { | ||
104 | struct _fpxreg *st = NULL; | ||
105 | unsigned long tos = (fxsave->swd >> 11) & 7; | ||
106 | unsigned long twd = (unsigned long) fxsave->twd; | ||
107 | unsigned long tag; | ||
108 | unsigned long ret = 0xffff0000u; | ||
109 | int i; | ||
110 | |||
111 | #define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); | ||
112 | |||
113 | for ( i = 0 ; i < 8 ; i++ ) { | ||
114 | if ( twd & 0x1 ) { | ||
115 | st = FPREG_ADDR( fxsave, (i - tos) & 7 ); | ||
116 | |||
117 | switch ( st->exponent & 0x7fff ) { | ||
118 | case 0x7fff: | ||
119 | tag = 2; /* Special */ | ||
120 | break; | ||
121 | case 0x0000: | ||
122 | if ( !st->significand[0] && | ||
123 | !st->significand[1] && | ||
124 | !st->significand[2] && | ||
125 | !st->significand[3] ) { | ||
126 | tag = 1; /* Zero */ | ||
127 | } else { | ||
128 | tag = 2; /* Special */ | ||
129 | } | ||
130 | break; | ||
131 | default: | ||
132 | if ( st->significand[3] & 0x8000 ) { | ||
133 | tag = 0; /* Valid */ | ||
134 | } else { | ||
135 | tag = 2; /* Special */ | ||
136 | } | ||
137 | break; | ||
138 | } | ||
139 | } else { | ||
140 | tag = 3; /* Empty */ | ||
141 | } | ||
142 | ret |= (tag << (2 * i)); | ||
143 | twd = twd >> 1; | ||
144 | } | ||
145 | return ret; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * FPU state interaction. | ||
150 | */ | ||
151 | |||
152 | unsigned short get_fpu_cwd( struct task_struct *tsk ) | ||
153 | { | ||
154 | if ( cpu_has_fxsr ) { | ||
155 | return tsk->thread.i387.fxsave.cwd; | ||
156 | } else { | ||
157 | return (unsigned short)tsk->thread.i387.fsave.cwd; | ||
158 | } | ||
159 | } | ||
160 | |||
161 | unsigned short get_fpu_swd( struct task_struct *tsk ) | ||
162 | { | ||
163 | if ( cpu_has_fxsr ) { | ||
164 | return tsk->thread.i387.fxsave.swd; | ||
165 | } else { | ||
166 | return (unsigned short)tsk->thread.i387.fsave.swd; | ||
167 | } | ||
168 | } | ||
169 | |||
#if 0
/*
 * Return the FPU tag word saved for @tsk, taken from the fxsave or the
 * fsave image depending on cpu_has_fxsr.  Compiled out.
 */
unsigned short get_fpu_twd( struct task_struct *tsk )
{
	if ( !cpu_has_fxsr )
		return (unsigned short)tsk->thread.i387.fsave.twd;

	return tsk->thread.i387.fxsave.twd;
}
#endif  /*  0  */
180 | |||
181 | unsigned short get_fpu_mxcsr( struct task_struct *tsk ) | ||
182 | { | ||
183 | if ( cpu_has_xmm ) { | ||
184 | return tsk->thread.i387.fxsave.mxcsr; | ||
185 | } else { | ||
186 | return 0x1f80; | ||
187 | } | ||
188 | } | ||
189 | |||
#if 0

/*
 * Compiled-out setters for the saved control, status and tag words of
 * @tsk.  The fsave variants store the 16-bit value with the upper 16
 * bits forced to ones; the fxsave tag word is stored in its one bit
 * per register form.
 */

void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
{
	if ( !cpu_has_fxsr ) {
		tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
		return;
	}
	tsk->thread.i387.fxsave.cwd = cwd;
}

void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
{
	if ( !cpu_has_fxsr ) {
		tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
		return;
	}
	tsk->thread.i387.fxsave.swd = swd;
}

void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
{
	if ( !cpu_has_fxsr ) {
		tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
		return;
	}
	tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
}

#endif  /*  0  */
220 | |||
221 | /* | ||
222 | * FXSR floating point environment conversions. | ||
223 | */ | ||
224 | |||
225 | static int convert_fxsr_to_user( struct _fpstate __user *buf, | ||
226 | struct i387_fxsave_struct *fxsave ) | ||
227 | { | ||
228 | unsigned long env[7]; | ||
229 | struct _fpreg __user *to; | ||
230 | struct _fpxreg *from; | ||
231 | int i; | ||
232 | |||
233 | env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; | ||
234 | env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; | ||
235 | env[2] = twd_fxsr_to_i387(fxsave); | ||
236 | env[3] = fxsave->fip; | ||
237 | env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); | ||
238 | env[5] = fxsave->foo; | ||
239 | env[6] = fxsave->fos; | ||
240 | |||
241 | if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) | ||
242 | return 1; | ||
243 | |||
244 | to = &buf->_st[0]; | ||
245 | from = (struct _fpxreg *) &fxsave->st_space[0]; | ||
246 | for ( i = 0 ; i < 8 ; i++, to++, from++ ) { | ||
247 | unsigned long __user *t = (unsigned long __user *)to; | ||
248 | unsigned long *f = (unsigned long *)from; | ||
249 | |||
250 | if (__put_user(*f, t) || | ||
251 | __put_user(*(f + 1), t + 1) || | ||
252 | __put_user(from->exponent, &to->exponent)) | ||
253 | return 1; | ||
254 | } | ||
255 | return 0; | ||
256 | } | ||
257 | |||
258 | static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, | ||
259 | struct _fpstate __user *buf ) | ||
260 | { | ||
261 | unsigned long env[7]; | ||
262 | struct _fpxreg *to; | ||
263 | struct _fpreg __user *from; | ||
264 | int i; | ||
265 | |||
266 | if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) | ||
267 | return 1; | ||
268 | |||
269 | fxsave->cwd = (unsigned short)(env[0] & 0xffff); | ||
270 | fxsave->swd = (unsigned short)(env[1] & 0xffff); | ||
271 | fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); | ||
272 | fxsave->fip = env[3]; | ||
273 | fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); | ||
274 | fxsave->fcs = (env[4] & 0xffff); | ||
275 | fxsave->foo = env[5]; | ||
276 | fxsave->fos = env[6]; | ||
277 | |||
278 | to = (struct _fpxreg *) &fxsave->st_space[0]; | ||
279 | from = &buf->_st[0]; | ||
280 | for ( i = 0 ; i < 8 ; i++, to++, from++ ) { | ||
281 | unsigned long *t = (unsigned long *)to; | ||
282 | unsigned long __user *f = (unsigned long __user *)from; | ||
283 | |||
284 | if (__get_user(*t, f) || | ||
285 | __get_user(*(t + 1), f + 1) || | ||
286 | __get_user(to->exponent, &from->exponent)) | ||
287 | return 1; | ||
288 | } | ||
289 | return 0; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * Signal frame handlers. | ||
294 | */ | ||
295 | |||
296 | static inline int save_i387_fsave( struct _fpstate __user *buf ) | ||
297 | { | ||
298 | struct task_struct *tsk = current; | ||
299 | |||
300 | unlazy_fpu( tsk ); | ||
301 | tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; | ||
302 | if ( __copy_to_user( buf, &tsk->thread.i387.fsave, | ||
303 | sizeof(struct i387_fsave_struct) ) ) | ||
304 | return -1; | ||
305 | return 1; | ||
306 | } | ||
307 | |||
308 | static int save_i387_fxsave( struct _fpstate __user *buf ) | ||
309 | { | ||
310 | struct task_struct *tsk = current; | ||
311 | int err = 0; | ||
312 | |||
313 | unlazy_fpu( tsk ); | ||
314 | |||
315 | if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) ) | ||
316 | return -1; | ||
317 | |||
318 | err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status ); | ||
319 | err |= __put_user( X86_FXSR_MAGIC, &buf->magic ); | ||
320 | if ( err ) | ||
321 | return -1; | ||
322 | |||
323 | if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave, | ||
324 | sizeof(struct i387_fxsave_struct) ) ) | ||
325 | return -1; | ||
326 | return 1; | ||
327 | } | ||
328 | |||
329 | int save_i387( struct _fpstate __user *buf ) | ||
330 | { | ||
331 | if ( !used_math() ) | ||
332 | return 0; | ||
333 | |||
334 | /* This will cause a "finit" to be triggered by the next | ||
335 | * attempted FPU operation by the 'current' process. | ||
336 | */ | ||
337 | clear_used_math(); | ||
338 | |||
339 | if ( HAVE_HWFP ) { | ||
340 | if ( cpu_has_fxsr ) { | ||
341 | return save_i387_fxsave( buf ); | ||
342 | } else { | ||
343 | return save_i387_fsave( buf ); | ||
344 | } | ||
345 | } else { | ||
346 | return save_i387_soft( ¤t->thread.i387.soft, buf ); | ||
347 | } | ||
348 | } | ||
349 | |||
350 | static inline int restore_i387_fsave( struct _fpstate __user *buf ) | ||
351 | { | ||
352 | struct task_struct *tsk = current; | ||
353 | clear_fpu( tsk ); | ||
354 | return __copy_from_user( &tsk->thread.i387.fsave, buf, | ||
355 | sizeof(struct i387_fsave_struct) ); | ||
356 | } | ||
357 | |||
358 | static int restore_i387_fxsave( struct _fpstate __user *buf ) | ||
359 | { | ||
360 | int err; | ||
361 | struct task_struct *tsk = current; | ||
362 | clear_fpu( tsk ); | ||
363 | err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0], | ||
364 | sizeof(struct i387_fxsave_struct) ); | ||
365 | /* mxcsr reserved bits must be masked to zero for security reasons */ | ||
366 | tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; | ||
367 | return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf ); | ||
368 | } | ||
369 | |||
370 | int restore_i387( struct _fpstate __user *buf ) | ||
371 | { | ||
372 | int err; | ||
373 | |||
374 | if ( HAVE_HWFP ) { | ||
375 | if ( cpu_has_fxsr ) { | ||
376 | err = restore_i387_fxsave( buf ); | ||
377 | } else { | ||
378 | err = restore_i387_fsave( buf ); | ||
379 | } | ||
380 | } else { | ||
381 | err = restore_i387_soft( ¤t->thread.i387.soft, buf ); | ||
382 | } | ||
383 | set_used_math(); | ||
384 | return err; | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | * ptrace request handlers. | ||
389 | */ | ||
390 | |||
391 | static inline int get_fpregs_fsave( struct user_i387_struct __user *buf, | ||
392 | struct task_struct *tsk ) | ||
393 | { | ||
394 | return __copy_to_user( buf, &tsk->thread.i387.fsave, | ||
395 | sizeof(struct user_i387_struct) ); | ||
396 | } | ||
397 | |||
398 | static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf, | ||
399 | struct task_struct *tsk ) | ||
400 | { | ||
401 | return convert_fxsr_to_user( (struct _fpstate __user *)buf, | ||
402 | &tsk->thread.i387.fxsave ); | ||
403 | } | ||
404 | |||
405 | int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk ) | ||
406 | { | ||
407 | if ( HAVE_HWFP ) { | ||
408 | if ( cpu_has_fxsr ) { | ||
409 | return get_fpregs_fxsave( buf, tsk ); | ||
410 | } else { | ||
411 | return get_fpregs_fsave( buf, tsk ); | ||
412 | } | ||
413 | } else { | ||
414 | return save_i387_soft( &tsk->thread.i387.soft, | ||
415 | (struct _fpstate __user *)buf ); | ||
416 | } | ||
417 | } | ||
418 | |||
419 | static inline int set_fpregs_fsave( struct task_struct *tsk, | ||
420 | struct user_i387_struct __user *buf ) | ||
421 | { | ||
422 | return __copy_from_user( &tsk->thread.i387.fsave, buf, | ||
423 | sizeof(struct user_i387_struct) ); | ||
424 | } | ||
425 | |||
426 | static inline int set_fpregs_fxsave( struct task_struct *tsk, | ||
427 | struct user_i387_struct __user *buf ) | ||
428 | { | ||
429 | return convert_fxsr_from_user( &tsk->thread.i387.fxsave, | ||
430 | (struct _fpstate __user *)buf ); | ||
431 | } | ||
432 | |||
433 | int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf ) | ||
434 | { | ||
435 | if ( HAVE_HWFP ) { | ||
436 | if ( cpu_has_fxsr ) { | ||
437 | return set_fpregs_fxsave( tsk, buf ); | ||
438 | } else { | ||
439 | return set_fpregs_fsave( tsk, buf ); | ||
440 | } | ||
441 | } else { | ||
442 | return restore_i387_soft( &tsk->thread.i387.soft, | ||
443 | (struct _fpstate __user *)buf ); | ||
444 | } | ||
445 | } | ||
446 | |||
447 | int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk ) | ||
448 | { | ||
449 | if ( cpu_has_fxsr ) { | ||
450 | if (__copy_to_user( buf, &tsk->thread.i387.fxsave, | ||
451 | sizeof(struct user_fxsr_struct) )) | ||
452 | return -EFAULT; | ||
453 | return 0; | ||
454 | } else { | ||
455 | return -EIO; | ||
456 | } | ||
457 | } | ||
458 | |||
459 | int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf ) | ||
460 | { | ||
461 | int ret = 0; | ||
462 | |||
463 | if ( cpu_has_fxsr ) { | ||
464 | if (__copy_from_user( &tsk->thread.i387.fxsave, buf, | ||
465 | sizeof(struct user_fxsr_struct) )) | ||
466 | ret = -EFAULT; | ||
467 | /* mxcsr reserved bits must be masked to zero for security reasons */ | ||
468 | tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; | ||
469 | } else { | ||
470 | ret = -EIO; | ||
471 | } | ||
472 | return ret; | ||
473 | } | ||
474 | |||
475 | /* | ||
476 | * FPU state for core dumps. | ||
477 | */ | ||
478 | |||
479 | static inline void copy_fpu_fsave( struct task_struct *tsk, | ||
480 | struct user_i387_struct *fpu ) | ||
481 | { | ||
482 | memcpy( fpu, &tsk->thread.i387.fsave, | ||
483 | sizeof(struct user_i387_struct) ); | ||
484 | } | ||
485 | |||
486 | static inline void copy_fpu_fxsave( struct task_struct *tsk, | ||
487 | struct user_i387_struct *fpu ) | ||
488 | { | ||
489 | unsigned short *to; | ||
490 | unsigned short *from; | ||
491 | int i; | ||
492 | |||
493 | memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) ); | ||
494 | |||
495 | to = (unsigned short *)&fpu->st_space[0]; | ||
496 | from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0]; | ||
497 | for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { | ||
498 | memcpy( to, from, 5 * sizeof(unsigned short) ); | ||
499 | } | ||
500 | } | ||
501 | |||
502 | int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) | ||
503 | { | ||
504 | int fpvalid; | ||
505 | struct task_struct *tsk = current; | ||
506 | |||
507 | fpvalid = !!used_math(); | ||
508 | if ( fpvalid ) { | ||
509 | unlazy_fpu( tsk ); | ||
510 | if ( cpu_has_fxsr ) { | ||
511 | copy_fpu_fxsave( tsk, fpu ); | ||
512 | } else { | ||
513 | copy_fpu_fsave( tsk, fpu ); | ||
514 | } | ||
515 | } | ||
516 | |||
517 | return fpvalid; | ||
518 | } | ||
519 | EXPORT_SYMBOL(dump_fpu); | ||
520 | |||
521 | int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) | ||
522 | { | ||
523 | int fpvalid = !!tsk_used_math(tsk); | ||
524 | |||
525 | if (fpvalid) { | ||
526 | if (tsk == current) | ||
527 | unlazy_fpu(tsk); | ||
528 | if (cpu_has_fxsr) | ||
529 | copy_fpu_fxsave(tsk, fpu); | ||
530 | else | ||
531 | copy_fpu_fsave(tsk, fpu); | ||
532 | } | ||
533 | return fpvalid; | ||
534 | } | ||
535 | |||
536 | int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu) | ||
537 | { | ||
538 | int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr; | ||
539 | |||
540 | if (fpvalid) { | ||
541 | if (tsk == current) | ||
542 | unlazy_fpu(tsk); | ||
543 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu)); | ||
544 | } | ||
545 | return fpvalid; | ||
546 | } | ||
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c new file mode 100644 index 000000000000..6f508e8d7c57 --- /dev/null +++ b/arch/x86/kernel/i8237.c | |||
@@ -0,0 +1,72 @@ | |||
1 | /* | ||
2 | * i8237.c: 8237A DMA controller suspend functions. | ||
3 | * | ||
4 | * Written by Pierre Ossman, 2005. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or (at | ||
9 | * your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/init.h> | ||
13 | #include <linux/sysdev.h> | ||
14 | |||
15 | #include <asm/dma.h> | ||
16 | |||
17 | /* | ||
18 | * This module just handles suspend/resume issues with the | ||
19 | * 8237A DMA controller (used for ISA and LPC). | ||
20 | * Allocation is handled in kernel/dma.c and normal usage is | ||
21 | * in asm/dma.h. | ||
22 | */ | ||
23 | |||
24 | static int i8237A_resume(struct sys_device *dev) | ||
25 | { | ||
26 | unsigned long flags; | ||
27 | int i; | ||
28 | |||
29 | flags = claim_dma_lock(); | ||
30 | |||
31 | dma_outb(DMA1_RESET_REG, 0); | ||
32 | dma_outb(DMA2_RESET_REG, 0); | ||
33 | |||
34 | for (i = 0;i < 8;i++) { | ||
35 | set_dma_addr(i, 0x000000); | ||
36 | /* DMA count is a bit weird so this is not 0 */ | ||
37 | set_dma_count(i, 1); | ||
38 | } | ||
39 | |||
40 | /* Enable cascade DMA or channel 0-3 won't work */ | ||
41 | enable_dma(4); | ||
42 | |||
43 | release_dma_lock(flags); | ||
44 | |||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | static int i8237A_suspend(struct sys_device *dev, pm_message_t state) | ||
49 | { | ||
50 | return 0; | ||
51 | } | ||
52 | |||
53 | static struct sysdev_class i8237_sysdev_class = { | ||
54 | set_kset_name("i8237"), | ||
55 | .suspend = i8237A_suspend, | ||
56 | .resume = i8237A_resume, | ||
57 | }; | ||
58 | |||
59 | static struct sys_device device_i8237A = { | ||
60 | .id = 0, | ||
61 | .cls = &i8237_sysdev_class, | ||
62 | }; | ||
63 | |||
64 | static int __init i8237A_init_sysfs(void) | ||
65 | { | ||
66 | int error = sysdev_class_register(&i8237_sysdev_class); | ||
67 | if (!error) | ||
68 | error = sysdev_register(&device_i8237A); | ||
69 | return error; | ||
70 | } | ||
71 | |||
72 | device_initcall(i8237A_init_sysfs); | ||
diff --git a/arch/x86/kernel/i8253_32.c b/arch/x86/kernel/i8253_32.c new file mode 100644 index 000000000000..6d839f2f1b1a --- /dev/null +++ b/arch/x86/kernel/i8253_32.c | |||
@@ -0,0 +1,206 @@ | |||
1 | /* | ||
2 | * i8253.c 8253/PIT functions | ||
3 | * | ||
4 | */ | ||
5 | #include <linux/clockchips.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/jiffies.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/spinlock.h> | ||
11 | |||
12 | #include <asm/smp.h> | ||
13 | #include <asm/delay.h> | ||
14 | #include <asm/i8253.h> | ||
15 | #include <asm/io.h> | ||
16 | #include <asm/timer.h> | ||
17 | |||
18 | DEFINE_SPINLOCK(i8253_lock); | ||
19 | EXPORT_SYMBOL(i8253_lock); | ||
20 | |||
21 | /* | ||
22 | * HPET replaces the PIT, when enabled. So we need to know, which of | ||
23 | * the two timers is used | ||
24 | */ | ||
25 | struct clock_event_device *global_clock_event; | ||
26 | |||
27 | /* | ||
28 | * Initialize the PIT timer. | ||
29 | * | ||
30 | * This is also called after resume to bring the PIT into operation again. | ||
31 | */ | ||
32 | static void init_pit_timer(enum clock_event_mode mode, | ||
33 | struct clock_event_device *evt) | ||
34 | { | ||
35 | unsigned long flags; | ||
36 | |||
37 | spin_lock_irqsave(&i8253_lock, flags); | ||
38 | |||
39 | switch(mode) { | ||
40 | case CLOCK_EVT_MODE_PERIODIC: | ||
41 | /* binary, mode 2, LSB/MSB, ch 0 */ | ||
42 | outb_p(0x34, PIT_MODE); | ||
43 | outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ | ||
44 | outb(LATCH >> 8 , PIT_CH0); /* MSB */ | ||
45 | break; | ||
46 | |||
47 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
48 | case CLOCK_EVT_MODE_UNUSED: | ||
49 | if (evt->mode == CLOCK_EVT_MODE_PERIODIC || | ||
50 | evt->mode == CLOCK_EVT_MODE_ONESHOT) { | ||
51 | outb_p(0x30, PIT_MODE); | ||
52 | outb_p(0, PIT_CH0); | ||
53 | outb_p(0, PIT_CH0); | ||
54 | } | ||
55 | break; | ||
56 | |||
57 | case CLOCK_EVT_MODE_ONESHOT: | ||
58 | /* One shot setup */ | ||
59 | outb_p(0x38, PIT_MODE); | ||
60 | break; | ||
61 | |||
62 | case CLOCK_EVT_MODE_RESUME: | ||
63 | /* Nothing to do here */ | ||
64 | break; | ||
65 | } | ||
66 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * Program the next event in oneshot mode | ||
71 | * | ||
72 | * Delta is given in PIT ticks | ||
73 | */ | ||
74 | static int pit_next_event(unsigned long delta, struct clock_event_device *evt) | ||
75 | { | ||
76 | unsigned long flags; | ||
77 | |||
78 | spin_lock_irqsave(&i8253_lock, flags); | ||
79 | outb_p(delta & 0xff , PIT_CH0); /* LSB */ | ||
80 | outb(delta >> 8 , PIT_CH0); /* MSB */ | ||
81 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
82 | |||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | * On UP the PIT can serve all of the possible timer functions. On SMP systems | ||
88 | * it can be solely used for the global tick. | ||
89 | * | ||
90 | * The profiling and update capabilities are switched off once the local apic is | ||
91 | * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - | ||
92 | * !using_apic_timer decisions in do_timer_interrupt_hook() | ||
93 | */ | ||
94 | struct clock_event_device pit_clockevent = { | ||
95 | .name = "pit", | ||
96 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, | ||
97 | .set_mode = init_pit_timer, | ||
98 | .set_next_event = pit_next_event, | ||
99 | .shift = 32, | ||
100 | .irq = 0, | ||
101 | }; | ||
102 | |||
103 | /* | ||
104 | * Initialize the conversion factor and the min/max deltas of the clock event | ||
105 | * structure and register the clock event source with the framework. | ||
106 | */ | ||
107 | void __init setup_pit_timer(void) | ||
108 | { | ||
109 | /* | ||
110 | * Start pit with the boot cpu mask and make it global after the | ||
111 | * IO_APIC has been initialized. | ||
112 | */ | ||
113 | pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); | ||
114 | pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); | ||
115 | pit_clockevent.max_delta_ns = | ||
116 | clockevent_delta2ns(0x7FFF, &pit_clockevent); | ||
117 | pit_clockevent.min_delta_ns = | ||
118 | clockevent_delta2ns(0xF, &pit_clockevent); | ||
119 | clockevents_register_device(&pit_clockevent); | ||
120 | global_clock_event = &pit_clockevent; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Since the PIT overflows every tick, it's not very useful | ||
125 | * to just read by itself. So use jiffies to emulate a free | ||
126 | * running counter: | ||
127 | */ | ||
128 | static cycle_t pit_read(void) | ||
129 | { | ||
130 | unsigned long flags; | ||
131 | int count; | ||
132 | u32 jifs; | ||
133 | static int old_count; | ||
134 | static u32 old_jifs; | ||
135 | |||
136 | spin_lock_irqsave(&i8253_lock, flags); | ||
137 | /* | ||
138 | * Although our caller may have the read side of xtime_lock, | ||
139 | * this is now a seqlock, and we are cheating in this routine | ||
140 | * by having side effects on state that we cannot undo if | ||
141 | * there is a collision on the seqlock and our caller has to | ||
142 | * retry. (Namely, old_jifs and old_count.) So we must treat | ||
143 | * jiffies as volatile despite the lock. We read jiffies | ||
144 | * before latching the timer count to guarantee that although | ||
145 | * the jiffies value might be older than the count (that is, | ||
146 | * the counter may underflow between the last point where | ||
147 | * jiffies was incremented and the point where we latch the | ||
148 | * count), it cannot be newer. | ||
149 | */ | ||
150 | jifs = jiffies; | ||
151 | outb_p(0x00, PIT_MODE); /* latch the count ASAP */ | ||
152 | count = inb_p(PIT_CH0); /* read the latched count */ | ||
153 | count |= inb_p(PIT_CH0) << 8; | ||
154 | |||
155 | /* VIA686a test code... reset the latch if count > max + 1 */ | ||
156 | if (count > LATCH) { | ||
157 | outb_p(0x34, PIT_MODE); | ||
158 | outb_p(LATCH & 0xff, PIT_CH0); | ||
159 | outb(LATCH >> 8, PIT_CH0); | ||
160 | count = LATCH - 1; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * It's possible for count to appear to go the wrong way for a | ||
165 | * couple of reasons: | ||
166 | * | ||
167 | * 1. The timer counter underflows, but we haven't handled the | ||
168 | * resulting interrupt and incremented jiffies yet. | ||
169 | * 2. Hardware problem with the timer, not giving us continuous time, | ||
170 | * the counter does small "jumps" upwards on some Pentium systems, | ||
171 | * (see c't 95/10 page 335 for Neptun bug.) | ||
172 | * | ||
173 | * Previous attempts to handle these cases intelligently were | ||
174 | * buggy, so we just do the simple thing now. | ||
175 | */ | ||
176 | if (count > old_count && jifs == old_jifs) { | ||
177 | count = old_count; | ||
178 | } | ||
179 | old_count = count; | ||
180 | old_jifs = jifs; | ||
181 | |||
182 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
183 | |||
184 | count = (LATCH - 1) - count; | ||
185 | |||
186 | return (cycle_t)(jifs * LATCH) + count; | ||
187 | } | ||
188 | |||
189 | static struct clocksource clocksource_pit = { | ||
190 | .name = "pit", | ||
191 | .rating = 110, | ||
192 | .read = pit_read, | ||
193 | .mask = CLOCKSOURCE_MASK(32), | ||
194 | .mult = 0, | ||
195 | .shift = 20, | ||
196 | }; | ||
197 | |||
198 | static int __init init_pit_clocksource(void) | ||
199 | { | ||
200 | if (num_possible_cpus() > 1) /* PIT does not scale! */ | ||
201 | return 0; | ||
202 | |||
203 | clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); | ||
204 | return clocksource_register(&clocksource_pit); | ||
205 | } | ||
206 | arch_initcall(init_pit_clocksource); | ||
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c new file mode 100644 index 000000000000..0499cbe9871a --- /dev/null +++ b/arch/x86/kernel/i8259_32.c | |||
@@ -0,0 +1,420 @@ | |||
1 | #include <linux/errno.h> | ||
2 | #include <linux/signal.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/ioport.h> | ||
5 | #include <linux/interrupt.h> | ||
6 | #include <linux/slab.h> | ||
7 | #include <linux/random.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/kernel_stat.h> | ||
10 | #include <linux/sysdev.h> | ||
11 | #include <linux/bitops.h> | ||
12 | |||
13 | #include <asm/8253pit.h> | ||
14 | #include <asm/atomic.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <asm/io.h> | ||
17 | #include <asm/timer.h> | ||
18 | #include <asm/pgtable.h> | ||
19 | #include <asm/delay.h> | ||
20 | #include <asm/desc.h> | ||
21 | #include <asm/apic.h> | ||
22 | #include <asm/arch_hooks.h> | ||
23 | #include <asm/i8259.h> | ||
24 | |||
25 | #include <io_ports.h> | ||
26 | |||
27 | /* | ||
28 | * This is the 'legacy' 8259A Programmable Interrupt Controller, | ||
29 | * present in the majority of PC/AT boxes. | ||
30 | * plus some generic x86 specific things if generic specifics makes | ||
31 | * any sense at all. | ||
32 | * this file should become arch/i386/kernel/irq.c when the old irq.c | ||
33 | * moves to arch independent land | ||
34 | */ | ||
35 | |||
36 | static int i8259A_auto_eoi; | ||
37 | DEFINE_SPINLOCK(i8259A_lock); | ||
38 | static void mask_and_ack_8259A(unsigned int); | ||
39 | |||
40 | static struct irq_chip i8259A_chip = { | ||
41 | .name = "XT-PIC", | ||
42 | .mask = disable_8259A_irq, | ||
43 | .disable = disable_8259A_irq, | ||
44 | .unmask = enable_8259A_irq, | ||
45 | .mask_ack = mask_and_ack_8259A, | ||
46 | }; | ||
47 | |||
48 | /* | ||
49 | * 8259A PIC functions to handle ISA devices: | ||
50 | */ | ||
51 | |||
52 | /* | ||
53 | * This contains the irq mask for both 8259A irq controllers, | ||
54 | */ | ||
55 | unsigned int cached_irq_mask = 0xffff; | ||
56 | |||
57 | /* | ||
58 | * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) | ||
59 | * boards the timer interrupt is not really connected to any IO-APIC pin, | ||
60 | * it's fed to the master 8259A's IR0 line only. | ||
61 | * | ||
62 | * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. | ||
63 | * this 'mixed mode' IRQ handling costs nothing because it's only used | ||
64 | * at IRQ setup time. | ||
65 | */ | ||
66 | unsigned long io_apic_irqs; | ||
67 | |||
68 | void disable_8259A_irq(unsigned int irq) | ||
69 | { | ||
70 | unsigned int mask = 1 << irq; | ||
71 | unsigned long flags; | ||
72 | |||
73 | spin_lock_irqsave(&i8259A_lock, flags); | ||
74 | cached_irq_mask |= mask; | ||
75 | if (irq & 8) | ||
76 | outb(cached_slave_mask, PIC_SLAVE_IMR); | ||
77 | else | ||
78 | outb(cached_master_mask, PIC_MASTER_IMR); | ||
79 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
80 | } | ||
81 | |||
82 | void enable_8259A_irq(unsigned int irq) | ||
83 | { | ||
84 | unsigned int mask = ~(1 << irq); | ||
85 | unsigned long flags; | ||
86 | |||
87 | spin_lock_irqsave(&i8259A_lock, flags); | ||
88 | cached_irq_mask &= mask; | ||
89 | if (irq & 8) | ||
90 | outb(cached_slave_mask, PIC_SLAVE_IMR); | ||
91 | else | ||
92 | outb(cached_master_mask, PIC_MASTER_IMR); | ||
93 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
94 | } | ||
95 | |||
96 | int i8259A_irq_pending(unsigned int irq) | ||
97 | { | ||
98 | unsigned int mask = 1<<irq; | ||
99 | unsigned long flags; | ||
100 | int ret; | ||
101 | |||
102 | spin_lock_irqsave(&i8259A_lock, flags); | ||
103 | if (irq < 8) | ||
104 | ret = inb(PIC_MASTER_CMD) & mask; | ||
105 | else | ||
106 | ret = inb(PIC_SLAVE_CMD) & (mask >> 8); | ||
107 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
108 | |||
109 | return ret; | ||
110 | } | ||
111 | |||
112 | void make_8259A_irq(unsigned int irq) | ||
113 | { | ||
114 | disable_irq_nosync(irq); | ||
115 | io_apic_irqs &= ~(1<<irq); | ||
116 | set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, | ||
117 | "XT"); | ||
118 | enable_irq(irq); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * This function is assumed to be called rarely. Switching between | ||
123 | * 8259A registers is slow. | ||
124 | * This has to be protected by the irq controller spinlock | ||
125 | * before being called. | ||
126 | */ | ||
127 | static inline int i8259A_irq_real(unsigned int irq) | ||
128 | { | ||
129 | int value; | ||
130 | int irqmask = 1<<irq; | ||
131 | |||
132 | if (irq < 8) { | ||
133 | outb(0x0B,PIC_MASTER_CMD); /* ISR register */ | ||
134 | value = inb(PIC_MASTER_CMD) & irqmask; | ||
135 | outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ | ||
136 | return value; | ||
137 | } | ||
138 | outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ | ||
139 | value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); | ||
140 | outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ | ||
141 | return value; | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | * Careful! The 8259A is a fragile beast, it pretty | ||
146 | * much _has_ to be done exactly like this (mask it | ||
147 | * first, _then_ send the EOI, and the order of EOI | ||
148 | * to the two 8259s is important! | ||
149 | */ | ||
150 | static void mask_and_ack_8259A(unsigned int irq) | ||
151 | { | ||
152 | unsigned int irqmask = 1 << irq; | ||
153 | unsigned long flags; | ||
154 | |||
155 | spin_lock_irqsave(&i8259A_lock, flags); | ||
156 | /* | ||
157 | * Lightweight spurious IRQ detection. We do not want | ||
158 | * to overdo spurious IRQ handling - it's usually a sign | ||
159 | * of hardware problems, so we only do the checks we can | ||
160 | * do without slowing down good hardware unnecessarily. | ||
161 | * | ||
162 | * Note that IRQ7 and IRQ15 (the two spurious IRQs | ||
163 | * usually resulting from the 8259A-1|2 PICs) occur | ||
164 | * even if the IRQ is masked in the 8259A. Thus we | ||
165 | * can check spurious 8259A IRQs without doing the | ||
166 | * quite slow i8259A_irq_real() call for every IRQ. | ||
167 | * This does not cover 100% of spurious interrupts, | ||
168 | * but should be enough to warn the user that there | ||
169 | * is something bad going on ... | ||
170 | */ | ||
171 | if (cached_irq_mask & irqmask) | ||
172 | goto spurious_8259A_irq; | ||
173 | cached_irq_mask |= irqmask; | ||
174 | |||
175 | handle_real_irq: | ||
176 | if (irq & 8) { | ||
177 | inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ | ||
178 | outb(cached_slave_mask, PIC_SLAVE_IMR); | ||
179 | outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ | ||
180 | outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ | ||
181 | } else { | ||
182 | inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ | ||
183 | outb(cached_master_mask, PIC_MASTER_IMR); | ||
184 | outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ | ||
185 | } | ||
186 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
187 | return; | ||
188 | |||
189 | spurious_8259A_irq: | ||
190 | /* | ||
191 | * this is the slow path - should happen rarely. | ||
192 | */ | ||
193 | if (i8259A_irq_real(irq)) | ||
194 | /* | ||
195 | * oops, the IRQ _is_ in service according to the | ||
196 | * 8259A - not spurious, go handle it. | ||
197 | */ | ||
198 | goto handle_real_irq; | ||
199 | |||
200 | { | ||
201 | static int spurious_irq_mask; | ||
202 | /* | ||
203 | * At this point we can be sure the IRQ is spurious, | ||
204 | * lets ACK and report it. [once per IRQ] | ||
205 | */ | ||
206 | if (!(spurious_irq_mask & irqmask)) { | ||
207 | printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); | ||
208 | spurious_irq_mask |= irqmask; | ||
209 | } | ||
210 | atomic_inc(&irq_err_count); | ||
211 | /* | ||
212 | * Theoretically we do not have to handle this IRQ, | ||
213 | * but in Linux this does not cause problems and is | ||
214 | * simpler for us. | ||
215 | */ | ||
216 | goto handle_real_irq; | ||
217 | } | ||
218 | } | ||
219 | |||
220 | static char irq_trigger[2]; | ||
221 | /** | ||
222 | * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ | ||
223 | */ | ||
/* Write the two saved bytes in @trigger back to ELCR ports 0x4d0/0x4d1. */
static void restore_ELCR(char *trigger)
{
	outb(*trigger, 0x4d0);
	outb(*(trigger + 1), 0x4d1);
}
229 | |||
/*
 * Read ELCR ports 0x4d0/0x4d1 into @trigger, clearing the bits of the
 * reserved IRQs (0, 1, 2, 8 and 13).
 */
static void save_ELCR(char *trigger)
{
	*trigger = inb(0x4d0) & 0xF8;
	*(trigger + 1) = inb(0x4d1) & 0xDE;
}
236 | |||
237 | static int i8259A_resume(struct sys_device *dev) | ||
238 | { | ||
239 | init_8259A(i8259A_auto_eoi); | ||
240 | restore_ELCR(irq_trigger); | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | static int i8259A_suspend(struct sys_device *dev, pm_message_t state) | ||
245 | { | ||
246 | save_ELCR(irq_trigger); | ||
247 | return 0; | ||
248 | } | ||
249 | |||
250 | static int i8259A_shutdown(struct sys_device *dev) | ||
251 | { | ||
252 | /* Put the i8259A into a quiescent state that | ||
253 | * the kernel initialization code can get it | ||
254 | * out of. | ||
255 | */ | ||
256 | outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ | ||
257 | outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ | ||
258 | return 0; | ||
259 | } | ||
260 | |||
261 | static struct sysdev_class i8259_sysdev_class = { | ||
262 | set_kset_name("i8259"), | ||
263 | .suspend = i8259A_suspend, | ||
264 | .resume = i8259A_resume, | ||
265 | .shutdown = i8259A_shutdown, | ||
266 | }; | ||
267 | |||
268 | static struct sys_device device_i8259A = { | ||
269 | .id = 0, | ||
270 | .cls = &i8259_sysdev_class, | ||
271 | }; | ||
272 | |||
273 | static int __init i8259A_init_sysfs(void) | ||
274 | { | ||
275 | int error = sysdev_class_register(&i8259_sysdev_class); | ||
276 | if (!error) | ||
277 | error = sysdev_register(&device_i8259A); | ||
278 | return error; | ||
279 | } | ||
280 | |||
281 | device_initcall(i8259A_init_sysfs); | ||
282 | |||
283 | void init_8259A(int auto_eoi) | ||
284 | { | ||
285 | unsigned long flags; | ||
286 | |||
287 | i8259A_auto_eoi = auto_eoi; | ||
288 | |||
289 | spin_lock_irqsave(&i8259A_lock, flags); | ||
290 | |||
291 | outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ | ||
292 | outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ | ||
293 | |||
294 | /* | ||
295 | * outb_p - this has to work on a wide range of PC hardware. | ||
296 | */ | ||
297 | outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ | ||
298 | outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ | ||
299 | outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ | ||
300 | if (auto_eoi) /* master does Auto EOI */ | ||
301 | outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); | ||
302 | else /* master expects normal EOI */ | ||
303 | outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); | ||
304 | |||
305 | outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ | ||
306 | outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ | ||
307 | outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ | ||
308 | outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ | ||
309 | if (auto_eoi) | ||
310 | /* | ||
311 | * In AEOI mode we just have to mask the interrupt | ||
312 | * when acking. | ||
313 | */ | ||
314 | i8259A_chip.mask_ack = disable_8259A_irq; | ||
315 | else | ||
316 | i8259A_chip.mask_ack = mask_and_ack_8259A; | ||
317 | |||
318 | udelay(100); /* wait for 8259A to initialize */ | ||
319 | |||
320 | outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ | ||
321 | outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ | ||
322 | |||
323 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * Note that on a 486, we don't want to do a SIGFPE on an irq13 | ||
328 | * as the irq is unreliable, and exception 16 works correctly | ||
329 | * (ie as explained in the intel literature). On a 386, you | ||
330 | * can't use exception 16 due to bad IBM design, so we have to | ||
331 | * rely on the less exact irq13. | ||
332 | * | ||
333 | * Careful.. Not only is IRQ13 unreliable, but it also | ||
334 | * leads to races. IBM designers who came up with it should | ||
335 | * be shot. | ||
336 | */ | ||
337 | |||
338 | |||
339 | static irqreturn_t math_error_irq(int cpl, void *dev_id) | ||
340 | { | ||
341 | extern void math_error(void __user *); | ||
342 | outb(0,0xF0); | ||
343 | if (ignore_fpu_irq || !boot_cpu_data.hard_math) | ||
344 | return IRQ_NONE; | ||
345 | math_error((void __user *)get_irq_regs()->eip); | ||
346 | return IRQ_HANDLED; | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * New motherboards sometimes make IRQ 13 be a PCI interrupt, | ||
351 | * so allow interrupt sharing. | ||
352 | */ | ||
353 | static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; | ||
354 | |||
355 | void __init init_ISA_irqs (void) | ||
356 | { | ||
357 | int i; | ||
358 | |||
359 | #ifdef CONFIG_X86_LOCAL_APIC | ||
360 | init_bsp_APIC(); | ||
361 | #endif | ||
362 | init_8259A(0); | ||
363 | |||
364 | for (i = 0; i < NR_IRQS; i++) { | ||
365 | irq_desc[i].status = IRQ_DISABLED; | ||
366 | irq_desc[i].action = NULL; | ||
367 | irq_desc[i].depth = 1; | ||
368 | |||
369 | if (i < 16) { | ||
370 | /* | ||
371 | * 16 old-style INTA-cycle interrupts: | ||
372 | */ | ||
373 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
374 | handle_level_irq, "XT"); | ||
375 | } else { | ||
376 | /* | ||
377 | * 'high' PCI IRQs filled in on demand | ||
378 | */ | ||
379 | irq_desc[i].chip = &no_irq_chip; | ||
380 | } | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /* Overridden in paravirt.c */ | ||
385 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
386 | |||
387 | void __init native_init_IRQ(void) | ||
388 | { | ||
389 | int i; | ||
390 | |||
391 | /* all the set up before the call gates are initialised */ | ||
392 | pre_intr_init_hook(); | ||
393 | |||
394 | /* | ||
395 | * Cover the whole vector space, no vector can escape | ||
396 | * us. (some of these will be overridden and become | ||
397 | * 'special' SMP interrupts) | ||
398 | */ | ||
399 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
400 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
401 | if (i >= NR_IRQS) | ||
402 | break; | ||
403 | if (vector != SYSCALL_VECTOR) | ||
404 | set_intr_gate(vector, interrupt[i]); | ||
405 | } | ||
406 | |||
407 | /* setup after call gates are initialised (usually add in | ||
408 | * the architecture specific gates) | ||
409 | */ | ||
410 | intr_init_hook(); | ||
411 | |||
412 | /* | ||
413 | * External FPU? Set up irq13 if so, for | ||
414 | * original braindamaged IBM FERR coupling. | ||
415 | */ | ||
416 | if (boot_cpu_data.hard_math && !cpu_has_fpu) | ||
417 | setup_irq(FPU_IRQ, &fpu_irq); | ||
418 | |||
419 | irq_ctx_init(smp_processor_id()); | ||
420 | } | ||
diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task_32.c new file mode 100644 index 000000000000..d26fc063a760 --- /dev/null +++ b/arch/x86/kernel/init_task_32.c | |||
@@ -0,0 +1,46 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/init_task.h> | ||
6 | #include <linux/fs.h> | ||
7 | #include <linux/mqueue.h> | ||
8 | |||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/desc.h> | ||
12 | |||
13 | static struct fs_struct init_fs = INIT_FS; | ||
14 | static struct files_struct init_files = INIT_FILES; | ||
15 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | ||
16 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | ||
17 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
18 | |||
19 | EXPORT_SYMBOL(init_mm); | ||
20 | |||
21 | /* | ||
22 | * Initial thread structure. | ||
23 | * | ||
24 | * We need to make sure that this is THREAD_SIZE aligned due to the | ||
25 | * way process stacks are handled. This is done by having a special | ||
26 | * "init_task" linker map entry.. | ||
27 | */ | ||
28 | union thread_union init_thread_union | ||
29 | __attribute__((__section__(".data.init_task"))) = | ||
30 | { INIT_THREAD_INFO(init_task) }; | ||
31 | |||
32 | /* | ||
33 | * Initial task structure. | ||
34 | * | ||
35 | * All other task structs will be allocated on slabs in fork.c | ||
36 | */ | ||
37 | struct task_struct init_task = INIT_TASK(init_task); | ||
38 | |||
39 | EXPORT_SYMBOL(init_task); | ||
40 | |||
41 | /* | ||
42 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
43 | * no more per-task TSS's. | ||
44 | */ | ||
45 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | ||
46 | |||
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c new file mode 100644 index 000000000000..e2f4a1c68547 --- /dev/null +++ b/arch/x86/kernel/io_apic_32.c | |||
@@ -0,0 +1,2847 @@ | |||
1 | /* | ||
2 | * Intel IO-APIC support for multi-Pentium hosts. | ||
3 | * | ||
4 | * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | ||
5 | * | ||
6 | * Many thanks to Stig Venaas for trying out countless experimental | ||
7 | * patches and reporting/debugging problems patiently! | ||
8 | * | ||
9 | * (c) 1999, Multiple IO-APIC support, developed by | ||
10 | * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | ||
11 | * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | ||
12 | * further tested and cleaned up by Zach Brown <zab@redhat.com> | ||
13 | * and Ingo Molnar <mingo@redhat.com> | ||
14 | * | ||
15 | * Fixes | ||
16 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
17 | * thanks to Eric Gilmore | ||
18 | * and Rolf G. Tews | ||
19 | * for testing these extensively | ||
20 | * Paul Diefenbaugh : Added full ACPI support | ||
21 | */ | ||
22 | |||
23 | #include <linux/mm.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/mc146818rtc.h> | ||
29 | #include <linux/compiler.h> | ||
30 | #include <linux/acpi.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/sysdev.h> | ||
33 | #include <linux/pci.h> | ||
34 | #include <linux/msi.h> | ||
35 | #include <linux/htirq.h> | ||
36 | #include <linux/freezer.h> | ||
37 | #include <linux/kthread.h> | ||
38 | |||
39 | #include <asm/io.h> | ||
40 | #include <asm/smp.h> | ||
41 | #include <asm/desc.h> | ||
42 | #include <asm/timer.h> | ||
43 | #include <asm/i8259.h> | ||
44 | #include <asm/nmi.h> | ||
45 | #include <asm/msidef.h> | ||
46 | #include <asm/hypertransport.h> | ||
47 | |||
48 | #include <mach_apic.h> | ||
49 | #include <mach_apicdef.h> | ||
50 | |||
51 | #include "io_ports.h" | ||
52 | |||
/*
 * Optional hook taking an (ioapic, irq) pair and returning an int.
 * NOTE(review): presumably lets a subarchitecture renumber IRQs; it is
 * not called in this part of the file - confirm against its users.
 */
int (*ioapic_renumber_irq)(int ioapic, int irq);
atomic_t irq_mis_count;

/* Where, if anywhere, is the i8259 connected in external int mode? */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };

/*
 * ioapic_lock is held around the index/data register accesses made
 * through the helpers below.
 */
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);

int timer_over_8254 __initdata = 1;

/*
 * Is the SiS APIC rmw bug present ?
 * -1 = don't know, 0 = no, 1 = yes
 */
int sis_apic_bug = -1;

/*
 * # of IRQ routing registers
 */
int nr_ioapic_registers[MAX_IO_APICS];

static int disable_timer_pin_1 __initdata;

/*
 * Rough estimation of how many shared IRQs there are, can
 * be changed anytime.
 *
 * PIN_MAP_SIZE is the size of irq_2_pin[]: one slot per IRQ plus
 * MAX_PLUS_SHARED_IRQS overflow slots for additional pins.
 */
#define MAX_PLUS_SHARED_IRQS NR_IRQS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
83 | |||
84 | /* | ||
85 | * This is performance-critical, we want to do it O(1) | ||
86 | * | ||
87 | * the indexing order of this array favors 1:1 mappings | ||
88 | * between pins and IRQs. | ||
89 | */ | ||
90 | |||
/*
 * One (apic, pin) pair routed to an IRQ.  Slot irq_2_pin[irq] is the
 * head of the list for that IRQ; further pins are chained through
 * 'next'.
 */
static struct irq_pin_list {
	int apic;	/* index of the IO-APIC owning the pin */
	int pin;	/* pin number; the list walkers stop on -1 */
	int next;	/* irq_2_pin[] index of the next element, 0 = end */
} irq_2_pin[PIN_MAP_SIZE];
94 | |||
/*
 * Register window of an IO-APIC as accessed by io_apic_read() and
 * io_apic_write(): the register number is written to 'index' and the
 * value is then transferred through 'data'.
 */
struct io_apic {
	unsigned int index;	/* register select */
	unsigned int unused[3];	/* gap between the index and data words */
	unsigned int data;	/* window onto the selected register */
};
100 | |||
101 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) | ||
102 | { | ||
103 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) | ||
104 | + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); | ||
105 | } | ||
106 | |||
107 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) | ||
108 | { | ||
109 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
110 | writel(reg, &io_apic->index); | ||
111 | return readl(&io_apic->data); | ||
112 | } | ||
113 | |||
114 | static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) | ||
115 | { | ||
116 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
117 | writel(reg, &io_apic->index); | ||
118 | writel(value, &io_apic->data); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Re-write a value: to be used for read-modify-write | ||
123 | * cycles where the read already set up the index register. | ||
124 | * | ||
125 | * Older SiS APIC requires we rewrite the index register | ||
126 | */ | ||
127 | static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) | ||
128 | { | ||
129 | volatile struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
130 | if (sis_apic_bug) | ||
131 | writel(reg, &io_apic->index); | ||
132 | writel(value, &io_apic->data); | ||
133 | } | ||
134 | |||
/*
 * Overlay of a routing entry with the two 32-bit words that are
 * actually transferred through the IO-APIC register window:
 * w1 goes to/from register 0x10 + 2*pin, w2 to/from 0x11 + 2*pin.
 */
union entry_union {
	struct { u32 w1, w2; };
	struct IO_APIC_route_entry entry;
};
139 | |||
140 | static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) | ||
141 | { | ||
142 | union entry_union eu; | ||
143 | unsigned long flags; | ||
144 | spin_lock_irqsave(&ioapic_lock, flags); | ||
145 | eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); | ||
146 | eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); | ||
147 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
148 | return eu.entry; | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * When we write a new IO APIC routing entry, we need to write the high | ||
153 | * word first! If the mask bit in the low word is clear, we will enable | ||
154 | * the interrupt, and we need to make sure the entry is fully populated | ||
155 | * before that happens. | ||
156 | */ | ||
157 | static void | ||
158 | __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
159 | { | ||
160 | union entry_union eu; | ||
161 | eu.entry = e; | ||
162 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
163 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
164 | } | ||
165 | |||
166 | static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
167 | { | ||
168 | unsigned long flags; | ||
169 | spin_lock_irqsave(&ioapic_lock, flags); | ||
170 | __ioapic_write_entry(apic, pin, e); | ||
171 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * When we mask an IO APIC routing entry, we need to write the low | ||
176 | * word first, in order to set the mask bit before we change the | ||
177 | * high bits! | ||
178 | */ | ||
179 | static void ioapic_mask_entry(int apic, int pin) | ||
180 | { | ||
181 | unsigned long flags; | ||
182 | union entry_union eu = { .entry.mask = 1 }; | ||
183 | |||
184 | spin_lock_irqsave(&ioapic_lock, flags); | ||
185 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
186 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
187 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
188 | } | ||
189 | |||
190 | /* | ||
191 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | ||
192 | * shared ISA-space IRQs, so we have to support them. We are super | ||
193 | * fast in the common case, and fast for shared ISA-space IRQs. | ||
194 | */ | ||
195 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) | ||
196 | { | ||
197 | static int first_free_entry = NR_IRQS; | ||
198 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
199 | |||
200 | while (entry->next) | ||
201 | entry = irq_2_pin + entry->next; | ||
202 | |||
203 | if (entry->pin != -1) { | ||
204 | entry->next = first_free_entry; | ||
205 | entry = irq_2_pin + entry->next; | ||
206 | if (++first_free_entry >= PIN_MAP_SIZE) | ||
207 | panic("io_apic.c: whoops"); | ||
208 | } | ||
209 | entry->apic = apic; | ||
210 | entry->pin = pin; | ||
211 | } | ||
212 | |||
213 | /* | ||
214 | * Reroute an IRQ to a different pin. | ||
215 | */ | ||
216 | static void __init replace_pin_at_irq(unsigned int irq, | ||
217 | int oldapic, int oldpin, | ||
218 | int newapic, int newpin) | ||
219 | { | ||
220 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
221 | |||
222 | while (1) { | ||
223 | if (entry->apic == oldapic && entry->pin == oldpin) { | ||
224 | entry->apic = newapic; | ||
225 | entry->pin = newpin; | ||
226 | } | ||
227 | if (!entry->next) | ||
228 | break; | ||
229 | entry = irq_2_pin + entry->next; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) | ||
234 | { | ||
235 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
236 | unsigned int pin, reg; | ||
237 | |||
238 | for (;;) { | ||
239 | pin = entry->pin; | ||
240 | if (pin == -1) | ||
241 | break; | ||
242 | reg = io_apic_read(entry->apic, 0x10 + pin*2); | ||
243 | reg &= ~disable; | ||
244 | reg |= enable; | ||
245 | io_apic_modify(entry->apic, 0x10 + pin*2, reg); | ||
246 | if (!entry->next) | ||
247 | break; | ||
248 | entry = irq_2_pin + entry->next; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | /* mask = 1 */ | ||
/* Set bit 16 (the mask bit) in the low routing word of all pins of @irq. */
static void __mask_IO_APIC_irq (unsigned int irq)
{
	const unsigned long mask_bit = 1UL << 16;

	__modify_IO_APIC_irq(irq, mask_bit, 0);
}
257 | |||
258 | /* mask = 0 */ | ||
/* Clear bit 16 (the mask bit) in the low routing word of all pins of @irq. */
static void __unmask_IO_APIC_irq (unsigned int irq)
{
	const unsigned long mask_bit = 1UL << 16;

	__modify_IO_APIC_irq(irq, 0, mask_bit);
}
263 | |||
264 | /* mask = 1, trigger = 0 */ | ||
/* Set the mask bit (16) and clear the trigger bit (15) for all pins of @irq. */
static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
{
	const unsigned long mask_bit = 1UL << 16;
	const unsigned long trigger_bit = 1UL << 15;

	__modify_IO_APIC_irq(irq, mask_bit, trigger_bit);
}
269 | |||
270 | /* mask = 0, trigger = 1 */ | ||
/* Set the trigger bit (15) and clear the mask bit (16) for all pins of @irq. */
static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
{
	const unsigned long mask_bit = 1UL << 16;
	const unsigned long trigger_bit = 1UL << 15;

	__modify_IO_APIC_irq(irq, trigger_bit, mask_bit);
}
275 | |||
276 | static void mask_IO_APIC_irq (unsigned int irq) | ||
277 | { | ||
278 | unsigned long flags; | ||
279 | |||
280 | spin_lock_irqsave(&ioapic_lock, flags); | ||
281 | __mask_IO_APIC_irq(irq); | ||
282 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
283 | } | ||
284 | |||
285 | static void unmask_IO_APIC_irq (unsigned int irq) | ||
286 | { | ||
287 | unsigned long flags; | ||
288 | |||
289 | spin_lock_irqsave(&ioapic_lock, flags); | ||
290 | __unmask_IO_APIC_irq(irq); | ||
291 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
292 | } | ||
293 | |||
294 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | ||
295 | { | ||
296 | struct IO_APIC_route_entry entry; | ||
297 | |||
298 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ | ||
299 | entry = ioapic_read_entry(apic, pin); | ||
300 | if (entry.delivery_mode == dest_SMI) | ||
301 | return; | ||
302 | |||
303 | /* | ||
304 | * Disable it in the IO-APIC irq-routing table: | ||
305 | */ | ||
306 | ioapic_mask_entry(apic, pin); | ||
307 | } | ||
308 | |||
309 | static void clear_IO_APIC (void) | ||
310 | { | ||
311 | int apic, pin; | ||
312 | |||
313 | for (apic = 0; apic < nr_ioapics; apic++) | ||
314 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | ||
315 | clear_IO_APIC_pin(apic, pin); | ||
316 | } | ||
317 | |||
318 | #ifdef CONFIG_SMP | ||
319 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) | ||
320 | { | ||
321 | unsigned long flags; | ||
322 | int pin; | ||
323 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
324 | unsigned int apicid_value; | ||
325 | cpumask_t tmp; | ||
326 | |||
327 | cpus_and(tmp, cpumask, cpu_online_map); | ||
328 | if (cpus_empty(tmp)) | ||
329 | tmp = TARGET_CPUS; | ||
330 | |||
331 | cpus_and(cpumask, tmp, CPU_MASK_ALL); | ||
332 | |||
333 | apicid_value = cpu_mask_to_apicid(cpumask); | ||
334 | /* Prepare to do the io_apic_write */ | ||
335 | apicid_value = apicid_value << 24; | ||
336 | spin_lock_irqsave(&ioapic_lock, flags); | ||
337 | for (;;) { | ||
338 | pin = entry->pin; | ||
339 | if (pin == -1) | ||
340 | break; | ||
341 | io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); | ||
342 | if (!entry->next) | ||
343 | break; | ||
344 | entry = irq_2_pin + entry->next; | ||
345 | } | ||
346 | irq_desc[irq].affinity = cpumask; | ||
347 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
348 | } | ||
349 | |||
350 | #if defined(CONFIG_IRQBALANCE) | ||
351 | # include <asm/processor.h> /* kernel_thread() */ | ||
352 | # include <linux/kernel_stat.h> /* kstat */ | ||
353 | # include <linux/slab.h> /* kmalloc() */ | ||
354 | # include <linux/timer.h> /* time_after() */ | ||
355 | |||
/*
 * Initial value of irqbalance_disabled: balanced_irq_init() replaces it
 * with NO_BALANCE_IRQ unless it was changed beforehand.
 */
#define IRQBALANCE_CHECK_ARCH -999
/* Bounds and step sizes, in jiffies, for balanced_irq_interval. */
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
#define BALANCED_IRQ_MORE_DELTA (HZ/10)
#define BALANCED_IRQ_LESS_DELTA (HZ)

static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
static int physical_balance __read_mostly;
/* Current period of the balancer thread; adapted by do_irq_balance(). */
static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;

/*
 * Per-CPU bookkeeping.  last_irq and irq_delta are arrays of NR_IRQS
 * counters allocated in balanced_irq_init().
 */
static struct irq_cpu_info {
	unsigned long * last_irq;	/* interrupt count seen on the previous pass */
	unsigned long * irq_delta;	/* interrupts accumulated during the current pass */
	unsigned long irq;		/* summed load, see do_irq_balance() */
} irq_cpu_data[NR_CPUS];

#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])

/* True if @cpu is idle and its idle_timestamp is more than one jiffy before @now. */
#define IDLE_ENOUGH(cpu,now) \
	(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))

#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)

/* First CPU in the sibling map of CPU @i; used as the index of its package. */
#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))

/* CPUs each IRQ may be balanced onto; all CPUs by default. */
static cpumask_t balance_irq_affinity[NR_IRQS] = {
	[0 ... NR_IRQS-1] = CPU_MASK_ALL
};
386 | |||
/*
 * Set the CPUs that the balancer may move @irq to.  @irq is used as an
 * index into balance_irq_affinity[] without a range check.
 */
void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
{
	balance_irq_affinity[irq] = mask;
}
391 | |||
392 | static unsigned long move(int curr_cpu, cpumask_t allowed_mask, | ||
393 | unsigned long now, int direction) | ||
394 | { | ||
395 | int search_idle = 1; | ||
396 | int cpu = curr_cpu; | ||
397 | |||
398 | goto inside; | ||
399 | |||
400 | do { | ||
401 | if (unlikely(cpu == curr_cpu)) | ||
402 | search_idle = 0; | ||
403 | inside: | ||
404 | if (direction == 1) { | ||
405 | cpu++; | ||
406 | if (cpu >= NR_CPUS) | ||
407 | cpu = 0; | ||
408 | } else { | ||
409 | cpu--; | ||
410 | if (cpu == -1) | ||
411 | cpu = NR_CPUS-1; | ||
412 | } | ||
413 | } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || | ||
414 | (search_idle && !IDLE_ENOUGH(cpu,now))); | ||
415 | |||
416 | return cpu; | ||
417 | } | ||
418 | |||
419 | static inline void balance_irq(int cpu, int irq) | ||
420 | { | ||
421 | unsigned long now = jiffies; | ||
422 | cpumask_t allowed_mask; | ||
423 | unsigned int new_cpu; | ||
424 | |||
425 | if (irqbalance_disabled) | ||
426 | return; | ||
427 | |||
428 | cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); | ||
429 | new_cpu = move(cpu, allowed_mask, now, 1); | ||
430 | if (cpu != new_cpu) { | ||
431 | set_pending_irq(irq, cpumask_of_cpu(new_cpu)); | ||
432 | } | ||
433 | } | ||
434 | |||
435 | static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) | ||
436 | { | ||
437 | int i, j; | ||
438 | |||
439 | for_each_online_cpu(i) { | ||
440 | for (j = 0; j < NR_IRQS; j++) { | ||
441 | if (!irq_desc[j].action) | ||
442 | continue; | ||
443 | /* Is it a significant load ? */ | ||
444 | if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < | ||
445 | useful_load_threshold) | ||
446 | continue; | ||
447 | balance_irq(i, j); | ||
448 | } | ||
449 | } | ||
450 | balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, | ||
451 | balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); | ||
452 | return; | ||
453 | } | ||
454 | |||
/*
 * One pass of the IRQ balancer.
 *
 * 1. For every online CPU, compute how many times each IRQ fired since
 *    the previous pass and accumulate the significant deltas per
 *    package (IRQ_DELTA) and per CPU (CPU_IRQ).
 * 2. Find the least loaded package, then repeatedly the next heaviest
 *    one (tryanothercpu).
 * 3. On the heavy package look for the IRQ whose delta is closest to,
 *    but below, the imbalance (tryanotherirq) and, if the least loaded
 *    sibling is allowed for it, queue the move with set_pending_irq().
 *
 * balanced_irq_interval is shortened when an IRQ was moved and
 * lengthened when nothing was worth moving.
 */
static void do_irq_balance(void)
{
	int i, j;
	unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
	unsigned long move_this_load = 0;
	int max_loaded = 0, min_loaded = 0;
	int load;
	unsigned long useful_load_threshold = balanced_irq_interval + 10;
	int selected_irq;
	int tmp_loaded, first_attempt = 1;
	unsigned long tmp_cpu_irq;
	unsigned long imbalance = 0;
	cpumask_t allowed_mask, target_cpu_mask, tmp;

	for_each_possible_cpu(i) {
		int package_index;
		CPU_IRQ(i) = 0;
		if (!cpu_online(i))
			continue;
		package_index = CPU_TO_PACKAGEINDEX(i);
		for (j = 0; j < NR_IRQS; j++) {
			unsigned long value_now, delta;
			/* Is this an active IRQ or balancing disabled ? */
			if (!irq_desc[j].action || irq_balancing_disabled(j))
				continue;
			/* Reset the package total when visiting its first CPU */
			if ( package_index == i )
				IRQ_DELTA(package_index,j) = 0;
			/* Determine the total count per processor per IRQ */
			value_now = (unsigned long) kstat_cpu(i).irqs[j];

			/* Determine the activity per processor per IRQ */
			delta = value_now - LAST_CPU_IRQ(i,j);

			/* Update last_cpu_irq[][] for the next time */
			LAST_CPU_IRQ(i,j) = value_now;

			/* Ignore IRQs whose rate is less than the clock */
			if (delta < useful_load_threshold)
				continue;
			/* update the load for the processor or package total */
			IRQ_DELTA(package_index,j) += delta;

			/* Keep track of the higher numbered sibling as well */
			if (i != package_index)
				CPU_IRQ(i) += delta;
			/*
			 * We have sibling A and sibling B in the package
			 *
			 * cpu_irq[A] = load for cpu A + load for cpu B
			 * cpu_irq[B] = load for cpu B
			 */
			CPU_IRQ(package_index) += delta;
		}
	}
	/* Find the least loaded processor package */
	for_each_online_cpu(i) {
		if (i != CPU_TO_PACKAGEINDEX(i))
			continue;
		if (min_cpu_irq > CPU_IRQ(i)) {
			min_cpu_irq = CPU_IRQ(i);
			min_loaded = i;
		}
	}
	/* Upper bound for the search below; lowered after each attempt */
	max_cpu_irq = ULONG_MAX;

tryanothercpu:
	/* Look for heaviest loaded processor.
	 * We may come back to get the next heaviest loaded processor.
	 * Skip processors with trivial loads.
	 */
	tmp_cpu_irq = 0;
	tmp_loaded = -1;
	for_each_online_cpu(i) {
		if (i != CPU_TO_PACKAGEINDEX(i))
			continue;
		/* Already tried this one (or a heavier one) */
		if (max_cpu_irq <= CPU_IRQ(i))
			continue;
		if (tmp_cpu_irq < CPU_IRQ(i)) {
			tmp_cpu_irq = CPU_IRQ(i);
			tmp_loaded = i;
		}
	}

	if (tmp_loaded == -1) {
	 /* In the case of small number of heavy interrupt sources,
	  * loading some of the cpus too much. We use Ingo's original
	  * approach to rotate them around.
	  */
		if (!first_attempt && imbalance >= useful_load_threshold) {
			rotate_irqs_among_cpus(useful_load_threshold);
			return;
		}
		goto not_worth_the_effort;
	}

	first_attempt = 0;		/* heaviest search */
	max_cpu_irq = tmp_cpu_irq;	/* load */
	max_loaded = tmp_loaded;	/* processor */
	imbalance = (max_cpu_irq - min_cpu_irq) / 2;

	/* if imbalance is less than approx 10% of max load, then
	 * observe diminishing returns action. - quit
	 */
	if (imbalance < (max_cpu_irq >> 3))
		goto not_worth_the_effort;

tryanotherirq:
	/* if we select an IRQ to move that can't go where we want, then
	 * see if there is another one to try.
	 */
	move_this_load = 0;
	selected_irq = -1;
	for (j = 0; j < NR_IRQS; j++) {
		/* Is this an active IRQ? */
		if (!irq_desc[j].action)
			continue;
		if (imbalance <= IRQ_DELTA(max_loaded,j))
			continue;
		/* Try to find the IRQ that is closest to the imbalance
		 * without going over.
		 */
		if (move_this_load < IRQ_DELTA(max_loaded,j)) {
			move_this_load = IRQ_DELTA(max_loaded,j);
			selected_irq = j;
		}
	}
	if (selected_irq == -1) {
		goto tryanothercpu;
	}

	/*
	 * Lower the bound so that a retry via tryanotherirq only
	 * considers IRQs with a smaller delta than the one just chosen.
	 */
	imbalance = move_this_load;

	/* For physical_balance case, we accumulated both load
	 * values in the one of the siblings cpu_irq[],
	 * to use the same code for physical and logical processors
	 * as much as possible.
	 *
	 * NOTE: the cpu_irq[] array holds the sum of the load for
	 * sibling A and sibling B in the slot for the lowest numbered
	 * sibling (A), _AND_ the load for sibling B in the slot for
	 * the higher numbered sibling.
	 *
	 * We seek the least loaded sibling by making the comparison
	 * (A+B)/2 vs B
	 */
	load = CPU_IRQ(min_loaded) >> 1;
	for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
		if (load > CPU_IRQ(j)) {
			/* This won't change cpu_sibling_map[min_loaded] */
			load = CPU_IRQ(j);
			min_loaded = j;
		}
	}

	cpus_and(allowed_mask,
		cpu_online_map,
		balance_irq_affinity[selected_irq]);
	target_cpu_mask = cpumask_of_cpu(min_loaded);
	cpus_and(tmp, target_cpu_mask, allowed_mask);

	if (!cpus_empty(tmp)) {
		/* mark for change destination */
		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));

		/* Since we made a change, come back sooner to
		 * check for more variation.
		 */
		balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
		return;
	}
	goto tryanotherirq;

not_worth_the_effort:
	/*
	 * if we did not find an IRQ to move, then adjust the time interval
	 * upward
	 */
	balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
	return;
}
637 | |||
638 | static int balanced_irq(void *unused) | ||
639 | { | ||
640 | int i; | ||
641 | unsigned long prev_balance_time = jiffies; | ||
642 | long time_remaining = balanced_irq_interval; | ||
643 | |||
644 | /* push everything to CPU 0 to give us a starting point. */ | ||
645 | for (i = 0 ; i < NR_IRQS ; i++) { | ||
646 | irq_desc[i].pending_mask = cpumask_of_cpu(0); | ||
647 | set_pending_irq(i, cpumask_of_cpu(0)); | ||
648 | } | ||
649 | |||
650 | set_freezable(); | ||
651 | for ( ; ; ) { | ||
652 | time_remaining = schedule_timeout_interruptible(time_remaining); | ||
653 | try_to_freeze(); | ||
654 | if (time_after(jiffies, | ||
655 | prev_balance_time+balanced_irq_interval)) { | ||
656 | preempt_disable(); | ||
657 | do_irq_balance(); | ||
658 | prev_balance_time = jiffies; | ||
659 | time_remaining = balanced_irq_interval; | ||
660 | preempt_enable(); | ||
661 | } | ||
662 | } | ||
663 | return 0; | ||
664 | } | ||
665 | |||
666 | static int __init balanced_irq_init(void) | ||
667 | { | ||
668 | int i; | ||
669 | struct cpuinfo_x86 *c; | ||
670 | cpumask_t tmp; | ||
671 | |||
672 | cpus_shift_right(tmp, cpu_online_map, 2); | ||
673 | c = &boot_cpu_data; | ||
674 | /* When not overwritten by the command line ask subarchitecture. */ | ||
675 | if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) | ||
676 | irqbalance_disabled = NO_BALANCE_IRQ; | ||
677 | if (irqbalance_disabled) | ||
678 | return 0; | ||
679 | |||
680 | /* disable irqbalance completely if there is only one processor online */ | ||
681 | if (num_online_cpus() < 2) { | ||
682 | irqbalance_disabled = 1; | ||
683 | return 0; | ||
684 | } | ||
685 | /* | ||
686 | * Enable physical balance only if more than 1 physical processor | ||
687 | * is present | ||
688 | */ | ||
689 | if (smp_num_siblings > 1 && !cpus_empty(tmp)) | ||
690 | physical_balance = 1; | ||
691 | |||
692 | for_each_online_cpu(i) { | ||
693 | irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | ||
694 | irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | ||
695 | if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { | ||
696 | printk(KERN_ERR "balanced_irq_init: out of memory"); | ||
697 | goto failed; | ||
698 | } | ||
699 | memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); | ||
700 | memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); | ||
701 | } | ||
702 | |||
703 | printk(KERN_INFO "Starting balanced_irq\n"); | ||
704 | if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) | ||
705 | return 0; | ||
706 | printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); | ||
707 | failed: | ||
708 | for_each_possible_cpu(i) { | ||
709 | kfree(irq_cpu_data[i].irq_delta); | ||
710 | irq_cpu_data[i].irq_delta = NULL; | ||
711 | kfree(irq_cpu_data[i].last_irq); | ||
712 | irq_cpu_data[i].last_irq = NULL; | ||
713 | } | ||
714 | return 0; | ||
715 | } | ||
716 | |||
/*
 * Handler for the "noirqbalance" boot option (registered with
 * __setup() right below): turns the in-kernel IRQ balancer off.
 * @str is ignored.  Returns 1.
 */
int __devinit irqbalance_disable(char *str)
{
	irqbalance_disabled = 1;
	return 1;
}
722 | |||
723 | __setup("noirqbalance", irqbalance_disable); | ||
724 | |||
725 | late_initcall(balanced_irq_init); | ||
726 | #endif /* CONFIG_IRQBALANCE */ | ||
727 | #endif /* CONFIG_SMP */ | ||
728 | |||
729 | #ifndef CONFIG_SMP | ||
730 | void fastcall send_IPI_self(int vector) | ||
731 | { | ||
732 | unsigned int cfg; | ||
733 | |||
734 | /* | ||
735 | * Wait for idle. | ||
736 | */ | ||
737 | apic_wait_icr_idle(); | ||
738 | cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; | ||
739 | /* | ||
740 | * Send the IPI. The write to APIC_ICR fires this off. | ||
741 | */ | ||
742 | apic_write_around(APIC_ICR, cfg); | ||
743 | } | ||
744 | #endif /* !CONFIG_SMP */ | ||
745 | |||
746 | |||
747 | /* | ||
748 | * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to | ||
749 | * specific CPU-side IRQs. | ||
750 | */ | ||
751 | |||
#define MAX_PIRQS 8
/* IRQ chosen for each PIRQ by the "pirq=" option; filled in by ioapic_pirq_setup() */
static int pirq_entries [MAX_PIRQS];
/* set to 1 once a "pirq=" option has been parsed */
static int pirqs_enabled;
/* setup_ioapic_dest() does nothing when this is 1 */
int skip_ioapic_setup;
756 | |||
757 | static int __init ioapic_pirq_setup(char *str) | ||
758 | { | ||
759 | int i, max; | ||
760 | int ints[MAX_PIRQS+1]; | ||
761 | |||
762 | get_options(str, ARRAY_SIZE(ints), ints); | ||
763 | |||
764 | for (i = 0; i < MAX_PIRQS; i++) | ||
765 | pirq_entries[i] = -1; | ||
766 | |||
767 | pirqs_enabled = 1; | ||
768 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
769 | "PIRQ redirection, working around broken MP-BIOS.\n"); | ||
770 | max = MAX_PIRQS; | ||
771 | if (ints[0] < MAX_PIRQS) | ||
772 | max = ints[0]; | ||
773 | |||
774 | for (i = 0; i < max; i++) { | ||
775 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
776 | "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); | ||
777 | /* | ||
778 | * PIRQs are mapped upside down, usually. | ||
779 | */ | ||
780 | pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; | ||
781 | } | ||
782 | return 1; | ||
783 | } | ||
784 | |||
785 | __setup("pirq=", ioapic_pirq_setup); | ||
786 | |||
787 | /* | ||
788 | * Find the IRQ entry number of a certain pin. | ||
789 | */ | ||
790 | static int find_irq_entry(int apic, int pin, int type) | ||
791 | { | ||
792 | int i; | ||
793 | |||
794 | for (i = 0; i < mp_irq_entries; i++) | ||
795 | if (mp_irqs[i].mpc_irqtype == type && | ||
796 | (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | ||
797 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | ||
798 | mp_irqs[i].mpc_dstirq == pin) | ||
799 | return i; | ||
800 | |||
801 | return -1; | ||
802 | } | ||
803 | |||
804 | /* | ||
805 | * Find the pin to which IRQ[irq] (ISA) is connected | ||
806 | */ | ||
807 | static int __init find_isa_irq_pin(int irq, int type) | ||
808 | { | ||
809 | int i; | ||
810 | |||
811 | for (i = 0; i < mp_irq_entries; i++) { | ||
812 | int lbus = mp_irqs[i].mpc_srcbus; | ||
813 | |||
814 | if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | ||
815 | mp_bus_id_to_type[lbus] == MP_BUS_EISA || | ||
816 | mp_bus_id_to_type[lbus] == MP_BUS_MCA | ||
817 | ) && | ||
818 | (mp_irqs[i].mpc_irqtype == type) && | ||
819 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
820 | |||
821 | return mp_irqs[i].mpc_dstirq; | ||
822 | } | ||
823 | return -1; | ||
824 | } | ||
825 | |||
826 | static int __init find_isa_irq_apic(int irq, int type) | ||
827 | { | ||
828 | int i; | ||
829 | |||
830 | for (i = 0; i < mp_irq_entries; i++) { | ||
831 | int lbus = mp_irqs[i].mpc_srcbus; | ||
832 | |||
833 | if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | ||
834 | mp_bus_id_to_type[lbus] == MP_BUS_EISA || | ||
835 | mp_bus_id_to_type[lbus] == MP_BUS_MCA | ||
836 | ) && | ||
837 | (mp_irqs[i].mpc_irqtype == type) && | ||
838 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
839 | break; | ||
840 | } | ||
841 | if (i < mp_irq_entries) { | ||
842 | int apic; | ||
843 | for(apic = 0; apic < nr_ioapics; apic++) { | ||
844 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) | ||
845 | return apic; | ||
846 | } | ||
847 | } | ||
848 | |||
849 | return -1; | ||
850 | } | ||
851 | |||
852 | /* | ||
853 | * Find a specific PCI IRQ entry. | ||
854 | * Not an __init, possibly needed by modules | ||
855 | */ | ||
856 | static int pin_2_irq(int idx, int apic, int pin); | ||
857 | |||
858 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | ||
859 | { | ||
860 | int apic, i, best_guess = -1; | ||
861 | |||
862 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " | ||
863 | "slot:%d, pin:%d.\n", bus, slot, pin); | ||
864 | if (mp_bus_id_to_pci_bus[bus] == -1) { | ||
865 | printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
866 | return -1; | ||
867 | } | ||
868 | for (i = 0; i < mp_irq_entries; i++) { | ||
869 | int lbus = mp_irqs[i].mpc_srcbus; | ||
870 | |||
871 | for (apic = 0; apic < nr_ioapics; apic++) | ||
872 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | ||
873 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | ||
874 | break; | ||
875 | |||
876 | if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && | ||
877 | !mp_irqs[i].mpc_irqtype && | ||
878 | (bus == lbus) && | ||
879 | (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | ||
880 | int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | ||
881 | |||
882 | if (!(apic || IO_APIC_IRQ(irq))) | ||
883 | continue; | ||
884 | |||
885 | if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | ||
886 | return irq; | ||
887 | /* | ||
888 | * Use the first all-but-pin matching entry as a | ||
889 | * best-guess fuzzy result for broken mptables. | ||
890 | */ | ||
891 | if (best_guess < 0) | ||
892 | best_guess = irq; | ||
893 | } | ||
894 | } | ||
895 | return best_guess; | ||
896 | } | ||
897 | EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); | ||
898 | |||
/*
 * This function currently is only a helper for the i386 SMP boot process,
 * where the ioredtbls must be reprogrammed to cater for the CPUs that have
 * come online, so the mask in all cases is simply TARGET_CPUS.
 */
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
	int apic_idx, line;

	/* Nothing to do when I/O APIC setup has been skipped. */
	if (skip_ioapic_setup == 1)
		return;

	for (apic_idx = 0; apic_idx < nr_ioapics; apic_idx++) {
		for (line = 0; line < nr_ioapic_registers[apic_idx]; line++) {
			int mp_idx = find_irq_entry(apic_idx, line, mp_INT);
			int irq;

			/* -1 is treated as "no mp_INT entry for this pin". */
			if (mp_idx == -1)
				continue;

			irq = pin_2_irq(mp_idx, apic_idx, line);
			set_ioapic_affinity_irq(irq, TARGET_CPUS);
		}
	}
}
#endif
924 | |||
925 | /* | ||
926 | * EISA Edge/Level control register, ELCR | ||
927 | */ | ||
928 | static int EISA_ELCR(unsigned int irq) | ||
929 | { | ||
930 | if (irq < 16) { | ||
931 | unsigned int port = 0x4d0 + (irq >> 3); | ||
932 | return (inb(port) >> (irq & 7)) & 1; | ||
933 | } | ||
934 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
935 | "Broken MPtable reports ISA irq %d\n", irq); | ||
936 | return 0; | ||
937 | } | ||
938 | |||
/*
 * Per-bus defaults used by MPBIOS_trigger() and MPBIOS_polarity() when an
 * MP table entry is flagged as "conforming" to its bus.  The values use the
 * same encoding those functions return: trigger 0 = edge, 1 = level;
 * polarity 0 = high active, 1 = low active.
 */

/* EISA interrupts are always polarity zero and can be edge or level
 * trigger depending on the ELCR value. If an interrupt is listed as
 * EISA conforming in the MP table, that means its trigger type must
 * be read in from the ELCR */

#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
#define default_EISA_polarity(idx) (0)

/* ISA interrupts are always polarity zero edge triggered,
 * when listed as conforming in the MP table. */

#define default_ISA_trigger(idx) (0)
#define default_ISA_polarity(idx) (0)

/* PCI interrupts are always polarity one level triggered,
 * when listed as conforming in the MP table. */

#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)

/* MCA interrupts are always polarity zero level triggered,
 * when listed as conforming in the MP table. */

#define default_MCA_trigger(idx) (1)
#define default_MCA_polarity(idx) (0)
964 | |||
965 | static int __init MPBIOS_polarity(int idx) | ||
966 | { | ||
967 | int bus = mp_irqs[idx].mpc_srcbus; | ||
968 | int polarity; | ||
969 | |||
970 | /* | ||
971 | * Determine IRQ line polarity (high active or low active): | ||
972 | */ | ||
973 | switch (mp_irqs[idx].mpc_irqflag & 3) | ||
974 | { | ||
975 | case 0: /* conforms, ie. bus-type dependent polarity */ | ||
976 | { | ||
977 | switch (mp_bus_id_to_type[bus]) | ||
978 | { | ||
979 | case MP_BUS_ISA: /* ISA pin */ | ||
980 | { | ||
981 | polarity = default_ISA_polarity(idx); | ||
982 | break; | ||
983 | } | ||
984 | case MP_BUS_EISA: /* EISA pin */ | ||
985 | { | ||
986 | polarity = default_EISA_polarity(idx); | ||
987 | break; | ||
988 | } | ||
989 | case MP_BUS_PCI: /* PCI pin */ | ||
990 | { | ||
991 | polarity = default_PCI_polarity(idx); | ||
992 | break; | ||
993 | } | ||
994 | case MP_BUS_MCA: /* MCA pin */ | ||
995 | { | ||
996 | polarity = default_MCA_polarity(idx); | ||
997 | break; | ||
998 | } | ||
999 | default: | ||
1000 | { | ||
1001 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1002 | polarity = 1; | ||
1003 | break; | ||
1004 | } | ||
1005 | } | ||
1006 | break; | ||
1007 | } | ||
1008 | case 1: /* high active */ | ||
1009 | { | ||
1010 | polarity = 0; | ||
1011 | break; | ||
1012 | } | ||
1013 | case 2: /* reserved */ | ||
1014 | { | ||
1015 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1016 | polarity = 1; | ||
1017 | break; | ||
1018 | } | ||
1019 | case 3: /* low active */ | ||
1020 | { | ||
1021 | polarity = 1; | ||
1022 | break; | ||
1023 | } | ||
1024 | default: /* invalid */ | ||
1025 | { | ||
1026 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1027 | polarity = 1; | ||
1028 | break; | ||
1029 | } | ||
1030 | } | ||
1031 | return polarity; | ||
1032 | } | ||
1033 | |||
1034 | static int MPBIOS_trigger(int idx) | ||
1035 | { | ||
1036 | int bus = mp_irqs[idx].mpc_srcbus; | ||
1037 | int trigger; | ||
1038 | |||
1039 | /* | ||
1040 | * Determine IRQ trigger mode (edge or level sensitive): | ||
1041 | */ | ||
1042 | switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | ||
1043 | { | ||
1044 | case 0: /* conforms, ie. bus-type dependent */ | ||
1045 | { | ||
1046 | switch (mp_bus_id_to_type[bus]) | ||
1047 | { | ||
1048 | case MP_BUS_ISA: /* ISA pin */ | ||
1049 | { | ||
1050 | trigger = default_ISA_trigger(idx); | ||
1051 | break; | ||
1052 | } | ||
1053 | case MP_BUS_EISA: /* EISA pin */ | ||
1054 | { | ||
1055 | trigger = default_EISA_trigger(idx); | ||
1056 | break; | ||
1057 | } | ||
1058 | case MP_BUS_PCI: /* PCI pin */ | ||
1059 | { | ||
1060 | trigger = default_PCI_trigger(idx); | ||
1061 | break; | ||
1062 | } | ||
1063 | case MP_BUS_MCA: /* MCA pin */ | ||
1064 | { | ||
1065 | trigger = default_MCA_trigger(idx); | ||
1066 | break; | ||
1067 | } | ||
1068 | default: | ||
1069 | { | ||
1070 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1071 | trigger = 1; | ||
1072 | break; | ||
1073 | } | ||
1074 | } | ||
1075 | break; | ||
1076 | } | ||
1077 | case 1: /* edge */ | ||
1078 | { | ||
1079 | trigger = 0; | ||
1080 | break; | ||
1081 | } | ||
1082 | case 2: /* reserved */ | ||
1083 | { | ||
1084 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1085 | trigger = 1; | ||
1086 | break; | ||
1087 | } | ||
1088 | case 3: /* level */ | ||
1089 | { | ||
1090 | trigger = 1; | ||
1091 | break; | ||
1092 | } | ||
1093 | default: /* invalid */ | ||
1094 | { | ||
1095 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1096 | trigger = 0; | ||
1097 | break; | ||
1098 | } | ||
1099 | } | ||
1100 | return trigger; | ||
1101 | } | ||
1102 | |||
/*
 * Polarity of MP table interrupt entry @idx, as reported by
 * MPBIOS_polarity(): 0 = high active, 1 = low active.
 */
static inline int irq_polarity(int idx)
{
	return MPBIOS_polarity(idx);
}
1107 | |||
/*
 * Trigger mode of MP table interrupt entry @idx, as reported by
 * MPBIOS_trigger(): 0 = edge, 1 = level.
 */
static inline int irq_trigger(int idx)
{
	return MPBIOS_trigger(idx);
}
1112 | |||
1113 | static int pin_2_irq(int idx, int apic, int pin) | ||
1114 | { | ||
1115 | int irq, i; | ||
1116 | int bus = mp_irqs[idx].mpc_srcbus; | ||
1117 | |||
1118 | /* | ||
1119 | * Debugging check, we are in big trouble if this message pops up! | ||
1120 | */ | ||
1121 | if (mp_irqs[idx].mpc_dstirq != pin) | ||
1122 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | ||
1123 | |||
1124 | switch (mp_bus_id_to_type[bus]) | ||
1125 | { | ||
1126 | case MP_BUS_ISA: /* ISA pin */ | ||
1127 | case MP_BUS_EISA: | ||
1128 | case MP_BUS_MCA: | ||
1129 | { | ||
1130 | irq = mp_irqs[idx].mpc_srcbusirq; | ||
1131 | break; | ||
1132 | } | ||
1133 | case MP_BUS_PCI: /* PCI pin */ | ||
1134 | { | ||
1135 | /* | ||
1136 | * PCI IRQs are mapped in order | ||
1137 | */ | ||
1138 | i = irq = 0; | ||
1139 | while (i < apic) | ||
1140 | irq += nr_ioapic_registers[i++]; | ||
1141 | irq += pin; | ||
1142 | |||
1143 | /* | ||
1144 | * For MPS mode, so far only needed by ES7000 platform | ||
1145 | */ | ||
1146 | if (ioapic_renumber_irq) | ||
1147 | irq = ioapic_renumber_irq(apic, irq); | ||
1148 | |||
1149 | break; | ||
1150 | } | ||
1151 | default: | ||
1152 | { | ||
1153 | printk(KERN_ERR "unknown bus type %d.\n",bus); | ||
1154 | irq = 0; | ||
1155 | break; | ||
1156 | } | ||
1157 | } | ||
1158 | |||
1159 | /* | ||
1160 | * PCI IRQ command line redirection. Yes, limits are hardcoded. | ||
1161 | */ | ||
1162 | if ((pin >= 16) && (pin <= 23)) { | ||
1163 | if (pirq_entries[pin-16] != -1) { | ||
1164 | if (!pirq_entries[pin-16]) { | ||
1165 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1166 | "disabling PIRQ%d\n", pin-16); | ||
1167 | } else { | ||
1168 | irq = pirq_entries[pin-16]; | ||
1169 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1170 | "using PIRQ%d -> IRQ %d\n", | ||
1171 | pin-16, irq); | ||
1172 | } | ||
1173 | } | ||
1174 | } | ||
1175 | return irq; | ||
1176 | } | ||
1177 | |||
1178 | static inline int IO_APIC_irq_trigger(int irq) | ||
1179 | { | ||
1180 | int apic, idx, pin; | ||
1181 | |||
1182 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1183 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1184 | idx = find_irq_entry(apic,pin,mp_INT); | ||
1185 | if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) | ||
1186 | return irq_trigger(idx); | ||
1187 | } | ||
1188 | } | ||
1189 | /* | ||
1190 | * nonexistent IRQs are edge default | ||
1191 | */ | ||
1192 | return 0; | ||
1193 | } | ||
1194 | |||
/*
 * irq_vector[] is indexed by the sum of all RTEs in all I/O APICs, i.e. by
 * IRQ number.  Entry 0 starts out as FIRST_DEVICE_VECTOR; every other entry
 * starts as 0, which __assign_irq_vector() treats as "no vector assigned".
 */
static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1197 | |||
1198 | static int __assign_irq_vector(int irq) | ||
1199 | { | ||
1200 | static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; | ||
1201 | int vector, offset, i; | ||
1202 | |||
1203 | BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); | ||
1204 | |||
1205 | if (irq_vector[irq] > 0) | ||
1206 | return irq_vector[irq]; | ||
1207 | |||
1208 | vector = current_vector; | ||
1209 | offset = current_offset; | ||
1210 | next: | ||
1211 | vector += 8; | ||
1212 | if (vector >= FIRST_SYSTEM_VECTOR) { | ||
1213 | offset = (offset + 1) % 8; | ||
1214 | vector = FIRST_DEVICE_VECTOR + offset; | ||
1215 | } | ||
1216 | if (vector == current_vector) | ||
1217 | return -ENOSPC; | ||
1218 | if (vector == SYSCALL_VECTOR) | ||
1219 | goto next; | ||
1220 | for (i = 0; i < NR_IRQ_VECTORS; i++) | ||
1221 | if (irq_vector[i] == vector) | ||
1222 | goto next; | ||
1223 | |||
1224 | current_vector = vector; | ||
1225 | current_offset = offset; | ||
1226 | irq_vector[irq] = vector; | ||
1227 | |||
1228 | return vector; | ||
1229 | } | ||
1230 | |||
1231 | static int assign_irq_vector(int irq) | ||
1232 | { | ||
1233 | unsigned long flags; | ||
1234 | int vector; | ||
1235 | |||
1236 | spin_lock_irqsave(&vector_lock, flags); | ||
1237 | vector = __assign_irq_vector(irq); | ||
1238 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1239 | |||
1240 | return vector; | ||
1241 | } | ||
/* Declared here so that the functions below can reference &ioapic_chip. */
static struct irq_chip ioapic_chip;

/*
 * Trigger selector passed to ioapic_register_intr(): IOAPIC_AUTO looks the
 * trigger mode up with IO_APIC_irq_trigger(), IOAPIC_LEVEL forces the
 * level (fasteoi) handler, and anything else gets the edge handler.
 */
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
#define IOAPIC_LEVEL 1
1247 | |||
1248 | static void ioapic_register_intr(int irq, int vector, unsigned long trigger) | ||
1249 | { | ||
1250 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | ||
1251 | trigger == IOAPIC_LEVEL) { | ||
1252 | irq_desc[irq].status |= IRQ_LEVEL; | ||
1253 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
1254 | handle_fasteoi_irq, "fasteoi"); | ||
1255 | } else { | ||
1256 | irq_desc[irq].status &= ~IRQ_LEVEL; | ||
1257 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
1258 | handle_edge_irq, "edge"); | ||
1259 | } | ||
1260 | set_intr_gate(vector, interrupt[irq]); | ||
1261 | } | ||
1262 | |||
1263 | static void __init setup_IO_APIC_irqs(void) | ||
1264 | { | ||
1265 | struct IO_APIC_route_entry entry; | ||
1266 | int apic, pin, idx, irq, first_notcon = 1, vector; | ||
1267 | unsigned long flags; | ||
1268 | |||
1269 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | ||
1270 | |||
1271 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1272 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1273 | |||
1274 | /* | ||
1275 | * add it to the IO-APIC irq-routing table: | ||
1276 | */ | ||
1277 | memset(&entry,0,sizeof(entry)); | ||
1278 | |||
1279 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
1280 | entry.dest_mode = INT_DEST_MODE; | ||
1281 | entry.mask = 0; /* enable IRQ */ | ||
1282 | entry.dest.logical.logical_dest = | ||
1283 | cpu_mask_to_apicid(TARGET_CPUS); | ||
1284 | |||
1285 | idx = find_irq_entry(apic,pin,mp_INT); | ||
1286 | if (idx == -1) { | ||
1287 | if (first_notcon) { | ||
1288 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1289 | " IO-APIC (apicid-pin) %d-%d", | ||
1290 | mp_ioapics[apic].mpc_apicid, | ||
1291 | pin); | ||
1292 | first_notcon = 0; | ||
1293 | } else | ||
1294 | apic_printk(APIC_VERBOSE, ", %d-%d", | ||
1295 | mp_ioapics[apic].mpc_apicid, pin); | ||
1296 | continue; | ||
1297 | } | ||
1298 | |||
1299 | entry.trigger = irq_trigger(idx); | ||
1300 | entry.polarity = irq_polarity(idx); | ||
1301 | |||
1302 | if (irq_trigger(idx)) { | ||
1303 | entry.trigger = 1; | ||
1304 | entry.mask = 1; | ||
1305 | } | ||
1306 | |||
1307 | irq = pin_2_irq(idx, apic, pin); | ||
1308 | /* | ||
1309 | * skip adding the timer int on secondary nodes, which causes | ||
1310 | * a small but painful rift in the time-space continuum | ||
1311 | */ | ||
1312 | if (multi_timer_check(apic, irq)) | ||
1313 | continue; | ||
1314 | else | ||
1315 | add_pin_to_irq(irq, apic, pin); | ||
1316 | |||
1317 | if (!apic && !IO_APIC_IRQ(irq)) | ||
1318 | continue; | ||
1319 | |||
1320 | if (IO_APIC_IRQ(irq)) { | ||
1321 | vector = assign_irq_vector(irq); | ||
1322 | entry.vector = vector; | ||
1323 | ioapic_register_intr(irq, vector, IOAPIC_AUTO); | ||
1324 | |||
1325 | if (!apic && (irq < 16)) | ||
1326 | disable_8259A_irq(irq); | ||
1327 | } | ||
1328 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1329 | __ioapic_write_entry(apic, pin, entry); | ||
1330 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1331 | } | ||
1332 | } | ||
1333 | |||
1334 | if (!first_notcon) | ||
1335 | apic_printk(APIC_VERBOSE, " not connected.\n"); | ||
1336 | } | ||
1337 | |||
1338 | /* | ||
1339 | * Set up the 8259A-master output pin: | ||
1340 | */ | ||
1341 | static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) | ||
1342 | { | ||
1343 | struct IO_APIC_route_entry entry; | ||
1344 | |||
1345 | memset(&entry,0,sizeof(entry)); | ||
1346 | |||
1347 | disable_8259A_irq(0); | ||
1348 | |||
1349 | /* mask LVT0 */ | ||
1350 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
1351 | |||
1352 | /* | ||
1353 | * We use logical delivery to get the timer IRQ | ||
1354 | * to the first CPU. | ||
1355 | */ | ||
1356 | entry.dest_mode = INT_DEST_MODE; | ||
1357 | entry.mask = 0; /* unmask IRQ now */ | ||
1358 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
1359 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
1360 | entry.polarity = 0; | ||
1361 | entry.trigger = 0; | ||
1362 | entry.vector = vector; | ||
1363 | |||
1364 | /* | ||
1365 | * The timer IRQ doesn't have to know that behind the | ||
1366 | * scene we have a 8259A-master in AEOI mode ... | ||
1367 | */ | ||
1368 | irq_desc[0].chip = &ioapic_chip; | ||
1369 | set_irq_handler(0, handle_edge_irq); | ||
1370 | |||
1371 | /* | ||
1372 | * Add it to the IO-APIC irq-routing table: | ||
1373 | */ | ||
1374 | ioapic_write_entry(apic, pin, entry); | ||
1375 | |||
1376 | enable_8259A_irq(0); | ||
1377 | } | ||
1378 | |||
1379 | void __init print_IO_APIC(void) | ||
1380 | { | ||
1381 | int apic, i; | ||
1382 | union IO_APIC_reg_00 reg_00; | ||
1383 | union IO_APIC_reg_01 reg_01; | ||
1384 | union IO_APIC_reg_02 reg_02; | ||
1385 | union IO_APIC_reg_03 reg_03; | ||
1386 | unsigned long flags; | ||
1387 | |||
1388 | if (apic_verbosity == APIC_QUIET) | ||
1389 | return; | ||
1390 | |||
1391 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | ||
1392 | for (i = 0; i < nr_ioapics; i++) | ||
1393 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | ||
1394 | mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | ||
1395 | |||
1396 | /* | ||
1397 | * We are a bit conservative about what we expect. We have to | ||
1398 | * know about every hardware change ASAP. | ||
1399 | */ | ||
1400 | printk(KERN_INFO "testing the IO APIC.......................\n"); | ||
1401 | |||
1402 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1403 | |||
1404 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1405 | reg_00.raw = io_apic_read(apic, 0); | ||
1406 | reg_01.raw = io_apic_read(apic, 1); | ||
1407 | if (reg_01.bits.version >= 0x10) | ||
1408 | reg_02.raw = io_apic_read(apic, 2); | ||
1409 | if (reg_01.bits.version >= 0x20) | ||
1410 | reg_03.raw = io_apic_read(apic, 3); | ||
1411 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1412 | |||
1413 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | ||
1414 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | ||
1415 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | ||
1416 | printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); | ||
1417 | printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); | ||
1418 | |||
1419 | printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); | ||
1420 | printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | ||
1421 | |||
1422 | printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | ||
1423 | printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | ||
1424 | |||
1425 | /* | ||
1426 | * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, | ||
1427 | * but the value of reg_02 is read as the previous read register | ||
1428 | * value, so ignore it if reg_02 == reg_01. | ||
1429 | */ | ||
1430 | if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { | ||
1431 | printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | ||
1432 | printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 | ||
1437 | * or reg_03, but the value of reg_0[23] is read as the previous read | ||
1438 | * register value, so ignore it if reg_03 == reg_0[12]. | ||
1439 | */ | ||
1440 | if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && | ||
1441 | reg_03.raw != reg_01.raw) { | ||
1442 | printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); | ||
1443 | printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); | ||
1444 | } | ||
1445 | |||
1446 | printk(KERN_DEBUG ".... IRQ redirection table:\n"); | ||
1447 | |||
1448 | printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" | ||
1449 | " Stat Dest Deli Vect: \n"); | ||
1450 | |||
1451 | for (i = 0; i <= reg_01.bits.entries; i++) { | ||
1452 | struct IO_APIC_route_entry entry; | ||
1453 | |||
1454 | entry = ioapic_read_entry(apic, i); | ||
1455 | |||
1456 | printk(KERN_DEBUG " %02x %03X %02X ", | ||
1457 | i, | ||
1458 | entry.dest.logical.logical_dest, | ||
1459 | entry.dest.physical.physical_dest | ||
1460 | ); | ||
1461 | |||
1462 | printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", | ||
1463 | entry.mask, | ||
1464 | entry.trigger, | ||
1465 | entry.irr, | ||
1466 | entry.polarity, | ||
1467 | entry.delivery_status, | ||
1468 | entry.dest_mode, | ||
1469 | entry.delivery_mode, | ||
1470 | entry.vector | ||
1471 | ); | ||
1472 | } | ||
1473 | } | ||
1474 | printk(KERN_DEBUG "IRQ to pin mappings:\n"); | ||
1475 | for (i = 0; i < NR_IRQS; i++) { | ||
1476 | struct irq_pin_list *entry = irq_2_pin + i; | ||
1477 | if (entry->pin < 0) | ||
1478 | continue; | ||
1479 | printk(KERN_DEBUG "IRQ%d ", i); | ||
1480 | for (;;) { | ||
1481 | printk("-> %d:%d", entry->apic, entry->pin); | ||
1482 | if (!entry->next) | ||
1483 | break; | ||
1484 | entry = irq_2_pin + entry->next; | ||
1485 | } | ||
1486 | printk("\n"); | ||
1487 | } | ||
1488 | |||
1489 | printk(KERN_INFO ".................................... done.\n"); | ||
1490 | |||
1491 | return; | ||
1492 | } | ||
1493 | |||
1494 | #if 0 | ||
1495 | |||
1496 | static void print_APIC_bitfield (int base) | ||
1497 | { | ||
1498 | unsigned int v; | ||
1499 | int i, j; | ||
1500 | |||
1501 | if (apic_verbosity == APIC_QUIET) | ||
1502 | return; | ||
1503 | |||
1504 | printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | ||
1505 | for (i = 0; i < 8; i++) { | ||
1506 | v = apic_read(base + i*0x10); | ||
1507 | for (j = 0; j < 32; j++) { | ||
1508 | if (v & (1<<j)) | ||
1509 | printk("1"); | ||
1510 | else | ||
1511 | printk("0"); | ||
1512 | } | ||
1513 | printk("\n"); | ||
1514 | } | ||
1515 | } | ||
1516 | |||
1517 | void /*__init*/ print_local_APIC(void * dummy) | ||
1518 | { | ||
1519 | unsigned int v, ver, maxlvt; | ||
1520 | |||
1521 | if (apic_verbosity == APIC_QUIET) | ||
1522 | return; | ||
1523 | |||
1524 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | ||
1525 | smp_processor_id(), hard_smp_processor_id()); | ||
1526 | v = apic_read(APIC_ID); | ||
1527 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); | ||
1528 | v = apic_read(APIC_LVR); | ||
1529 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | ||
1530 | ver = GET_APIC_VERSION(v); | ||
1531 | maxlvt = lapic_get_maxlvt(); | ||
1532 | |||
1533 | v = apic_read(APIC_TASKPRI); | ||
1534 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | ||
1535 | |||
1536 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
1537 | v = apic_read(APIC_ARBPRI); | ||
1538 | printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | ||
1539 | v & APIC_ARBPRI_MASK); | ||
1540 | v = apic_read(APIC_PROCPRI); | ||
1541 | printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | ||
1542 | } | ||
1543 | |||
1544 | v = apic_read(APIC_EOI); | ||
1545 | printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | ||
1546 | v = apic_read(APIC_RRR); | ||
1547 | printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | ||
1548 | v = apic_read(APIC_LDR); | ||
1549 | printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | ||
1550 | v = apic_read(APIC_DFR); | ||
1551 | printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | ||
1552 | v = apic_read(APIC_SPIV); | ||
1553 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | ||
1554 | |||
1555 | printk(KERN_DEBUG "... APIC ISR field:\n"); | ||
1556 | print_APIC_bitfield(APIC_ISR); | ||
1557 | printk(KERN_DEBUG "... APIC TMR field:\n"); | ||
1558 | print_APIC_bitfield(APIC_TMR); | ||
1559 | printk(KERN_DEBUG "... APIC IRR field:\n"); | ||
1560 | print_APIC_bitfield(APIC_IRR); | ||
1561 | |||
1562 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
1563 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
1564 | apic_write(APIC_ESR, 0); | ||
1565 | v = apic_read(APIC_ESR); | ||
1566 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | ||
1567 | } | ||
1568 | |||
1569 | v = apic_read(APIC_ICR); | ||
1570 | printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | ||
1571 | v = apic_read(APIC_ICR2); | ||
1572 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | ||
1573 | |||
1574 | v = apic_read(APIC_LVTT); | ||
1575 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | ||
1576 | |||
1577 | if (maxlvt > 3) { /* PC is LVT#4. */ | ||
1578 | v = apic_read(APIC_LVTPC); | ||
1579 | printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); | ||
1580 | } | ||
1581 | v = apic_read(APIC_LVT0); | ||
1582 | printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); | ||
1583 | v = apic_read(APIC_LVT1); | ||
1584 | printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); | ||
1585 | |||
1586 | if (maxlvt > 2) { /* ERR is LVT#3. */ | ||
1587 | v = apic_read(APIC_LVTERR); | ||
1588 | printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); | ||
1589 | } | ||
1590 | |||
1591 | v = apic_read(APIC_TMICT); | ||
1592 | printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); | ||
1593 | v = apic_read(APIC_TMCCT); | ||
1594 | printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | ||
1595 | v = apic_read(APIC_TDCR); | ||
1596 | printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | ||
1597 | printk("\n"); | ||
1598 | } | ||
1599 | |||
/* Run print_local_APIC() on every CPU via on_each_cpu(). */
void print_all_local_APICs (void)
{
	on_each_cpu(print_local_APIC, NULL, 1, 1);
}
1604 | |||
1605 | void /*__init*/ print_PIC(void) | ||
1606 | { | ||
1607 | unsigned int v; | ||
1608 | unsigned long flags; | ||
1609 | |||
1610 | if (apic_verbosity == APIC_QUIET) | ||
1611 | return; | ||
1612 | |||
1613 | printk(KERN_DEBUG "\nprinting PIC contents\n"); | ||
1614 | |||
1615 | spin_lock_irqsave(&i8259A_lock, flags); | ||
1616 | |||
1617 | v = inb(0xa1) << 8 | inb(0x21); | ||
1618 | printk(KERN_DEBUG "... PIC IMR: %04x\n", v); | ||
1619 | |||
1620 | v = inb(0xa0) << 8 | inb(0x20); | ||
1621 | printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | ||
1622 | |||
1623 | outb(0x0b,0xa0); | ||
1624 | outb(0x0b,0x20); | ||
1625 | v = inb(0xa0) << 8 | inb(0x20); | ||
1626 | outb(0x0a,0xa0); | ||
1627 | outb(0x0a,0x20); | ||
1628 | |||
1629 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
1630 | |||
1631 | printk(KERN_DEBUG "... PIC ISR: %04x\n", v); | ||
1632 | |||
1633 | v = inb(0x4d1) << 8 | inb(0x4d0); | ||
1634 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | ||
1635 | } | ||
1636 | |||
1637 | #endif /* 0 */ | ||
1638 | |||
1639 | static void __init enable_IO_APIC(void) | ||
1640 | { | ||
1641 | union IO_APIC_reg_01 reg_01; | ||
1642 | int i8259_apic, i8259_pin; | ||
1643 | int i, apic; | ||
1644 | unsigned long flags; | ||
1645 | |||
1646 | for (i = 0; i < PIN_MAP_SIZE; i++) { | ||
1647 | irq_2_pin[i].pin = -1; | ||
1648 | irq_2_pin[i].next = 0; | ||
1649 | } | ||
1650 | if (!pirqs_enabled) | ||
1651 | for (i = 0; i < MAX_PIRQS; i++) | ||
1652 | pirq_entries[i] = -1; | ||
1653 | |||
1654 | /* | ||
1655 | * The number of IO-APIC IRQ registers (== #pins): | ||
1656 | */ | ||
1657 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1658 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1659 | reg_01.raw = io_apic_read(apic, 1); | ||
1660 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1661 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; | ||
1662 | } | ||
1663 | for(apic = 0; apic < nr_ioapics; apic++) { | ||
1664 | int pin; | ||
1665 | /* See if any of the pins is in ExtINT mode */ | ||
1666 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1667 | struct IO_APIC_route_entry entry; | ||
1668 | entry = ioapic_read_entry(apic, pin); | ||
1669 | |||
1670 | |||
1671 | /* If the interrupt line is enabled and in ExtInt mode | ||
1672 | * I have found the pin where the i8259 is connected. | ||
1673 | */ | ||
1674 | if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { | ||
1675 | ioapic_i8259.apic = apic; | ||
1676 | ioapic_i8259.pin = pin; | ||
1677 | goto found_i8259; | ||
1678 | } | ||
1679 | } | ||
1680 | } | ||
1681 | found_i8259: | ||
1682 | /* Look to see what if the MP table has reported the ExtINT */ | ||
1683 | /* If we could not find the appropriate pin by looking at the ioapic | ||
1684 | * the i8259 probably is not connected the ioapic but give the | ||
1685 | * mptable a chance anyway. | ||
1686 | */ | ||
1687 | i8259_pin = find_isa_irq_pin(0, mp_ExtINT); | ||
1688 | i8259_apic = find_isa_irq_apic(0, mp_ExtINT); | ||
1689 | /* Trust the MP table if nothing is setup in the hardware */ | ||
1690 | if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { | ||
1691 | printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); | ||
1692 | ioapic_i8259.pin = i8259_pin; | ||
1693 | ioapic_i8259.apic = i8259_apic; | ||
1694 | } | ||
1695 | /* Complain if the MP table and the hardware disagree */ | ||
1696 | if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && | ||
1697 | (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) | ||
1698 | { | ||
1699 | printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); | ||
1700 | } | ||
1701 | |||
1702 | /* | ||
1703 | * Do not trust the IO-APIC being empty at bootup | ||
1704 | */ | ||
1705 | clear_IO_APIC(); | ||
1706 | } | ||
1707 | |||
1708 | /* | ||
1709 | * Not an __init, needed by the reboot code | ||
1710 | */ | ||
1711 | void disable_IO_APIC(void) | ||
1712 | { | ||
1713 | /* | ||
1714 | * Clear the IO-APIC before rebooting: | ||
1715 | */ | ||
1716 | clear_IO_APIC(); | ||
1717 | |||
1718 | /* | ||
1719 | * If the i8259 is routed through an IOAPIC | ||
1720 | * Put that IOAPIC in virtual wire mode | ||
1721 | * so legacy interrupts can be delivered. | ||
1722 | */ | ||
1723 | if (ioapic_i8259.pin != -1) { | ||
1724 | struct IO_APIC_route_entry entry; | ||
1725 | |||
1726 | memset(&entry, 0, sizeof(entry)); | ||
1727 | entry.mask = 0; /* Enabled */ | ||
1728 | entry.trigger = 0; /* Edge */ | ||
1729 | entry.irr = 0; | ||
1730 | entry.polarity = 0; /* High */ | ||
1731 | entry.delivery_status = 0; | ||
1732 | entry.dest_mode = 0; /* Physical */ | ||
1733 | entry.delivery_mode = dest_ExtINT; /* ExtInt */ | ||
1734 | entry.vector = 0; | ||
1735 | entry.dest.physical.physical_dest = | ||
1736 | GET_APIC_ID(apic_read(APIC_ID)); | ||
1737 | |||
1738 | /* | ||
1739 | * Add it to the IO-APIC irq-routing table: | ||
1740 | */ | ||
1741 | ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); | ||
1742 | } | ||
1743 | disconnect_bsp_APIC(ioapic_i8259.pin != -1); | ||
1744 | } | ||
1745 | |||
1746 | /* | ||
1747 | * function to set the IO-APIC physical IDs based on the | ||
1748 | * values stored in the MPC table. | ||
1749 | * | ||
1750 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | ||
1751 | */ | ||
1752 | |||
1753 | #ifndef CONFIG_X86_NUMAQ | ||
1754 | static void __init setup_ioapic_ids_from_mpc(void) | ||
1755 | { | ||
1756 | union IO_APIC_reg_00 reg_00; | ||
1757 | physid_mask_t phys_id_present_map; | ||
1758 | int apic; | ||
1759 | int i; | ||
1760 | unsigned char old_id; | ||
1761 | unsigned long flags; | ||
1762 | |||
1763 | /* | ||
1764 | * Don't check I/O APIC IDs for xAPIC systems. They have | ||
1765 | * no meaning without the serial APIC bus. | ||
1766 | */ | ||
1767 | if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
1768 | || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
1769 | return; | ||
1770 | /* | ||
1771 | * This is broken; anything with a real cpu count has to | ||
1772 | * circumvent this idiocy regardless. | ||
1773 | */ | ||
1774 | phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); | ||
1775 | |||
1776 | /* | ||
1777 | * Set the IOAPIC ID to the value stored in the MPC table. | ||
1778 | */ | ||
1779 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1780 | |||
1781 | /* Read the register 0 value */ | ||
1782 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1783 | reg_00.raw = io_apic_read(apic, 0); | ||
1784 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1785 | |||
1786 | old_id = mp_ioapics[apic].mpc_apicid; | ||
1787 | |||
1788 | if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { | ||
1789 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", | ||
1790 | apic, mp_ioapics[apic].mpc_apicid); | ||
1791 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | ||
1792 | reg_00.bits.ID); | ||
1793 | mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; | ||
1794 | } | ||
1795 | |||
1796 | /* | ||
1797 | * Sanity check, is the ID really free? Every APIC in a | ||
1798 | * system must have a unique ID or we get lots of nice | ||
1799 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
1800 | */ | ||
1801 | if (check_apicid_used(phys_id_present_map, | ||
1802 | mp_ioapics[apic].mpc_apicid)) { | ||
1803 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", | ||
1804 | apic, mp_ioapics[apic].mpc_apicid); | ||
1805 | for (i = 0; i < get_physical_broadcast(); i++) | ||
1806 | if (!physid_isset(i, phys_id_present_map)) | ||
1807 | break; | ||
1808 | if (i >= get_physical_broadcast()) | ||
1809 | panic("Max APIC ID exceeded!\n"); | ||
1810 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | ||
1811 | i); | ||
1812 | physid_set(i, phys_id_present_map); | ||
1813 | mp_ioapics[apic].mpc_apicid = i; | ||
1814 | } else { | ||
1815 | physid_mask_t tmp; | ||
1816 | tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); | ||
1817 | apic_printk(APIC_VERBOSE, "Setting %d in the " | ||
1818 | "phys_id_present_map\n", | ||
1819 | mp_ioapics[apic].mpc_apicid); | ||
1820 | physids_or(phys_id_present_map, phys_id_present_map, tmp); | ||
1821 | } | ||
1822 | |||
1823 | |||
1824 | /* | ||
1825 | * We need to adjust the IRQ routing table | ||
1826 | * if the ID changed. | ||
1827 | */ | ||
1828 | if (old_id != mp_ioapics[apic].mpc_apicid) | ||
1829 | for (i = 0; i < mp_irq_entries; i++) | ||
1830 | if (mp_irqs[i].mpc_dstapic == old_id) | ||
1831 | mp_irqs[i].mpc_dstapic | ||
1832 | = mp_ioapics[apic].mpc_apicid; | ||
1833 | |||
1834 | /* | ||
1835 | * Read the right value from the MPC table and | ||
1836 | * write it into the ID register. | ||
1837 | */ | ||
1838 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
1839 | "...changing IO-APIC physical APIC ID to %d ...", | ||
1840 | mp_ioapics[apic].mpc_apicid); | ||
1841 | |||
1842 | reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; | ||
1843 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1844 | io_apic_write(apic, 0, reg_00.raw); | ||
1845 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1846 | |||
1847 | /* | ||
1848 | * Sanity check | ||
1849 | */ | ||
1850 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1851 | reg_00.raw = io_apic_read(apic, 0); | ||
1852 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1853 | if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) | ||
1854 | printk("could not set ID!\n"); | ||
1855 | else | ||
1856 | apic_printk(APIC_VERBOSE, " ok.\n"); | ||
1857 | } | ||
1858 | } | ||
1859 | #else | ||
1860 | static void __init setup_ioapic_ids_from_mpc(void) { } | ||
1861 | #endif | ||
1862 | |||
1863 | int no_timer_check __initdata; | ||
1864 | |||
1865 | static int __init notimercheck(char *s) | ||
1866 | { | ||
1867 | no_timer_check = 1; | ||
1868 | return 1; | ||
1869 | } | ||
1870 | __setup("no_timer_check", notimercheck); | ||
1871 | |||
1872 | /* | ||
1873 | * There is a nasty bug in some older SMP boards, their mptable lies | ||
1874 | * about the timer IRQ. We do the following to work around the situation: | ||
1875 | * | ||
1876 | * - timer IRQ defaults to IO-APIC IRQ | ||
1877 | * - if this function detects that timer IRQs are defunct, then we fall | ||
1878 | * back to ISA timer IRQs | ||
1879 | */ | ||
1880 | static int __init timer_irq_works(void) | ||
1881 | { | ||
1882 | unsigned long t1 = jiffies; | ||
1883 | |||
1884 | if (no_timer_check) | ||
1885 | return 1; | ||
1886 | |||
1887 | local_irq_enable(); | ||
1888 | /* Let ten ticks pass... */ | ||
1889 | mdelay((10 * 1000) / HZ); | ||
1890 | |||
1891 | /* | ||
1892 | * Expect a few ticks at least, to be sure some possible | ||
1893 | * glue logic does not lock up after one or two first | ||
1894 | * ticks in a non-ExtINT mode. Also the local APIC | ||
1895 | * might have cached one ExtINT interrupt. Finally, at | ||
1896 | * least one tick may be lost due to delays. | ||
1897 | */ | ||
1898 | if (jiffies - t1 > 4) | ||
1899 | return 1; | ||
1900 | |||
1901 | return 0; | ||
1902 | } | ||
1903 | |||
1904 | /* | ||
1905 | * In the SMP+IOAPIC case it might happen that there are an unspecified | ||
1906 | * number of pending IRQ events unhandled. These cases are very rare, | ||
1907 | * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | ||
1908 | * better to do it this way as thus we do not have to be aware of | ||
1909 | * 'pending' interrupts in the IRQ path, except at this point. | ||
1910 | */ | ||
1911 | /* | ||
1912 | * Edge triggered needs to resend any interrupt | ||
1913 | * that was delayed but this is now handled in the device | ||
1914 | * independent code. | ||
1915 | */ | ||
1916 | |||
1917 | /* | ||
1918 | * Startup quirk: | ||
1919 | * | ||
1920 | * Starting up a edge-triggered IO-APIC interrupt is | ||
1921 | * nasty - we need to make sure that we get the edge. | ||
1922 | * If it is already asserted for some reason, we need | ||
1923 | * return 1 to indicate that is was pending. | ||
1924 | * | ||
1925 | * This is not complete - we should be able to fake | ||
1926 | * an edge even if it isn't on the 8259A... | ||
1927 | * | ||
1928 | * (We do this for level-triggered IRQs too - it cannot hurt.) | ||
1929 | */ | ||
1930 | static unsigned int startup_ioapic_irq(unsigned int irq) | ||
1931 | { | ||
1932 | int was_pending = 0; | ||
1933 | unsigned long flags; | ||
1934 | |||
1935 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1936 | if (irq < 16) { | ||
1937 | disable_8259A_irq(irq); | ||
1938 | if (i8259A_irq_pending(irq)) | ||
1939 | was_pending = 1; | ||
1940 | } | ||
1941 | __unmask_IO_APIC_irq(irq); | ||
1942 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1943 | |||
1944 | return was_pending; | ||
1945 | } | ||
1946 | |||
/*
 * Acknowledge handler: calls move_native_irq() for the IRQ and then
 * acknowledges the interrupt at the local APIC.
 */
static void ack_ioapic_irq(unsigned int irq)
{
	move_native_irq(irq);
	ack_APIC_irq();
}
1952 | |||
1953 | static void ack_ioapic_quirk_irq(unsigned int irq) | ||
1954 | { | ||
1955 | unsigned long v; | ||
1956 | int i; | ||
1957 | |||
1958 | move_native_irq(irq); | ||
1959 | /* | ||
1960 | * It appears there is an erratum which affects at least version 0x11 | ||
1961 | * of I/O APIC (that's the 82093AA and cores integrated into various | ||
1962 | * chipsets). Under certain conditions a level-triggered interrupt is | ||
1963 | * erroneously delivered as edge-triggered one but the respective IRR | ||
1964 | * bit gets set nevertheless. As a result the I/O unit expects an EOI | ||
1965 | * message but it will never arrive and further interrupts are blocked | ||
1966 | * from the source. The exact reason is so far unknown, but the | ||
1967 | * phenomenon was observed when two consecutive interrupt requests | ||
1968 | * from a given source get delivered to the same CPU and the source is | ||
1969 | * temporarily disabled in between. | ||
1970 | * | ||
1971 | * A workaround is to simulate an EOI message manually. We achieve it | ||
1972 | * by setting the trigger mode to edge and then to level when the edge | ||
1973 | * trigger mode gets detected in the TMR of a local APIC for a | ||
1974 | * level-triggered interrupt. We mask the source for the time of the | ||
1975 | * operation to prevent an edge-triggered interrupt escaping meanwhile. | ||
1976 | * The idea is from Manfred Spraul. --macro | ||
1977 | */ | ||
1978 | i = irq_vector[irq]; | ||
1979 | |||
1980 | v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); | ||
1981 | |||
1982 | ack_APIC_irq(); | ||
1983 | |||
1984 | if (!(v & (1 << (i & 0x1f)))) { | ||
1985 | atomic_inc(&irq_mis_count); | ||
1986 | spin_lock(&ioapic_lock); | ||
1987 | __mask_and_edge_IO_APIC_irq(irq); | ||
1988 | __unmask_and_level_IO_APIC_irq(irq); | ||
1989 | spin_unlock(&ioapic_lock); | ||
1990 | } | ||
1991 | } | ||
1992 | |||
1993 | static int ioapic_retrigger_irq(unsigned int irq) | ||
1994 | { | ||
1995 | send_IPI_self(irq_vector[irq]); | ||
1996 | |||
1997 | return 1; | ||
1998 | } | ||
1999 | |||
/*
 * irq_chip callbacks for interrupts routed through an IO-APIC pin.
 * .ack uses the plain acknowledge, .eoi the variant that carries the
 * level/edge erratum workaround.  Affinity setting exists only on SMP builds.
 */
2000 | static struct irq_chip ioapic_chip __read_mostly = { | ||
2001 | .name = "IO-APIC", | ||
2002 | .startup = startup_ioapic_irq, | ||
2003 | .mask = mask_IO_APIC_irq, | ||
2004 | .unmask = unmask_IO_APIC_irq, | ||
2005 | .ack = ack_ioapic_irq, | ||
2006 | .eoi = ack_ioapic_quirk_irq, | ||
2007 | #ifdef CONFIG_SMP | ||
2008 | .set_affinity = set_ioapic_affinity_irq, | ||
2009 | #endif | ||
2010 | .retrigger = ioapic_retrigger_irq, | ||
2011 | }; | ||
2012 | |||
2013 | |||
2014 | static inline void init_IO_APIC_traps(void) | ||
2015 | { | ||
2016 | int irq; | ||
2017 | |||
2018 | /* | ||
2019 | * NOTE! The local APIC isn't very good at handling | ||
2020 | * multiple interrupts at the same interrupt level. | ||
2021 | * As the interrupt level is determined by taking the | ||
2022 | * vector number and shifting that right by 4, we | ||
2023 | * want to spread these out a bit so that they don't | ||
2024 | * all fall in the same interrupt level. | ||
2025 | * | ||
2026 | * Also, we've got to be careful not to trash gate | ||
2027 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
2028 | */ | ||
2029 | for (irq = 0; irq < NR_IRQS ; irq++) { | ||
2030 | int tmp = irq; | ||
2031 | if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { | ||
2032 | /* | ||
2033 | * Hmm.. We don't have an entry for this, | ||
2034 | * so default to an old-fashioned 8259 | ||
2035 | * interrupt if we can.. | ||
2036 | */ | ||
2037 | if (irq < 16) | ||
2038 | make_8259A_irq(irq); | ||
2039 | else | ||
2040 | /* Strange. Oh, well.. */ | ||
2041 | irq_desc[irq].chip = &no_irq_chip; | ||
2042 | } | ||
2043 | } | ||
2044 | } | ||
2045 | |||
2046 | /* | ||
2047 | * The local APIC irq-chip implementation: | ||
2048 | */ | ||
2049 | |||
/* EOI callback of the local APIC chip: acknowledge at the local APIC; irq is unused. */
static void ack_apic(unsigned int irq)
{
	ack_APIC_irq();
}
2054 | |||
2055 | static void mask_lapic_irq (unsigned int irq) | ||
2056 | { | ||
2057 | unsigned long v; | ||
2058 | |||
2059 | v = apic_read(APIC_LVT0); | ||
2060 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | ||
2061 | } | ||
2062 | |||
2063 | static void unmask_lapic_irq (unsigned int irq) | ||
2064 | { | ||
2065 | unsigned long v; | ||
2066 | |||
2067 | v = apic_read(APIC_LVT0); | ||
2068 | apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); | ||
2069 | } | ||
2070 | |||
/*
 * irq_chip whose mask/unmask callbacks operate on the local APIC LVT0
 * entry and whose EOI callback acknowledges at the local APIC.
 */
2071 | static struct irq_chip lapic_chip __read_mostly = { | ||
2072 | .name = "local-APIC-edge", | ||
2073 | .mask = mask_lapic_irq, | ||
2074 | .unmask = unmask_lapic_irq, | ||
2075 | .eoi = ack_apic, | ||
2076 | }; | ||
2077 | |||
2078 | static void setup_nmi (void) | ||
2079 | { | ||
2080 | /* | ||
2081 | * Dirty trick to enable the NMI watchdog ... | ||
2082 | * We put the 8259A master into AEOI mode and | ||
2083 | * unmask on all local APICs LVT0 as NMI. | ||
2084 | * | ||
2085 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | ||
2086 | * is from Maciej W. Rozycki - so we do not have to EOI from | ||
2087 | * the NMI handler or the timer interrupt. | ||
2088 | */ | ||
2089 | apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); | ||
2090 | |||
2091 | on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); | ||
2092 | |||
2093 | apic_printk(APIC_VERBOSE, " done.\n"); | ||
2094 | } | ||
2095 | |||
2096 | /* | ||
2097 | * This looks a bit hackish but it's about the only one way of sending | ||
2098 | * a few INTA cycles to 8259As and any associated glue logic. ICR does | ||
2099 | * not support the ExtINT mode, unfortunately. We need to send these | ||
2100 | * cycles as some i82489DX-based boards have glue logic that keeps the | ||
2101 | * 8259A interrupt line asserted until INTA. --macro | ||
2102 | */ | ||
2103 | static inline void unlock_ExtINT_logic(void) | ||
2104 | { | ||
2105 | int apic, pin, i; | ||
2106 | struct IO_APIC_route_entry entry0, entry1; | ||
2107 | unsigned char save_control, save_freq_select; | ||
2108 | |||
2109 | pin = find_isa_irq_pin(8, mp_INT); | ||
2110 | if (pin == -1) { | ||
2111 | WARN_ON_ONCE(1); | ||
2112 | return; | ||
2113 | } | ||
2114 | apic = find_isa_irq_apic(8, mp_INT); | ||
2115 | if (apic == -1) { | ||
2116 | WARN_ON_ONCE(1); | ||
2117 | return; | ||
2118 | } | ||
2119 | |||
2120 | entry0 = ioapic_read_entry(apic, pin); | ||
2121 | clear_IO_APIC_pin(apic, pin); | ||
2122 | |||
2123 | memset(&entry1, 0, sizeof(entry1)); | ||
2124 | |||
2125 | entry1.dest_mode = 0; /* physical delivery */ | ||
2126 | entry1.mask = 0; /* unmask IRQ now */ | ||
2127 | entry1.dest.physical.physical_dest = hard_smp_processor_id(); | ||
2128 | entry1.delivery_mode = dest_ExtINT; | ||
2129 | entry1.polarity = entry0.polarity; | ||
2130 | entry1.trigger = 0; | ||
2131 | entry1.vector = 0; | ||
2132 | |||
2133 | ioapic_write_entry(apic, pin, entry1); | ||
2134 | |||
2135 | save_control = CMOS_READ(RTC_CONTROL); | ||
2136 | save_freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
2137 | CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, | ||
2138 | RTC_FREQ_SELECT); | ||
2139 | CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); | ||
2140 | |||
2141 | i = 100; | ||
2142 | while (i-- > 0) { | ||
2143 | mdelay(10); | ||
2144 | if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) | ||
2145 | i -= 10; | ||
2146 | } | ||
2147 | |||
2148 | CMOS_WRITE(save_control, RTC_CONTROL); | ||
2149 | CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | ||
2150 | clear_IO_APIC_pin(apic, pin); | ||
2151 | |||
2152 | ioapic_write_entry(apic, pin, entry0); | ||
2153 | } | ||
2154 | |||
2155 | int timer_uses_ioapic_pin_0; | ||
2156 | |||
2157 | /* | ||
2158 | * This code may look a bit paranoid, but it's supposed to cooperate with | ||
2159 | * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | ||
2160 | * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | ||
2161 | * fanatically on his truly buggy board. | ||
2162 | */ | ||
2163 | static inline void __init check_timer(void) | ||
2164 | { | ||
2165 | int apic1, pin1, apic2, pin2; | ||
2166 | int vector; | ||
2167 | |||
2168 | /* | ||
2169 | * get/set the timer IRQ vector: | ||
2170 | */ | ||
2171 | disable_8259A_irq(0); | ||
2172 | vector = assign_irq_vector(0); | ||
2173 | set_intr_gate(vector, interrupt[0]); | ||
2174 | |||
2175 | /* | ||
2176 | * Subtle, code in do_timer_interrupt() expects an AEOI | ||
2177 | * mode for the 8259A whenever interrupts are routed | ||
2178 | * through I/O APICs. Also IRQ0 has to be enabled in | ||
2179 | * the 8259A which implies the virtual wire has to be | ||
2180 | * disabled in the local APIC. | ||
2181 | */ | ||
2182 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
2183 | init_8259A(1); | ||
2184 | timer_ack = 1; | ||
2185 | if (timer_over_8254 > 0) | ||
2186 | enable_8259A_irq(0); | ||
2187 | |||
2188 | pin1 = find_isa_irq_pin(0, mp_INT); | ||
2189 | apic1 = find_isa_irq_apic(0, mp_INT); | ||
2190 | pin2 = ioapic_i8259.pin; | ||
2191 | apic2 = ioapic_i8259.apic; | ||
2192 | |||
2193 | if (pin1 == 0) | ||
2194 | timer_uses_ioapic_pin_0 = 1; | ||
2195 | |||
2196 | printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", | ||
2197 | vector, apic1, pin1, apic2, pin2); | ||
2198 | |||
2199 | if (pin1 != -1) { | ||
2200 | /* | ||
2201 | * Ok, does IRQ0 through the IOAPIC work? | ||
2202 | */ | ||
2203 | unmask_IO_APIC_irq(0); | ||
2204 | if (timer_irq_works()) { | ||
2205 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2206 | disable_8259A_irq(0); | ||
2207 | setup_nmi(); | ||
2208 | enable_8259A_irq(0); | ||
2209 | } | ||
2210 | if (disable_timer_pin_1 > 0) | ||
2211 | clear_IO_APIC_pin(0, pin1); | ||
2212 | return; | ||
2213 | } | ||
2214 | clear_IO_APIC_pin(apic1, pin1); | ||
2215 | printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " | ||
2216 | "IO-APIC\n"); | ||
2217 | } | ||
2218 | |||
2219 | printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); | ||
2220 | if (pin2 != -1) { | ||
2221 | printk("\n..... (found pin %d) ...", pin2); | ||
2222 | /* | ||
2223 | * legacy devices should be connected to IO APIC #0 | ||
2224 | */ | ||
2225 | setup_ExtINT_IRQ0_pin(apic2, pin2, vector); | ||
2226 | if (timer_irq_works()) { | ||
2227 | printk("works.\n"); | ||
2228 | if (pin1 != -1) | ||
2229 | replace_pin_at_irq(0, apic1, pin1, apic2, pin2); | ||
2230 | else | ||
2231 | add_pin_to_irq(0, apic2, pin2); | ||
2232 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2233 | setup_nmi(); | ||
2234 | } | ||
2235 | return; | ||
2236 | } | ||
2237 | /* | ||
2238 | * Cleanup, just in case ... | ||
2239 | */ | ||
2240 | clear_IO_APIC_pin(apic2, pin2); | ||
2241 | } | ||
2242 | printk(" failed.\n"); | ||
2243 | |||
2244 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2245 | printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | ||
2246 | nmi_watchdog = 0; | ||
2247 | } | ||
2248 | |||
2249 | printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | ||
2250 | |||
2251 | disable_8259A_irq(0); | ||
2252 | set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, | ||
2253 | "fasteoi"); | ||
2254 | apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ | ||
2255 | enable_8259A_irq(0); | ||
2256 | |||
2257 | if (timer_irq_works()) { | ||
2258 | printk(" works.\n"); | ||
2259 | return; | ||
2260 | } | ||
2261 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); | ||
2262 | printk(" failed.\n"); | ||
2263 | |||
2264 | printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | ||
2265 | |||
2266 | timer_ack = 0; | ||
2267 | init_8259A(0); | ||
2268 | make_8259A_irq(0); | ||
2269 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | ||
2270 | |||
2271 | unlock_ExtINT_logic(); | ||
2272 | |||
2273 | if (timer_irq_works()) { | ||
2274 | printk(" works.\n"); | ||
2275 | return; | ||
2276 | } | ||
2277 | printk(" failed :(.\n"); | ||
2278 | panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " | ||
2279 | "report. Then try booting with the 'noapic' option"); | ||
2280 | } | ||
2281 | |||
2282 | /* | ||
2283 | * | ||
2284 | * IRQ's that are handled by the PIC in the MPS IOAPIC case. | ||
2285 | * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. | ||
2286 | * Linux doesn't really care, as it's not actually used | ||
2287 | * for any interrupt handling anyway. | ||
2288 | */ | ||
2289 | #define PIC_IRQS (1 << PIC_CASCADE_IR) | ||
2290 | |||
2291 | void __init setup_IO_APIC(void) | ||
2292 | { | ||
2293 | enable_IO_APIC(); | ||
2294 | |||
2295 | if (acpi_ioapic) | ||
2296 | io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | ||
2297 | else | ||
2298 | io_apic_irqs = ~PIC_IRQS; | ||
2299 | |||
2300 | printk("ENABLING IO-APIC IRQs\n"); | ||
2301 | |||
2302 | /* | ||
2303 | * Set up IO-APIC IRQ routing. | ||
2304 | */ | ||
2305 | if (!acpi_ioapic) | ||
2306 | setup_ioapic_ids_from_mpc(); | ||
2307 | sync_Arb_IDs(); | ||
2308 | setup_IO_APIC_irqs(); | ||
2309 | init_IO_APIC_traps(); | ||
2310 | check_timer(); | ||
2311 | if (!acpi_ioapic) | ||
2312 | print_IO_APIC(); | ||
2313 | } | ||
2314 | |||
2315 | static int __init setup_disable_8254_timer(char *s) | ||
2316 | { | ||
2317 | timer_over_8254 = -1; | ||
2318 | return 1; | ||
2319 | } | ||
2320 | static int __init setup_enable_8254_timer(char *s) | ||
2321 | { | ||
2322 | timer_over_8254 = 2; | ||
2323 | return 1; | ||
2324 | } | ||
2325 | |||
2326 | __setup("disable_8254_timer", setup_disable_8254_timer); | ||
2327 | __setup("enable_8254_timer", setup_enable_8254_timer); | ||
2328 | |||
2329 | /* | ||
2330 | * Called after all the initialization is done. If we didnt find any | ||
2331 | * APIC bugs then we can allow the modify fast path | ||
2332 | */ | ||
2333 | |||
2334 | static int __init io_apic_bug_finalize(void) | ||
2335 | { | ||
2336 | if(sis_apic_bug == -1) | ||
2337 | sis_apic_bug = 0; | ||
2338 | return 0; | ||
2339 | } | ||
2340 | |||
2341 | late_initcall(io_apic_bug_finalize); | ||
2342 | |||
/*
 * Per-IO-APIC suspend/resume state: the sysdev object followed by a
 * variable-length array of saved routing entries.  ioapic_init_sysfs()
 * sizes the array from nr_ioapic_registers[] for that IO-APIC.
 */
2343 | struct sysfs_ioapic_data { | ||
2344 | struct sys_device dev; | ||
2345 | struct IO_APIC_route_entry entry[0]; | ||
2346 | }; | ||
/* One slot per IO-APIC; a slot stays NULL if allocation or registration failed. */
2347 | static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; | ||
2348 | |||
2349 | static int ioapic_suspend(struct sys_device *dev, pm_message_t state) | ||
2350 | { | ||
2351 | struct IO_APIC_route_entry *entry; | ||
2352 | struct sysfs_ioapic_data *data; | ||
2353 | int i; | ||
2354 | |||
2355 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
2356 | entry = data->entry; | ||
2357 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) | ||
2358 | entry[i] = ioapic_read_entry(dev->id, i); | ||
2359 | |||
2360 | return 0; | ||
2361 | } | ||
2362 | |||
2363 | static int ioapic_resume(struct sys_device *dev) | ||
2364 | { | ||
2365 | struct IO_APIC_route_entry *entry; | ||
2366 | struct sysfs_ioapic_data *data; | ||
2367 | unsigned long flags; | ||
2368 | union IO_APIC_reg_00 reg_00; | ||
2369 | int i; | ||
2370 | |||
2371 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
2372 | entry = data->entry; | ||
2373 | |||
2374 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2375 | reg_00.raw = io_apic_read(dev->id, 0); | ||
2376 | if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | ||
2377 | reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | ||
2378 | io_apic_write(dev->id, 0, reg_00.raw); | ||
2379 | } | ||
2380 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2381 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) | ||
2382 | ioapic_write_entry(dev->id, i, entry[i]); | ||
2383 | |||
2384 | return 0; | ||
2385 | } | ||
2386 | |||
/* sysdev class named "ioapic" that wires up the suspend/resume callbacks. */
2387 | static struct sysdev_class ioapic_sysdev_class = { | ||
2388 | set_kset_name("ioapic"), | ||
2389 | .suspend = ioapic_suspend, | ||
2390 | .resume = ioapic_resume, | ||
2391 | }; | ||
2392 | |||
2393 | static int __init ioapic_init_sysfs(void) | ||
2394 | { | ||
2395 | struct sys_device * dev; | ||
2396 | int i, size, error = 0; | ||
2397 | |||
2398 | error = sysdev_class_register(&ioapic_sysdev_class); | ||
2399 | if (error) | ||
2400 | return error; | ||
2401 | |||
2402 | for (i = 0; i < nr_ioapics; i++ ) { | ||
2403 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] | ||
2404 | * sizeof(struct IO_APIC_route_entry); | ||
2405 | mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | ||
2406 | if (!mp_ioapic_data[i]) { | ||
2407 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
2408 | continue; | ||
2409 | } | ||
2410 | memset(mp_ioapic_data[i], 0, size); | ||
2411 | dev = &mp_ioapic_data[i]->dev; | ||
2412 | dev->id = i; | ||
2413 | dev->cls = &ioapic_sysdev_class; | ||
2414 | error = sysdev_register(dev); | ||
2415 | if (error) { | ||
2416 | kfree(mp_ioapic_data[i]); | ||
2417 | mp_ioapic_data[i] = NULL; | ||
2418 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
2419 | continue; | ||
2420 | } | ||
2421 | } | ||
2422 | |||
2423 | return 0; | ||
2424 | } | ||
2425 | |||
2426 | device_initcall(ioapic_init_sysfs); | ||
2427 | |||
2428 | /* | ||
2429 | * Dynamic irq allocate and deallocation | ||
2430 | */ | ||
2431 | int create_irq(void) | ||
2432 | { | ||
2433 | /* Allocate an unused irq */ | ||
2434 | int irq, new, vector = 0; | ||
2435 | unsigned long flags; | ||
2436 | |||
2437 | irq = -ENOSPC; | ||
2438 | spin_lock_irqsave(&vector_lock, flags); | ||
2439 | for (new = (NR_IRQS - 1); new >= 0; new--) { | ||
2440 | if (platform_legacy_irq(new)) | ||
2441 | continue; | ||
2442 | if (irq_vector[new] != 0) | ||
2443 | continue; | ||
2444 | vector = __assign_irq_vector(new); | ||
2445 | if (likely(vector > 0)) | ||
2446 | irq = new; | ||
2447 | break; | ||
2448 | } | ||
2449 | spin_unlock_irqrestore(&vector_lock, flags); | ||
2450 | |||
2451 | if (irq >= 0) { | ||
2452 | set_intr_gate(vector, interrupt[irq]); | ||
2453 | dynamic_irq_init(irq); | ||
2454 | } | ||
2455 | return irq; | ||
2456 | } | ||
2457 | |||
2458 | void destroy_irq(unsigned int irq) | ||
2459 | { | ||
2460 | unsigned long flags; | ||
2461 | |||
2462 | dynamic_irq_cleanup(irq); | ||
2463 | |||
2464 | spin_lock_irqsave(&vector_lock, flags); | ||
2465 | irq_vector[irq] = 0; | ||
2466 | spin_unlock_irqrestore(&vector_lock, flags); | ||
2467 | } | ||
2468 | |||
2469 | /* | ||
2470 | * MSI message composition | ||
2471 | */ | ||
2472 | #ifdef CONFIG_PCI_MSI | ||
2473 | static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) | ||
2474 | { | ||
2475 | int vector; | ||
2476 | unsigned dest; | ||
2477 | |||
2478 | vector = assign_irq_vector(irq); | ||
2479 | if (vector >= 0) { | ||
2480 | dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
2481 | |||
2482 | msg->address_hi = MSI_ADDR_BASE_HI; | ||
2483 | msg->address_lo = | ||
2484 | MSI_ADDR_BASE_LO | | ||
2485 | ((INT_DEST_MODE == 0) ? | ||
2486 | MSI_ADDR_DEST_MODE_PHYSICAL: | ||
2487 | MSI_ADDR_DEST_MODE_LOGICAL) | | ||
2488 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2489 | MSI_ADDR_REDIRECTION_CPU: | ||
2490 | MSI_ADDR_REDIRECTION_LOWPRI) | | ||
2491 | MSI_ADDR_DEST_ID(dest); | ||
2492 | |||
2493 | msg->data = | ||
2494 | MSI_DATA_TRIGGER_EDGE | | ||
2495 | MSI_DATA_LEVEL_ASSERT | | ||
2496 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2497 | MSI_DATA_DELIVERY_FIXED: | ||
2498 | MSI_DATA_DELIVERY_LOWPRI) | | ||
2499 | MSI_DATA_VECTOR(vector); | ||
2500 | } | ||
2501 | return vector; | ||
2502 | } | ||
2503 | |||
2504 | #ifdef CONFIG_SMP | ||
2505 | static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | ||
2506 | { | ||
2507 | struct msi_msg msg; | ||
2508 | unsigned int dest; | ||
2509 | cpumask_t tmp; | ||
2510 | int vector; | ||
2511 | |||
2512 | cpus_and(tmp, mask, cpu_online_map); | ||
2513 | if (cpus_empty(tmp)) | ||
2514 | tmp = TARGET_CPUS; | ||
2515 | |||
2516 | vector = assign_irq_vector(irq); | ||
2517 | if (vector < 0) | ||
2518 | return; | ||
2519 | |||
2520 | dest = cpu_mask_to_apicid(mask); | ||
2521 | |||
2522 | read_msi_msg(irq, &msg); | ||
2523 | |||
2524 | msg.data &= ~MSI_DATA_VECTOR_MASK; | ||
2525 | msg.data |= MSI_DATA_VECTOR(vector); | ||
2526 | msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; | ||
2527 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | ||
2528 | |||
2529 | write_msi_msg(irq, &msg); | ||
2530 | irq_desc[irq].affinity = mask; | ||
2531 | } | ||
2532 | #endif /* CONFIG_SMP */ | ||
2533 | |||
2534 | /* | ||
2535 | * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, | ||
2536 | * which implement the MSI or MSI-X Capability Structure. | ||
2537 | */ | ||
/* Acknowledge and retrigger reuse the IO-APIC helpers; affinity setting exists only on SMP builds. */
2538 | static struct irq_chip msi_chip = { | ||
2539 | .name = "PCI-MSI", | ||
2540 | .unmask = unmask_msi_irq, | ||
2541 | .mask = mask_msi_irq, | ||
2542 | .ack = ack_ioapic_irq, | ||
2543 | #ifdef CONFIG_SMP | ||
2544 | .set_affinity = set_msi_irq_affinity, | ||
2545 | #endif | ||
2546 | .retrigger = ioapic_retrigger_irq, | ||
2547 | }; | ||
2548 | |||
2549 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) | ||
2550 | { | ||
2551 | struct msi_msg msg; | ||
2552 | int irq, ret; | ||
2553 | irq = create_irq(); | ||
2554 | if (irq < 0) | ||
2555 | return irq; | ||
2556 | |||
2557 | ret = msi_compose_msg(dev, irq, &msg); | ||
2558 | if (ret < 0) { | ||
2559 | destroy_irq(irq); | ||
2560 | return ret; | ||
2561 | } | ||
2562 | |||
2563 | set_irq_msi(irq, desc); | ||
2564 | write_msi_msg(irq, &msg); | ||
2565 | |||
2566 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, | ||
2567 | "edge"); | ||
2568 | |||
2569 | return 0; | ||
2570 | } | ||
2571 | |||
/* Tear down an MSI irq by destroying the dynamically created irq. */
void arch_teardown_msi_irq(unsigned int irq)
{
	destroy_irq(irq);
}
2576 | |||
2577 | #endif /* CONFIG_PCI_MSI */ | ||
2578 | |||
2579 | /* | ||
2580 | * Hypertransport interrupt support | ||
2581 | */ | ||
2582 | #ifdef CONFIG_HT_IRQ | ||
2583 | |||
2584 | #ifdef CONFIG_SMP | ||
2585 | |||
2586 | static void target_ht_irq(unsigned int irq, unsigned int dest) | ||
2587 | { | ||
2588 | struct ht_irq_msg msg; | ||
2589 | fetch_ht_irq_msg(irq, &msg); | ||
2590 | |||
2591 | msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); | ||
2592 | msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); | ||
2593 | |||
2594 | msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); | ||
2595 | msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); | ||
2596 | |||
2597 | write_ht_irq_msg(irq, &msg); | ||
2598 | } | ||
2599 | |||
2600 | static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) | ||
2601 | { | ||
2602 | unsigned int dest; | ||
2603 | cpumask_t tmp; | ||
2604 | |||
2605 | cpus_and(tmp, mask, cpu_online_map); | ||
2606 | if (cpus_empty(tmp)) | ||
2607 | tmp = TARGET_CPUS; | ||
2608 | |||
2609 | cpus_and(mask, tmp, CPU_MASK_ALL); | ||
2610 | |||
2611 | dest = cpu_mask_to_apicid(mask); | ||
2612 | |||
2613 | target_ht_irq(irq, dest); | ||
2614 | irq_desc[irq].affinity = mask; | ||
2615 | } | ||
2616 | #endif | ||
2617 | |||
/*
 * irq_chip for HyperTransport interrupts.  Acknowledge and retrigger
 * reuse the IO-APIC helpers; affinity setting exists only on SMP builds.
 */
2618 | static struct irq_chip ht_irq_chip = { | ||
2619 | .name = "PCI-HT", | ||
2620 | .mask = mask_ht_irq, | ||
2621 | .unmask = unmask_ht_irq, | ||
2622 | .ack = ack_ioapic_irq, | ||
2623 | #ifdef CONFIG_SMP | ||
2624 | .set_affinity = set_ht_irq_affinity, | ||
2625 | #endif | ||
2626 | .retrigger = ioapic_retrigger_irq, | ||
2627 | }; | ||
2628 | |||
2629 | int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | ||
2630 | { | ||
2631 | int vector; | ||
2632 | |||
2633 | vector = assign_irq_vector(irq); | ||
2634 | if (vector >= 0) { | ||
2635 | struct ht_irq_msg msg; | ||
2636 | unsigned dest; | ||
2637 | cpumask_t tmp; | ||
2638 | |||
2639 | cpus_clear(tmp); | ||
2640 | cpu_set(vector >> 8, tmp); | ||
2641 | dest = cpu_mask_to_apicid(tmp); | ||
2642 | |||
2643 | msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); | ||
2644 | |||
2645 | msg.address_lo = | ||
2646 | HT_IRQ_LOW_BASE | | ||
2647 | HT_IRQ_LOW_DEST_ID(dest) | | ||
2648 | HT_IRQ_LOW_VECTOR(vector) | | ||
2649 | ((INT_DEST_MODE == 0) ? | ||
2650 | HT_IRQ_LOW_DM_PHYSICAL : | ||
2651 | HT_IRQ_LOW_DM_LOGICAL) | | ||
2652 | HT_IRQ_LOW_RQEOI_EDGE | | ||
2653 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2654 | HT_IRQ_LOW_MT_FIXED : | ||
2655 | HT_IRQ_LOW_MT_ARBITRATED) | | ||
2656 | HT_IRQ_LOW_IRQ_MASKED; | ||
2657 | |||
2658 | write_ht_irq_msg(irq, &msg); | ||
2659 | |||
2660 | set_irq_chip_and_handler_name(irq, &ht_irq_chip, | ||
2661 | handle_edge_irq, "edge"); | ||
2662 | } | ||
2663 | return vector; | ||
2664 | } | ||
2665 | #endif /* CONFIG_HT_IRQ */ | ||
2666 | |||
2667 | /* -------------------------------------------------------------------------- | ||
2668 | ACPI-based IOAPIC Configuration | ||
2669 | -------------------------------------------------------------------------- */ | ||
2670 | |||
2671 | #ifdef CONFIG_ACPI | ||
2672 | |||
2673 | int __init io_apic_get_unique_id (int ioapic, int apic_id) | ||
2674 | { | ||
2675 | union IO_APIC_reg_00 reg_00; | ||
2676 | static physid_mask_t apic_id_map = PHYSID_MASK_NONE; | ||
2677 | physid_mask_t tmp; | ||
2678 | unsigned long flags; | ||
2679 | int i = 0; | ||
2680 | |||
2681 | /* | ||
2682 | * The P4 platform supports up to 256 APIC IDs on two separate APIC | ||
2683 | * buses (one for LAPICs, one for IOAPICs), where predecessors only | ||
2684 | * supports up to 16 on one shared APIC bus. | ||
2685 | * | ||
2686 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | ||
2687 | * advantage of new APIC bus architecture. | ||
2688 | */ | ||
2689 | |||
2690 | if (physids_empty(apic_id_map)) | ||
2691 | apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); | ||
2692 | |||
2693 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2694 | reg_00.raw = io_apic_read(ioapic, 0); | ||
2695 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2696 | |||
2697 | if (apic_id >= get_physical_broadcast()) { | ||
2698 | printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " | ||
2699 | "%d\n", ioapic, apic_id, reg_00.bits.ID); | ||
2700 | apic_id = reg_00.bits.ID; | ||
2701 | } | ||
2702 | |||
2703 | /* | ||
2704 | * Every APIC in a system must have a unique ID or we get lots of nice | ||
2705 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
2706 | */ | ||
2707 | if (check_apicid_used(apic_id_map, apic_id)) { | ||
2708 | |||
2709 | for (i = 0; i < get_physical_broadcast(); i++) { | ||
2710 | if (!check_apicid_used(apic_id_map, i)) | ||
2711 | break; | ||
2712 | } | ||
2713 | |||
2714 | if (i == get_physical_broadcast()) | ||
2715 | panic("Max apic_id exceeded!\n"); | ||
2716 | |||
2717 | printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " | ||
2718 | "trying %d\n", ioapic, apic_id, i); | ||
2719 | |||
2720 | apic_id = i; | ||
2721 | } | ||
2722 | |||
2723 | tmp = apicid_to_cpu_present(apic_id); | ||
2724 | physids_or(apic_id_map, apic_id_map, tmp); | ||
2725 | |||
2726 | if (reg_00.bits.ID != apic_id) { | ||
2727 | reg_00.bits.ID = apic_id; | ||
2728 | |||
2729 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2730 | io_apic_write(ioapic, 0, reg_00.raw); | ||
2731 | reg_00.raw = io_apic_read(ioapic, 0); | ||
2732 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2733 | |||
2734 | /* Sanity check */ | ||
2735 | if (reg_00.bits.ID != apic_id) { | ||
2736 | printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); | ||
2737 | return -1; | ||
2738 | } | ||
2739 | } | ||
2740 | |||
2741 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
2742 | "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); | ||
2743 | |||
2744 | return apic_id; | ||
2745 | } | ||
2746 | |||
2747 | |||
2748 | int __init io_apic_get_version (int ioapic) | ||
2749 | { | ||
2750 | union IO_APIC_reg_01 reg_01; | ||
2751 | unsigned long flags; | ||
2752 | |||
2753 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2754 | reg_01.raw = io_apic_read(ioapic, 1); | ||
2755 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2756 | |||
2757 | return reg_01.bits.version; | ||
2758 | } | ||
2759 | |||
2760 | |||
2761 | int __init io_apic_get_redir_entries (int ioapic) | ||
2762 | { | ||
2763 | union IO_APIC_reg_01 reg_01; | ||
2764 | unsigned long flags; | ||
2765 | |||
2766 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2767 | reg_01.raw = io_apic_read(ioapic, 1); | ||
2768 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2769 | |||
2770 | return reg_01.bits.entries; | ||
2771 | } | ||
2772 | |||
2773 | |||
2774 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) | ||
2775 | { | ||
2776 | struct IO_APIC_route_entry entry; | ||
2777 | unsigned long flags; | ||
2778 | |||
2779 | if (!IO_APIC_IRQ(irq)) { | ||
2780 | printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
2781 | ioapic); | ||
2782 | return -EINVAL; | ||
2783 | } | ||
2784 | |||
2785 | /* | ||
2786 | * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. | ||
2787 | * Note that we mask (disable) IRQs now -- these get enabled when the | ||
2788 | * corresponding device driver registers for this IRQ. | ||
2789 | */ | ||
2790 | |||
2791 | memset(&entry,0,sizeof(entry)); | ||
2792 | |||
2793 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
2794 | entry.dest_mode = INT_DEST_MODE; | ||
2795 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
2796 | entry.trigger = edge_level; | ||
2797 | entry.polarity = active_high_low; | ||
2798 | entry.mask = 1; | ||
2799 | |||
2800 | /* | ||
2801 | * IRQs < 16 are already in the irq_2_pin[] map | ||
2802 | */ | ||
2803 | if (irq >= 16) | ||
2804 | add_pin_to_irq(irq, ioapic, pin); | ||
2805 | |||
2806 | entry.vector = assign_irq_vector(irq); | ||
2807 | |||
2808 | apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " | ||
2809 | "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, | ||
2810 | mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, | ||
2811 | edge_level, active_high_low); | ||
2812 | |||
2813 | ioapic_register_intr(irq, entry.vector, edge_level); | ||
2814 | |||
2815 | if (!ioapic && (irq < 16)) | ||
2816 | disable_8259A_irq(irq); | ||
2817 | |||
2818 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2819 | __ioapic_write_entry(ioapic, pin, entry); | ||
2820 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2821 | |||
2822 | return 0; | ||
2823 | } | ||
2824 | |||
2825 | #endif /* CONFIG_ACPI */ | ||
2826 | |||
2827 | static int __init parse_disable_timer_pin_1(char *arg) | ||
2828 | { | ||
2829 | disable_timer_pin_1 = 1; | ||
2830 | return 0; | ||
2831 | } | ||
2832 | early_param("disable_timer_pin_1", parse_disable_timer_pin_1); | ||
2833 | |||
2834 | static int __init parse_enable_timer_pin_1(char *arg) | ||
2835 | { | ||
2836 | disable_timer_pin_1 = -1; | ||
2837 | return 0; | ||
2838 | } | ||
2839 | early_param("enable_timer_pin_1", parse_enable_timer_pin_1); | ||
2840 | |||
2841 | static int __init parse_noapic(char *arg) | ||
2842 | { | ||
2843 | /* disable IO-APIC */ | ||
2844 | disable_ioapic_setup(); | ||
2845 | return 0; | ||
2846 | } | ||
2847 | early_param("noapic", parse_noapic); | ||
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c new file mode 100644 index 000000000000..3d310a946d76 --- /dev/null +++ b/arch/x86/kernel/ioport_32.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/ioport.c | ||
3 | * | ||
4 | * This contains the io-permission bitmap code - written by obz, with changes | ||
5 | * by Linus. | ||
6 | */ | ||
7 | |||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/capability.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/ioport.h> | ||
14 | #include <linux/smp.h> | ||
15 | #include <linux/stddef.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/thread_info.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | |||
/*
 * Set EXTENT consecutive bits of BITMAP, starting at bit number BASE, to
 * NEW_VALUE: every bit in the range becomes 1 when NEW_VALUE is non-zero
 * and 0 otherwise.  Bits outside the range are left untouched.
 */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
{
	unsigned long *word = bitmap + (base / BITS_PER_LONG);
	unsigned int first_bit = base & (BITS_PER_LONG-1);
	/* bits still to process, counted from bit 0 of the current word */
	int remaining = first_bit + extent;
	unsigned long mask;

	/* Range starts in the middle of a word: patch that word with a mask. */
	if (first_bit != 0) {
		mask = (~0UL << first_bit);
		/* the range also ends inside this same word */
		if (remaining < BITS_PER_LONG)
			mask &= ~(~0UL << remaining);
		if (new_value)
			*word |= mask;
		else
			*word &= ~mask;
		word++;
		remaining -= BITS_PER_LONG;
	}

	/* Whole words can simply be overwritten. */
	for (; remaining >= BITS_PER_LONG; remaining -= BITS_PER_LONG)
		*word++ = (new_value ? ~0UL : 0UL);

	/* Range ends in the middle of a word: patch the low bits only. */
	if (remaining > 0) {
		mask = ~(~0UL << remaining);
		if (new_value)
			*word |= mask;
		else
			*word &= ~mask;
	}
}
53 | |||
54 | |||
55 | /* | ||
56 | * this changes the io permissions bitmap in the current task. | ||
57 | */ | ||
58 | asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | ||
59 | { | ||
60 | unsigned long i, max_long, bytes, bytes_updated; | ||
61 | struct thread_struct * t = ¤t->thread; | ||
62 | struct tss_struct * tss; | ||
63 | unsigned long *bitmap; | ||
64 | |||
65 | if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | ||
66 | return -EINVAL; | ||
67 | if (turn_on && !capable(CAP_SYS_RAWIO)) | ||
68 | return -EPERM; | ||
69 | |||
70 | /* | ||
71 | * If it's the first ioperm() call in this thread's lifetime, set the | ||
72 | * IO bitmap up. ioperm() is much less timing critical than clone(), | ||
73 | * this is why we delay this operation until now: | ||
74 | */ | ||
75 | if (!t->io_bitmap_ptr) { | ||
76 | bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
77 | if (!bitmap) | ||
78 | return -ENOMEM; | ||
79 | |||
80 | memset(bitmap, 0xff, IO_BITMAP_BYTES); | ||
81 | t->io_bitmap_ptr = bitmap; | ||
82 | set_thread_flag(TIF_IO_BITMAP); | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * do it in the per-thread copy and in the TSS ... | ||
87 | * | ||
88 | * Disable preemption via get_cpu() - we must not switch away | ||
89 | * because the ->io_bitmap_max value must match the bitmap | ||
90 | * contents: | ||
91 | */ | ||
92 | tss = &per_cpu(init_tss, get_cpu()); | ||
93 | |||
94 | set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | ||
95 | |||
96 | /* | ||
97 | * Search for a (possibly new) maximum. This is simple and stupid, | ||
98 | * to keep it obviously correct: | ||
99 | */ | ||
100 | max_long = 0; | ||
101 | for (i = 0; i < IO_BITMAP_LONGS; i++) | ||
102 | if (t->io_bitmap_ptr[i] != ~0UL) | ||
103 | max_long = i; | ||
104 | |||
105 | bytes = (max_long + 1) * sizeof(long); | ||
106 | bytes_updated = max(bytes, t->io_bitmap_max); | ||
107 | |||
108 | t->io_bitmap_max = bytes; | ||
109 | |||
110 | /* | ||
111 | * Sets the lazy trigger so that the next I/O operation will | ||
112 | * reload the correct bitmap. | ||
113 | * Reset the owner so that a process switch will not set | ||
114 | * tss->io_bitmap_base to IO_BITMAP_OFFSET. | ||
115 | */ | ||
116 | tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; | ||
117 | tss->io_bitmap_owner = NULL; | ||
118 | |||
119 | put_cpu(); | ||
120 | |||
121 | return 0; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * sys_iopl has to be used when you want to access the IO ports | ||
126 | * beyond the 0x3ff range: to get the full 65536 ports bitmapped | ||
127 | * you'd need 8kB of bitmaps/process, which is a bit excessive. | ||
128 | * | ||
129 | * Here we just change the eflags value on the stack: we allow | ||
130 | * only the super-user to do it. This depends on the stack-layout | ||
131 | * on system-call entry - see also fork() and the signal handling | ||
132 | * code. | ||
133 | */ | ||
134 | |||
135 | asmlinkage long sys_iopl(unsigned long unused) | ||
136 | { | ||
137 | volatile struct pt_regs * regs = (struct pt_regs *) &unused; | ||
138 | unsigned int level = regs->ebx; | ||
139 | unsigned int old = (regs->eflags >> 12) & 3; | ||
140 | struct thread_struct *t = ¤t->thread; | ||
141 | |||
142 | if (level > 3) | ||
143 | return -EINVAL; | ||
144 | /* Trying to gain more privileges? */ | ||
145 | if (level > old) { | ||
146 | if (!capable(CAP_SYS_RAWIO)) | ||
147 | return -EPERM; | ||
148 | } | ||
149 | t->iopl = level << 12; | ||
150 | regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; | ||
151 | set_iopl_mask(t->iopl); | ||
152 | return 0; | ||
153 | } | ||
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c new file mode 100644 index 000000000000..dd2b97fc00b2 --- /dev/null +++ b/arch/x86/kernel/irq_32.c | |||
@@ -0,0 +1,343 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/irq.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the lowest level x86-specific interrupt | ||
7 | * entry, irq-stacks and irq statistics code. All the remaining | ||
8 | * irq logic is done by the generic kernel/irq/ code and | ||
9 | * by the x86-specific irq controller code. (e.g. i8259.c and | ||
10 | * io_apic.c.) | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/seq_file.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/kernel_stat.h> | ||
17 | #include <linux/notifier.h> | ||
18 | #include <linux/cpu.h> | ||
19 | #include <linux/delay.h> | ||
20 | |||
21 | #include <asm/apic.h> | ||
22 | #include <asm/uaccess.h> | ||
23 | |||
/* Per-CPU interrupt statistics (exported); read by show_interrupts(). */
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);

/*
 * Per-CPU pointer to a struct pt_regs (exported).
 * NOTE(review): presumably the variable behind set_irq_regs() as used in
 * do_IRQ() - confirm against the irq_regs header.
 */
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
29 | |||
/*
 * 'what should we do if we get a hw irq event on an illegal vector'.
 * each architecture has to answer this themselves.
 *
 * Here: log the vector at KERN_ERR and, when built with local APIC
 * support and the CPU reports an APIC, acknowledge the interrupt.
 */
void ack_bad_irq(unsigned int irq)
{
	printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Currently unexpected vectors happen only on SMP and APIC.
	 * We _must_ ack these because every local APIC has only N
	 * irq slots per priority level, and a 'hanging, unacked' IRQ
	 * holds up an irq slot - in excessive cases (when multiple
	 * unexpected vectors occur) that might lock up the APIC
	 * completely.
	 * But only ack when the APIC is enabled -AK
	 */
	if (cpu_has_apic)
		ack_APIC_irq();
#endif
}
52 | |||
#ifdef CONFIG_4KSTACKS
/*
 * per-CPU IRQ handling contexts (thread information and stack)
 *
 * The thread_info overlays the start of a THREAD_SIZE-byte stack area.
 */
union irq_ctx {
	struct thread_info      tinfo;
	u32                     stack[THREAD_SIZE/sizeof(u32)];
};

/* Per-CPU context pointers, filled in by irq_ctx_init(). */
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
#endif
65 | |||
66 | /* | ||
67 | * do_IRQ handles all normal device IRQ's (the special | ||
68 | * SMP cross-CPU interrupts have their own specific | ||
69 | * handlers). | ||
70 | */ | ||
71 | fastcall unsigned int do_IRQ(struct pt_regs *regs) | ||
72 | { | ||
73 | struct pt_regs *old_regs; | ||
74 | /* high bit used in ret_from_ code */ | ||
75 | int irq = ~regs->orig_eax; | ||
76 | struct irq_desc *desc = irq_desc + irq; | ||
77 | #ifdef CONFIG_4KSTACKS | ||
78 | union irq_ctx *curctx, *irqctx; | ||
79 | u32 *isp; | ||
80 | #endif | ||
81 | |||
82 | if (unlikely((unsigned)irq >= NR_IRQS)) { | ||
83 | printk(KERN_EMERG "%s: cannot handle IRQ %d\n", | ||
84 | __FUNCTION__, irq); | ||
85 | BUG(); | ||
86 | } | ||
87 | |||
88 | old_regs = set_irq_regs(regs); | ||
89 | irq_enter(); | ||
90 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
91 | /* Debugging check for stack overflow: is there less than 1KB free? */ | ||
92 | { | ||
93 | long esp; | ||
94 | |||
95 | __asm__ __volatile__("andl %%esp,%0" : | ||
96 | "=r" (esp) : "0" (THREAD_SIZE - 1)); | ||
97 | if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { | ||
98 | printk("do_IRQ: stack overflow: %ld\n", | ||
99 | esp - sizeof(struct thread_info)); | ||
100 | dump_stack(); | ||
101 | } | ||
102 | } | ||
103 | #endif | ||
104 | |||
105 | #ifdef CONFIG_4KSTACKS | ||
106 | |||
107 | curctx = (union irq_ctx *) current_thread_info(); | ||
108 | irqctx = hardirq_ctx[smp_processor_id()]; | ||
109 | |||
110 | /* | ||
111 | * this is where we switch to the IRQ stack. However, if we are | ||
112 | * already using the IRQ stack (because we interrupted a hardirq | ||
113 | * handler) we can't do that and just have to keep using the | ||
114 | * current stack (which is the irq stack already after all) | ||
115 | */ | ||
116 | if (curctx != irqctx) { | ||
117 | int arg1, arg2, ebx; | ||
118 | |||
119 | /* build the stack frame on the IRQ stack */ | ||
120 | isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); | ||
121 | irqctx->tinfo.task = curctx->tinfo.task; | ||
122 | irqctx->tinfo.previous_esp = current_stack_pointer; | ||
123 | |||
124 | /* | ||
125 | * Copy the softirq bits in preempt_count so that the | ||
126 | * softirq checks work in the hardirq context. | ||
127 | */ | ||
128 | irqctx->tinfo.preempt_count = | ||
129 | (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | | ||
130 | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); | ||
131 | |||
132 | asm volatile( | ||
133 | " xchgl %%ebx,%%esp \n" | ||
134 | " call *%%edi \n" | ||
135 | " movl %%ebx,%%esp \n" | ||
136 | : "=a" (arg1), "=d" (arg2), "=b" (ebx) | ||
137 | : "0" (irq), "1" (desc), "2" (isp), | ||
138 | "D" (desc->handle_irq) | ||
139 | : "memory", "cc" | ||
140 | ); | ||
141 | } else | ||
142 | #endif | ||
143 | desc->handle_irq(irq, desc); | ||
144 | |||
145 | irq_exit(); | ||
146 | set_irq_regs(old_regs); | ||
147 | return 1; | ||
148 | } | ||
149 | |||
150 | #ifdef CONFIG_4KSTACKS | ||
151 | |||
/*
 * Backing storage for the per-CPU softirq and hardirq stacks: one
 * THREAD_SIZE slice per possible CPU, placed in .bss.page_aligned.
 * irq_ctx_init() hands out the slices.
 */
static char softirq_stack[NR_CPUS * THREAD_SIZE]
		__attribute__((__section__(".bss.page_aligned")));

static char hardirq_stack[NR_CPUS * THREAD_SIZE]
		__attribute__((__section__(".bss.page_aligned")));
157 | |||
158 | /* | ||
159 | * allocate per-cpu stacks for hardirq and for softirq processing | ||
160 | */ | ||
161 | void irq_ctx_init(int cpu) | ||
162 | { | ||
163 | union irq_ctx *irqctx; | ||
164 | |||
165 | if (hardirq_ctx[cpu]) | ||
166 | return; | ||
167 | |||
168 | irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; | ||
169 | irqctx->tinfo.task = NULL; | ||
170 | irqctx->tinfo.exec_domain = NULL; | ||
171 | irqctx->tinfo.cpu = cpu; | ||
172 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; | ||
173 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | ||
174 | |||
175 | hardirq_ctx[cpu] = irqctx; | ||
176 | |||
177 | irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; | ||
178 | irqctx->tinfo.task = NULL; | ||
179 | irqctx->tinfo.exec_domain = NULL; | ||
180 | irqctx->tinfo.cpu = cpu; | ||
181 | irqctx->tinfo.preempt_count = 0; | ||
182 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | ||
183 | |||
184 | softirq_ctx[cpu] = irqctx; | ||
185 | |||
186 | printk("CPU %u irqstacks, hard=%p soft=%p\n", | ||
187 | cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); | ||
188 | } | ||
189 | |||
/*
 * Forget the hardirq context of @cpu.  Clearing the pointer lets a later
 * irq_ctx_init() for the same CPU run its initialisation again; the
 * softirq pointer is left as it is.
 */
void irq_ctx_exit(int cpu)
{
	hardirq_ctx[cpu] = NULL;
}
194 | |||
195 | extern asmlinkage void __do_softirq(void); | ||
196 | |||
197 | asmlinkage void do_softirq(void) | ||
198 | { | ||
199 | unsigned long flags; | ||
200 | struct thread_info *curctx; | ||
201 | union irq_ctx *irqctx; | ||
202 | u32 *isp; | ||
203 | |||
204 | if (in_interrupt()) | ||
205 | return; | ||
206 | |||
207 | local_irq_save(flags); | ||
208 | |||
209 | if (local_softirq_pending()) { | ||
210 | curctx = current_thread_info(); | ||
211 | irqctx = softirq_ctx[smp_processor_id()]; | ||
212 | irqctx->tinfo.task = curctx->task; | ||
213 | irqctx->tinfo.previous_esp = current_stack_pointer; | ||
214 | |||
215 | /* build the stack frame on the softirq stack */ | ||
216 | isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); | ||
217 | |||
218 | asm volatile( | ||
219 | " xchgl %%ebx,%%esp \n" | ||
220 | " call __do_softirq \n" | ||
221 | " movl %%ebx,%%esp \n" | ||
222 | : "=b"(isp) | ||
223 | : "0"(isp) | ||
224 | : "memory", "cc", "edx", "ecx", "eax" | ||
225 | ); | ||
226 | /* | ||
227 | * Shouldnt happen, we returned above if in_interrupt(): | ||
228 | */ | ||
229 | WARN_ON_ONCE(softirq_count()); | ||
230 | } | ||
231 | |||
232 | local_irq_restore(flags); | ||
233 | } | ||
234 | |||
235 | EXPORT_SYMBOL(do_softirq); | ||
236 | #endif | ||
237 | |||
/*
 * Interrupt statistics:
 *
 * irq_err_count is printed as the "ERR" line by show_interrupts().
 */

atomic_t irq_err_count;
243 | |||
244 | /* | ||
245 | * /proc/interrupts printing: | ||
246 | */ | ||
247 | |||
248 | int show_interrupts(struct seq_file *p, void *v) | ||
249 | { | ||
250 | int i = *(loff_t *) v, j; | ||
251 | struct irqaction * action; | ||
252 | unsigned long flags; | ||
253 | |||
254 | if (i == 0) { | ||
255 | seq_printf(p, " "); | ||
256 | for_each_online_cpu(j) | ||
257 | seq_printf(p, "CPU%-8d",j); | ||
258 | seq_putc(p, '\n'); | ||
259 | } | ||
260 | |||
261 | if (i < NR_IRQS) { | ||
262 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
263 | action = irq_desc[i].action; | ||
264 | if (!action) | ||
265 | goto skip; | ||
266 | seq_printf(p, "%3d: ",i); | ||
267 | #ifndef CONFIG_SMP | ||
268 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
269 | #else | ||
270 | for_each_online_cpu(j) | ||
271 | seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | ||
272 | #endif | ||
273 | seq_printf(p, " %8s", irq_desc[i].chip->name); | ||
274 | seq_printf(p, "-%-8s", irq_desc[i].name); | ||
275 | seq_printf(p, " %s", action->name); | ||
276 | |||
277 | for (action=action->next; action; action = action->next) | ||
278 | seq_printf(p, ", %s", action->name); | ||
279 | |||
280 | seq_putc(p, '\n'); | ||
281 | skip: | ||
282 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
283 | } else if (i == NR_IRQS) { | ||
284 | seq_printf(p, "NMI: "); | ||
285 | for_each_online_cpu(j) | ||
286 | seq_printf(p, "%10u ", nmi_count(j)); | ||
287 | seq_putc(p, '\n'); | ||
288 | #ifdef CONFIG_X86_LOCAL_APIC | ||
289 | seq_printf(p, "LOC: "); | ||
290 | for_each_online_cpu(j) | ||
291 | seq_printf(p, "%10u ", | ||
292 | per_cpu(irq_stat,j).apic_timer_irqs); | ||
293 | seq_putc(p, '\n'); | ||
294 | #endif | ||
295 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
296 | #if defined(CONFIG_X86_IO_APIC) | ||
297 | seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | ||
298 | #endif | ||
299 | } | ||
300 | return 0; | ||
301 | } | ||
302 | |||
303 | #ifdef CONFIG_HOTPLUG_CPU | ||
304 | #include <mach_apic.h> | ||
305 | |||
306 | void fixup_irqs(cpumask_t map) | ||
307 | { | ||
308 | unsigned int irq; | ||
309 | static int warned; | ||
310 | |||
311 | for (irq = 0; irq < NR_IRQS; irq++) { | ||
312 | cpumask_t mask; | ||
313 | if (irq == 2) | ||
314 | continue; | ||
315 | |||
316 | cpus_and(mask, irq_desc[irq].affinity, map); | ||
317 | if (any_online_cpu(mask) == NR_CPUS) { | ||
318 | printk("Breaking affinity for irq %i\n", irq); | ||
319 | mask = map; | ||
320 | } | ||
321 | if (irq_desc[irq].chip->set_affinity) | ||
322 | irq_desc[irq].chip->set_affinity(irq, mask); | ||
323 | else if (irq_desc[irq].action && !(warned++)) | ||
324 | printk("Cannot set affinity for irq %i\n", irq); | ||
325 | } | ||
326 | |||
327 | #if 0 | ||
328 | barrier(); | ||
329 | /* Ingo Molnar says: "after the IO-APIC masks have been redirected | ||
330 | [note the nop - the interrupt-enable boundary on x86 is two | ||
331 | instructions from sti] - to flush out pending hardirqs and | ||
332 | IPIs. After this point nothing is supposed to reach this CPU." */ | ||
333 | __asm__ __volatile__("sti; nop; cli"); | ||
334 | barrier(); | ||
335 | #else | ||
336 | /* That doesn't seem sufficient. Give it 1ms. */ | ||
337 | local_irq_enable(); | ||
338 | mdelay(1); | ||
339 | local_irq_disable(); | ||
340 | #endif | ||
341 | } | ||
342 | #endif | ||
343 | |||
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c new file mode 100644 index 000000000000..448a50b1324c --- /dev/null +++ b/arch/x86/kernel/kprobes_32.c | |||
@@ -0,0 +1,751 @@ | |||
1 | /* | ||
2 | * Kernel Probes (KProbes) | ||
3 | * arch/i386/kernel/kprobes.c | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
20 | * | ||
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | ||
22 | * Probes initial implementation ( includes contributions from | ||
23 | * Rusty Russell). | ||
24 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | ||
25 | * interface to access function arguments. | ||
26 | * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston | ||
27 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi | ||
28 | * <prasanna@in.ibm.com> added function-return probes. | ||
29 | */ | ||
30 | |||
31 | #include <linux/kprobes.h> | ||
32 | #include <linux/ptrace.h> | ||
33 | #include <linux/preempt.h> | ||
34 | #include <linux/kdebug.h> | ||
35 | #include <asm/cacheflush.h> | ||
36 | #include <asm/desc.h> | ||
37 | #include <asm/uaccess.h> | ||
38 | #include <asm/alternative.h> | ||
39 | |||
/* Declared here, defined elsewhere. */
void jprobe_return_end(void);

/* Per-CPU pointer to the kprobe currently being handled, NULL when idle. */
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
/* Per-CPU kprobe control block (status and saved eflags of the probe). */
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
44 | |||
45 | /* insert a jmp code */ | ||
46 | static __always_inline void set_jmp_op(void *from, void *to) | ||
47 | { | ||
48 | struct __arch_jmp_op { | ||
49 | char op; | ||
50 | long raddr; | ||
51 | } __attribute__((packed)) *jop; | ||
52 | jop = (struct __arch_jmp_op *)from; | ||
53 | jop->raddr = (long)(to) - ((long)(from) + 5); | ||
54 | jop->op = RELATIVEJUMP_INSTRUCTION; | ||
55 | } | ||
56 | |||
/*
 * returns non-zero if opcodes can be boosted.
 *
 * @opcodes: start of the instruction bytes to classify.
 *
 * Prefix bytes are skipped by looping back to "retry"; the scan gives up
 * (returns 0) once MAX_INSN_SIZE bytes have been consumed.  A 0x0f escape
 * byte is resolved through the twobyte_is_boostable bitmap, every other
 * opcode through the switch below.
 */
static __always_inline int can_boost(kprobe_opcode_t *opcodes)
{
/*
 * W() packs the sixteen 0/1 flags of one opcode row into bits 0..15 and
 * shifts them by (row % 32), so two consecutive rows OR-ed together form
 * one 32-bit word of the table.
 */
#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)		      \
	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
	 << (row % 32))
	/*
	 * Undefined/reserved opcodes, conditional jump, Opcode Extension
	 * Groups, and some special opcodes can not be boost.
	 */
	static const unsigned long twobyte_is_boostable[256 / 32] = {
		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
		/*      -------------------------------         */
		W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
		W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
		W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
		W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
		W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
		W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
		W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
		W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
		W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
		W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
		W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
		W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
		W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
		W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
		W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
		W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0)  /* f0 */
		/*      -------------------------------         */
		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
	};
#undef W
	kprobe_opcode_t opcode;
	kprobe_opcode_t *orig_opcodes = opcodes;
retry:
	/* never look past the end of the longest possible instruction */
	if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
		return 0;
	opcode = *(opcodes++);

	/* 2nd-byte opcode */
	if (opcode == 0x0f) {
		if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
			return 0;
		return test_bit(*opcodes, twobyte_is_boostable);
	}

	/* one-byte opcodes, dispatched on the high nibble */
	switch (opcode & 0xf0) {
	case 0x60:
		/* 0x64, 0x65 and 0x66 are treated as prefixes */
		if (0x63 < opcode && opcode < 0x67)
			goto retry; /* prefixes */
		/* can't boost Address-size override and bound */
		return (opcode != 0x62 && opcode != 0x67);
	case 0x70:
		return 0; /* can't boost conditional jump */
	case 0xc0:
		/* can't boost software-interruptions */
		return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
	case 0xd0:
		/* can boost AA* and XLAT */
		return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
	case 0xe0:
		/* can boost in/out and absolute jmps */
		return ((opcode & 0x04) || opcode == 0xea);
	case 0xf0:
		/* 0xf0, 0xf2 and 0xf3 are treated as prefixes; 0xf1 is not */
		if ((opcode & 0x0c) == 0 && opcode != 0xf1)
			goto retry; /* lock/rep(ne) prefix */
		/* clear and set flags can be boost */
		return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
	default:
		if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
			goto retry; /* prefixes */
		/* can't boost CS override and call */
		return (opcode != 0x2e && opcode != 0x9a);
	}
}
138 | |||
139 | /* | ||
140 | * returns non-zero if opcode modifies the interrupt flag. | ||
141 | */ | ||
142 | static int __kprobes is_IF_modifier(kprobe_opcode_t opcode) | ||
143 | { | ||
144 | switch (opcode) { | ||
145 | case 0xfa: /* cli */ | ||
146 | case 0xfb: /* sti */ | ||
147 | case 0xcf: /* iret/iretd */ | ||
148 | case 0x9d: /* popf/popfd */ | ||
149 | return 1; | ||
150 | } | ||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | int __kprobes arch_prepare_kprobe(struct kprobe *p) | ||
155 | { | ||
156 | /* insn: must be on special executable page on i386. */ | ||
157 | p->ainsn.insn = get_insn_slot(); | ||
158 | if (!p->ainsn.insn) | ||
159 | return -ENOMEM; | ||
160 | |||
161 | memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); | ||
162 | p->opcode = *p->addr; | ||
163 | if (can_boost(p->addr)) { | ||
164 | p->ainsn.boostable = 0; | ||
165 | } else { | ||
166 | p->ainsn.boostable = -1; | ||
167 | } | ||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | void __kprobes arch_arm_kprobe(struct kprobe *p) | ||
172 | { | ||
173 | text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); | ||
174 | } | ||
175 | |||
/* Disarm probe @p: write the saved opcode byte back to p->addr. */
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
	text_poke(p->addr, &p->opcode, 1);
}
180 | |||
181 | void __kprobes arch_remove_kprobe(struct kprobe *p) | ||
182 | { | ||
183 | mutex_lock(&kprobe_mutex); | ||
184 | free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); | ||
185 | mutex_unlock(&kprobe_mutex); | ||
186 | } | ||
187 | |||
/*
 * Stash the running kprobe, its status and both eflags values in
 * kcb->prev_kprobe.  Used by kprobe_handler() when a probe is hit while
 * another one is being handled.
 */
static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
	kcb->prev_kprobe.kp = kprobe_running();
	kcb->prev_kprobe.status = kcb->kprobe_status;
	kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
	kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
}
195 | |||
/*
 * Inverse of save_previous_kprobe(): make the probe stashed in
 * kcb->prev_kprobe the current one again and restore its status and
 * eflags values.
 */
static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
	__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
	kcb->kprobe_status = kcb->prev_kprobe.status;
	kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
	kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
}
203 | |||
204 | static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, | ||
205 | struct kprobe_ctlblk *kcb) | ||
206 | { | ||
207 | __get_cpu_var(current_kprobe) = p; | ||
208 | kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags | ||
209 | = (regs->eflags & (TF_MASK | IF_MASK)); | ||
210 | if (is_IF_modifier(p->opcode)) | ||
211 | kcb->kprobe_saved_eflags &= ~IF_MASK; | ||
212 | } | ||
213 | |||
214 | static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | ||
215 | { | ||
216 | regs->eflags |= TF_MASK; | ||
217 | regs->eflags &= ~IF_MASK; | ||
218 | /*single step inline if the instruction is an int3*/ | ||
219 | if (p->opcode == BREAKPOINT_INSTRUCTION) | ||
220 | regs->eip = (unsigned long)p->addr; | ||
221 | else | ||
222 | regs->eip = (unsigned long)p->ainsn.insn; | ||
223 | } | ||
224 | |||
225 | /* Called with kretprobe_lock held */ | ||
226 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | ||
227 | struct pt_regs *regs) | ||
228 | { | ||
229 | unsigned long *sara = (unsigned long *)®s->esp; | ||
230 | |||
231 | ri->ret_addr = (kprobe_opcode_t *) *sara; | ||
232 | |||
233 | /* Replace the return addr with trampoline addr */ | ||
234 | *sara = (unsigned long) &kretprobe_trampoline; | ||
235 | } | ||
236 | |||
/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 * remain disabled throughout this function.
 *
 * @regs: register state at the breakpoint; eip points just past the int3.
 *
 * Returns 1 when the breakpoint was consumed here and 0 when it is not
 * ours.  Preemption is disabled on entry to the handling; only the boost
 * path and the no_kprobe exit re-enable it in this function, the other
 * "return 1" paths leave it disabled.
 */
static int __kprobes kprobe_handler(struct pt_regs *regs)
{
	struct kprobe *p;
	int ret = 0;
	kprobe_opcode_t *addr;
	struct kprobe_ctlblk *kcb;

	/* address of the breakpoint instruction that was just executed */
	addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));

	/*
	 * We don't want to be preempted for the entire
	 * duration of kprobe processing
	 */
	preempt_disable();
	kcb = get_kprobe_ctlblk();

	/* Check we're not actually recursing */
	if (kprobe_running()) {
		p = get_kprobe(addr);
		if (p) {
			/* hit while single-stepping a copied int3: restore flags, bail out */
			if (kcb->kprobe_status == KPROBE_HIT_SS &&
				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
				regs->eflags &= ~TF_MASK;
				regs->eflags |= kcb->kprobe_saved_eflags;
				goto no_kprobe;
			}
			/* We have reentered the kprobe_handler(), since
			 * another probe was hit while within the handler.
			 * We here save the original kprobes variables and
			 * just single step on the instruction of the new probe
			 * without calling any user handlers.
			 */
			save_previous_kprobe(kcb);
			set_current_kprobe(p, regs, kcb);
			kprobes_inc_nmissed_count(p);
			prepare_singlestep(p, regs);
			kcb->kprobe_status = KPROBE_REENTER;
			return 1;
		} else {
			if (*addr != BREAKPOINT_INSTRUCTION) {
			/* The breakpoint instruction was removed by
			 * another cpu right after we hit, no further
			 * handling of this interrupt is appropriate
			 */
				regs->eip -= sizeof(kprobe_opcode_t);
				ret = 1;
				goto no_kprobe;
			}
			/* no probe registered here: offer it to the running probe's break handler */
			p = __get_cpu_var(current_kprobe);
			if (p->break_handler && p->break_handler(p, regs)) {
				goto ss_probe;
			}
		}
		goto no_kprobe;
	}

	p = get_kprobe(addr);
	if (!p) {
		if (*addr != BREAKPOINT_INSTRUCTION) {
			/*
			 * The breakpoint instruction was removed right
			 * after we hit it.  Another cpu has removed
			 * either a probepoint or a debugger breakpoint
			 * at this address.  In either case, no further
			 * handling of this interrupt is appropriate.
			 * Back up over the (now missing) int3 and run
			 * the original instruction.
			 */
			regs->eip -= sizeof(kprobe_opcode_t);
			ret = 1;
		}
		/* Not one of ours: let kernel handle it */
		goto no_kprobe;
	}

	set_current_kprobe(p, regs, kcb);
	kcb->kprobe_status = KPROBE_HIT_ACTIVE;

	if (p->pre_handler && p->pre_handler(p, regs))
		/* handler has already set things up, so skip ss setup */
		return 1;

ss_probe:
#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
	if (p->ainsn.boostable == 1 && !p->post_handler){
		/* Boost up -- we can execute copied instructions directly */
		reset_current_kprobe();
		regs->eip = (unsigned long)p->ainsn.insn;
		preempt_enable_no_resched();
		return 1;
	}
#endif
	prepare_singlestep(p, regs);
	kcb->kprobe_status = KPROBE_HIT_SS;
	return 1;

no_kprobe:
	preempt_enable_no_resched();
	return ret;
}
341 | |||
/*
 * For function-return probes, init_kprobes() establishes a probepoint
 * here. When a retprobed function returns, this probe is hit and
 * trampoline_probe_handler() runs, calling the kretprobe's handler.
 *
 * This C function only serves as a container for the global
 * kretprobe_trampoline label defined in the asm below.  The trampoline
 * pushes a register frame, passes its address in eax to
 * trampoline_handler(), and finally returns to the address that handler
 * hands back in eax.
 */
 void __kprobes kretprobe_trampoline_holder(void)
 {
	asm volatile ( ".global kretprobe_trampoline\n"
			"kretprobe_trampoline: \n"
			" pushf\n"
			/* skip cs, eip, orig_eax */
			" subl $12, %esp\n"
			" pushl %fs\n"
			" pushl %ds\n"
			" pushl %es\n"
			" pushl %eax\n"
			" pushl %ebp\n"
			" pushl %edi\n"
			" pushl %esi\n"
			" pushl %edx\n"
			" pushl %ecx\n"
			" pushl %ebx\n"
			/* eax = address of the frame just built */
			" movl %esp, %eax\n"
			" call trampoline_handler\n"
			/* move eflags to cs */
			" movl 52(%esp), %edx\n"
			" movl %edx, 48(%esp)\n"
			/* save true return address on eflags */
			" movl %eax, 52(%esp)\n"
			" popl %ebx\n"
			" popl %ecx\n"
			" popl %edx\n"
			" popl %esi\n"
			" popl %edi\n"
			" popl %ebp\n"
			" popl %eax\n"
			/* skip eip, orig_eax, es, ds, fs */
			" addl $20, %esp\n"
			/* popf takes the relocated eflags, ret the true return address */
			" popf\n"
			" ret\n");
}
383 | |||
384 | /* | ||
385 | * Called from kretprobe_trampoline | ||
    | * | ||
    | * Walks the kretprobe instance list returned by | ||
    | * kretprobe_inst_table_head(current) under kretprobe_lock. For every | ||
    | * instance that belongs to the current task it runs the registered | ||
    | * handler (if any), records the instance's ret_addr and hands the | ||
    | * instance to recycle_rp_inst(). The walk stops at the first instance | ||
    | * whose ret_addr is not the trampoline itself. Instances collected on | ||
    | * the local empty_rp list are freed after the lock is dropped. | ||
    | * | ||
    | * @regs: register frame built by kretprobe_trampoline; xcs, eip and | ||
    | * orig_eax are filled in here before any handler runs. | ||
    | * | ||
    | * Returns the ret_addr of the last instance visited, as a pointer. | ||
386 | */ | ||
387 | fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) | ||
388 | { | ||
389 | struct kretprobe_instance *ri = NULL; | ||
390 | struct hlist_head *head, empty_rp; | ||
391 | struct hlist_node *node, *tmp; | ||
392 | unsigned long flags, orig_ret_address = 0; | ||
393 | unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; | ||
394 | |||
395 | INIT_HLIST_HEAD(&empty_rp); | ||
396 | spin_lock_irqsave(&kretprobe_lock, flags); | ||
397 | head = kretprobe_inst_table_head(current); | ||
398 | /* fixup registers */ | ||
399 | regs->xcs = __KERNEL_CS | get_kernel_rpl(); | ||
400 | regs->eip = trampoline_address; | ||
401 | regs->orig_eax = 0xffffffff; | ||
402 | |||
403 | /* | ||
404 | * It is possible to have multiple instances associated with a given | ||
405 | * task either because multiple functions in the call path | ||
406 | * have a return probe installed on them, and/or more than one | ||
407 | * return probe was registered for a target function. | ||
408 | * | ||
409 | * We can handle this because: | ||
410 | * - instances are always inserted at the head of the list | ||
411 | * - when multiple return probes are registered for the same | ||
412 | * function, the first instance's ret_addr will point to the | ||
413 | * real return address, and all the rest will point to | ||
414 | * kretprobe_trampoline | ||
415 | */ | ||
416 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | ||
417 | if (ri->task != current) | ||
418 | /* another task is sharing our hash bucket */ | ||
419 | continue; | ||
420 | |||
421 | if (ri->rp && ri->rp->handler){ | ||
422 | __get_cpu_var(current_kprobe) = &ri->rp->kp; | ||
423 | get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; | ||
424 | ri->rp->handler(ri, regs); | ||
425 | __get_cpu_var(current_kprobe) = NULL; | ||
426 | } | ||
427 | |||
428 | orig_ret_address = (unsigned long)ri->ret_addr; | ||
429 | recycle_rp_inst(ri, &empty_rp); | ||
430 | |||
431 | if (orig_ret_address != trampoline_address) | ||
432 | /* | ||
433 | * This is the real return address. Any other | ||
434 | * instances associated with this task are for | ||
435 | * other calls deeper on the call stack | ||
436 | */ | ||
437 | break; | ||
438 | } | ||
439 | |||
440 | kretprobe_assert(ri, orig_ret_address, trampoline_address); | ||
441 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
442 | |||
443 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { /* free outside the lock */ | ||
444 | hlist_del(&ri->hlist); | ||
445 | kfree(ri); | ||
446 | } | ||
447 | return (void*)orig_ret_address; | ||
448 | } | ||
449 | |||
450 | /* | ||
451 | * Called after single-stepping. p->addr is the address of the | ||
452 | * instruction whose first byte has been replaced by the "int 3" | ||
453 | * instruction. To avoid the SMP problems that can occur when we | ||
454 | * temporarily put back the original opcode to single-step, we | ||
455 | * single-stepped a copy of the instruction. The address of this | ||
456 | * copy is p->ainsn.insn. | ||
457 | * | ||
458 | * This function prepares to return from the post-single-step | ||
459 | * interrupt. We have to fix up the stack as follows: | ||
460 | * | ||
461 | * 0) Except in the case of absolute or indirect jump or call instructions, | ||
462 | * the new eip is relative to the copied instruction. We need to make | ||
463 | * it relative to the original instruction. | ||
464 | * | ||
465 | * 1) If the single-stepped instruction was pushfl, then the TF and IF | ||
466 | * flags are set in the just-pushed eflags, and may need to be cleared. | ||
467 | * | ||
468 | * 2) If the single-stepped instruction was a call, the return address | ||
469 | * that is atop the stack is the address following the copied instruction. | ||
470 | * We need to make it the address following the original instruction. | ||
471 | * | ||
472 | * This function also checks instruction size for preparing direct execution. | ||
    | * | ||
    | * @p: probe whose copied instruction (p->ainsn.insn) was just stepped; | ||
    | * p->ainsn.boostable is updated here (1, or -1 when a jump back | ||
    | * does not fit in the instruction slot). | ||
    | * @regs: trap frame; eflags, eip and the word at &regs->esp are patched. | ||
    | * @kcb: kprobe control block supplying kprobe_old_eflags. | ||
473 | */ | ||
474 | static void __kprobes resume_execution(struct kprobe *p, | ||
475 | struct pt_regs *regs, struct kprobe_ctlblk *kcb) | ||
476 | { | ||
477 | unsigned long *tos = (unsigned long *)&regs->esp; | ||
478 | unsigned long copy_eip = (unsigned long)p->ainsn.insn; | ||
479 | unsigned long orig_eip = (unsigned long)p->addr; | ||
480 | |||
481 | regs->eflags &= ~TF_MASK; | ||
482 | switch (p->ainsn.insn[0]) { | ||
483 | case 0x9c: /* pushfl */ | ||
484 | *tos &= ~(TF_MASK | IF_MASK); | ||
485 | *tos |= kcb->kprobe_old_eflags; | ||
486 | break; | ||
487 | case 0xc2: /* iret/ret/lret */ | ||
488 | case 0xc3: | ||
489 | case 0xca: | ||
490 | case 0xcb: | ||
491 | case 0xcf: | ||
492 | case 0xea: /* jmp absolute -- eip is correct */ | ||
493 | /* eip is already adjusted, no more changes required */ | ||
494 | p->ainsn.boostable = 1; | ||
495 | goto no_change; | ||
496 | case 0xe8: /* call relative - Fix return addr */ | ||
497 | *tos = orig_eip + (*tos - copy_eip); | ||
498 | break; | ||
499 | case 0x9a: /* call absolute -- same as call absolute, indirect */ | ||
500 | *tos = orig_eip + (*tos - copy_eip); | ||
501 | goto no_change; | ||
502 | case 0xff: | ||
503 | if ((p->ainsn.insn[1] & 0x30) == 0x10) { | ||
504 | /* | ||
505 | * call absolute, indirect | ||
506 | * Fix return addr; eip is correct. | ||
507 | * But this is not boostable | ||
508 | */ | ||
509 | *tos = orig_eip + (*tos - copy_eip); | ||
510 | goto no_change; | ||
511 | } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ | ||
512 | ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ | ||
513 | /* eip is correct. And this is boostable */ | ||
514 | p->ainsn.boostable = 1; | ||
515 | goto no_change; | ||
516 | } /* any other 0xff form: fall through to the generic eip fixup */ | ||
517 | default: | ||
518 | break; | ||
519 | } | ||
520 | |||
521 | if (p->ainsn.boostable == 0) { | ||
522 | if ((regs->eip > copy_eip) && | ||
523 | (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) { | ||
524 | /* | ||
525 | * These instructions can be executed directly if it | ||
526 | * jumps back to correct address. | ||
527 | */ | ||
528 | set_jmp_op((void *)regs->eip, | ||
529 | (void *)orig_eip + (regs->eip - copy_eip)); | ||
530 | p->ainsn.boostable = 1; | ||
531 | } else { | ||
532 | p->ainsn.boostable = -1; | ||
533 | } | ||
534 | } | ||
535 | |||
536 | regs->eip = orig_eip + (regs->eip - copy_eip); /* rebase eip from the copy onto the original */ | ||
537 | |||
538 | no_change: | ||
539 | return; | ||
540 | } | ||
541 | |||
542 | /* | ||
543 | * Interrupts are disabled on entry as trap1 is an interrupt gate and they | ||
544 | * remain disabled throughout this function. | ||
    | * | ||
    | * Runs after the copied instruction has been single-stepped: calls the | ||
    | * probe's post_handler (unless this was a re-entered probe), fixes up | ||
    | * eip/stack through resume_execution(), ORs kprobe_saved_eflags back into | ||
    | * eflags, and then either restores the previous kprobe or resets the | ||
    | * current one. | ||
    | * | ||
    | * Returns 0 when no kprobe is running or when TF is still set in eflags | ||
    | * afterwards, 1 otherwise. | ||
545 | */ | ||
546 | static int __kprobes post_kprobe_handler(struct pt_regs *regs) | ||
547 | { | ||
548 | struct kprobe *cur = kprobe_running(); | ||
549 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
550 | |||
551 | if (!cur) | ||
552 | return 0; | ||
553 | |||
554 | if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { | ||
555 | kcb->kprobe_status = KPROBE_HIT_SSDONE; | ||
556 | cur->post_handler(cur, regs, 0); | ||
557 | } | ||
558 | |||
559 | resume_execution(cur, regs, kcb); | ||
560 | regs->eflags |= kcb->kprobe_saved_eflags; | ||
561 | |||
562 | /* Restore the original saved kprobes variables and continue. */ | ||
563 | if (kcb->kprobe_status == KPROBE_REENTER) { | ||
564 | restore_previous_kprobe(kcb); | ||
565 | goto out; | ||
566 | } | ||
567 | reset_current_kprobe(); | ||
568 | out: | ||
569 | preempt_enable_no_resched(); | ||
570 | |||
571 | /* | ||
572 | * if somebody else is singlestepping across a probe point, eflags | ||
573 | * will have TF set, in which case, continue the remaining processing | ||
574 | * of do_debug, as if this is not a probe hit. | ||
575 | */ | ||
576 | if (regs->eflags & TF_MASK) | ||
577 | return 0; | ||
578 | |||
579 | return 1; | ||
580 | } | ||
581 | |||
582 | static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) | ||
583 | { /* | ||
    | * Handles a fault (trap number @trapnr) taken while a kprobe is in | ||
    | * progress; what happens depends on kcb->kprobe_status. Returns 1 | ||
    | * when the probe's fault_handler or fixup_exception() dealt with the | ||
    | * fault, 0 when it should be processed further by the caller. | ||
    | * Dereferences the running kprobe without a NULL check; the visible | ||
    | * caller tests kprobe_running() first. | ||
    | */ | ||
584 | struct kprobe *cur = kprobe_running(); | ||
585 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
586 | |||
587 | switch(kcb->kprobe_status) { | ||
588 | case KPROBE_HIT_SS: | ||
589 | case KPROBE_REENTER: | ||
590 | /* | ||
591 | * We are here because the instruction being single | ||
592 | * stepped caused a page fault. We reset the current | ||
593 | * kprobe and the eip points back to the probe address | ||
594 | * and allow the page fault handler to continue as a | ||
595 | * normal page fault. | ||
596 | */ | ||
597 | regs->eip = (unsigned long)cur->addr; | ||
598 | regs->eflags |= kcb->kprobe_old_eflags; | ||
599 | if (kcb->kprobe_status == KPROBE_REENTER) | ||
600 | restore_previous_kprobe(kcb); | ||
601 | else | ||
602 | reset_current_kprobe(); | ||
603 | preempt_enable_no_resched(); | ||
604 | break; | ||
605 | case KPROBE_HIT_ACTIVE: | ||
606 | case KPROBE_HIT_SSDONE: | ||
607 | /* | ||
608 | * We increment the nmissed count for accounting, | ||
609 | * we can also use npre/npostfault count for accounting | ||
610 | * these specific fault cases. | ||
611 | */ | ||
612 | kprobes_inc_nmissed_count(cur); | ||
613 | |||
614 | /* | ||
615 | * We come here because instructions in the pre/post | ||
616 | * handler caused the page_fault, this could happen | ||
617 | * if handler tries to access user space by | ||
618 | * copy_from_user(), get_user() etc. Let the | ||
619 | * user-specified handler try to fix it first. | ||
620 | */ | ||
621 | if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) | ||
622 | return 1; | ||
623 | |||
624 | /* | ||
625 | * In case the user-specified fault handler returned | ||
626 | * zero, try to fix up. | ||
627 | */ | ||
628 | if (fixup_exception(regs)) | ||
629 | return 1; | ||
630 | |||
631 | /* | ||
632 | * fixup_exception() could not handle it, | ||
633 | * Let do_page_fault() fix it. | ||
634 | */ | ||
635 | break; | ||
636 | default: | ||
637 | break; | ||
638 | } | ||
639 | return 0; | ||
640 | } | ||
641 | |||
642 | /* | ||
643 | * Wrapper routine for handling exceptions. | ||
    | * | ||
    | * Notifier callback: @val selects the event and @data is a | ||
    | * struct die_args. Events whose register frame satisfies | ||
    | * user_mode_vm() are ignored. DIE_INT3 goes to kprobe_handler(), | ||
    | * DIE_DEBUG to post_kprobe_handler(), and DIE_GPF / DIE_PAGE_FAULT to | ||
    | * kprobe_fault_handler() when a kprobe is running. | ||
    | * | ||
    | * Returns NOTIFY_STOP when the selected handler returned non-zero, | ||
    | * NOTIFY_DONE otherwise. | ||
644 | */ | ||
645 | int __kprobes kprobe_exceptions_notify(struct notifier_block *self, | ||
646 | unsigned long val, void *data) | ||
647 | { | ||
648 | struct die_args *args = (struct die_args *)data; | ||
649 | int ret = NOTIFY_DONE; | ||
650 | |||
651 | if (args->regs && user_mode_vm(args->regs)) | ||
652 | return ret; | ||
653 | |||
654 | switch (val) { | ||
655 | case DIE_INT3: | ||
656 | if (kprobe_handler(args->regs)) | ||
657 | ret = NOTIFY_STOP; | ||
658 | break; | ||
659 | case DIE_DEBUG: | ||
660 | if (post_kprobe_handler(args->regs)) | ||
661 | ret = NOTIFY_STOP; | ||
662 | break; | ||
663 | case DIE_GPF: | ||
664 | case DIE_PAGE_FAULT: | ||
665 | /* kprobe_running() needs smp_processor_id() */ | ||
666 | preempt_disable(); | ||
667 | if (kprobe_running() && | ||
668 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
669 | ret = NOTIFY_STOP; | ||
670 | preempt_enable(); | ||
671 | break; | ||
672 | default: | ||
673 | break; | ||
674 | } | ||
675 | return ret; | ||
676 | } | ||
677 | |||
678 | int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
679 | { | ||
680 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
681 | unsigned long addr; | ||
682 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
683 | |||
684 | kcb->jprobe_saved_regs = *regs; | ||
685 | kcb->jprobe_saved_esp = ®s->esp; | ||
686 | addr = (unsigned long)(kcb->jprobe_saved_esp); | ||
687 | |||
688 | /* | ||
689 | * TBD: As Linus pointed out, gcc assumes that the callee | ||
690 | * owns the argument space and could overwrite it, e.g. | ||
691 | * tailcall optimization. So, to be absolutely safe | ||
692 | * we also save and restore enough stack bytes to cover | ||
693 | * the argument area. | ||
694 | */ | ||
695 | memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, | ||
696 | MIN_STACK_SIZE(addr)); | ||
697 | regs->eflags &= ~IF_MASK; | ||
698 | regs->eip = (unsigned long)(jp->entry); | ||
699 | return 1; | ||
700 | } | ||
701 | |||
702 | void __kprobes jprobe_return(void) | ||
703 | { /* | ||
    | * Exchanges %esp with kcb->jprobe_saved_esp (passed in %ebx) and | ||
    | * executes int3. The global label jprobe_return_end marks the | ||
    | * instruction after the int3; longjmp_break_handler() uses | ||
    | * jprobe_return and jprobe_return_end as bounds for the breakpoint | ||
    | * address. | ||
    | */ | ||
704 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
705 | |||
706 | asm volatile (" xchgl %%ebx,%%esp \n" | ||
707 | " int3 \n" | ||
708 | " .globl jprobe_return_end \n" | ||
709 | " jprobe_return_end: \n" | ||
710 | " nop \n"::"b" | ||
711 | (kcb->jprobe_saved_esp):"memory"); | ||
712 | } | ||
713 | |||
714 | int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
715 | { | ||
716 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
717 | u8 *addr = (u8 *) (regs->eip - 1); | ||
718 | unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp); | ||
719 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
720 | |||
721 | if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { | ||
722 | if (®s->esp != kcb->jprobe_saved_esp) { | ||
723 | struct pt_regs *saved_regs = | ||
724 | container_of(kcb->jprobe_saved_esp, | ||
725 | struct pt_regs, esp); | ||
726 | printk("current esp %p does not match saved esp %p\n", | ||
727 | ®s->esp, kcb->jprobe_saved_esp); | ||
728 | printk("Saved registers for jprobe %p\n", jp); | ||
729 | show_registers(saved_regs); | ||
730 | printk("Current registers\n"); | ||
731 | show_registers(regs); | ||
732 | BUG(); | ||
733 | } | ||
734 | *regs = kcb->jprobe_saved_regs; | ||
735 | memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, | ||
736 | MIN_STACK_SIZE(stack_addr)); | ||
737 | preempt_enable_no_resched(); | ||
738 | return 1; | ||
739 | } | ||
740 | return 0; | ||
741 | } | ||
742 | |||
743 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) | ||
744 | { /* Always returns 0 for any @p; @p is not examined. */ | ||
745 | return 0; | ||
746 | } | ||
747 | |||
748 | int __init arch_init_kprobes(void) | ||
749 | { /* No architecture-specific setup is done here; always returns 0. */ | ||
750 | return 0; | ||
751 | } | ||
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c new file mode 100644 index 000000000000..e0b2d17f4f10 --- /dev/null +++ b/arch/x86/kernel/ldt_32.c | |||
@@ -0,0 +1,250 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/ldt.c | ||
3 | * | ||
4 | * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | ||
5 | * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | ||
6 | */ | ||
7 | |||
8 | #include <linux/errno.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/mm.h> | ||
12 | #include <linux/smp.h> | ||
13 | #include <linux/vmalloc.h> | ||
14 | #include <linux/slab.h> | ||
15 | |||
16 | #include <asm/uaccess.h> | ||
17 | #include <asm/system.h> | ||
18 | #include <asm/ldt.h> | ||
19 | #include <asm/desc.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | |||
22 | #ifdef CONFIG_SMP /* avoids "defined but not used" warning */ | ||
   | /* | ||
   | * Reloads the LDT from current->active_mm->context via load_LDT() when | ||
   | * the current task has an active_mm. The argument is unused; alloc_ldt() | ||
   | * passes this function to smp_call_function(). | ||
   | */ | ||
23 | static void flush_ldt(void *null) | ||
24 | { | ||
25 | if (current->active_mm) | ||
26 | load_LDT(&current->active_mm->context); | ||
27 | } | ||
28 | #endif | ||
29 | |||
30 | static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | ||
31 | { /* | ||
   | * Grows the LDT held in @pc so it has room for at least @mincount | ||
   | * entries. Old entries are copied and the remainder is zeroed. | ||
   | * When @reload is non-zero the new table is loaded with load_LDT() | ||
   | * and, on SMP, flush_ldt() is dispatched through smp_call_function() | ||
   | * if the mm's cpu_vm_mask is not just the current CPU. | ||
   | * | ||
   | * Returns 0 if the table is already large enough or was resized, | ||
   | * -ENOMEM if the new table cannot be allocated. | ||
   | */ | ||
32 | void *oldldt; | ||
33 | void *newldt; | ||
34 | int oldsize; | ||
35 | |||
36 | if (mincount <= pc->size) | ||
37 | return 0; | ||
38 | oldsize = pc->size; | ||
39 | mincount = (mincount+511)&(~511); /* round up to a multiple of 512 entries */ | ||
40 | if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) /* more than a page: vmalloc, else kmalloc */ | ||
41 | newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | ||
42 | else | ||
43 | newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | ||
44 | |||
45 | if (!newldt) | ||
46 | return -ENOMEM; | ||
47 | |||
48 | if (oldsize) | ||
49 | memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | ||
50 | oldldt = pc->ldt; | ||
51 | memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | ||
52 | pc->ldt = newldt; /* publish the pointer first ... */ | ||
53 | wmb(); | ||
54 | pc->size = mincount; /* ... then the new size, with write barriers in between */ | ||
55 | wmb(); | ||
56 | |||
57 | if (reload) { | ||
58 | #ifdef CONFIG_SMP | ||
59 | cpumask_t mask; | ||
60 | preempt_disable(); | ||
61 | load_LDT(pc); | ||
62 | mask = cpumask_of_cpu(smp_processor_id()); | ||
63 | if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | ||
64 | smp_call_function(flush_ldt, NULL, 1, 1); | ||
65 | preempt_enable(); | ||
66 | #else | ||
67 | load_LDT(pc); | ||
68 | #endif | ||
69 | } | ||
70 | if (oldsize) { /* release the old table with the allocator matching its size */ | ||
71 | if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
72 | vfree(oldldt); | ||
73 | else | ||
74 | kfree(oldldt); | ||
75 | } | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | ||
80 | { /* | ||
   | * Sizes @new's LDT for old->size entries via alloc_ldt() (without | ||
   | * reloading) and copies old->size entries from @old into it. | ||
   | * Returns 0 on success or the negative error from alloc_ldt(). | ||
   | */ | ||
81 | int err = alloc_ldt(new, old->size, 0); | ||
82 | if (err < 0) | ||
83 | return err; | ||
84 | memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | ||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * we do not have to muck with descriptors here, that is | ||
90 | * done in switch_mm() as needed. | ||
   | * | ||
   | * Initialises @mm's LDT context: sets up context.sem, starts with size 0 | ||
   | * and, if current->mm has a non-empty LDT, copies it while holding the | ||
   | * old mm's context.sem. @tsk is not used in this function. | ||
   | * | ||
   | * Returns 0, or the error from copy_ldt(). | ||
91 | */ | ||
92 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | ||
93 | { | ||
94 | struct mm_struct * old_mm; | ||
95 | int retval = 0; | ||
96 | |||
97 | init_MUTEX(&mm->context.sem); | ||
98 | mm->context.size = 0; | ||
99 | old_mm = current->mm; | ||
100 | if (old_mm && old_mm->context.size > 0) { | ||
101 | down(&old_mm->context.sem); | ||
102 | retval = copy_ldt(&mm->context, &old_mm->context); | ||
103 | up(&old_mm->context.sem); | ||
104 | } | ||
105 | return retval; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * No need to lock the MM as we are the last user | ||
    | * | ||
    | * Frees @mm's LDT if it has one: clear_LDT() is called first when @mm is | ||
    | * the current task's active_mm, then the table is released with vfree() | ||
    | * or kfree() depending on whether it exceeds one page (the same size | ||
    | * test alloc_ldt() uses when allocating), and the size is reset to 0. | ||
110 | */ | ||
111 | void destroy_context(struct mm_struct *mm) | ||
112 | { | ||
113 | if (mm->context.size) { | ||
114 | if (mm == current->active_mm) | ||
115 | clear_LDT(); | ||
116 | if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
117 | vfree(mm->context.ldt); | ||
118 | else | ||
119 | kfree(mm->context.ldt); | ||
120 | mm->context.size = 0; | ||
121 | } | ||
122 | } | ||
123 | |||
124 | static int read_ldt(void __user * ptr, unsigned long bytecount) | ||
125 | { /* | ||
    | * Copies the current mm's LDT to the user buffer @ptr. | ||
    | * @bytecount is clamped to LDT_ENTRY_SIZE*LDT_ENTRIES; if the LDT is | ||
    | * shorter than that, the rest of the buffer is zero-filled. | ||
    | * | ||
    | * Returns 0 when the mm has no LDT, the (clamped) @bytecount on | ||
    | * success, or -EFAULT if the user buffer cannot be written. | ||
    | */ | ||
126 | int err; | ||
127 | unsigned long size; | ||
128 | struct mm_struct * mm = current->mm; | ||
129 | |||
130 | if (!mm->context.size) | ||
131 | return 0; | ||
132 | if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | ||
133 | bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | ||
134 | |||
135 | down(&mm->context.sem); | ||
136 | size = mm->context.size*LDT_ENTRY_SIZE; | ||
137 | if (size > bytecount) | ||
138 | size = bytecount; | ||
139 | |||
140 | err = 0; | ||
141 | if (copy_to_user(ptr, mm->context.ldt, size)) | ||
142 | err = -EFAULT; | ||
143 | up(&mm->context.sem); | ||
144 | if (err < 0) | ||
145 | goto error_return; | ||
146 | if (size != bytecount) { | ||
147 | /* zero-fill the rest */ | ||
148 | if (clear_user(ptr+size, bytecount-size) != 0) { | ||
149 | err = -EFAULT; | ||
150 | goto error_return; | ||
151 | } | ||
152 | } | ||
153 | return bytecount; | ||
154 | error_return: | ||
155 | return err; | ||
156 | } | ||
157 | |||
158 | static int read_default_ldt(void __user * ptr, unsigned long bytecount) | ||
159 | { /* | ||
    | * Zero-fills the user buffer @ptr for min(@bytecount, | ||
    | * 5*sizeof(struct desc_struct)) bytes. Returns the number of bytes | ||
    | * cleared, or -EFAULT if clear_user() reports a failure. | ||
    | */ | ||
160 | int err; | ||
161 | unsigned long size; | ||
162 | |||
163 | err = 0; /* overwritten below before it is read */ | ||
164 | size = 5*sizeof(struct desc_struct); | ||
165 | if (size > bytecount) | ||
166 | size = bytecount; | ||
167 | |||
168 | err = size; | ||
169 | if (clear_user(ptr, size)) | ||
170 | err = -EFAULT; | ||
171 | |||
172 | return err; | ||
173 | } | ||
174 | |||
175 | static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | ||
176 | { | ||
177 | struct mm_struct * mm = current->mm; | ||
178 | __u32 entry_1, entry_2; | ||
179 | int error; | ||
180 | struct user_desc ldt_info; | ||
181 | |||
182 | error = -EINVAL; | ||
183 | if (bytecount != sizeof(ldt_info)) | ||
184 | goto out; | ||
185 | error = -EFAULT; | ||
186 | if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) | ||
187 | goto out; | ||
188 | |||
189 | error = -EINVAL; | ||
190 | if (ldt_info.entry_number >= LDT_ENTRIES) | ||
191 | goto out; | ||
192 | if (ldt_info.contents == 3) { | ||
193 | if (oldmode) | ||
194 | goto out; | ||
195 | if (ldt_info.seg_not_present == 0) | ||
196 | goto out; | ||
197 | } | ||
198 | |||
199 | down(&mm->context.sem); | ||
200 | if (ldt_info.entry_number >= mm->context.size) { | ||
201 | error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | ||
202 | if (error < 0) | ||
203 | goto out_unlock; | ||
204 | } | ||
205 | |||
206 | /* Allow LDTs to be cleared by the user. */ | ||
207 | if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | ||
208 | if (oldmode || LDT_empty(&ldt_info)) { | ||
209 | entry_1 = 0; | ||
210 | entry_2 = 0; | ||
211 | goto install; | ||
212 | } | ||
213 | } | ||
214 | |||
215 | entry_1 = LDT_entry_a(&ldt_info); | ||
216 | entry_2 = LDT_entry_b(&ldt_info); | ||
217 | if (oldmode) | ||
218 | entry_2 &= ~(1 << 20); | ||
219 | |||
220 | /* Install the new entry ... */ | ||
221 | install: | ||
222 | write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); | ||
223 | error = 0; | ||
224 | |||
225 | out_unlock: | ||
226 | up(&mm->context.sem); | ||
227 | out: | ||
228 | return error; | ||
229 | } | ||
230 | |||
231 | asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | ||
232 | { /* | ||
    | * Dispatches on @func to the LDT helpers in this file, passing @ptr | ||
    | * and @bytecount through. Returns the helper's result, or -ENOSYS | ||
    | * for any @func value not listed below. | ||
    | */ | ||
233 | int ret = -ENOSYS; | ||
234 | |||
235 | switch (func) { | ||
236 | case 0: /* read the current LDT */ | ||
237 | ret = read_ldt(ptr, bytecount); | ||
238 | break; | ||
239 | case 1: /* write one entry, oldmode = 1 */ | ||
240 | ret = write_ldt(ptr, bytecount, 1); | ||
241 | break; | ||
242 | case 2: /* read the (zero-filled) default LDT */ | ||
243 | ret = read_default_ldt(ptr, bytecount); | ||
244 | break; | ||
245 | case 0x11: /* write one entry, oldmode = 0 */ | ||
246 | ret = write_ldt(ptr, bytecount, 0); | ||
247 | break; | ||
248 | } | ||
249 | return ret; | ||
250 | } | ||
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c new file mode 100644 index 000000000000..91966bafb3dc --- /dev/null +++ b/arch/x86/kernel/machine_kexec_32.c | |||
@@ -0,0 +1,171 @@ | |||
1 | /* | ||
2 | * machine_kexec.c - handle transition of Linux booting another kernel | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/kexec.h> | ||
11 | #include <linux/delay.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <asm/pgtable.h> | ||
14 | #include <asm/pgalloc.h> | ||
15 | #include <asm/tlbflush.h> | ||
16 | #include <asm/mmu_context.h> | ||
17 | #include <asm/io.h> | ||
18 | #include <asm/apic.h> | ||
19 | #include <asm/cpufeature.h> | ||
20 | #include <asm/desc.h> | ||
21 | #include <asm/system.h> | ||
22 | |||
23 | #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) | ||
24 | static u32 kexec_pgd[1024] PAGE_ALIGNED; /* page-aligned tables; machine_kexec() stores their physical and virtual addresses in page_list */ | ||
25 | #ifdef CONFIG_X86_PAE | ||
26 | static u32 kexec_pmd0[1024] PAGE_ALIGNED; /* the two pmd tables exist only with CONFIG_X86_PAE */ | ||
27 | static u32 kexec_pmd1[1024] PAGE_ALIGNED; | ||
28 | #endif | ||
29 | static u32 kexec_pte0[1024] PAGE_ALIGNED; | ||
30 | static u32 kexec_pte1[1024] PAGE_ALIGNED; | ||
31 | |||
32 | static void set_idt(void *newidt, __u16 limit) | ||
33 | { | ||
34 | struct Xgt_desc_struct curidt; | ||
35 | |||
36 | /* ia32 supports unaliged loads & stores */ | ||
37 | curidt.size = limit; | ||
38 | curidt.address = (unsigned long)newidt; | ||
39 | |||
40 | load_idt(&curidt); | ||
41 | }; | ||
42 | |||
43 | |||
44 | static void set_gdt(void *newgdt, __u16 limit) | ||
45 | { | ||
46 | struct Xgt_desc_struct curgdt; | ||
47 | |||
48 | /* ia32 supports unaligned loads & stores */ | ||
49 | curgdt.size = limit; | ||
50 | curgdt.address = (unsigned long)newgdt; | ||
51 | |||
52 | load_gdt(&curgdt); | ||
53 | }; | ||
54 | |||
55 | static void load_segments(void) | ||
56 | { /* | ||
   | * Reloads every segment register: a far jump through __KERNEL_CS | ||
   | * reloads %cs, then __KERNEL_DS is moved into %ds, %es, %fs, %gs and | ||
   | * %ss. The local __STR/STR macros turn the selector constants into | ||
   | * text for the asm template and are undefined again afterwards. | ||
   | */ | ||
57 | #define __STR(X) #X | ||
58 | #define STR(X) __STR(X) | ||
59 | |||
60 | __asm__ __volatile__ ( | ||
61 | "\tljmp $"STR(__KERNEL_CS)",$1f\n" | ||
62 | "\t1:\n" | ||
63 | "\tmovl $"STR(__KERNEL_DS)",%%eax\n" | ||
64 | "\tmovl %%eax,%%ds\n" | ||
65 | "\tmovl %%eax,%%es\n" | ||
66 | "\tmovl %%eax,%%fs\n" | ||
67 | "\tmovl %%eax,%%gs\n" | ||
68 | "\tmovl %%eax,%%ss\n" | ||
69 | ::: "eax", "memory"); | ||
70 | #undef STR | ||
71 | #undef __STR | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * An architecture hook called to validate the | ||
76 | * proposed image and prepare the control pages | ||
77 | * as needed. The pages for KEXEC_CONTROL_CODE_SIZE | ||
78 | * have been allocated, but the segments have not yet | ||
79 | * been copied into the kernel. | ||
80 | * | ||
81 | * Do whatever setup is needed on the image and the | ||
82 | * reboot code buffer to allow us to avoid allocations | ||
83 | * later. | ||
84 | * | ||
85 | * Currently nothing; always returns 0. | ||
86 | */ | ||
87 | int machine_kexec_prepare(struct kimage *image) | ||
88 | { | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * Undo anything leftover by machine_kexec_prepare | ||
94 | * when an image is freed. | ||
   | * | ||
   | * Currently empty, matching machine_kexec_prepare(), which does nothing. | ||
95 | */ | ||
96 | void machine_kexec_cleanup(struct kimage *image) | ||
97 | { | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Do not allocate memory (or fail in any way) in machine_kexec(). | ||
102 | * We are past the point of no return, committed to rebooting now. | ||
    | * | ||
    | * Sequence: disable interrupts, copy PAGE_SIZE bytes starting at | ||
    | * relocate_kernel into the image's control page, fill page_list with the | ||
    | * physical and virtual addresses of the control page and the static | ||
    | * kexec page tables, reload the segment registers, point the GDT and IDT | ||
    | * at phys_to_virt(0) with a zero limit, and finally call | ||
    | * relocate_kernel(). | ||
103 | */ | ||
104 | NORET_TYPE void machine_kexec(struct kimage *image) | ||
105 | { | ||
106 | unsigned long page_list[PAGES_NR]; | ||
107 | void *control_page; | ||
108 | |||
109 | /* Interrupts aren't acceptable while we reboot */ | ||
110 | local_irq_disable(); | ||
111 | |||
112 | control_page = page_address(image->control_code_page); | ||
113 | memcpy(control_page, relocate_kernel, PAGE_SIZE); | ||
114 | |||
115 | page_list[PA_CONTROL_PAGE] = __pa(control_page); | ||
116 | page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; | ||
117 | page_list[PA_PGD] = __pa(kexec_pgd); | ||
118 | page_list[VA_PGD] = (unsigned long)kexec_pgd; | ||
119 | #ifdef CONFIG_X86_PAE | ||
120 | page_list[PA_PMD_0] = __pa(kexec_pmd0); | ||
121 | page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; | ||
122 | page_list[PA_PMD_1] = __pa(kexec_pmd1); | ||
123 | page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; | ||
124 | #endif | ||
125 | page_list[PA_PTE_0] = __pa(kexec_pte0); | ||
126 | page_list[VA_PTE_0] = (unsigned long)kexec_pte0; | ||
127 | page_list[PA_PTE_1] = __pa(kexec_pte1); | ||
128 | page_list[VA_PTE_1] = (unsigned long)kexec_pte1; | ||
129 | |||
130 | /* The segment registers are funny things, they have both a | ||
131 | * visible and an invisible part. Whenever the visible part is | ||
132 | * set to a specific selector, the invisible part is loaded | ||
133 | * from a table in memory. At no other time is the | ||
134 | * descriptor table in memory accessed. | ||
135 | * | ||
136 | * I take advantage of this here by force loading the | ||
137 | * segments, before I zap the gdt with an invalid value. | ||
138 | */ | ||
139 | load_segments(); | ||
140 | /* The gdt & idt are now invalid. | ||
141 | * If you want to load them you must set up your own idt & gdt. | ||
142 | */ | ||
143 | set_gdt(phys_to_virt(0),0); | ||
144 | set_idt(phys_to_virt(0),0); | ||
145 | |||
146 | /* now call it */ | ||
147 | relocate_kernel((unsigned long)image->head, (unsigned long)page_list, | ||
148 | image->start, cpu_has_pae); | ||
149 | } | ||
150 | |||
151 | /* crashkernel=size@addr specifies the location to reserve for | ||
152 | * a crash kernel. By reserving this memory we guarantee | ||
153 | * that linux never sets it up as a DMA target. | ||
154 | * Useful for holding code to do something appropriate | ||
155 | * after a kernel panic. | ||
    | * | ||
    | * Both numbers are parsed with memparse(). crashk_res is only updated | ||
    | * when the size is followed by '@'; otherwise the argument is ignored. | ||
    | * Always returns 0. | ||
156 | */ | ||
157 | static int __init parse_crashkernel(char *arg) | ||
158 | { | ||
159 | unsigned long size, base; | ||
160 | size = memparse(arg, &arg); | ||
161 | if (*arg == '@') { | ||
162 | base = memparse(arg+1, &arg); | ||
163 | /* FIXME: Do I want a sanity check | ||
164 | * to validate the memory range? | ||
165 | */ | ||
166 | crashk_res.start = base; | ||
167 | crashk_res.end = base + size - 1; /* end is inclusive */ | ||
168 | } | ||
169 | return 0; | ||
170 | } | ||
171 | early_param("crashkernel", parse_crashkernel); | ||
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c new file mode 100644 index 000000000000..b83672b89527 --- /dev/null +++ b/arch/x86/kernel/mca_32.c | |||
@@ -0,0 +1,470 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/mca.c | ||
3 | * Written by Martin Kolinek, February 1996 | ||
4 | * | ||
5 | * Changes: | ||
6 | * | ||
7 | * Chris Beauregard July 28th, 1996 | ||
8 | * - Fixed up integrated SCSI detection | ||
9 | * | ||
10 | * Chris Beauregard August 3rd, 1996 | ||
11 | * - Made mca_info local | ||
12 | * - Made integrated registers accessible through standard function calls | ||
13 | * - Added name field | ||
14 | * - More sanity checking | ||
15 | * | ||
16 | * Chris Beauregard August 9th, 1996 | ||
17 | * - Rewrote /proc/mca | ||
18 | * | ||
19 | * Chris Beauregard January 7th, 1997 | ||
20 | * - Added basic NMI-processing | ||
21 | * - Added more information to mca_info structure | ||
22 | * | ||
23 | * David Weinehall October 12th, 1998 | ||
24 | * - Made a lot of cleaning up in the source | ||
25 | * - Added use of save_flags / restore_flags | ||
26 | * - Added the 'driver_loaded' flag in MCA_adapter | ||
27 | * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter | ||
28 | * | ||
29 | * David Weinehall March 24th, 1999 | ||
30 | * - Fixed the output of 'Driver Installed' in /proc/mca/pos | ||
31 | * - Made the Integrated Video & SCSI show up even if they have id 0000 | ||
32 | * | ||
33 | * Alexander Viro November 9th, 1999 | ||
34 | * - Switched to regular procfs methods | ||
35 | * | ||
36 | * Alfred Arnold & David Weinehall August 23rd, 2000 | ||
37 | * - Added support for Planar POS-registers | ||
38 | */ | ||
39 | |||
40 | #include <linux/module.h> | ||
41 | #include <linux/types.h> | ||
42 | #include <linux/errno.h> | ||
43 | #include <linux/kernel.h> | ||
44 | #include <linux/mca.h> | ||
45 | #include <linux/kprobes.h> | ||
46 | #include <asm/system.h> | ||
47 | #include <asm/io.h> | ||
48 | #include <linux/proc_fs.h> | ||
49 | #include <linux/mman.h> | ||
50 | #include <linux/mm.h> | ||
51 | #include <linux/pagemap.h> | ||
52 | #include <linux/ioport.h> | ||
53 | #include <asm/uaccess.h> | ||
54 | #include <linux/init.h> | ||
55 | #include <asm/arch_hooks.h> | ||
56 | |||
57 | static unsigned char which_scsi = 0; | ||
58 | |||
59 | int MCA_bus = 0; | ||
60 | EXPORT_SYMBOL(MCA_bus); | ||
61 | |||
62 | /* | ||
63 | * Motherboard register spinlock. Untested on SMP at the moment, but | ||
64 | * are there any MCA SMP boxes? | ||
65 | * | ||
66 | * Yes - Alan | ||
67 | */ | ||
68 | static DEFINE_SPINLOCK(mca_lock); | ||
69 | |||
70 | /* Build the status info for the adapter */ | ||
71 | |||
72 | static void mca_configure_adapter_status(struct mca_device *mca_dev) { | ||
73 | mca_dev->status = MCA_ADAPTER_NONE; | ||
74 | |||
75 | mca_dev->pos_id = mca_dev->pos[0] | ||
76 | + (mca_dev->pos[1] << 8); | ||
77 | |||
78 | if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) { | ||
79 | |||
80 | /* id = 0x0000 usually indicates hardware failure, | ||
81 | * however, ZP Gu (zpg@castle.net> reports that his 9556 | ||
82 | * has 0x0000 as id and everything still works. There | ||
83 | * also seem to be an adapter with id = 0x0000; the | ||
84 | * NCR Parallel Bus Memory Card. Until this is confirmed, | ||
85 | * however, this code will stay. | ||
86 | */ | ||
87 | |||
88 | mca_dev->status = MCA_ADAPTER_ERROR; | ||
89 | |||
90 | return; | ||
91 | } else if(mca_dev->pos_id != 0xffff) { | ||
92 | |||
93 | /* 0xffff usually indicates that there's no adapter, | ||
94 | * however, some integrated adapters may have 0xffff as | ||
95 | * their id and still be valid. Examples are on-board | ||
96 | * VGA of the 55sx, the integrated SCSI of the 56 & 57, | ||
97 | * and possibly also the 95 ULTIMEDIA. | ||
98 | */ | ||
99 | |||
100 | mca_dev->status = MCA_ADAPTER_NORMAL; | ||
101 | } | ||
102 | |||
103 | if((mca_dev->pos_id == 0xffff || | ||
104 | mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) { | ||
105 | int j; | ||
106 | |||
107 | for(j = 2; j < 8; j++) { | ||
108 | if(mca_dev->pos[j] != 0xff) { | ||
109 | mca_dev->status = MCA_ADAPTER_NORMAL; | ||
110 | break; | ||
111 | } | ||
112 | } | ||
113 | } | ||
114 | |||
115 | if(!(mca_dev->pos[2] & MCA_ENABLED)) { | ||
116 | |||
117 | /* enabled bit is in POS 2 */ | ||
118 | |||
119 | mca_dev->status = MCA_ADAPTER_DISABLED; | ||
120 | } | ||
121 | } /* mca_configure_adapter_status */ | ||
122 | |||
123 | /*--------------------------------------------------------------------*/ | ||
124 | |||
125 | static struct resource mca_standard_resources[] = { | ||
126 | { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" }, | ||
127 | { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" }, | ||
128 | { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" }, | ||
129 | { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" }, | ||
130 | { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" }, | ||
131 | { .start = 0x96, .end = 0x97, .name = "POS (MCA)" }, | ||
132 | { .start = 0x100, .end = 0x107, .name = "POS (MCA)" } | ||
133 | }; | ||
134 | |||
135 | #define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources) | ||
136 | |||
/**
 *	mca_read_and_store_pos - read the POS registers into a memory buffer
 *	@pos: a char pointer to 8 bytes, receives the eight POS register
 *	      values
 *
 *	Reads POS registers 0..7 in ascending order.  The caller is
 *	expected to have selected the device to be read beforehand.
 *
 *	Returns 1 if a card actually exists (i.e. the pos isn't
 *	all 0xff) or 0 otherwise
 */
static int mca_read_and_store_pos(unsigned char *pos)
{
	int present = 0;
	int reg = 0;

	while (reg < 8) {
		unsigned char value = inb_p(MCA_POS_REG(reg));

		pos[reg] = value;

		/* 0xff all across means no device.  0x00 means
		 * something's broken, but a device is
		 * probably there.  However, if you get 0x00
		 * from a motherboard register it won't matter
		 * what we find.  For the record, on the
		 * 57SLC, the integrated SCSI adapter has
		 * 0xffff for the adapter ID, but nonzero for
		 * other registers.
		 */
		if (value != 0xff)
			present = 1;

		reg++;
	}

	return present;
}
165 | |||
166 | static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg) | ||
167 | { | ||
168 | unsigned char byte; | ||
169 | unsigned long flags; | ||
170 | |||
171 | if(reg < 0 || reg >= 8) | ||
172 | return 0; | ||
173 | |||
174 | spin_lock_irqsave(&mca_lock, flags); | ||
175 | if(mca_dev->pos_register) { | ||
176 | /* Disable adapter setup, enable motherboard setup */ | ||
177 | |||
178 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
179 | outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); | ||
180 | |||
181 | byte = inb_p(MCA_POS_REG(reg)); | ||
182 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
183 | } else { | ||
184 | |||
185 | /* Make sure motherboard setup is off */ | ||
186 | |||
187 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
188 | |||
189 | /* Read the appropriate register */ | ||
190 | |||
191 | outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG); | ||
192 | byte = inb_p(MCA_POS_REG(reg)); | ||
193 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
194 | } | ||
195 | spin_unlock_irqrestore(&mca_lock, flags); | ||
196 | |||
197 | mca_dev->pos[reg] = byte; | ||
198 | |||
199 | return byte; | ||
200 | } | ||
201 | |||
202 | static void mca_pc_write_pos(struct mca_device *mca_dev, int reg, | ||
203 | unsigned char byte) | ||
204 | { | ||
205 | unsigned long flags; | ||
206 | |||
207 | if(reg < 0 || reg >= 8) | ||
208 | return; | ||
209 | |||
210 | spin_lock_irqsave(&mca_lock, flags); | ||
211 | |||
212 | /* Make sure motherboard setup is off */ | ||
213 | |||
214 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
215 | |||
216 | /* Read in the appropriate register */ | ||
217 | |||
218 | outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG); | ||
219 | outb_p(byte, MCA_POS_REG(reg)); | ||
220 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
221 | |||
222 | spin_unlock_irqrestore(&mca_lock, flags); | ||
223 | |||
224 | /* Update the global register list, while we have the byte */ | ||
225 | |||
226 | mca_dev->pos[reg] = byte; | ||
227 | |||
228 | } | ||
229 | |||
/*
 * For the primary MCA bus, we have identity transforms: each helper
 * ignores the device and hands its second argument back unchanged.
 */

/* irq number as seen by the device -> irq number, unchanged */
static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
{
	return irq;
}

/* I/O port as seen by the device -> I/O port, unchanged */
static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
{
	return port;
}

/* memory address as seen by the device -> address, unchanged */
static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
{
	return mem;
}
245 | |||
246 | |||
247 | static int __init mca_init(void) | ||
248 | { | ||
249 | unsigned int i, j; | ||
250 | struct mca_device *mca_dev; | ||
251 | unsigned char pos[8]; | ||
252 | short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00}; | ||
253 | struct mca_bus *bus; | ||
254 | |||
255 | /* WARNING: Be careful when making changes here. Putting an adapter | ||
256 | * and the motherboard simultaneously into setup mode may result in | ||
257 | * damage to chips (according to The Indispensible PC Hardware Book | ||
258 | * by Hans-Peter Messmer). Also, we disable system interrupts (so | ||
259 | * that we are not disturbed in the middle of this). | ||
260 | */ | ||
261 | |||
262 | /* Make sure the MCA bus is present */ | ||
263 | |||
264 | if (mca_system_init()) { | ||
265 | printk(KERN_ERR "MCA bus system initialisation failed\n"); | ||
266 | return -ENODEV; | ||
267 | } | ||
268 | |||
269 | if (!MCA_bus) | ||
270 | return -ENODEV; | ||
271 | |||
272 | printk(KERN_INFO "Micro Channel bus detected.\n"); | ||
273 | |||
274 | /* All MCA systems have at least a primary bus */ | ||
275 | bus = mca_attach_bus(MCA_PRIMARY_BUS); | ||
276 | if (!bus) | ||
277 | goto out_nomem; | ||
278 | bus->default_dma_mask = 0xffffffffLL; | ||
279 | bus->f.mca_write_pos = mca_pc_write_pos; | ||
280 | bus->f.mca_read_pos = mca_pc_read_pos; | ||
281 | bus->f.mca_transform_irq = mca_dummy_transform_irq; | ||
282 | bus->f.mca_transform_ioport = mca_dummy_transform_ioport; | ||
283 | bus->f.mca_transform_memory = mca_dummy_transform_memory; | ||
284 | |||
285 | /* get the motherboard device */ | ||
286 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); | ||
287 | if(unlikely(!mca_dev)) | ||
288 | goto out_nomem; | ||
289 | |||
290 | /* | ||
291 | * We do not expect many MCA interrupts during initialization, | ||
292 | * but let us be safe: | ||
293 | */ | ||
294 | spin_lock_irq(&mca_lock); | ||
295 | |||
296 | /* Make sure adapter setup is off */ | ||
297 | |||
298 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
299 | |||
300 | /* Read motherboard POS registers */ | ||
301 | |||
302 | mca_dev->pos_register = 0x7f; | ||
303 | outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); | ||
304 | mca_dev->name[0] = 0; | ||
305 | mca_read_and_store_pos(mca_dev->pos); | ||
306 | mca_configure_adapter_status(mca_dev); | ||
307 | /* fake POS and slot for a motherboard */ | ||
308 | mca_dev->pos_id = MCA_MOTHERBOARD_POS; | ||
309 | mca_dev->slot = MCA_MOTHERBOARD; | ||
310 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
311 | |||
312 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); | ||
313 | if(unlikely(!mca_dev)) | ||
314 | goto out_unlock_nomem; | ||
315 | |||
316 | /* Put motherboard into video setup mode, read integrated video | ||
317 | * POS registers, and turn motherboard setup off. | ||
318 | */ | ||
319 | |||
320 | mca_dev->pos_register = 0xdf; | ||
321 | outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); | ||
322 | mca_dev->name[0] = 0; | ||
323 | mca_read_and_store_pos(mca_dev->pos); | ||
324 | mca_configure_adapter_status(mca_dev); | ||
325 | /* fake POS and slot for the integrated video */ | ||
326 | mca_dev->pos_id = MCA_INTEGVIDEO_POS; | ||
327 | mca_dev->slot = MCA_INTEGVIDEO; | ||
328 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
329 | |||
330 | /* Put motherboard into scsi setup mode, read integrated scsi | ||
331 | * POS registers, and turn motherboard setup off. | ||
332 | * | ||
333 | * It seems there are two possible SCSI registers. Martin says that | ||
334 | * for the 56,57, 0xf7 is the one, but fails on the 76. | ||
335 | * Alfredo (apena@vnet.ibm.com) says | ||
336 | * 0xfd works on his machine. We'll try both of them. I figure it's | ||
337 | * a good bet that only one could be valid at a time. This could | ||
338 | * screw up though if one is used for something else on the other | ||
339 | * machine. | ||
340 | */ | ||
341 | |||
342 | for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) { | ||
343 | outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG); | ||
344 | if(mca_read_and_store_pos(pos)) | ||
345 | break; | ||
346 | } | ||
347 | if(which_scsi) { | ||
348 | /* found a scsi card */ | ||
349 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); | ||
350 | if(unlikely(!mca_dev)) | ||
351 | goto out_unlock_nomem; | ||
352 | |||
353 | for(j = 0; j < 8; j++) | ||
354 | mca_dev->pos[j] = pos[j]; | ||
355 | |||
356 | mca_configure_adapter_status(mca_dev); | ||
357 | /* fake POS and slot for integrated SCSI controller */ | ||
358 | mca_dev->pos_id = MCA_INTEGSCSI_POS; | ||
359 | mca_dev->slot = MCA_INTEGSCSI; | ||
360 | mca_dev->pos_register = which_scsi; | ||
361 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
362 | } | ||
363 | |||
364 | /* Turn off motherboard setup */ | ||
365 | |||
366 | outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); | ||
367 | |||
368 | /* Now loop over MCA slots: put each adapter into setup mode, and | ||
369 | * read its POS registers. Then put adapter setup off. | ||
370 | */ | ||
371 | |||
372 | for(i=0; i<MCA_MAX_SLOT_NR; i++) { | ||
373 | outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG); | ||
374 | if(!mca_read_and_store_pos(pos)) | ||
375 | continue; | ||
376 | |||
377 | mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); | ||
378 | if(unlikely(!mca_dev)) | ||
379 | goto out_unlock_nomem; | ||
380 | |||
381 | for(j=0; j<8; j++) | ||
382 | mca_dev->pos[j]=pos[j]; | ||
383 | |||
384 | mca_dev->driver_loaded = 0; | ||
385 | mca_dev->slot = i; | ||
386 | mca_dev->pos_register = 0; | ||
387 | mca_configure_adapter_status(mca_dev); | ||
388 | mca_register_device(MCA_PRIMARY_BUS, mca_dev); | ||
389 | } | ||
390 | outb_p(0, MCA_ADAPTER_SETUP_REG); | ||
391 | |||
392 | /* Enable interrupts and return memory start */ | ||
393 | spin_unlock_irq(&mca_lock); | ||
394 | |||
395 | for (i = 0; i < MCA_STANDARD_RESOURCES; i++) | ||
396 | request_resource(&ioport_resource, mca_standard_resources + i); | ||
397 | |||
398 | mca_do_proc_init(); | ||
399 | |||
400 | return 0; | ||
401 | |||
402 | out_unlock_nomem: | ||
403 | spin_unlock_irq(&mca_lock); | ||
404 | out_nomem: | ||
405 | printk(KERN_EMERG "Failed memory allocation in MCA setup!\n"); | ||
406 | return -ENOMEM; | ||
407 | } | ||
408 | |||
409 | subsys_initcall(mca_init); | ||
410 | |||
411 | /*--------------------------------------------------------------------*/ | ||
412 | |||
413 | static __kprobes void | ||
414 | mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) | ||
415 | { | ||
416 | int slot = mca_dev->slot; | ||
417 | |||
418 | if(slot == MCA_INTEGSCSI) { | ||
419 | printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n", | ||
420 | mca_dev->name); | ||
421 | } else if(slot == MCA_INTEGVIDEO) { | ||
422 | printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n", | ||
423 | mca_dev->name); | ||
424 | } else if(slot == MCA_MOTHERBOARD) { | ||
425 | printk(KERN_CRIT "NMI: caused by motherboard (%s)\n", | ||
426 | mca_dev->name); | ||
427 | } | ||
428 | |||
429 | /* More info available in POS 6 and 7? */ | ||
430 | |||
431 | if(check_flag) { | ||
432 | unsigned char pos6, pos7; | ||
433 | |||
434 | pos6 = mca_device_read_pos(mca_dev, 6); | ||
435 | pos7 = mca_device_read_pos(mca_dev, 7); | ||
436 | |||
437 | printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7); | ||
438 | } | ||
439 | |||
440 | } /* mca_handle_nmi_slot */ | ||
441 | |||
442 | /*--------------------------------------------------------------------*/ | ||
443 | |||
444 | static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) | ||
445 | { | ||
446 | struct mca_device *mca_dev = to_mca_device(dev); | ||
447 | unsigned char pos5; | ||
448 | |||
449 | pos5 = mca_device_read_pos(mca_dev, 5); | ||
450 | |||
451 | if(!(pos5 & 0x80)) { | ||
452 | /* Bit 7 of POS 5 is reset when this adapter has a hardware | ||
453 | * error. Bit 7 it reset if there's error information | ||
454 | * available in POS 6 and 7. | ||
455 | */ | ||
456 | mca_handle_nmi_device(mca_dev, !(pos5 & 0x40)); | ||
457 | return 1; | ||
458 | } | ||
459 | return 0; | ||
460 | } | ||
461 | |||
462 | void __kprobes mca_handle_nmi(void) | ||
463 | { | ||
464 | /* First try - scan the various adapters and see if a specific | ||
465 | * adapter was responsible for the error. | ||
466 | */ | ||
467 | bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); | ||
468 | |||
469 | mca_nmi_hook(); | ||
470 | } /* mca_handle_nmi */ | ||
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c new file mode 100644 index 000000000000..09cf78110358 --- /dev/null +++ b/arch/x86/kernel/microcode.c | |||
@@ -0,0 +1,850 @@ | |||
1 | /* | ||
2 | * Intel CPU Microcode Update Driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> | ||
5 | * 2006 Shaohua Li <shaohua.li@intel.com> | ||
6 | * | ||
7 | * This driver allows to upgrade microcode on Intel processors | ||
8 | * belonging to IA-32 family - PentiumPro, Pentium II, | ||
9 | * Pentium III, Xeon, Pentium 4, etc. | ||
10 | * | ||
11 | * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, | ||
12 | * Order Number 245472 or free download from: | ||
13 | * | ||
14 | * http://developer.intel.com/design/pentium4/manuals/245472.htm | ||
15 | * | ||
16 | * For more information, go to http://www.urbanmyth.org/microcode | ||
17 | * | ||
18 | * This program is free software; you can redistribute it and/or | ||
19 | * modify it under the terms of the GNU General Public License | ||
20 | * as published by the Free Software Foundation; either version | ||
21 | * 2 of the License, or (at your option) any later version. | ||
22 | * | ||
23 | * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
24 | * Initial release. | ||
25 | * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
26 | * Added read() support + cleanups. | ||
27 | * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
28 | * Added 'device trimming' support. open(O_WRONLY) zeroes | ||
29 | * and frees the saved copy of applied microcode. | ||
30 | * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
31 | * Made to use devfs (/dev/cpu/microcode) + cleanups. | ||
32 | * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
33 | * Added misc device support (now uses both devfs and misc). | ||
34 | * Added MICROCODE_IOCFREE ioctl to clear memory. | ||
35 | * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
36 | * Messages for error cases (non Intel & no suitable microcode). | ||
37 | * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> | ||
38 | * Removed ->release(). Removed exclusive open and status bitmap. | ||
39 | * Added microcode_rwsem to serialize read()/write()/ioctl(). | ||
40 | * Removed global kernel lock usage. | ||
41 | * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> | ||
42 | * Write 0 to 0x8B msr and then cpuid before reading revision, | ||
43 | * so that it works even if there were no update done by the | ||
44 | * BIOS. Otherwise, reading from 0x8B gives junk (which happened | ||
45 | * to be 0 on my machine which is why it worked even when I | ||
46 | * disabled update by the BIOS) | ||
47 | * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. | ||
48 | * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and | ||
49 | * Tigran Aivazian <tigran@veritas.com> | ||
50 | * Intel Pentium 4 processor support and bugfixes. | ||
51 | * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> | ||
52 | * Bugfix for HT (Hyper-Threading) enabled processors | ||
53 | * whereby processor resources are shared by all logical processors | ||
54 | * in a single CPU package. | ||
55 | * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and | ||
56 | * Tigran Aivazian <tigran@veritas.com>, | ||
57 | * Serialize updates as required on HT processors due to speculative | ||
58 | * nature of implementation. | ||
59 | * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> | ||
60 | * Fix the panic when writing zero-length microcode chunk. | ||
61 | * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, | ||
62 | * Jun Nakajima <jun.nakajima@intel.com> | ||
63 | * Support for the microcode updates in the new format. | ||
64 | * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> | ||
65 | * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl | ||
66 | * because we no longer hold a copy of applied microcode | ||
67 | * in kernel memory. | ||
68 | * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> | ||
69 | * Fix sigmatch() macro to handle old CPUs with pf == 0. | ||
70 | * Thanks to Stuart Swales for pointing out this bug. | ||
71 | */ | ||
72 | |||
73 | //#define DEBUG /* pr_debug */ | ||
74 | #include <linux/capability.h> | ||
75 | #include <linux/kernel.h> | ||
76 | #include <linux/init.h> | ||
77 | #include <linux/sched.h> | ||
78 | #include <linux/cpumask.h> | ||
79 | #include <linux/module.h> | ||
80 | #include <linux/slab.h> | ||
81 | #include <linux/vmalloc.h> | ||
82 | #include <linux/miscdevice.h> | ||
83 | #include <linux/spinlock.h> | ||
84 | #include <linux/mm.h> | ||
85 | #include <linux/fs.h> | ||
86 | #include <linux/mutex.h> | ||
87 | #include <linux/cpu.h> | ||
88 | #include <linux/firmware.h> | ||
89 | #include <linux/platform_device.h> | ||
90 | |||
91 | #include <asm/msr.h> | ||
92 | #include <asm/uaccess.h> | ||
93 | #include <asm/processor.h> | ||
94 | |||
95 | MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); | ||
96 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); | ||
97 | MODULE_LICENSE("GPL"); | ||
98 | |||
99 | #define MICROCODE_VERSION "1.14a" | ||
100 | |||
/* Sizes assumed when the corresponding header field is zero */
#define DEFAULT_UCODE_DATASIZE 	(2000) 	  /* 2000 bytes */
#define MC_HEADER_SIZE		(sizeof (microcode_header_t))  	  /* 48 bytes */
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
#define EXT_HEADER_SIZE		(sizeof (struct extended_sigtable)) /* 20 bytes */
#define EXT_SIGNATURE_SIZE	(sizeof (struct extended_signature)) /* 12 bytes */
#define DWSIZE			(sizeof (u32))

/*
 * Total / data size of the update that @mc points at, falling back to the
 * defaults above when the header field is zero.  @mc may be any pointer
 * expression; it is parenthesized so the cast applies to the whole
 * expression.  NOTE: @mc is evaluated twice.
 */
#define get_totalsize(mc) \
	(((microcode_t *)(mc))->hdr.totalsize ? \
	 ((microcode_t *)(mc))->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
#define get_datasize(mc) \
	(((microcode_t *)(mc))->hdr.datasize ? \
	 ((microcode_t *)(mc))->hdr.datasize : DEFAULT_UCODE_DATASIZE)

/*
 * True when signatures s1/s2 are equal and the processor-flag masks
 * p1/p2 share a bit, or are both zero (old CPUs with pf == 0).
 */
#define sigmatch(s1, s2, p1, p2) \
	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))

/* size in bytes of an extended signature table holding et->count entries */
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
118 | |||
119 | /* serialize access to the physical write to MSR 0x79 */ | ||
120 | static DEFINE_SPINLOCK(microcode_update_lock); | ||
121 | |||
122 | /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ | ||
123 | static DEFINE_MUTEX(microcode_mutex); | ||
124 | |||
125 | static struct ucode_cpu_info { | ||
126 | int valid; | ||
127 | unsigned int sig; | ||
128 | unsigned int pf; | ||
129 | unsigned int rev; | ||
130 | microcode_t *mc; | ||
131 | } ucode_cpu_info[NR_CPUS]; | ||
132 | |||
133 | static void collect_cpu_info(int cpu_num) | ||
134 | { | ||
135 | struct cpuinfo_x86 *c = cpu_data + cpu_num; | ||
136 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | ||
137 | unsigned int val[2]; | ||
138 | |||
139 | /* We should bind the task to the CPU */ | ||
140 | BUG_ON(raw_smp_processor_id() != cpu_num); | ||
141 | uci->pf = uci->rev = 0; | ||
142 | uci->mc = NULL; | ||
143 | uci->valid = 1; | ||
144 | |||
145 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | ||
146 | cpu_has(c, X86_FEATURE_IA64)) { | ||
147 | printk(KERN_ERR "microcode: CPU%d not a capable Intel " | ||
148 | "processor\n", cpu_num); | ||
149 | uci->valid = 0; | ||
150 | return; | ||
151 | } | ||
152 | |||
153 | uci->sig = cpuid_eax(0x00000001); | ||
154 | |||
155 | if ((c->x86_model >= 5) || (c->x86 > 6)) { | ||
156 | /* get processor flags from MSR 0x17 */ | ||
157 | rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); | ||
158 | uci->pf = 1 << ((val[1] >> 18) & 7); | ||
159 | } | ||
160 | |||
161 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
162 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
163 | sync_core(); | ||
164 | /* get the current revision from MSR 0x8B */ | ||
165 | rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); | ||
166 | pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", | ||
167 | uci->sig, uci->pf, uci->rev); | ||
168 | } | ||
169 | |||
170 | static inline int microcode_update_match(int cpu_num, | ||
171 | microcode_header_t *mc_header, int sig, int pf) | ||
172 | { | ||
173 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | ||
174 | |||
175 | if (!sigmatch(sig, uci->sig, pf, uci->pf) | ||
176 | || mc_header->rev <= uci->rev) | ||
177 | return 0; | ||
178 | return 1; | ||
179 | } | ||
180 | |||
181 | static int microcode_sanity_check(void *mc) | ||
182 | { | ||
183 | microcode_header_t *mc_header = mc; | ||
184 | struct extended_sigtable *ext_header = NULL; | ||
185 | struct extended_signature *ext_sig; | ||
186 | unsigned long total_size, data_size, ext_table_size; | ||
187 | int sum, orig_sum, ext_sigcount = 0, i; | ||
188 | |||
189 | total_size = get_totalsize(mc_header); | ||
190 | data_size = get_datasize(mc_header); | ||
191 | if (data_size + MC_HEADER_SIZE > total_size) { | ||
192 | printk(KERN_ERR "microcode: error! " | ||
193 | "Bad data size in microcode data file\n"); | ||
194 | return -EINVAL; | ||
195 | } | ||
196 | |||
197 | if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { | ||
198 | printk(KERN_ERR "microcode: error! " | ||
199 | "Unknown microcode update format\n"); | ||
200 | return -EINVAL; | ||
201 | } | ||
202 | ext_table_size = total_size - (MC_HEADER_SIZE + data_size); | ||
203 | if (ext_table_size) { | ||
204 | if ((ext_table_size < EXT_HEADER_SIZE) | ||
205 | || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { | ||
206 | printk(KERN_ERR "microcode: error! " | ||
207 | "Small exttable size in microcode data file\n"); | ||
208 | return -EINVAL; | ||
209 | } | ||
210 | ext_header = mc + MC_HEADER_SIZE + data_size; | ||
211 | if (ext_table_size != exttable_size(ext_header)) { | ||
212 | printk(KERN_ERR "microcode: error! " | ||
213 | "Bad exttable size in microcode data file\n"); | ||
214 | return -EFAULT; | ||
215 | } | ||
216 | ext_sigcount = ext_header->count; | ||
217 | } | ||
218 | |||
219 | /* check extended table checksum */ | ||
220 | if (ext_table_size) { | ||
221 | int ext_table_sum = 0; | ||
222 | int *ext_tablep = (int *)ext_header; | ||
223 | |||
224 | i = ext_table_size / DWSIZE; | ||
225 | while (i--) | ||
226 | ext_table_sum += ext_tablep[i]; | ||
227 | if (ext_table_sum) { | ||
228 | printk(KERN_WARNING "microcode: aborting, " | ||
229 | "bad extended signature table checksum\n"); | ||
230 | return -EINVAL; | ||
231 | } | ||
232 | } | ||
233 | |||
234 | /* calculate the checksum */ | ||
235 | orig_sum = 0; | ||
236 | i = (MC_HEADER_SIZE + data_size) / DWSIZE; | ||
237 | while (i--) | ||
238 | orig_sum += ((int *)mc)[i]; | ||
239 | if (orig_sum) { | ||
240 | printk(KERN_ERR "microcode: aborting, bad checksum\n"); | ||
241 | return -EINVAL; | ||
242 | } | ||
243 | if (!ext_table_size) | ||
244 | return 0; | ||
245 | /* check extended signature checksum */ | ||
246 | for (i = 0; i < ext_sigcount; i++) { | ||
247 | ext_sig = (struct extended_signature *)((void *)ext_header | ||
248 | + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); | ||
249 | sum = orig_sum | ||
250 | - (mc_header->sig + mc_header->pf + mc_header->cksum) | ||
251 | + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); | ||
252 | if (sum) { | ||
253 | printk(KERN_ERR "microcode: aborting, bad checksum\n"); | ||
254 | return -EINVAL; | ||
255 | } | ||
256 | } | ||
257 | return 0; | ||
258 | } | ||
259 | |||
260 | /* | ||
261 | * return 0 - no update found | ||
262 | * return 1 - found update | ||
263 | * return < 0 - error | ||
264 | */ | ||
265 | static int get_maching_microcode(void *mc, int cpu) | ||
266 | { | ||
267 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
268 | microcode_header_t *mc_header = mc; | ||
269 | struct extended_sigtable *ext_header; | ||
270 | unsigned long total_size = get_totalsize(mc_header); | ||
271 | int ext_sigcount, i; | ||
272 | struct extended_signature *ext_sig; | ||
273 | void *new_mc; | ||
274 | |||
275 | if (microcode_update_match(cpu, mc_header, | ||
276 | mc_header->sig, mc_header->pf)) | ||
277 | goto find; | ||
278 | |||
279 | if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) | ||
280 | return 0; | ||
281 | |||
282 | ext_header = (struct extended_sigtable *)(mc + | ||
283 | get_datasize(mc_header) + MC_HEADER_SIZE); | ||
284 | ext_sigcount = ext_header->count; | ||
285 | ext_sig = (struct extended_signature *)((void *)ext_header | ||
286 | + EXT_HEADER_SIZE); | ||
287 | for (i = 0; i < ext_sigcount; i++) { | ||
288 | if (microcode_update_match(cpu, mc_header, | ||
289 | ext_sig->sig, ext_sig->pf)) | ||
290 | goto find; | ||
291 | ext_sig++; | ||
292 | } | ||
293 | return 0; | ||
294 | find: | ||
295 | pr_debug("microcode: CPU %d found a matching microcode update with" | ||
296 | " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); | ||
297 | new_mc = vmalloc(total_size); | ||
298 | if (!new_mc) { | ||
299 | printk(KERN_ERR "microcode: error! Can not allocate memory\n"); | ||
300 | return -ENOMEM; | ||
301 | } | ||
302 | |||
303 | /* free previous update file */ | ||
304 | vfree(uci->mc); | ||
305 | |||
306 | memcpy(new_mc, mc, total_size); | ||
307 | uci->mc = new_mc; | ||
308 | return 1; | ||
309 | } | ||
310 | |||
311 | static void apply_microcode(int cpu) | ||
312 | { | ||
313 | unsigned long flags; | ||
314 | unsigned int val[2]; | ||
315 | int cpu_num = raw_smp_processor_id(); | ||
316 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | ||
317 | |||
318 | /* We should bind the task to the CPU */ | ||
319 | BUG_ON(cpu_num != cpu); | ||
320 | |||
321 | if (uci->mc == NULL) | ||
322 | return; | ||
323 | |||
324 | /* serialize access to the physical write to MSR 0x79 */ | ||
325 | spin_lock_irqsave(µcode_update_lock, flags); | ||
326 | |||
327 | /* write microcode via MSR 0x79 */ | ||
328 | wrmsr(MSR_IA32_UCODE_WRITE, | ||
329 | (unsigned long) uci->mc->bits, | ||
330 | (unsigned long) uci->mc->bits >> 16 >> 16); | ||
331 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
332 | |||
333 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
334 | sync_core(); | ||
335 | |||
336 | /* get the current revision from MSR 0x8B */ | ||
337 | rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
338 | |||
339 | spin_unlock_irqrestore(µcode_update_lock, flags); | ||
340 | if (val[1] != uci->mc->hdr.rev) { | ||
341 | printk(KERN_ERR "microcode: CPU%d updated from revision " | ||
342 | "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); | ||
343 | return; | ||
344 | } | ||
345 | pr_debug("microcode: CPU%d updated from revision " | ||
346 | "0x%x to 0x%x, date = %08x \n", | ||
347 | cpu_num, uci->rev, val[1], uci->mc->hdr.date); | ||
348 | uci->rev = val[1]; | ||
349 | } | ||
350 | |||
351 | #ifdef CONFIG_MICROCODE_OLD_INTERFACE | ||
352 | static void __user *user_buffer; /* user area microcode data buffer */ | ||
353 | static unsigned int user_buffer_size; /* it's size */ | ||
354 | |||
355 | static long get_next_ucode(void **mc, long offset) | ||
356 | { | ||
357 | microcode_header_t mc_header; | ||
358 | unsigned long total_size; | ||
359 | |||
360 | /* No more data */ | ||
361 | if (offset >= user_buffer_size) | ||
362 | return 0; | ||
363 | if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { | ||
364 | printk(KERN_ERR "microcode: error! Can not read user data\n"); | ||
365 | return -EFAULT; | ||
366 | } | ||
367 | total_size = get_totalsize(&mc_header); | ||
368 | if (offset + total_size > user_buffer_size) { | ||
369 | printk(KERN_ERR "microcode: error! Bad total size in microcode " | ||
370 | "data file\n"); | ||
371 | return -EINVAL; | ||
372 | } | ||
373 | *mc = vmalloc(total_size); | ||
374 | if (!*mc) | ||
375 | return -ENOMEM; | ||
376 | if (copy_from_user(*mc, user_buffer + offset, total_size)) { | ||
377 | printk(KERN_ERR "microcode: error! Can not read user data\n"); | ||
378 | vfree(*mc); | ||
379 | return -EFAULT; | ||
380 | } | ||
381 | return offset + total_size; | ||
382 | } | ||
383 | |||
384 | static int do_microcode_update (void) | ||
385 | { | ||
386 | long cursor = 0; | ||
387 | int error = 0; | ||
388 | void *new_mc = NULL; | ||
389 | int cpu; | ||
390 | cpumask_t old; | ||
391 | |||
392 | old = current->cpus_allowed; | ||
393 | |||
394 | while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { | ||
395 | error = microcode_sanity_check(new_mc); | ||
396 | if (error) | ||
397 | goto out; | ||
398 | /* | ||
399 | * It's possible the data file has multiple matching ucode, | ||
400 | * lets keep searching till the latest version | ||
401 | */ | ||
402 | for_each_online_cpu(cpu) { | ||
403 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
404 | |||
405 | if (!uci->valid) | ||
406 | continue; | ||
407 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | ||
408 | error = get_maching_microcode(new_mc, cpu); | ||
409 | if (error < 0) | ||
410 | goto out; | ||
411 | if (error == 1) | ||
412 | apply_microcode(cpu); | ||
413 | } | ||
414 | vfree(new_mc); | ||
415 | } | ||
416 | out: | ||
417 | if (cursor > 0) | ||
418 | vfree(new_mc); | ||
419 | if (cursor < 0) | ||
420 | error = cursor; | ||
421 | set_cpus_allowed(current, old); | ||
422 | return error; | ||
423 | } | ||
424 | |||
425 | static int microcode_open (struct inode *unused1, struct file *unused2) | ||
426 | { | ||
427 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; | ||
428 | } | ||
429 | |||
430 | static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) | ||
431 | { | ||
432 | ssize_t ret; | ||
433 | |||
434 | if ((len >> PAGE_SHIFT) > num_physpages) { | ||
435 | printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); | ||
436 | return -EINVAL; | ||
437 | } | ||
438 | |||
439 | lock_cpu_hotplug(); | ||
440 | mutex_lock(µcode_mutex); | ||
441 | |||
442 | user_buffer = (void __user *) buf; | ||
443 | user_buffer_size = (int) len; | ||
444 | |||
445 | ret = do_microcode_update(); | ||
446 | if (!ret) | ||
447 | ret = (ssize_t)len; | ||
448 | |||
449 | mutex_unlock(µcode_mutex); | ||
450 | unlock_cpu_hotplug(); | ||
451 | |||
452 | return ret; | ||
453 | } | ||
454 | |||
455 | static const struct file_operations microcode_fops = { | ||
456 | .owner = THIS_MODULE, | ||
457 | .write = microcode_write, | ||
458 | .open = microcode_open, | ||
459 | }; | ||
460 | |||
461 | static struct miscdevice microcode_dev = { | ||
462 | .minor = MICROCODE_MINOR, | ||
463 | .name = "microcode", | ||
464 | .fops = µcode_fops, | ||
465 | }; | ||
466 | |||
467 | static int __init microcode_dev_init (void) | ||
468 | { | ||
469 | int error; | ||
470 | |||
471 | error = misc_register(µcode_dev); | ||
472 | if (error) { | ||
473 | printk(KERN_ERR | ||
474 | "microcode: can't misc_register on minor=%d\n", | ||
475 | MICROCODE_MINOR); | ||
476 | return error; | ||
477 | } | ||
478 | |||
479 | return 0; | ||
480 | } | ||
481 | |||
482 | static void microcode_dev_exit (void) | ||
483 | { | ||
484 | misc_deregister(µcode_dev); | ||
485 | } | ||
486 | |||
487 | MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); | ||
488 | #else | ||
489 | #define microcode_dev_init() 0 | ||
490 | #define microcode_dev_exit() do { } while(0) | ||
491 | #endif | ||
492 | |||
493 | static long get_next_ucode_from_buffer(void **mc, void *buf, | ||
494 | unsigned long size, long offset) | ||
495 | { | ||
496 | microcode_header_t *mc_header; | ||
497 | unsigned long total_size; | ||
498 | |||
499 | /* No more data */ | ||
500 | if (offset >= size) | ||
501 | return 0; | ||
502 | mc_header = (microcode_header_t *)(buf + offset); | ||
503 | total_size = get_totalsize(mc_header); | ||
504 | |||
505 | if (offset + total_size > size) { | ||
506 | printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); | ||
507 | return -EINVAL; | ||
508 | } | ||
509 | |||
510 | *mc = vmalloc(total_size); | ||
511 | if (!*mc) { | ||
512 | printk(KERN_ERR "microcode: error! Can not allocate memory\n"); | ||
513 | return -ENOMEM; | ||
514 | } | ||
515 | memcpy(*mc, buf + offset, total_size); | ||
516 | return offset + total_size; | ||
517 | } | ||
518 | |||
519 | /* fake device for request_firmware */ | ||
520 | static struct platform_device *microcode_pdev; | ||
521 | |||
522 | static int cpu_request_microcode(int cpu) | ||
523 | { | ||
524 | char name[30]; | ||
525 | struct cpuinfo_x86 *c = cpu_data + cpu; | ||
526 | const struct firmware *firmware; | ||
527 | void *buf; | ||
528 | unsigned long size; | ||
529 | long offset = 0; | ||
530 | int error; | ||
531 | void *mc; | ||
532 | |||
533 | /* We should bind the task to the CPU */ | ||
534 | BUG_ON(cpu != raw_smp_processor_id()); | ||
535 | sprintf(name,"intel-ucode/%02x-%02x-%02x", | ||
536 | c->x86, c->x86_model, c->x86_mask); | ||
537 | error = request_firmware(&firmware, name, µcode_pdev->dev); | ||
538 | if (error) { | ||
539 | pr_debug("ucode data file %s load failed\n", name); | ||
540 | return error; | ||
541 | } | ||
542 | buf = (void *)firmware->data; | ||
543 | size = firmware->size; | ||
544 | while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) | ||
545 | > 0) { | ||
546 | error = microcode_sanity_check(mc); | ||
547 | if (error) | ||
548 | break; | ||
549 | error = get_maching_microcode(mc, cpu); | ||
550 | if (error < 0) | ||
551 | break; | ||
552 | /* | ||
553 | * It's possible the data file has multiple matching ucode, | ||
554 | * lets keep searching till the latest version | ||
555 | */ | ||
556 | if (error == 1) { | ||
557 | apply_microcode(cpu); | ||
558 | error = 0; | ||
559 | } | ||
560 | vfree(mc); | ||
561 | } | ||
562 | if (offset > 0) | ||
563 | vfree(mc); | ||
564 | if (offset < 0) | ||
565 | error = offset; | ||
566 | release_firmware(firmware); | ||
567 | |||
568 | return error; | ||
569 | } | ||
570 | |||
571 | static int apply_microcode_check_cpu(int cpu) | ||
572 | { | ||
573 | struct cpuinfo_x86 *c = cpu_data + cpu; | ||
574 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
575 | cpumask_t old; | ||
576 | unsigned int val[2]; | ||
577 | int err = 0; | ||
578 | |||
579 | /* Check if the microcode is available */ | ||
580 | if (!uci->mc) | ||
581 | return 0; | ||
582 | |||
583 | old = current->cpus_allowed; | ||
584 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | ||
585 | |||
586 | /* Check if the microcode we have in memory matches the CPU */ | ||
587 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | ||
588 | cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) | ||
589 | err = -EINVAL; | ||
590 | |||
591 | if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { | ||
592 | /* get processor flags from MSR 0x17 */ | ||
593 | rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); | ||
594 | if (uci->pf != (1 << ((val[1] >> 18) & 7))) | ||
595 | err = -EINVAL; | ||
596 | } | ||
597 | |||
598 | if (!err) { | ||
599 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
600 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
601 | sync_core(); | ||
602 | /* get the current revision from MSR 0x8B */ | ||
603 | rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
604 | if (uci->rev != val[1]) | ||
605 | err = -EINVAL; | ||
606 | } | ||
607 | |||
608 | if (!err) | ||
609 | apply_microcode(cpu); | ||
610 | else | ||
611 | printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" | ||
612 | " sig=0x%x, pf=0x%x, rev=0x%x\n", | ||
613 | cpu, uci->sig, uci->pf, uci->rev); | ||
614 | |||
615 | set_cpus_allowed(current, old); | ||
616 | return err; | ||
617 | } | ||
618 | |||
619 | static void microcode_init_cpu(int cpu, int resume) | ||
620 | { | ||
621 | cpumask_t old; | ||
622 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
623 | |||
624 | old = current->cpus_allowed; | ||
625 | |||
626 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | ||
627 | mutex_lock(µcode_mutex); | ||
628 | collect_cpu_info(cpu); | ||
629 | if (uci->valid && system_state == SYSTEM_RUNNING && !resume) | ||
630 | cpu_request_microcode(cpu); | ||
631 | mutex_unlock(µcode_mutex); | ||
632 | set_cpus_allowed(current, old); | ||
633 | } | ||
634 | |||
635 | static void microcode_fini_cpu(int cpu) | ||
636 | { | ||
637 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
638 | |||
639 | mutex_lock(µcode_mutex); | ||
640 | uci->valid = 0; | ||
641 | vfree(uci->mc); | ||
642 | uci->mc = NULL; | ||
643 | mutex_unlock(µcode_mutex); | ||
644 | } | ||
645 | |||
646 | static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) | ||
647 | { | ||
648 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
649 | char *end; | ||
650 | unsigned long val = simple_strtoul(buf, &end, 0); | ||
651 | int err = 0; | ||
652 | int cpu = dev->id; | ||
653 | |||
654 | if (end == buf) | ||
655 | return -EINVAL; | ||
656 | if (val == 1) { | ||
657 | cpumask_t old; | ||
658 | |||
659 | old = current->cpus_allowed; | ||
660 | |||
661 | lock_cpu_hotplug(); | ||
662 | set_cpus_allowed(current, cpumask_of_cpu(cpu)); | ||
663 | |||
664 | mutex_lock(µcode_mutex); | ||
665 | if (uci->valid) | ||
666 | err = cpu_request_microcode(cpu); | ||
667 | mutex_unlock(µcode_mutex); | ||
668 | unlock_cpu_hotplug(); | ||
669 | set_cpus_allowed(current, old); | ||
670 | } | ||
671 | if (err) | ||
672 | return err; | ||
673 | return sz; | ||
674 | } | ||
675 | |||
676 | static ssize_t version_show(struct sys_device *dev, char *buf) | ||
677 | { | ||
678 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
679 | |||
680 | return sprintf(buf, "0x%x\n", uci->rev); | ||
681 | } | ||
682 | |||
683 | static ssize_t pf_show(struct sys_device *dev, char *buf) | ||
684 | { | ||
685 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
686 | |||
687 | return sprintf(buf, "0x%x\n", uci->pf); | ||
688 | } | ||
689 | |||
690 | static SYSDEV_ATTR(reload, 0200, NULL, reload_store); | ||
691 | static SYSDEV_ATTR(version, 0400, version_show, NULL); | ||
692 | static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); | ||
693 | |||
694 | static struct attribute *mc_default_attrs[] = { | ||
695 | &attr_reload.attr, | ||
696 | &attr_version.attr, | ||
697 | &attr_processor_flags.attr, | ||
698 | NULL | ||
699 | }; | ||
700 | |||
701 | static struct attribute_group mc_attr_group = { | ||
702 | .attrs = mc_default_attrs, | ||
703 | .name = "microcode", | ||
704 | }; | ||
705 | |||
706 | static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) | ||
707 | { | ||
708 | int err, cpu = sys_dev->id; | ||
709 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
710 | |||
711 | if (!cpu_online(cpu)) | ||
712 | return 0; | ||
713 | |||
714 | pr_debug("Microcode:CPU %d added\n", cpu); | ||
715 | memset(uci, 0, sizeof(*uci)); | ||
716 | |||
717 | err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); | ||
718 | if (err) | ||
719 | return err; | ||
720 | |||
721 | microcode_init_cpu(cpu, resume); | ||
722 | |||
723 | return 0; | ||
724 | } | ||
725 | |||
/* sysdev "add" hook: the non-resume flavour of __mc_sysdev_add(). */
static int mc_sysdev_add(struct sys_device *sys_dev)
{
	const int not_resuming = 0;

	return __mc_sysdev_add(sys_dev, not_resuming);
}
730 | |||
731 | static int mc_sysdev_remove(struct sys_device *sys_dev) | ||
732 | { | ||
733 | int cpu = sys_dev->id; | ||
734 | |||
735 | if (!cpu_online(cpu)) | ||
736 | return 0; | ||
737 | |||
738 | pr_debug("Microcode:CPU %d removed\n", cpu); | ||
739 | microcode_fini_cpu(cpu); | ||
740 | sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); | ||
741 | return 0; | ||
742 | } | ||
743 | |||
744 | static int mc_sysdev_resume(struct sys_device *dev) | ||
745 | { | ||
746 | int cpu = dev->id; | ||
747 | |||
748 | if (!cpu_online(cpu)) | ||
749 | return 0; | ||
750 | pr_debug("Microcode:CPU %d resumed\n", cpu); | ||
751 | /* only CPU 0 will apply ucode here */ | ||
752 | apply_microcode(0); | ||
753 | return 0; | ||
754 | } | ||
755 | |||
756 | static struct sysdev_driver mc_sysdev_driver = { | ||
757 | .add = mc_sysdev_add, | ||
758 | .remove = mc_sysdev_remove, | ||
759 | .resume = mc_sysdev_resume, | ||
760 | }; | ||
761 | |||
762 | static __cpuinit int | ||
763 | mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) | ||
764 | { | ||
765 | unsigned int cpu = (unsigned long)hcpu; | ||
766 | struct sys_device *sys_dev; | ||
767 | |||
768 | sys_dev = get_cpu_sysdev(cpu); | ||
769 | switch (action) { | ||
770 | case CPU_UP_CANCELED_FROZEN: | ||
771 | /* The CPU refused to come up during a system resume */ | ||
772 | microcode_fini_cpu(cpu); | ||
773 | break; | ||
774 | case CPU_ONLINE: | ||
775 | case CPU_DOWN_FAILED: | ||
776 | mc_sysdev_add(sys_dev); | ||
777 | break; | ||
778 | case CPU_ONLINE_FROZEN: | ||
779 | /* System-wide resume is in progress, try to apply microcode */ | ||
780 | if (apply_microcode_check_cpu(cpu)) { | ||
781 | /* The application of microcode failed */ | ||
782 | microcode_fini_cpu(cpu); | ||
783 | __mc_sysdev_add(sys_dev, 1); | ||
784 | break; | ||
785 | } | ||
786 | case CPU_DOWN_FAILED_FROZEN: | ||
787 | if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) | ||
788 | printk(KERN_ERR "Microcode: Failed to create the sysfs " | ||
789 | "group for CPU%d\n", cpu); | ||
790 | break; | ||
791 | case CPU_DOWN_PREPARE: | ||
792 | mc_sysdev_remove(sys_dev); | ||
793 | break; | ||
794 | case CPU_DOWN_PREPARE_FROZEN: | ||
795 | /* Suspend is in progress, only remove the interface */ | ||
796 | sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); | ||
797 | break; | ||
798 | } | ||
799 | return NOTIFY_OK; | ||
800 | } | ||
801 | |||
802 | static struct notifier_block __cpuinitdata mc_cpu_notifier = { | ||
803 | .notifier_call = mc_cpu_callback, | ||
804 | }; | ||
805 | |||
806 | static int __init microcode_init (void) | ||
807 | { | ||
808 | int error; | ||
809 | |||
810 | error = microcode_dev_init(); | ||
811 | if (error) | ||
812 | return error; | ||
813 | microcode_pdev = platform_device_register_simple("microcode", -1, | ||
814 | NULL, 0); | ||
815 | if (IS_ERR(microcode_pdev)) { | ||
816 | microcode_dev_exit(); | ||
817 | return PTR_ERR(microcode_pdev); | ||
818 | } | ||
819 | |||
820 | lock_cpu_hotplug(); | ||
821 | error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); | ||
822 | unlock_cpu_hotplug(); | ||
823 | if (error) { | ||
824 | microcode_dev_exit(); | ||
825 | platform_device_unregister(microcode_pdev); | ||
826 | return error; | ||
827 | } | ||
828 | |||
829 | register_hotcpu_notifier(&mc_cpu_notifier); | ||
830 | |||
831 | printk(KERN_INFO | ||
832 | "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); | ||
833 | return 0; | ||
834 | } | ||
835 | |||
836 | static void __exit microcode_exit (void) | ||
837 | { | ||
838 | microcode_dev_exit(); | ||
839 | |||
840 | unregister_hotcpu_notifier(&mc_cpu_notifier); | ||
841 | |||
842 | lock_cpu_hotplug(); | ||
843 | sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); | ||
844 | unlock_cpu_hotplug(); | ||
845 | |||
846 | platform_device_unregister(microcode_pdev); | ||
847 | } | ||
848 | |||
849 | module_init(microcode_init) | ||
850 | module_exit(microcode_exit) | ||
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c new file mode 100644 index 000000000000..3db0a5442eb1 --- /dev/null +++ b/arch/x86/kernel/module_32.c | |||
@@ -0,0 +1,152 @@ | |||
1 | /* Kernel module help for i386. | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #include <linux/moduleloader.h> | ||
19 | #include <linux/elf.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/bug.h> | ||
25 | |||
26 | #if 0 | ||
27 | #define DEBUGP printk | ||
28 | #else | ||
29 | #define DEBUGP(fmt...) | ||
30 | #endif | ||
31 | |||
32 | void *module_alloc(unsigned long size) | ||
33 | { | ||
34 | if (size == 0) | ||
35 | return NULL; | ||
36 | return vmalloc_exec(size); | ||
37 | } | ||
38 | |||
39 | |||
/*
 * Release a region obtained from module_alloc().
 * @mod is currently unused.
 */
void module_free(struct module *mod, void *module_region)
{
	/* FIXME: If module_region == mod->init_region, trim exception
	   table entries. */
	vfree(module_region);
}
47 | |||
48 | /* We don't need anything special. */ | ||
49 | int module_frob_arch_sections(Elf_Ehdr *hdr, | ||
50 | Elf_Shdr *sechdrs, | ||
51 | char *secstrings, | ||
52 | struct module *mod) | ||
53 | { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | int apply_relocate(Elf32_Shdr *sechdrs, | ||
58 | const char *strtab, | ||
59 | unsigned int symindex, | ||
60 | unsigned int relsec, | ||
61 | struct module *me) | ||
62 | { | ||
63 | unsigned int i; | ||
64 | Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; | ||
65 | Elf32_Sym *sym; | ||
66 | uint32_t *location; | ||
67 | |||
68 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
69 | sechdrs[relsec].sh_info); | ||
70 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
71 | /* This is where to make the change */ | ||
72 | location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
73 | + rel[i].r_offset; | ||
74 | /* This is the symbol it is referring to. Note that all | ||
75 | undefined symbols have been resolved. */ | ||
76 | sym = (Elf32_Sym *)sechdrs[symindex].sh_addr | ||
77 | + ELF32_R_SYM(rel[i].r_info); | ||
78 | |||
79 | switch (ELF32_R_TYPE(rel[i].r_info)) { | ||
80 | case R_386_32: | ||
81 | /* We add the value into the location given */ | ||
82 | *location += sym->st_value; | ||
83 | break; | ||
84 | case R_386_PC32: | ||
85 | /* Add the value, subtract its postition */ | ||
86 | *location += sym->st_value - (uint32_t)location; | ||
87 | break; | ||
88 | default: | ||
89 | printk(KERN_ERR "module %s: Unknown relocation: %u\n", | ||
90 | me->name, ELF32_R_TYPE(rel[i].r_info)); | ||
91 | return -ENOEXEC; | ||
92 | } | ||
93 | } | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | int apply_relocate_add(Elf32_Shdr *sechdrs, | ||
98 | const char *strtab, | ||
99 | unsigned int symindex, | ||
100 | unsigned int relsec, | ||
101 | struct module *me) | ||
102 | { | ||
103 | printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", | ||
104 | me->name); | ||
105 | return -ENOEXEC; | ||
106 | } | ||
107 | |||
108 | int module_finalize(const Elf_Ehdr *hdr, | ||
109 | const Elf_Shdr *sechdrs, | ||
110 | struct module *me) | ||
111 | { | ||
112 | const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, | ||
113 | *para = NULL; | ||
114 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
115 | |||
116 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | ||
117 | if (!strcmp(".text", secstrings + s->sh_name)) | ||
118 | text = s; | ||
119 | if (!strcmp(".altinstructions", secstrings + s->sh_name)) | ||
120 | alt = s; | ||
121 | if (!strcmp(".smp_locks", secstrings + s->sh_name)) | ||
122 | locks= s; | ||
123 | if (!strcmp(".parainstructions", secstrings + s->sh_name)) | ||
124 | para = s; | ||
125 | } | ||
126 | |||
127 | if (alt) { | ||
128 | /* patch .altinstructions */ | ||
129 | void *aseg = (void *)alt->sh_addr; | ||
130 | apply_alternatives(aseg, aseg + alt->sh_size); | ||
131 | } | ||
132 | if (locks && text) { | ||
133 | void *lseg = (void *)locks->sh_addr; | ||
134 | void *tseg = (void *)text->sh_addr; | ||
135 | alternatives_smp_module_add(me, me->name, | ||
136 | lseg, lseg + locks->sh_size, | ||
137 | tseg, tseg + text->sh_size); | ||
138 | } | ||
139 | |||
140 | if (para) { | ||
141 | void *pseg = (void *)para->sh_addr; | ||
142 | apply_paravirt(pseg, pseg + para->sh_size); | ||
143 | } | ||
144 | |||
145 | return module_bug_finalize(hdr, sechdrs, me); | ||
146 | } | ||
147 | |||
/*
 * Arch-specific teardown at module unload: drop the module's SMP
 * alternatives registration, then its bug-table state.
 */
void module_arch_cleanup(struct module *mod)
{
	alternatives_smp_module_del(mod);
	module_bug_cleanup(mod);
}
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c new file mode 100644 index 000000000000..13abb4ebfb79 --- /dev/null +++ b/arch/x86/kernel/mpparse_32.c | |||
@@ -0,0 +1,1132 @@ | |||
1 | /* | ||
2 | * Intel Multiprocessor Specification 1.1 and 1.4 | ||
3 | * compliant MP-table parsing routines. | ||
4 | * | ||
5 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
6 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes | ||
9 | * Erich Boleyn : MP v1.4 and additional changes. | ||
10 | * Alan Cox : Added EBDA scanning | ||
11 | * Ingo Molnar : various cleanups and rewrites | ||
12 | * Maciej W. Rozycki: Bits for default MP configurations | ||
13 | * Paul Diefenbaugh: Added full ACPI support | ||
14 | */ | ||
15 | |||
16 | #include <linux/mm.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/acpi.h> | ||
19 | #include <linux/delay.h> | ||
20 | #include <linux/bootmem.h> | ||
21 | #include <linux/kernel_stat.h> | ||
22 | #include <linux/mc146818rtc.h> | ||
23 | #include <linux/bitops.h> | ||
24 | |||
25 | #include <asm/smp.h> | ||
26 | #include <asm/acpi.h> | ||
27 | #include <asm/mtrr.h> | ||
28 | #include <asm/mpspec.h> | ||
29 | #include <asm/io_apic.h> | ||
30 | |||
31 | #include <mach_apic.h> | ||
32 | #include <mach_apicdef.h> | ||
33 | #include <mach_mpparse.h> | ||
34 | #include <bios_ebda.h> | ||
35 | |||
36 | /* Have we found an MP table */ | ||
37 | int smp_found_config; | ||
38 | unsigned int __cpuinitdata maxcpus = NR_CPUS; | ||
39 | |||
40 | /* | ||
41 | * Various Linux-internal data structures created from the | ||
42 | * MP-table. | ||
43 | */ | ||
44 | int apic_version [MAX_APICS]; | ||
45 | int mp_bus_id_to_type [MAX_MP_BUSSES]; | ||
46 | int mp_bus_id_to_node [MAX_MP_BUSSES]; | ||
47 | int mp_bus_id_to_local [MAX_MP_BUSSES]; | ||
48 | int quad_local_to_mp_bus_id [NR_CPUS/4][4]; | ||
49 | int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | ||
50 | static int mp_current_pci_id; | ||
51 | |||
52 | /* I/O APIC entries */ | ||
53 | struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | ||
54 | |||
55 | /* MP IRQ source entries */ | ||
56 | struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | ||
57 | |||
58 | /* # of MP IRQ source entries */ | ||
59 | int mp_irq_entries; | ||
60 | |||
61 | int nr_ioapics; | ||
62 | |||
63 | int pic_mode; | ||
64 | unsigned long mp_lapic_addr; | ||
65 | |||
66 | unsigned int def_to_bigsmp = 0; | ||
67 | |||
68 | /* Processor that is doing the boot up */ | ||
69 | unsigned int boot_cpu_physical_apicid = -1U; | ||
70 | /* Internal processor count */ | ||
71 | unsigned int __cpuinitdata num_processors; | ||
72 | |||
73 | /* Bitmask of physically existing CPUs */ | ||
74 | physid_mask_t phys_cpu_present_map; | ||
75 | |||
76 | u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
77 | |||
78 | /* | ||
79 | * Intel MP BIOS table parsing routines: | ||
80 | */ | ||
81 | |||
82 | |||
83 | /* | ||
84 | * Checksum an MP configuration block. | ||
85 | */ | ||
86 | |||
/*
 * Sum @len bytes starting at @mp and return the low 8 bits of the sum.
 */
static int __init mpf_checksum(unsigned char *mp, int len)
{
	int total = 0;

	for (; len != 0; len--)
		total += *mp++;

	return total & 0xFF;
}
96 | |||
97 | /* | ||
98 | * Have to match translation table entries to main table entries by counter | ||
99 | * hence the mpc_record variable .... can't see a less disgusting way of | ||
100 | * doing this .... | ||
101 | */ | ||
102 | |||
103 | static int mpc_record; | ||
104 | static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata; | ||
105 | |||
106 | static void __cpuinit MP_processor_info (struct mpc_config_processor *m) | ||
107 | { | ||
108 | int ver, apicid; | ||
109 | physid_mask_t phys_cpu; | ||
110 | |||
111 | if (!(m->mpc_cpuflag & CPU_ENABLED)) | ||
112 | return; | ||
113 | |||
114 | apicid = mpc_apic_id(m, translation_table[mpc_record]); | ||
115 | |||
116 | if (m->mpc_featureflag&(1<<0)) | ||
117 | Dprintk(" Floating point unit present.\n"); | ||
118 | if (m->mpc_featureflag&(1<<7)) | ||
119 | Dprintk(" Machine Exception supported.\n"); | ||
120 | if (m->mpc_featureflag&(1<<8)) | ||
121 | Dprintk(" 64 bit compare & exchange supported.\n"); | ||
122 | if (m->mpc_featureflag&(1<<9)) | ||
123 | Dprintk(" Internal APIC present.\n"); | ||
124 | if (m->mpc_featureflag&(1<<11)) | ||
125 | Dprintk(" SEP present.\n"); | ||
126 | if (m->mpc_featureflag&(1<<12)) | ||
127 | Dprintk(" MTRR present.\n"); | ||
128 | if (m->mpc_featureflag&(1<<13)) | ||
129 | Dprintk(" PGE present.\n"); | ||
130 | if (m->mpc_featureflag&(1<<14)) | ||
131 | Dprintk(" MCA present.\n"); | ||
132 | if (m->mpc_featureflag&(1<<15)) | ||
133 | Dprintk(" CMOV present.\n"); | ||
134 | if (m->mpc_featureflag&(1<<16)) | ||
135 | Dprintk(" PAT present.\n"); | ||
136 | if (m->mpc_featureflag&(1<<17)) | ||
137 | Dprintk(" PSE present.\n"); | ||
138 | if (m->mpc_featureflag&(1<<18)) | ||
139 | Dprintk(" PSN present.\n"); | ||
140 | if (m->mpc_featureflag&(1<<19)) | ||
141 | Dprintk(" Cache Line Flush Instruction present.\n"); | ||
142 | /* 20 Reserved */ | ||
143 | if (m->mpc_featureflag&(1<<21)) | ||
144 | Dprintk(" Debug Trace and EMON Store present.\n"); | ||
145 | if (m->mpc_featureflag&(1<<22)) | ||
146 | Dprintk(" ACPI Thermal Throttle Registers present.\n"); | ||
147 | if (m->mpc_featureflag&(1<<23)) | ||
148 | Dprintk(" MMX present.\n"); | ||
149 | if (m->mpc_featureflag&(1<<24)) | ||
150 | Dprintk(" FXSR present.\n"); | ||
151 | if (m->mpc_featureflag&(1<<25)) | ||
152 | Dprintk(" XMM present.\n"); | ||
153 | if (m->mpc_featureflag&(1<<26)) | ||
154 | Dprintk(" Willamette New Instructions present.\n"); | ||
155 | if (m->mpc_featureflag&(1<<27)) | ||
156 | Dprintk(" Self Snoop present.\n"); | ||
157 | if (m->mpc_featureflag&(1<<28)) | ||
158 | Dprintk(" HT present.\n"); | ||
159 | if (m->mpc_featureflag&(1<<29)) | ||
160 | Dprintk(" Thermal Monitor present.\n"); | ||
161 | /* 30, 31 Reserved */ | ||
162 | |||
163 | |||
164 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | ||
165 | Dprintk(" Bootup CPU\n"); | ||
166 | boot_cpu_physical_apicid = m->mpc_apicid; | ||
167 | } | ||
168 | |||
169 | ver = m->mpc_apicver; | ||
170 | |||
171 | /* | ||
172 | * Validate version | ||
173 | */ | ||
174 | if (ver == 0x0) { | ||
175 | printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " | ||
176 | "fixing up to 0x10. (tell your hw vendor)\n", | ||
177 | m->mpc_apicid); | ||
178 | ver = 0x10; | ||
179 | } | ||
180 | apic_version[m->mpc_apicid] = ver; | ||
181 | |||
182 | phys_cpu = apicid_to_cpu_present(apicid); | ||
183 | physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); | ||
184 | |||
185 | if (num_processors >= NR_CPUS) { | ||
186 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | ||
187 | " Processor ignored.\n", NR_CPUS); | ||
188 | return; | ||
189 | } | ||
190 | |||
191 | if (num_processors >= maxcpus) { | ||
192 | printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." | ||
193 | " Processor ignored.\n", maxcpus); | ||
194 | return; | ||
195 | } | ||
196 | |||
197 | cpu_set(num_processors, cpu_possible_map); | ||
198 | num_processors++; | ||
199 | |||
200 | /* | ||
201 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y | ||
202 | * but we need to work other dependencies like SMP_SUSPEND etc | ||
203 | * before this can be done without some confusion. | ||
204 | * if (CPU_HOTPLUG_ENABLED || num_processors > 8) | ||
205 | * - Ashok Raj <ashok.raj@intel.com> | ||
206 | */ | ||
207 | if (num_processors > 8) { | ||
208 | switch (boot_cpu_data.x86_vendor) { | ||
209 | case X86_VENDOR_INTEL: | ||
210 | if (!APIC_XAPIC(ver)) { | ||
211 | def_to_bigsmp = 0; | ||
212 | break; | ||
213 | } | ||
214 | /* If P4 and above fall through */ | ||
215 | case X86_VENDOR_AMD: | ||
216 | def_to_bigsmp = 1; | ||
217 | } | ||
218 | } | ||
219 | bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; | ||
220 | } | ||
221 | |||
222 | static void __init MP_bus_info (struct mpc_config_bus *m) | ||
223 | { | ||
224 | char str[7]; | ||
225 | |||
226 | memcpy(str, m->mpc_bustype, 6); | ||
227 | str[6] = 0; | ||
228 | |||
229 | mpc_oem_bus_info(m, str, translation_table[mpc_record]); | ||
230 | |||
231 | #if MAX_MP_BUSSES < 256 | ||
232 | if (m->mpc_busid >= MAX_MP_BUSSES) { | ||
233 | printk(KERN_WARNING "MP table busid value (%d) for bustype %s " | ||
234 | " is too large, max. supported is %d\n", | ||
235 | m->mpc_busid, str, MAX_MP_BUSSES - 1); | ||
236 | return; | ||
237 | } | ||
238 | #endif | ||
239 | |||
240 | if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { | ||
241 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; | ||
242 | } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { | ||
243 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; | ||
244 | } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { | ||
245 | mpc_oem_pci_bus(m, translation_table[mpc_record]); | ||
246 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; | ||
247 | mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | ||
248 | mp_current_pci_id++; | ||
249 | } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { | ||
250 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; | ||
251 | } else { | ||
252 | printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); | ||
253 | } | ||
254 | } | ||
255 | |||
256 | static void __init MP_ioapic_info (struct mpc_config_ioapic *m) | ||
257 | { | ||
258 | if (!(m->mpc_flags & MPC_APIC_USABLE)) | ||
259 | return; | ||
260 | |||
261 | printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", | ||
262 | m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); | ||
263 | if (nr_ioapics >= MAX_IO_APICS) { | ||
264 | printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", | ||
265 | MAX_IO_APICS, nr_ioapics); | ||
266 | panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); | ||
267 | } | ||
268 | if (!m->mpc_apicaddr) { | ||
269 | printk(KERN_ERR "WARNING: bogus zero I/O APIC address" | ||
270 | " found in MP table, skipping!\n"); | ||
271 | return; | ||
272 | } | ||
273 | mp_ioapics[nr_ioapics] = *m; | ||
274 | nr_ioapics++; | ||
275 | } | ||
276 | |||
277 | static void __init MP_intsrc_info (struct mpc_config_intsrc *m) | ||
278 | { | ||
279 | mp_irqs [mp_irq_entries] = *m; | ||
280 | Dprintk("Int: type %d, pol %d, trig %d, bus %d," | ||
281 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | ||
282 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
283 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | ||
284 | m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | ||
285 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
286 | panic("Max # of irq sources exceeded!!\n"); | ||
287 | } | ||
288 | |||
289 | static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) | ||
290 | { | ||
291 | Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | ||
292 | " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | ||
293 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
294 | (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, | ||
295 | m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | ||
296 | } | ||
297 | |||
298 | #ifdef CONFIG_X86_NUMAQ | ||
299 | static void __init MP_translation_info (struct mpc_config_translation *m) | ||
300 | { | ||
301 | printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); | ||
302 | |||
303 | if (mpc_record >= MAX_MPC_ENTRY) | ||
304 | printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); | ||
305 | else | ||
306 | translation_table[mpc_record] = m; /* stash this for later */ | ||
307 | if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) | ||
308 | node_set_online(m->trans_quad); | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * Read/parse the MPC oem tables | ||
313 | */ | ||
314 | |||
315 | static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ | ||
316 | unsigned short oemsize) | ||
317 | { | ||
318 | int count = sizeof (*oemtable); /* the header size */ | ||
319 | unsigned char *oemptr = ((unsigned char *)oemtable)+count; | ||
320 | |||
321 | mpc_record = 0; | ||
322 | printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); | ||
323 | if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) | ||
324 | { | ||
325 | printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", | ||
326 | oemtable->oem_signature[0], | ||
327 | oemtable->oem_signature[1], | ||
328 | oemtable->oem_signature[2], | ||
329 | oemtable->oem_signature[3]); | ||
330 | return; | ||
331 | } | ||
332 | if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) | ||
333 | { | ||
334 | printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); | ||
335 | return; | ||
336 | } | ||
337 | while (count < oemtable->oem_length) { | ||
338 | switch (*oemptr) { | ||
339 | case MP_TRANSLATION: | ||
340 | { | ||
341 | struct mpc_config_translation *m= | ||
342 | (struct mpc_config_translation *)oemptr; | ||
343 | MP_translation_info(m); | ||
344 | oemptr += sizeof(*m); | ||
345 | count += sizeof(*m); | ||
346 | ++mpc_record; | ||
347 | break; | ||
348 | } | ||
349 | default: | ||
350 | { | ||
351 | printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); | ||
352 | return; | ||
353 | } | ||
354 | } | ||
355 | } | ||
356 | } | ||
357 | |||
358 | static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, | ||
359 | char *productid) | ||
360 | { | ||
361 | if (strncmp(oem, "IBM NUMA", 8)) | ||
362 | printk("Warning! May not be a NUMA-Q system!\n"); | ||
363 | if (mpc->mpc_oemptr) | ||
364 | smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, | ||
365 | mpc->mpc_oemsize); | ||
366 | } | ||
367 | #endif /* CONFIG_X86_NUMAQ */ | ||
368 | |||
369 | /* | ||
370 | * Read/parse the MPC | ||
371 | */ | ||
372 | |||
373 | static int __init smp_read_mpc(struct mp_config_table *mpc) | ||
374 | { | ||
375 | char str[16]; | ||
376 | char oem[10]; | ||
377 | int count=sizeof(*mpc); | ||
378 | unsigned char *mpt=((unsigned char *)mpc)+count; | ||
379 | |||
380 | if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { | ||
381 | printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", | ||
382 | *(u32 *)mpc->mpc_signature); | ||
383 | return 0; | ||
384 | } | ||
385 | if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { | ||
386 | printk(KERN_ERR "SMP mptable: checksum error!\n"); | ||
387 | return 0; | ||
388 | } | ||
389 | if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { | ||
390 | printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", | ||
391 | mpc->mpc_spec); | ||
392 | return 0; | ||
393 | } | ||
394 | if (!mpc->mpc_lapic) { | ||
395 | printk(KERN_ERR "SMP mptable: null local APIC address!\n"); | ||
396 | return 0; | ||
397 | } | ||
398 | memcpy(oem,mpc->mpc_oem,8); | ||
399 | oem[8]=0; | ||
400 | printk(KERN_INFO "OEM ID: %s ",oem); | ||
401 | |||
402 | memcpy(str,mpc->mpc_productid,12); | ||
403 | str[12]=0; | ||
404 | printk("Product ID: %s ",str); | ||
405 | |||
406 | mps_oem_check(mpc, oem, str); | ||
407 | |||
408 | printk("APIC at: 0x%lX\n",mpc->mpc_lapic); | ||
409 | |||
410 | /* | ||
411 | * Save the local APIC address (it might be non-default) -- but only | ||
412 | * if we're not using ACPI. | ||
413 | */ | ||
414 | if (!acpi_lapic) | ||
415 | mp_lapic_addr = mpc->mpc_lapic; | ||
416 | |||
417 | /* | ||
418 | * Now process the configuration blocks. | ||
419 | */ | ||
420 | mpc_record = 0; | ||
421 | while (count < mpc->mpc_length) { | ||
422 | switch(*mpt) { | ||
423 | case MP_PROCESSOR: | ||
424 | { | ||
425 | struct mpc_config_processor *m= | ||
426 | (struct mpc_config_processor *)mpt; | ||
427 | /* ACPI may have already provided this data */ | ||
428 | if (!acpi_lapic) | ||
429 | MP_processor_info(m); | ||
430 | mpt += sizeof(*m); | ||
431 | count += sizeof(*m); | ||
432 | break; | ||
433 | } | ||
434 | case MP_BUS: | ||
435 | { | ||
436 | struct mpc_config_bus *m= | ||
437 | (struct mpc_config_bus *)mpt; | ||
438 | MP_bus_info(m); | ||
439 | mpt += sizeof(*m); | ||
440 | count += sizeof(*m); | ||
441 | break; | ||
442 | } | ||
443 | case MP_IOAPIC: | ||
444 | { | ||
445 | struct mpc_config_ioapic *m= | ||
446 | (struct mpc_config_ioapic *)mpt; | ||
447 | MP_ioapic_info(m); | ||
448 | mpt+=sizeof(*m); | ||
449 | count+=sizeof(*m); | ||
450 | break; | ||
451 | } | ||
452 | case MP_INTSRC: | ||
453 | { | ||
454 | struct mpc_config_intsrc *m= | ||
455 | (struct mpc_config_intsrc *)mpt; | ||
456 | |||
457 | MP_intsrc_info(m); | ||
458 | mpt+=sizeof(*m); | ||
459 | count+=sizeof(*m); | ||
460 | break; | ||
461 | } | ||
462 | case MP_LINTSRC: | ||
463 | { | ||
464 | struct mpc_config_lintsrc *m= | ||
465 | (struct mpc_config_lintsrc *)mpt; | ||
466 | MP_lintsrc_info(m); | ||
467 | mpt+=sizeof(*m); | ||
468 | count+=sizeof(*m); | ||
469 | break; | ||
470 | } | ||
471 | default: | ||
472 | { | ||
473 | count = mpc->mpc_length; | ||
474 | break; | ||
475 | } | ||
476 | } | ||
477 | ++mpc_record; | ||
478 | } | ||
479 | setup_apic_routing(); | ||
480 | if (!num_processors) | ||
481 | printk(KERN_ERR "SMP mptable: no processors registered!\n"); | ||
482 | return num_processors; | ||
483 | } | ||
484 | |||
485 | static int __init ELCR_trigger(unsigned int irq) | ||
486 | { | ||
487 | unsigned int port; | ||
488 | |||
489 | port = 0x4d0 + (irq >> 3); | ||
490 | return (inb(port) >> (irq & 7)) & 1; | ||
491 | } | ||
492 | |||
493 | static void __init construct_default_ioirq_mptable(int mpc_default_type) | ||
494 | { | ||
495 | struct mpc_config_intsrc intsrc; | ||
496 | int i; | ||
497 | int ELCR_fallback = 0; | ||
498 | |||
499 | intsrc.mpc_type = MP_INTSRC; | ||
500 | intsrc.mpc_irqflag = 0; /* conforming */ | ||
501 | intsrc.mpc_srcbus = 0; | ||
502 | intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | ||
503 | |||
504 | intsrc.mpc_irqtype = mp_INT; | ||
505 | |||
506 | /* | ||
507 | * If true, we have an ISA/PCI system with no IRQ entries | ||
508 | * in the MP table. To prevent the PCI interrupts from being set up | ||
509 | * incorrectly, we try to use the ELCR. The sanity check to see if | ||
510 | * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can | ||
511 | * never be level sensitive, so we simply see if the ELCR agrees. | ||
512 | * If it does, we assume it's valid. | ||
513 | */ | ||
514 | if (mpc_default_type == 5) { | ||
515 | printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); | ||
516 | |||
517 | if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) | ||
518 | printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); | ||
519 | else { | ||
520 | printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); | ||
521 | ELCR_fallback = 1; | ||
522 | } | ||
523 | } | ||
524 | |||
525 | for (i = 0; i < 16; i++) { | ||
526 | switch (mpc_default_type) { | ||
527 | case 2: | ||
528 | if (i == 0 || i == 13) | ||
529 | continue; /* IRQ0 & IRQ13 not connected */ | ||
530 | /* fall through */ | ||
531 | default: | ||
532 | if (i == 2) | ||
533 | continue; /* IRQ2 is never connected */ | ||
534 | } | ||
535 | |||
536 | if (ELCR_fallback) { | ||
537 | /* | ||
538 | * If the ELCR indicates a level-sensitive interrupt, we | ||
539 | * copy that information over to the MP table in the | ||
540 | * irqflag field (level sensitive, active high polarity). | ||
541 | */ | ||
542 | if (ELCR_trigger(i)) | ||
543 | intsrc.mpc_irqflag = 13; | ||
544 | else | ||
545 | intsrc.mpc_irqflag = 0; | ||
546 | } | ||
547 | |||
548 | intsrc.mpc_srcbusirq = i; | ||
549 | intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | ||
550 | MP_intsrc_info(&intsrc); | ||
551 | } | ||
552 | |||
553 | intsrc.mpc_irqtype = mp_ExtINT; | ||
554 | intsrc.mpc_srcbusirq = 0; | ||
555 | intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ | ||
556 | MP_intsrc_info(&intsrc); | ||
557 | } | ||
558 | |||
559 | static inline void __init construct_default_ISA_mptable(int mpc_default_type) | ||
560 | { | ||
561 | struct mpc_config_processor processor; | ||
562 | struct mpc_config_bus bus; | ||
563 | struct mpc_config_ioapic ioapic; | ||
564 | struct mpc_config_lintsrc lintsrc; | ||
565 | int linttypes[2] = { mp_ExtINT, mp_NMI }; | ||
566 | int i; | ||
567 | |||
568 | /* | ||
569 | * local APIC has default address | ||
570 | */ | ||
571 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
572 | |||
573 | /* | ||
574 | * 2 CPUs, numbered 0 & 1. | ||
575 | */ | ||
576 | processor.mpc_type = MP_PROCESSOR; | ||
577 | /* Either an integrated APIC or a discrete 82489DX. */ | ||
578 | processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
579 | processor.mpc_cpuflag = CPU_ENABLED; | ||
580 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
581 | (boot_cpu_data.x86_model << 4) | | ||
582 | boot_cpu_data.x86_mask; | ||
583 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
584 | processor.mpc_reserved[0] = 0; | ||
585 | processor.mpc_reserved[1] = 0; | ||
586 | for (i = 0; i < 2; i++) { | ||
587 | processor.mpc_apicid = i; | ||
588 | MP_processor_info(&processor); | ||
589 | } | ||
590 | |||
591 | bus.mpc_type = MP_BUS; | ||
592 | bus.mpc_busid = 0; | ||
593 | switch (mpc_default_type) { | ||
594 | default: | ||
595 | printk("???\n"); | ||
596 | printk(KERN_ERR "Unknown standard configuration %d\n", | ||
597 | mpc_default_type); | ||
598 | /* fall through */ | ||
599 | case 1: | ||
600 | case 5: | ||
601 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
602 | break; | ||
603 | case 2: | ||
604 | case 6: | ||
605 | case 3: | ||
606 | memcpy(bus.mpc_bustype, "EISA ", 6); | ||
607 | break; | ||
608 | case 4: | ||
609 | case 7: | ||
610 | memcpy(bus.mpc_bustype, "MCA ", 6); | ||
611 | } | ||
612 | MP_bus_info(&bus); | ||
613 | if (mpc_default_type > 4) { | ||
614 | bus.mpc_busid = 1; | ||
615 | memcpy(bus.mpc_bustype, "PCI ", 6); | ||
616 | MP_bus_info(&bus); | ||
617 | } | ||
618 | |||
619 | ioapic.mpc_type = MP_IOAPIC; | ||
620 | ioapic.mpc_apicid = 2; | ||
621 | ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
622 | ioapic.mpc_flags = MPC_APIC_USABLE; | ||
623 | ioapic.mpc_apicaddr = 0xFEC00000; | ||
624 | MP_ioapic_info(&ioapic); | ||
625 | |||
626 | /* | ||
627 | * We set up most of the low 16 IO-APIC pins according to MPS rules. | ||
628 | */ | ||
629 | construct_default_ioirq_mptable(mpc_default_type); | ||
630 | |||
631 | lintsrc.mpc_type = MP_LINTSRC; | ||
632 | lintsrc.mpc_irqflag = 0; /* conforming */ | ||
633 | lintsrc.mpc_srcbusid = 0; | ||
634 | lintsrc.mpc_srcbusirq = 0; | ||
635 | lintsrc.mpc_destapic = MP_APIC_ALL; | ||
636 | for (i = 0; i < 2; i++) { | ||
637 | lintsrc.mpc_irqtype = linttypes[i]; | ||
638 | lintsrc.mpc_destapiclint = i; | ||
639 | MP_lintsrc_info(&lintsrc); | ||
640 | } | ||
641 | } | ||
642 | |||
/* MP floating pointer structure; stored by smp_scan_config() on a match and read by get_smp_config(). */
643 | static struct intel_mp_floating *mpf_found; | ||
644 | |||
645 | /* | ||
646 | * Scan the memory blocks for an SMP configuration block. | ||
647 | */ | ||
648 | void __init get_smp_config (void) | ||
649 | { | ||
650 | struct intel_mp_floating *mpf = mpf_found; | ||
651 | |||
652 | /* | ||
653 | * ACPI supports both logical (e.g. Hyper-Threading) and physical | ||
654 | * processors, where MPS only supports physical. | ||
655 | */ | ||
656 | if (acpi_lapic && acpi_ioapic) { | ||
657 | printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); | ||
658 | return; | ||
659 | } | ||
660 | else if (acpi_lapic) | ||
661 | printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); | ||
662 | |||
663 | printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); | ||
664 | if (mpf->mpf_feature2 & (1<<7)) { | ||
665 | printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); | ||
666 | pic_mode = 1; | ||
667 | } else { | ||
668 | printk(KERN_INFO " Virtual Wire compatibility mode.\n"); | ||
669 | pic_mode = 0; | ||
670 | } | ||
671 | |||
672 | /* | ||
673 | * Now see if we need to read further. | ||
674 | */ | ||
675 | if (mpf->mpf_feature1 != 0) { | ||
676 | |||
677 | printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); | ||
678 | construct_default_ISA_mptable(mpf->mpf_feature1); | ||
679 | |||
680 | } else if (mpf->mpf_physptr) { | ||
681 | |||
682 | /* | ||
683 | * Read the physical hardware table. Anything here will | ||
684 | * override the defaults. | ||
685 | */ | ||
686 | if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { | ||
687 | smp_found_config = 0; | ||
688 | printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); | ||
689 | printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); | ||
690 | return; | ||
691 | } | ||
692 | /* | ||
693 | * If there are no explicit MP IRQ entries, then we are | ||
694 | * broken. We set up most of the low 16 IO-APIC pins to | ||
695 | * ISA defaults and hope it will work. | ||
696 | */ | ||
697 | if (!mp_irq_entries) { | ||
698 | struct mpc_config_bus bus; | ||
699 | |||
700 | printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); | ||
701 | |||
702 | bus.mpc_type = MP_BUS; | ||
703 | bus.mpc_busid = 0; | ||
704 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
705 | MP_bus_info(&bus); | ||
706 | |||
707 | construct_default_ioirq_mptable(0); | ||
708 | } | ||
709 | |||
710 | } else | ||
711 | BUG(); | ||
712 | |||
713 | printk(KERN_INFO "Processors: %d\n", num_processors); | ||
714 | /* | ||
715 | * Only use the first configuration found. | ||
716 | */ | ||
717 | } | ||
718 | |||
719 | static int __init smp_scan_config (unsigned long base, unsigned long length) | ||
720 | { | ||
721 | unsigned long *bp = phys_to_virt(base); | ||
722 | struct intel_mp_floating *mpf; | ||
723 | |||
724 | Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | ||
725 | if (sizeof(*mpf) != 16) | ||
726 | printk("Error: MPF size\n"); | ||
727 | |||
728 | while (length > 0) { | ||
729 | mpf = (struct intel_mp_floating *)bp; | ||
730 | if ((*bp == SMP_MAGIC_IDENT) && | ||
731 | (mpf->mpf_length == 1) && | ||
732 | !mpf_checksum((unsigned char *)bp, 16) && | ||
733 | ((mpf->mpf_specification == 1) | ||
734 | || (mpf->mpf_specification == 4)) ) { | ||
735 | |||
736 | smp_found_config = 1; | ||
737 | printk(KERN_INFO "found SMP MP-table at %08lx\n", | ||
738 | virt_to_phys(mpf)); | ||
739 | reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); | ||
740 | if (mpf->mpf_physptr) { | ||
741 | /* | ||
742 | * We cannot access to MPC table to compute | ||
743 | * table size yet, as only few megabytes from | ||
744 | * the bottom is mapped now. | ||
745 | * PC-9800's MPC table places on the very last | ||
746 | * of physical memory; so that simply reserving | ||
747 | * PAGE_SIZE from mpg->mpf_physptr yields BUG() | ||
748 | * in reserve_bootmem. | ||
749 | */ | ||
750 | unsigned long size = PAGE_SIZE; | ||
751 | unsigned long end = max_low_pfn * PAGE_SIZE; | ||
752 | if (mpf->mpf_physptr + size > end) | ||
753 | size = end - mpf->mpf_physptr; | ||
754 | reserve_bootmem(mpf->mpf_physptr, size); | ||
755 | } | ||
756 | |||
757 | mpf_found = mpf; | ||
758 | return 1; | ||
759 | } | ||
760 | bp += 4; | ||
761 | length -= 16; | ||
762 | } | ||
763 | return 0; | ||
764 | } | ||
765 | |||
766 | void __init find_smp_config (void) | ||
767 | { | ||
768 | unsigned int address; | ||
769 | |||
770 | /* | ||
771 | * FIXME: Linux assumes you have 640K of base ram.. | ||
772 | * this continues the error... | ||
773 | * | ||
774 | * 1) Scan the bottom 1K for a signature | ||
775 | * 2) Scan the top 1K of base RAM | ||
776 | * 3) Scan the 64K of bios | ||
777 | */ | ||
778 | if (smp_scan_config(0x0,0x400) || | ||
779 | smp_scan_config(639*0x400,0x400) || | ||
780 | smp_scan_config(0xF0000,0x10000)) | ||
781 | return; | ||
782 | /* | ||
783 | * If it is an SMP machine we should know now, unless the | ||
784 | * configuration is in an EISA/MCA bus machine with an | ||
785 | * extended bios data area. | ||
786 | * | ||
787 | * there is a real-mode segmented pointer pointing to the | ||
788 | * 4K EBDA area at 0x40E, calculate and scan it here. | ||
789 | * | ||
790 | * NOTE! There are Linux loaders that will corrupt the EBDA | ||
791 | * area, and as such this kind of SMP config may be less | ||
792 | * trustworthy, simply because the SMP table may have been | ||
793 | * stomped on during early boot. These loaders are buggy and | ||
794 | * should be fixed. | ||
795 | * | ||
796 | * MP1.4 SPEC states to only scan first 1K of 4K EBDA. | ||
797 | */ | ||
798 | |||
799 | address = get_bios_ebda(); | ||
800 | if (address) | ||
801 | smp_scan_config(address, 0x400); | ||
802 | } | ||
803 | |||
/*
 * Platform flag read by mp_config_acpi_legacy_irqs(): when it is 1 the
 * legacy ISA identity mappings are not created. It is never assigned in
 * the visible part of this file.
 * NOTE(review): presumably set by ES7000 platform code - confirm.
 */
804 | int es7000_plat; | ||
805 | |||
806 | /* -------------------------------------------------------------------------- | ||
807 | ACPI-based MP Configuration | ||
808 | -------------------------------------------------------------------------- */ | ||
809 | |||
810 | #ifdef CONFIG_ACPI | ||
811 | |||
812 | void __init mp_register_lapic_address(u64 address) | ||
813 | { | ||
814 | mp_lapic_addr = (unsigned long) address; | ||
815 | |||
816 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
817 | |||
818 | if (boot_cpu_physical_apicid == -1U) | ||
819 | boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); | ||
820 | |||
821 | Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); | ||
822 | } | ||
823 | |||
824 | void __cpuinit mp_register_lapic (u8 id, u8 enabled) | ||
825 | { | ||
826 | struct mpc_config_processor processor; | ||
827 | int boot_cpu = 0; | ||
828 | |||
829 | if (MAX_APICS - id <= 0) { | ||
830 | printk(KERN_WARNING "Processor #%d invalid (max %d)\n", | ||
831 | id, MAX_APICS); | ||
832 | return; | ||
833 | } | ||
834 | |||
835 | if (id == boot_cpu_physical_apicid) | ||
836 | boot_cpu = 1; | ||
837 | |||
838 | processor.mpc_type = MP_PROCESSOR; | ||
839 | processor.mpc_apicid = id; | ||
840 | processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
841 | processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); | ||
842 | processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); | ||
843 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
844 | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; | ||
845 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
846 | processor.mpc_reserved[0] = 0; | ||
847 | processor.mpc_reserved[1] = 0; | ||
848 | |||
849 | MP_processor_info(&processor); | ||
850 | } | ||
851 | |||
852 | #ifdef CONFIG_X86_IO_APIC | ||
853 | |||
/* Source bus number used for the ISA interrupt entries fabricated from ACPI data. */
854 | #define MP_ISA_BUS 0 | ||
/* Not referenced in the visible part of this file. NOTE(review): 127 matches the 4 x 32 bits of pin_programmed[] below - confirm intent. */
855 | #define MP_MAX_IOAPIC_PIN 127 | ||
856 | |||
/*
 * Per-I/O-APIC routing state, indexed like mp_ioapics[] and filled in
 * by mp_register_ioapic().
 *   apic_id        - copy of the I/O APIC's mpc_apicid
 *   gsi_base       - first GSI handled by this I/O APIC
 *   gsi_end        - last GSI handled (inclusive, see mp_find_ioapic())
 *   pin_programmed - one bit per pin, set by mp_register_gsi() the
 *                    first time a pin is seen
 */
857 | static struct mp_ioapic_routing { | ||
858 | int apic_id; | ||
859 | int gsi_base; | ||
860 | int gsi_end; | ||
861 | u32 pin_programmed[4]; | ||
862 | } mp_ioapic_routing[MAX_IO_APICS]; | ||
863 | |||
864 | static int mp_find_ioapic (int gsi) | ||
865 | { | ||
866 | int i = 0; | ||
867 | |||
868 | /* Find the IOAPIC that manages this GSI. */ | ||
869 | for (i = 0; i < nr_ioapics; i++) { | ||
870 | if ((gsi >= mp_ioapic_routing[i].gsi_base) | ||
871 | && (gsi <= mp_ioapic_routing[i].gsi_end)) | ||
872 | return i; | ||
873 | } | ||
874 | |||
875 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | ||
876 | |||
877 | return -1; | ||
878 | } | ||
879 | |||
880 | void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) | ||
881 | { | ||
882 | int idx = 0; | ||
883 | int tmpid; | ||
884 | |||
885 | if (nr_ioapics >= MAX_IO_APICS) { | ||
886 | printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | ||
887 | "(found %d)\n", MAX_IO_APICS, nr_ioapics); | ||
888 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
889 | } | ||
890 | if (!address) { | ||
891 | printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | ||
892 | " found in MADT table, skipping!\n"); | ||
893 | return; | ||
894 | } | ||
895 | |||
896 | idx = nr_ioapics++; | ||
897 | |||
898 | mp_ioapics[idx].mpc_type = MP_IOAPIC; | ||
899 | mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | ||
900 | mp_ioapics[idx].mpc_apicaddr = address; | ||
901 | |||
902 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | ||
903 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
904 | && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
905 | tmpid = io_apic_get_unique_id(idx, id); | ||
906 | else | ||
907 | tmpid = id; | ||
908 | if (tmpid == -1) { | ||
909 | nr_ioapics--; | ||
910 | return; | ||
911 | } | ||
912 | mp_ioapics[idx].mpc_apicid = tmpid; | ||
913 | mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); | ||
914 | |||
915 | /* | ||
916 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups | ||
917 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). | ||
918 | */ | ||
919 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | ||
920 | mp_ioapic_routing[idx].gsi_base = gsi_base; | ||
921 | mp_ioapic_routing[idx].gsi_end = gsi_base + | ||
922 | io_apic_get_redir_entries(idx); | ||
923 | |||
924 | printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " | ||
925 | "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | ||
926 | mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | ||
927 | mp_ioapic_routing[idx].gsi_base, | ||
928 | mp_ioapic_routing[idx].gsi_end); | ||
929 | } | ||
930 | |||
931 | void __init | ||
932 | mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) | ||
933 | { | ||
934 | struct mpc_config_intsrc intsrc; | ||
935 | int ioapic = -1; | ||
936 | int pin = -1; | ||
937 | |||
938 | /* | ||
939 | * Convert 'gsi' to 'ioapic.pin'. | ||
940 | */ | ||
941 | ioapic = mp_find_ioapic(gsi); | ||
942 | if (ioapic < 0) | ||
943 | return; | ||
944 | pin = gsi - mp_ioapic_routing[ioapic].gsi_base; | ||
945 | |||
946 | /* | ||
947 | * TBD: This check is for faulty timer entries, where the override | ||
948 | * erroneously sets the trigger to level, resulting in a HUGE | ||
949 | * increase of timer interrupts! | ||
950 | */ | ||
951 | if ((bus_irq == 0) && (trigger == 3)) | ||
952 | trigger = 1; | ||
953 | |||
954 | intsrc.mpc_type = MP_INTSRC; | ||
955 | intsrc.mpc_irqtype = mp_INT; | ||
956 | intsrc.mpc_irqflag = (trigger << 2) | polarity; | ||
957 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
958 | intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | ||
959 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | ||
960 | intsrc.mpc_dstirq = pin; /* INTIN# */ | ||
961 | |||
962 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", | ||
963 | intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
964 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
965 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); | ||
966 | |||
967 | mp_irqs[mp_irq_entries] = intsrc; | ||
968 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
969 | panic("Max # of irq sources exceeded!\n"); | ||
970 | } | ||
971 | |||
972 | void __init mp_config_acpi_legacy_irqs (void) | ||
973 | { | ||
974 | struct mpc_config_intsrc intsrc; | ||
975 | int i = 0; | ||
976 | int ioapic = -1; | ||
977 | |||
978 | /* | ||
979 | * Fabricate the legacy ISA bus (bus #31). | ||
980 | */ | ||
981 | mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; | ||
982 | Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); | ||
983 | |||
984 | /* | ||
985 | * Older generations of ES7000 have no legacy identity mappings | ||
986 | */ | ||
987 | if (es7000_plat == 1) | ||
988 | return; | ||
989 | |||
990 | /* | ||
991 | * Locate the IOAPIC that manages the ISA IRQs (0-15). | ||
992 | */ | ||
993 | ioapic = mp_find_ioapic(0); | ||
994 | if (ioapic < 0) | ||
995 | return; | ||
996 | |||
997 | intsrc.mpc_type = MP_INTSRC; | ||
998 | intsrc.mpc_irqflag = 0; /* Conforming */ | ||
999 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
1000 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | ||
1001 | |||
1002 | /* | ||
1003 | * Use the default configuration for the IRQs 0-15. Unless | ||
1004 | * overriden by (MADT) interrupt source override entries. | ||
1005 | */ | ||
1006 | for (i = 0; i < 16; i++) { | ||
1007 | int idx; | ||
1008 | |||
1009 | for (idx = 0; idx < mp_irq_entries; idx++) { | ||
1010 | struct mpc_config_intsrc *irq = mp_irqs + idx; | ||
1011 | |||
1012 | /* Do we already have a mapping for this ISA IRQ? */ | ||
1013 | if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) | ||
1014 | break; | ||
1015 | |||
1016 | /* Do we already have a mapping for this IOAPIC pin */ | ||
1017 | if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | ||
1018 | (irq->mpc_dstirq == i)) | ||
1019 | break; | ||
1020 | } | ||
1021 | |||
1022 | if (idx != mp_irq_entries) { | ||
1023 | printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | ||
1024 | continue; /* IRQ already used */ | ||
1025 | } | ||
1026 | |||
1027 | intsrc.mpc_irqtype = mp_INT; | ||
1028 | intsrc.mpc_srcbusirq = i; /* Identity mapped */ | ||
1029 | intsrc.mpc_dstirq = i; | ||
1030 | |||
1031 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " | ||
1032 | "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
1033 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
1034 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, | ||
1035 | intsrc.mpc_dstirq); | ||
1036 | |||
1037 | mp_irqs[mp_irq_entries] = intsrc; | ||
1038 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
1039 | panic("Max # of irq sources exceeded!\n"); | ||
1040 | } | ||
1041 | } | ||
1042 | |||
1043 | #define MAX_GSI_NUM 4096 | ||
1044 | |||
1045 | int mp_register_gsi(u32 gsi, int triggering, int polarity) | ||
1046 | { | ||
1047 | int ioapic = -1; | ||
1048 | int ioapic_pin = 0; | ||
1049 | int idx, bit = 0; | ||
1050 | static int pci_irq = 16; | ||
1051 | /* | ||
1052 | * Mapping between Global System Interrups, which | ||
1053 | * represent all possible interrupts, and IRQs | ||
1054 | * assigned to actual devices. | ||
1055 | */ | ||
1056 | static int gsi_to_irq[MAX_GSI_NUM]; | ||
1057 | |||
1058 | /* Don't set up the ACPI SCI because it's already set up */ | ||
1059 | if (acpi_gbl_FADT.sci_interrupt == gsi) | ||
1060 | return gsi; | ||
1061 | |||
1062 | ioapic = mp_find_ioapic(gsi); | ||
1063 | if (ioapic < 0) { | ||
1064 | printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | ||
1065 | return gsi; | ||
1066 | } | ||
1067 | |||
1068 | ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; | ||
1069 | |||
1070 | if (ioapic_renumber_irq) | ||
1071 | gsi = ioapic_renumber_irq(ioapic, gsi); | ||
1072 | |||
1073 | /* | ||
1074 | * Avoid pin reprogramming. PRTs typically include entries | ||
1075 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
1076 | * we only program the IOAPIC on the first. | ||
1077 | */ | ||
1078 | bit = ioapic_pin % 32; | ||
1079 | idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); | ||
1080 | if (idx > 3) { | ||
1081 | printk(KERN_ERR "Invalid reference to IOAPIC pin " | ||
1082 | "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | ||
1083 | ioapic_pin); | ||
1084 | return gsi; | ||
1085 | } | ||
1086 | if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | ||
1087 | Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | ||
1088 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | ||
1089 | return gsi_to_irq[gsi]; | ||
1090 | } | ||
1091 | |||
1092 | mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | ||
1093 | |||
1094 | if (triggering == ACPI_LEVEL_SENSITIVE) { | ||
1095 | /* | ||
1096 | * For PCI devices assign IRQs in order, avoiding gaps | ||
1097 | * due to unused I/O APIC pins. | ||
1098 | */ | ||
1099 | int irq = gsi; | ||
1100 | if (gsi < MAX_GSI_NUM) { | ||
1101 | /* | ||
1102 | * Retain the VIA chipset work-around (gsi > 15), but | ||
1103 | * avoid a problem where the 8254 timer (IRQ0) is setup | ||
1104 | * via an override (so it's not on pin 0 of the ioapic), | ||
1105 | * and at the same time, the pin 0 interrupt is a PCI | ||
1106 | * type. The gsi > 15 test could cause these two pins | ||
1107 | * to be shared as IRQ0, and they are not shareable. | ||
1108 | * So test for this condition, and if necessary, avoid | ||
1109 | * the pin collision. | ||
1110 | */ | ||
1111 | if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) | ||
1112 | gsi = pci_irq++; | ||
1113 | /* | ||
1114 | * Don't assign IRQ used by ACPI SCI | ||
1115 | */ | ||
1116 | if (gsi == acpi_gbl_FADT.sci_interrupt) | ||
1117 | gsi = pci_irq++; | ||
1118 | gsi_to_irq[irq] = gsi; | ||
1119 | } else { | ||
1120 | printk(KERN_ERR "GSI %u is too high\n", gsi); | ||
1121 | return gsi; | ||
1122 | } | ||
1123 | } | ||
1124 | |||
1125 | io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | ||
1126 | triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, | ||
1127 | polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
1128 | return gsi; | ||
1129 | } | ||
1130 | |||
1131 | #endif /* CONFIG_X86_IO_APIC */ | ||
1132 | #endif /* CONFIG_ACPI */ | ||
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c new file mode 100644 index 000000000000..0c1069b8d638 --- /dev/null +++ b/arch/x86/kernel/msr.c | |||
@@ -0,0 +1,224 @@ | |||
1 | /* ----------------------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2000 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, | ||
8 | * USA; either version 2 of the License, or (at your option) any later | ||
9 | * version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * msr.c | ||
15 | * | ||
16 | * x86 MSR access device | ||
17 | * | ||
18 | * This device is accessed by lseek() to the appropriate register number | ||
19 | * and then read/write in chunks of 8 bytes. A larger size means multiple | ||
20 | * reads or writes of the same register. | ||
21 | * | ||
22 | * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on | ||
23 | * an SMP box will direct the access to CPU %d. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | |||
28 | #include <linux/types.h> | ||
29 | #include <linux/errno.h> | ||
30 | #include <linux/fcntl.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/poll.h> | ||
33 | #include <linux/smp.h> | ||
34 | #include <linux/smp_lock.h> | ||
35 | #include <linux/major.h> | ||
36 | #include <linux/fs.h> | ||
37 | #include <linux/device.h> | ||
38 | #include <linux/cpu.h> | ||
39 | #include <linux/notifier.h> | ||
40 | |||
41 | #include <asm/processor.h> | ||
42 | #include <asm/msr.h> | ||
43 | #include <asm/uaccess.h> | ||
44 | #include <asm/system.h> | ||
45 | |||
46 | static struct class *msr_class; | ||
47 | |||
48 | static loff_t msr_seek(struct file *file, loff_t offset, int orig) | ||
49 | { | ||
50 | loff_t ret = -EINVAL; | ||
51 | |||
52 | lock_kernel(); | ||
53 | switch (orig) { | ||
54 | case 0: | ||
55 | file->f_pos = offset; | ||
56 | ret = file->f_pos; | ||
57 | break; | ||
58 | case 1: | ||
59 | file->f_pos += offset; | ||
60 | ret = file->f_pos; | ||
61 | } | ||
62 | unlock_kernel(); | ||
63 | return ret; | ||
64 | } | ||
65 | |||
66 | static ssize_t msr_read(struct file *file, char __user * buf, | ||
67 | size_t count, loff_t * ppos) | ||
68 | { | ||
69 | u32 __user *tmp = (u32 __user *) buf; | ||
70 | u32 data[2]; | ||
71 | u32 reg = *ppos; | ||
72 | int cpu = iminor(file->f_path.dentry->d_inode); | ||
73 | int err; | ||
74 | |||
75 | if (count % 8) | ||
76 | return -EINVAL; /* Invalid chunk size */ | ||
77 | |||
78 | for (; count; count -= 8) { | ||
79 | err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); | ||
80 | if (err) | ||
81 | return -EIO; | ||
82 | if (copy_to_user(tmp, &data, 8)) | ||
83 | return -EFAULT; | ||
84 | tmp += 2; | ||
85 | } | ||
86 | |||
87 | return ((char __user *)tmp) - buf; | ||
88 | } | ||
89 | |||
90 | static ssize_t msr_write(struct file *file, const char __user *buf, | ||
91 | size_t count, loff_t *ppos) | ||
92 | { | ||
93 | const u32 __user *tmp = (const u32 __user *)buf; | ||
94 | u32 data[2]; | ||
95 | u32 reg = *ppos; | ||
96 | int cpu = iminor(file->f_path.dentry->d_inode); | ||
97 | int err; | ||
98 | |||
99 | if (count % 8) | ||
100 | return -EINVAL; /* Invalid chunk size */ | ||
101 | |||
102 | for (; count; count -= 8) { | ||
103 | if (copy_from_user(&data, tmp, 8)) | ||
104 | return -EFAULT; | ||
105 | err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); | ||
106 | if (err) | ||
107 | return -EIO; | ||
108 | tmp += 2; | ||
109 | } | ||
110 | |||
111 | return ((char __user *)tmp) - buf; | ||
112 | } | ||
113 | |||
114 | static int msr_open(struct inode *inode, struct file *file) | ||
115 | { | ||
116 | unsigned int cpu = iminor(file->f_path.dentry->d_inode); | ||
117 | struct cpuinfo_x86 *c = &(cpu_data)[cpu]; | ||
118 | |||
119 | if (cpu >= NR_CPUS || !cpu_online(cpu)) | ||
120 | return -ENXIO; /* No such CPU */ | ||
121 | if (!cpu_has(c, X86_FEATURE_MSR)) | ||
122 | return -EIO; /* MSR not supported */ | ||
123 | |||
124 | return 0; | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * File operations we support | ||
129 | */ | ||
130 | static const struct file_operations msr_fops = { | ||
131 | .owner = THIS_MODULE, | ||
132 | .llseek = msr_seek, | ||
133 | .read = msr_read, | ||
134 | .write = msr_write, | ||
135 | .open = msr_open, | ||
136 | }; | ||
137 | |||
138 | static int msr_device_create(int i) | ||
139 | { | ||
140 | int err = 0; | ||
141 | struct device *dev; | ||
142 | |||
143 | dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i); | ||
144 | if (IS_ERR(dev)) | ||
145 | err = PTR_ERR(dev); | ||
146 | return err; | ||
147 | } | ||
148 | |||
149 | static int msr_class_cpu_callback(struct notifier_block *nfb, | ||
150 | unsigned long action, void *hcpu) | ||
151 | { | ||
152 | unsigned int cpu = (unsigned long)hcpu; | ||
153 | |||
154 | switch (action) { | ||
155 | case CPU_ONLINE: | ||
156 | case CPU_ONLINE_FROZEN: | ||
157 | msr_device_create(cpu); | ||
158 | break; | ||
159 | case CPU_DEAD: | ||
160 | case CPU_DEAD_FROZEN: | ||
161 | device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); | ||
162 | break; | ||
163 | } | ||
164 | return NOTIFY_OK; | ||
165 | } | ||
166 | |||
167 | static struct notifier_block __cpuinitdata msr_class_cpu_notifier = | ||
168 | { | ||
169 | .notifier_call = msr_class_cpu_callback, | ||
170 | }; | ||
171 | |||
172 | static int __init msr_init(void) | ||
173 | { | ||
174 | int i, err = 0; | ||
175 | i = 0; | ||
176 | |||
177 | if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { | ||
178 | printk(KERN_ERR "msr: unable to get major %d for msr\n", | ||
179 | MSR_MAJOR); | ||
180 | err = -EBUSY; | ||
181 | goto out; | ||
182 | } | ||
183 | msr_class = class_create(THIS_MODULE, "msr"); | ||
184 | if (IS_ERR(msr_class)) { | ||
185 | err = PTR_ERR(msr_class); | ||
186 | goto out_chrdev; | ||
187 | } | ||
188 | for_each_online_cpu(i) { | ||
189 | err = msr_device_create(i); | ||
190 | if (err != 0) | ||
191 | goto out_class; | ||
192 | } | ||
193 | register_hotcpu_notifier(&msr_class_cpu_notifier); | ||
194 | |||
195 | err = 0; | ||
196 | goto out; | ||
197 | |||
198 | out_class: | ||
199 | i = 0; | ||
200 | for_each_online_cpu(i) | ||
201 | device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); | ||
202 | class_destroy(msr_class); | ||
203 | out_chrdev: | ||
204 | unregister_chrdev(MSR_MAJOR, "cpu/msr"); | ||
205 | out: | ||
206 | return err; | ||
207 | } | ||
208 | |||
209 | static void __exit msr_exit(void) | ||
210 | { | ||
211 | int cpu = 0; | ||
212 | for_each_online_cpu(cpu) | ||
213 | device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); | ||
214 | class_destroy(msr_class); | ||
215 | unregister_chrdev(MSR_MAJOR, "cpu/msr"); | ||
216 | unregister_hotcpu_notifier(&msr_class_cpu_notifier); | ||
217 | } | ||
218 | |||
219 | module_init(msr_init); | ||
220 | module_exit(msr_exit) | ||
221 | |||
222 | MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); | ||
223 | MODULE_DESCRIPTION("x86 generic MSR driver"); | ||
224 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c new file mode 100644 index 000000000000..c7227e2180f8 --- /dev/null +++ b/arch/x86/kernel/nmi_32.c | |||
@@ -0,0 +1,468 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/nmi.c | ||
3 | * | ||
4 | * NMI watchdog support on APIC systems | ||
5 | * | ||
6 | * Started by Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes: | ||
9 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | ||
10 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | ||
11 | * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. | ||
12 | * Pavel Machek and | ||
13 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | ||
14 | */ | ||
15 | |||
16 | #include <linux/delay.h> | ||
17 | #include <linux/interrupt.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/nmi.h> | ||
20 | #include <linux/sysdev.h> | ||
21 | #include <linux/sysctl.h> | ||
22 | #include <linux/percpu.h> | ||
23 | #include <linux/kprobes.h> | ||
24 | #include <linux/cpumask.h> | ||
25 | #include <linux/kernel_stat.h> | ||
26 | #include <linux/kdebug.h> | ||
27 | |||
28 | #include <asm/smp.h> | ||
29 | #include <asm/nmi.h> | ||
30 | |||
31 | #include "mach_traps.h" | ||
32 | |||
33 | int unknown_nmi_panic; | ||
34 | int nmi_watchdog_enabled; | ||
35 | |||
36 | static cpumask_t backtrace_mask = CPU_MASK_NONE; | ||
37 | |||
38 | /* nmi_active: | ||
39 | * >0: the lapic NMI watchdog is active, but can be disabled | ||
40 | * <0: the lapic NMI watchdog has not been set up, and cannot | ||
41 | * be enabled | ||
42 | * 0: the lapic NMI watchdog is disabled, but can be enabled | ||
43 | */ | ||
44 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
45 | |||
46 | unsigned int nmi_watchdog = NMI_DEFAULT; | ||
47 | static unsigned int nmi_hz = HZ; | ||
48 | |||
49 | static DEFINE_PER_CPU(short, wd_enabled); | ||
50 | |||
51 | /* local prototypes */ | ||
52 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); | ||
53 | |||
54 | static int endflag __initdata = 0; | ||
55 | |||
56 | #ifdef CONFIG_SMP | ||
57 | /* The performance counters used by NMI_LOCAL_APIC don't trigger when | ||
58 | * the CPU is idle. To make sure the NMI watchdog really ticks on all | ||
59 | * CPUs during the test make them busy. | ||
60 | */ | ||
61 | static __init void nmi_cpu_busy(void *data) | ||
62 | { | ||
63 | local_irq_enable_in_hardirq(); | ||
64 | /* Intentionally don't use cpu_relax here. This is | ||
65 | to make sure that the performance counter really ticks, | ||
66 | even if there is a simulator or similar that catches the | ||
67 | pause instruction. On a real HT machine this is fine because | ||
68 | all other CPUs are busy with "useless" delay loops and don't | ||
69 | care if they get somewhat less cycles. */ | ||
70 | while (endflag == 0) | ||
71 | mb(); | ||
72 | } | ||
73 | #endif | ||
74 | |||
75 | static int __init check_nmi_watchdog(void) | ||
76 | { | ||
77 | unsigned int *prev_nmi_count; | ||
78 | int cpu; | ||
79 | |||
80 | if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) | ||
81 | return 0; | ||
82 | |||
83 | if (!atomic_read(&nmi_active)) | ||
84 | return 0; | ||
85 | |||
86 | prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); | ||
87 | if (!prev_nmi_count) | ||
88 | return -1; | ||
89 | |||
90 | printk(KERN_INFO "Testing NMI watchdog ... "); | ||
91 | |||
92 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
93 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); | ||
94 | |||
95 | for_each_possible_cpu(cpu) | ||
96 | prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; | ||
97 | local_irq_enable(); | ||
98 | mdelay((20*1000)/nmi_hz); // wait 20 ticks | ||
99 | |||
100 | for_each_possible_cpu(cpu) { | ||
101 | #ifdef CONFIG_SMP | ||
102 | /* Check cpu_callin_map here because that is set | ||
103 | after the timer is started. */ | ||
104 | if (!cpu_isset(cpu, cpu_callin_map)) | ||
105 | continue; | ||
106 | #endif | ||
107 | if (!per_cpu(wd_enabled, cpu)) | ||
108 | continue; | ||
109 | if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { | ||
110 | printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", | ||
111 | cpu, | ||
112 | prev_nmi_count[cpu], | ||
113 | nmi_count(cpu)); | ||
114 | per_cpu(wd_enabled, cpu) = 0; | ||
115 | atomic_dec(&nmi_active); | ||
116 | } | ||
117 | } | ||
118 | endflag = 1; | ||
119 | if (!atomic_read(&nmi_active)) { | ||
120 | kfree(prev_nmi_count); | ||
121 | atomic_set(&nmi_active, -1); | ||
122 | return -1; | ||
123 | } | ||
124 | printk("OK.\n"); | ||
125 | |||
126 | /* now that we know it works we can reduce NMI frequency to | ||
127 | something more reasonable; makes a difference in some configs */ | ||
128 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
129 | nmi_hz = lapic_adjust_nmi_hz(1); | ||
130 | |||
131 | kfree(prev_nmi_count); | ||
132 | return 0; | ||
133 | } | ||
134 | /* This needs to happen later in boot so counters are working */ | ||
135 | late_initcall(check_nmi_watchdog); | ||
136 | |||
137 | static int __init setup_nmi_watchdog(char *str) | ||
138 | { | ||
139 | int nmi; | ||
140 | |||
141 | get_option(&str, &nmi); | ||
142 | |||
143 | if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) | ||
144 | return 0; | ||
145 | |||
146 | nmi_watchdog = nmi; | ||
147 | return 1; | ||
148 | } | ||
149 | |||
150 | __setup("nmi_watchdog=", setup_nmi_watchdog); | ||
151 | |||
152 | |||
153 | /* Suspend/resume support */ | ||
154 | |||
155 | #ifdef CONFIG_PM | ||
156 | |||
157 | static int nmi_pm_active; /* nmi_active before suspend */ | ||
158 | |||
159 | static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) | ||
160 | { | ||
161 | /* only CPU0 goes here, other CPUs should be offline */ | ||
162 | nmi_pm_active = atomic_read(&nmi_active); | ||
163 | stop_apic_nmi_watchdog(NULL); | ||
164 | BUG_ON(atomic_read(&nmi_active) != 0); | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | static int lapic_nmi_resume(struct sys_device *dev) | ||
169 | { | ||
170 | /* only CPU0 goes here, other CPUs should be offline */ | ||
171 | if (nmi_pm_active > 0) { | ||
172 | setup_apic_nmi_watchdog(NULL); | ||
173 | touch_nmi_watchdog(); | ||
174 | } | ||
175 | return 0; | ||
176 | } | ||
177 | |||
178 | |||
179 | static struct sysdev_class nmi_sysclass = { | ||
180 | set_kset_name("lapic_nmi"), | ||
181 | .resume = lapic_nmi_resume, | ||
182 | .suspend = lapic_nmi_suspend, | ||
183 | }; | ||
184 | |||
185 | static struct sys_device device_lapic_nmi = { | ||
186 | .id = 0, | ||
187 | .cls = &nmi_sysclass, | ||
188 | }; | ||
189 | |||
190 | static int __init init_lapic_nmi_sysfs(void) | ||
191 | { | ||
192 | int error; | ||
193 | |||
194 | /* should really be a BUG_ON but b/c this is an | ||
195 | * init call, it just doesn't work. -dcz | ||
196 | */ | ||
197 | if (nmi_watchdog != NMI_LOCAL_APIC) | ||
198 | return 0; | ||
199 | |||
200 | if (atomic_read(&nmi_active) < 0) | ||
201 | return 0; | ||
202 | |||
203 | error = sysdev_class_register(&nmi_sysclass); | ||
204 | if (!error) | ||
205 | error = sysdev_register(&device_lapic_nmi); | ||
206 | return error; | ||
207 | } | ||
208 | /* must come after the local APIC's device_initcall() */ | ||
209 | late_initcall(init_lapic_nmi_sysfs); | ||
210 | |||
211 | #endif /* CONFIG_PM */ | ||
212 | |||
213 | static void __acpi_nmi_enable(void *__unused) | ||
214 | { | ||
215 | apic_write_around(APIC_LVT0, APIC_DM_NMI); | ||
216 | } | ||
217 | |||
218 | /* | ||
219 | * Enable timer based NMIs on all CPUs: | ||
220 | */ | ||
221 | void acpi_nmi_enable(void) | ||
222 | { | ||
223 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
224 | on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); | ||
225 | } | ||
226 | |||
227 | static void __acpi_nmi_disable(void *__unused) | ||
228 | { | ||
229 | apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Disable timer based NMIs on all CPUs: | ||
234 | */ | ||
235 | void acpi_nmi_disable(void) | ||
236 | { | ||
237 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
238 | on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); | ||
239 | } | ||
240 | |||
241 | void setup_apic_nmi_watchdog (void *unused) | ||
242 | { | ||
243 | if (__get_cpu_var(wd_enabled)) | ||
244 | return; | ||
245 | |||
246 | /* cheap hack to support suspend/resume */ | ||
247 | /* if cpu0 is not active neither should the other cpus */ | ||
248 | if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) | ||
249 | return; | ||
250 | |||
251 | switch (nmi_watchdog) { | ||
252 | case NMI_LOCAL_APIC: | ||
253 | __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ | ||
254 | if (lapic_watchdog_init(nmi_hz) < 0) { | ||
255 | __get_cpu_var(wd_enabled) = 0; | ||
256 | return; | ||
257 | } | ||
258 | /* FALL THROUGH */ | ||
259 | case NMI_IO_APIC: | ||
260 | __get_cpu_var(wd_enabled) = 1; | ||
261 | atomic_inc(&nmi_active); | ||
262 | } | ||
263 | } | ||
264 | |||
265 | void stop_apic_nmi_watchdog(void *unused) | ||
266 | { | ||
267 | /* only support LOCAL and IO APICs for now */ | ||
268 | if ((nmi_watchdog != NMI_LOCAL_APIC) && | ||
269 | (nmi_watchdog != NMI_IO_APIC)) | ||
270 | return; | ||
271 | if (__get_cpu_var(wd_enabled) == 0) | ||
272 | return; | ||
273 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
274 | lapic_watchdog_stop(); | ||
275 | __get_cpu_var(wd_enabled) = 0; | ||
276 | atomic_dec(&nmi_active); | ||
277 | } | ||
278 | |||
279 | /* | ||
280 | * the best way to detect whether a CPU has a 'hard lockup' problem | ||
281 | * is to check its local APIC timer IRQ counts. If they are not | ||
282 | * changing then that CPU has some problem. | ||
283 | * | ||
284 | * as these watchdog NMI IRQs are generated on every CPU, we only | ||
285 | * have to check the current processor. | ||
286 | * | ||
287 | * since NMIs don't listen to _any_ locks, we have to be extremely | ||
288 | * careful not to rely on unsafe variables. The printk might lock | ||
289 | * up though, so we have to break up any console locks first ... | ||
290 | * [when there will be more tty-related locks, break them up | ||
291 | * here too!] | ||
292 | */ | ||
293 | |||
294 | static unsigned int | ||
295 | last_irq_sums [NR_CPUS], | ||
296 | alert_counter [NR_CPUS]; | ||
297 | |||
298 | void touch_nmi_watchdog(void) | ||
299 | { | ||
300 | if (nmi_watchdog > 0) { | ||
301 | unsigned cpu; | ||
302 | |||
303 | /* | ||
304 | * Just reset the alert counters, (other CPUs might be | ||
305 | * spinning on locks we hold): | ||
306 | */ | ||
307 | for_each_present_cpu(cpu) { | ||
308 | if (alert_counter[cpu]) | ||
309 | alert_counter[cpu] = 0; | ||
310 | } | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Tickle the softlockup detector too: | ||
315 | */ | ||
316 | touch_softlockup_watchdog(); | ||
317 | } | ||
318 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
319 | |||
320 | extern void die_nmi(struct pt_regs *, const char *msg); | ||
321 | |||
322 | __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) | ||
323 | { | ||
324 | |||
325 | /* | ||
326 | * Since current_thread_info()-> is always on the stack, and we | ||
327 | * always switch the stack NMI-atomically, it's safe to use | ||
328 | * smp_processor_id(). | ||
329 | */ | ||
330 | unsigned int sum; | ||
331 | int touched = 0; | ||
332 | int cpu = smp_processor_id(); | ||
333 | int rc=0; | ||
334 | |||
335 | /* check for other users first */ | ||
336 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
337 | == NOTIFY_STOP) { | ||
338 | rc = 1; | ||
339 | touched = 1; | ||
340 | } | ||
341 | |||
342 | if (cpu_isset(cpu, backtrace_mask)) { | ||
343 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ | ||
344 | |||
345 | spin_lock(&lock); | ||
346 | printk("NMI backtrace for cpu %d\n", cpu); | ||
347 | dump_stack(); | ||
348 | spin_unlock(&lock); | ||
349 | cpu_clear(cpu, backtrace_mask); | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * Take the local apic timer and PIT/HPET into account. We don't | ||
354 | * know which one is active, when we have highres/dyntick on | ||
355 | */ | ||
356 | sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0]; | ||
357 | |||
358 | /* if the none of the timers isn't firing, this cpu isn't doing much */ | ||
359 | if (!touched && last_irq_sums[cpu] == sum) { | ||
360 | /* | ||
361 | * Ayiee, looks like this CPU is stuck ... | ||
362 | * wait a few IRQs (5 seconds) before doing the oops ... | ||
363 | */ | ||
364 | alert_counter[cpu]++; | ||
365 | if (alert_counter[cpu] == 5*nmi_hz) | ||
366 | /* | ||
367 | * die_nmi will return ONLY if NOTIFY_STOP happens.. | ||
368 | */ | ||
369 | die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); | ||
370 | } else { | ||
371 | last_irq_sums[cpu] = sum; | ||
372 | alert_counter[cpu] = 0; | ||
373 | } | ||
374 | /* see if the nmi watchdog went off */ | ||
375 | if (!__get_cpu_var(wd_enabled)) | ||
376 | return rc; | ||
377 | switch (nmi_watchdog) { | ||
378 | case NMI_LOCAL_APIC: | ||
379 | rc |= lapic_wd_event(nmi_hz); | ||
380 | break; | ||
381 | case NMI_IO_APIC: | ||
382 | /* don't know how to accurately check for this. | ||
383 | * just assume it was a watchdog timer interrupt | ||
384 | * This matches the old behaviour. | ||
385 | */ | ||
386 | rc = 1; | ||
387 | break; | ||
388 | } | ||
389 | return rc; | ||
390 | } | ||
391 | |||
/*
 * Hook for NMIs: when built with CONFIG_SYSCTL and unknown_nmi_panic is
 * set, hand the NMI to unknown_nmi_panic_callback().  Returns that
 * callback's result, or 0 when the callback is not used.
 */
int do_nmi_callback(struct pt_regs * regs, int cpu)
{
	int handled = 0;

#ifdef CONFIG_SYSCTL
	if (unknown_nmi_panic)
		handled = unknown_nmi_panic_callback(regs, cpu);
#endif
	return handled;
}
400 | |||
401 | #ifdef CONFIG_SYSCTL | ||
402 | |||
/*
 * Format a message containing the NMI reason byte and pass it to
 * die_nmi().  Returns 0 if die_nmi() returns.
 */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
	char msg[64];
	unsigned char why;

	why = get_nmi_reason();
	sprintf(msg, "NMI received for unknown reason %02x\n", why);
	die_nmi(regs, msg);

	return 0;
}
412 | |||
413 | /* | ||
414 | * proc handler for /proc/sys/kernel/nmi | ||
415 | */ | ||
416 | int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, | ||
417 | void __user *buffer, size_t *length, loff_t *ppos) | ||
418 | { | ||
419 | int old_state; | ||
420 | |||
421 | nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; | ||
422 | old_state = nmi_watchdog_enabled; | ||
423 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
424 | if (!!old_state == !!nmi_watchdog_enabled) | ||
425 | return 0; | ||
426 | |||
427 | if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { | ||
428 | printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); | ||
429 | return -EIO; | ||
430 | } | ||
431 | |||
432 | if (nmi_watchdog == NMI_DEFAULT) { | ||
433 | if (lapic_watchdog_ok()) | ||
434 | nmi_watchdog = NMI_LOCAL_APIC; | ||
435 | else | ||
436 | nmi_watchdog = NMI_IO_APIC; | ||
437 | } | ||
438 | |||
439 | if (nmi_watchdog == NMI_LOCAL_APIC) { | ||
440 | if (nmi_watchdog_enabled) | ||
441 | enable_lapic_nmi_watchdog(); | ||
442 | else | ||
443 | disable_lapic_nmi_watchdog(); | ||
444 | } else { | ||
445 | printk( KERN_WARNING | ||
446 | "NMI watchdog doesn't know what hardware to touch\n"); | ||
447 | return -EIO; | ||
448 | } | ||
449 | return 0; | ||
450 | } | ||
451 | |||
452 | #endif | ||
453 | |||
454 | void __trigger_all_cpu_backtrace(void) | ||
455 | { | ||
456 | int i; | ||
457 | |||
458 | backtrace_mask = cpu_online_map; | ||
459 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | ||
460 | for (i = 0; i < 10 * 1000; i++) { | ||
461 | if (cpus_empty(backtrace_mask)) | ||
462 | break; | ||
463 | mdelay(1); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | EXPORT_SYMBOL(nmi_active); | ||
468 | EXPORT_SYMBOL(nmi_watchdog); | ||
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c new file mode 100644 index 000000000000..9000d82c6dc0 --- /dev/null +++ b/arch/x86/kernel/numaq_32.c | |||
@@ -0,0 +1,89 @@ | |||
1 | /* | ||
2 | * Written by: Patricia Gaughen, IBM Corporation | ||
3 | * | ||
4 | * Copyright (C) 2002, IBM Corp. | ||
5 | * | ||
6 | * All rights reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, but | ||
14 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
16 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
17 | * details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with this program; if not, write to the Free Software | ||
21 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
22 | * | ||
23 | * Send feedback to <gone@us.ibm.com> | ||
24 | */ | ||
25 | |||
26 | #include <linux/mm.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mmzone.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/nodemask.h> | ||
31 | #include <asm/numaq.h> | ||
32 | #include <asm/topology.h> | ||
33 | #include <asm/processor.h> | ||
34 | |||
35 | #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) | ||
36 | |||
37 | /* | ||
38 | * Function: smp_dump_qct() | ||
39 | * | ||
40 | * Description: gets memory layout from the quad config table. This | ||
41 | * function also updates node_online_map with the nodes (quads) present. | ||
42 | */ | ||
43 | static void __init smp_dump_qct(void) | ||
44 | { | ||
45 | int node; | ||
46 | struct eachquadmem *eq; | ||
47 | struct sys_cfg_data *scd = | ||
48 | (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR); | ||
49 | |||
50 | nodes_clear(node_online_map); | ||
51 | for_each_node(node) { | ||
52 | if (scd->quads_present31_0 & (1 << node)) { | ||
53 | node_set_online(node); | ||
54 | eq = &scd->eq[node]; | ||
55 | /* Convert to pages */ | ||
56 | node_start_pfn[node] = MB_TO_PAGES( | ||
57 | eq->hi_shrd_mem_start - eq->priv_mem_size); | ||
58 | node_end_pfn[node] = MB_TO_PAGES( | ||
59 | eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); | ||
60 | |||
61 | memory_present(node, | ||
62 | node_start_pfn[node], node_end_pfn[node]); | ||
63 | node_remap_size[node] = node_memmap_size_bytes(node, | ||
64 | node_start_pfn[node], | ||
65 | node_end_pfn[node]); | ||
66 | } | ||
67 | } | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * Unlike Summit, we don't really care to let the NUMA-Q | ||
72 | * fall back to flat mode. Don't compile for NUMA-Q | ||
73 | * unless you really need it! | ||
74 | */ | ||
75 | int __init get_memcfg_numaq(void) | ||
76 | { | ||
77 | smp_dump_qct(); | ||
78 | return 1; | ||
79 | } | ||
80 | |||
81 | static int __init numaq_tsc_disable(void) | ||
82 | { | ||
83 | if (num_online_nodes() > 1) { | ||
84 | printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); | ||
85 | tsc_disable = 1; | ||
86 | } | ||
87 | return 0; | ||
88 | } | ||
89 | arch_initcall(numaq_tsc_disable); | ||
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c new file mode 100644 index 000000000000..739cfb207dd7 --- /dev/null +++ b/arch/x86/kernel/paravirt_32.c | |||
@@ -0,0 +1,392 @@ | |||
1 | /* Paravirtualization interfaces | ||
2 | Copyright (C) 2006 Rusty Russell IBM Corporation | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/efi.h> | ||
21 | #include <linux/bcd.h> | ||
22 | #include <linux/highmem.h> | ||
23 | |||
24 | #include <asm/bug.h> | ||
25 | #include <asm/paravirt.h> | ||
26 | #include <asm/desc.h> | ||
27 | #include <asm/setup.h> | ||
28 | #include <asm/arch_hooks.h> | ||
29 | #include <asm/time.h> | ||
30 | #include <asm/irq.h> | ||
31 | #include <asm/delay.h> | ||
32 | #include <asm/fixmap.h> | ||
33 | #include <asm/apic.h> | ||
34 | #include <asm/tlbflush.h> | ||
35 | #include <asm/timer.h> | ||
36 | |||
/* Stub that deliberately does nothing. */
void _paravirt_nop(void)
{
	return;
}
41 | |||
42 | static void __init default_banner(void) | ||
43 | { | ||
44 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", | ||
45 | paravirt_ops.name); | ||
46 | } | ||
47 | |||
48 | char *memory_setup(void) | ||
49 | { | ||
50 | return paravirt_ops.memory_setup(); | ||
51 | } | ||
52 | |||
53 | /* Simple instruction patching code. */ | ||
54 | #define DEF_NATIVE(name, code) \ | ||
55 | extern const char start_##name[], end_##name[]; \ | ||
56 | asm("start_" #name ": " code "; end_" #name ":") | ||
57 | |||
58 | DEF_NATIVE(irq_disable, "cli"); | ||
59 | DEF_NATIVE(irq_enable, "sti"); | ||
60 | DEF_NATIVE(restore_fl, "push %eax; popf"); | ||
61 | DEF_NATIVE(save_fl, "pushf; pop %eax"); | ||
62 | DEF_NATIVE(iret, "iret"); | ||
63 | DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); | ||
64 | DEF_NATIVE(read_cr2, "mov %cr2, %eax"); | ||
65 | DEF_NATIVE(write_cr3, "mov %eax, %cr3"); | ||
66 | DEF_NATIVE(read_cr3, "mov %cr3, %eax"); | ||
67 | DEF_NATIVE(clts, "clts"); | ||
68 | DEF_NATIVE(read_tsc, "rdtsc"); | ||
69 | |||
70 | DEF_NATIVE(ud2a, "ud2a"); | ||
71 | |||
72 | static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | ||
73 | unsigned long addr, unsigned len) | ||
74 | { | ||
75 | const unsigned char *start, *end; | ||
76 | unsigned ret; | ||
77 | |||
78 | switch(type) { | ||
79 | #define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site | ||
80 | SITE(irq_disable); | ||
81 | SITE(irq_enable); | ||
82 | SITE(restore_fl); | ||
83 | SITE(save_fl); | ||
84 | SITE(iret); | ||
85 | SITE(irq_enable_sysexit); | ||
86 | SITE(read_cr2); | ||
87 | SITE(read_cr3); | ||
88 | SITE(write_cr3); | ||
89 | SITE(clts); | ||
90 | SITE(read_tsc); | ||
91 | #undef SITE | ||
92 | |||
93 | patch_site: | ||
94 | ret = paravirt_patch_insns(ibuf, len, start, end); | ||
95 | break; | ||
96 | |||
97 | case PARAVIRT_PATCH(make_pgd): | ||
98 | case PARAVIRT_PATCH(make_pte): | ||
99 | case PARAVIRT_PATCH(pgd_val): | ||
100 | case PARAVIRT_PATCH(pte_val): | ||
101 | #ifdef CONFIG_X86_PAE | ||
102 | case PARAVIRT_PATCH(make_pmd): | ||
103 | case PARAVIRT_PATCH(pmd_val): | ||
104 | #endif | ||
105 | /* These functions end up returning exactly what | ||
106 | they're passed, in the same registers. */ | ||
107 | ret = paravirt_patch_nop(); | ||
108 | break; | ||
109 | |||
110 | default: | ||
111 | ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); | ||
112 | break; | ||
113 | } | ||
114 | |||
115 | return ret; | ||
116 | } | ||
117 | |||
/* Patch result for a site that needs no instructions: always 0. */
unsigned paravirt_patch_nop(void)
{
	const unsigned none = 0;

	return none;
}

/* Patch result that hands the caller's len straight back. */
unsigned paravirt_patch_ignore(unsigned len)
{
	unsigned unchanged = len;

	return unchanged;
}
127 | |||
128 | struct branch { | ||
129 | unsigned char opcode; | ||
130 | u32 delta; | ||
131 | } __attribute__((packed)); | ||
132 | |||
133 | unsigned paravirt_patch_call(void *insnbuf, | ||
134 | const void *target, u16 tgt_clobbers, | ||
135 | unsigned long addr, u16 site_clobbers, | ||
136 | unsigned len) | ||
137 | { | ||
138 | struct branch *b = insnbuf; | ||
139 | unsigned long delta = (unsigned long)target - (addr+5); | ||
140 | |||
141 | if (tgt_clobbers & ~site_clobbers) | ||
142 | return len; /* target would clobber too much for this site */ | ||
143 | if (len < 5) | ||
144 | return len; /* call too long for patch site */ | ||
145 | |||
146 | b->opcode = 0xe8; /* call */ | ||
147 | b->delta = delta; | ||
148 | BUILD_BUG_ON(sizeof(*b) != 5); | ||
149 | |||
150 | return 5; | ||
151 | } | ||
152 | |||
153 | unsigned paravirt_patch_jmp(const void *target, void *insnbuf, | ||
154 | unsigned long addr, unsigned len) | ||
155 | { | ||
156 | struct branch *b = insnbuf; | ||
157 | unsigned long delta = (unsigned long)target - (addr+5); | ||
158 | |||
159 | if (len < 5) | ||
160 | return len; /* call too long for patch site */ | ||
161 | |||
162 | b->opcode = 0xe9; /* jmp */ | ||
163 | b->delta = delta; | ||
164 | |||
165 | return 5; | ||
166 | } | ||
167 | |||
168 | unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, | ||
169 | unsigned long addr, unsigned len) | ||
170 | { | ||
171 | void *opfunc = *((void **)¶virt_ops + type); | ||
172 | unsigned ret; | ||
173 | |||
174 | if (opfunc == NULL) | ||
175 | /* If there's no function, patch it with a ud2a (BUG) */ | ||
176 | ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); | ||
177 | else if (opfunc == paravirt_nop) | ||
178 | /* If the operation is a nop, then nop the callsite */ | ||
179 | ret = paravirt_patch_nop(); | ||
180 | else if (type == PARAVIRT_PATCH(iret) || | ||
181 | type == PARAVIRT_PATCH(irq_enable_sysexit)) | ||
182 | /* If operation requires a jmp, then jmp */ | ||
183 | ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); | ||
184 | else | ||
185 | /* Otherwise call the function; assume target could | ||
186 | clobber any caller-save reg */ | ||
187 | ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY, | ||
188 | addr, clobbers, len); | ||
189 | |||
190 | return ret; | ||
191 | } | ||
192 | |||
/*
 * Copy the instruction template [start, end) into insnbuf.
 *
 * Returns the number of bytes copied.  When start is NULL or the
 * template is longer than len, nothing is copied and len is returned.
 */
unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end)
{
	unsigned insn_len = end - start;

	if (start == NULL || insn_len > len)
		return len;

	memcpy(insnbuf, start, insn_len);
	return insn_len;
}
205 | |||
206 | void init_IRQ(void) | ||
207 | { | ||
208 | paravirt_ops.init_IRQ(); | ||
209 | } | ||
210 | |||
/* Out-of-line wrapper around __native_flush_tlb(). */
static void native_flush_tlb(void)
{
	__native_flush_tlb();
	return;
}

/*
 * Out-of-line wrapper around __native_flush_tlb_global().
 *
 * Global pages have to be flushed a bit differently. Not a real
 * performance problem because this does not happen often.
 */
static void native_flush_tlb_global(void)
{
	__native_flush_tlb_global();
	return;
}

/* Out-of-line wrapper around __native_flush_tlb_single() for addr. */
static void native_flush_tlb_single(unsigned long addr)
{
	__native_flush_tlb_single(addr);
	return;
}
229 | |||
230 | /* These are in entry.S */ | ||
231 | extern void native_iret(void); | ||
232 | extern void native_irq_enable_sysexit(void); | ||
233 | |||
234 | static int __init print_banner(void) | ||
235 | { | ||
236 | paravirt_ops.banner(); | ||
237 | return 0; | ||
238 | } | ||
239 | core_initcall(print_banner); | ||
240 | |||
241 | static struct resource reserve_ioports = { | ||
242 | .start = 0, | ||
243 | .end = IO_SPACE_LIMIT, | ||
244 | .name = "paravirt-ioport", | ||
245 | .flags = IORESOURCE_IO | IORESOURCE_BUSY, | ||
246 | }; | ||
247 | |||
248 | static struct resource reserve_iomem = { | ||
249 | .start = 0, | ||
250 | .end = -1, | ||
251 | .name = "paravirt-iomem", | ||
252 | .flags = IORESOURCE_MEM | IORESOURCE_BUSY, | ||
253 | }; | ||
254 | |||
255 | /* | ||
256 | * Reserve the whole legacy IO space to prevent any legacy drivers | ||
257 | * from wasting time probing for their hardware. This is a fairly | ||
258 | * brute-force approach to disabling all non-virtual drivers. | ||
259 | * | ||
260 | * Note that this must be called very early to have any effect. | ||
261 | */ | ||
262 | int paravirt_disable_iospace(void) | ||
263 | { | ||
264 | int ret; | ||
265 | |||
266 | ret = request_resource(&ioport_resource, &reserve_ioports); | ||
267 | if (ret == 0) { | ||
268 | ret = request_resource(&iomem_resource, &reserve_iomem); | ||
269 | if (ret) | ||
270 | release_resource(&reserve_ioports); | ||
271 | } | ||
272 | |||
273 | return ret; | ||
274 | } | ||
275 | |||
276 | struct paravirt_ops paravirt_ops = { | ||
277 | .name = "bare hardware", | ||
278 | .paravirt_enabled = 0, | ||
279 | .kernel_rpl = 0, | ||
280 | .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ | ||
281 | |||
282 | .patch = native_patch, | ||
283 | .banner = default_banner, | ||
284 | .arch_setup = paravirt_nop, | ||
285 | .memory_setup = machine_specific_memory_setup, | ||
286 | .get_wallclock = native_get_wallclock, | ||
287 | .set_wallclock = native_set_wallclock, | ||
288 | .time_init = hpet_time_init, | ||
289 | .init_IRQ = native_init_IRQ, | ||
290 | |||
291 | .cpuid = native_cpuid, | ||
292 | .get_debugreg = native_get_debugreg, | ||
293 | .set_debugreg = native_set_debugreg, | ||
294 | .clts = native_clts, | ||
295 | .read_cr0 = native_read_cr0, | ||
296 | .write_cr0 = native_write_cr0, | ||
297 | .read_cr2 = native_read_cr2, | ||
298 | .write_cr2 = native_write_cr2, | ||
299 | .read_cr3 = native_read_cr3, | ||
300 | .write_cr3 = native_write_cr3, | ||
301 | .read_cr4 = native_read_cr4, | ||
302 | .read_cr4_safe = native_read_cr4_safe, | ||
303 | .write_cr4 = native_write_cr4, | ||
304 | .save_fl = native_save_fl, | ||
305 | .restore_fl = native_restore_fl, | ||
306 | .irq_disable = native_irq_disable, | ||
307 | .irq_enable = native_irq_enable, | ||
308 | .safe_halt = native_safe_halt, | ||
309 | .halt = native_halt, | ||
310 | .wbinvd = native_wbinvd, | ||
311 | .read_msr = native_read_msr_safe, | ||
312 | .write_msr = native_write_msr_safe, | ||
313 | .read_tsc = native_read_tsc, | ||
314 | .read_pmc = native_read_pmc, | ||
315 | .sched_clock = native_sched_clock, | ||
316 | .get_cpu_khz = native_calculate_cpu_khz, | ||
317 | .load_tr_desc = native_load_tr_desc, | ||
318 | .set_ldt = native_set_ldt, | ||
319 | .load_gdt = native_load_gdt, | ||
320 | .load_idt = native_load_idt, | ||
321 | .store_gdt = native_store_gdt, | ||
322 | .store_idt = native_store_idt, | ||
323 | .store_tr = native_store_tr, | ||
324 | .load_tls = native_load_tls, | ||
325 | .write_ldt_entry = write_dt_entry, | ||
326 | .write_gdt_entry = write_dt_entry, | ||
327 | .write_idt_entry = write_dt_entry, | ||
328 | .load_esp0 = native_load_esp0, | ||
329 | |||
330 | .set_iopl_mask = native_set_iopl_mask, | ||
331 | .io_delay = native_io_delay, | ||
332 | |||
333 | #ifdef CONFIG_X86_LOCAL_APIC | ||
334 | .apic_write = native_apic_write, | ||
335 | .apic_write_atomic = native_apic_write_atomic, | ||
336 | .apic_read = native_apic_read, | ||
337 | .setup_boot_clock = setup_boot_APIC_clock, | ||
338 | .setup_secondary_clock = setup_secondary_APIC_clock, | ||
339 | .startup_ipi_hook = paravirt_nop, | ||
340 | #endif | ||
341 | .set_lazy_mode = paravirt_nop, | ||
342 | |||
343 | .pagetable_setup_start = native_pagetable_setup_start, | ||
344 | .pagetable_setup_done = native_pagetable_setup_done, | ||
345 | |||
346 | .flush_tlb_user = native_flush_tlb, | ||
347 | .flush_tlb_kernel = native_flush_tlb_global, | ||
348 | .flush_tlb_single = native_flush_tlb_single, | ||
349 | .flush_tlb_others = native_flush_tlb_others, | ||
350 | |||
351 | .alloc_pt = paravirt_nop, | ||
352 | .alloc_pd = paravirt_nop, | ||
353 | .alloc_pd_clone = paravirt_nop, | ||
354 | .release_pt = paravirt_nop, | ||
355 | .release_pd = paravirt_nop, | ||
356 | |||
357 | .set_pte = native_set_pte, | ||
358 | .set_pte_at = native_set_pte_at, | ||
359 | .set_pmd = native_set_pmd, | ||
360 | .pte_update = paravirt_nop, | ||
361 | .pte_update_defer = paravirt_nop, | ||
362 | |||
363 | #ifdef CONFIG_HIGHPTE | ||
364 | .kmap_atomic_pte = kmap_atomic, | ||
365 | #endif | ||
366 | |||
367 | #ifdef CONFIG_X86_PAE | ||
368 | .set_pte_atomic = native_set_pte_atomic, | ||
369 | .set_pte_present = native_set_pte_present, | ||
370 | .set_pud = native_set_pud, | ||
371 | .pte_clear = native_pte_clear, | ||
372 | .pmd_clear = native_pmd_clear, | ||
373 | |||
374 | .pmd_val = native_pmd_val, | ||
375 | .make_pmd = native_make_pmd, | ||
376 | #endif | ||
377 | |||
378 | .pte_val = native_pte_val, | ||
379 | .pgd_val = native_pgd_val, | ||
380 | |||
381 | .make_pte = native_make_pte, | ||
382 | .make_pgd = native_make_pgd, | ||
383 | |||
384 | .irq_enable_sysexit = native_irq_enable_sysexit, | ||
385 | .iret = native_iret, | ||
386 | |||
387 | .dup_mmap = paravirt_nop, | ||
388 | .exit_mmap = paravirt_nop, | ||
389 | .activate_mm = paravirt_nop, | ||
390 | }; | ||
391 | |||
392 | EXPORT_SYMBOL(paravirt_ops); | ||
diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c new file mode 100644 index 000000000000..048f09b62553 --- /dev/null +++ b/arch/x86/kernel/pci-dma_32.c | |||
@@ -0,0 +1,177 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support. | ||
3 | * | ||
4 | * On i386 there is no hardware dynamic DMA address translation, | ||
5 | * so consistent alloc/free are merely page allocation/freeing. | ||
6 | * The rest of the dynamic DMA mapping interface is implemented | ||
7 | * in asm/pci.h. | ||
8 | */ | ||
9 | |||
10 | #include <linux/types.h> | ||
11 | #include <linux/mm.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/pci.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/pci.h> | ||
16 | #include <asm/io.h> | ||
17 | |||
18 | struct dma_coherent_mem { | ||
19 | void *virt_base; | ||
20 | u32 device_base; | ||
21 | int size; | ||
22 | int flags; | ||
23 | unsigned long *bitmap; | ||
24 | }; | ||
25 | |||
26 | void *dma_alloc_coherent(struct device *dev, size_t size, | ||
27 | dma_addr_t *dma_handle, gfp_t gfp) | ||
28 | { | ||
29 | void *ret; | ||
30 | struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | ||
31 | int order = get_order(size); | ||
32 | /* ignore region specifiers */ | ||
33 | gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); | ||
34 | |||
35 | if (mem) { | ||
36 | int page = bitmap_find_free_region(mem->bitmap, mem->size, | ||
37 | order); | ||
38 | if (page >= 0) { | ||
39 | *dma_handle = mem->device_base + (page << PAGE_SHIFT); | ||
40 | ret = mem->virt_base + (page << PAGE_SHIFT); | ||
41 | memset(ret, 0, size); | ||
42 | return ret; | ||
43 | } | ||
44 | if (mem->flags & DMA_MEMORY_EXCLUSIVE) | ||
45 | return NULL; | ||
46 | } | ||
47 | |||
48 | if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) | ||
49 | gfp |= GFP_DMA; | ||
50 | |||
51 | ret = (void *)__get_free_pages(gfp, order); | ||
52 | |||
53 | if (ret != NULL) { | ||
54 | memset(ret, 0, size); | ||
55 | *dma_handle = virt_to_phys(ret); | ||
56 | } | ||
57 | return ret; | ||
58 | } | ||
59 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
60 | |||
61 | void dma_free_coherent(struct device *dev, size_t size, | ||
62 | void *vaddr, dma_addr_t dma_handle) | ||
63 | { | ||
64 | struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | ||
65 | int order = get_order(size); | ||
66 | |||
67 | if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { | ||
68 | int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; | ||
69 | |||
70 | bitmap_release_region(mem->bitmap, page, order); | ||
71 | } else | ||
72 | free_pages((unsigned long)vaddr, order); | ||
73 | } | ||
74 | EXPORT_SYMBOL(dma_free_coherent); | ||
75 | |||
76 | int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, | ||
77 | dma_addr_t device_addr, size_t size, int flags) | ||
78 | { | ||
79 | void __iomem *mem_base = NULL; | ||
80 | int pages = size >> PAGE_SHIFT; | ||
81 | int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); | ||
82 | |||
83 | if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) | ||
84 | goto out; | ||
85 | if (!size) | ||
86 | goto out; | ||
87 | if (dev->dma_mem) | ||
88 | goto out; | ||
89 | |||
90 | /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ | ||
91 | |||
92 | mem_base = ioremap(bus_addr, size); | ||
93 | if (!mem_base) | ||
94 | goto out; | ||
95 | |||
96 | dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); | ||
97 | if (!dev->dma_mem) | ||
98 | goto out; | ||
99 | dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); | ||
100 | if (!dev->dma_mem->bitmap) | ||
101 | goto free1_out; | ||
102 | |||
103 | dev->dma_mem->virt_base = mem_base; | ||
104 | dev->dma_mem->device_base = device_addr; | ||
105 | dev->dma_mem->size = pages; | ||
106 | dev->dma_mem->flags = flags; | ||
107 | |||
108 | if (flags & DMA_MEMORY_MAP) | ||
109 | return DMA_MEMORY_MAP; | ||
110 | |||
111 | return DMA_MEMORY_IO; | ||
112 | |||
113 | free1_out: | ||
114 | kfree(dev->dma_mem); | ||
115 | out: | ||
116 | if (mem_base) | ||
117 | iounmap(mem_base); | ||
118 | return 0; | ||
119 | } | ||
120 | EXPORT_SYMBOL(dma_declare_coherent_memory); | ||
121 | |||
122 | void dma_release_declared_memory(struct device *dev) | ||
123 | { | ||
124 | struct dma_coherent_mem *mem = dev->dma_mem; | ||
125 | |||
126 | if(!mem) | ||
127 | return; | ||
128 | dev->dma_mem = NULL; | ||
129 | iounmap(mem->virt_base); | ||
130 | kfree(mem->bitmap); | ||
131 | kfree(mem); | ||
132 | } | ||
133 | EXPORT_SYMBOL(dma_release_declared_memory); | ||
134 | |||
135 | void *dma_mark_declared_memory_occupied(struct device *dev, | ||
136 | dma_addr_t device_addr, size_t size) | ||
137 | { | ||
138 | struct dma_coherent_mem *mem = dev->dma_mem; | ||
139 | int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
140 | int pos, err; | ||
141 | |||
142 | if (!mem) | ||
143 | return ERR_PTR(-EINVAL); | ||
144 | |||
145 | pos = (device_addr - mem->device_base) >> PAGE_SHIFT; | ||
146 | err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); | ||
147 | if (err != 0) | ||
148 | return ERR_PTR(err); | ||
149 | return mem->virt_base + (pos << PAGE_SHIFT); | ||
150 | } | ||
151 | EXPORT_SYMBOL(dma_mark_declared_memory_occupied); | ||
152 | |||
153 | #ifdef CONFIG_PCI | ||
154 | /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ | ||
155 | |||
156 | int forbid_dac; | ||
157 | EXPORT_SYMBOL(forbid_dac); | ||
158 | |||
159 | static __devinit void via_no_dac(struct pci_dev *dev) | ||
160 | { | ||
161 | if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { | ||
162 | printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); | ||
163 | forbid_dac = 1; | ||
164 | } | ||
165 | } | ||
166 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); | ||
167 | |||
168 | static int check_iommu(char *s) | ||
169 | { | ||
170 | if (!strcmp(s, "usedac")) { | ||
171 | forbid_dac = -1; | ||
172 | return 1; | ||
173 | } | ||
174 | return 0; | ||
175 | } | ||
176 | __setup("iommu=", check_iommu); | ||
177 | #endif | ||
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c new file mode 100644 index 000000000000..bc1f2d3ea277 --- /dev/null +++ b/arch/x86/kernel/pcspeaker.c | |||
@@ -0,0 +1,20 @@ | |||
1 | #include <linux/platform_device.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/init.h> | ||
4 | |||
5 | static __init int add_pcspkr(void) | ||
6 | { | ||
7 | struct platform_device *pd; | ||
8 | int ret; | ||
9 | |||
10 | pd = platform_device_alloc("pcspkr", -1); | ||
11 | if (!pd) | ||
12 | return -ENOMEM; | ||
13 | |||
14 | ret = platform_device_add(pd); | ||
15 | if (ret) | ||
16 | platform_device_put(pd); | ||
17 | |||
18 | return ret; | ||
19 | } | ||
20 | device_initcall(add_pcspkr); | ||
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c new file mode 100644 index 000000000000..84664710b784 --- /dev/null +++ b/arch/x86/kernel/process_32.c | |||
@@ -0,0 +1,951 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/process.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Pentium III FXSR, SSE support | ||
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
8 | */ | ||
9 | |||
10 | /* | ||
11 | * This file handles the architecture-dependent parts of process handling.. | ||
12 | */ | ||
13 | |||
14 | #include <stdarg.h> | ||
15 | |||
16 | #include <linux/cpu.h> | ||
17 | #include <linux/errno.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/elfcore.h> | ||
23 | #include <linux/smp.h> | ||
24 | #include <linux/stddef.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | #include <linux/user.h> | ||
28 | #include <linux/a.out.h> | ||
29 | #include <linux/interrupt.h> | ||
30 | #include <linux/utsname.h> | ||
31 | #include <linux/delay.h> | ||
32 | #include <linux/reboot.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/mc146818rtc.h> | ||
35 | #include <linux/module.h> | ||
36 | #include <linux/kallsyms.h> | ||
37 | #include <linux/ptrace.h> | ||
38 | #include <linux/random.h> | ||
39 | #include <linux/personality.h> | ||
40 | #include <linux/tick.h> | ||
41 | #include <linux/percpu.h> | ||
42 | |||
43 | #include <asm/uaccess.h> | ||
44 | #include <asm/pgtable.h> | ||
45 | #include <asm/system.h> | ||
46 | #include <asm/io.h> | ||
47 | #include <asm/ldt.h> | ||
48 | #include <asm/processor.h> | ||
49 | #include <asm/i387.h> | ||
50 | #include <asm/desc.h> | ||
51 | #include <asm/vm86.h> | ||
52 | #ifdef CONFIG_MATH_EMULATION | ||
53 | #include <asm/math_emu.h> | ||
54 | #endif | ||
55 | |||
56 | #include <linux/err.h> | ||
57 | |||
58 | #include <asm/tlbflush.h> | ||
59 | #include <asm/cpu.h> | ||
60 | |||
61 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | ||
62 | |||
63 | static int hlt_counter; | ||
64 | |||
65 | unsigned long boot_option_idle_override = 0; | ||
66 | EXPORT_SYMBOL(boot_option_idle_override); | ||
67 | |||
68 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | ||
69 | EXPORT_PER_CPU_SYMBOL(current_task); | ||
70 | |||
71 | DEFINE_PER_CPU(int, cpu_number); | ||
72 | EXPORT_PER_CPU_SYMBOL(cpu_number); | ||
73 | |||
74 | /* | ||
75 | * Return saved PC of a blocked thread. | ||
76 | */ | ||
77 | unsigned long thread_saved_pc(struct task_struct *tsk) | ||
78 | { | ||
79 | return ((unsigned long *)tsk->thread.esp)[3]; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Powermanagement idle function, if any.. | ||
84 | */ | ||
85 | void (*pm_idle)(void); | ||
86 | EXPORT_SYMBOL(pm_idle); | ||
87 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | ||
88 | |||
89 | void disable_hlt(void) | ||
90 | { | ||
91 | hlt_counter++; | ||
92 | } | ||
93 | |||
94 | EXPORT_SYMBOL(disable_hlt); | ||
95 | |||
96 | void enable_hlt(void) | ||
97 | { | ||
98 | hlt_counter--; | ||
99 | } | ||
100 | |||
101 | EXPORT_SYMBOL(enable_hlt); | ||
102 | |||
103 | /* | ||
104 | * We use this if we don't have any better | ||
105 | * idle routine.. | ||
106 | */ | ||
107 | void default_idle(void) | ||
108 | { | ||
109 | if (!hlt_counter && boot_cpu_data.hlt_works_ok) { | ||
110 | current_thread_info()->status &= ~TS_POLLING; | ||
111 | /* | ||
112 | * TS_POLLING-cleared state must be visible before we | ||
113 | * test NEED_RESCHED: | ||
114 | */ | ||
115 | smp_mb(); | ||
116 | |||
117 | local_irq_disable(); | ||
118 | if (!need_resched()) | ||
119 | safe_halt(); /* enables interrupts racelessly */ | ||
120 | else | ||
121 | local_irq_enable(); | ||
122 | current_thread_info()->status |= TS_POLLING; | ||
123 | } else { | ||
124 | /* loop is done by the caller */ | ||
125 | cpu_relax(); | ||
126 | } | ||
127 | } | ||
128 | #ifdef CONFIG_APM_MODULE | ||
129 | EXPORT_SYMBOL(default_idle); | ||
130 | #endif | ||
131 | |||
/*
 * Polling idle routine: a single cpu_relax() per call.
 *
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->work.need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive.  Use this option with caution.
 */
static void poll_idle(void)
{
	cpu_relax();
	return;
}
141 | |||
#ifdef CONFIG_HOTPLUG_CPU
#include <asm/nmi.h>
/*
 * Park an offlined CPU.  We don't actually take the CPU down, just
 * spin in halt() with interrupts disabled.  Never returns.
 */
static inline void play_dead(void)
{
	/* This must be done before dead CPU ack */
	cpu_exit_clear();
	wbinvd();
	mb();

	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	/*
	 * With physical CPU hotplug, we should halt the cpu
	 */
	local_irq_disable();
	for (;;)
		halt();
}
#else
/* Without CPU hotplug this is treated as a bug. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
167 | |||
168 | /* | ||
169 | * The idle thread. There's no useful work to be | ||
170 | * done, so just try to conserve power and have a | ||
171 | * low exit latency (ie sit in a loop waiting for | ||
172 | * somebody to say that they'd like to reschedule) | ||
173 | */ | ||
174 | void cpu_idle(void) | ||
175 | { | ||
176 | int cpu = smp_processor_id(); | ||
177 | |||
178 | current_thread_info()->status |= TS_POLLING; | ||
179 | |||
180 | /* endless idle loop with no priority at all */ | ||
181 | while (1) { | ||
182 | tick_nohz_stop_sched_tick(); | ||
183 | while (!need_resched()) { | ||
184 | void (*idle)(void); | ||
185 | |||
186 | if (__get_cpu_var(cpu_idle_state)) | ||
187 | __get_cpu_var(cpu_idle_state) = 0; | ||
188 | |||
189 | check_pgt_cache(); | ||
190 | rmb(); | ||
191 | idle = pm_idle; | ||
192 | |||
193 | if (!idle) | ||
194 | idle = default_idle; | ||
195 | |||
196 | if (cpu_is_offline(cpu)) | ||
197 | play_dead(); | ||
198 | |||
199 | __get_cpu_var(irq_stat).idle_timestamp = jiffies; | ||
200 | idle(); | ||
201 | } | ||
202 | tick_nohz_restart_sched_tick(); | ||
203 | preempt_enable_no_resched(); | ||
204 | schedule(); | ||
205 | preempt_disable(); | ||
206 | } | ||
207 | } | ||
208 | |||
209 | void cpu_idle_wait(void) | ||
210 | { | ||
211 | unsigned int cpu, this_cpu = get_cpu(); | ||
212 | cpumask_t map, tmp = current->cpus_allowed; | ||
213 | |||
214 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | ||
215 | put_cpu(); | ||
216 | |||
217 | cpus_clear(map); | ||
218 | for_each_online_cpu(cpu) { | ||
219 | per_cpu(cpu_idle_state, cpu) = 1; | ||
220 | cpu_set(cpu, map); | ||
221 | } | ||
222 | |||
223 | __get_cpu_var(cpu_idle_state) = 0; | ||
224 | |||
225 | wmb(); | ||
226 | do { | ||
227 | ssleep(1); | ||
228 | for_each_online_cpu(cpu) { | ||
229 | if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) | ||
230 | cpu_clear(cpu, map); | ||
231 | } | ||
232 | cpus_and(map, map, cpu_online_map); | ||
233 | } while (!cpus_empty(map)); | ||
234 | |||
235 | set_cpus_allowed(current, tmp); | ||
236 | } | ||
237 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | ||
238 | |||
239 | /* | ||
240 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
241 | * which can obviate IPI to trigger checking of need_resched. | ||
242 | * We execute MONITOR against need_resched and enter optimized wait state | ||
243 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
244 | * up from MWAIT (without an IPI). | ||
245 | * | ||
246 | * New with Core Duo processors, MWAIT can take some hints based on CPU | ||
247 | * capability. | ||
248 | */ | ||
249 | void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) | ||
250 | { | ||
251 | if (!need_resched()) { | ||
252 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
253 | smp_mb(); | ||
254 | if (!need_resched()) | ||
255 | __mwait(eax, ecx); | ||
256 | } | ||
257 | } | ||
258 | |||
/*
 * Default MONITOR/MWAIT idle routine with no hints, used for the
 * default C1 state.  Interrupts are enabled before waiting.
 */
static void mwait_idle(void)
{
	const unsigned long no_hint = 0;

	local_irq_enable();
	mwait_idle_with_hints(no_hint, no_hint);
}
265 | |||
266 | void __devinit select_idle_routine(const struct cpuinfo_x86 *c) | ||
267 | { | ||
268 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | ||
269 | printk("monitor/mwait feature present.\n"); | ||
270 | /* | ||
271 | * Skip, if setup has overridden idle. | ||
272 | * One CPU supports mwait => All CPUs supports mwait | ||
273 | */ | ||
274 | if (!pm_idle) { | ||
275 | printk("using mwait in idle threads.\n"); | ||
276 | pm_idle = mwait_idle; | ||
277 | } | ||
278 | } | ||
279 | } | ||
280 | |||
281 | static int __init idle_setup(char *str) | ||
282 | { | ||
283 | if (!strcmp(str, "poll")) { | ||
284 | printk("using polling idle threads.\n"); | ||
285 | pm_idle = poll_idle; | ||
286 | #ifdef CONFIG_X86_SMP | ||
287 | if (smp_num_siblings > 1) | ||
288 | printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); | ||
289 | #endif | ||
290 | } else if (!strcmp(str, "mwait")) | ||
291 | force_mwait = 1; | ||
292 | else | ||
293 | return -1; | ||
294 | |||
295 | boot_option_idle_override = 1; | ||
296 | return 0; | ||
297 | } | ||
298 | early_param("idle", idle_setup); | ||
299 | |||
300 | void show_regs(struct pt_regs * regs) | ||
301 | { | ||
302 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; | ||
303 | unsigned long d0, d1, d2, d3, d6, d7; | ||
304 | |||
305 | printk("\n"); | ||
306 | printk("Pid: %d, comm: %20s\n", current->pid, current->comm); | ||
307 | printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); | ||
308 | print_symbol("EIP is at %s\n", regs->eip); | ||
309 | |||
310 | if (user_mode_vm(regs)) | ||
311 | printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); | ||
312 | printk(" EFLAGS: %08lx %s (%s %.*s)\n", | ||
313 | regs->eflags, print_tainted(), init_utsname()->release, | ||
314 | (int)strcspn(init_utsname()->version, " "), | ||
315 | init_utsname()->version); | ||
316 | printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", | ||
317 | regs->eax,regs->ebx,regs->ecx,regs->edx); | ||
318 | printk("ESI: %08lx EDI: %08lx EBP: %08lx", | ||
319 | regs->esi, regs->edi, regs->ebp); | ||
320 | printk(" DS: %04x ES: %04x FS: %04x\n", | ||
321 | 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); | ||
322 | |||
323 | cr0 = read_cr0(); | ||
324 | cr2 = read_cr2(); | ||
325 | cr3 = read_cr3(); | ||
326 | cr4 = read_cr4_safe(); | ||
327 | printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); | ||
328 | |||
329 | get_debugreg(d0, 0); | ||
330 | get_debugreg(d1, 1); | ||
331 | get_debugreg(d2, 2); | ||
332 | get_debugreg(d3, 3); | ||
333 | printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", | ||
334 | d0, d1, d2, d3); | ||
335 | get_debugreg(d6, 6); | ||
336 | get_debugreg(d7, 7); | ||
337 | printk("DR6: %08lx DR7: %08lx\n", d6, d7); | ||
338 | |||
339 | show_trace(NULL, regs, ®s->esp); | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * This gets run with %ebx containing the | ||
344 | * function to call, and %edx containing | ||
345 | * the "args". | ||
346 | */ | ||
347 | extern void kernel_thread_helper(void); | ||
348 | |||
349 | /* | ||
350 | * Create a kernel thread | ||
351 | */ | ||
352 | int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | ||
353 | { | ||
354 | struct pt_regs regs; | ||
355 | |||
356 | memset(®s, 0, sizeof(regs)); | ||
357 | |||
358 | regs.ebx = (unsigned long) fn; | ||
359 | regs.edx = (unsigned long) arg; | ||
360 | |||
361 | regs.xds = __USER_DS; | ||
362 | regs.xes = __USER_DS; | ||
363 | regs.xfs = __KERNEL_PERCPU; | ||
364 | regs.orig_eax = -1; | ||
365 | regs.eip = (unsigned long) kernel_thread_helper; | ||
366 | regs.xcs = __KERNEL_CS | get_kernel_rpl(); | ||
367 | regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; | ||
368 | |||
369 | /* Ok, create the new process.. */ | ||
370 | return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); | ||
371 | } | ||
372 | EXPORT_SYMBOL(kernel_thread); | ||
373 | |||
374 | /* | ||
375 | * Free current thread data structures etc.. | ||
376 | */ | ||
377 | void exit_thread(void) | ||
378 | { | ||
379 | /* The process may have allocated an io port bitmap... nuke it. */ | ||
380 | if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { | ||
381 | struct task_struct *tsk = current; | ||
382 | struct thread_struct *t = &tsk->thread; | ||
383 | int cpu = get_cpu(); | ||
384 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
385 | |||
386 | kfree(t->io_bitmap_ptr); | ||
387 | t->io_bitmap_ptr = NULL; | ||
388 | clear_thread_flag(TIF_IO_BITMAP); | ||
389 | /* | ||
390 | * Careful, clear this in the TSS too: | ||
391 | */ | ||
392 | memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); | ||
393 | t->io_bitmap_max = 0; | ||
394 | tss->io_bitmap_owner = NULL; | ||
395 | tss->io_bitmap_max = 0; | ||
396 | tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; | ||
397 | put_cpu(); | ||
398 | } | ||
399 | } | ||
400 | |||
401 | void flush_thread(void) | ||
402 | { | ||
403 | struct task_struct *tsk = current; | ||
404 | |||
405 | memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); | ||
406 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | ||
407 | clear_tsk_thread_flag(tsk, TIF_DEBUG); | ||
408 | /* | ||
409 | * Forget coprocessor state.. | ||
410 | */ | ||
411 | clear_fpu(tsk); | ||
412 | clear_used_math(); | ||
413 | } | ||
414 | |||
415 | void release_thread(struct task_struct *dead_task) | ||
416 | { | ||
417 | BUG_ON(dead_task->mm); | ||
418 | release_vm86_irqs(dead_task); | ||
419 | } | ||
420 | |||
/*
 * Called before a new thread is allocated and the current task is
 * copied into it.  The only work done here is unlazy_fpu() on @tsk.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
429 | |||
430 | int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, | ||
431 | unsigned long unused, | ||
432 | struct task_struct * p, struct pt_regs * regs) | ||
433 | { | ||
434 | struct pt_regs * childregs; | ||
435 | struct task_struct *tsk; | ||
436 | int err; | ||
437 | |||
438 | childregs = task_pt_regs(p); | ||
439 | *childregs = *regs; | ||
440 | childregs->eax = 0; | ||
441 | childregs->esp = esp; | ||
442 | |||
443 | p->thread.esp = (unsigned long) childregs; | ||
444 | p->thread.esp0 = (unsigned long) (childregs+1); | ||
445 | |||
446 | p->thread.eip = (unsigned long) ret_from_fork; | ||
447 | |||
448 | savesegment(gs,p->thread.gs); | ||
449 | |||
450 | tsk = current; | ||
451 | if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { | ||
452 | p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, | ||
453 | IO_BITMAP_BYTES, GFP_KERNEL); | ||
454 | if (!p->thread.io_bitmap_ptr) { | ||
455 | p->thread.io_bitmap_max = 0; | ||
456 | return -ENOMEM; | ||
457 | } | ||
458 | set_tsk_thread_flag(p, TIF_IO_BITMAP); | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * Set a new TLS for the child thread? | ||
463 | */ | ||
464 | if (clone_flags & CLONE_SETTLS) { | ||
465 | struct desc_struct *desc; | ||
466 | struct user_desc info; | ||
467 | int idx; | ||
468 | |||
469 | err = -EFAULT; | ||
470 | if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) | ||
471 | goto out; | ||
472 | err = -EINVAL; | ||
473 | if (LDT_empty(&info)) | ||
474 | goto out; | ||
475 | |||
476 | idx = info.entry_number; | ||
477 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
478 | goto out; | ||
479 | |||
480 | desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | ||
481 | desc->a = LDT_entry_a(&info); | ||
482 | desc->b = LDT_entry_b(&info); | ||
483 | } | ||
484 | |||
485 | err = 0; | ||
486 | out: | ||
487 | if (err && p->thread.io_bitmap_ptr) { | ||
488 | kfree(p->thread.io_bitmap_ptr); | ||
489 | p->thread.io_bitmap_max = 0; | ||
490 | } | ||
491 | return err; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * fill in the user structure for a core dump.. | ||
496 | */ | ||
497 | void dump_thread(struct pt_regs * regs, struct user * dump) | ||
498 | { | ||
499 | int i; | ||
500 | |||
501 | /* changed the size calculations - should hopefully work better. lbt */ | ||
502 | dump->magic = CMAGIC; | ||
503 | dump->start_code = 0; | ||
504 | dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); | ||
505 | dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; | ||
506 | dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; | ||
507 | dump->u_dsize -= dump->u_tsize; | ||
508 | dump->u_ssize = 0; | ||
509 | for (i = 0; i < 8; i++) | ||
510 | dump->u_debugreg[i] = current->thread.debugreg[i]; | ||
511 | |||
512 | if (dump->start_stack < TASK_SIZE) | ||
513 | dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; | ||
514 | |||
515 | dump->regs.ebx = regs->ebx; | ||
516 | dump->regs.ecx = regs->ecx; | ||
517 | dump->regs.edx = regs->edx; | ||
518 | dump->regs.esi = regs->esi; | ||
519 | dump->regs.edi = regs->edi; | ||
520 | dump->regs.ebp = regs->ebp; | ||
521 | dump->regs.eax = regs->eax; | ||
522 | dump->regs.ds = regs->xds; | ||
523 | dump->regs.es = regs->xes; | ||
524 | dump->regs.fs = regs->xfs; | ||
525 | savesegment(gs,dump->regs.gs); | ||
526 | dump->regs.orig_eax = regs->orig_eax; | ||
527 | dump->regs.eip = regs->eip; | ||
528 | dump->regs.cs = regs->xcs; | ||
529 | dump->regs.eflags = regs->eflags; | ||
530 | dump->regs.esp = regs->esp; | ||
531 | dump->regs.ss = regs->xss; | ||
532 | |||
533 | dump->u_fpvalid = dump_fpu (regs, &dump->i387); | ||
534 | } | ||
535 | EXPORT_SYMBOL(dump_thread); | ||
536 | |||
537 | /* | ||
538 | * Capture the user space registers if the task is not running (in user space) | ||
539 | */ | ||
540 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | ||
541 | { | ||
542 | struct pt_regs ptregs = *task_pt_regs(tsk); | ||
543 | ptregs.xcs &= 0xffff; | ||
544 | ptregs.xds &= 0xffff; | ||
545 | ptregs.xes &= 0xffff; | ||
546 | ptregs.xss &= 0xffff; | ||
547 | |||
548 | elf_core_copy_regs(regs, &ptregs); | ||
549 | |||
550 | return 1; | ||
551 | } | ||
552 | |||
#ifdef CONFIG_SECCOMP
/* Set CR4.TSD on the current CPU. */
void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

/*
 * Mark the current thread TIF_NOTSC and, if the flag was not already
 * set, set CR4.TSD right away.  Preemption is off in between so the
 * CPU state is flipped synchronously with the flag in the currently
 * running context.
 */
void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC)) {
		hard_disable_TSC();
	}
	preempt_enable();
}

/* Clear CR4.TSD on the current CPU. */
void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}
#endif /* CONFIG_SECCOMP */
574 | |||
575 | static noinline void | ||
576 | __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | ||
577 | struct tss_struct *tss) | ||
578 | { | ||
579 | struct thread_struct *next; | ||
580 | |||
581 | next = &next_p->thread; | ||
582 | |||
583 | if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { | ||
584 | set_debugreg(next->debugreg[0], 0); | ||
585 | set_debugreg(next->debugreg[1], 1); | ||
586 | set_debugreg(next->debugreg[2], 2); | ||
587 | set_debugreg(next->debugreg[3], 3); | ||
588 | /* no 4 and 5 */ | ||
589 | set_debugreg(next->debugreg[6], 6); | ||
590 | set_debugreg(next->debugreg[7], 7); | ||
591 | } | ||
592 | |||
593 | #ifdef CONFIG_SECCOMP | ||
594 | if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ | ||
595 | test_tsk_thread_flag(next_p, TIF_NOTSC)) { | ||
596 | /* prev and next are different */ | ||
597 | if (test_tsk_thread_flag(next_p, TIF_NOTSC)) | ||
598 | hard_disable_TSC(); | ||
599 | else | ||
600 | hard_enable_TSC(); | ||
601 | } | ||
602 | #endif | ||
603 | |||
604 | if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { | ||
605 | /* | ||
606 | * Disable the bitmap via an invalid offset. We still cache | ||
607 | * the previous bitmap owner and the IO bitmap contents: | ||
608 | */ | ||
609 | tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; | ||
610 | return; | ||
611 | } | ||
612 | |||
613 | if (likely(next == tss->io_bitmap_owner)) { | ||
614 | /* | ||
615 | * Previous owner of the bitmap (hence the bitmap content) | ||
616 | * matches the next task, we dont have to do anything but | ||
617 | * to set a valid offset in the TSS: | ||
618 | */ | ||
619 | tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; | ||
620 | return; | ||
621 | } | ||
622 | /* | ||
623 | * Lazy TSS's I/O bitmap copy. We set an invalid offset here | ||
624 | * and we let the task to get a GPF in case an I/O instruction | ||
625 | * is performed. The handler of the GPF will verify that the | ||
626 | * faulting task has a valid I/O bitmap and, it true, does the | ||
627 | * real copy and restart the instruction. This will save us | ||
628 | * redundant copies when the currently switched task does not | ||
629 | * perform any I/O during its timeslice. | ||
630 | */ | ||
631 | tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * switch_to(x,yn) should switch tasks from x to y. | ||
636 | * | ||
637 | * We fsave/fwait so that an exception goes off at the right time | ||
638 | * (as a call from the fsave or fwait in effect) rather than to | ||
639 | * the wrong process. Lazy FP saving no longer makes any sense | ||
640 | * with modern CPU's, and this simplifies a lot of things (SMP | ||
641 | * and UP become the same). | ||
642 | * | ||
643 | * NOTE! We used to use the x86 hardware context switching. The | ||
644 | * reason for not using it any more becomes apparent when you | ||
645 | * try to recover gracefully from saved state that is no longer | ||
646 | * valid (stale segment register values in particular). With the | ||
647 | * hardware task-switch, there is no way to fix up bad state in | ||
648 | * a reasonable manner. | ||
649 | * | ||
650 | * The fact that Intel documents the hardware task-switching to | ||
651 | * be slow is a fairly red herring - this code is not noticeably | ||
652 | * faster. However, there _is_ some room for improvement here, | ||
653 | * so the performance issues may eventually be a valid point. | ||
654 | * More important, however, is the fact that this allows us much | ||
655 | * more flexibility. | ||
656 | * | ||
657 | * The return value (in %eax) will be the "prev" task after | ||
658 | * the task-switch, and shows up in ret_from_fork in entry.S, | ||
659 | * for example. | ||
660 | */ | ||
661 | struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
662 | { | ||
663 | struct thread_struct *prev = &prev_p->thread, | ||
664 | *next = &next_p->thread; | ||
665 | int cpu = smp_processor_id(); | ||
666 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
667 | |||
668 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | ||
669 | |||
670 | __unlazy_fpu(prev_p); | ||
671 | |||
672 | |||
673 | /* we're going to use this soon, after a few expensive things */ | ||
674 | if (next_p->fpu_counter > 5) | ||
675 | prefetch(&next->i387.fxsave); | ||
676 | |||
677 | /* | ||
678 | * Reload esp0. | ||
679 | */ | ||
680 | load_esp0(tss, next); | ||
681 | |||
682 | /* | ||
683 | * Save away %gs. No need to save %fs, as it was saved on the | ||
684 | * stack on entry. No need to save %es and %ds, as those are | ||
685 | * always kernel segments while inside the kernel. Doing this | ||
686 | * before setting the new TLS descriptors avoids the situation | ||
687 | * where we temporarily have non-reloadable segments in %fs | ||
688 | * and %gs. This could be an issue if the NMI handler ever | ||
689 | * used %fs or %gs (it does not today), or if the kernel is | ||
690 | * running inside of a hypervisor layer. | ||
691 | */ | ||
692 | savesegment(gs, prev->gs); | ||
693 | |||
694 | /* | ||
695 | * Load the per-thread Thread-Local Storage descriptor. | ||
696 | */ | ||
697 | load_TLS(next, cpu); | ||
698 | |||
699 | /* | ||
700 | * Restore IOPL if needed. In normal use, the flags restore | ||
701 | * in the switch assembly will handle this. But if the kernel | ||
702 | * is running virtualized at a non-zero CPL, the popf will | ||
703 | * not restore flags, so it must be done in a separate step. | ||
704 | */ | ||
705 | if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) | ||
706 | set_iopl_mask(next->iopl); | ||
707 | |||
708 | /* | ||
709 | * Now maybe handle debug registers and/or IO bitmaps | ||
710 | */ | ||
711 | if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || | ||
712 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) | ||
713 | __switch_to_xtra(prev_p, next_p, tss); | ||
714 | |||
715 | /* | ||
716 | * Leave lazy mode, flushing any hypercalls made here. | ||
717 | * This must be done before restoring TLS segments so | ||
718 | * the GDT and LDT are properly updated, and must be | ||
719 | * done before math_state_restore, so the TS bit is up | ||
720 | * to date. | ||
721 | */ | ||
722 | arch_leave_lazy_cpu_mode(); | ||
723 | |||
724 | /* If the task has used fpu the last 5 timeslices, just do a full | ||
725 | * restore of the math state immediately to avoid the trap; the | ||
726 | * chances of needing FPU soon are obviously high now | ||
727 | */ | ||
728 | if (next_p->fpu_counter > 5) | ||
729 | math_state_restore(); | ||
730 | |||
731 | /* | ||
732 | * Restore %gs if needed (which is common) | ||
733 | */ | ||
734 | if (prev->gs | next->gs) | ||
735 | loadsegment(gs, next->gs); | ||
736 | |||
737 | x86_write_percpu(current_task, next_p); | ||
738 | |||
739 | return prev_p; | ||
740 | } | ||
741 | |||
742 | asmlinkage int sys_fork(struct pt_regs regs) | ||
743 | { | ||
744 | return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); | ||
745 | } | ||
746 | |||
747 | asmlinkage int sys_clone(struct pt_regs regs) | ||
748 | { | ||
749 | unsigned long clone_flags; | ||
750 | unsigned long newsp; | ||
751 | int __user *parent_tidptr, *child_tidptr; | ||
752 | |||
753 | clone_flags = regs.ebx; | ||
754 | newsp = regs.ecx; | ||
755 | parent_tidptr = (int __user *)regs.edx; | ||
756 | child_tidptr = (int __user *)regs.edi; | ||
757 | if (!newsp) | ||
758 | newsp = regs.esp; | ||
759 | return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); | ||
760 | } | ||
761 | |||
762 | /* | ||
763 | * This is trivial, and on the face of it looks like it | ||
764 | * could equally well be done in user mode. | ||
765 | * | ||
766 | * Not so, for quite unobvious reasons - register pressure. | ||
767 | * In user mode vfork() cannot have a stack frame, and if | ||
768 | * done by calling the "clone()" system call directly, you | ||
769 | * do not have enough call-clobbered registers to hold all | ||
770 | * the information you need. | ||
771 | */ | ||
772 | asmlinkage int sys_vfork(struct pt_regs regs) | ||
773 | { | ||
774 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); | ||
775 | } | ||
776 | |||
777 | /* | ||
778 | * sys_execve() executes a new program. | ||
779 | */ | ||
780 | asmlinkage int sys_execve(struct pt_regs regs) | ||
781 | { | ||
782 | int error; | ||
783 | char * filename; | ||
784 | |||
785 | filename = getname((char __user *) regs.ebx); | ||
786 | error = PTR_ERR(filename); | ||
787 | if (IS_ERR(filename)) | ||
788 | goto out; | ||
789 | error = do_execve(filename, | ||
790 | (char __user * __user *) regs.ecx, | ||
791 | (char __user * __user *) regs.edx, | ||
792 | ®s); | ||
793 | if (error == 0) { | ||
794 | task_lock(current); | ||
795 | current->ptrace &= ~PT_DTRACE; | ||
796 | task_unlock(current); | ||
797 | /* Make sure we don't return using sysenter.. */ | ||
798 | set_thread_flag(TIF_IRET); | ||
799 | } | ||
800 | putname(filename); | ||
801 | out: | ||
802 | return error; | ||
803 | } | ||
804 | |||
805 | #define top_esp (THREAD_SIZE - sizeof(unsigned long)) | ||
806 | #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) | ||
807 | |||
808 | unsigned long get_wchan(struct task_struct *p) | ||
809 | { | ||
810 | unsigned long ebp, esp, eip; | ||
811 | unsigned long stack_page; | ||
812 | int count = 0; | ||
813 | if (!p || p == current || p->state == TASK_RUNNING) | ||
814 | return 0; | ||
815 | stack_page = (unsigned long)task_stack_page(p); | ||
816 | esp = p->thread.esp; | ||
817 | if (!stack_page || esp < stack_page || esp > top_esp+stack_page) | ||
818 | return 0; | ||
819 | /* include/asm-i386/system.h:switch_to() pushes ebp last. */ | ||
820 | ebp = *(unsigned long *) esp; | ||
821 | do { | ||
822 | if (ebp < stack_page || ebp > top_ebp+stack_page) | ||
823 | return 0; | ||
824 | eip = *(unsigned long *) (ebp+4); | ||
825 | if (!in_sched_functions(eip)) | ||
826 | return eip; | ||
827 | ebp = *(unsigned long *) ebp; | ||
828 | } while (count++ < 16); | ||
829 | return 0; | ||
830 | } | ||
831 | |||
832 | /* | ||
833 | * sys_alloc_thread_area: get a yet unused TLS descriptor index. | ||
834 | */ | ||
835 | static int get_free_idx(void) | ||
836 | { | ||
837 | struct thread_struct *t = ¤t->thread; | ||
838 | int idx; | ||
839 | |||
840 | for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) | ||
841 | if (desc_empty(t->tls_array + idx)) | ||
842 | return idx + GDT_ENTRY_TLS_MIN; | ||
843 | return -ESRCH; | ||
844 | } | ||
845 | |||
846 | /* | ||
847 | * Set a given TLS descriptor: | ||
848 | */ | ||
849 | asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) | ||
850 | { | ||
851 | struct thread_struct *t = ¤t->thread; | ||
852 | struct user_desc info; | ||
853 | struct desc_struct *desc; | ||
854 | int cpu, idx; | ||
855 | |||
856 | if (copy_from_user(&info, u_info, sizeof(info))) | ||
857 | return -EFAULT; | ||
858 | idx = info.entry_number; | ||
859 | |||
860 | /* | ||
861 | * index -1 means the kernel should try to find and | ||
862 | * allocate an empty descriptor: | ||
863 | */ | ||
864 | if (idx == -1) { | ||
865 | idx = get_free_idx(); | ||
866 | if (idx < 0) | ||
867 | return idx; | ||
868 | if (put_user(idx, &u_info->entry_number)) | ||
869 | return -EFAULT; | ||
870 | } | ||
871 | |||
872 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
873 | return -EINVAL; | ||
874 | |||
875 | desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; | ||
876 | |||
877 | /* | ||
878 | * We must not get preempted while modifying the TLS. | ||
879 | */ | ||
880 | cpu = get_cpu(); | ||
881 | |||
882 | if (LDT_empty(&info)) { | ||
883 | desc->a = 0; | ||
884 | desc->b = 0; | ||
885 | } else { | ||
886 | desc->a = LDT_entry_a(&info); | ||
887 | desc->b = LDT_entry_b(&info); | ||
888 | } | ||
889 | load_TLS(t, cpu); | ||
890 | |||
891 | put_cpu(); | ||
892 | |||
893 | return 0; | ||
894 | } | ||
895 | |||
896 | /* | ||
897 | * Get the current Thread-Local Storage area: | ||
898 | */ | ||
899 | |||
900 | #define GET_BASE(desc) ( \ | ||
901 | (((desc)->a >> 16) & 0x0000ffff) | \ | ||
902 | (((desc)->b << 16) & 0x00ff0000) | \ | ||
903 | ( (desc)->b & 0xff000000) ) | ||
904 | |||
905 | #define GET_LIMIT(desc) ( \ | ||
906 | ((desc)->a & 0x0ffff) | \ | ||
907 | ((desc)->b & 0xf0000) ) | ||
908 | |||
909 | #define GET_32BIT(desc) (((desc)->b >> 22) & 1) | ||
910 | #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) | ||
911 | #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) | ||
912 | #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) | ||
913 | #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) | ||
914 | #define GET_USEABLE(desc) (((desc)->b >> 20) & 1) | ||
915 | |||
916 | asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) | ||
917 | { | ||
918 | struct user_desc info; | ||
919 | struct desc_struct *desc; | ||
920 | int idx; | ||
921 | |||
922 | if (get_user(idx, &u_info->entry_number)) | ||
923 | return -EFAULT; | ||
924 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
925 | return -EINVAL; | ||
926 | |||
927 | memset(&info, 0, sizeof(info)); | ||
928 | |||
929 | desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | ||
930 | |||
931 | info.entry_number = idx; | ||
932 | info.base_addr = GET_BASE(desc); | ||
933 | info.limit = GET_LIMIT(desc); | ||
934 | info.seg_32bit = GET_32BIT(desc); | ||
935 | info.contents = GET_CONTENTS(desc); | ||
936 | info.read_exec_only = !GET_WRITABLE(desc); | ||
937 | info.limit_in_pages = GET_LIMIT_PAGES(desc); | ||
938 | info.seg_not_present = !GET_PRESENT(desc); | ||
939 | info.useable = GET_USEABLE(desc); | ||
940 | |||
941 | if (copy_to_user(u_info, &info, sizeof(info))) | ||
942 | return -EFAULT; | ||
943 | return 0; | ||
944 | } | ||
945 | |||
946 | unsigned long arch_align_stack(unsigned long sp) | ||
947 | { | ||
948 | if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) | ||
949 | sp -= get_random_int() % 8192; | ||
950 | return sp & ~0xf; | ||
951 | } | ||
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c new file mode 100644 index 000000000000..7c1b92522e95 --- /dev/null +++ b/arch/x86/kernel/ptrace_32.c | |||
@@ -0,0 +1,723 @@ | |||
1 | /* ptrace.c */ | ||
2 | /* By Ross Biro 1/23/92 */ | ||
3 | /* | ||
4 | * Pentium III FXSR, SSE support | ||
5 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/smp.h> | ||
12 | #include <linux/errno.h> | ||
13 | #include <linux/ptrace.h> | ||
14 | #include <linux/user.h> | ||
15 | #include <linux/security.h> | ||
16 | #include <linux/audit.h> | ||
17 | #include <linux/seccomp.h> | ||
18 | #include <linux/signal.h> | ||
19 | |||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/pgtable.h> | ||
22 | #include <asm/system.h> | ||
23 | #include <asm/processor.h> | ||
24 | #include <asm/i387.h> | ||
25 | #include <asm/debugreg.h> | ||
26 | #include <asm/ldt.h> | ||
27 | #include <asm/desc.h> | ||
28 | |||
29 | /* | ||
30 | * does not yet catch signals sent when the child dies. | ||
31 | * in exit.c or in signal.c. | ||
32 | */ | ||
33 | |||
/*
 * EFLAGS bits a tracer may modify [1 = may change, 0 = may not].
 * Not changeable: ID(21), VIP(20), VIF(19), VM(17), NT(14),
 * IOPL(12-13), IF(9), and the reserved bits (31-22, 15, 5, 3, 1).
 */
#define FLAG_MASK 0x00050dd5

/* Sets the trap flag (EFLAGS.TF). */
#define TRAP_FLAG 0x100

/*
 * Byte offset of eflags within the pt_regs frame on the child's stack.
 */
#define EFL_OFFSET offsetof(struct pt_regs, eflags)
48 | |||
49 | static inline struct pt_regs *get_child_regs(struct task_struct *task) | ||
50 | { | ||
51 | void *stack_top = (void *)task->thread.esp0; | ||
52 | return stack_top - sizeof(struct pt_regs); | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * This routine will get a word off of the processes privileged stack. | ||
57 | * the offset is bytes into the pt_regs structure on the stack. | ||
58 | * This routine assumes that all the privileged stacks are in our | ||
59 | * data space. | ||
60 | */ | ||
61 | static inline int get_stack_long(struct task_struct *task, int offset) | ||
62 | { | ||
63 | unsigned char *stack; | ||
64 | |||
65 | stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); | ||
66 | stack += offset; | ||
67 | return (*((int *)stack)); | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * This routine will put a word on the processes privileged stack. | ||
72 | * the offset is bytes into the pt_regs structure on the stack. | ||
73 | * This routine assumes that all the privileged stacks are in our | ||
74 | * data space. | ||
75 | */ | ||
76 | static inline int put_stack_long(struct task_struct *task, int offset, | ||
77 | unsigned long data) | ||
78 | { | ||
79 | unsigned char * stack; | ||
80 | |||
81 | stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); | ||
82 | stack += offset; | ||
83 | *(unsigned long *) stack = data; | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | static int putreg(struct task_struct *child, | ||
88 | unsigned long regno, unsigned long value) | ||
89 | { | ||
90 | switch (regno >> 2) { | ||
91 | case GS: | ||
92 | if (value && (value & 3) != 3) | ||
93 | return -EIO; | ||
94 | child->thread.gs = value; | ||
95 | return 0; | ||
96 | case DS: | ||
97 | case ES: | ||
98 | case FS: | ||
99 | if (value && (value & 3) != 3) | ||
100 | return -EIO; | ||
101 | value &= 0xffff; | ||
102 | break; | ||
103 | case SS: | ||
104 | case CS: | ||
105 | if ((value & 3) != 3) | ||
106 | return -EIO; | ||
107 | value &= 0xffff; | ||
108 | break; | ||
109 | case EFL: | ||
110 | value &= FLAG_MASK; | ||
111 | value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; | ||
112 | break; | ||
113 | } | ||
114 | if (regno > FS*4) | ||
115 | regno -= 1*4; | ||
116 | put_stack_long(child, regno, value); | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | static unsigned long getreg(struct task_struct *child, | ||
121 | unsigned long regno) | ||
122 | { | ||
123 | unsigned long retval = ~0UL; | ||
124 | |||
125 | switch (regno >> 2) { | ||
126 | case GS: | ||
127 | retval = child->thread.gs; | ||
128 | break; | ||
129 | case DS: | ||
130 | case ES: | ||
131 | case FS: | ||
132 | case SS: | ||
133 | case CS: | ||
134 | retval = 0xffff; | ||
135 | /* fall through */ | ||
136 | default: | ||
137 | if (regno > FS*4) | ||
138 | regno -= 1*4; | ||
139 | retval &= get_stack_long(child, regno); | ||
140 | } | ||
141 | return retval; | ||
142 | } | ||
143 | |||
144 | #define LDT_SEGMENT 4 | ||
145 | |||
146 | static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs) | ||
147 | { | ||
148 | unsigned long addr, seg; | ||
149 | |||
150 | addr = regs->eip; | ||
151 | seg = regs->xcs & 0xffff; | ||
152 | if (regs->eflags & VM_MASK) { | ||
153 | addr = (addr & 0xffff) + (seg << 4); | ||
154 | return addr; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * We'll assume that the code segments in the GDT | ||
159 | * are all zero-based. That is largely true: the | ||
160 | * TLS segments are used for data, and the PNPBIOS | ||
161 | * and APM bios ones we just ignore here. | ||
162 | */ | ||
163 | if (seg & LDT_SEGMENT) { | ||
164 | u32 *desc; | ||
165 | unsigned long base; | ||
166 | |||
167 | seg &= ~7UL; | ||
168 | |||
169 | down(&child->mm->context.sem); | ||
170 | if (unlikely((seg >> 3) >= child->mm->context.size)) | ||
171 | addr = -1L; /* bogus selector, access would fault */ | ||
172 | else { | ||
173 | desc = child->mm->context.ldt + seg; | ||
174 | base = ((desc[0] >> 16) | | ||
175 | ((desc[1] & 0xff) << 16) | | ||
176 | (desc[1] & 0xff000000)); | ||
177 | |||
178 | /* 16-bit code segment? */ | ||
179 | if (!((desc[1] >> 22) & 1)) | ||
180 | addr &= 0xffff; | ||
181 | addr += base; | ||
182 | } | ||
183 | up(&child->mm->context.sem); | ||
184 | } | ||
185 | return addr; | ||
186 | } | ||
187 | |||
/*
 * Decide whether the instruction @child is about to execute is popf
 * or iret, i.e. one that may itself change TF.
 *
 * Up to 15 bytes at the child's linear EIP are read and prefixes are
 * skipped.  Returns 1 for popf (0x9d) or iret (0xcf), 0 for anything
 * else or when only prefixes (or nothing) could be read.
 */
static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
	unsigned char insn[15];
	unsigned long linear = convert_eip_to_linear(child, regs);
	int nread, pos;

	nread = access_process_vm(child, linear, insn, sizeof(insn), 0);

	for (pos = 0; pos < nread; pos++) {
		switch (insn[pos]) {
		/* popf and iret */
		case 0x9d:
		case 0xcf:
			return 1;

		/* operand and address size prefixes */
		case 0x66:
		case 0x67:
		/* irrelevant prefixes (segment overrides and repeats) */
		case 0x26:
		case 0x2e:
		case 0x36:
		case 0x3e:
		case 0x64:
		case 0x65:
		case 0xf0:
		case 0xf2:
		case 0xf3:
			continue;

		/*
		 * Everything else, pushf (0x9c) included.  NOTE: we should
		 * probably not let the user see the TF bit being set by
		 * pushf, but avoiding it is more pain than it is worth, and
		 * a debugger could emulate this all in user space if it
		 * _really_ cares.
		 */
		default:
			return 0;
		}
	}

	return 0;
}
224 | |||
225 | static void set_singlestep(struct task_struct *child) | ||
226 | { | ||
227 | struct pt_regs *regs = get_child_regs(child); | ||
228 | |||
229 | /* | ||
230 | * Always set TIF_SINGLESTEP - this guarantees that | ||
231 | * we single-step system calls etc.. This will also | ||
232 | * cause us to set TF when returning to user mode. | ||
233 | */ | ||
234 | set_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
235 | |||
236 | /* | ||
237 | * If TF was already set, don't do anything else | ||
238 | */ | ||
239 | if (regs->eflags & TRAP_FLAG) | ||
240 | return; | ||
241 | |||
242 | /* Set TF on the kernel stack.. */ | ||
243 | regs->eflags |= TRAP_FLAG; | ||
244 | |||
245 | /* | ||
246 | * ..but if TF is changed by the instruction we will trace, | ||
247 | * don't mark it as being "us" that set it, so that we | ||
248 | * won't clear it by hand later. | ||
249 | */ | ||
250 | if (is_setting_trap_flag(child, regs)) | ||
251 | return; | ||
252 | |||
253 | child->ptrace |= PT_DTRACE; | ||
254 | } | ||
255 | |||
256 | static void clear_singlestep(struct task_struct *child) | ||
257 | { | ||
258 | /* Always clear TIF_SINGLESTEP... */ | ||
259 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
260 | |||
261 | /* But touch TF only if it was set by us.. */ | ||
262 | if (child->ptrace & PT_DTRACE) { | ||
263 | struct pt_regs *regs = get_child_regs(child); | ||
264 | regs->eflags &= ~TRAP_FLAG; | ||
265 | child->ptrace &= ~PT_DTRACE; | ||
266 | } | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Called by kernel/ptrace.c when detaching.. | ||
271 | * | ||
272 | * Make sure the single step bit is not set. | ||
273 | */ | ||
274 | void ptrace_disable(struct task_struct *child) | ||
275 | { | ||
276 | clear_singlestep(child); | ||
277 | clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * Perform get_thread_area on behalf of the traced child. | ||
282 | */ | ||
283 | static int | ||
284 | ptrace_get_thread_area(struct task_struct *child, | ||
285 | int idx, struct user_desc __user *user_desc) | ||
286 | { | ||
287 | struct user_desc info; | ||
288 | struct desc_struct *desc; | ||
289 | |||
290 | /* | ||
291 | * Get the current Thread-Local Storage area: | ||
292 | */ | ||
293 | |||
294 | #define GET_BASE(desc) ( \ | ||
295 | (((desc)->a >> 16) & 0x0000ffff) | \ | ||
296 | (((desc)->b << 16) & 0x00ff0000) | \ | ||
297 | ( (desc)->b & 0xff000000) ) | ||
298 | |||
299 | #define GET_LIMIT(desc) ( \ | ||
300 | ((desc)->a & 0x0ffff) | \ | ||
301 | ((desc)->b & 0xf0000) ) | ||
302 | |||
303 | #define GET_32BIT(desc) (((desc)->b >> 22) & 1) | ||
304 | #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) | ||
305 | #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) | ||
306 | #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) | ||
307 | #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) | ||
308 | #define GET_USEABLE(desc) (((desc)->b >> 20) & 1) | ||
309 | |||
310 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
311 | return -EINVAL; | ||
312 | |||
313 | desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | ||
314 | |||
315 | info.entry_number = idx; | ||
316 | info.base_addr = GET_BASE(desc); | ||
317 | info.limit = GET_LIMIT(desc); | ||
318 | info.seg_32bit = GET_32BIT(desc); | ||
319 | info.contents = GET_CONTENTS(desc); | ||
320 | info.read_exec_only = !GET_WRITABLE(desc); | ||
321 | info.limit_in_pages = GET_LIMIT_PAGES(desc); | ||
322 | info.seg_not_present = !GET_PRESENT(desc); | ||
323 | info.useable = GET_USEABLE(desc); | ||
324 | |||
325 | if (copy_to_user(user_desc, &info, sizeof(info))) | ||
326 | return -EFAULT; | ||
327 | |||
328 | return 0; | ||
329 | } | ||
330 | |||
331 | /* | ||
332 | * Perform set_thread_area on behalf of the traced child. | ||
333 | */ | ||
334 | static int | ||
335 | ptrace_set_thread_area(struct task_struct *child, | ||
336 | int idx, struct user_desc __user *user_desc) | ||
337 | { | ||
338 | struct user_desc info; | ||
339 | struct desc_struct *desc; | ||
340 | |||
341 | if (copy_from_user(&info, user_desc, sizeof(info))) | ||
342 | return -EFAULT; | ||
343 | |||
344 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
345 | return -EINVAL; | ||
346 | |||
347 | desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | ||
348 | if (LDT_empty(&info)) { | ||
349 | desc->a = 0; | ||
350 | desc->b = 0; | ||
351 | } else { | ||
352 | desc->a = LDT_entry_a(&info); | ||
353 | desc->b = LDT_entry_b(&info); | ||
354 | } | ||
355 | |||
356 | return 0; | ||
357 | } | ||
358 | |||
/*
 * Architecture-specific ptrace request dispatcher.
 *
 * @child:   the traced task the request operates on
 * @request: PTRACE_* request code
 * @addr:    request-dependent: an address in the child, a byte offset
 *           into struct user, or a TLS GDT index
 * @data:    request-dependent: a value to store, a signal number, or a
 *           user-space pointer (also available below as datap)
 *
 * Requests that are not handled here are passed to ptrace_request().
 * The value of ret set by the selected case is returned.
 */
long arch_ptrace(struct task_struct *child, long request, long addr, long data)
{
	/* Never dereferenced: only used to compute offsets into struct user. */
	struct user * dummy = NULL;
	int i, ret;
	unsigned long __user *datap = (unsigned long __user *)data;

	switch (request) {
	/* when I and D space are separate, these will need to be fixed. */
	case PTRACE_PEEKTEXT: /* read word at location addr. */
	case PTRACE_PEEKDATA:
		ret = generic_ptrace_peekdata(child, addr, data);
		break;

	/* read the word at location addr in the USER area. */
	case PTRACE_PEEKUSR: {
		unsigned long tmp;

		/* addr must be a non-negative, 4-byte aligned offset inside struct user */
		ret = -EIO;
		if ((addr & 3) || addr < 0 ||
		    addr > sizeof(struct user) - 3)
			break;

		tmp = 0;  /* Default return condition */
		if(addr < FRAME_SIZE*sizeof(long))
			tmp = getreg(child, addr);
		if(addr >= (long) &dummy->u_debugreg[0] &&
		   addr <= (long) &dummy->u_debugreg[7]){
			/* convert the byte offset into a debug register index */
			addr -= (long) &dummy->u_debugreg[0];
			addr = addr >> 2;
			tmp = child->thread.debugreg[addr];
		}
		ret = put_user(tmp, datap);
		break;
	}

	/* when I and D space are separate, this will have to be fixed. */
	case PTRACE_POKETEXT: /* write the word at location addr. */
	case PTRACE_POKEDATA:
		ret = generic_ptrace_pokedata(child, addr, data);
		break;

	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
		ret = -EIO;
		if ((addr & 3) || addr < 0 ||
		    addr > sizeof(struct user) - 3)
			break;

		if (addr < FRAME_SIZE*sizeof(long)) {
			ret = putreg(child, addr, data);
			break;
		}
		/* We need to be very careful here.  We implicitly
		   want to modify a portion of the task_struct, and we
		   have to be selective about what portions we allow someone
		   to modify. */

		  ret = -EIO;
		  if(addr >= (long) &dummy->u_debugreg[0] &&
		     addr <= (long) &dummy->u_debugreg[7]){

			  /* debug registers 4 and 5 cannot be written */
			  if(addr == (long) &dummy->u_debugreg[4]) break;
			  if(addr == (long) &dummy->u_debugreg[5]) break;
			  /* debug registers 0-3: reject values at or above TASK_SIZE-3 */
			  if(addr < (long) &dummy->u_debugreg[4] &&
			     ((unsigned long) data) >= TASK_SIZE-3) break;

			  /* Sanity-check data. Take one half-byte at once with
			   * check = (val >> (16 + 4*i)) & 0xf. It contains the
			   * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
			   * 2 and 3 are LENi. Given a list of invalid values,
			   * we do mask |= 1 << invalid_value, so that
			   * (mask >> check) & 1 is a correct test for invalid
			   * values.
			   *
			   * R/Wi contains the type of the breakpoint /
			   * watchpoint, LENi contains the length of the watched
			   * data in the watchpoint case.
			   *
			   * The invalid values are (field values in binary):
			   * - LENi == 10b (undefined), so mask |= 0x0f00.
			   * - R/Wi == 10b (break on I/O reads or writes), so
			   *   mask |= 0x4444.
			   * - R/Wi == 00b && LENi != 00b, so we have mask |=
			   *   0x1110.
			   *
			   * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
			   *
			   * See the Intel Manual "System Programming Guide",
			   * 15.2.4
			   *
			   * Note that LENi == 10b is defined on x86_64 in long
			   * mode (i.e. even for 32-bit userspace software, but
			   * 64-bit kernel), so the x86_64 mask value is 0x5454.
			   * See the AMD manual no. 24593 (AMD64 System
			   * Programming)*/

			  if(addr == (long) &dummy->u_debugreg[7]) {
				  data &= ~DR_CONTROL_RESERVED;
				  for(i=0; i<4; i++)
					  if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
						  goto out_tsk; /* ret is still -EIO */
				  if (data)
					  set_tsk_thread_flag(child, TIF_DEBUG);
				  else
					  clear_tsk_thread_flag(child, TIF_DEBUG);
			  }
			  addr -= (long) &dummy->u_debugreg;
			  addr = addr >> 2;
			  child->thread.debugreg[addr] = data;
			  ret = 0;
		  }
		  break;

	case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
	case PTRACE_SYSCALL:	/* continue and stop at next (return from) syscall */
	case PTRACE_CONT:	/* restart after signal. */
		ret = -EIO;
		if (!valid_signal(data))
			break;
		/* exactly one of SYSCALL_EMU / SYSCALL_TRACE (or neither) stays set */
		if (request == PTRACE_SYSEMU) {
			set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
		} else if (request == PTRACE_SYSCALL) {
			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
		} else {
			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
		}
		child->exit_code = data;
		/* make sure the single step bit is not set. */
		clear_singlestep(child);
		wake_up_process(child);
		ret = 0;
		break;

/*
 * make the child exit.  Best I can do is send it a sigkill.
 * perhaps it should be put in the status that it wants to
 * exit.
 */
	case PTRACE_KILL:
		ret = 0;
		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
			break;
		child->exit_code = SIGKILL;
		/* make sure the single step bit is not set. */
		clear_singlestep(child);
		wake_up_process(child);
		break;

	case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
	case PTRACE_SINGLESTEP:	/* set the trap flag. */
		ret = -EIO;
		if (!valid_signal(data))
			break;

		if (request == PTRACE_SYSEMU_SINGLESTEP)
			set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
		else
			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);

		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
		set_singlestep(child);
		child->exit_code = data;
		/* give it a chance to run. */
		wake_up_process(child);
		ret = 0;
		break;

	case PTRACE_DETACH:
		/* detach a process that was attached. */
		ret = ptrace_detach(child, data);
		break;

	case PTRACE_GETREGS: { /* Get all gp regs from the child. */
	  	if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
			ret = -EIO;
			break;
		}
		/* NOTE(review): the result of __put_user() is not checked here */
		for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
			__put_user(getreg(child, i), datap);
			datap++;
		}
		ret = 0;
		break;
	}

	case PTRACE_SETREGS: { /* Set all gp regs in the child. */
		unsigned long tmp;
	  	if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
			ret = -EIO;
			break;
		}
		/* NOTE(review): results of __get_user() and putreg() are not checked here */
		for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
			__get_user(tmp, datap);
			putreg(child, i, tmp);
			datap++;
		}
		ret = 0;
		break;
	}

	case PTRACE_GETFPREGS: { /* Get the child FPU state. */
		if (!access_ok(VERIFY_WRITE, datap,
			       sizeof(struct user_i387_struct))) {
			ret = -EIO;
			break;
		}
		ret = 0;
		if (!tsk_used_math(child))
			init_fpu(child);
		/* NOTE(review): get_fpregs() result is discarded; ret stays 0 */
		get_fpregs((struct user_i387_struct __user *)data, child);
		break;
	}

	case PTRACE_SETFPREGS: { /* Set the child FPU state. */
		if (!access_ok(VERIFY_READ, datap,
			       sizeof(struct user_i387_struct))) {
			ret = -EIO;
			break;
		}
		set_stopped_child_used_math(child);
		/* NOTE(review): set_fpregs() result is discarded; ret is forced to 0 */
		set_fpregs(child, (struct user_i387_struct __user *)data);
		ret = 0;
		break;
	}

	case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
		if (!access_ok(VERIFY_WRITE, datap,
			       sizeof(struct user_fxsr_struct))) {
			ret = -EIO;
			break;
		}
		if (!tsk_used_math(child))
			init_fpu(child);
		ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
		break;
	}

	case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
		if (!access_ok(VERIFY_READ, datap,
			       sizeof(struct user_fxsr_struct))) {
			ret = -EIO;
			break;
		}
		set_stopped_child_used_math(child);
		ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
		break;
	}

	/* for the thread-area requests addr carries the TLS GDT index */
	case PTRACE_GET_THREAD_AREA:
		ret = ptrace_get_thread_area(child, addr,
					(struct user_desc __user *) data);
		break;

	case PTRACE_SET_THREAD_AREA:
		ret = ptrace_set_thread_area(child, addr,
					(struct user_desc __user *) data);
		break;

	default:
		ret = ptrace_request(child, request, addr, data);
		break;
	}
 out_tsk:
	return ret;
}
626 | |||
627 | void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) | ||
628 | { | ||
629 | struct siginfo info; | ||
630 | |||
631 | tsk->thread.trap_no = 1; | ||
632 | tsk->thread.error_code = error_code; | ||
633 | |||
634 | memset(&info, 0, sizeof(info)); | ||
635 | info.si_signo = SIGTRAP; | ||
636 | info.si_code = TRAP_BRKPT; | ||
637 | |||
638 | /* User-mode eip? */ | ||
639 | info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL; | ||
640 | |||
641 | /* Send us the fakey SIGTRAP */ | ||
642 | force_sig_info(SIGTRAP, &info, tsk); | ||
643 | } | ||
644 | |||
/*
 * Notification of system call entry/exit.
 *
 * @regs:      saved user registers of the current task
 * @entryexit: zero on syscall entry, non-zero on syscall exit
 *
 * Runs the secure-computing check and audit hooks, fakes a debug trap
 * when single-stepping, and notifies the tracer when syscall tracing
 * or syscall emulation is active.
 *
 * Returns 1 when the tracer was notified under TIF_SYSCALL_EMU
 * (orig_eax is then set to -1), and 0 otherwise.
 */
__attribute__((regparm(3)))
int do_syscall_trace(struct pt_regs *regs, int entryexit)
{
	int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
	/*
	 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
	 * interception
	 */
	int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
	int ret = 0;

	/* do the secure computing check first */
	if (!entryexit)
		secure_computing(regs->orig_eax);

	if (unlikely(current->audit_context)) {
		if (entryexit)
			audit_syscall_exit(AUDITSC_RESULT(regs->eax),
						regs->eax);
		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
		 * not used, entry.S will call us only on syscall exit, not
		 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
		 * calling send_sigtrap() on syscall entry.
		 *
		 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
		 * is_singlestep is false, despite its name, so we will still do
		 * the correct thing.
		 */
		else if (is_singlestep)
			goto out;
	}

	if (!(current->ptrace & PT_PTRACED))
		goto out;

	/* If a process stops on the 1st tracepoint with SYSCALL_TRACE
	 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
	 * here. We have to check this and return */
	if (is_sysemu && entryexit)
		return 0;

	/* Fake a debug trap */
	if (is_singlestep)
		send_sigtrap(current, regs, 0);

 	if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
		goto out;

	/* the 0x80 provides a way for the tracing parent to distinguish
	   between a syscall stop and SIGTRAP delivery */
	/* Note that the debugger could change the result of test_thread_flag!*/
	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));

	/*
	 * this isn't the same as continuing with a signal, but it will do
	 * for normal use.  strace only continues with a signal if the
	 * stopping signal is not SIGTRAP.  -brl
	 */
	if (current->exit_code) {
		send_sig(current->exit_code, current, 1);
		current->exit_code = 0;
	}
	/* under syscall emulation, tell the caller to skip the syscall */
	ret = is_sysemu;
out:
	if (unlikely(current->audit_context) && !entryexit)
		audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
				    regs->ebx, regs->ecx, regs->edx, regs->esi);
	if (ret == 0)
		return 0;

	regs->orig_eax = -1; /* force skip of syscall restarting */
	if (unlikely(current->audit_context))
		audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
	return 1;
}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c new file mode 100644 index 000000000000..6722469c2633 --- /dev/null +++ b/arch/x86/kernel/quirks.c | |||
@@ -0,0 +1,49 @@ | |||
1 | /* | ||
2 | * This file contains work-arounds for x86 and x86_64 platform bugs. | ||
3 | */ | ||
4 | #include <linux/pci.h> | ||
5 | #include <linux/irq.h> | ||
6 | |||
7 | #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) | ||
8 | |||
9 | static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) | ||
10 | { | ||
11 | u8 config, rev; | ||
12 | u32 word; | ||
13 | |||
14 | /* BIOS may enable hardware IRQ balancing for | ||
15 | * E7520/E7320/E7525(revision ID 0x9 and below) | ||
16 | * based platforms. | ||
17 | * Disable SW irqbalance/affinity on those platforms. | ||
18 | */ | ||
19 | pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); | ||
20 | if (rev > 0x9) | ||
21 | return; | ||
22 | |||
23 | /* enable access to config space*/ | ||
24 | pci_read_config_byte(dev, 0xf4, &config); | ||
25 | pci_write_config_byte(dev, 0xf4, config|0x2); | ||
26 | |||
27 | /* read xTPR register */ | ||
28 | raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); | ||
29 | |||
30 | if (!(word & (1 << 13))) { | ||
31 | printk(KERN_INFO "Intel E7520/7320/7525 detected. " | ||
32 | "Disabling irq balancing and affinity\n"); | ||
33 | #ifdef CONFIG_IRQBALANCE | ||
34 | irqbalance_disable(""); | ||
35 | #endif | ||
36 | noirqdebug_setup(""); | ||
37 | #ifdef CONFIG_PROC_FS | ||
38 | no_irq_affinity = 1; | ||
39 | #endif | ||
40 | } | ||
41 | |||
42 | /* put back the original value for config space*/ | ||
43 | if (!(config & 0x2)) | ||
44 | pci_write_config_byte(dev, 0xf4, config); | ||
45 | } | ||
46 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); | ||
47 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); | ||
48 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); | ||
49 | #endif | ||
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot_32.c new file mode 100644 index 000000000000..0d796248866c --- /dev/null +++ b/arch/x86/kernel/reboot_32.c | |||
@@ -0,0 +1,413 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/reboot.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/mm.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/delay.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/interrupt.h> | ||
10 | #include <linux/mc146818rtc.h> | ||
11 | #include <linux/efi.h> | ||
12 | #include <linux/dmi.h> | ||
13 | #include <linux/ctype.h> | ||
14 | #include <linux/pm.h> | ||
15 | #include <linux/reboot.h> | ||
16 | #include <asm/uaccess.h> | ||
17 | #include <asm/apic.h> | ||
18 | #include <asm/desc.h> | ||
19 | #include "mach_reboot.h" | ||
20 | #include <asm/reboot_fixups.h> | ||
21 | #include <asm/reboot.h> | ||
22 | |||
/*
 * Power off function, if any.  NULL until something installs a
 * handler; exported so that modules can set it.
 */
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);

/* Value written to the BIOS word at 0x472 on reboot; "reboot=w" selects 0x1234, "reboot=c" selects 0. */
static int reboot_mode;
/* Non-zero: reboot by jumping through the BIOS ("reboot=b" or a DMI quirk). */
static int reboot_thru_bios;

#ifdef CONFIG_SMP
/* CPU requested with "reboot=s<n>"; -1 means no override was given. */
static int reboot_cpu = -1;
#endif
35 | static int __init reboot_setup(char *str) | ||
36 | { | ||
37 | while(1) { | ||
38 | switch (*str) { | ||
39 | case 'w': /* "warm" reboot (no memory testing etc) */ | ||
40 | reboot_mode = 0x1234; | ||
41 | break; | ||
42 | case 'c': /* "cold" reboot (with memory testing etc) */ | ||
43 | reboot_mode = 0x0; | ||
44 | break; | ||
45 | case 'b': /* "bios" reboot by jumping through the BIOS */ | ||
46 | reboot_thru_bios = 1; | ||
47 | break; | ||
48 | case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */ | ||
49 | reboot_thru_bios = 0; | ||
50 | break; | ||
51 | #ifdef CONFIG_SMP | ||
52 | case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ | ||
53 | if (isdigit(*(str+1))) { | ||
54 | reboot_cpu = (int) (*(str+1) - '0'); | ||
55 | if (isdigit(*(str+2))) | ||
56 | reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); | ||
57 | } | ||
58 | /* we will leave sorting out the final value | ||
59 | when we are ready to reboot, since we might not | ||
60 | have set up boot_cpu_id or smp_num_cpu */ | ||
61 | break; | ||
62 | #endif | ||
63 | } | ||
64 | if((str = strchr(str,',')) != NULL) | ||
65 | str++; | ||
66 | else | ||
67 | break; | ||
68 | } | ||
69 | return 1; | ||
70 | } | ||
71 | |||
72 | __setup("reboot=", reboot_setup); | ||
73 | |||
74 | /* | ||
75 | * Reboot options and system auto-detection code provided by | ||
76 | * Dell Inc. so their systems "just work". :-) | ||
77 | */ | ||
78 | |||
79 | /* | ||
80 | * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. | ||
81 | */ | ||
82 | static int __init set_bios_reboot(struct dmi_system_id *d) | ||
83 | { | ||
84 | if (!reboot_thru_bios) { | ||
85 | reboot_thru_bios = 1; | ||
86 | printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); | ||
87 | } | ||
88 | return 0; | ||
89 | } | ||
90 | |||
/*
 * Systems that need the BIOS reboot method.  Every entry uses
 * set_bios_reboot() as its callback; the empty entry terminates the
 * table.
 */
static struct dmi_system_id __initdata reboot_dmi_table[] = {
	{	/* Handle problems with rebooting on Dell E520's */
		.callback = set_bios_reboot,
		.ident = "Dell E520",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
		},
	},
	{	/* Handle problems with rebooting on Dell 1300's */
		.callback = set_bios_reboot,
		.ident = "Dell PowerEdge 1300",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
		},
	},
	{	/* Handle problems with rebooting on Dell 300's */
		.callback = set_bios_reboot,
		.ident = "Dell PowerEdge 300",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
		},
	},
	{       /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
		.callback = set_bios_reboot,
		.ident = "Dell OptiPlex 745",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
			DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
		},
	},
	{	/* Handle problems with rebooting on Dell 2400's */
		.callback = set_bios_reboot,
		.ident = "Dell PowerEdge 2400",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
		},
	},
	{	/* Handle problems with rebooting on HP laptops */
		.callback = set_bios_reboot,
		.ident = "HP Compaq Laptop",
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
		},
	},
	{ }
};
143 | |||
/*
 * Run the DMI quirk table once at core_initcall time so that matching
 * boards get the BIOS reboot method selected.  Always returns 0.
 */
static int __init reboot_init(void)
{
	dmi_check_system(reboot_dmi_table);
	return 0;
}

core_initcall(reboot_init);
151 | |||
152 | /* The following code and data reboots the machine by switching to real | ||
153 | mode and jumping to the BIOS reset entry point, as if the CPU has | ||
154 | really been reset. The previous version asked the keyboard | ||
155 | controller to pulse the CPU reset line, which is more thorough, but | ||
156 | doesn't work with at least one type of 486 motherboard. It is easy | ||
157 | to stop this code working; hence the copious comments. */ | ||
158 | |||
/* Descriptors loaded just before dropping to real mode in machine_real_restart(). */
static unsigned long long
real_mode_gdt_entries [3] =
{
	0x0000000000000000ULL,	/* Null descriptor */
	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
};

/*
 * Descriptor-table pointers handed to load_gdt()/load_idt().
 * NOTE(review): the initialisers presumably are { limit, base } --
 * confirm against struct Xgt_desc_struct.  no_idt is loaded right
 * before an int3 in the emergency-restart path to force a triple fault.
 */
static struct Xgt_desc_struct
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 },
no_idt = { 0, 0 };
171 | |||
172 | |||
/* This is 16-bit protected mode code to disable paging and the cache,
   switch to real mode and jump to the BIOS reset code.

   The instruction that switches to real mode by writing to CR0 must be
   followed immediately by a far jump instruction, which sets CS to a
   valid value for real mode, and flushes the prefetch queue to avoid
   running instructions that have already been decoded in protected
   mode.

   Clears all the flags except ET, especially PG (paging), PE
   (protected-mode enable) and TS (task switch for coprocessor state
   save).  Flushes the TLB after paging has been disabled.  Sets CD and
   NW, to disable the cache on a 486, and invalidates the cache.  This
   is more like the state of a 486 after reset.  I don't know if
   something else should be done for other chips.

   More could be done here to set up the registers as if a CPU reset had
   occurred; hopefully real BIOSs don't assume much. */

/*
 * machine_real_restart() copies this array into low memory, places the
 * caller-supplied code directly after it, and far-jumps to its start.
 */
static unsigned char real_mode_switch [] =
{
	0x66, 0x0f, 0x20, 0xc0,			/*    movl  %cr0,%eax        */
	0x66, 0x83, 0xe0, 0x11,			/*    andl  $0x00000011,%eax */
	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,	/*    orl   $0x60000000,%eax */
	0x66, 0x0f, 0x22, 0xc0,			/*    movl  %eax,%cr0        */
	0x66, 0x0f, 0x22, 0xd8,			/*    movl  %eax,%cr3        */
	0x66, 0x0f, 0x20, 0xc3,			/*    movl  %cr0,%ebx        */
	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,	/*    andl  $0x60000000,%ebx */
	0x74, 0x02,				/*    jz    f                */
	0x0f, 0x09,				/*    wbinvd                 */
	0x24, 0x10,				/* f: andb  $0x10,%al        */
	0x66, 0x0f, 0x22, 0xc0			/*    movl  %eax,%cr0        */
};
/* Default code run after real_mode_switch: far jump to the BIOS reset entry. */
static unsigned char jump_to_bios [] =
{
	0xea, 0x00, 0x00, 0xff, 0xff		/*    ljmp  $0xffff,$0x0000  */
};
210 | |||
/*
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 * We assume that length will always be less than 100!
 *
 * NOTE(review): length is not checked here.  The code is copied to
 * 0x1000 - 100, so anything longer than 100 bytes would be written
 * past the end of the first page.
 */
void machine_real_restart(unsigned char *code, int length)
{
	local_irq_disable();

	/* Write zero to CMOS register number 0x0f, which the BIOS POST
	   routine will recognize as telling it to do a proper reboot.  (Well
	   that's what this book in front of me says -- it may only apply to
	   the Phoenix BIOS though, it's not clear).  At the same time,
	   disable NMIs by setting the top bit in the CMOS address register,
	   as we're about to do peculiar things to the CPU.  I'm not sure if
	   `outb_p' is needed instead of just `outb'.  Use it to be on the
	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
	 */

	spin_lock(&rtc_lock);
	CMOS_WRITE(0x00, 0x8f);
	spin_unlock(&rtc_lock);

	/* Remap the kernel at virtual address zero, as well as offset zero
	   from the kernel segment.  This assumes the kernel segment starts at
	   virtual address PAGE_OFFSET. */

	memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
		sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

	/*
	 * Use `swapper_pg_dir' as our page directory.
	 */
	load_cr3(swapper_pg_dir);

	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
	   this on booting to tell it to "Bypass memory test (also warm
	   boot)".  This seems like a fairly standard thing that gets set by
	   REBOOT.COM programs, and the previous reset routine did this
	   too.  (The value actually written is reboot_mode.) */

	*((unsigned short *)0x472) = reboot_mode;

	/* For the switch to real mode, copy some code to low memory.  It has
	   to be in the first 64k because it is running in 16-bit mode, and it
	   has to have the same physical and virtual address, because it turns
	   off paging.  Copy it near the end of the first page, out of the way
	   of BIOS variables. */

	memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
		real_mode_switch, sizeof (real_mode_switch));
	/* the caller's code goes directly after real_mode_switch */
	memcpy ((void *) (0x1000 - 100), code, length);

	/* Set up the IDT for real mode. */

	load_idt(&real_mode_idt);

	/* Set up a GDT from which we can load segment descriptors for real
	   mode.  The GDT is not used in real mode; it is just needed here to
	   prepare the descriptors. */

	load_gdt(&real_mode_gdt);

	/* Load the data segment registers, and thus the descriptors ready for
	   real mode.  The base address of each segment is 0x100, 16 times the
	   selector value being loaded here.  This is so that the segment
	   registers don't have to be reloaded after switching to real mode:
	   the values are consistent for real mode operation already. */

	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
				"\tmovl %%eax,%%ds\n"
				"\tmovl %%eax,%%es\n"
				"\tmovl %%eax,%%fs\n"
				"\tmovl %%eax,%%gs\n"
				"\tmovl %%eax,%%ss" : : : "eax");

	/* Jump to the 16-bit code that we copied earlier.  It disables paging
	   and the cache, switches to real mode, and jumps to the BIOS reset
	   entry point. */

	__asm__ __volatile__ ("ljmp $0x0008,%0"
				:
				: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
298 | |||
/*
 * Quiesce the machine before a restart or power-off.
 *
 * On SMP, migrate to the CPU chosen for the reboot and stop all
 * others; then shut down the local APIC and, when configured, the
 * IO-APIC.
 */
static void native_machine_shutdown(void)
{
#ifdef CONFIG_SMP
	/* The boot cpu is always logical cpu 0 */
	int target_cpu = 0;

	if (reboot_cpu != -1 && reboot_cpu < NR_CPUS &&
	    cpu_isset(reboot_cpu, cpu_online_map)) {
		/* A usable command line override was given */
		target_cpu = reboot_cpu;
	} else if (!cpu_isset(0, cpu_online_map)) {
		/* The boot cpu is not online: stay where we are */
		target_cpu = smp_processor_id();
	}

	/* Make certain I only run on the appropriate processor */
	set_cpus_allowed(current, cpumask_of_cpu(target_cpu));

	/*
	 * Now that we are on the appropriate processor, stop all of
	 * the others, and disable their local APICs.
	 */
	smp_send_stop();
#endif /* CONFIG_SMP */

	lapic_shutdown();

#ifdef CONFIG_X86_IO_APIC
	disable_IO_APIC();
#endif
}
334 | |||
/*
 * Weak no-op default for board specific reboot fixups; a non-weak
 * definition elsewhere takes precedence at link time.
 */
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
338 | |||
339 | static void native_machine_emergency_restart(void) | ||
340 | { | ||
341 | if (!reboot_thru_bios) { | ||
342 | if (efi_enabled) { | ||
343 | efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); | ||
344 | load_idt(&no_idt); | ||
345 | __asm__ __volatile__("int3"); | ||
346 | } | ||
347 | /* rebooting needs to touch the page at absolute addr 0 */ | ||
348 | *((unsigned short *)__va(0x472)) = reboot_mode; | ||
349 | for (;;) { | ||
350 | mach_reboot_fixups(); /* for board specific fixups */ | ||
351 | mach_reboot(); | ||
352 | /* That didn't work - force a triple fault.. */ | ||
353 | load_idt(&no_idt); | ||
354 | __asm__ __volatile__("int3"); | ||
355 | } | ||
356 | } | ||
357 | if (efi_enabled) | ||
358 | efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL); | ||
359 | |||
360 | machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); | ||
361 | } | ||
362 | |||
/* Orderly restart: shut the machine down, then restart it.  The command string is ignored. */
static void native_machine_restart(char * __unused)
{
	machine_shutdown();
	machine_emergency_restart();
}
368 | |||
/* Native halt hook: intentionally does nothing. */
static void native_machine_halt(void)
{
}
372 | |||
373 | static void native_machine_power_off(void) | ||
374 | { | ||
375 | if (pm_power_off) { | ||
376 | machine_shutdown(); | ||
377 | pm_power_off(); | ||
378 | } | ||
379 | } | ||
380 | |||
381 | |||
/*
 * Machine operations table, initialised with the native
 * implementations above.  The machine_*() functions dispatch through
 * it.
 */
struct machine_ops machine_ops = {
	.power_off = native_machine_power_off,
	.shutdown = native_machine_shutdown,
	.emergency_restart = native_machine_emergency_restart,
	.restart = native_machine_restart,
	.halt = native_machine_halt,
};
389 | |||
/* Power the machine off via the hook installed in machine_ops. */
void machine_power_off(void)
{
	machine_ops.power_off();
}

/* Quiesce the machine via the hook installed in machine_ops. */
void machine_shutdown(void)
{
	machine_ops.shutdown();
}

/* Restart immediately via the hook installed in machine_ops. */
void machine_emergency_restart(void)
{
	machine_ops.emergency_restart();
}

/* Restart the machine, passing cmd through to the machine_ops hook. */
void machine_restart(char *cmd)
{
	machine_ops.restart(cmd);
}

/* Halt the machine via the hook installed in machine_ops. */
void machine_halt(void)
{
	machine_ops.halt();
}
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c new file mode 100644 index 000000000000..03e1cce58f49 --- /dev/null +++ b/arch/x86/kernel/reboot_fixups_32.c | |||
@@ -0,0 +1,68 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/reboot_fixups.c | ||
3 | * | ||
4 | * This is a good place to put board specific reboot fixups. | ||
5 | * | ||
6 | * List of supported fixups: | ||
7 | * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz> | ||
8 | * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org> | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | #include <asm/delay.h> | ||
13 | #include <linux/pci.h> | ||
14 | #include <asm/reboot_fixups.h> | ||
15 | #include <asm/msr.h> | ||
16 | |||
/*
 * Warm-reset a CS5530A: writing 1 to the reset control register at
 * config offset 0x44 makes the chip perform a system warm reset.
 */
static void cs5530a_warm_reset(struct pci_dev *dev)
{
	pci_write_config_byte(dev, 0x44, 0x1);

	/* shouldn't get here but be safe and spin-a-while */
	udelay(50);
}
25 | |||
/*
 * Reset a CS5536 through its soft reset MSR.
 *
 * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET):
 * writing 1 to the LSB of this MSR causes a hard reset.
 * The dev argument is not used.
 */
static void cs5536_warm_reset(struct pci_dev *dev)
{
	wrmsrl(0x51400017, 1ULL);

	/* shouldn't get here but be safe and spin a while */
	udelay(50);
}
35 | |||
/* One reboot fixup: the PCI ID to look for and the handler to run on it. */
struct device_fixup {
	unsigned int vendor;			/* PCI vendor ID to match */
	unsigned int device;			/* PCI device ID to match */
	void (*reboot_fixup)(struct pci_dev *);	/* called with the matching device */
};
41 | |||
42 | static struct device_fixup fixups_table[] = { | ||
43 | { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, | ||
44 | { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, | ||
45 | }; | ||
46 | |||
47 | /* | ||
48 | * we see if any fixup is available for our current hardware. if there | ||
49 | * is a fixup, we call it and we expect to never return from it. if we | ||
50 | * do return, we keep looking and then eventually fall back to the | ||
51 | * standard mach_reboot on return. | ||
52 | */ | ||
53 | void mach_reboot_fixups(void) | ||
54 | { | ||
55 | struct device_fixup *cur; | ||
56 | struct pci_dev *dev; | ||
57 | int i; | ||
58 | |||
59 | for (i=0; i < ARRAY_SIZE(fixups_table); i++) { | ||
60 | cur = &(fixups_table[i]); | ||
61 | dev = pci_get_device(cur->vendor, cur->device, NULL); | ||
62 | if (!dev) | ||
63 | continue; | ||
64 | |||
65 | cur->reboot_fixup(dev); | ||
66 | } | ||
67 | } | ||
68 | |||
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S new file mode 100644 index 000000000000..f151d6fae462 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_32.S | |||
@@ -0,0 +1,252 @@ | |||
1 | /* | ||
2 | * relocate_kernel.S - put the kernel image in place to boot | ||
3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/linkage.h> | ||
10 | #include <asm/page.h> | ||
11 | #include <asm/kexec.h> | ||
12 | |||
13 | /* | ||
14 | * Must be relocatable PIC code callable as a C function | ||
15 | */ | ||
16 | |||
17 | #define PTR(x) (x << 2) | ||
18 | #define PAGE_ALIGNED (1 << PAGE_SHIFT) | ||
19 | #define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ | ||
20 | #define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */ | ||
21 | |||
22 | .text | ||
23 | .align PAGE_ALIGNED | ||
24 | .globl relocate_kernel | ||
relocate_kernel:
	/*
	 * Entry point.  Fills in page-table entries that map the control
	 * page twice -- once at its virtual address and once at its
	 * physical address (identity map) -- then falls through into
	 * relocate_new_kernel.
	 *
	 * %ebp holds the "list of pages" argument; PTR(n) turns a list
	 * index into a byte offset (n << 2).
	 */
	movl 8(%esp), %ebp /* list of pages */

#ifdef CONFIG_X86_PAE
	/* map the control page at its virtual address */

	/* PGD slot: address bits 31:30, scaled to 8-byte entries */
	movl PTR(VA_PGD)(%ebp), %edi
	movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
	andl $0xc0000000, %eax
	shrl $27, %eax
	addl %edi, %eax

	movl PTR(PA_PMD_0)(%ebp), %edx
	orl $PAE_PGD_ATTR, %edx
	movl %edx, (%eax)

	/* PMD slot: address bits 29:21, scaled to 8-byte entries */
	movl PTR(VA_PMD_0)(%ebp), %edi
	movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
	andl $0x3fe00000, %eax
	shrl $18, %eax
	addl %edi, %eax

	movl PTR(PA_PTE_0)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)

	/* PTE slot: address bits 20:12, scaled to 8-byte entries */
	movl PTR(VA_PTE_0)(%ebp), %edi
	movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
	andl $0x001ff000, %eax
	shrl $9, %eax
	addl %edi, %eax

	movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)

	/* identity map the control page at its physical address */

	/* same three-level walk, indexed by the physical address */
	movl PTR(VA_PGD)(%ebp), %edi
	movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
	andl $0xc0000000, %eax
	shrl $27, %eax
	addl %edi, %eax

	movl PTR(PA_PMD_1)(%ebp), %edx
	orl $PAE_PGD_ATTR, %edx
	movl %edx, (%eax)

	movl PTR(VA_PMD_1)(%ebp), %edi
	movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
	andl $0x3fe00000, %eax
	shrl $18, %eax
	addl %edi, %eax

	movl PTR(PA_PTE_1)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)

	movl PTR(VA_PTE_1)(%ebp), %edi
	movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
	andl $0x001ff000, %eax
	shrl $9, %eax
	addl %edi, %eax

	movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)
#else
	/* map the control page at its virtual address */

	/* PGD slot: address bits 31:22, scaled to 4-byte entries */
	movl PTR(VA_PGD)(%ebp), %edi
	movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
	andl $0xffc00000, %eax
	shrl $20, %eax
	addl %edi, %eax

	movl PTR(PA_PTE_0)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)

	/* PTE slot: address bits 21:12, scaled to 4-byte entries */
	movl PTR(VA_PTE_0)(%ebp), %edi
	movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
	andl $0x003ff000, %eax
	shrl $10, %eax
	addl %edi, %eax

	movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)

	/* identity map the control page at its physical address */

	/* same two-level walk, indexed by the physical address */
	movl PTR(VA_PGD)(%ebp), %edi
	movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
	andl $0xffc00000, %eax
	shrl $20, %eax
	addl %edi, %eax

	movl PTR(PA_PTE_1)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)

	movl PTR(VA_PTE_1)(%ebp), %edi
	movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
	andl $0x003ff000, %eax
	shrl $10, %eax
	addl %edi, %eax

	movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
	orl $PAGE_ATTR, %edx
	movl %edx, (%eax)
#endif
137 | |||
relocate_new_kernel:
	/*
	 * Reached by falling through from the page-table setup above.
	 * Loads the four stack arguments into registers, switches to the
	 * new page tables, moves the stack into the control page and
	 * jumps to identity_mapped through its physical address.
	 */
	/* read the arguments and say goodbye to the stack */
	movl 4(%esp), %ebx /* page_list */
	movl 8(%esp), %ebp /* list of pages */
	movl 12(%esp), %edx /* start address */
	movl 16(%esp), %ecx /* cpu_has_pae */

	/* zero out flags, and disable interrupts */
	pushl $0
	popfl

	/* get physical address of control page now */
	/* this is impossible after page table switch */
	movl PTR(PA_CONTROL_PAGE)(%ebp), %edi

	/* switch to new set of page tables */
	movl PTR(PA_PGD)(%ebp), %eax
	movl %eax, %cr3

	/* setup a new stack at the end of the physical control page */
	lea 4096(%edi), %esp

	/* jump to identity mapped page */
	/* target = control page base + offset of identity_mapped in this code */
	movl %edi, %eax
	addl $(identity_mapped - relocate_kernel), %eax
	pushl %eax
	ret
165 | |||
identity_mapped:
	/*
	 * Runs from the identity-mapped control page.  Register state on
	 * entry, as loaded by relocate_new_kernel:
	 *   %ebx = page_list, %edx = start address, %ecx = cpu_has_pae
	 */
	/* store the start address on the stack */
	pushl %edx

	/* Set cr0 to a known state:
	 * 31 0 == Paging disabled
	 * 18 0 == Alignment check disabled
	 * 16 0 == Write protect disabled
	 * 3 0 == No task switch
	 * 2 0 == Don't do FP software emulation.
	 * 0 1 == Protected mode enabled
	 */
	movl %cr0, %eax
	andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
	orl $(1<<0), %eax
	movl %eax, %cr0

	/* clear cr4 if applicable */
	/* skipped when the cpu_has_pae argument (%ecx) is zero */
	testl %ecx, %ecx
	jz 1f
	/* Set cr4 to a known state:
	 * Setting everything to zero seems safe.
	 */
	movl %cr4, %eax
	andl $0, %eax
	movl %eax, %cr4

	jmp 1f
1:

	/* Flush the TLB (needed?) */
	xorl %eax, %eax
	movl %eax, %cr3

	/* Do the copies */
	/*
	 * Walk the list starting with the page_list value itself.  Each
	 * entry is a page address with flag bits in the low nibble:
	 *   0x1 destination, 0x2 indirection, 0x4 done, 0x8 source.
	 * %edi tracks the destination, %ebx the current list position.
	 */
	movl %ebx, %ecx
	jmp 1f

0:	/* top, read another word from the indirection page */
	movl (%ebx), %ecx
	addl $4, %ebx
1:
	testl $0x1, %ecx /* is it a destination page */
	jz 2f
	movl %ecx, %edi
	andl $0xfffff000, %edi
	jmp 0b
2:
	testl $0x2, %ecx /* is it an indirection page */
	jz 2f
	movl %ecx, %ebx
	andl $0xfffff000, %ebx
	jmp 0b
2:
	testl $0x4, %ecx /* is it the done indicator */
	jz 2f
	jmp 3f
2:
	testl $0x8, %ecx /* is it the source indicator */
	jz 0b /* Ignore it otherwise */
	movl %ecx, %esi /* For every source page do a copy */
	andl $0xfffff000, %esi

	/* 1024 dwords = one 4096-byte page; movsl advances %esi and %edi */
	movl $1024, %ecx
	rep ; movsl
	jmp 0b

3:

	/* To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.
	 * So I flush the TLB, it's handy, and not processor dependent.
	 */
	xorl %eax, %eax
	movl %eax, %cr3

	/* set all of the registers to known values */
	/* leave %esp alone */

	xorl %eax, %eax
	xorl %ebx, %ebx
	xorl %ecx, %ecx
	xorl %edx, %edx
	xorl %esi, %esi
	xorl %edi, %edi
	xorl %ebp, %ebp
	/* pops the start address pushed at the top of identity_mapped */
	ret
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c new file mode 100644 index 000000000000..c7d3df23f589 --- /dev/null +++ b/arch/x86/kernel/scx200_32.c | |||
@@ -0,0 +1,131 @@ | |||
1 | /* linux/arch/i386/kernel/scx200.c | ||
2 | |||
3 | Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> | ||
4 | |||
5 | National Semiconductor SCx200 support. */ | ||
6 | |||
7 | #include <linux/module.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/mutex.h> | ||
12 | #include <linux/pci.h> | ||
13 | |||
14 | #include <linux/scx200.h> | ||
15 | #include <linux/scx200_gpio.h> | ||
16 | |||
17 | /* Verify that the configuration block really is there */ | ||
18 | #define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) | ||
19 | |||
20 | #define NAME "scx200" | ||
21 | |||
22 | MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>"); | ||
23 | MODULE_DESCRIPTION("NatSemi SCx200 Driver"); | ||
24 | MODULE_LICENSE("GPL"); | ||
25 | |||
26 | unsigned scx200_gpio_base = 0; | ||
27 | long scx200_gpio_shadow[2]; | ||
28 | |||
29 | unsigned scx200_cb_base = 0; | ||
30 | |||
31 | static struct pci_device_id scx200_tbl[] = { | ||
32 | { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, | ||
33 | { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, | ||
34 | { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, | ||
35 | { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, | ||
36 | { }, | ||
37 | }; | ||
38 | MODULE_DEVICE_TABLE(pci,scx200_tbl); | ||
39 | |||
40 | static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); | ||
41 | |||
42 | static struct pci_driver scx200_pci_driver = { | ||
43 | .name = "scx200", | ||
44 | .id_table = scx200_tbl, | ||
45 | .probe = scx200_probe, | ||
46 | }; | ||
47 | |||
48 | static DEFINE_MUTEX(scx200_gpio_config_lock); | ||
49 | |||
50 | static void __devinit scx200_init_shadow(void) | ||
51 | { | ||
52 | int bank; | ||
53 | |||
54 | /* read the current values driven on the GPIO signals */ | ||
55 | for (bank = 0; bank < 2; ++bank) | ||
56 | scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); | ||
57 | } | ||
58 | |||
59 | static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) | ||
60 | { | ||
61 | unsigned base; | ||
62 | |||
63 | if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || | ||
64 | pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { | ||
65 | base = pci_resource_start(pdev, 0); | ||
66 | printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); | ||
67 | |||
68 | if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { | ||
69 | printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); | ||
70 | return -EBUSY; | ||
71 | } | ||
72 | |||
73 | scx200_gpio_base = base; | ||
74 | scx200_init_shadow(); | ||
75 | |||
76 | } else { | ||
77 | /* find the base of the Configuration Block */ | ||
78 | if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { | ||
79 | scx200_cb_base = SCx200_CB_BASE_FIXED; | ||
80 | } else { | ||
81 | pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); | ||
82 | if (scx200_cb_probe(base)) { | ||
83 | scx200_cb_base = base; | ||
84 | } else { | ||
85 | printk(KERN_WARNING NAME ": Configuration Block not found\n"); | ||
86 | return -ENODEV; | ||
87 | } | ||
88 | } | ||
89 | printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); | ||
90 | } | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) | ||
96 | { | ||
97 | u32 config, new_config; | ||
98 | |||
99 | mutex_lock(&scx200_gpio_config_lock); | ||
100 | |||
101 | outl(index, scx200_gpio_base + 0x20); | ||
102 | config = inl(scx200_gpio_base + 0x24); | ||
103 | |||
104 | new_config = (config & mask) | bits; | ||
105 | outl(new_config, scx200_gpio_base + 0x24); | ||
106 | |||
107 | mutex_unlock(&scx200_gpio_config_lock); | ||
108 | |||
109 | return config; | ||
110 | } | ||
111 | |||
112 | static int __init scx200_init(void) | ||
113 | { | ||
114 | printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); | ||
115 | |||
116 | return pci_register_driver(&scx200_pci_driver); | ||
117 | } | ||
118 | |||
119 | static void __exit scx200_cleanup(void) | ||
120 | { | ||
121 | pci_unregister_driver(&scx200_pci_driver); | ||
122 | release_region(scx200_gpio_base, SCx200_GPIO_SIZE); | ||
123 | } | ||
124 | |||
125 | module_init(scx200_init); | ||
126 | module_exit(scx200_cleanup); | ||
127 | |||
128 | EXPORT_SYMBOL(scx200_gpio_base); | ||
129 | EXPORT_SYMBOL(scx200_gpio_shadow); | ||
130 | EXPORT_SYMBOL(scx200_gpio_configure); | ||
131 | EXPORT_SYMBOL(scx200_cb_base); | ||
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c new file mode 100644 index 000000000000..d474cd639bcb --- /dev/null +++ b/arch/x86/kernel/setup_32.c | |||
@@ -0,0 +1,653 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/setup.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | ||
7 | * | ||
8 | * Memory region support | ||
9 | * David Parsons <orc@pell.chi.il.us>, July-August 1999 | ||
10 | * | ||
11 | * Added E820 sanitization routine (removes overlapping memory regions); | ||
12 | * Brian Moyle <bmoyle@mvista.com>, February 2001 | ||
13 | * | ||
14 | * Moved CPU detection code to cpu/${cpu}.c | ||
15 | * Patrick Mochel <mochel@osdl.org>, March 2002 | ||
16 | * | ||
17 | * Provisions for empty E820 memory regions (reported by certain BIOSes). | ||
18 | * Alex Achenbach <xela@slit.de>, December 2002. | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | /* | ||
23 | * This file handles the architecture-dependent parts of initialization | ||
24 | */ | ||
25 | |||
26 | #include <linux/sched.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/mmzone.h> | ||
29 | #include <linux/screen_info.h> | ||
30 | #include <linux/ioport.h> | ||
31 | #include <linux/acpi.h> | ||
32 | #include <linux/apm_bios.h> | ||
33 | #include <linux/initrd.h> | ||
34 | #include <linux/bootmem.h> | ||
35 | #include <linux/seq_file.h> | ||
36 | #include <linux/console.h> | ||
37 | #include <linux/mca.h> | ||
38 | #include <linux/root_dev.h> | ||
39 | #include <linux/highmem.h> | ||
40 | #include <linux/module.h> | ||
41 | #include <linux/efi.h> | ||
42 | #include <linux/init.h> | ||
43 | #include <linux/edd.h> | ||
44 | #include <linux/nodemask.h> | ||
45 | #include <linux/kexec.h> | ||
46 | #include <linux/crash_dump.h> | ||
47 | #include <linux/dmi.h> | ||
48 | #include <linux/pfn.h> | ||
49 | |||
50 | #include <video/edid.h> | ||
51 | |||
52 | #include <asm/apic.h> | ||
53 | #include <asm/e820.h> | ||
54 | #include <asm/mpspec.h> | ||
55 | #include <asm/mmzone.h> | ||
56 | #include <asm/setup.h> | ||
57 | #include <asm/arch_hooks.h> | ||
58 | #include <asm/sections.h> | ||
59 | #include <asm/io_apic.h> | ||
60 | #include <asm/ist.h> | ||
61 | #include <asm/io.h> | ||
62 | #include <asm/vmi.h> | ||
63 | #include <setup_arch.h> | ||
64 | #include <bios_ebda.h> | ||
65 | |||
66 | /* This value is set up by the early boot code to point to the address | ||
67 | immediately after the boot time page tables. It contains a *physical* | ||
68 | address, and must not be in the .bss segment! */ | ||
69 | unsigned long init_pg_tables_end __initdata = ~0UL; | ||
70 | |||
71 | int disable_pse __devinitdata = 0; | ||
72 | |||
73 | /* | ||
74 | * Machine setup.. | ||
75 | */ | ||
76 | extern struct resource code_resource; | ||
77 | extern struct resource data_resource; | ||
78 | |||
79 | /* cpu data as detected by the assembly code in head.S */ | ||
80 | struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | ||
81 | /* common cpu data for all cpus */ | ||
82 | struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; | ||
83 | EXPORT_SYMBOL(boot_cpu_data); | ||
84 | |||
85 | unsigned long mmu_cr4_features; | ||
86 | |||
87 | /* for MCA, but anyone else can use it if they want */ | ||
88 | unsigned int machine_id; | ||
89 | #ifdef CONFIG_MCA | ||
90 | EXPORT_SYMBOL(machine_id); | ||
91 | #endif | ||
92 | unsigned int machine_submodel_id; | ||
93 | unsigned int BIOS_revision; | ||
94 | unsigned int mca_pentium_flag; | ||
95 | |||
96 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | ||
97 | int bootloader_type; | ||
98 | |||
99 | /* user-defined highmem size */ | ||
100 | static unsigned int highmem_pages = -1; | ||
101 | |||
102 | /* | ||
103 | * Setup options | ||
104 | */ | ||
105 | struct screen_info screen_info; | ||
106 | EXPORT_SYMBOL(screen_info); | ||
107 | struct apm_info apm_info; | ||
108 | EXPORT_SYMBOL(apm_info); | ||
109 | struct edid_info edid_info; | ||
110 | EXPORT_SYMBOL_GPL(edid_info); | ||
111 | struct ist_info ist_info; | ||
112 | #if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ | ||
113 | defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) | ||
114 | EXPORT_SYMBOL(ist_info); | ||
115 | #endif | ||
116 | |||
117 | extern void early_cpu_init(void); | ||
118 | extern int root_mountflags; | ||
119 | |||
120 | unsigned long saved_videomode; | ||
121 | |||
122 | #define RAMDISK_IMAGE_START_MASK 0x07FF | ||
123 | #define RAMDISK_PROMPT_FLAG 0x8000 | ||
124 | #define RAMDISK_LOAD_FLAG 0x4000 | ||
125 | |||
126 | static char __initdata command_line[COMMAND_LINE_SIZE]; | ||
127 | |||
128 | struct boot_params __initdata boot_params; | ||
129 | |||
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
EXPORT_SYMBOL(edd);
#endif
/**
 * copy_edd() - Copy the BIOS EDD information
 *              from boot_params into a safe place.
 *
 * Fills the global edd structure: the entry counts plus the MBR
 * signature and EDD info buffers.
 */
static inline void copy_edd(void)
{
	edd.mbr_signature_nr = EDD_MBR_SIG_NR;
	edd.edd_info_nr = EDD_NR;
	memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
	memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
}
#else
/* EDD support not configured: nothing to copy. */
static inline void copy_edd(void)
{
}
#endif
152 | |||
153 | int __initdata user_defined_memmap = 0; | ||
154 | |||
155 | /* | ||
156 | * "mem=nopentium" disables the 4MB page tables. | ||
157 | * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM | ||
158 | * to <mem>, overriding the bios size. | ||
159 | * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from | ||
160 | * <start> to <start>+<mem>, overriding the bios size. | ||
161 | * | ||
162 | * HPA tells me bootloaders need to parse mem=, so no new | ||
163 | * option should be mem= [also see Documentation/i386/boot.txt] | ||
164 | */ | ||
165 | static int __init parse_mem(char *arg) | ||
166 | { | ||
167 | if (!arg) | ||
168 | return -EINVAL; | ||
169 | |||
170 | if (strcmp(arg, "nopentium") == 0) { | ||
171 | clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); | ||
172 | disable_pse = 1; | ||
173 | } else { | ||
174 | /* If the user specifies memory size, we | ||
175 | * limit the BIOS-provided memory map to | ||
176 | * that size. exactmap can be used to specify | ||
177 | * the exact map. mem=number can be used to | ||
178 | * trim the existing memory map. | ||
179 | */ | ||
180 | unsigned long long mem_size; | ||
181 | |||
182 | mem_size = memparse(arg, &arg); | ||
183 | limit_regions(mem_size); | ||
184 | user_defined_memmap = 1; | ||
185 | } | ||
186 | return 0; | ||
187 | } | ||
188 | early_param("mem", parse_mem); | ||
189 | |||
#ifdef CONFIG_PROC_VMCORE
/*
 * elfcorehdr= specifies the location of the ELF core header
 * stored by the crashed kernel.  The parsed value is stored in
 * elfcorehdr_addr; returns -EINVAL when no argument was given.
 */
static int __init parse_elfcorehdr(char *arg)
{
	if (arg == NULL)
		return -EINVAL;

	elfcorehdr_addr = memparse(arg, &arg);

	return 0;
}
early_param("elfcorehdr", parse_elfcorehdr);
#endif /* CONFIG_PROC_VMCORE */
204 | |||
205 | /* | ||
206 | * highmem=size forces highmem to be exactly 'size' bytes. | ||
207 | * This works even on boxes that have no highmem otherwise. | ||
208 | * This also works to reduce highmem size on bigger boxes. | ||
209 | */ | ||
210 | static int __init parse_highmem(char *arg) | ||
211 | { | ||
212 | if (!arg) | ||
213 | return -EINVAL; | ||
214 | |||
215 | highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; | ||
216 | return 0; | ||
217 | } | ||
218 | early_param("highmem", parse_highmem); | ||
219 | |||
220 | /* | ||
221 | * vmalloc=size forces the vmalloc area to be exactly 'size' | ||
222 | * bytes. This can be used to increase (or decrease) the | ||
223 | * vmalloc area - the default is 128m. | ||
224 | */ | ||
225 | static int __init parse_vmalloc(char *arg) | ||
226 | { | ||
227 | if (!arg) | ||
228 | return -EINVAL; | ||
229 | |||
230 | __VMALLOC_RESERVE = memparse(arg, &arg); | ||
231 | return 0; | ||
232 | } | ||
233 | early_param("vmalloc", parse_vmalloc); | ||
234 | |||
235 | /* | ||
236 | * reservetop=size reserves a hole at the top of the kernel address space which | ||
237 | * a hypervisor can load into later. Needed for dynamically loaded hypervisors, | ||
238 | * so relocating the fixmap can be done before paging initialization. | ||
239 | */ | ||
240 | static int __init parse_reservetop(char *arg) | ||
241 | { | ||
242 | unsigned long address; | ||
243 | |||
244 | if (!arg) | ||
245 | return -EINVAL; | ||
246 | |||
247 | address = memparse(arg, &arg); | ||
248 | reserve_top_address(address); | ||
249 | return 0; | ||
250 | } | ||
251 | early_param("reservetop", parse_reservetop); | ||
252 | |||
253 | /* | ||
254 | * Determine low and high memory ranges: | ||
255 | */ | ||
256 | unsigned long __init find_max_low_pfn(void) | ||
257 | { | ||
258 | unsigned long max_low_pfn; | ||
259 | |||
260 | max_low_pfn = max_pfn; | ||
261 | if (max_low_pfn > MAXMEM_PFN) { | ||
262 | if (highmem_pages == -1) | ||
263 | highmem_pages = max_pfn - MAXMEM_PFN; | ||
264 | if (highmem_pages + MAXMEM_PFN < max_pfn) | ||
265 | max_pfn = MAXMEM_PFN + highmem_pages; | ||
266 | if (highmem_pages + MAXMEM_PFN > max_pfn) { | ||
267 | printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); | ||
268 | highmem_pages = 0; | ||
269 | } | ||
270 | max_low_pfn = MAXMEM_PFN; | ||
271 | #ifndef CONFIG_HIGHMEM | ||
272 | /* Maximum memory usable is what is directly addressable */ | ||
273 | printk(KERN_WARNING "Warning only %ldMB will be used.\n", | ||
274 | MAXMEM>>20); | ||
275 | if (max_pfn > MAX_NONPAE_PFN) | ||
276 | printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); | ||
277 | else | ||
278 | printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); | ||
279 | max_pfn = MAXMEM_PFN; | ||
280 | #else /* !CONFIG_HIGHMEM */ | ||
281 | #ifndef CONFIG_HIGHMEM64G | ||
282 | if (max_pfn > MAX_NONPAE_PFN) { | ||
283 | max_pfn = MAX_NONPAE_PFN; | ||
284 | printk(KERN_WARNING "Warning only 4GB will be used.\n"); | ||
285 | printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); | ||
286 | } | ||
287 | #endif /* !CONFIG_HIGHMEM64G */ | ||
288 | #endif /* !CONFIG_HIGHMEM */ | ||
289 | } else { | ||
290 | if (highmem_pages == -1) | ||
291 | highmem_pages = 0; | ||
292 | #ifdef CONFIG_HIGHMEM | ||
293 | if (highmem_pages >= max_pfn) { | ||
294 | printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); | ||
295 | highmem_pages = 0; | ||
296 | } | ||
297 | if (highmem_pages) { | ||
298 | if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ | ||
299 | printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); | ||
300 | highmem_pages = 0; | ||
301 | } | ||
302 | max_low_pfn -= highmem_pages; | ||
303 | } | ||
304 | #else | ||
305 | if (highmem_pages) | ||
306 | printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); | ||
307 | #endif | ||
308 | } | ||
309 | return max_low_pfn; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * workaround for Dell systems that neglect to reserve EBDA | ||
314 | */ | ||
315 | static void __init reserve_ebda_region(void) | ||
316 | { | ||
317 | unsigned int addr; | ||
318 | addr = get_bios_ebda(); | ||
319 | if (addr) | ||
320 | reserve_bootmem(addr, PAGE_SIZE); | ||
321 | } | ||
322 | |||
323 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
324 | void __init setup_bootmem_allocator(void); | ||
325 | static unsigned long __init setup_memory(void) | ||
326 | { | ||
327 | /* | ||
328 | * partially used pages are not usable - thus | ||
329 | * we are rounding upwards: | ||
330 | */ | ||
331 | min_low_pfn = PFN_UP(init_pg_tables_end); | ||
332 | |||
333 | find_max_pfn(); | ||
334 | |||
335 | max_low_pfn = find_max_low_pfn(); | ||
336 | |||
337 | #ifdef CONFIG_HIGHMEM | ||
338 | highstart_pfn = highend_pfn = max_pfn; | ||
339 | if (max_pfn > max_low_pfn) { | ||
340 | highstart_pfn = max_low_pfn; | ||
341 | } | ||
342 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | ||
343 | pages_to_mb(highend_pfn - highstart_pfn)); | ||
344 | num_physpages = highend_pfn; | ||
345 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | ||
346 | #else | ||
347 | num_physpages = max_low_pfn; | ||
348 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | ||
349 | #endif | ||
350 | #ifdef CONFIG_FLATMEM | ||
351 | max_mapnr = num_physpages; | ||
352 | #endif | ||
353 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | ||
354 | pages_to_mb(max_low_pfn)); | ||
355 | |||
356 | setup_bootmem_allocator(); | ||
357 | |||
358 | return max_low_pfn; | ||
359 | } | ||
360 | |||
361 | void __init zone_sizes_init(void) | ||
362 | { | ||
363 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
364 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
365 | max_zone_pfns[ZONE_DMA] = | ||
366 | virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
367 | max_zone_pfns[ZONE_NORMAL] = max_low_pfn; | ||
368 | #ifdef CONFIG_HIGHMEM | ||
369 | max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; | ||
370 | add_active_range(0, 0, highend_pfn); | ||
371 | #else | ||
372 | add_active_range(0, 0, max_low_pfn); | ||
373 | #endif | ||
374 | |||
375 | free_area_init_nodes(max_zone_pfns); | ||
376 | } | ||
377 | #else | ||
378 | extern unsigned long __init setup_memory(void); | ||
379 | extern void zone_sizes_init(void); | ||
380 | #endif /* !CONFIG_NEED_MULTIPLE_NODES */ | ||
381 | |||
/*
 * Initialise the boot-time allocator over low memory and reserve every
 * range that must not be handed out: the kernel image and bootmem
 * bitmap, physical page 0, the EBDA, and -- depending on CPU and
 * configuration -- the page below VGA, the SMP trampoline page, ACPI
 * sleep memory, SMP config tables, NUMA KVA space, the initrd and the
 * crash kernel region.
 */
void __init setup_bootmem_allocator(void)
{
	unsigned long bootmap_size;
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);

	register_bootmem_low_pages(max_low_pfn);

	/*
	 * Reserve the bootmem bitmap itself as well. We do this in two
	 * steps (first step was init_bootmem()) because this catches
	 * the (very unlikely) case of us accidentally initializing the
	 * bootmem allocator with an invalid RAM area.
	 */
	/* range runs from _text to the end of the bitmap placed at min_low_pfn */
	reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
			 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));

	/*
	 * reserve physical page 0 - it's a special BIOS page on many boxes,
	 * enabling clean reboots, SMP operation, laptop functions.
	 */
	reserve_bootmem(0, PAGE_SIZE);

	/* reserve EBDA region, it's a 4K region */
	reserve_ebda_region();

	/* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
	   PCI prefetch into it (errata #56). Usually the page is reserved anyways,
	   unless you have no PS/2 mouse plugged in. */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
	    boot_cpu_data.x86 == 6)
		reserve_bootmem(0xa0000 - 4096, 4096);

#ifdef CONFIG_SMP
	/*
	 * But first pinch a few for the stack/trampoline stuff
	 * FIXME: Don't need the extra page at 4K, but need to fix
	 * trampoline before removing it. (see the GDT stuff)
	 */
	reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
#endif
#ifdef CONFIG_ACPI_SLEEP
	/*
	 * Reserve low memory region for sleep support.
	 */
	acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_FIND_SMP_CONFIG
	/*
	 * Find and reserve possible boot-time SMP configuration:
	 */
	find_smp_config();
#endif
	numa_kva_reserve();
#ifdef CONFIG_BLK_DEV_INITRD
	/* keep the initrd only if it lies entirely within low memory */
	if (LOADER_TYPE && INITRD_START) {
		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
			reserve_bootmem(INITRD_START, INITRD_SIZE);
			initrd_start = INITRD_START + PAGE_OFFSET;
			initrd_end = initrd_start+INITRD_SIZE;
		}
		else {
			printk(KERN_ERR "initrd extends beyond end of memory "
			    "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
			    INITRD_START + INITRD_SIZE,
			    max_low_pfn << PAGE_SHIFT);
			initrd_start = 0;
		}
	}
#endif
#ifdef CONFIG_KEXEC
	/* start == end means no crash kernel region was set up */
	if (crashk_res.start != crashk_res.end)
		reserve_bootmem(crashk_res.start,
			crashk_res.end - crashk_res.start + 1);
#endif
}
460 | |||
461 | /* | ||
462 | * The node 0 pgdat is initialized before all of these because | ||
463 | * it's needed for bootmem. node>0 pgdats have their virtual | ||
464 | * space allocated before the pagetables are in place to access | ||
465 | * them, so they can't be cleared then. | ||
466 | * | ||
467 | * This should all compile down to nothing when NUMA is off. | ||
468 | */ | ||
469 | static void __init remapped_pgdat_init(void) | ||
470 | { | ||
471 | int nid; | ||
472 | |||
473 | for_each_online_node(nid) { | ||
474 | if (nid != 0) | ||
475 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | ||
476 | } | ||
477 | } | ||
478 | |||
/*
 * Store the caller's value in MCA_bus.  When the kernel is built without
 * CONFIG_MCA the assignment is compiled out and the call does nothing.
 */
static void set_mca_bus(int x)
{
#ifdef CONFIG_MCA
	MCA_bus = x;
#endif
}
487 | |||
488 | /* Overridden in paravirt.c if CONFIG_PARAVIRT */ | ||
489 | char * __init __attribute__((weak)) memory_setup(void) | ||
490 | { | ||
491 | return machine_specific_memory_setup(); | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Determine if we were loaded by an EFI loader. If so, then we have also been | ||
496 | * passed the efi memmap, systab, etc., so we should use these data structures | ||
497 | * for initialization. Note, the efi init code path is determined by the | ||
498 | * global efi_enabled. This allows the same kernel image to be used on existing | ||
499 | * systems (with a traditional BIOS) as well as on EFI systems. | ||
500 | */ | ||
501 | void __init setup_arch(char **cmdline_p) | ||
502 | { | ||
503 | unsigned long max_low_pfn; | ||
504 | |||
505 | memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); | ||
506 | pre_setup_arch_hook(); | ||
507 | early_cpu_init(); | ||
508 | |||
509 | /* | ||
510 | * FIXME: This isn't an official loader_type right | ||
511 | * now but does currently work with elilo. | ||
512 | * If we were configured as an EFI kernel, check to make | ||
513 | * sure that we were loaded correctly from elilo and that | ||
514 | * the system table is valid. If not, then initialize normally. | ||
515 | */ | ||
516 | #ifdef CONFIG_EFI | ||
517 | if ((LOADER_TYPE == 0x50) && EFI_SYSTAB) | ||
518 | efi_enabled = 1; | ||
519 | #endif | ||
520 | |||
521 | ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); | ||
522 | screen_info = SCREEN_INFO; | ||
523 | edid_info = EDID_INFO; | ||
524 | apm_info.bios = APM_BIOS_INFO; | ||
525 | ist_info = IST_INFO; | ||
526 | saved_videomode = VIDEO_MODE; | ||
527 | if( SYS_DESC_TABLE.length != 0 ) { | ||
528 | set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2); | ||
529 | machine_id = SYS_DESC_TABLE.table[0]; | ||
530 | machine_submodel_id = SYS_DESC_TABLE.table[1]; | ||
531 | BIOS_revision = SYS_DESC_TABLE.table[2]; | ||
532 | } | ||
533 | bootloader_type = LOADER_TYPE; | ||
534 | |||
535 | #ifdef CONFIG_BLK_DEV_RAM | ||
536 | rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; | ||
537 | rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); | ||
538 | rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); | ||
539 | #endif | ||
540 | ARCH_SETUP | ||
541 | if (efi_enabled) | ||
542 | efi_init(); | ||
543 | else { | ||
544 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
545 | print_memory_map(memory_setup()); | ||
546 | } | ||
547 | |||
548 | copy_edd(); | ||
549 | |||
550 | if (!MOUNT_ROOT_RDONLY) | ||
551 | root_mountflags &= ~MS_RDONLY; | ||
552 | init_mm.start_code = (unsigned long) _text; | ||
553 | init_mm.end_code = (unsigned long) _etext; | ||
554 | init_mm.end_data = (unsigned long) _edata; | ||
555 | init_mm.brk = init_pg_tables_end + PAGE_OFFSET; | ||
556 | |||
557 | code_resource.start = virt_to_phys(_text); | ||
558 | code_resource.end = virt_to_phys(_etext)-1; | ||
559 | data_resource.start = virt_to_phys(_etext); | ||
560 | data_resource.end = virt_to_phys(_edata)-1; | ||
561 | |||
562 | parse_early_param(); | ||
563 | |||
564 | if (user_defined_memmap) { | ||
565 | printk(KERN_INFO "user-defined physical RAM map:\n"); | ||
566 | print_memory_map("user"); | ||
567 | } | ||
568 | |||
569 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | ||
570 | *cmdline_p = command_line; | ||
571 | |||
572 | max_low_pfn = setup_memory(); | ||
573 | |||
574 | #ifdef CONFIG_VMI | ||
575 | /* | ||
576 | * Must be after max_low_pfn is determined, and before kernel | ||
577 | * pagetables are setup. | ||
578 | */ | ||
579 | vmi_init(); | ||
580 | #endif | ||
581 | |||
582 | /* | ||
583 | * NOTE: before this point _nobody_ is allowed to allocate | ||
584 | * any memory using the bootmem allocator. Although the | ||
585 | * alloctor is now initialised only the first 8Mb of the kernel | ||
586 | * virtual address space has been mapped. All allocations before | ||
587 | * paging_init() has completed must use the alloc_bootmem_low_pages() | ||
588 | * variant (which allocates DMA'able memory) and care must be taken | ||
589 | * not to exceed the 8Mb limit. | ||
590 | */ | ||
591 | |||
592 | #ifdef CONFIG_SMP | ||
593 | smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ | ||
594 | #endif | ||
595 | paging_init(); | ||
596 | remapped_pgdat_init(); | ||
597 | sparse_init(); | ||
598 | zone_sizes_init(); | ||
599 | |||
600 | /* | ||
601 | * NOTE: at this point the bootmem allocator is fully available. | ||
602 | */ | ||
603 | |||
604 | paravirt_post_allocator_init(); | ||
605 | |||
606 | dmi_scan_machine(); | ||
607 | |||
608 | #ifdef CONFIG_X86_GENERICARCH | ||
609 | generic_apic_probe(); | ||
610 | #endif | ||
611 | if (efi_enabled) | ||
612 | efi_map_memmap(); | ||
613 | |||
614 | #ifdef CONFIG_ACPI | ||
615 | /* | ||
616 | * Parse the ACPI tables for possible boot-time SMP configuration. | ||
617 | */ | ||
618 | acpi_boot_table_init(); | ||
619 | #endif | ||
620 | |||
621 | #ifdef CONFIG_PCI | ||
622 | #ifdef CONFIG_X86_IO_APIC | ||
623 | check_acpi_pci(); /* Checks more than just ACPI actually */ | ||
624 | #endif | ||
625 | #endif | ||
626 | |||
627 | #ifdef CONFIG_ACPI | ||
628 | acpi_boot_init(); | ||
629 | |||
630 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) | ||
631 | if (def_to_bigsmp) | ||
632 | printk(KERN_WARNING "More than 8 CPUs detected and " | ||
633 | "CONFIG_X86_PC cannot handle it.\nUse " | ||
634 | "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); | ||
635 | #endif | ||
636 | #endif | ||
637 | #ifdef CONFIG_X86_LOCAL_APIC | ||
638 | if (smp_found_config) | ||
639 | get_smp_config(); | ||
640 | #endif | ||
641 | |||
642 | e820_register_memory(); | ||
643 | e820_mark_nosave_regions(); | ||
644 | |||
645 | #ifdef CONFIG_VT | ||
646 | #if defined(CONFIG_VGA_CONSOLE) | ||
647 | if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) | ||
648 | conswitchp = &vga_con; | ||
649 | #elif defined(CONFIG_DUMMY_CONSOLE) | ||
650 | conswitchp = &dummy_con; | ||
651 | #endif | ||
652 | #endif | ||
653 | } | ||
diff --git a/arch/x86/kernel/sigframe_32.h b/arch/x86/kernel/sigframe_32.h new file mode 100644 index 000000000000..0b2221711dad --- /dev/null +++ b/arch/x86/kernel/sigframe_32.h | |||
@@ -0,0 +1,21 @@ | |||
1 | struct sigframe | ||
2 | { | ||
3 | char __user *pretcode; | ||
4 | int sig; | ||
5 | struct sigcontext sc; | ||
6 | struct _fpstate fpstate; | ||
7 | unsigned long extramask[_NSIG_WORDS-1]; | ||
8 | char retcode[8]; | ||
9 | }; | ||
10 | |||
11 | struct rt_sigframe | ||
12 | { | ||
13 | char __user *pretcode; | ||
14 | int sig; | ||
15 | struct siginfo __user *pinfo; | ||
16 | void __user *puc; | ||
17 | struct siginfo info; | ||
18 | struct ucontext uc; | ||
19 | struct _fpstate fpstate; | ||
20 | char retcode[8]; | ||
21 | }; | ||
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c new file mode 100644 index 000000000000..c03570f7fe8e --- /dev/null +++ b/arch/x86/kernel/signal_32.c | |||
@@ -0,0 +1,667 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson | ||
7 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | ||
8 | */ | ||
9 | |||
10 | #include <linux/sched.h> | ||
11 | #include <linux/mm.h> | ||
12 | #include <linux/smp.h> | ||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/signal.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/wait.h> | ||
17 | #include <linux/unistd.h> | ||
18 | #include <linux/stddef.h> | ||
19 | #include <linux/personality.h> | ||
20 | #include <linux/suspend.h> | ||
21 | #include <linux/ptrace.h> | ||
22 | #include <linux/elf.h> | ||
23 | #include <linux/binfmts.h> | ||
24 | #include <asm/processor.h> | ||
25 | #include <asm/ucontext.h> | ||
26 | #include <asm/uaccess.h> | ||
27 | #include <asm/i387.h> | ||
28 | #include "sigframe_32.h" | ||
29 | |||
/* Non-zero compiles in the "SIG deliver" debug printk()s further down. */
#define DEBUG_SIG 0

/* Mask of all signals except SIGKILL and SIGSTOP. */
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
33 | |||
34 | /* | ||
35 | * Atomically swap in the new signal mask, and wait for a signal. | ||
36 | */ | ||
37 | asmlinkage int | ||
38 | sys_sigsuspend(int history0, int history1, old_sigset_t mask) | ||
39 | { | ||
40 | mask &= _BLOCKABLE; | ||
41 | spin_lock_irq(¤t->sighand->siglock); | ||
42 | current->saved_sigmask = current->blocked; | ||
43 | siginitset(¤t->blocked, mask); | ||
44 | recalc_sigpending(); | ||
45 | spin_unlock_irq(¤t->sighand->siglock); | ||
46 | |||
47 | current->state = TASK_INTERRUPTIBLE; | ||
48 | schedule(); | ||
49 | set_thread_flag(TIF_RESTORE_SIGMASK); | ||
50 | return -ERESTARTNOHAND; | ||
51 | } | ||
52 | |||
53 | asmlinkage int | ||
54 | sys_sigaction(int sig, const struct old_sigaction __user *act, | ||
55 | struct old_sigaction __user *oact) | ||
56 | { | ||
57 | struct k_sigaction new_ka, old_ka; | ||
58 | int ret; | ||
59 | |||
60 | if (act) { | ||
61 | old_sigset_t mask; | ||
62 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
63 | __get_user(new_ka.sa.sa_handler, &act->sa_handler) || | ||
64 | __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) | ||
65 | return -EFAULT; | ||
66 | __get_user(new_ka.sa.sa_flags, &act->sa_flags); | ||
67 | __get_user(mask, &act->sa_mask); | ||
68 | siginitset(&new_ka.sa.sa_mask, mask); | ||
69 | } | ||
70 | |||
71 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
72 | |||
73 | if (!ret && oact) { | ||
74 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
75 | __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || | ||
76 | __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) | ||
77 | return -EFAULT; | ||
78 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags); | ||
79 | __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); | ||
80 | } | ||
81 | |||
82 | return ret; | ||
83 | } | ||
84 | |||
85 | asmlinkage int | ||
86 | sys_sigaltstack(unsigned long ebx) | ||
87 | { | ||
88 | /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ | ||
89 | struct pt_regs *regs = (struct pt_regs *)&ebx; | ||
90 | const stack_t __user *uss = (const stack_t __user *)ebx; | ||
91 | stack_t __user *uoss = (stack_t __user *)regs->ecx; | ||
92 | |||
93 | return do_sigaltstack(uss, uoss, regs->esp); | ||
94 | } | ||
95 | |||
96 | |||
97 | /* | ||
98 | * Do a signal return; undo the signal stack. | ||
99 | */ | ||
100 | |||
101 | static int | ||
102 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax) | ||
103 | { | ||
104 | unsigned int err = 0; | ||
105 | |||
106 | /* Always make any pending restarted system calls return -EINTR */ | ||
107 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
108 | |||
109 | #define COPY(x) err |= __get_user(regs->x, &sc->x) | ||
110 | |||
111 | #define COPY_SEG(seg) \ | ||
112 | { unsigned short tmp; \ | ||
113 | err |= __get_user(tmp, &sc->seg); \ | ||
114 | regs->x##seg = tmp; } | ||
115 | |||
116 | #define COPY_SEG_STRICT(seg) \ | ||
117 | { unsigned short tmp; \ | ||
118 | err |= __get_user(tmp, &sc->seg); \ | ||
119 | regs->x##seg = tmp|3; } | ||
120 | |||
121 | #define GET_SEG(seg) \ | ||
122 | { unsigned short tmp; \ | ||
123 | err |= __get_user(tmp, &sc->seg); \ | ||
124 | loadsegment(seg,tmp); } | ||
125 | |||
126 | #define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \ | ||
127 | X86_EFLAGS_OF | X86_EFLAGS_DF | \ | ||
128 | X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ | ||
129 | X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) | ||
130 | |||
131 | GET_SEG(gs); | ||
132 | COPY_SEG(fs); | ||
133 | COPY_SEG(es); | ||
134 | COPY_SEG(ds); | ||
135 | COPY(edi); | ||
136 | COPY(esi); | ||
137 | COPY(ebp); | ||
138 | COPY(esp); | ||
139 | COPY(ebx); | ||
140 | COPY(edx); | ||
141 | COPY(ecx); | ||
142 | COPY(eip); | ||
143 | COPY_SEG_STRICT(cs); | ||
144 | COPY_SEG_STRICT(ss); | ||
145 | |||
146 | { | ||
147 | unsigned int tmpflags; | ||
148 | err |= __get_user(tmpflags, &sc->eflags); | ||
149 | regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); | ||
150 | regs->orig_eax = -1; /* disable syscall checks */ | ||
151 | } | ||
152 | |||
153 | { | ||
154 | struct _fpstate __user * buf; | ||
155 | err |= __get_user(buf, &sc->fpstate); | ||
156 | if (buf) { | ||
157 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
158 | goto badframe; | ||
159 | err |= restore_i387(buf); | ||
160 | } else { | ||
161 | struct task_struct *me = current; | ||
162 | if (used_math()) { | ||
163 | clear_fpu(me); | ||
164 | clear_used_math(); | ||
165 | } | ||
166 | } | ||
167 | } | ||
168 | |||
169 | err |= __get_user(*peax, &sc->eax); | ||
170 | return err; | ||
171 | |||
172 | badframe: | ||
173 | return 1; | ||
174 | } | ||
175 | |||
176 | asmlinkage int sys_sigreturn(unsigned long __unused) | ||
177 | { | ||
178 | struct pt_regs *regs = (struct pt_regs *) &__unused; | ||
179 | struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); | ||
180 | sigset_t set; | ||
181 | int eax; | ||
182 | |||
183 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | ||
184 | goto badframe; | ||
185 | if (__get_user(set.sig[0], &frame->sc.oldmask) | ||
186 | || (_NSIG_WORDS > 1 | ||
187 | && __copy_from_user(&set.sig[1], &frame->extramask, | ||
188 | sizeof(frame->extramask)))) | ||
189 | goto badframe; | ||
190 | |||
191 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
192 | spin_lock_irq(¤t->sighand->siglock); | ||
193 | current->blocked = set; | ||
194 | recalc_sigpending(); | ||
195 | spin_unlock_irq(¤t->sighand->siglock); | ||
196 | |||
197 | if (restore_sigcontext(regs, &frame->sc, &eax)) | ||
198 | goto badframe; | ||
199 | return eax; | ||
200 | |||
201 | badframe: | ||
202 | if (show_unhandled_signals && printk_ratelimit()) | ||
203 | printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" | ||
204 | " esp:%lx oeax:%lx\n", | ||
205 | current->pid > 1 ? KERN_INFO : KERN_EMERG, | ||
206 | current->comm, current->pid, frame, regs->eip, | ||
207 | regs->esp, regs->orig_eax); | ||
208 | |||
209 | force_sig(SIGSEGV, current); | ||
210 | return 0; | ||
211 | } | ||
212 | |||
213 | asmlinkage int sys_rt_sigreturn(unsigned long __unused) | ||
214 | { | ||
215 | struct pt_regs *regs = (struct pt_regs *) &__unused; | ||
216 | struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); | ||
217 | sigset_t set; | ||
218 | int eax; | ||
219 | |||
220 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | ||
221 | goto badframe; | ||
222 | if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) | ||
223 | goto badframe; | ||
224 | |||
225 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
226 | spin_lock_irq(¤t->sighand->siglock); | ||
227 | current->blocked = set; | ||
228 | recalc_sigpending(); | ||
229 | spin_unlock_irq(¤t->sighand->siglock); | ||
230 | |||
231 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) | ||
232 | goto badframe; | ||
233 | |||
234 | if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) | ||
235 | goto badframe; | ||
236 | |||
237 | return eax; | ||
238 | |||
239 | badframe: | ||
240 | force_sig(SIGSEGV, current); | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | /* | ||
245 | * Set up a signal frame. | ||
246 | */ | ||
247 | |||
248 | static int | ||
249 | setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, | ||
250 | struct pt_regs *regs, unsigned long mask) | ||
251 | { | ||
252 | int tmp, err = 0; | ||
253 | |||
254 | err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); | ||
255 | savesegment(gs, tmp); | ||
256 | err |= __put_user(tmp, (unsigned int __user *)&sc->gs); | ||
257 | |||
258 | err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); | ||
259 | err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); | ||
260 | err |= __put_user(regs->edi, &sc->edi); | ||
261 | err |= __put_user(regs->esi, &sc->esi); | ||
262 | err |= __put_user(regs->ebp, &sc->ebp); | ||
263 | err |= __put_user(regs->esp, &sc->esp); | ||
264 | err |= __put_user(regs->ebx, &sc->ebx); | ||
265 | err |= __put_user(regs->edx, &sc->edx); | ||
266 | err |= __put_user(regs->ecx, &sc->ecx); | ||
267 | err |= __put_user(regs->eax, &sc->eax); | ||
268 | err |= __put_user(current->thread.trap_no, &sc->trapno); | ||
269 | err |= __put_user(current->thread.error_code, &sc->err); | ||
270 | err |= __put_user(regs->eip, &sc->eip); | ||
271 | err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); | ||
272 | err |= __put_user(regs->eflags, &sc->eflags); | ||
273 | err |= __put_user(regs->esp, &sc->esp_at_signal); | ||
274 | err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); | ||
275 | |||
276 | tmp = save_i387(fpstate); | ||
277 | if (tmp < 0) | ||
278 | err = 1; | ||
279 | else | ||
280 | err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); | ||
281 | |||
282 | /* non-iBCS2 extensions.. */ | ||
283 | err |= __put_user(mask, &sc->oldmask); | ||
284 | err |= __put_user(current->thread.cr2, &sc->cr2); | ||
285 | |||
286 | return err; | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * Determine which stack to use.. | ||
291 | */ | ||
292 | static inline void __user * | ||
293 | get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) | ||
294 | { | ||
295 | unsigned long esp; | ||
296 | |||
297 | /* Default to using normal stack */ | ||
298 | esp = regs->esp; | ||
299 | |||
300 | /* This is the X/Open sanctioned signal stack switching. */ | ||
301 | if (ka->sa.sa_flags & SA_ONSTACK) { | ||
302 | if (sas_ss_flags(esp) == 0) | ||
303 | esp = current->sas_ss_sp + current->sas_ss_size; | ||
304 | } | ||
305 | |||
306 | /* This is the legacy signal stack switching. */ | ||
307 | else if ((regs->xss & 0xffff) != __USER_DS && | ||
308 | !(ka->sa.sa_flags & SA_RESTORER) && | ||
309 | ka->sa.sa_restorer) { | ||
310 | esp = (unsigned long) ka->sa.sa_restorer; | ||
311 | } | ||
312 | |||
313 | esp -= frame_size; | ||
314 | /* Align the stack pointer according to the i386 ABI, | ||
315 | * i.e. so that on function entry ((sp + 4) & 15) == 0. */ | ||
316 | esp = ((esp + 4) & -16ul) - 4; | ||
317 | return (void __user *) esp; | ||
318 | } | ||
319 | |||
320 | /* These symbols are defined with the addresses in the vsyscall page. | ||
321 | See vsyscall-sigreturn.S. */ | ||
322 | extern void __user __kernel_sigreturn; | ||
323 | extern void __user __kernel_rt_sigreturn; | ||
324 | |||
325 | static int setup_frame(int sig, struct k_sigaction *ka, | ||
326 | sigset_t *set, struct pt_regs * regs) | ||
327 | { | ||
328 | void __user *restorer; | ||
329 | struct sigframe __user *frame; | ||
330 | int err = 0; | ||
331 | int usig; | ||
332 | |||
333 | frame = get_sigframe(ka, regs, sizeof(*frame)); | ||
334 | |||
335 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
336 | goto give_sigsegv; | ||
337 | |||
338 | usig = current_thread_info()->exec_domain | ||
339 | && current_thread_info()->exec_domain->signal_invmap | ||
340 | && sig < 32 | ||
341 | ? current_thread_info()->exec_domain->signal_invmap[sig] | ||
342 | : sig; | ||
343 | |||
344 | err = __put_user(usig, &frame->sig); | ||
345 | if (err) | ||
346 | goto give_sigsegv; | ||
347 | |||
348 | err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); | ||
349 | if (err) | ||
350 | goto give_sigsegv; | ||
351 | |||
352 | if (_NSIG_WORDS > 1) { | ||
353 | err = __copy_to_user(&frame->extramask, &set->sig[1], | ||
354 | sizeof(frame->extramask)); | ||
355 | if (err) | ||
356 | goto give_sigsegv; | ||
357 | } | ||
358 | |||
359 | if (current->binfmt->hasvdso) | ||
360 | restorer = (void *)VDSO_SYM(&__kernel_sigreturn); | ||
361 | else | ||
362 | restorer = (void *)&frame->retcode; | ||
363 | if (ka->sa.sa_flags & SA_RESTORER) | ||
364 | restorer = ka->sa.sa_restorer; | ||
365 | |||
366 | /* Set up to return from userspace. */ | ||
367 | err |= __put_user(restorer, &frame->pretcode); | ||
368 | |||
369 | /* | ||
370 | * This is popl %eax ; movl $,%eax ; int $0x80 | ||
371 | * | ||
372 | * WE DO NOT USE IT ANY MORE! It's only left here for historical | ||
373 | * reasons and because gdb uses it as a signature to notice | ||
374 | * signal handler stack frames. | ||
375 | */ | ||
376 | err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); | ||
377 | err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); | ||
378 | err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); | ||
379 | |||
380 | if (err) | ||
381 | goto give_sigsegv; | ||
382 | |||
383 | /* Set up registers for signal handler */ | ||
384 | regs->esp = (unsigned long) frame; | ||
385 | regs->eip = (unsigned long) ka->sa.sa_handler; | ||
386 | regs->eax = (unsigned long) sig; | ||
387 | regs->edx = (unsigned long) 0; | ||
388 | regs->ecx = (unsigned long) 0; | ||
389 | |||
390 | set_fs(USER_DS); | ||
391 | regs->xds = __USER_DS; | ||
392 | regs->xes = __USER_DS; | ||
393 | regs->xss = __USER_DS; | ||
394 | regs->xcs = __USER_CS; | ||
395 | |||
396 | /* | ||
397 | * Clear TF when entering the signal handler, but | ||
398 | * notify any tracer that was single-stepping it. | ||
399 | * The tracer may want to single-step inside the | ||
400 | * handler too. | ||
401 | */ | ||
402 | regs->eflags &= ~TF_MASK; | ||
403 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
404 | ptrace_notify(SIGTRAP); | ||
405 | |||
406 | #if DEBUG_SIG | ||
407 | printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", | ||
408 | current->comm, current->pid, frame, regs->eip, frame->pretcode); | ||
409 | #endif | ||
410 | |||
411 | return 0; | ||
412 | |||
413 | give_sigsegv: | ||
414 | force_sigsegv(sig, current); | ||
415 | return -EFAULT; | ||
416 | } | ||
417 | |||
418 | static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
419 | sigset_t *set, struct pt_regs * regs) | ||
420 | { | ||
421 | void __user *restorer; | ||
422 | struct rt_sigframe __user *frame; | ||
423 | int err = 0; | ||
424 | int usig; | ||
425 | |||
426 | frame = get_sigframe(ka, regs, sizeof(*frame)); | ||
427 | |||
428 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
429 | goto give_sigsegv; | ||
430 | |||
431 | usig = current_thread_info()->exec_domain | ||
432 | && current_thread_info()->exec_domain->signal_invmap | ||
433 | && sig < 32 | ||
434 | ? current_thread_info()->exec_domain->signal_invmap[sig] | ||
435 | : sig; | ||
436 | |||
437 | err |= __put_user(usig, &frame->sig); | ||
438 | err |= __put_user(&frame->info, &frame->pinfo); | ||
439 | err |= __put_user(&frame->uc, &frame->puc); | ||
440 | err |= copy_siginfo_to_user(&frame->info, info); | ||
441 | if (err) | ||
442 | goto give_sigsegv; | ||
443 | |||
444 | /* Create the ucontext. */ | ||
445 | err |= __put_user(0, &frame->uc.uc_flags); | ||
446 | err |= __put_user(0, &frame->uc.uc_link); | ||
447 | err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | ||
448 | err |= __put_user(sas_ss_flags(regs->esp), | ||
449 | &frame->uc.uc_stack.ss_flags); | ||
450 | err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
451 | err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, | ||
452 | regs, set->sig[0]); | ||
453 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | ||
454 | if (err) | ||
455 | goto give_sigsegv; | ||
456 | |||
457 | /* Set up to return from userspace. */ | ||
458 | restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); | ||
459 | if (ka->sa.sa_flags & SA_RESTORER) | ||
460 | restorer = ka->sa.sa_restorer; | ||
461 | err |= __put_user(restorer, &frame->pretcode); | ||
462 | |||
463 | /* | ||
464 | * This is movl $,%eax ; int $0x80 | ||
465 | * | ||
466 | * WE DO NOT USE IT ANY MORE! It's only left here for historical | ||
467 | * reasons and because gdb uses it as a signature to notice | ||
468 | * signal handler stack frames. | ||
469 | */ | ||
470 | err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); | ||
471 | err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); | ||
472 | err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); | ||
473 | |||
474 | if (err) | ||
475 | goto give_sigsegv; | ||
476 | |||
477 | /* Set up registers for signal handler */ | ||
478 | regs->esp = (unsigned long) frame; | ||
479 | regs->eip = (unsigned long) ka->sa.sa_handler; | ||
480 | regs->eax = (unsigned long) usig; | ||
481 | regs->edx = (unsigned long) &frame->info; | ||
482 | regs->ecx = (unsigned long) &frame->uc; | ||
483 | |||
484 | set_fs(USER_DS); | ||
485 | regs->xds = __USER_DS; | ||
486 | regs->xes = __USER_DS; | ||
487 | regs->xss = __USER_DS; | ||
488 | regs->xcs = __USER_CS; | ||
489 | |||
490 | /* | ||
491 | * Clear TF when entering the signal handler, but | ||
492 | * notify any tracer that was single-stepping it. | ||
493 | * The tracer may want to single-step inside the | ||
494 | * handler too. | ||
495 | */ | ||
496 | regs->eflags &= ~TF_MASK; | ||
497 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
498 | ptrace_notify(SIGTRAP); | ||
499 | |||
500 | #if DEBUG_SIG | ||
501 | printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", | ||
502 | current->comm, current->pid, frame, regs->eip, frame->pretcode); | ||
503 | #endif | ||
504 | |||
505 | return 0; | ||
506 | |||
507 | give_sigsegv: | ||
508 | force_sigsegv(sig, current); | ||
509 | return -EFAULT; | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * OK, we're invoking a handler | ||
514 | */ | ||
515 | |||
516 | static int | ||
517 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | ||
518 | sigset_t *oldset, struct pt_regs * regs) | ||
519 | { | ||
520 | int ret; | ||
521 | |||
522 | /* Are we from a system call? */ | ||
523 | if (regs->orig_eax >= 0) { | ||
524 | /* If so, check system call restarting.. */ | ||
525 | switch (regs->eax) { | ||
526 | case -ERESTART_RESTARTBLOCK: | ||
527 | case -ERESTARTNOHAND: | ||
528 | regs->eax = -EINTR; | ||
529 | break; | ||
530 | |||
531 | case -ERESTARTSYS: | ||
532 | if (!(ka->sa.sa_flags & SA_RESTART)) { | ||
533 | regs->eax = -EINTR; | ||
534 | break; | ||
535 | } | ||
536 | /* fallthrough */ | ||
537 | case -ERESTARTNOINTR: | ||
538 | regs->eax = regs->orig_eax; | ||
539 | regs->eip -= 2; | ||
540 | } | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so | ||
545 | * that register information in the sigcontext is correct. | ||
546 | */ | ||
547 | if (unlikely(regs->eflags & TF_MASK) | ||
548 | && likely(current->ptrace & PT_DTRACE)) { | ||
549 | current->ptrace &= ~PT_DTRACE; | ||
550 | regs->eflags &= ~TF_MASK; | ||
551 | } | ||
552 | |||
553 | /* Set up the stack frame */ | ||
554 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
555 | ret = setup_rt_frame(sig, ka, info, oldset, regs); | ||
556 | else | ||
557 | ret = setup_frame(sig, ka, oldset, regs); | ||
558 | |||
559 | if (ret == 0) { | ||
560 | spin_lock_irq(¤t->sighand->siglock); | ||
561 | sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); | ||
562 | if (!(ka->sa.sa_flags & SA_NODEFER)) | ||
563 | sigaddset(¤t->blocked,sig); | ||
564 | recalc_sigpending(); | ||
565 | spin_unlock_irq(¤t->sighand->siglock); | ||
566 | } | ||
567 | |||
568 | return ret; | ||
569 | } | ||
570 | |||
571 | /* | ||
572 | * Note that 'init' is a special process: it doesn't get signals it doesn't | ||
573 | * want to handle. Thus you cannot kill init even with a SIGKILL even by | ||
574 | * mistake. | ||
575 | */ | ||
576 | static void fastcall do_signal(struct pt_regs *regs) | ||
577 | { | ||
578 | siginfo_t info; | ||
579 | int signr; | ||
580 | struct k_sigaction ka; | ||
581 | sigset_t *oldset; | ||
582 | |||
583 | /* | ||
584 | * We want the common case to go fast, which | ||
585 | * is why we may in certain cases get here from | ||
586 | * kernel mode. Just return without doing anything | ||
587 | * if so. vm86 regs switched out by assembly code | ||
588 | * before reaching here, so testing against kernel | ||
589 | * CS suffices. | ||
590 | */ | ||
591 | if (!user_mode(regs)) | ||
592 | return; | ||
593 | |||
594 | if (test_thread_flag(TIF_RESTORE_SIGMASK)) | ||
595 | oldset = ¤t->saved_sigmask; | ||
596 | else | ||
597 | oldset = ¤t->blocked; | ||
598 | |||
599 | signr = get_signal_to_deliver(&info, &ka, regs, NULL); | ||
600 | if (signr > 0) { | ||
601 | /* Reenable any watchpoints before delivering the | ||
602 | * signal to user space. The processor register will | ||
603 | * have been cleared if the watchpoint triggered | ||
604 | * inside the kernel. | ||
605 | */ | ||
606 | if (unlikely(current->thread.debugreg[7])) | ||
607 | set_debugreg(current->thread.debugreg[7], 7); | ||
608 | |||
609 | /* Whee! Actually deliver the signal. */ | ||
610 | if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { | ||
611 | /* a signal was successfully delivered; the saved | ||
612 | * sigmask will have been stored in the signal frame, | ||
613 | * and will be restored by sigreturn, so we can simply | ||
614 | * clear the TIF_RESTORE_SIGMASK flag */ | ||
615 | if (test_thread_flag(TIF_RESTORE_SIGMASK)) | ||
616 | clear_thread_flag(TIF_RESTORE_SIGMASK); | ||
617 | } | ||
618 | |||
619 | return; | ||
620 | } | ||
621 | |||
622 | /* Did we come from a system call? */ | ||
623 | if (regs->orig_eax >= 0) { | ||
624 | /* Restart the system call - no handlers present */ | ||
625 | switch (regs->eax) { | ||
626 | case -ERESTARTNOHAND: | ||
627 | case -ERESTARTSYS: | ||
628 | case -ERESTARTNOINTR: | ||
629 | regs->eax = regs->orig_eax; | ||
630 | regs->eip -= 2; | ||
631 | break; | ||
632 | |||
633 | case -ERESTART_RESTARTBLOCK: | ||
634 | regs->eax = __NR_restart_syscall; | ||
635 | regs->eip -= 2; | ||
636 | break; | ||
637 | } | ||
638 | } | ||
639 | |||
640 | /* if there's no signal to deliver, we just put the saved sigmask | ||
641 | * back */ | ||
642 | if (test_thread_flag(TIF_RESTORE_SIGMASK)) { | ||
643 | clear_thread_flag(TIF_RESTORE_SIGMASK); | ||
644 | sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); | ||
645 | } | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * notification of userspace execution resumption | ||
650 | * - triggered by the TIF_WORK_MASK flags | ||
651 | */ | ||
652 | __attribute__((regparm(3))) | ||
653 | void do_notify_resume(struct pt_regs *regs, void *_unused, | ||
654 | __u32 thread_info_flags) | ||
655 | { | ||
656 | /* Pending single-step? */ | ||
657 | if (thread_info_flags & _TIF_SINGLESTEP) { | ||
658 | regs->eflags |= TF_MASK; | ||
659 | clear_thread_flag(TIF_SINGLESTEP); | ||
660 | } | ||
661 | |||
662 | /* deal with pending signal delivery */ | ||
663 | if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) | ||
664 | do_signal(regs); | ||
665 | |||
666 | clear_thread_flag(TIF_IRET); | ||
667 | } | ||
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c new file mode 100644 index 000000000000..2d35d8502029 --- /dev/null +++ b/arch/x86/kernel/smp_32.c | |||
@@ -0,0 +1,707 @@ | |||
1 | /* | ||
2 | * Intel SMP support routines. | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * | ||
7 | * This code is released under the GNU General Public License version 2 or | ||
8 | * later. | ||
9 | */ | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | |||
13 | #include <linux/mm.h> | ||
14 | #include <linux/delay.h> | ||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/kernel_stat.h> | ||
17 | #include <linux/mc146818rtc.h> | ||
18 | #include <linux/cache.h> | ||
19 | #include <linux/interrupt.h> | ||
20 | #include <linux/cpu.h> | ||
21 | #include <linux/module.h> | ||
22 | |||
23 | #include <asm/mtrr.h> | ||
24 | #include <asm/tlbflush.h> | ||
25 | #include <asm/mmu_context.h> | ||
26 | #include <mach_apic.h> | ||
27 | |||
28 | /* | ||
29 | * Some notes on x86 processor bugs affecting SMP operation: | ||
30 | * | ||
31 | * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. | ||
32 | * The Linux implications for SMP are handled as follows: | ||
33 | * | ||
34 | * Pentium III / [Xeon] | ||
35 | * None of the E1AP-E3AP errata are visible to the user. | ||
36 | * | ||
37 | * E1AP. see PII A1AP | ||
38 | * E2AP. see PII A2AP | ||
39 | * E3AP. see PII A3AP | ||
40 | * | ||
41 | * Pentium II / [Xeon] | ||
42 | * None of the A1AP-A3AP errata are visible to the user. | ||
43 | * | ||
44 | * A1AP. see PPro 1AP | ||
45 | * A2AP. see PPro 2AP | ||
46 | * A3AP. see PPro 7AP | ||
47 | * | ||
48 | * Pentium Pro | ||
49 | * None of 1AP-9AP errata are visible to the normal user, | ||
50 | * except occasional delivery of 'spurious interrupt' as trap #15. | ||
51 | * This is very rare and a non-problem. | ||
52 | * | ||
53 | * 1AP. Linux maps APIC as non-cacheable | ||
54 | * 2AP. worked around in hardware | ||
55 | * 3AP. fixed in C0 and above steppings microcode update. | ||
56 | * Linux does not use excessive STARTUP_IPIs. | ||
57 | * 4AP. worked around in hardware | ||
58 | * 5AP. symmetric IO mode (normal Linux operation) not affected. | ||
59 | * 'noapic' mode has vector 0xf filled out properly. | ||
60 | * 6AP. 'noapic' mode might be affected - fixed in later steppings | ||
61 | * 7AP. We do not assume writes to the LVT deasserting IRQs | ||
62 | * 8AP. We do not enable low power mode (deep sleep) during MP bootup | ||
63 | * 9AP. We do not use mixed mode | ||
64 | * | ||
65 | * Pentium | ||
66 | * There is a marginal case where REP MOVS on 100MHz SMP | ||
67 | * machines with B stepping processors can fail. XXX should provide | ||
68 | * an L1cache=Writethrough or L1cache=off option. | ||
69 | * | ||
70 | * B stepping CPUs may hang. There are hardware work arounds | ||
71 | * for this. We warn about it in case your board doesn't have the work | ||
72 | * arounds. Basically that's so I can tell anyone with a B stepping | ||
73 | * CPU and SMP problems "tough". | ||
74 | * | ||
75 | * Specific items [From Pentium Processor Specification Update] | ||
76 | * | ||
77 | * 1AP. Linux doesn't use remote read | ||
78 | * 2AP. Linux doesn't trust APIC errors | ||
79 | * 3AP. We work around this | ||
80 | * 4AP. Linux never generated 3 interrupts of the same priority | ||
81 | * to cause a lost local interrupt. | ||
82 | * 5AP. Remote read is never used | ||
83 | * 6AP. not affected - worked around in hardware | ||
84 | * 7AP. not affected - worked around in hardware | ||
85 | * 8AP. worked around in hardware - we get explicit CS errors if not | ||
86 | * 9AP. only 'noapic' mode affected. Might generate spurious | ||
87 | * interrupts, we log only the first one and count the | ||
88 | * rest silently. | ||
89 | * 10AP. not affected - worked around in hardware | ||
90 | * 11AP. Linux reads the APIC between writes to avoid this, as per | ||
91 | * the documentation. Make sure you preserve this as it affects | ||
92 | * the C stepping chips too. | ||
93 | * 12AP. not affected - worked around in hardware | ||
94 | * 13AP. not affected - worked around in hardware | ||
95 | * 14AP. we always deassert INIT during bootup | ||
96 | * 15AP. not affected - worked around in hardware | ||
97 | * 16AP. not affected - worked around in hardware | ||
98 | * 17AP. not affected - worked around in hardware | ||
99 | * 18AP. not affected - worked around in hardware | ||
100 | * 19AP. not affected - worked around in BIOS | ||
101 | * | ||
102 | * If this sounds worrying believe me these bugs are either ___RARE___, | ||
103 | * or are signal timing bugs worked around in hardware and there's | ||
104 | * about nothing of note with C stepping upwards. | ||
105 | */ | ||
106 | |||
107 | DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, }; | ||
108 | |||
109 | /* | ||
110 | * the following functions deal with sending IPIs between CPUs. | ||
111 | * | ||
112 | * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. | ||
113 | */ | ||
114 | |||
115 | static inline int __prepare_ICR (unsigned int shortcut, int vector) | ||
116 | { | ||
117 | unsigned int icr = shortcut | APIC_DEST_LOGICAL; | ||
118 | |||
119 | switch (vector) { | ||
120 | default: | ||
121 | icr |= APIC_DM_FIXED | vector; | ||
122 | break; | ||
123 | case NMI_VECTOR: | ||
124 | icr |= APIC_DM_NMI; | ||
125 | break; | ||
126 | } | ||
127 | return icr; | ||
128 | } | ||
129 | |||
/*
 * Compose the value written to APIC_ICR2: @mask placed in the destination
 * field by SET_APIC_DEST_FIELD().
 *
 * The destination field sits in the top bits of the register, so the
 * result is returned unsigned; returning it as int would make the
 * conversion implementation-defined whenever the top bit is set.
 */
static inline unsigned int __prepare_ICR2(unsigned int mask)
{
	return SET_APIC_DEST_FIELD(mask);
}
134 | |||
135 | void __send_IPI_shortcut(unsigned int shortcut, int vector) | ||
136 | { | ||
137 | /* | ||
138 | * Subtle. In the case of the 'never do double writes' workaround | ||
139 | * we have to lock out interrupts to be safe. As we don't care | ||
140 | * of the value read we use an atomic rmw access to avoid costly | ||
141 | * cli/sti. Otherwise we use an even cheaper single atomic write | ||
142 | * to the APIC. | ||
143 | */ | ||
144 | unsigned int cfg; | ||
145 | |||
146 | /* | ||
147 | * Wait for idle. | ||
148 | */ | ||
149 | apic_wait_icr_idle(); | ||
150 | |||
151 | /* | ||
152 | * No need to touch the target chip field | ||
153 | */ | ||
154 | cfg = __prepare_ICR(shortcut, vector); | ||
155 | |||
156 | /* | ||
157 | * Send the IPI. The write to APIC_ICR fires this off. | ||
158 | */ | ||
159 | apic_write_around(APIC_ICR, cfg); | ||
160 | } | ||
161 | |||
162 | void fastcall send_IPI_self(int vector) | ||
163 | { | ||
164 | __send_IPI_shortcut(APIC_DEST_SELF, vector); | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * This is used to send an IPI with no shorthand notation (the destination is | ||
169 | * specified in bits 56 to 63 of the ICR). | ||
170 | */ | ||
171 | static inline void __send_IPI_dest_field(unsigned long mask, int vector) | ||
172 | { | ||
173 | unsigned long cfg; | ||
174 | |||
175 | /* | ||
176 | * Wait for idle. | ||
177 | */ | ||
178 | if (unlikely(vector == NMI_VECTOR)) | ||
179 | safe_apic_wait_icr_idle(); | ||
180 | else | ||
181 | apic_wait_icr_idle(); | ||
182 | |||
183 | /* | ||
184 | * prepare target chip field | ||
185 | */ | ||
186 | cfg = __prepare_ICR2(mask); | ||
187 | apic_write_around(APIC_ICR2, cfg); | ||
188 | |||
189 | /* | ||
190 | * program the ICR | ||
191 | */ | ||
192 | cfg = __prepare_ICR(0, vector); | ||
193 | |||
194 | /* | ||
195 | * Send the IPI. The write to APIC_ICR fires this off. | ||
196 | */ | ||
197 | apic_write_around(APIC_ICR, cfg); | ||
198 | } | ||
199 | |||
200 | /* | ||
201 | * This is only used on smaller machines. | ||
202 | */ | ||
203 | void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) | ||
204 | { | ||
205 | unsigned long mask = cpus_addr(cpumask)[0]; | ||
206 | unsigned long flags; | ||
207 | |||
208 | local_irq_save(flags); | ||
209 | WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); | ||
210 | __send_IPI_dest_field(mask, vector); | ||
211 | local_irq_restore(flags); | ||
212 | } | ||
213 | |||
214 | void send_IPI_mask_sequence(cpumask_t mask, int vector) | ||
215 | { | ||
216 | unsigned long flags; | ||
217 | unsigned int query_cpu; | ||
218 | |||
219 | /* | ||
220 | * Hack. The clustered APIC addressing mode doesn't allow us to send | ||
221 | * to an arbitrary mask, so I do a unicasts to each CPU instead. This | ||
222 | * should be modified to do 1 message per cluster ID - mbligh | ||
223 | */ | ||
224 | |||
225 | local_irq_save(flags); | ||
226 | for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { | ||
227 | if (cpu_isset(query_cpu, mask)) { | ||
228 | __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), | ||
229 | vector); | ||
230 | } | ||
231 | } | ||
232 | local_irq_restore(flags); | ||
233 | } | ||
234 | |||
235 | #include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */ | ||
236 | |||
237 | /* | ||
238 | * Smarter SMP flushing macros. | ||
239 | * c/o Linus Torvalds. | ||
240 | * | ||
241 | * These mean you can really definitely utterly forget about | ||
242 | * writing to user space from interrupts. (It's not allowed anyway). | ||
243 | * | ||
244 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | ||
245 | */ | ||
246 | |||
247 | static cpumask_t flush_cpumask; | ||
248 | static struct mm_struct * flush_mm; | ||
249 | static unsigned long flush_va; | ||
250 | static DEFINE_SPINLOCK(tlbstate_lock); | ||
251 | |||
252 | /* | ||
253 | * We cannot call mmdrop() because we are in interrupt context, | ||
254 | * instead update mm->cpu_vm_mask. | ||
255 | * | ||
256 | * We need to reload %cr3 since the page tables may be going | ||
257 | * away from under us.. | ||
258 | */ | ||
259 | void leave_mm(unsigned long cpu) | ||
260 | { | ||
261 | if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) | ||
262 | BUG(); | ||
263 | cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); | ||
264 | load_cr3(swapper_pg_dir); | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * | ||
269 | * The flush IPI assumes that a thread switch happens in this order: | ||
270 | * [cpu0: the cpu that switches] | ||
271 | * 1) switch_mm() either 1a) or 1b) | ||
272 | * 1a) thread switch to a different mm | ||
273 | * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | ||
274 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
275 | * the other cpus, but smp_invalidate_interrupt ignores flush ipis | ||
276 | * for the wrong mm, and in the worst case we perform a superfluous | ||
277 | * tlb flush. | ||
278 | * 1a2) set cpu_tlbstate to TLBSTATE_OK | ||
279 | * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | ||
280 | * was in lazy tlb mode. | ||
281 | * 1a3) update cpu_tlbstate[].active_mm | ||
282 | * Now cpu0 accepts tlb flushes for the new mm. | ||
283 | * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | ||
284 | * Now the other cpus will send tlb flush ipis. | ||
285 | * 1a5) change cr3. | ||
286 | * 1b) thread switch without mm change | ||
287 | * cpu_tlbstate[].active_mm is correct, cpu0 already handles | ||
288 | * flush ipis. | ||
289 | * 1b1) set cpu_tlbstate to TLBSTATE_OK | ||
290 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
291 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
292 | * and test the bit. | ||
293 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
294 | * 2) switch %%esp, ie current | ||
295 | * | ||
296 | * The interrupt must handle 2 special cases: | ||
297 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
298 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
299 | * runs in kernel space, the cpu could load tlb entries for user space | ||
300 | * pages. | ||
301 | * | ||
302 | * The good news is that cpu_tlbstate is local to each cpu, no | ||
303 | * write/read ordering problems. | ||
304 | */ | ||
305 | |||
306 | /* | ||
307 | * TLB flush IPI: | ||
308 | * | ||
309 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
310 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
311 | */ | ||
312 | |||
313 | fastcall void smp_invalidate_interrupt(struct pt_regs *regs) | ||
314 | { | ||
315 | unsigned long cpu; | ||
316 | |||
317 | cpu = get_cpu(); | ||
318 | |||
319 | if (!cpu_isset(cpu, flush_cpumask)) | ||
320 | goto out; | ||
321 | /* | ||
322 | * This was a BUG() but until someone can quote me the | ||
323 | * line from the intel manual that guarantees an IPI to | ||
324 | * multiple CPUs is retried _only_ on the erroring CPUs | ||
325 | * its staying as a return | ||
326 | * | ||
327 | * BUG(); | ||
328 | */ | ||
329 | |||
330 | if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { | ||
331 | if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { | ||
332 | if (flush_va == TLB_FLUSH_ALL) | ||
333 | local_flush_tlb(); | ||
334 | else | ||
335 | __flush_tlb_one(flush_va); | ||
336 | } else | ||
337 | leave_mm(cpu); | ||
338 | } | ||
339 | ack_APIC_irq(); | ||
340 | smp_mb__before_clear_bit(); | ||
341 | cpu_clear(cpu, flush_cpumask); | ||
342 | smp_mb__after_clear_bit(); | ||
343 | out: | ||
344 | put_cpu_no_resched(); | ||
345 | } | ||
346 | |||
347 | void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, | ||
348 | unsigned long va) | ||
349 | { | ||
350 | cpumask_t cpumask = *cpumaskp; | ||
351 | |||
352 | /* | ||
353 | * A couple of (to be removed) sanity checks: | ||
354 | * | ||
355 | * - current CPU must not be in mask | ||
356 | * - mask must exist :) | ||
357 | */ | ||
358 | BUG_ON(cpus_empty(cpumask)); | ||
359 | BUG_ON(cpu_isset(smp_processor_id(), cpumask)); | ||
360 | BUG_ON(!mm); | ||
361 | |||
362 | #ifdef CONFIG_HOTPLUG_CPU | ||
363 | /* If a CPU which we ran on has gone down, OK. */ | ||
364 | cpus_and(cpumask, cpumask, cpu_online_map); | ||
365 | if (unlikely(cpus_empty(cpumask))) | ||
366 | return; | ||
367 | #endif | ||
368 | |||
369 | /* | ||
370 | * i'm not happy about this global shared spinlock in the | ||
371 | * MM hot path, but we'll see how contended it is. | ||
372 | * AK: x86-64 has a faster method that could be ported. | ||
373 | */ | ||
374 | spin_lock(&tlbstate_lock); | ||
375 | |||
376 | flush_mm = mm; | ||
377 | flush_va = va; | ||
378 | cpus_or(flush_cpumask, cpumask, flush_cpumask); | ||
379 | /* | ||
380 | * We have to send the IPI only to | ||
381 | * CPUs affected. | ||
382 | */ | ||
383 | send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); | ||
384 | |||
385 | while (!cpus_empty(flush_cpumask)) | ||
386 | /* nothing. lockup detection does not belong here */ | ||
387 | cpu_relax(); | ||
388 | |||
389 | flush_mm = NULL; | ||
390 | flush_va = 0; | ||
391 | spin_unlock(&tlbstate_lock); | ||
392 | } | ||
393 | |||
394 | void flush_tlb_current_task(void) | ||
395 | { | ||
396 | struct mm_struct *mm = current->mm; | ||
397 | cpumask_t cpu_mask; | ||
398 | |||
399 | preempt_disable(); | ||
400 | cpu_mask = mm->cpu_vm_mask; | ||
401 | cpu_clear(smp_processor_id(), cpu_mask); | ||
402 | |||
403 | local_flush_tlb(); | ||
404 | if (!cpus_empty(cpu_mask)) | ||
405 | flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); | ||
406 | preempt_enable(); | ||
407 | } | ||
408 | |||
409 | void flush_tlb_mm (struct mm_struct * mm) | ||
410 | { | ||
411 | cpumask_t cpu_mask; | ||
412 | |||
413 | preempt_disable(); | ||
414 | cpu_mask = mm->cpu_vm_mask; | ||
415 | cpu_clear(smp_processor_id(), cpu_mask); | ||
416 | |||
417 | if (current->active_mm == mm) { | ||
418 | if (current->mm) | ||
419 | local_flush_tlb(); | ||
420 | else | ||
421 | leave_mm(smp_processor_id()); | ||
422 | } | ||
423 | if (!cpus_empty(cpu_mask)) | ||
424 | flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); | ||
425 | |||
426 | preempt_enable(); | ||
427 | } | ||
428 | |||
429 | void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | ||
430 | { | ||
431 | struct mm_struct *mm = vma->vm_mm; | ||
432 | cpumask_t cpu_mask; | ||
433 | |||
434 | preempt_disable(); | ||
435 | cpu_mask = mm->cpu_vm_mask; | ||
436 | cpu_clear(smp_processor_id(), cpu_mask); | ||
437 | |||
438 | if (current->active_mm == mm) { | ||
439 | if(current->mm) | ||
440 | __flush_tlb_one(va); | ||
441 | else | ||
442 | leave_mm(smp_processor_id()); | ||
443 | } | ||
444 | |||
445 | if (!cpus_empty(cpu_mask)) | ||
446 | flush_tlb_others(cpu_mask, mm, va); | ||
447 | |||
448 | preempt_enable(); | ||
449 | } | ||
450 | EXPORT_SYMBOL(flush_tlb_page); | ||
451 | |||
452 | static void do_flush_tlb_all(void* info) | ||
453 | { | ||
454 | unsigned long cpu = smp_processor_id(); | ||
455 | |||
456 | __flush_tlb_all(); | ||
457 | if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) | ||
458 | leave_mm(cpu); | ||
459 | } | ||
460 | |||
461 | void flush_tlb_all(void) | ||
462 | { | ||
463 | on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * this function sends a 'reschedule' IPI to another CPU. | ||
468 | * it goes straight through and wastes no time serializing | ||
469 | * anything. Worst case is that we lose a reschedule ... | ||
470 | */ | ||
471 | static void native_smp_send_reschedule(int cpu) | ||
472 | { | ||
473 | WARN_ON(cpu_is_offline(cpu)); | ||
474 | send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | ||
475 | } | ||
476 | |||
477 | /* | ||
478 | * Structure and data for smp_call_function(). This is designed to minimise | ||
479 | * static memory requirements. It also looks cleaner. | ||
480 | */ | ||
481 | static DEFINE_SPINLOCK(call_lock); | ||
482 | |||
483 | struct call_data_struct { | ||
484 | void (*func) (void *info); | ||
485 | void *info; | ||
486 | atomic_t started; | ||
487 | atomic_t finished; | ||
488 | int wait; | ||
489 | }; | ||
490 | |||
491 | void lock_ipi_call_lock(void) | ||
492 | { | ||
493 | spin_lock_irq(&call_lock); | ||
494 | } | ||
495 | |||
496 | void unlock_ipi_call_lock(void) | ||
497 | { | ||
498 | spin_unlock_irq(&call_lock); | ||
499 | } | ||
500 | |||
501 | static struct call_data_struct *call_data; | ||
502 | |||
503 | static void __smp_call_function(void (*func) (void *info), void *info, | ||
504 | int nonatomic, int wait) | ||
505 | { | ||
506 | struct call_data_struct data; | ||
507 | int cpus = num_online_cpus() - 1; | ||
508 | |||
509 | if (!cpus) | ||
510 | return; | ||
511 | |||
512 | data.func = func; | ||
513 | data.info = info; | ||
514 | atomic_set(&data.started, 0); | ||
515 | data.wait = wait; | ||
516 | if (wait) | ||
517 | atomic_set(&data.finished, 0); | ||
518 | |||
519 | call_data = &data; | ||
520 | mb(); | ||
521 | |||
522 | /* Send a message to all other CPUs and wait for them to respond */ | ||
523 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | ||
524 | |||
525 | /* Wait for response */ | ||
526 | while (atomic_read(&data.started) != cpus) | ||
527 | cpu_relax(); | ||
528 | |||
529 | if (wait) | ||
530 | while (atomic_read(&data.finished) != cpus) | ||
531 | cpu_relax(); | ||
532 | } | ||
533 | |||
534 | |||
535 | /** | ||
536 | * smp_call_function_mask(): Run a function on a set of other CPUs. | ||
537 | * @mask: The set of cpus to run on. Must not include the current cpu. | ||
538 | * @func: The function to run. This must be fast and non-blocking. | ||
539 | * @info: An arbitrary pointer to pass to the function. | ||
540 | * @wait: If true, wait (atomically) until function has completed on other CPUs. | ||
541 | * | ||
542 | * Returns 0 on success, else a negative status code. | ||
543 | * | ||
544 | * If @wait is true, then returns once @func has returned; otherwise | ||
545 | * it returns just before the target cpu calls @func. | ||
546 | * | ||
547 | * You must not call this function with disabled interrupts or from a | ||
548 | * hardware interrupt handler or from a bottom half handler. | ||
549 | */ | ||
550 | static int | ||
551 | native_smp_call_function_mask(cpumask_t mask, | ||
552 | void (*func)(void *), void *info, | ||
553 | int wait) | ||
554 | { | ||
555 | struct call_data_struct data; | ||
556 | cpumask_t allbutself; | ||
557 | int cpus; | ||
558 | |||
559 | /* Can deadlock when called with interrupts disabled */ | ||
560 | WARN_ON(irqs_disabled()); | ||
561 | |||
562 | /* Holding any lock stops cpus from going down. */ | ||
563 | spin_lock(&call_lock); | ||
564 | |||
565 | allbutself = cpu_online_map; | ||
566 | cpu_clear(smp_processor_id(), allbutself); | ||
567 | |||
568 | cpus_and(mask, mask, allbutself); | ||
569 | cpus = cpus_weight(mask); | ||
570 | |||
571 | if (!cpus) { | ||
572 | spin_unlock(&call_lock); | ||
573 | return 0; | ||
574 | } | ||
575 | |||
576 | data.func = func; | ||
577 | data.info = info; | ||
578 | atomic_set(&data.started, 0); | ||
579 | data.wait = wait; | ||
580 | if (wait) | ||
581 | atomic_set(&data.finished, 0); | ||
582 | |||
583 | call_data = &data; | ||
584 | mb(); | ||
585 | |||
586 | /* Send a message to other CPUs */ | ||
587 | if (cpus_equal(mask, allbutself)) | ||
588 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | ||
589 | else | ||
590 | send_IPI_mask(mask, CALL_FUNCTION_VECTOR); | ||
591 | |||
592 | /* Wait for response */ | ||
593 | while (atomic_read(&data.started) != cpus) | ||
594 | cpu_relax(); | ||
595 | |||
596 | if (wait) | ||
597 | while (atomic_read(&data.finished) != cpus) | ||
598 | cpu_relax(); | ||
599 | spin_unlock(&call_lock); | ||
600 | |||
601 | return 0; | ||
602 | } | ||
603 | |||
604 | static void stop_this_cpu (void * dummy) | ||
605 | { | ||
606 | local_irq_disable(); | ||
607 | /* | ||
608 | * Remove this CPU: | ||
609 | */ | ||
610 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
611 | disable_local_APIC(); | ||
612 | if (cpu_data[smp_processor_id()].hlt_works_ok) | ||
613 | for(;;) halt(); | ||
614 | for (;;); | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * this function calls the 'stop' function on all other CPUs in the system. | ||
619 | */ | ||
620 | |||
621 | static void native_smp_send_stop(void) | ||
622 | { | ||
623 | /* Don't deadlock on the call lock in panic */ | ||
624 | int nolock = !spin_trylock(&call_lock); | ||
625 | unsigned long flags; | ||
626 | |||
627 | local_irq_save(flags); | ||
628 | __smp_call_function(stop_this_cpu, NULL, 0, 0); | ||
629 | if (!nolock) | ||
630 | spin_unlock(&call_lock); | ||
631 | disable_local_APIC(); | ||
632 | local_irq_restore(flags); | ||
633 | } | ||
634 | |||
635 | /* | ||
636 | * Reschedule call back. Nothing to do, | ||
637 | * all the work is done automatically when | ||
638 | * we return from the interrupt. | ||
639 | */ | ||
640 | fastcall void smp_reschedule_interrupt(struct pt_regs *regs) | ||
641 | { | ||
642 | ack_APIC_irq(); | ||
643 | } | ||
644 | |||
645 | fastcall void smp_call_function_interrupt(struct pt_regs *regs) | ||
646 | { | ||
647 | void (*func) (void *info) = call_data->func; | ||
648 | void *info = call_data->info; | ||
649 | int wait = call_data->wait; | ||
650 | |||
651 | ack_APIC_irq(); | ||
652 | /* | ||
653 | * Notify initiating CPU that I've grabbed the data and am | ||
654 | * about to execute the function | ||
655 | */ | ||
656 | mb(); | ||
657 | atomic_inc(&call_data->started); | ||
658 | /* | ||
659 | * At this point the info structure may be out of scope unless wait==1 | ||
660 | */ | ||
661 | irq_enter(); | ||
662 | (*func)(info); | ||
663 | irq_exit(); | ||
664 | |||
665 | if (wait) { | ||
666 | mb(); | ||
667 | atomic_inc(&call_data->finished); | ||
668 | } | ||
669 | } | ||
670 | |||
671 | static int convert_apicid_to_cpu(int apic_id) | ||
672 | { | ||
673 | int i; | ||
674 | |||
675 | for (i = 0; i < NR_CPUS; i++) { | ||
676 | if (x86_cpu_to_apicid[i] == apic_id) | ||
677 | return i; | ||
678 | } | ||
679 | return -1; | ||
680 | } | ||
681 | |||
682 | int safe_smp_processor_id(void) | ||
683 | { | ||
684 | int apicid, cpuid; | ||
685 | |||
686 | if (!boot_cpu_has(X86_FEATURE_APIC)) | ||
687 | return 0; | ||
688 | |||
689 | apicid = hard_smp_processor_id(); | ||
690 | if (apicid == BAD_APICID) | ||
691 | return 0; | ||
692 | |||
693 | cpuid = convert_apicid_to_cpu(apicid); | ||
694 | |||
695 | return cpuid >= 0 ? cpuid : 0; | ||
696 | } | ||
697 | |||
698 | struct smp_ops smp_ops = { | ||
699 | .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, | ||
700 | .smp_prepare_cpus = native_smp_prepare_cpus, | ||
701 | .cpu_up = native_cpu_up, | ||
702 | .smp_cpus_done = native_smp_cpus_done, | ||
703 | |||
704 | .smp_send_stop = native_smp_send_stop, | ||
705 | .smp_send_reschedule = native_smp_send_reschedule, | ||
706 | .smp_call_function_mask = native_smp_call_function_mask, | ||
707 | }; | ||
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c new file mode 100644 index 000000000000..e4f61d1c6248 --- /dev/null +++ b/arch/x86/kernel/smpboot_32.c | |||
@@ -0,0 +1,1322 @@ | |||
1 | /* | ||
2 | * x86 SMP booting functions | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * | ||
7 | * Much of the core SMP work is based on previous work by Thomas Radke, to | ||
8 | * whom a great many thanks are extended. | ||
9 | * | ||
10 | * Thanks to Intel for making available several different Pentium, | ||
11 | * Pentium Pro and Pentium-II/Xeon MP machines. | ||
12 | * Original development of Linux SMP code supported by Caldera. | ||
13 | * | ||
14 | * This code is released under the GNU General Public License version 2 or | ||
15 | * later. | ||
16 | * | ||
17 | * Fixes | ||
18 | * Felix Koop : NR_CPUS used properly | ||
19 | * Jose Renau : Handle single CPU case. | ||
20 | * Alan Cox : By repeated request 8) - Total BogoMIPS report. | ||
21 | * Greg Wright : Fix for kernel stacks panic. | ||
22 | * Erich Boleyn : MP v1.4 and additional changes. | ||
23 | * Matthias Sattler : Changes for 2.1 kernel map. | ||
24 | * Michel Lespinasse : Changes for 2.1 kernel map. | ||
25 | * Michael Chastain : Change trampoline.S to gnu as. | ||
26 | * Alan Cox : Dumb bug: 'B' step PPro's are fine | ||
27 | * Ingo Molnar : Added APIC timers, based on code | ||
28 | * from Jose Renau | ||
29 | * Ingo Molnar : various cleanups and rewrites | ||
30 | * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. | ||
31 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs | ||
32 | * Martin J. Bligh : Added support for multi-quad systems | ||
33 | * Dave Jones : Report invalid combinations of Athlon CPUs. | ||
34 | * Rusty Russell : Hacked into shape for new "hotplug" boot process. */ | ||
35 | |||
36 | #include <linux/module.h> | ||
37 | #include <linux/init.h> | ||
38 | #include <linux/kernel.h> | ||
39 | |||
40 | #include <linux/mm.h> | ||
41 | #include <linux/sched.h> | ||
42 | #include <linux/kernel_stat.h> | ||
43 | #include <linux/bootmem.h> | ||
44 | #include <linux/notifier.h> | ||
45 | #include <linux/cpu.h> | ||
46 | #include <linux/percpu.h> | ||
47 | #include <linux/nmi.h> | ||
48 | |||
49 | #include <linux/delay.h> | ||
50 | #include <linux/mc146818rtc.h> | ||
51 | #include <asm/tlbflush.h> | ||
52 | #include <asm/desc.h> | ||
53 | #include <asm/arch_hooks.h> | ||
54 | #include <asm/nmi.h> | ||
55 | |||
56 | #include <mach_apic.h> | ||
57 | #include <mach_wakecpu.h> | ||
58 | #include <smpboot_hooks.h> | ||
59 | #include <asm/vmi.h> | ||
60 | #include <asm/mtrr.h> | ||
61 | |||
62 | /* Set if we find a B stepping CPU */ | ||
63 | static int __devinitdata smp_b_stepping; | ||
64 | |||
65 | /* Number of siblings per CPU package */ | ||
66 | int smp_num_siblings = 1; | ||
67 | EXPORT_SYMBOL(smp_num_siblings); | ||
68 | |||
69 | /* Last level cache ID of each logical CPU */ | ||
70 | int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; | ||
71 | |||
72 | /* representing HT siblings of each logical CPU */ | ||
73 | cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; | ||
74 | EXPORT_SYMBOL(cpu_sibling_map); | ||
75 | |||
76 | /* representing HT and core siblings of each logical CPU */ | ||
77 | cpumask_t cpu_core_map[NR_CPUS] __read_mostly; | ||
78 | EXPORT_SYMBOL(cpu_core_map); | ||
79 | |||
80 | /* bitmap of online cpus */ | ||
81 | cpumask_t cpu_online_map __read_mostly; | ||
82 | EXPORT_SYMBOL(cpu_online_map); | ||
83 | |||
84 | cpumask_t cpu_callin_map; | ||
85 | cpumask_t cpu_callout_map; | ||
86 | EXPORT_SYMBOL(cpu_callout_map); | ||
87 | cpumask_t cpu_possible_map; | ||
88 | EXPORT_SYMBOL(cpu_possible_map); | ||
89 | static cpumask_t smp_commenced_mask; | ||
90 | |||
91 | /* Per CPU bogomips and other parameters */ | ||
92 | struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; | ||
93 | EXPORT_SYMBOL(cpu_data); | ||
94 | |||
95 | u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = | ||
96 | { [0 ... NR_CPUS-1] = 0xff }; | ||
97 | EXPORT_SYMBOL(x86_cpu_to_apicid); | ||
98 | |||
99 | u8 apicid_2_node[MAX_APICID]; | ||
100 | |||
101 | /* | ||
102 | * Trampoline 80x86 program as an array. | ||
103 | */ | ||
104 | |||
105 | extern unsigned char trampoline_data []; | ||
106 | extern unsigned char trampoline_end []; | ||
107 | static unsigned char *trampoline_base; | ||
108 | static int trampoline_exec; | ||
109 | |||
110 | static void map_cpu_to_logical_apicid(void); | ||
111 | |||
112 | /* State of each CPU. */ | ||
113 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | ||
114 | |||
115 | /* | ||
116 | * Currently trivial. Write the real->protected mode | ||
117 | * bootstrap into the page concerned. The caller | ||
118 | * has made sure it's suitably aligned. | ||
119 | */ | ||
120 | |||
121 | static unsigned long __devinit setup_trampoline(void) | ||
122 | { | ||
123 | memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); | ||
124 | return virt_to_phys(trampoline_base); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * We are called very early to get the low memory for the | ||
129 | * SMP bootup trampoline page. | ||
130 | */ | ||
131 | void __init smp_alloc_memory(void) | ||
132 | { | ||
133 | trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); | ||
134 | /* | ||
135 | * Has to be in very low memory so we can execute | ||
136 | * real-mode AP code. | ||
137 | */ | ||
138 | if (__pa(trampoline_base) >= 0x9F000) | ||
139 | BUG(); | ||
140 | /* | ||
141 | * Make the SMP trampoline executable: | ||
142 | */ | ||
143 | trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); | ||
144 | } | ||
145 | |||
146 | /* | ||
147 | * The bootstrap kernel entry code has set these up. Save them for | ||
148 | * a given CPU | ||
149 | */ | ||
150 | |||
151 | void __cpuinit smp_store_cpu_info(int id) | ||
152 | { | ||
153 | struct cpuinfo_x86 *c = cpu_data + id; | ||
154 | |||
155 | *c = boot_cpu_data; | ||
156 | if (id!=0) | ||
157 | identify_secondary_cpu(c); | ||
158 | /* | ||
159 | * Mask B, Pentium, but not Pentium MMX | ||
160 | */ | ||
161 | if (c->x86_vendor == X86_VENDOR_INTEL && | ||
162 | c->x86 == 5 && | ||
163 | c->x86_mask >= 1 && c->x86_mask <= 4 && | ||
164 | c->x86_model <= 3) | ||
165 | /* | ||
166 | * Remember we have B step Pentia with bugs | ||
167 | */ | ||
168 | smp_b_stepping = 1; | ||
169 | |||
170 | /* | ||
171 | * Certain Athlons might work (for various values of 'work') in SMP | ||
172 | * but they are not certified as MP capable. | ||
173 | */ | ||
174 | if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { | ||
175 | |||
176 | if (num_possible_cpus() == 1) | ||
177 | goto valid_k7; | ||
178 | |||
179 | /* Athlon 660/661 is valid. */ | ||
180 | if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) | ||
181 | goto valid_k7; | ||
182 | |||
183 | /* Duron 670 is valid */ | ||
184 | if ((c->x86_model==7) && (c->x86_mask==0)) | ||
185 | goto valid_k7; | ||
186 | |||
187 | /* | ||
188 | * Athlon 662, Duron 671, and Athlon >model 7 have capability bit. | ||
189 | * It's worth noting that the A5 stepping (662) of some Athlon XP's | ||
190 | * have the MP bit set. | ||
191 | * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more. | ||
192 | */ | ||
193 | if (((c->x86_model==6) && (c->x86_mask>=2)) || | ||
194 | ((c->x86_model==7) && (c->x86_mask>=1)) || | ||
195 | (c->x86_model> 7)) | ||
196 | if (cpu_has_mp) | ||
197 | goto valid_k7; | ||
198 | |||
199 | /* If we get here, it's not a certified SMP capable AMD system. */ | ||
200 | add_taint(TAINT_UNSAFE_SMP); | ||
201 | } | ||
202 | |||
203 | valid_k7: | ||
204 | ; | ||
205 | } | ||
206 | |||
207 | extern void calibrate_delay(void); | ||
208 | |||
209 | static atomic_t init_deasserted; | ||
210 | |||
211 | static void __cpuinit smp_callin(void) | ||
212 | { | ||
213 | int cpuid, phys_id; | ||
214 | unsigned long timeout; | ||
215 | |||
216 | /* | ||
217 | * If waken up by an INIT in an 82489DX configuration | ||
218 | * we may get here before an INIT-deassert IPI reaches | ||
219 | * our local APIC. We have to wait for the IPI or we'll | ||
220 | * lock up on an APIC access. | ||
221 | */ | ||
222 | wait_for_init_deassert(&init_deasserted); | ||
223 | |||
224 | /* | ||
225 | * (This works even if the APIC is not enabled.) | ||
226 | */ | ||
227 | phys_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
228 | cpuid = smp_processor_id(); | ||
229 | if (cpu_isset(cpuid, cpu_callin_map)) { | ||
230 | printk("huh, phys CPU#%d, CPU#%d already present??\n", | ||
231 | phys_id, cpuid); | ||
232 | BUG(); | ||
233 | } | ||
234 | Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); | ||
235 | |||
236 | /* | ||
237 | * STARTUP IPIs are fragile beasts as they might sometimes | ||
238 | * trigger some glue motherboard logic. Complete APIC bus | ||
239 | * silence for 1 second, this overestimates the time the | ||
240 | * boot CPU is spending to send the up to 2 STARTUP IPIs | ||
241 | * by a factor of two. This should be enough. | ||
242 | */ | ||
243 | |||
244 | /* | ||
245 | * Waiting 2s total for startup (udelay is not yet working) | ||
246 | */ | ||
247 | timeout = jiffies + 2*HZ; | ||
248 | while (time_before(jiffies, timeout)) { | ||
249 | /* | ||
250 | * Has the boot CPU finished it's STARTUP sequence? | ||
251 | */ | ||
252 | if (cpu_isset(cpuid, cpu_callout_map)) | ||
253 | break; | ||
254 | rep_nop(); | ||
255 | } | ||
256 | |||
257 | if (!time_before(jiffies, timeout)) { | ||
258 | printk("BUG: CPU%d started up but did not get a callout!\n", | ||
259 | cpuid); | ||
260 | BUG(); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * the boot CPU has finished the init stage and is spinning | ||
265 | * on callin_map until we finish. We are free to set up this | ||
266 | * CPU, first the APIC. (this is probably redundant on most | ||
267 | * boards) | ||
268 | */ | ||
269 | |||
270 | Dprintk("CALLIN, before setup_local_APIC().\n"); | ||
271 | smp_callin_clear_local_apic(); | ||
272 | setup_local_APIC(); | ||
273 | map_cpu_to_logical_apicid(); | ||
274 | |||
275 | /* | ||
276 | * Get our bogomips. | ||
277 | */ | ||
278 | calibrate_delay(); | ||
279 | Dprintk("Stack at about %p\n",&cpuid); | ||
280 | |||
281 | /* | ||
282 | * Save our processor parameters | ||
283 | */ | ||
284 | smp_store_cpu_info(cpuid); | ||
285 | |||
286 | /* | ||
287 | * Allow the master to continue. | ||
288 | */ | ||
289 | cpu_set(cpuid, cpu_callin_map); | ||
290 | } | ||
291 | |||
292 | static int cpucount; | ||
293 | |||
294 | /* maps the cpu to the sched domain representing multi-core */ | ||
295 | cpumask_t cpu_coregroup_map(int cpu) | ||
296 | { | ||
297 | struct cpuinfo_x86 *c = cpu_data + cpu; | ||
298 | /* | ||
299 | * For perf, we return last level cache shared map. | ||
300 | * And for power savings, we return cpu_core_map | ||
301 | */ | ||
302 | if (sched_mc_power_savings || sched_smt_power_savings) | ||
303 | return cpu_core_map[cpu]; | ||
304 | else | ||
305 | return c->llc_shared_map; | ||
306 | } | ||
307 | |||
308 | /* representing cpus for which sibling maps can be computed */ | ||
309 | static cpumask_t cpu_sibling_setup_map; | ||
310 | |||
311 | void __cpuinit set_cpu_sibling_map(int cpu) | ||
312 | { | ||
313 | int i; | ||
314 | struct cpuinfo_x86 *c = cpu_data; | ||
315 | |||
316 | cpu_set(cpu, cpu_sibling_setup_map); | ||
317 | |||
318 | if (smp_num_siblings > 1) { | ||
319 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | ||
320 | if (c[cpu].phys_proc_id == c[i].phys_proc_id && | ||
321 | c[cpu].cpu_core_id == c[i].cpu_core_id) { | ||
322 | cpu_set(i, cpu_sibling_map[cpu]); | ||
323 | cpu_set(cpu, cpu_sibling_map[i]); | ||
324 | cpu_set(i, cpu_core_map[cpu]); | ||
325 | cpu_set(cpu, cpu_core_map[i]); | ||
326 | cpu_set(i, c[cpu].llc_shared_map); | ||
327 | cpu_set(cpu, c[i].llc_shared_map); | ||
328 | } | ||
329 | } | ||
330 | } else { | ||
331 | cpu_set(cpu, cpu_sibling_map[cpu]); | ||
332 | } | ||
333 | |||
334 | cpu_set(cpu, c[cpu].llc_shared_map); | ||
335 | |||
336 | if (current_cpu_data.x86_max_cores == 1) { | ||
337 | cpu_core_map[cpu] = cpu_sibling_map[cpu]; | ||
338 | c[cpu].booted_cores = 1; | ||
339 | return; | ||
340 | } | ||
341 | |||
342 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | ||
343 | if (cpu_llc_id[cpu] != BAD_APICID && | ||
344 | cpu_llc_id[cpu] == cpu_llc_id[i]) { | ||
345 | cpu_set(i, c[cpu].llc_shared_map); | ||
346 | cpu_set(cpu, c[i].llc_shared_map); | ||
347 | } | ||
348 | if (c[cpu].phys_proc_id == c[i].phys_proc_id) { | ||
349 | cpu_set(i, cpu_core_map[cpu]); | ||
350 | cpu_set(cpu, cpu_core_map[i]); | ||
351 | /* | ||
352 | * Does this new cpu bringup a new core? | ||
353 | */ | ||
354 | if (cpus_weight(cpu_sibling_map[cpu]) == 1) { | ||
355 | /* | ||
356 | * for each core in package, increment | ||
357 | * the booted_cores for this new cpu | ||
358 | */ | ||
359 | if (first_cpu(cpu_sibling_map[i]) == i) | ||
360 | c[cpu].booted_cores++; | ||
361 | /* | ||
362 | * increment the core count for all | ||
363 | * the other cpus in this package | ||
364 | */ | ||
365 | if (i != cpu) | ||
366 | c[i].booted_cores++; | ||
367 | } else if (i != cpu && !c[cpu].booted_cores) | ||
368 | c[cpu].booted_cores = c[i].booted_cores; | ||
369 | } | ||
370 | } | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Activate a secondary processor. | ||
375 | */ | ||
376 | static void __cpuinit start_secondary(void *unused) | ||
377 | { | ||
378 | /* | ||
379 | * Don't put *anything* before cpu_init(), SMP booting is too | ||
380 | * fragile that we want to limit the things done here to the | ||
381 | * most necessary things. | ||
382 | */ | ||
383 | #ifdef CONFIG_VMI | ||
384 | vmi_bringup(); | ||
385 | #endif | ||
386 | cpu_init(); | ||
387 | preempt_disable(); | ||
388 | smp_callin(); | ||
389 | while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) | ||
390 | rep_nop(); | ||
391 | /* | ||
392 | * Check TSC synchronization with the BP: | ||
393 | */ | ||
394 | check_tsc_sync_target(); | ||
395 | |||
396 | setup_secondary_clock(); | ||
397 | if (nmi_watchdog == NMI_IO_APIC) { | ||
398 | disable_8259A_irq(0); | ||
399 | enable_NMI_through_LVT0(NULL); | ||
400 | enable_8259A_irq(0); | ||
401 | } | ||
402 | /* | ||
403 | * low-memory mappings have been cleared, flush them from | ||
404 | * the local TLBs too. | ||
405 | */ | ||
406 | local_flush_tlb(); | ||
407 | |||
408 | /* This must be done before setting cpu_online_map */ | ||
409 | set_cpu_sibling_map(raw_smp_processor_id()); | ||
410 | wmb(); | ||
411 | |||
412 | /* | ||
413 | * We need to hold call_lock, so there is no inconsistency | ||
414 | * between the time smp_call_function() determines number of | ||
415 | * IPI receipients, and the time when the determination is made | ||
416 | * for which cpus receive the IPI. Holding this | ||
417 | * lock helps us to not include this cpu in a currently in progress | ||
418 | * smp_call_function(). | ||
419 | */ | ||
420 | lock_ipi_call_lock(); | ||
421 | cpu_set(smp_processor_id(), cpu_online_map); | ||
422 | unlock_ipi_call_lock(); | ||
423 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | ||
424 | |||
425 | /* We can take interrupts now: we're officially "up". */ | ||
426 | local_irq_enable(); | ||
427 | |||
428 | wmb(); | ||
429 | cpu_idle(); | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Everything has been set up for the secondary | ||
434 | * CPUs - they just need to reload everything | ||
435 | * from the task structure | ||
436 | * This function must not return. | ||
437 | */ | ||
438 | void __devinit initialize_secondary(void) | ||
439 | { | ||
440 | /* | ||
441 | * We don't actually need to load the full TSS, | ||
442 | * basically just the stack pointer and the eip. | ||
443 | */ | ||
444 | |||
445 | asm volatile( | ||
446 | "movl %0,%%esp\n\t" | ||
447 | "jmp *%1" | ||
448 | : | ||
449 | :"m" (current->thread.esp),"m" (current->thread.eip)); | ||
450 | } | ||
451 | |||
452 | /* Static state in head.S used to set up a CPU */ | ||
453 | extern struct { | ||
454 | void * esp; | ||
455 | unsigned short ss; | ||
456 | } stack_start; | ||
457 | |||
#ifdef CONFIG_NUMA

/* node -> logical CPUs on that node; every mask starts out empty */
cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
				{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
EXPORT_SYMBOL(node_2_cpu_mask);
/* logical CPU -> node; every entry starts out as node 0 */
int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
EXPORT_SYMBOL(cpu_2_node);

/* Record that 'cpu' lives on 'node' in both lookup tables. */
static inline void map_cpu_to_node(int cpu, int node)
{
	printk("Mapping cpu %d to node %d\n", cpu, node);
	cpu_2_node[cpu] = node;
	cpu_set(cpu, node_2_cpu_mask[node]);
}

/* Drop 'cpu' from every node mask and point it back at node 0. */
static inline void unmap_cpu_to_node(int cpu)
{
	int nid;

	printk("Unmapping cpu %d from all nodes\n", cpu);
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		cpu_clear(cpu, node_2_cpu_mask[nid]);
	cpu_2_node[cpu] = 0;
}
#else /* !CONFIG_NUMA */

/* Without NUMA there is nothing to track: both helpers expand to nothing. */
#define map_cpu_to_node(cpu, node)	({})
#define unmap_cpu_to_node(cpu)	({})

#endif /* CONFIG_NUMA */
492 | |||
493 | u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
494 | |||
495 | static void map_cpu_to_logical_apicid(void) | ||
496 | { | ||
497 | int cpu = smp_processor_id(); | ||
498 | int apicid = logical_smp_processor_id(); | ||
499 | int node = apicid_to_node(apicid); | ||
500 | |||
501 | if (!node_online(node)) | ||
502 | node = first_online_node; | ||
503 | |||
504 | cpu_2_logical_apicid[cpu] = apicid; | ||
505 | map_cpu_to_node(cpu, node); | ||
506 | } | ||
507 | |||
508 | static void unmap_cpu_to_logical_apicid(int cpu) | ||
509 | { | ||
510 | cpu_2_logical_apicid[cpu] = BAD_APICID; | ||
511 | unmap_cpu_to_node(cpu); | ||
512 | } | ||
513 | |||
514 | static inline void __inquire_remote_apic(int apicid) | ||
515 | { | ||
516 | int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; | ||
517 | char *names[] = { "ID", "VERSION", "SPIV" }; | ||
518 | int timeout; | ||
519 | unsigned long status; | ||
520 | |||
521 | printk("Inquiring remote APIC #%d...\n", apicid); | ||
522 | |||
523 | for (i = 0; i < ARRAY_SIZE(regs); i++) { | ||
524 | printk("... APIC #%d %s: ", apicid, names[i]); | ||
525 | |||
526 | /* | ||
527 | * Wait for idle. | ||
528 | */ | ||
529 | status = safe_apic_wait_icr_idle(); | ||
530 | if (status) | ||
531 | printk("a previous APIC delivery may have failed\n"); | ||
532 | |||
533 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); | ||
534 | apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); | ||
535 | |||
536 | timeout = 0; | ||
537 | do { | ||
538 | udelay(100); | ||
539 | status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; | ||
540 | } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); | ||
541 | |||
542 | switch (status) { | ||
543 | case APIC_ICR_RR_VALID: | ||
544 | status = apic_read(APIC_RRR); | ||
545 | printk("%lx\n", status); | ||
546 | break; | ||
547 | default: | ||
548 | printk("failed\n"); | ||
549 | } | ||
550 | } | ||
551 | } | ||
552 | |||
#ifdef WAKE_SECONDARY_VIA_NMI
/*
 * Poke the other CPU in the eye via NMI to wake it up.  Remember that
 * the normal INIT, INIT, STARTUP sequence will reset the chip hard for
 * us, and this won't ... remember to clear down the APIC, etc later.
 *
 * Returns 0 on success, otherwise the OR of the send status and the
 * masked APIC error status.  start_eip is not used by this variant.
 */
static int __devinit
wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
{
	unsigned long send_status, accept_status = 0;

	/* Target chip */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));

	/* Boot on the stack */
	/* Kick the second */
	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);

	Dprintk("Waiting for send to finish...\n");
	send_status = safe_apic_wait_icr_idle();

	/* Give the other CPU some time to accept the IPI. */
	udelay(200);

	/* Due to the Pentium erratum 3AP. */
	if (lapic_get_maxlvt() > 3) {
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
	}
	accept_status = (apic_read(APIC_ESR) & 0xEF);
	Dprintk("NMI sent.\n");

	if (send_status)
		printk("APIC never delivered???\n");
	if (accept_status)
		printk("APIC delivery error (%lx).\n", accept_status);

	return (send_status | accept_status);
}
#endif	/* WAKE_SECONDARY_VIA_NMI */
598 | |||
#ifdef WAKE_SECONDARY_VIA_INIT
/*
 * Wake the CPU with APIC ID 'phys_apicid' using an INIT assert, an INIT
 * deassert and, for integrated APICs, up to two STARTUP IPIs pointing
 * at 'start_eip'.
 *
 * Returns 0 on success, otherwise the OR of the last send status and
 * the masked APIC error status.
 */
static int __devinit
wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
{
	unsigned long send_status, accept_status = 0;
	int maxlvt, num_starts, attempt;

	/* Be paranoid about clearing APIC errors. */
	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
	}

	Dprintk("Asserting INIT.\n");

	/* Turn INIT on target chip ... */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/* ... and send the IPI. */
	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
				| APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	/* This status is overwritten by the deassert send below. */
	send_status = safe_apic_wait_icr_idle();

	mdelay(10);

	Dprintk("Deasserting INIT.\n");

	/* Target chip */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/* Send IPI */
	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	send_status = safe_apic_wait_icr_idle();

	atomic_set(&init_deasserted, 1);

	/*
	 * Should we send STARTUP IPIs ?
	 *
	 * Determine this based on the APIC version: without an
	 * integrated APIC no STARTUP IPIs are sent.
	 */
	num_starts = APIC_INTEGRATED(apic_version[phys_apicid]) ? 2 : 0;

	/*
	 * Paravirt / VMI wants a startup IPI hook here to set up the
	 * target processor state.
	 */
	startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
			 (unsigned long) stack_start.esp);

	/* Run STARTUP IPI loop. */
	Dprintk("#startup loops: %d.\n", num_starts);

	maxlvt = lapic_get_maxlvt();

	for (attempt = 1; attempt <= num_starts; attempt++) {
		Dprintk("Sending STARTUP #%d.\n",attempt);
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
		Dprintk("After apic_write.\n");

		/*
		 * STARTUP IPI
		 */

		/* Target chip */
		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

		/* Boot on the stack */
		/* Kick the second */
		apic_write_around(APIC_ICR, APIC_DM_STARTUP
					| (start_eip >> 12));

		/* Give the other CPU some time to accept the IPI. */
		udelay(300);

		Dprintk("Startup point 1.\n");

		Dprintk("Waiting for send to finish...\n");
		send_status = safe_apic_wait_icr_idle();

		/* Give the other CPU some time to accept the IPI. */
		udelay(200);

		/* Due to the Pentium erratum 3AP. */
		if (maxlvt > 3) {
			apic_read_around(APIC_SPIV);
			apic_write(APIC_ESR, 0);
		}
		accept_status = (apic_read(APIC_ESR) & 0xEF);

		/* Any error ends the sequence early. */
		if (send_status || accept_status)
			break;
	}
	Dprintk("After Startup.\n");

	if (send_status)
		printk("APIC never delivered???\n");
	if (accept_status)
		printk("APIC delivery error (%lx).\n", accept_status);

	return (send_status | accept_status);
}
#endif	/* WAKE_SECONDARY_VIA_INIT */
725 | |||
726 | extern cpumask_t cpu_initialized; | ||
727 | static inline int alloc_cpu_id(void) | ||
728 | { | ||
729 | cpumask_t tmp_map; | ||
730 | int cpu; | ||
731 | cpus_complement(tmp_map, cpu_present_map); | ||
732 | cpu = first_cpu(tmp_map); | ||
733 | if (cpu >= NR_CPUS) | ||
734 | return -ENODEV; | ||
735 | return cpu; | ||
736 | } | ||
737 | |||
#ifdef CONFIG_HOTPLUG_CPU
/* Idle task forked for each CPU, kept so that it can be reused later. */
static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];

/*
 * Return the idle task for 'cpu'.  A task remembered from an earlier
 * boot of that CPU is re-initialised and reused; otherwise a new one is
 * forked and, unless fork_idle() failed, remembered for next time.
 * The result may be an ERR_PTR value from fork_idle().
 */
static inline struct task_struct * alloc_idle_task(int cpu)
{
	struct task_struct *idle = cpu_idle_tasks[cpu];

	if (idle == NULL) {
		idle = fork_idle(cpu);
		if (!IS_ERR(idle))
			cpu_idle_tasks[cpu] = idle;
		return idle;
	}

	/*
	 * Initialize thread_struct: we really want to avoid destroying
	 * the idle thread.
	 */
	idle->thread.esp = (unsigned long)task_pt_regs(idle);
	init_idle(idle, cpu);
	return idle;
}
#else
#define alloc_idle_task(cpu) fork_idle(cpu)
#endif
761 | |||
762 | static int __cpuinit do_boot_cpu(int apicid, int cpu) | ||
763 | /* | ||
764 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad | ||
765 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. | ||
766 | * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. | ||
767 | */ | ||
768 | { | ||
769 | struct task_struct *idle; | ||
770 | unsigned long boot_error; | ||
771 | int timeout; | ||
772 | unsigned long start_eip; | ||
773 | unsigned short nmi_high = 0, nmi_low = 0; | ||
774 | |||
775 | /* | ||
776 | * Save current MTRR state in case it was changed since early boot | ||
777 | * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: | ||
778 | */ | ||
779 | mtrr_save_state(); | ||
780 | |||
781 | /* | ||
782 | * We can't use kernel_thread since we must avoid to | ||
783 | * reschedule the child. | ||
784 | */ | ||
785 | idle = alloc_idle_task(cpu); | ||
786 | if (IS_ERR(idle)) | ||
787 | panic("failed fork for CPU %d", cpu); | ||
788 | |||
789 | init_gdt(cpu); | ||
790 | per_cpu(current_task, cpu) = idle; | ||
791 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | ||
792 | |||
793 | idle->thread.eip = (unsigned long) start_secondary; | ||
794 | /* start_eip had better be page-aligned! */ | ||
795 | start_eip = setup_trampoline(); | ||
796 | |||
797 | ++cpucount; | ||
798 | alternatives_smp_switch(1); | ||
799 | |||
800 | /* So we see what's up */ | ||
801 | printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); | ||
802 | /* Stack for startup_32 can be just as for start_secondary onwards */ | ||
803 | stack_start.esp = (void *) idle->thread.esp; | ||
804 | |||
805 | irq_ctx_init(cpu); | ||
806 | |||
807 | x86_cpu_to_apicid[cpu] = apicid; | ||
808 | /* | ||
809 | * This grunge runs the startup process for | ||
810 | * the targeted processor. | ||
811 | */ | ||
812 | |||
813 | atomic_set(&init_deasserted, 0); | ||
814 | |||
815 | Dprintk("Setting warm reset code and vector.\n"); | ||
816 | |||
817 | store_NMI_vector(&nmi_high, &nmi_low); | ||
818 | |||
819 | smpboot_setup_warm_reset_vector(start_eip); | ||
820 | |||
821 | /* | ||
822 | * Starting actual IPI sequence... | ||
823 | */ | ||
824 | boot_error = wakeup_secondary_cpu(apicid, start_eip); | ||
825 | |||
826 | if (!boot_error) { | ||
827 | /* | ||
828 | * allow APs to start initializing. | ||
829 | */ | ||
830 | Dprintk("Before Callout %d.\n", cpu); | ||
831 | cpu_set(cpu, cpu_callout_map); | ||
832 | Dprintk("After Callout %d.\n", cpu); | ||
833 | |||
834 | /* | ||
835 | * Wait 5s total for a response | ||
836 | */ | ||
837 | for (timeout = 0; timeout < 50000; timeout++) { | ||
838 | if (cpu_isset(cpu, cpu_callin_map)) | ||
839 | break; /* It has booted */ | ||
840 | udelay(100); | ||
841 | } | ||
842 | |||
843 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
844 | /* number CPUs logically, starting from 1 (BSP is 0) */ | ||
845 | Dprintk("OK.\n"); | ||
846 | printk("CPU%d: ", cpu); | ||
847 | print_cpu_info(&cpu_data[cpu]); | ||
848 | Dprintk("CPU has booted.\n"); | ||
849 | } else { | ||
850 | boot_error= 1; | ||
851 | if (*((volatile unsigned char *)trampoline_base) | ||
852 | == 0xA5) | ||
853 | /* trampoline started but...? */ | ||
854 | printk("Stuck ??\n"); | ||
855 | else | ||
856 | /* trampoline code not run */ | ||
857 | printk("Not responding.\n"); | ||
858 | inquire_remote_apic(apicid); | ||
859 | } | ||
860 | } | ||
861 | |||
862 | if (boot_error) { | ||
863 | /* Try to put things back the way they were before ... */ | ||
864 | unmap_cpu_to_logical_apicid(cpu); | ||
865 | cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ | ||
866 | cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ | ||
867 | cpucount--; | ||
868 | } else { | ||
869 | x86_cpu_to_apicid[cpu] = apicid; | ||
870 | cpu_set(cpu, cpu_present_map); | ||
871 | } | ||
872 | |||
873 | /* mark "stuck" area as not stuck */ | ||
874 | *((volatile unsigned long *)trampoline_base) = 0; | ||
875 | |||
876 | return boot_error; | ||
877 | } | ||
878 | |||
879 | #ifdef CONFIG_HOTPLUG_CPU | ||
880 | void cpu_exit_clear(void) | ||
881 | { | ||
882 | int cpu = raw_smp_processor_id(); | ||
883 | |||
884 | idle_task_exit(); | ||
885 | |||
886 | cpucount --; | ||
887 | cpu_uninit(); | ||
888 | irq_ctx_exit(cpu); | ||
889 | |||
890 | cpu_clear(cpu, cpu_callout_map); | ||
891 | cpu_clear(cpu, cpu_callin_map); | ||
892 | |||
893 | cpu_clear(cpu, smp_commenced_mask); | ||
894 | unmap_cpu_to_logical_apicid(cpu); | ||
895 | } | ||
896 | |||
897 | struct warm_boot_cpu_info { | ||
898 | struct completion *complete; | ||
899 | struct work_struct task; | ||
900 | int apicid; | ||
901 | int cpu; | ||
902 | }; | ||
903 | |||
904 | static void __cpuinit do_warm_boot_cpu(struct work_struct *work) | ||
905 | { | ||
906 | struct warm_boot_cpu_info *info = | ||
907 | container_of(work, struct warm_boot_cpu_info, task); | ||
908 | do_boot_cpu(info->apicid, info->cpu); | ||
909 | complete(info->complete); | ||
910 | } | ||
911 | |||
912 | static int __cpuinit __smp_prepare_cpu(int cpu) | ||
913 | { | ||
914 | DECLARE_COMPLETION_ONSTACK(done); | ||
915 | struct warm_boot_cpu_info info; | ||
916 | int apicid, ret; | ||
917 | |||
918 | apicid = x86_cpu_to_apicid[cpu]; | ||
919 | if (apicid == BAD_APICID) { | ||
920 | ret = -ENODEV; | ||
921 | goto exit; | ||
922 | } | ||
923 | |||
924 | info.complete = &done; | ||
925 | info.apicid = apicid; | ||
926 | info.cpu = cpu; | ||
927 | INIT_WORK(&info.task, do_warm_boot_cpu); | ||
928 | |||
929 | /* init low mem mapping */ | ||
930 | clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, | ||
931 | min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); | ||
932 | flush_tlb_all(); | ||
933 | schedule_work(&info.task); | ||
934 | wait_for_completion(&done); | ||
935 | |||
936 | zap_low_mappings(); | ||
937 | ret = 0; | ||
938 | exit: | ||
939 | return ret; | ||
940 | } | ||
941 | #endif | ||
942 | |||
943 | /* | ||
944 | * Cycle through the processors sending APIC IPIs to boot each. | ||
945 | */ | ||
946 | |||
947 | static int boot_cpu_logical_apicid; | ||
948 | /* Where the IO area was mapped on multiquad, always 0 otherwise */ | ||
949 | void *xquad_portio; | ||
950 | #ifdef CONFIG_X86_NUMAQ | ||
951 | EXPORT_SYMBOL(xquad_portio); | ||
952 | #endif | ||
953 | |||
954 | static void __init smp_boot_cpus(unsigned int max_cpus) | ||
955 | { | ||
956 | int apicid, cpu, bit, kicked; | ||
957 | unsigned long bogosum = 0; | ||
958 | |||
959 | /* | ||
960 | * Setup boot CPU information | ||
961 | */ | ||
962 | smp_store_cpu_info(0); /* Final full version of the data */ | ||
963 | printk("CPU%d: ", 0); | ||
964 | print_cpu_info(&cpu_data[0]); | ||
965 | |||
966 | boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); | ||
967 | boot_cpu_logical_apicid = logical_smp_processor_id(); | ||
968 | x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; | ||
969 | |||
970 | current_thread_info()->cpu = 0; | ||
971 | |||
972 | set_cpu_sibling_map(0); | ||
973 | |||
974 | /* | ||
975 | * If we couldn't find an SMP configuration at boot time, | ||
976 | * get out of here now! | ||
977 | */ | ||
978 | if (!smp_found_config && !acpi_lapic) { | ||
979 | printk(KERN_NOTICE "SMP motherboard not detected.\n"); | ||
980 | smpboot_clear_io_apic_irqs(); | ||
981 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
982 | if (APIC_init_uniprocessor()) | ||
983 | printk(KERN_NOTICE "Local APIC not detected." | ||
984 | " Using dummy APIC emulation.\n"); | ||
985 | map_cpu_to_logical_apicid(); | ||
986 | cpu_set(0, cpu_sibling_map[0]); | ||
987 | cpu_set(0, cpu_core_map[0]); | ||
988 | return; | ||
989 | } | ||
990 | |||
991 | /* | ||
992 | * Should not be necessary because the MP table should list the boot | ||
993 | * CPU too, but we do it for the sake of robustness anyway. | ||
994 | * Makes no sense to do this check in clustered apic mode, so skip it | ||
995 | */ | ||
996 | if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { | ||
997 | printk("weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
998 | boot_cpu_physical_apicid); | ||
999 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
1000 | } | ||
1001 | |||
1002 | /* | ||
1003 | * If we couldn't find a local APIC, then get out of here now! | ||
1004 | */ | ||
1005 | if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { | ||
1006 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | ||
1007 | boot_cpu_physical_apicid); | ||
1008 | printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); | ||
1009 | smpboot_clear_io_apic_irqs(); | ||
1010 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
1011 | cpu_set(0, cpu_sibling_map[0]); | ||
1012 | cpu_set(0, cpu_core_map[0]); | ||
1013 | return; | ||
1014 | } | ||
1015 | |||
1016 | verify_local_APIC(); | ||
1017 | |||
1018 | /* | ||
1019 | * If SMP should be disabled, then really disable it! | ||
1020 | */ | ||
1021 | if (!max_cpus) { | ||
1022 | smp_found_config = 0; | ||
1023 | printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); | ||
1024 | smpboot_clear_io_apic_irqs(); | ||
1025 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
1026 | cpu_set(0, cpu_sibling_map[0]); | ||
1027 | cpu_set(0, cpu_core_map[0]); | ||
1028 | return; | ||
1029 | } | ||
1030 | |||
1031 | connect_bsp_APIC(); | ||
1032 | setup_local_APIC(); | ||
1033 | map_cpu_to_logical_apicid(); | ||
1034 | |||
1035 | |||
1036 | setup_portio_remap(); | ||
1037 | |||
1038 | /* | ||
1039 | * Scan the CPU present map and fire up the other CPUs via do_boot_cpu | ||
1040 | * | ||
1041 | * In clustered apic mode, phys_cpu_present_map is a constructed thus: | ||
1042 | * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the | ||
1043 | * clustered apic ID. | ||
1044 | */ | ||
1045 | Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); | ||
1046 | |||
1047 | kicked = 1; | ||
1048 | for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { | ||
1049 | apicid = cpu_present_to_apicid(bit); | ||
1050 | /* | ||
1051 | * Don't even attempt to start the boot CPU! | ||
1052 | */ | ||
1053 | if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID)) | ||
1054 | continue; | ||
1055 | |||
1056 | if (!check_apicid_present(bit)) | ||
1057 | continue; | ||
1058 | if (max_cpus <= cpucount+1) | ||
1059 | continue; | ||
1060 | |||
1061 | if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) | ||
1062 | printk("CPU #%d not responding - cannot use it.\n", | ||
1063 | apicid); | ||
1064 | else | ||
1065 | ++kicked; | ||
1066 | } | ||
1067 | |||
1068 | /* | ||
1069 | * Cleanup possible dangling ends... | ||
1070 | */ | ||
1071 | smpboot_restore_warm_reset_vector(); | ||
1072 | |||
1073 | /* | ||
1074 | * Allow the user to impress friends. | ||
1075 | */ | ||
1076 | Dprintk("Before bogomips.\n"); | ||
1077 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
1078 | if (cpu_isset(cpu, cpu_callout_map)) | ||
1079 | bogosum += cpu_data[cpu].loops_per_jiffy; | ||
1080 | printk(KERN_INFO | ||
1081 | "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", | ||
1082 | cpucount+1, | ||
1083 | bogosum/(500000/HZ), | ||
1084 | (bogosum/(5000/HZ))%100); | ||
1085 | |||
1086 | Dprintk("Before bogocount - setting activated=1.\n"); | ||
1087 | |||
1088 | if (smp_b_stepping) | ||
1089 | printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n"); | ||
1090 | |||
1091 | /* | ||
1092 | * Don't taint if we are running SMP kernel on a single non-MP | ||
1093 | * approved Athlon | ||
1094 | */ | ||
1095 | if (tainted & TAINT_UNSAFE_SMP) { | ||
1096 | if (cpucount) | ||
1097 | printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n"); | ||
1098 | else | ||
1099 | tainted &= ~TAINT_UNSAFE_SMP; | ||
1100 | } | ||
1101 | |||
1102 | Dprintk("Boot done.\n"); | ||
1103 | |||
1104 | /* | ||
1105 | * construct cpu_sibling_map[], so that we can tell sibling CPUs | ||
1106 | * efficiently. | ||
1107 | */ | ||
1108 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
1109 | cpus_clear(cpu_sibling_map[cpu]); | ||
1110 | cpus_clear(cpu_core_map[cpu]); | ||
1111 | } | ||
1112 | |||
1113 | cpu_set(0, cpu_sibling_map[0]); | ||
1114 | cpu_set(0, cpu_core_map[0]); | ||
1115 | |||
1116 | smpboot_setup_io_apic(); | ||
1117 | |||
1118 | setup_boot_clock(); | ||
1119 | } | ||
1120 | |||
1121 | /* These are wrappers to interface to the new boot process. Someone | ||
1122 | who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ | ||
1123 | void __init native_smp_prepare_cpus(unsigned int max_cpus) | ||
1124 | { | ||
1125 | smp_commenced_mask = cpumask_of_cpu(0); | ||
1126 | cpu_callin_map = cpumask_of_cpu(0); | ||
1127 | mb(); | ||
1128 | smp_boot_cpus(max_cpus); | ||
1129 | } | ||
1130 | |||
1131 | void __init native_smp_prepare_boot_cpu(void) | ||
1132 | { | ||
1133 | unsigned int cpu = smp_processor_id(); | ||
1134 | |||
1135 | init_gdt(cpu); | ||
1136 | switch_to_new_gdt(); | ||
1137 | |||
1138 | cpu_set(cpu, cpu_online_map); | ||
1139 | cpu_set(cpu, cpu_callout_map); | ||
1140 | cpu_set(cpu, cpu_present_map); | ||
1141 | cpu_set(cpu, cpu_possible_map); | ||
1142 | __get_cpu_var(cpu_state) = CPU_ONLINE; | ||
1143 | } | ||
1144 | |||
1145 | #ifdef CONFIG_HOTPLUG_CPU | ||
1146 | void remove_siblinginfo(int cpu) | ||
1147 | { | ||
1148 | int sibling; | ||
1149 | struct cpuinfo_x86 *c = cpu_data; | ||
1150 | |||
1151 | for_each_cpu_mask(sibling, cpu_core_map[cpu]) { | ||
1152 | cpu_clear(cpu, cpu_core_map[sibling]); | ||
1153 | /* | ||
1154 | * last thread sibling in this cpu core going down | ||
1155 | */ | ||
1156 | if (cpus_weight(cpu_sibling_map[cpu]) == 1) | ||
1157 | c[sibling].booted_cores--; | ||
1158 | } | ||
1159 | |||
1160 | for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) | ||
1161 | cpu_clear(cpu, cpu_sibling_map[sibling]); | ||
1162 | cpus_clear(cpu_sibling_map[cpu]); | ||
1163 | cpus_clear(cpu_core_map[cpu]); | ||
1164 | c[cpu].phys_proc_id = 0; | ||
1165 | c[cpu].cpu_core_id = 0; | ||
1166 | cpu_clear(cpu, cpu_sibling_setup_map); | ||
1167 | } | ||
1168 | |||
1169 | int __cpu_disable(void) | ||
1170 | { | ||
1171 | cpumask_t map = cpu_online_map; | ||
1172 | int cpu = smp_processor_id(); | ||
1173 | |||
1174 | /* | ||
1175 | * Perhaps use cpufreq to drop frequency, but that could go | ||
1176 | * into generic code. | ||
1177 | * | ||
1178 | * We won't take down the boot processor on i386 due to some | ||
1179 | * interrupts only being able to be serviced by the BSP. | ||
1180 | * Especially so if we're not using an IOAPIC -zwane | ||
1181 | */ | ||
1182 | if (cpu == 0) | ||
1183 | return -EBUSY; | ||
1184 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1185 | stop_apic_nmi_watchdog(NULL); | ||
1186 | clear_local_APIC(); | ||
1187 | /* Allow any queued timer interrupts to get serviced */ | ||
1188 | local_irq_enable(); | ||
1189 | mdelay(1); | ||
1190 | local_irq_disable(); | ||
1191 | |||
1192 | remove_siblinginfo(cpu); | ||
1193 | |||
1194 | cpu_clear(cpu, map); | ||
1195 | fixup_irqs(map); | ||
1196 | /* It's now safe to remove this processor from the online map */ | ||
1197 | cpu_clear(cpu, cpu_online_map); | ||
1198 | return 0; | ||
1199 | } | ||
1200 | |||
1201 | void __cpu_die(unsigned int cpu) | ||
1202 | { | ||
1203 | /* We don't do anything here: idle task is faking death itself. */ | ||
1204 | unsigned int i; | ||
1205 | |||
1206 | for (i = 0; i < 10; i++) { | ||
1207 | /* They ack this in play_dead by setting CPU_DEAD */ | ||
1208 | if (per_cpu(cpu_state, cpu) == CPU_DEAD) { | ||
1209 | printk ("CPU %d is now offline\n", cpu); | ||
1210 | if (1 == num_online_cpus()) | ||
1211 | alternatives_smp_switch(0); | ||
1212 | return; | ||
1213 | } | ||
1214 | msleep(100); | ||
1215 | } | ||
1216 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); | ||
1217 | } | ||
1218 | #else /* ... !CONFIG_HOTPLUG_CPU */ | ||
/* Variant built without CONFIG_HOTPLUG_CPU: always refuses with -ENOSYS. */
int __cpu_disable(void)
{
	const int unsupported = -ENOSYS;

	return unsupported;
}
1223 | |||
/*
 * Variant built without CONFIG_HOTPLUG_CPU.  __cpu_disable() said "no",
 * so reaching this function is a bug.
 */
void __cpu_die(unsigned int cpu)
{
	BUG();
}
1229 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1230 | |||
1231 | int __cpuinit native_cpu_up(unsigned int cpu) | ||
1232 | { | ||
1233 | unsigned long flags; | ||
1234 | #ifdef CONFIG_HOTPLUG_CPU | ||
1235 | int ret = 0; | ||
1236 | |||
1237 | /* | ||
1238 | * We do warm boot only on cpus that had booted earlier | ||
1239 | * Otherwise cold boot is all handled from smp_boot_cpus(). | ||
1240 | * cpu_callin_map is set during AP kickstart process. Its reset | ||
1241 | * when a cpu is taken offline from cpu_exit_clear(). | ||
1242 | */ | ||
1243 | if (!cpu_isset(cpu, cpu_callin_map)) | ||
1244 | ret = __smp_prepare_cpu(cpu); | ||
1245 | |||
1246 | if (ret) | ||
1247 | return -EIO; | ||
1248 | #endif | ||
1249 | |||
1250 | /* In case one didn't come up */ | ||
1251 | if (!cpu_isset(cpu, cpu_callin_map)) { | ||
1252 | printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); | ||
1253 | return -EIO; | ||
1254 | } | ||
1255 | |||
1256 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | ||
1257 | /* Unleash the CPU! */ | ||
1258 | cpu_set(cpu, smp_commenced_mask); | ||
1259 | |||
1260 | /* | ||
1261 | * Check TSC synchronization with the AP (keep irqs disabled | ||
1262 | * while doing so): | ||
1263 | */ | ||
1264 | local_irq_save(flags); | ||
1265 | check_tsc_sync_source(cpu); | ||
1266 | local_irq_restore(flags); | ||
1267 | |||
1268 | while (!cpu_isset(cpu, cpu_online_map)) { | ||
1269 | cpu_relax(); | ||
1270 | touch_nmi_watchdog(); | ||
1271 | } | ||
1272 | |||
1273 | return 0; | ||
1274 | } | ||
1275 | |||
1276 | void __init native_smp_cpus_done(unsigned int max_cpus) | ||
1277 | { | ||
1278 | #ifdef CONFIG_X86_IO_APIC | ||
1279 | setup_ioapic_dest(); | ||
1280 | #endif | ||
1281 | zap_low_mappings(); | ||
1282 | #ifndef CONFIG_HOTPLUG_CPU | ||
1283 | /* | ||
1284 | * Disable executability of the SMP trampoline: | ||
1285 | */ | ||
1286 | set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); | ||
1287 | #endif | ||
1288 | } | ||
1289 | |||
1290 | void __init smp_intr_init(void) | ||
1291 | { | ||
1292 | /* | ||
1293 | * IRQ0 must be given a fixed assignment and initialized, | ||
1294 | * because it's used before the IO-APIC is set up. | ||
1295 | */ | ||
1296 | set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); | ||
1297 | |||
1298 | /* | ||
1299 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
1300 | * IPI, driven by wakeup. | ||
1301 | */ | ||
1302 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
1303 | |||
1304 | /* IPI for invalidation */ | ||
1305 | set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | ||
1306 | |||
1307 | /* IPI for generic function call */ | ||
1308 | set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
1309 | } | ||
1310 | |||
1311 | /* | ||
1312 | * If the BIOS enumerates physical processors before logical, | ||
1313 | * maxcpus=N at enumeration-time can be used to disable HT. | ||
1314 | */ | ||
1315 | static int __init parse_maxcpus(char *arg) | ||
1316 | { | ||
1317 | extern unsigned int maxcpus; | ||
1318 | |||
1319 | maxcpus = simple_strtoul(arg, NULL, 0); | ||
1320 | return 0; | ||
1321 | } | ||
1322 | early_param("maxcpus", parse_maxcpus); | ||
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c new file mode 100644 index 000000000000..bbfe85a0f699 --- /dev/null +++ b/arch/x86/kernel/smpcommon_32.c | |||
@@ -0,0 +1,81 @@ | |||
1 | /* | ||
2 | * SMP stuff which is common to all sub-architectures. | ||
3 | */ | ||
4 | #include <linux/module.h> | ||
5 | #include <asm/smp.h> | ||
6 | |||
7 | DEFINE_PER_CPU(unsigned long, this_cpu_off); | ||
8 | EXPORT_PER_CPU_SYMBOL(this_cpu_off); | ||
9 | |||
10 | /* Initialize the CPU's GDT. This is either the boot CPU doing itself | ||
11 | (still using the master per-cpu area), or a CPU doing it for a | ||
12 | secondary which will soon come up. */ | ||
13 | __cpuinit void init_gdt(int cpu) | ||
14 | { | ||
15 | struct desc_struct *gdt = get_cpu_gdt_table(cpu); | ||
16 | |||
17 | pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, | ||
18 | (u32 *)&gdt[GDT_ENTRY_PERCPU].b, | ||
19 | __per_cpu_offset[cpu], 0xFFFFF, | ||
20 | 0x80 | DESCTYPE_S | 0x2, 0x8); | ||
21 | |||
22 | per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; | ||
23 | per_cpu(cpu_number, cpu) = cpu; | ||
24 | } | ||
25 | |||
26 | |||
27 | /** | ||
28 | * smp_call_function(): Run a function on all other CPUs. | ||
29 | * @func: The function to run. This must be fast and non-blocking. | ||
30 | * @info: An arbitrary pointer to pass to the function. | ||
31 | * @nonatomic: Unused. | ||
32 | * @wait: If true, wait (atomically) until function has completed on other CPUs. | ||
33 | * | ||
34 | * Returns 0 on success, else a negative status code. | ||
35 | * | ||
36 | * If @wait is true, then returns once @func has returned; otherwise | ||
37 | * it returns just before the target cpu calls @func. | ||
38 | * | ||
39 | * You must not call this function with disabled interrupts or from a | ||
40 | * hardware interrupt handler or from a bottom half handler. | ||
41 | */ | ||
42 | int smp_call_function(void (*func) (void *info), void *info, int nonatomic, | ||
43 | int wait) | ||
44 | { | ||
45 | return smp_call_function_mask(cpu_online_map, func, info, wait); | ||
46 | } | ||
47 | EXPORT_SYMBOL(smp_call_function); | ||
48 | |||
/**
 * smp_call_function_single - Run a function on a specific CPU
 * @cpu: The target CPU.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @nonatomic: Unused.
 * @wait: If true, wait until function has completed on other CPUs.
 *
 * When @cpu is the calling CPU, @func is invoked directly with local
 * interrupts disabled (they are enabled again afterwards) and 0 is
 * returned.  Otherwise the call is forwarded to
 * smp_call_function_mask() with a mask holding only @cpu.
 *
 * Returns 0 on success, else a negative status code.
 *
 * If @wait is true, then returns once @func has returned; otherwise
 * it returns just before the target cpu calls @func.
 */
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
			     int nonatomic, int wait)
{
	int status = 0;
	/* prevent preemption and reschedule on another processor */
	int self = get_cpu();

	if (cpu != self) {
		status = smp_call_function_mask(cpumask_of_cpu(cpu), func,
						info, wait);
	} else {
		local_irq_disable();
		func(info);
		local_irq_enable();
	}

	put_cpu();
	return status;
}
EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c new file mode 100644 index 000000000000..2a8713ec0f9a --- /dev/null +++ b/arch/x86/kernel/srat_32.c | |||
@@ -0,0 +1,360 @@ | |||
1 | /* | ||
2 | * Some of the code in this file has been gleaned from the 64 bit | ||
3 | * discontigmem support code base. | ||
4 | * | ||
5 | * Copyright (C) 2002, IBM Corp. | ||
6 | * | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | * | ||
24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> | ||
25 | */ | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mmzone.h> | ||
29 | #include <linux/acpi.h> | ||
30 | #include <linux/nodemask.h> | ||
31 | #include <asm/srat.h> | ||
32 | #include <asm/topology.h> | ||
33 | #include <asm/smp.h> | ||
34 | |||
35 | /* | ||
36 | * proximity macros and definitions | ||
37 | */ | ||
38 | #define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ | ||
39 | #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ | ||
40 | #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) | ||
41 | #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) | ||
42 | /* bitmap length; _PXM is at most 255 */ | ||
43 | #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) | ||
44 | static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ | ||
45 | |||
46 | #define MAX_CHUNKS_PER_NODE 3 | ||
47 | #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) | ||
48 | struct node_memory_chunk_s { | ||
49 | unsigned long start_pfn; | ||
50 | unsigned long end_pfn; | ||
51 | u8 pxm; // proximity domain of node | ||
52 | u8 nid; // which cnode contains this chunk? | ||
53 | u8 bank; // which mem bank on this node | ||
54 | }; | ||
55 | static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; | ||
56 | |||
57 | static int num_memory_chunks; /* total number of memory chunks */ | ||
58 | static u8 __initdata apicid_to_pxm[MAX_APICID]; | ||
59 | |||
60 | extern void * boot_ioremap(unsigned long, unsigned long); | ||
61 | |||
62 | /* Identify CPU proximity domains */ | ||
63 | static void __init parse_cpu_affinity_structure(char *p) | ||
64 | { | ||
65 | struct acpi_srat_cpu_affinity *cpu_affinity = | ||
66 | (struct acpi_srat_cpu_affinity *) p; | ||
67 | |||
68 | if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
69 | return; /* empty entry */ | ||
70 | |||
71 | /* mark this node as "seen" in node bitmap */ | ||
72 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); | ||
73 | |||
74 | apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; | ||
75 | |||
76 | printk("CPU 0x%02X in proximity domain 0x%02X\n", | ||
77 | cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Identify memory proximity domains and hot-remove capabilities. | ||
82 | * Fill node memory chunk list structure. | ||
83 | */ | ||
84 | static void __init parse_memory_affinity_structure (char *sratp) | ||
85 | { | ||
86 | unsigned long long paddr, size; | ||
87 | unsigned long start_pfn, end_pfn; | ||
88 | u8 pxm; | ||
89 | struct node_memory_chunk_s *p, *q, *pend; | ||
90 | struct acpi_srat_mem_affinity *memory_affinity = | ||
91 | (struct acpi_srat_mem_affinity *) sratp; | ||
92 | |||
93 | if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
94 | return; /* empty entry */ | ||
95 | |||
96 | pxm = memory_affinity->proximity_domain & 0xff; | ||
97 | |||
98 | /* mark this node as "seen" in node bitmap */ | ||
99 | BMAP_SET(pxm_bitmap, pxm); | ||
100 | |||
101 | /* calculate info for memory chunk structure */ | ||
102 | paddr = memory_affinity->base_address; | ||
103 | size = memory_affinity->length; | ||
104 | |||
105 | start_pfn = paddr >> PAGE_SHIFT; | ||
106 | end_pfn = (paddr + size) >> PAGE_SHIFT; | ||
107 | |||
108 | |||
109 | if (num_memory_chunks >= MAXCHUNKS) { | ||
110 | printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", | ||
111 | size/(1024*1024), paddr); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | /* Insertion sort based on base address */ | ||
116 | pend = &node_memory_chunk[num_memory_chunks]; | ||
117 | for (p = &node_memory_chunk[0]; p < pend; p++) { | ||
118 | if (start_pfn < p->start_pfn) | ||
119 | break; | ||
120 | } | ||
121 | if (p < pend) { | ||
122 | for (q = pend; q >= p; q--) | ||
123 | *(q + 1) = *q; | ||
124 | } | ||
125 | p->start_pfn = start_pfn; | ||
126 | p->end_pfn = end_pfn; | ||
127 | p->pxm = pxm; | ||
128 | |||
129 | num_memory_chunks++; | ||
130 | |||
131 | printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", | ||
132 | start_pfn, end_pfn, | ||
133 | memory_affinity->memory_type, | ||
134 | pxm, | ||
135 | ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? | ||
136 | "enabled and removable" : "enabled" ) ); | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * The SRAT table always lists ascending addresses, so can always | ||
141 | * assume that the first "start" address that you see is the real | ||
142 | * start of the node, and that the current "end" address is after | ||
143 | * the previous one. | ||
144 | */ | ||
145 | static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) | ||
146 | { | ||
147 | /* | ||
148 | * Only add present memory as told by the e820. | ||
149 | * There is no guarantee from the SRAT that the memory it | ||
150 | * enumerates is present at boot time because it represents | ||
151 | * *possible* memory hotplug areas the same as normal RAM. | ||
152 | */ | ||
153 | if (memory_chunk->start_pfn >= max_pfn) { | ||
154 | printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", | ||
155 | memory_chunk->start_pfn, memory_chunk->end_pfn); | ||
156 | return; | ||
157 | } | ||
158 | if (memory_chunk->nid != nid) | ||
159 | return; | ||
160 | |||
161 | if (!node_has_online_mem(nid)) | ||
162 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
163 | |||
164 | if (node_start_pfn[nid] > memory_chunk->start_pfn) | ||
165 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
166 | |||
167 | if (node_end_pfn[nid] < memory_chunk->end_pfn) | ||
168 | node_end_pfn[nid] = memory_chunk->end_pfn; | ||
169 | } | ||
170 | |||
171 | /* Parse the ACPI Static Resource Affinity Table */ | ||
172 | static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) | ||
173 | { | ||
174 | u8 *start, *end, *p; | ||
175 | int i, j, nid; | ||
176 | |||
177 | start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ | ||
178 | p = start; | ||
179 | end = (u8 *)sratp + sratp->header.length; | ||
180 | |||
181 | memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ | ||
182 | memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); | ||
183 | |||
184 | num_memory_chunks = 0; | ||
185 | while (p < end) { | ||
186 | switch (*p) { | ||
187 | case ACPI_SRAT_TYPE_CPU_AFFINITY: | ||
188 | parse_cpu_affinity_structure(p); | ||
189 | break; | ||
190 | case ACPI_SRAT_TYPE_MEMORY_AFFINITY: | ||
191 | parse_memory_affinity_structure(p); | ||
192 | break; | ||
193 | default: | ||
194 | printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]); | ||
195 | break; | ||
196 | } | ||
197 | p += p[1]; | ||
198 | if (p[1] == 0) { | ||
199 | printk("acpi20_parse_srat: Entry length value is zero;" | ||
200 | " can't parse any further!\n"); | ||
201 | break; | ||
202 | } | ||
203 | } | ||
204 | |||
205 | if (num_memory_chunks == 0) { | ||
206 | printk("could not finy any ACPI SRAT memory areas.\n"); | ||
207 | goto out_fail; | ||
208 | } | ||
209 | |||
210 | /* Calculate total number of nodes in system from PXM bitmap and create | ||
211 | * a set of sequential node IDs starting at zero. (ACPI doesn't seem | ||
212 | * to specify the range of _PXM values.) | ||
213 | */ | ||
214 | /* | ||
215 | * MCD - we no longer HAVE to number nodes sequentially. PXM domain | ||
216 | * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically | ||
217 | * 32, so we will continue numbering them in this manner until MAX_NUMNODES | ||
218 | * approaches MAX_PXM_DOMAINS for i386. | ||
219 | */ | ||
220 | nodes_clear(node_online_map); | ||
221 | for (i = 0; i < MAX_PXM_DOMAINS; i++) { | ||
222 | if (BMAP_TEST(pxm_bitmap, i)) { | ||
223 | int nid = acpi_map_pxm_to_node(i); | ||
224 | node_set_online(nid); | ||
225 | } | ||
226 | } | ||
227 | BUG_ON(num_online_nodes() == 0); | ||
228 | |||
229 | /* set cnode id in memory chunk structure */ | ||
230 | for (i = 0; i < num_memory_chunks; i++) | ||
231 | node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); | ||
232 | |||
233 | printk("pxm bitmap: "); | ||
234 | for (i = 0; i < sizeof(pxm_bitmap); i++) { | ||
235 | printk("%02X ", pxm_bitmap[i]); | ||
236 | } | ||
237 | printk("\n"); | ||
238 | printk("Number of logical nodes in system = %d\n", num_online_nodes()); | ||
239 | printk("Number of memory chunks in system = %d\n", num_memory_chunks); | ||
240 | |||
241 | for (i = 0; i < MAX_APICID; i++) | ||
242 | apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); | ||
243 | |||
244 | for (j = 0; j < num_memory_chunks; j++){ | ||
245 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; | ||
246 | printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", | ||
247 | j, chunk->nid, chunk->start_pfn, chunk->end_pfn); | ||
248 | node_read_chunk(chunk->nid, chunk); | ||
249 | add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); | ||
250 | } | ||
251 | |||
252 | for_each_online_node(nid) { | ||
253 | unsigned long start = node_start_pfn[nid]; | ||
254 | unsigned long end = node_end_pfn[nid]; | ||
255 | |||
256 | memory_present(nid, start, end); | ||
257 | node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); | ||
258 | } | ||
259 | return 1; | ||
260 | out_fail: | ||
261 | return 0; | ||
262 | } | ||
263 | |||
264 | struct acpi_static_rsdt { | ||
265 | struct acpi_table_rsdt table; | ||
266 | u32 padding[7]; /* Allow for 7 more table entries */ | ||
267 | }; | ||
268 | |||
269 | int __init get_memcfg_from_srat(void) | ||
270 | { | ||
271 | struct acpi_table_header *header = NULL; | ||
272 | struct acpi_table_rsdp *rsdp = NULL; | ||
273 | struct acpi_table_rsdt *rsdt = NULL; | ||
274 | acpi_native_uint rsdp_address = 0; | ||
275 | struct acpi_static_rsdt saved_rsdt; | ||
276 | int tables = 0; | ||
277 | int i = 0; | ||
278 | |||
279 | rsdp_address = acpi_find_rsdp(); | ||
280 | if (!rsdp_address) { | ||
281 | printk("%s: System description tables not found\n", | ||
282 | __FUNCTION__); | ||
283 | goto out_err; | ||
284 | } | ||
285 | |||
286 | printk("%s: assigning address to rsdp\n", __FUNCTION__); | ||
287 | rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address; | ||
288 | if (!rsdp) { | ||
289 | printk("%s: Didn't find ACPI root!\n", __FUNCTION__); | ||
290 | goto out_err; | ||
291 | } | ||
292 | |||
293 | printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision, | ||
294 | rsdp->oem_id); | ||
295 | |||
296 | if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) { | ||
297 | printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__); | ||
298 | goto out_err; | ||
299 | } | ||
300 | |||
301 | rsdt = (struct acpi_table_rsdt *) | ||
302 | boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); | ||
303 | |||
304 | if (!rsdt) { | ||
305 | printk(KERN_WARNING | ||
306 | "%s: ACPI: Invalid root system description tables (RSDT)\n", | ||
307 | __FUNCTION__); | ||
308 | goto out_err; | ||
309 | } | ||
310 | |||
311 | header = &rsdt->header; | ||
312 | |||
313 | if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) { | ||
314 | printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); | ||
315 | goto out_err; | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * The number of tables is computed by taking the | ||
320 | * size of all entries (header size minus total | ||
321 | * size of RSDT) divided by the size of each entry | ||
322 | * (4-byte table pointers). | ||
323 | */ | ||
324 | tables = (header->length - sizeof(struct acpi_table_header)) / 4; | ||
325 | |||
326 | if (!tables) | ||
327 | goto out_err; | ||
328 | |||
329 | memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); | ||
330 | |||
331 | if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) { | ||
332 | printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", | ||
333 | saved_rsdt.table.header.length); | ||
334 | goto out_err; | ||
335 | } | ||
336 | |||
337 | printk("Begin SRAT table scan....\n"); | ||
338 | |||
339 | for (i = 0; i < tables; i++) { | ||
340 | /* Map in header, then map in full table length. */ | ||
341 | header = (struct acpi_table_header *) | ||
342 | boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); | ||
343 | if (!header) | ||
344 | break; | ||
345 | header = (struct acpi_table_header *) | ||
346 | boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); | ||
347 | if (!header) | ||
348 | break; | ||
349 | |||
350 | if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) | ||
351 | continue; | ||
352 | |||
353 | /* we've found the srat table. don't need to look at any more tables */ | ||
354 | return acpi20_parse_srat((struct acpi_table_srat *)header); | ||
355 | } | ||
356 | out_err: | ||
357 | remove_all_active_ranges(); | ||
358 | printk("failed to get NUMA memory information from SRAT table\n"); | ||
359 | return 0; | ||
360 | } | ||
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c new file mode 100644 index 000000000000..d0e01a3acf35 --- /dev/null +++ b/arch/x86/kernel/summit_32.c | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | * arch/i386/kernel/summit.c - IBM Summit-Specific Code | ||
3 | * | ||
4 | * Written By: Matthew Dobson, IBM Corporation | ||
5 | * | ||
6 | * Copyright (c) 2003 IBM Corp. | ||
7 | * | ||
8 | * All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or (at | ||
13 | * your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, but | ||
16 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
18 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
19 | * details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to the Free Software | ||
23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
24 | * | ||
25 | * Send feedback to <colpatch@us.ibm.com> | ||
26 | * | ||
27 | */ | ||
28 | |||
29 | #include <linux/mm.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <asm/io.h> | ||
32 | #include <asm/mach-summit/mach_mpparse.h> | ||
33 | |||
34 | static struct rio_table_hdr *rio_table_hdr __initdata; | ||
35 | static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; | ||
36 | static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; | ||
37 | |||
38 | static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) | ||
39 | { | ||
40 | int twister = 0, node = 0; | ||
41 | int i, bus, num_buses; | ||
42 | |||
43 | for(i = 0; i < rio_table_hdr->num_rio_dev; i++){ | ||
44 | if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){ | ||
45 | twister = rio_devs[i]->owner_id; | ||
46 | break; | ||
47 | } | ||
48 | } | ||
49 | if (i == rio_table_hdr->num_rio_dev){ | ||
50 | printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__); | ||
51 | return last_bus; | ||
52 | } | ||
53 | |||
54 | for(i = 0; i < rio_table_hdr->num_scal_dev; i++){ | ||
55 | if (scal_devs[i]->node_id == twister){ | ||
56 | node = scal_devs[i]->node_id; | ||
57 | break; | ||
58 | } | ||
59 | } | ||
60 | if (i == rio_table_hdr->num_scal_dev){ | ||
61 | printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__); | ||
62 | return last_bus; | ||
63 | } | ||
64 | |||
65 | switch (rio_devs[wpeg_num]->type){ | ||
66 | case CompatWPEG: | ||
67 | /* The Compatability Winnipeg controls the 2 legacy buses, | ||
68 | * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case | ||
69 | * a PCI-PCI bridge card is used in either slot: total 5 buses. | ||
70 | */ | ||
71 | num_buses = 5; | ||
72 | break; | ||
73 | case AltWPEG: | ||
74 | /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot | ||
75 | * each], their 2 "extra" buses, the 100MHz bus [2 slots] and | ||
76 | * the "extra" buses for each of those slots: total 7 buses. | ||
77 | */ | ||
78 | num_buses = 7; | ||
79 | break; | ||
80 | case LookOutAWPEG: | ||
81 | case LookOutBWPEG: | ||
82 | /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each] | ||
83 | * & the "extra" buses for each of those slots: total 9 buses. | ||
84 | */ | ||
85 | num_buses = 9; | ||
86 | break; | ||
87 | default: | ||
88 | printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__); | ||
89 | return last_bus; | ||
90 | } | ||
91 | |||
92 | for(bus = last_bus; bus < last_bus + num_buses; bus++) | ||
93 | mp_bus_id_to_node[bus] = node; | ||
94 | return bus; | ||
95 | } | ||
96 | |||
97 | static int __init build_detail_arrays(void) | ||
98 | { | ||
99 | unsigned long ptr; | ||
100 | int i, scal_detail_size, rio_detail_size; | ||
101 | |||
102 | if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ | ||
103 | printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); | ||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | switch (rio_table_hdr->version){ | ||
108 | default: | ||
109 | printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version); | ||
110 | return 0; | ||
111 | case 2: | ||
112 | scal_detail_size = 11; | ||
113 | rio_detail_size = 13; | ||
114 | break; | ||
115 | case 3: | ||
116 | scal_detail_size = 12; | ||
117 | rio_detail_size = 15; | ||
118 | break; | ||
119 | } | ||
120 | |||
121 | ptr = (unsigned long)rio_table_hdr + 3; | ||
122 | for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) | ||
123 | scal_devs[i] = (struct scal_detail *)ptr; | ||
124 | |||
125 | for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) | ||
126 | rio_devs[i] = (struct rio_detail *)ptr; | ||
127 | |||
128 | return 1; | ||
129 | } | ||
130 | |||
131 | void __init setup_summit(void) | ||
132 | { | ||
133 | unsigned long ptr; | ||
134 | unsigned short offset; | ||
135 | int i, next_wpeg, next_bus = 0; | ||
136 | |||
137 | /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ | ||
138 | ptr = *(unsigned short *)phys_to_virt(0x40Eul); | ||
139 | ptr = (unsigned long)phys_to_virt(ptr << 4); | ||
140 | |||
141 | rio_table_hdr = NULL; | ||
142 | offset = 0x180; | ||
143 | while (offset){ | ||
144 | /* The block id is stored in the 2nd word */ | ||
145 | if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ | ||
146 | /* set the pointer past the offset & block id */ | ||
147 | rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); | ||
148 | break; | ||
149 | } | ||
150 | /* The next offset is stored in the 1st word. 0 means no more */ | ||
151 | offset = *((unsigned short *)(ptr + offset)); | ||
152 | } | ||
153 | if (!rio_table_hdr){ | ||
154 | printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__); | ||
155 | return; | ||
156 | } | ||
157 | |||
158 | if (!build_detail_arrays()) | ||
159 | return; | ||
160 | |||
161 | /* The first Winnipeg we're looking for has an index of 0 */ | ||
162 | next_wpeg = 0; | ||
163 | do { | ||
164 | for(i = 0; i < rio_table_hdr->num_rio_dev; i++){ | ||
165 | if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){ | ||
166 | /* It's the Winnipeg we're looking for! */ | ||
167 | next_bus = setup_pci_node_map_for_wpeg(i, next_bus); | ||
168 | next_wpeg++; | ||
169 | break; | ||
170 | } | ||
171 | } | ||
172 | /* | ||
173 | * If we go through all Rio devices and don't find one with | ||
174 | * the next index, it means we've found all the Winnipegs, | ||
175 | * and thus all the PCI buses. | ||
176 | */ | ||
177 | if (i == rio_table_hdr->num_rio_dev) | ||
178 | next_wpeg = 0; | ||
179 | } while (next_wpeg != 0); | ||
180 | } | ||
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c new file mode 100644 index 000000000000..42147304de88 --- /dev/null +++ b/arch/x86/kernel/sys_i386_32.c | |||
@@ -0,0 +1,265 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/sys_i386.c | ||
3 | * | ||
4 | * This file contains various random system calls that | ||
5 | * have a non-standard calling sequence on the Linux/i386 | ||
6 | * platform. | ||
7 | */ | ||
8 | |||
9 | #include <linux/errno.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/mm.h> | ||
12 | #include <linux/fs.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/sem.h> | ||
15 | #include <linux/msg.h> | ||
16 | #include <linux/shm.h> | ||
17 | #include <linux/stat.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | #include <linux/mman.h> | ||
20 | #include <linux/file.h> | ||
21 | #include <linux/utsname.h> | ||
22 | |||
23 | #include <asm/uaccess.h> | ||
24 | #include <asm/unistd.h> | ||
25 | #include <asm/ipc.h> | ||
26 | |||
27 | /* | ||
28 | * sys_pipe() is the normal C calling standard for creating | ||
29 | * a pipe. It's not the way Unix traditionally does this, though. | ||
30 | */ | ||
31 | asmlinkage int sys_pipe(unsigned long __user * fildes) | ||
32 | { | ||
33 | int fd[2]; | ||
34 | int error; | ||
35 | |||
36 | error = do_pipe(fd); | ||
37 | if (!error) { | ||
38 | if (copy_to_user(fildes, fd, 2*sizeof(int))) | ||
39 | error = -EFAULT; | ||
40 | } | ||
41 | return error; | ||
42 | } | ||
43 | |||
44 | asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, | ||
45 | unsigned long prot, unsigned long flags, | ||
46 | unsigned long fd, unsigned long pgoff) | ||
47 | { | ||
48 | int error = -EBADF; | ||
49 | struct file *file = NULL; | ||
50 | struct mm_struct *mm = current->mm; | ||
51 | |||
52 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
53 | if (!(flags & MAP_ANONYMOUS)) { | ||
54 | file = fget(fd); | ||
55 | if (!file) | ||
56 | goto out; | ||
57 | } | ||
58 | |||
59 | down_write(&mm->mmap_sem); | ||
60 | error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
61 | up_write(&mm->mmap_sem); | ||
62 | |||
63 | if (file) | ||
64 | fput(file); | ||
65 | out: | ||
66 | return error; | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * Perform the select(nd, in, out, ex, tv) and mmap() system | ||
71 | * calls. Linux/i386 didn't use to be able to handle more than | ||
72 | * 4 system call parameters, so these system calls used a memory | ||
73 | * block for parameter passing.. | ||
74 | */ | ||
75 | |||
76 | struct mmap_arg_struct { | ||
77 | unsigned long addr; | ||
78 | unsigned long len; | ||
79 | unsigned long prot; | ||
80 | unsigned long flags; | ||
81 | unsigned long fd; | ||
82 | unsigned long offset; | ||
83 | }; | ||
84 | |||
85 | asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) | ||
86 | { | ||
87 | struct mmap_arg_struct a; | ||
88 | int err = -EFAULT; | ||
89 | |||
90 | if (copy_from_user(&a, arg, sizeof(a))) | ||
91 | goto out; | ||
92 | |||
93 | err = -EINVAL; | ||
94 | if (a.offset & ~PAGE_MASK) | ||
95 | goto out; | ||
96 | |||
97 | err = sys_mmap2(a.addr, a.len, a.prot, a.flags, | ||
98 | a.fd, a.offset >> PAGE_SHIFT); | ||
99 | out: | ||
100 | return err; | ||
101 | } | ||
102 | |||
103 | |||
104 | struct sel_arg_struct { | ||
105 | unsigned long n; | ||
106 | fd_set __user *inp, *outp, *exp; | ||
107 | struct timeval __user *tvp; | ||
108 | }; | ||
109 | |||
110 | asmlinkage int old_select(struct sel_arg_struct __user *arg) | ||
111 | { | ||
112 | struct sel_arg_struct a; | ||
113 | |||
114 | if (copy_from_user(&a, arg, sizeof(a))) | ||
115 | return -EFAULT; | ||
116 | /* sys_select() does the appropriate kernel locking */ | ||
117 | return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); | ||
118 | } | ||
119 | |||
120 | /* | ||
121 | * sys_ipc() is the de-multiplexer for the SysV IPC calls.. | ||
122 | * | ||
123 | * This is really horribly ugly. | ||
124 | */ | ||
125 | asmlinkage int sys_ipc (uint call, int first, int second, | ||
126 | int third, void __user *ptr, long fifth) | ||
127 | { | ||
128 | int version, ret; | ||
129 | |||
130 | version = call >> 16; /* hack for backward compatibility */ | ||
131 | call &= 0xffff; | ||
132 | |||
133 | switch (call) { | ||
134 | case SEMOP: | ||
135 | return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); | ||
136 | case SEMTIMEDOP: | ||
137 | return sys_semtimedop(first, (struct sembuf __user *)ptr, second, | ||
138 | (const struct timespec __user *)fifth); | ||
139 | |||
140 | case SEMGET: | ||
141 | return sys_semget (first, second, third); | ||
142 | case SEMCTL: { | ||
143 | union semun fourth; | ||
144 | if (!ptr) | ||
145 | return -EINVAL; | ||
146 | if (get_user(fourth.__pad, (void __user * __user *) ptr)) | ||
147 | return -EFAULT; | ||
148 | return sys_semctl (first, second, third, fourth); | ||
149 | } | ||
150 | |||
151 | case MSGSND: | ||
152 | return sys_msgsnd (first, (struct msgbuf __user *) ptr, | ||
153 | second, third); | ||
154 | case MSGRCV: | ||
155 | switch (version) { | ||
156 | case 0: { | ||
157 | struct ipc_kludge tmp; | ||
158 | if (!ptr) | ||
159 | return -EINVAL; | ||
160 | |||
161 | if (copy_from_user(&tmp, | ||
162 | (struct ipc_kludge __user *) ptr, | ||
163 | sizeof (tmp))) | ||
164 | return -EFAULT; | ||
165 | return sys_msgrcv (first, tmp.msgp, second, | ||
166 | tmp.msgtyp, third); | ||
167 | } | ||
168 | default: | ||
169 | return sys_msgrcv (first, | ||
170 | (struct msgbuf __user *) ptr, | ||
171 | second, fifth, third); | ||
172 | } | ||
173 | case MSGGET: | ||
174 | return sys_msgget ((key_t) first, second); | ||
175 | case MSGCTL: | ||
176 | return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); | ||
177 | |||
178 | case SHMAT: | ||
179 | switch (version) { | ||
180 | default: { | ||
181 | ulong raddr; | ||
182 | ret = do_shmat (first, (char __user *) ptr, second, &raddr); | ||
183 | if (ret) | ||
184 | return ret; | ||
185 | return put_user (raddr, (ulong __user *) third); | ||
186 | } | ||
187 | case 1: /* iBCS2 emulator entry point */ | ||
188 | if (!segment_eq(get_fs(), get_ds())) | ||
189 | return -EINVAL; | ||
190 | /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ | ||
191 | return do_shmat (first, (char __user *) ptr, second, (ulong *) third); | ||
192 | } | ||
193 | case SHMDT: | ||
194 | return sys_shmdt ((char __user *)ptr); | ||
195 | case SHMGET: | ||
196 | return sys_shmget (first, second, third); | ||
197 | case SHMCTL: | ||
198 | return sys_shmctl (first, second, | ||
199 | (struct shmid_ds __user *) ptr); | ||
200 | default: | ||
201 | return -ENOSYS; | ||
202 | } | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * Old cruft | ||
207 | */ | ||
208 | asmlinkage int sys_uname(struct old_utsname __user * name) | ||
209 | { | ||
210 | int err; | ||
211 | if (!name) | ||
212 | return -EFAULT; | ||
213 | down_read(&uts_sem); | ||
214 | err = copy_to_user(name, utsname(), sizeof (*name)); | ||
215 | up_read(&uts_sem); | ||
216 | return err?-EFAULT:0; | ||
217 | } | ||
218 | |||
219 | asmlinkage int sys_olduname(struct oldold_utsname __user * name) | ||
220 | { | ||
221 | int error; | ||
222 | |||
223 | if (!name) | ||
224 | return -EFAULT; | ||
225 | if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) | ||
226 | return -EFAULT; | ||
227 | |||
228 | down_read(&uts_sem); | ||
229 | |||
230 | error = __copy_to_user(&name->sysname, &utsname()->sysname, | ||
231 | __OLD_UTS_LEN); | ||
232 | error |= __put_user(0, name->sysname + __OLD_UTS_LEN); | ||
233 | error |= __copy_to_user(&name->nodename, &utsname()->nodename, | ||
234 | __OLD_UTS_LEN); | ||
235 | error |= __put_user(0, name->nodename + __OLD_UTS_LEN); | ||
236 | error |= __copy_to_user(&name->release, &utsname()->release, | ||
237 | __OLD_UTS_LEN); | ||
238 | error |= __put_user(0, name->release + __OLD_UTS_LEN); | ||
239 | error |= __copy_to_user(&name->version, &utsname()->version, | ||
240 | __OLD_UTS_LEN); | ||
241 | error |= __put_user(0, name->version + __OLD_UTS_LEN); | ||
242 | error |= __copy_to_user(&name->machine, &utsname()->machine, | ||
243 | __OLD_UTS_LEN); | ||
244 | error |= __put_user(0, name->machine + __OLD_UTS_LEN); | ||
245 | |||
246 | up_read(&uts_sem); | ||
247 | |||
248 | error = error ? -EFAULT : 0; | ||
249 | |||
250 | return error; | ||
251 | } | ||
252 | |||
253 | |||
254 | /* | ||
255 | * Do a system call from kernel instead of calling sys_execve so we | ||
256 | * end up with proper pt_regs. | ||
257 | */ | ||
258 | int kernel_execve(const char *filename, char *const argv[], char *const envp[]) | ||
259 | { | ||
260 | long __res; | ||
261 | asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" | ||
262 | : "=a" (__res) | ||
263 | : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory"); | ||
264 | return __res; | ||
265 | } | ||
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S new file mode 100644 index 000000000000..8344c70adf61 --- /dev/null +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -0,0 +1,326 @@ | |||
1 | ENTRY(sys_call_table) | ||
2 | .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ | ||
3 | .long sys_exit | ||
4 | .long sys_fork | ||
5 | .long sys_read | ||
6 | .long sys_write | ||
7 | .long sys_open /* 5 */ | ||
8 | .long sys_close | ||
9 | .long sys_waitpid | ||
10 | .long sys_creat | ||
11 | .long sys_link | ||
12 | .long sys_unlink /* 10 */ | ||
13 | .long sys_execve | ||
14 | .long sys_chdir | ||
15 | .long sys_time | ||
16 | .long sys_mknod | ||
17 | .long sys_chmod /* 15 */ | ||
18 | .long sys_lchown16 | ||
19 | .long sys_ni_syscall /* old break syscall holder */ | ||
20 | .long sys_stat | ||
21 | .long sys_lseek | ||
22 | .long sys_getpid /* 20 */ | ||
23 | .long sys_mount | ||
24 | .long sys_oldumount | ||
25 | .long sys_setuid16 | ||
26 | .long sys_getuid16 | ||
27 | .long sys_stime /* 25 */ | ||
28 | .long sys_ptrace | ||
29 | .long sys_alarm | ||
30 | .long sys_fstat | ||
31 | .long sys_pause | ||
32 | .long sys_utime /* 30 */ | ||
33 | .long sys_ni_syscall /* old stty syscall holder */ | ||
34 | .long sys_ni_syscall /* old gtty syscall holder */ | ||
35 | .long sys_access | ||
36 | .long sys_nice | ||
37 | .long sys_ni_syscall /* 35 - old ftime syscall holder */ | ||
38 | .long sys_sync | ||
39 | .long sys_kill | ||
40 | .long sys_rename | ||
41 | .long sys_mkdir | ||
42 | .long sys_rmdir /* 40 */ | ||
43 | .long sys_dup | ||
44 | .long sys_pipe | ||
45 | .long sys_times | ||
46 | .long sys_ni_syscall /* old prof syscall holder */ | ||
47 | .long sys_brk /* 45 */ | ||
48 | .long sys_setgid16 | ||
49 | .long sys_getgid16 | ||
50 | .long sys_signal | ||
51 | .long sys_geteuid16 | ||
52 | .long sys_getegid16 /* 50 */ | ||
53 | .long sys_acct | ||
54 | .long sys_umount /* recycled never used phys() */ | ||
55 | .long sys_ni_syscall /* old lock syscall holder */ | ||
56 | .long sys_ioctl | ||
57 | .long sys_fcntl /* 55 */ | ||
58 | .long sys_ni_syscall /* old mpx syscall holder */ | ||
59 | .long sys_setpgid | ||
60 | .long sys_ni_syscall /* old ulimit syscall holder */ | ||
61 | .long sys_olduname | ||
62 | .long sys_umask /* 60 */ | ||
63 | .long sys_chroot | ||
64 | .long sys_ustat | ||
65 | .long sys_dup2 | ||
66 | .long sys_getppid | ||
67 | .long sys_getpgrp /* 65 */ | ||
68 | .long sys_setsid | ||
69 | .long sys_sigaction | ||
70 | .long sys_sgetmask | ||
71 | .long sys_ssetmask | ||
72 | .long sys_setreuid16 /* 70 */ | ||
73 | .long sys_setregid16 | ||
74 | .long sys_sigsuspend | ||
75 | .long sys_sigpending | ||
76 | .long sys_sethostname | ||
77 | .long sys_setrlimit /* 75 */ | ||
78 | .long sys_old_getrlimit | ||
79 | .long sys_getrusage | ||
80 | .long sys_gettimeofday | ||
81 | .long sys_settimeofday | ||
82 | .long sys_getgroups16 /* 80 */ | ||
83 | .long sys_setgroups16 | ||
84 | .long old_select | ||
85 | .long sys_symlink | ||
86 | .long sys_lstat | ||
87 | .long sys_readlink /* 85 */ | ||
88 | .long sys_uselib | ||
89 | .long sys_swapon | ||
90 | .long sys_reboot | ||
91 | .long old_readdir | ||
92 | .long old_mmap /* 90 */ | ||
93 | .long sys_munmap | ||
94 | .long sys_truncate | ||
95 | .long sys_ftruncate | ||
96 | .long sys_fchmod | ||
97 | .long sys_fchown16 /* 95 */ | ||
98 | .long sys_getpriority | ||
99 | .long sys_setpriority | ||
100 | .long sys_ni_syscall /* old profil syscall holder */ | ||
101 | .long sys_statfs | ||
102 | .long sys_fstatfs /* 100 */ | ||
103 | .long sys_ioperm | ||
104 | .long sys_socketcall | ||
105 | .long sys_syslog | ||
106 | .long sys_setitimer | ||
107 | .long sys_getitimer /* 105 */ | ||
108 | .long sys_newstat | ||
109 | .long sys_newlstat | ||
110 | .long sys_newfstat | ||
111 | .long sys_uname | ||
112 | .long sys_iopl /* 110 */ | ||
113 | .long sys_vhangup | ||
114 | .long sys_ni_syscall /* old "idle" system call */ | ||
115 | .long sys_vm86old | ||
116 | .long sys_wait4 | ||
117 | .long sys_swapoff /* 115 */ | ||
118 | .long sys_sysinfo | ||
119 | .long sys_ipc | ||
120 | .long sys_fsync | ||
121 | .long sys_sigreturn | ||
122 | .long sys_clone /* 120 */ | ||
123 | .long sys_setdomainname | ||
124 | .long sys_newuname | ||
125 | .long sys_modify_ldt | ||
126 | .long sys_adjtimex | ||
127 | .long sys_mprotect /* 125 */ | ||
128 | .long sys_sigprocmask | ||
129 | .long sys_ni_syscall /* old "create_module" */ | ||
130 | .long sys_init_module | ||
131 | .long sys_delete_module | ||
132 | .long sys_ni_syscall /* 130: old "get_kernel_syms" */ | ||
133 | .long sys_quotactl | ||
134 | .long sys_getpgid | ||
135 | .long sys_fchdir | ||
136 | .long sys_bdflush | ||
137 | .long sys_sysfs /* 135 */ | ||
138 | .long sys_personality | ||
139 | .long sys_ni_syscall /* reserved for afs_syscall */ | ||
140 | .long sys_setfsuid16 | ||
141 | .long sys_setfsgid16 | ||
142 | .long sys_llseek /* 140 */ | ||
143 | .long sys_getdents | ||
144 | .long sys_select | ||
145 | .long sys_flock | ||
146 | .long sys_msync | ||
147 | .long sys_readv /* 145 */ | ||
148 | .long sys_writev | ||
149 | .long sys_getsid | ||
150 | .long sys_fdatasync | ||
151 | .long sys_sysctl | ||
152 | .long sys_mlock /* 150 */ | ||
153 | .long sys_munlock | ||
154 | .long sys_mlockall | ||
155 | .long sys_munlockall | ||
156 | .long sys_sched_setparam | ||
157 | .long sys_sched_getparam /* 155 */ | ||
158 | .long sys_sched_setscheduler | ||
159 | .long sys_sched_getscheduler | ||
160 | .long sys_sched_yield | ||
161 | .long sys_sched_get_priority_max | ||
162 | .long sys_sched_get_priority_min /* 160 */ | ||
163 | .long sys_sched_rr_get_interval | ||
164 | .long sys_nanosleep | ||
165 | .long sys_mremap | ||
166 | .long sys_setresuid16 | ||
167 | .long sys_getresuid16 /* 165 */ | ||
168 | .long sys_vm86 | ||
169 | .long sys_ni_syscall /* Old sys_query_module */ | ||
170 | .long sys_poll | ||
171 | .long sys_nfsservctl | ||
172 | .long sys_setresgid16 /* 170 */ | ||
173 | .long sys_getresgid16 | ||
174 | .long sys_prctl | ||
175 | .long sys_rt_sigreturn | ||
176 | .long sys_rt_sigaction | ||
177 | .long sys_rt_sigprocmask /* 175 */ | ||
178 | .long sys_rt_sigpending | ||
179 | .long sys_rt_sigtimedwait | ||
180 | .long sys_rt_sigqueueinfo | ||
181 | .long sys_rt_sigsuspend | ||
182 | .long sys_pread64 /* 180 */ | ||
183 | .long sys_pwrite64 | ||
184 | .long sys_chown16 | ||
185 | .long sys_getcwd | ||
186 | .long sys_capget | ||
187 | .long sys_capset /* 185 */ | ||
188 | .long sys_sigaltstack | ||
189 | .long sys_sendfile | ||
190 | .long sys_ni_syscall /* reserved for streams1 */ | ||
191 | .long sys_ni_syscall /* reserved for streams2 */ | ||
192 | .long sys_vfork /* 190 */ | ||
193 | .long sys_getrlimit | ||
194 | .long sys_mmap2 | ||
195 | .long sys_truncate64 | ||
196 | .long sys_ftruncate64 | ||
197 | .long sys_stat64 /* 195 */ | ||
198 | .long sys_lstat64 | ||
199 | .long sys_fstat64 | ||
200 | .long sys_lchown | ||
201 | .long sys_getuid | ||
202 | .long sys_getgid /* 200 */ | ||
203 | .long sys_geteuid | ||
204 | .long sys_getegid | ||
205 | .long sys_setreuid | ||
206 | .long sys_setregid | ||
207 | .long sys_getgroups /* 205 */ | ||
208 | .long sys_setgroups | ||
209 | .long sys_fchown | ||
210 | .long sys_setresuid | ||
211 | .long sys_getresuid | ||
212 | .long sys_setresgid /* 210 */ | ||
213 | .long sys_getresgid | ||
214 | .long sys_chown | ||
215 | .long sys_setuid | ||
216 | .long sys_setgid | ||
217 | .long sys_setfsuid /* 215 */ | ||
218 | .long sys_setfsgid | ||
219 | .long sys_pivot_root | ||
220 | .long sys_mincore | ||
221 | .long sys_madvise | ||
222 | .long sys_getdents64 /* 220 */ | ||
223 | .long sys_fcntl64 | ||
224 | .long sys_ni_syscall /* reserved for TUX */ | ||
225 | .long sys_ni_syscall | ||
226 | .long sys_gettid | ||
227 | .long sys_readahead /* 225 */ | ||
228 | .long sys_setxattr | ||
229 | .long sys_lsetxattr | ||
230 | .long sys_fsetxattr | ||
231 | .long sys_getxattr | ||
232 | .long sys_lgetxattr /* 230 */ | ||
233 | .long sys_fgetxattr | ||
234 | .long sys_listxattr | ||
235 | .long sys_llistxattr | ||
236 | .long sys_flistxattr | ||
237 | .long sys_removexattr /* 235 */ | ||
238 | .long sys_lremovexattr | ||
239 | .long sys_fremovexattr | ||
240 | .long sys_tkill | ||
241 | .long sys_sendfile64 | ||
242 | .long sys_futex /* 240 */ | ||
243 | .long sys_sched_setaffinity | ||
244 | .long sys_sched_getaffinity | ||
245 | .long sys_set_thread_area | ||
246 | .long sys_get_thread_area | ||
247 | .long sys_io_setup /* 245 */ | ||
248 | .long sys_io_destroy | ||
249 | .long sys_io_getevents | ||
250 | .long sys_io_submit | ||
251 | .long sys_io_cancel | ||
252 | .long sys_fadvise64 /* 250 */ | ||
253 | .long sys_ni_syscall | ||
254 | .long sys_exit_group | ||
255 | .long sys_lookup_dcookie | ||
256 | .long sys_epoll_create | ||
257 | .long sys_epoll_ctl /* 255 */ | ||
258 | .long sys_epoll_wait | ||
259 | .long sys_remap_file_pages | ||
260 | .long sys_set_tid_address | ||
261 | .long sys_timer_create | ||
262 | .long sys_timer_settime /* 260 */ | ||
263 | .long sys_timer_gettime | ||
264 | .long sys_timer_getoverrun | ||
265 | .long sys_timer_delete | ||
266 | .long sys_clock_settime | ||
267 | .long sys_clock_gettime /* 265 */ | ||
268 | .long sys_clock_getres | ||
269 | .long sys_clock_nanosleep | ||
270 | .long sys_statfs64 | ||
271 | .long sys_fstatfs64 | ||
272 | .long sys_tgkill /* 270 */ | ||
273 | .long sys_utimes | ||
274 | .long sys_fadvise64_64 | ||
275 | .long sys_ni_syscall /* sys_vserver */ | ||
276 | .long sys_mbind | ||
277 | .long sys_get_mempolicy | ||
278 | .long sys_set_mempolicy | ||
279 | .long sys_mq_open | ||
280 | .long sys_mq_unlink | ||
281 | .long sys_mq_timedsend | ||
282 | .long sys_mq_timedreceive /* 280 */ | ||
283 | .long sys_mq_notify | ||
284 | .long sys_mq_getsetattr | ||
285 | .long sys_kexec_load | ||
286 | .long sys_waitid | ||
287 | .long sys_ni_syscall /* 285 */ /* available */ | ||
288 | .long sys_add_key | ||
289 | .long sys_request_key | ||
290 | .long sys_keyctl | ||
291 | .long sys_ioprio_set | ||
292 | .long sys_ioprio_get /* 290 */ | ||
293 | .long sys_inotify_init | ||
294 | .long sys_inotify_add_watch | ||
295 | .long sys_inotify_rm_watch | ||
296 | .long sys_migrate_pages | ||
297 | .long sys_openat /* 295 */ | ||
298 | .long sys_mkdirat | ||
299 | .long sys_mknodat | ||
300 | .long sys_fchownat | ||
301 | .long sys_futimesat | ||
302 | .long sys_fstatat64 /* 300 */ | ||
303 | .long sys_unlinkat | ||
304 | .long sys_renameat | ||
305 | .long sys_linkat | ||
306 | .long sys_symlinkat | ||
307 | .long sys_readlinkat /* 305 */ | ||
308 | .long sys_fchmodat | ||
309 | .long sys_faccessat | ||
310 | .long sys_pselect6 | ||
311 | .long sys_ppoll | ||
312 | .long sys_unshare /* 310 */ | ||
313 | .long sys_set_robust_list | ||
314 | .long sys_get_robust_list | ||
315 | .long sys_splice | ||
316 | .long sys_sync_file_range | ||
317 | .long sys_tee /* 315 */ | ||
318 | .long sys_vmsplice | ||
319 | .long sys_move_pages | ||
320 | .long sys_getcpu | ||
321 | .long sys_epoll_pwait | ||
322 | .long sys_utimensat /* 320 */ | ||
323 | .long sys_signalfd | ||
324 | .long sys_timerfd | ||
325 | .long sys_eventfd | ||
326 | .long sys_fallocate | ||
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c new file mode 100644 index 000000000000..4eb2e408764f --- /dev/null +++ b/arch/x86/kernel/sysenter_32.c | |||
@@ -0,0 +1,348 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/sysenter.c | ||
3 | * | ||
4 | * (C) Copyright 2002 Linus Torvalds | ||
5 | * Portions based on the vdso-randomization code from exec-shield: | ||
6 | * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar | ||
7 | * | ||
8 | * This file contains the needed initializations to support sysenter. | ||
9 | */ | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/smp.h> | ||
13 | #include <linux/thread_info.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/gfp.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/elf.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/err.h> | ||
20 | #include <linux/module.h> | ||
21 | |||
22 | #include <asm/cpufeature.h> | ||
23 | #include <asm/msr.h> | ||
24 | #include <asm/pgtable.h> | ||
25 | #include <asm/unistd.h> | ||
26 | #include <asm/elf.h> | ||
27 | #include <asm/tlbflush.h> | ||
28 | |||
/*
 * Values of vdso_enabled.  The "vdso=" boot option stores its numeric
 * argument in vdso_enabled without range checking, so these numbers are
 * what the option is compared against.
 */
enum {
	VDSO_DISABLED = 0,	/* NOTE(review): not tested in this file - presumably honoured by the ELF loader; confirm */
	VDSO_ENABLED = 1,
	VDSO_COMPAT = 2,	/* vDSO is placed at the fixed address VDSO_HIGH_BASE */
};

/* Built-in default: compat placement only when CONFIG_COMPAT_VDSO is set. */
#ifdef CONFIG_COMPAT_VDSO
#define VDSO_DEFAULT	VDSO_COMPAT
#else
#define VDSO_DEFAULT	VDSO_ENABLED
#endif

/*
 * Should the kernel map a VDSO page into processes and pass its
 * address down to glibc upon exec()?
 */
unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;

EXPORT_SYMBOL_GPL(vdso_enabled);
48 | |||
49 | static int __init vdso_setup(char *s) | ||
50 | { | ||
51 | vdso_enabled = simple_strtoul(s, NULL, 0); | ||
52 | |||
53 | return 1; | ||
54 | } | ||
55 | |||
56 | __setup("vdso=", vdso_setup); | ||
57 | |||
58 | extern asmlinkage void sysenter_entry(void); | ||
59 | |||
60 | static __init void reloc_symtab(Elf32_Ehdr *ehdr, | ||
61 | unsigned offset, unsigned size) | ||
62 | { | ||
63 | Elf32_Sym *sym = (void *)ehdr + offset; | ||
64 | unsigned nsym = size / sizeof(*sym); | ||
65 | unsigned i; | ||
66 | |||
67 | for(i = 0; i < nsym; i++, sym++) { | ||
68 | if (sym->st_shndx == SHN_UNDEF || | ||
69 | sym->st_shndx == SHN_ABS) | ||
70 | continue; /* skip */ | ||
71 | |||
72 | if (sym->st_shndx > SHN_LORESERVE) { | ||
73 | printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", | ||
74 | sym->st_shndx); | ||
75 | continue; | ||
76 | } | ||
77 | |||
78 | switch(ELF_ST_TYPE(sym->st_info)) { | ||
79 | case STT_OBJECT: | ||
80 | case STT_FUNC: | ||
81 | case STT_SECTION: | ||
82 | case STT_FILE: | ||
83 | sym->st_value += VDSO_HIGH_BASE; | ||
84 | } | ||
85 | } | ||
86 | } | ||
87 | |||
/*
 * Rebase the pointer-valued entries of the vDSO's dynamic section.
 *
 * @ehdr:   start of the in-memory ELF image
 * @offset: byte offset of the dynamic section from @ehdr
 *
 * Walks the entries until the DT_NULL terminator.  Entries known to hold
 * addresses get VDSO_HIGH_BASE added; entries known to hold plain values
 * are left untouched; tags above DT_ENCODING that match none of the
 * handled cases are reported via printk.
 */
static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
{
	Elf32_Dyn *dyn = (void *)ehdr + offset;

	for(; dyn->d_tag != DT_NULL; dyn++)
		switch(dyn->d_tag) {
		case DT_PLTGOT:
		case DT_HASH:
		case DT_STRTAB:
		case DT_SYMTAB:
		case DT_RELA:
		case DT_INIT:
		case DT_FINI:
		case DT_REL:
		case DT_DEBUG:
		case DT_JMPREL:
		case DT_VERSYM:
		case DT_VERDEF:
		case DT_VERNEED:
		case DT_ADDRRNGLO ... DT_ADDRRNGHI:
			/* definitely pointers needing relocation */
			dyn->d_un.d_ptr += VDSO_HIGH_BASE;
			break;

		case DT_ENCODING ... OLD_DT_LOOS-1:
		case DT_LOOS ... DT_HIOS-1:
			/* Tags above DT_ENCODING are pointers if
			   they're even */
			if (dyn->d_tag >= DT_ENCODING &&
			    (dyn->d_tag & 1) == 0)
				dyn->d_un.d_ptr += VDSO_HIGH_BASE;
			break;

		case DT_VERDEFNUM:
		case DT_VERNEEDNUM:
		case DT_FLAGS_1:
		case DT_RELACOUNT:
		case DT_RELCOUNT:
		case DT_VALRNGLO ... DT_VALRNGHI:
			/* definitely not pointers */
			break;

		/* Everything else: leave the value alone, warn if the tag is above DT_ENCODING. */
		case OLD_DT_LOOS ... DT_LOOS-1:
		case DT_HIOS ... DT_VALRNGLO-1:
		default:
			if (dyn->d_tag > DT_ENCODING)
				printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
				       dyn->d_tag);
			break;
		}
}
139 | |||
140 | static __init void relocate_vdso(Elf32_Ehdr *ehdr) | ||
141 | { | ||
142 | Elf32_Phdr *phdr; | ||
143 | Elf32_Shdr *shdr; | ||
144 | int i; | ||
145 | |||
146 | BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || | ||
147 | !elf_check_arch(ehdr) || | ||
148 | ehdr->e_type != ET_DYN); | ||
149 | |||
150 | ehdr->e_entry += VDSO_HIGH_BASE; | ||
151 | |||
152 | /* rebase phdrs */ | ||
153 | phdr = (void *)ehdr + ehdr->e_phoff; | ||
154 | for (i = 0; i < ehdr->e_phnum; i++) { | ||
155 | phdr[i].p_vaddr += VDSO_HIGH_BASE; | ||
156 | |||
157 | /* relocate dynamic stuff */ | ||
158 | if (phdr[i].p_type == PT_DYNAMIC) | ||
159 | reloc_dyn(ehdr, phdr[i].p_offset); | ||
160 | } | ||
161 | |||
162 | /* rebase sections */ | ||
163 | shdr = (void *)ehdr + ehdr->e_shoff; | ||
164 | for(i = 0; i < ehdr->e_shnum; i++) { | ||
165 | if (!(shdr[i].sh_flags & SHF_ALLOC)) | ||
166 | continue; | ||
167 | |||
168 | shdr[i].sh_addr += VDSO_HIGH_BASE; | ||
169 | |||
170 | if (shdr[i].sh_type == SHT_SYMTAB || | ||
171 | shdr[i].sh_type == SHT_DYNSYM) | ||
172 | reloc_symtab(ehdr, shdr[i].sh_offset, | ||
173 | shdr[i].sh_size); | ||
174 | } | ||
175 | } | ||
176 | |||
177 | void enable_sep_cpu(void) | ||
178 | { | ||
179 | int cpu = get_cpu(); | ||
180 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
181 | |||
182 | if (!boot_cpu_has(X86_FEATURE_SEP)) { | ||
183 | put_cpu(); | ||
184 | return; | ||
185 | } | ||
186 | |||
187 | tss->x86_tss.ss1 = __KERNEL_CS; | ||
188 | tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; | ||
189 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | ||
190 | wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); | ||
191 | wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); | ||
192 | put_cpu(); | ||
193 | } | ||
194 | |||
195 | static struct vm_area_struct gate_vma; | ||
196 | |||
197 | static int __init gate_vma_init(void) | ||
198 | { | ||
199 | gate_vma.vm_mm = NULL; | ||
200 | gate_vma.vm_start = FIXADDR_USER_START; | ||
201 | gate_vma.vm_end = FIXADDR_USER_END; | ||
202 | gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; | ||
203 | gate_vma.vm_page_prot = __P101; | ||
204 | /* | ||
205 | * Make sure the vDSO gets into every core dump. | ||
206 | * Dumping its contents makes post-mortem fully interpretable later | ||
207 | * without matching up the same kernel and hardware config to see | ||
208 | * what PC values meant. | ||
209 | */ | ||
210 | gate_vma.vm_flags |= VM_ALWAYSDUMP; | ||
211 | return 0; | ||
212 | } | ||
213 | |||
/*
 * These symbols are defined by vsyscall.o to mark the bounds
 * of the ELF DSO images included therein.
 */
extern const char vsyscall_int80_start, vsyscall_int80_end;
extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
/* The single page that holds the relocated copy of the selected image. */
static struct page *syscall_pages[1];
221 | |||
222 | static void map_compat_vdso(int map) | ||
223 | { | ||
224 | static int vdso_mapped; | ||
225 | |||
226 | if (map == vdso_mapped) | ||
227 | return; | ||
228 | |||
229 | vdso_mapped = map; | ||
230 | |||
231 | __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, | ||
232 | map ? PAGE_READONLY_EXEC : PAGE_NONE); | ||
233 | |||
234 | /* flush stray tlbs */ | ||
235 | flush_tlb_all(); | ||
236 | } | ||
237 | |||
238 | int __init sysenter_setup(void) | ||
239 | { | ||
240 | void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); | ||
241 | const void *vsyscall; | ||
242 | size_t vsyscall_len; | ||
243 | |||
244 | syscall_pages[0] = virt_to_page(syscall_page); | ||
245 | |||
246 | gate_vma_init(); | ||
247 | |||
248 | printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); | ||
249 | |||
250 | if (!boot_cpu_has(X86_FEATURE_SEP)) { | ||
251 | vsyscall = &vsyscall_int80_start; | ||
252 | vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; | ||
253 | } else { | ||
254 | vsyscall = &vsyscall_sysenter_start; | ||
255 | vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; | ||
256 | } | ||
257 | |||
258 | memcpy(syscall_page, vsyscall, vsyscall_len); | ||
259 | relocate_vdso(syscall_page); | ||
260 | |||
261 | return 0; | ||
262 | } | ||
263 | |||
264 | /* Defined in vsyscall-sysenter.S */ | ||
265 | extern void SYSENTER_RETURN; | ||
266 | |||
267 | /* Setup a VMA at program startup for the vsyscall page */ | ||
268 | int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) | ||
269 | { | ||
270 | struct mm_struct *mm = current->mm; | ||
271 | unsigned long addr; | ||
272 | int ret = 0; | ||
273 | bool compat; | ||
274 | |||
275 | down_write(&mm->mmap_sem); | ||
276 | |||
277 | /* Test compat mode once here, in case someone | ||
278 | changes it via sysctl */ | ||
279 | compat = (vdso_enabled == VDSO_COMPAT); | ||
280 | |||
281 | map_compat_vdso(compat); | ||
282 | |||
283 | if (compat) | ||
284 | addr = VDSO_HIGH_BASE; | ||
285 | else { | ||
286 | addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); | ||
287 | if (IS_ERR_VALUE(addr)) { | ||
288 | ret = addr; | ||
289 | goto up_fail; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * MAYWRITE to allow gdb to COW and set breakpoints | ||
294 | * | ||
295 | * Make sure the vDSO gets into every core dump. | ||
296 | * Dumping its contents makes post-mortem fully | ||
297 | * interpretable later without matching up the same | ||
298 | * kernel and hardware config to see what PC values | ||
299 | * meant. | ||
300 | */ | ||
301 | ret = install_special_mapping(mm, addr, PAGE_SIZE, | ||
302 | VM_READ|VM_EXEC| | ||
303 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| | ||
304 | VM_ALWAYSDUMP, | ||
305 | syscall_pages); | ||
306 | |||
307 | if (ret) | ||
308 | goto up_fail; | ||
309 | } | ||
310 | |||
311 | current->mm->context.vdso = (void *)addr; | ||
312 | current_thread_info()->sysenter_return = | ||
313 | (void *)VDSO_SYM(&SYSENTER_RETURN); | ||
314 | |||
315 | up_fail: | ||
316 | up_write(&mm->mmap_sem); | ||
317 | |||
318 | return ret; | ||
319 | } | ||
320 | |||
321 | const char *arch_vma_name(struct vm_area_struct *vma) | ||
322 | { | ||
323 | if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) | ||
324 | return "[vdso]"; | ||
325 | return NULL; | ||
326 | } | ||
327 | |||
328 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | ||
329 | { | ||
330 | struct mm_struct *mm = tsk->mm; | ||
331 | |||
332 | /* Check to see if this task was created in compat vdso mode */ | ||
333 | if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) | ||
334 | return &gate_vma; | ||
335 | return NULL; | ||
336 | } | ||
337 | |||
338 | int in_gate_area(struct task_struct *task, unsigned long addr) | ||
339 | { | ||
340 | const struct vm_area_struct *vma = get_gate_vma(task); | ||
341 | |||
342 | return vma && addr >= vma->vm_start && addr < vma->vm_end; | ||
343 | } | ||
344 | |||
/*
 * Task-less variant of the gate-area test: always returns 0, i.e. no
 * address is reported as being in the gate area, whatever @addr is.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return 0;
}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c new file mode 100644 index 000000000000..19a6c678d02e --- /dev/null +++ b/arch/x86/kernel/time_32.c | |||
@@ -0,0 +1,236 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/kernel/time.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992, 1995 Linus Torvalds | ||
5 | * | ||
6 | * This file contains the PC-specific time handling details: | ||
7 | * reading the RTC at bootup, etc.. | ||
8 | * 1994-07-02 Alan Modra | ||
9 | * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime | ||
10 | * 1995-03-26 Markus Kuhn | ||
11 | * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 | ||
12 | * precision CMOS clock update | ||
13 | * 1996-05-03 Ingo Molnar | ||
14 | * fixed time warps in do_[slow|fast]_gettimeoffset() | ||
15 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 | ||
16 | * "A Kernel Model for Precision Timekeeping" by Dave Mills | ||
17 | * 1998-09-05 (Various) | ||
18 | * More robust do_fast_gettimeoffset() algorithm implemented | ||
19 | * (works with APM, Cyrix 6x86MX and Centaur C6), | ||
20 | * monotonic gettimeofday() with fast_get_timeoffset(), | ||
21 | * drift-proof precision TSC calibration on boot | ||
22 | * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. | ||
23 | * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; | ||
24 | * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). | ||
25 | * 1998-12-16 Andrea Arcangeli | ||
26 | * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy | ||
27 | * because was not accounting lost_ticks. | ||
28 | * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli | ||
29 | * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to | ||
30 | * serialize accesses to xtime/lost_ticks). | ||
31 | */ | ||
32 | |||
33 | #include <linux/errno.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/param.h> | ||
37 | #include <linux/string.h> | ||
38 | #include <linux/mm.h> | ||
39 | #include <linux/interrupt.h> | ||
40 | #include <linux/time.h> | ||
41 | #include <linux/delay.h> | ||
42 | #include <linux/init.h> | ||
43 | #include <linux/smp.h> | ||
44 | #include <linux/module.h> | ||
45 | #include <linux/sysdev.h> | ||
46 | #include <linux/bcd.h> | ||
47 | #include <linux/efi.h> | ||
48 | #include <linux/mca.h> | ||
49 | |||
50 | #include <asm/io.h> | ||
51 | #include <asm/smp.h> | ||
52 | #include <asm/irq.h> | ||
53 | #include <asm/msr.h> | ||
54 | #include <asm/delay.h> | ||
55 | #include <asm/mpspec.h> | ||
56 | #include <asm/uaccess.h> | ||
57 | #include <asm/processor.h> | ||
58 | #include <asm/timer.h> | ||
59 | #include <asm/time.h> | ||
60 | |||
61 | #include "mach_time.h" | ||
62 | |||
63 | #include <linux/timex.h> | ||
64 | |||
65 | #include <asm/hpet.h> | ||
66 | |||
67 | #include <asm/arch_hooks.h> | ||
68 | |||
69 | #include "io_ports.h" | ||
70 | |||
71 | #include <asm/i8259.h> | ||
72 | |||
73 | #include "do_timer.h" | ||
74 | |||
/* Detected as we calibrate the TSC */
unsigned int cpu_khz;
EXPORT_SYMBOL(cpu_khz);

/* Held around the get_wallclock()/set_wallclock() calls in this file. */
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);

/*
 * This is a special lock that is owned by the CPU and holds the index
 * register we are working with.  It is required for NMI access to the
 * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
 *
 * As an object with static storage duration it starts out as zero
 * without needing an explicit initialiser.
 */
volatile unsigned long cmos_lock;
EXPORT_SYMBOL(cmos_lock);
88 | |||
/* Routines for accessing the CMOS RAM/RTC. */

/*
 * Read one byte from CMOS/RTC register @addr.
 *
 * The register number is written to RTC_PORT(0) and the value is then
 * read back from RTC_PORT(1).  The whole access is bracketed by
 * lock_cmos_prefix()/lock_cmos_suffix().
 */
unsigned char rtc_cmos_read(unsigned char addr)
{
	unsigned char data;

	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));
	data = inb_p(RTC_PORT(1));
	lock_cmos_suffix(addr);

	return data;
}
EXPORT_SYMBOL(rtc_cmos_read);
100 | |||
/*
 * Write @val to CMOS/RTC register @addr.
 *
 * The register number is written to RTC_PORT(0), then the value to
 * RTC_PORT(1), with the access bracketed by
 * lock_cmos_prefix()/lock_cmos_suffix().
 */
void rtc_cmos_write(unsigned char val, unsigned char addr)
{
	lock_cmos_prefix(addr);

	outb_p(addr, RTC_PORT(0));
	outb_p(val, RTC_PORT(1));

	lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);
109 | |||
110 | static int set_rtc_mmss(unsigned long nowtime) | ||
111 | { | ||
112 | int retval; | ||
113 | unsigned long flags; | ||
114 | |||
115 | /* gets recalled with irq locally disabled */ | ||
116 | /* XXX - does irqsave resolve this? -johnstul */ | ||
117 | spin_lock_irqsave(&rtc_lock, flags); | ||
118 | retval = set_wallclock(nowtime); | ||
119 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
120 | |||
121 | return retval; | ||
122 | } | ||
123 | |||
124 | |||
125 | int timer_ack; | ||
126 | |||
/*
 * Return the program counter to attribute a profiling sample to.
 *
 * Normally this is simply the interrupted instruction pointer.  On SMP,
 * when the sample landed in kernel code inside one of the lock
 * functions, the caller's address is reported instead, so that the time
 * is charged to the code that took the lock.
 */
unsigned long profile_pc(struct pt_regs *regs)
{
	unsigned long pc = instruction_pointer(regs);

#ifdef CONFIG_SMP
	if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
	    in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
		/* With frame pointers the return address sits above the saved ebp. */
		return *(unsigned long *)(regs->ebp + 4);
#else
		/*
		 * Treat the address of the esp slot in the saved register
		 * frame as the interrupted stack pointer.
		 */
		unsigned long *sp = (unsigned long *)&regs->esp;

		/* Return address is either directly at stack pointer
		   or above a saved eflags. Eflags has bits 22-31 zero,
		   kernel addresses don't. */
		if (sp[0] >> 22)
			return sp[0];
		if (sp[1] >> 22)
			return sp[1];
#endif
	}
#endif
	return pc;
}
EXPORT_SYMBOL(profile_pc);
152 | |||
153 | /* | ||
154 | * This is the same as the above, except we _also_ save the current | ||
155 | * Time Stamp Counter value at the time of the timer interrupt, so that | ||
156 | * we later on can estimate the time of day more exactly. | ||
157 | */ | ||
158 | irqreturn_t timer_interrupt(int irq, void *dev_id) | ||
159 | { | ||
160 | #ifdef CONFIG_X86_IO_APIC | ||
161 | if (timer_ack) { | ||
162 | /* | ||
163 | * Subtle, when I/O APICs are used we have to ack timer IRQ | ||
164 | * manually to reset the IRR bit for do_slow_gettimeoffset(). | ||
165 | * This will also deassert NMI lines for the watchdog if run | ||
166 | * on an 82489DX-based system. | ||
167 | */ | ||
168 | spin_lock(&i8259A_lock); | ||
169 | outb(0x0c, PIC_MASTER_OCW3); | ||
170 | /* Ack the IRQ; AEOI will end it automatically. */ | ||
171 | inb(PIC_MASTER_POLL); | ||
172 | spin_unlock(&i8259A_lock); | ||
173 | } | ||
174 | #endif | ||
175 | |||
176 | do_timer_interrupt_hook(); | ||
177 | |||
178 | if (MCA_bus) { | ||
179 | /* The PS/2 uses level-triggered interrupts. You can't | ||
180 | turn them off, nor would you want to (any attempt to | ||
181 | enable edge-triggered interrupts usually gets intercepted by a | ||
182 | special hardware circuit). Hence we have to acknowledge | ||
183 | the timer interrupt. Through some incredibly stupid | ||
184 | design idea, the reset for IRQ 0 is done by setting the | ||
185 | high bit of the PPI port B (0x61). Note that some PS/2s, | ||
186 | notably the 55SX, work fine if this is removed. */ | ||
187 | |||
188 | u8 irq_v = inb_p( 0x61 ); /* read the current state */ | ||
189 | outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ | ||
190 | } | ||
191 | |||
192 | return IRQ_HANDLED; | ||
193 | } | ||
194 | |||
195 | /* not static: needed by APM */ | ||
196 | unsigned long read_persistent_clock(void) | ||
197 | { | ||
198 | unsigned long retval; | ||
199 | unsigned long flags; | ||
200 | |||
201 | spin_lock_irqsave(&rtc_lock, flags); | ||
202 | |||
203 | retval = get_wallclock(); | ||
204 | |||
205 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
206 | |||
207 | return retval; | ||
208 | } | ||
209 | |||
210 | int update_persistent_clock(struct timespec now) | ||
211 | { | ||
212 | return set_rtc_mmss(now.tv_sec); | ||
213 | } | ||
214 | |||
215 | extern void (*late_time_init)(void); | ||
216 | /* Duplicate of time_init() below, with hpet_enable part added */ | ||
217 | void __init hpet_time_init(void) | ||
218 | { | ||
219 | if (!hpet_enable()) | ||
220 | setup_pit_timer(); | ||
221 | time_init_hook(); | ||
222 | } | ||
223 | |||
224 | /* | ||
225 | * This is called directly from init code; we must delay timer setup in the | ||
226 | * HPET case as we can't make the decision to turn on HPET this early in the | ||
227 | * boot process. | ||
228 | * | ||
229 | * The chosen time_init function will usually be hpet_time_init, above, but | ||
230 | * in the case of virtual hardware, an alternative function may be substituted. | ||
231 | */ | ||
232 | void __init time_init(void) | ||
233 | { | ||
234 | tsc_init(); | ||
235 | late_time_init = choose_time_init(); | ||
236 | } | ||
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c new file mode 100644 index 000000000000..45782356a618 --- /dev/null +++ b/arch/x86/kernel/topology.c | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * arch/i386/kernel/topology.c - Populate sysfs with topology information | ||
3 | * | ||
4 | * Written by: Matthew Dobson, IBM Corporation | ||
5 | * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL | ||
6 | * | ||
7 | * Copyright (C) 2002, IBM Corp. | ||
8 | * | ||
9 | * All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, but | ||
17 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
19 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
20 | * details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU General Public License | ||
23 | * along with this program; if not, write to the Free Software | ||
24 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
25 | * | ||
26 | * Send feedback to <colpatch@us.ibm.com> | ||
27 | */ | ||
28 | #include <linux/init.h> | ||
29 | #include <linux/smp.h> | ||
30 | #include <linux/nodemask.h> | ||
31 | #include <linux/mmzone.h> | ||
32 | #include <asm/cpu.h> | ||
33 | |||
34 | static struct i386_cpu cpu_devices[NR_CPUS]; | ||
35 | |||
36 | int arch_register_cpu(int num) | ||
37 | { | ||
38 | /* | ||
39 | * CPU0 cannot be offlined due to several | ||
40 | * restrictions and assumptions in kernel. This basically | ||
41 | * doesnt add a control file, one cannot attempt to offline | ||
42 | * BSP. | ||
43 | * | ||
44 | * Also certain PCI quirks require not to enable hotplug control | ||
45 | * for all CPU's. | ||
46 | */ | ||
47 | if (num && enable_cpu_hotplug) | ||
48 | cpu_devices[num].cpu.hotpluggable = 1; | ||
49 | |||
50 | return register_cpu(&cpu_devices[num].cpu, num); | ||
51 | } | ||
52 | |||
#ifdef CONFIG_HOTPLUG_CPU
/* Cleared to veto hotplug control for all CPUs (see arch_register_cpu()). */
int enable_cpu_hotplug = 1;

/*
 * Remove the sysfs device of CPU @num again.
 *
 * This function returns void, so the call is made as a plain statement
 * rather than as the operand of a return.
 */
void arch_unregister_cpu(int num)
{
	unregister_cpu(&cpu_devices[num].cpu);
}
EXPORT_SYMBOL(arch_register_cpu);
EXPORT_SYMBOL(arch_unregister_cpu);
#endif /*CONFIG_HOTPLUG_CPU*/
62 | |||
63 | static int __init topology_init(void) | ||
64 | { | ||
65 | int i; | ||
66 | |||
67 | #ifdef CONFIG_NUMA | ||
68 | for_each_online_node(i) | ||
69 | register_one_node(i); | ||
70 | #endif /* CONFIG_NUMA */ | ||
71 | |||
72 | for_each_present_cpu(i) | ||
73 | arch_register_cpu(i); | ||
74 | return 0; | ||
75 | } | ||
76 | |||
77 | subsys_initcall(topology_init); | ||
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S new file mode 100644 index 000000000000..f62815f8d06a --- /dev/null +++ b/arch/x86/kernel/trampoline_32.S | |||
@@ -0,0 +1,85 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Trampoline.S Derived from Setup.S by Linus Torvalds | ||
4 | * | ||
5 | * 4 Jan 1997 Michael Chastain: changed to gnu as. | ||
6 | * | ||
7 | * This is only used for booting secondary CPUs in SMP machine | ||
8 | * | ||
9 | * Entry: CS:IP point to the start of our code, we are | ||
10 | * in real mode with no stack, but the rest of the | ||
11 | * trampoline page to make our stack and everything else | ||
12 | * is a mystery. | ||
13 | * | ||
14 | * In fact we don't actually need a stack so we don't | ||
15 | * set one up. | ||
16 | * | ||
17 | * We jump into the boot/compressed/head.S code. So you'd | ||
18 | * better be running a compressed kernel image or you | ||
19 | * won't get very far. | ||
20 | * | ||
21 | * On entry to trampoline_data, the processor is in real mode | ||
22 | * with 16-bit addressing and 16-bit data. CS has some value | ||
23 | * and IP is zero. Thus, data addresses need to be absolute | ||
24 | * (no relocation) and are taken with regard to r_base. | ||
25 | * | ||
26 | * If you work on this file, check the object module with | ||
27 | * objdump --reloc to make sure there are no relocation | ||
28 | * entries except for: | ||
29 | * | ||
30 | * TYPE VALUE | ||
31 | * R_386_32 startup_32_smp | ||
32 | * R_386_32 boot_gdt | ||
33 | */ | ||
34 | |||
35 | #include <linux/linkage.h> | ||
36 | #include <asm/segment.h> | ||
37 | #include <asm/page.h> | ||
38 | |||
39 | .data | ||
40 | |||
41 | /* We can free up trampoline after bootup if cpu hotplug is not supported. */ | ||
42 | #ifndef CONFIG_HOTPLUG_CPU | ||
43 | .section ".init.data","aw",@progbits | ||
44 | #endif | ||
45 | |||
46 | .code16 | ||
47 | |||
/*
 * trampoline_data: 16-bit entry point of the trampoline.
 *
 * r_base is set to the address of the first instruction, so every data
 * reference below is written as "symbol - r_base", i.e. as an offset
 * from the start of the trampoline.  The sequence is:
 *   1. copy CS into DS,
 *   2. store the marker 0xA5A5A5A5 at offset 0 (trampoline_data - r_base),
 *   3. load the IDT and GDT descriptors defined at the end of this block,
 *   4. set the PE bit with lmsw and far-jump to startup_32_smp.
 */
48 | ENTRY(trampoline_data) | ||
49 | r_base = . | ||
50 | wbinvd # Needed for NUMA-Q should be harmless for others | ||
51 | mov %cs, %ax # Code and data in the same place | ||
52 | mov %ax, %ds | ||
53 | |||
54 | cli # We should be safe anyway | ||
55 | |||
56 | movl $0xA5A5A5A5, trampoline_data - r_base | ||
57 | # write marker for master knows we're running | ||
58 | |||
59 | /* GDT tables in non default location kernel can be beyond 16MB and | ||
60 | * lgdt will not be able to load the address as in real mode default | ||
61 | * operand size is 16bit. Use lgdtl instead to force operand size | ||
62 | * to 32 bit. | ||
63 | */ | ||
64 | |||
65 | lidtl boot_idt_descr - r_base # load idt with 0, 0 | ||
66 | lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate | ||
67 | |||
/* %ax = 1: only the PE bit is set in the word handed to lmsw. */
68 | xor %ax, %ax | ||
69 | inc %ax # protected mode (PE) bit | ||
70 | lmsw %ax # into protected mode | ||
71 | # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S | ||
72 | ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET) | ||
73 | |||
74 | # These need to be in the same 64K segment as the above; | ||
75 | # hence we don't use the boot_gdt_descr defined in head.S | ||
/* Descriptor operand for lgdtl: 16-bit limit followed by 32-bit base. */
76 | boot_gdt_descr: | ||
77 | .word __BOOT_DS + 7 # gdt limit | ||
78 | .long boot_gdt - __PAGE_OFFSET # gdt base | ||
79 | |||
/* Descriptor operand for lidtl: limit 0 and base 0. */
80 | boot_idt_descr: | ||
81 | .word 0 # idt limit = 0 | ||
82 | .long 0 # idt base = 0L | ||
83 | |||
/* trampoline_end labels the first byte after the trampoline. */
84 | .globl trampoline_end | ||
85 | trampoline_end: | ||
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c new file mode 100644 index 000000000000..47b0bef335bd --- /dev/null +++ b/arch/x86/kernel/traps_32.c | |||
@@ -0,0 +1,1250 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/traps.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * Pentium III FXSR, SSE support | ||
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
8 | */ | ||
9 | |||
10 | /* | ||
11 | * 'Traps.c' handles hardware traps and faults after we have saved some | ||
12 | * state in 'asm.s'. | ||
13 | */ | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/errno.h> | ||
18 | #include <linux/timer.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/delay.h> | ||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/interrupt.h> | ||
24 | #include <linux/highmem.h> | ||
25 | #include <linux/kallsyms.h> | ||
26 | #include <linux/ptrace.h> | ||
27 | #include <linux/utsname.h> | ||
28 | #include <linux/kprobes.h> | ||
29 | #include <linux/kexec.h> | ||
30 | #include <linux/unwind.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | #include <linux/nmi.h> | ||
33 | #include <linux/bug.h> | ||
34 | |||
35 | #ifdef CONFIG_EISA | ||
36 | #include <linux/ioport.h> | ||
37 | #include <linux/eisa.h> | ||
38 | #endif | ||
39 | |||
40 | #ifdef CONFIG_MCA | ||
41 | #include <linux/mca.h> | ||
42 | #endif | ||
43 | |||
44 | #if defined(CONFIG_EDAC) | ||
45 | #include <linux/edac.h> | ||
46 | #endif | ||
47 | |||
48 | #include <asm/processor.h> | ||
49 | #include <asm/system.h> | ||
50 | #include <asm/io.h> | ||
51 | #include <asm/atomic.h> | ||
52 | #include <asm/debugreg.h> | ||
53 | #include <asm/desc.h> | ||
54 | #include <asm/i387.h> | ||
55 | #include <asm/nmi.h> | ||
56 | #include <asm/unwind.h> | ||
57 | #include <asm/smp.h> | ||
58 | #include <asm/arch_hooks.h> | ||
59 | #include <linux/kdebug.h> | ||
60 | #include <asm/stacktrace.h> | ||
61 | |||
62 | #include <linux/module.h> | ||
63 | |||
64 | #include "mach_traps.h" | ||
65 | |||
66 | int panic_on_unrecovered_nmi; | ||
67 | |||
68 | asmlinkage int system_call(void); | ||
69 | |||
70 | /* Do we ignore FPU interrupts ? */ | ||
71 | char ignore_fpu_irq = 0; | ||
72 | |||
73 | /* | ||
74 | * The IDT has to be page-aligned to simplify the Pentium | ||
75 | * F0 0F bug workaround.. We have a special link segment | ||
76 | * for this. | ||
77 | */ | ||
78 | struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; | ||
79 | |||
80 | asmlinkage void divide_error(void); | ||
81 | asmlinkage void debug(void); | ||
82 | asmlinkage void nmi(void); | ||
83 | asmlinkage void int3(void); | ||
84 | asmlinkage void overflow(void); | ||
85 | asmlinkage void bounds(void); | ||
86 | asmlinkage void invalid_op(void); | ||
87 | asmlinkage void device_not_available(void); | ||
88 | asmlinkage void coprocessor_segment_overrun(void); | ||
89 | asmlinkage void invalid_TSS(void); | ||
90 | asmlinkage void segment_not_present(void); | ||
91 | asmlinkage void stack_segment(void); | ||
92 | asmlinkage void general_protection(void); | ||
93 | asmlinkage void page_fault(void); | ||
94 | asmlinkage void coprocessor_error(void); | ||
95 | asmlinkage void simd_coprocessor_error(void); | ||
96 | asmlinkage void alignment_check(void); | ||
97 | asmlinkage void spurious_interrupt_bug(void); | ||
98 | asmlinkage void machine_check(void); | ||
99 | |||
100 | int kstack_depth_to_print = 24; | ||
101 | static unsigned int code_bytes = 64; | ||
102 | |||
103 | static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) | ||
104 | { | ||
105 | return p > (void *)tinfo && | ||
106 | p <= (void *)tinfo + THREAD_SIZE - size; | ||
107 | } | ||
108 | |||
109 | /* The form of the top of the frame on the stack */ | ||
110 | struct stack_frame { | ||
111 | struct stack_frame *next_frame; | ||
112 | unsigned long return_address; | ||
113 | }; | ||
114 | |||
115 | static inline unsigned long print_context_stack(struct thread_info *tinfo, | ||
116 | unsigned long *stack, unsigned long ebp, | ||
117 | struct stacktrace_ops *ops, void *data) | ||
118 | { | ||
119 | #ifdef CONFIG_FRAME_POINTER | ||
120 | struct stack_frame *frame = (struct stack_frame *)ebp; | ||
121 | while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) { | ||
122 | struct stack_frame *next; | ||
123 | unsigned long addr; | ||
124 | |||
125 | addr = frame->return_address; | ||
126 | ops->address(data, addr); | ||
127 | /* | ||
128 | * break out of recursive entries (such as | ||
129 | * end_of_stack_stop_unwind_function). Also, | ||
130 | * we can never allow a frame pointer to | ||
131 | * move downwards! | ||
132 | */ | ||
133 | next = frame->next_frame; | ||
134 | if (next <= frame) | ||
135 | break; | ||
136 | frame = next; | ||
137 | } | ||
138 | #else | ||
139 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { | ||
140 | unsigned long addr; | ||
141 | |||
142 | addr = *stack++; | ||
143 | if (__kernel_text_address(addr)) | ||
144 | ops->address(data, addr); | ||
145 | } | ||
146 | #endif | ||
147 | return ebp; | ||
148 | } | ||
149 | |||
150 | #define MSG(msg) ops->warning(data, msg) | ||
151 | |||
152 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
153 | unsigned long *stack, | ||
154 | struct stacktrace_ops *ops, void *data) | ||
155 | { | ||
156 | unsigned long ebp = 0; | ||
157 | |||
158 | if (!task) | ||
159 | task = current; | ||
160 | |||
161 | if (!stack) { | ||
162 | unsigned long dummy; | ||
163 | stack = &dummy; | ||
164 | if (task != current) | ||
165 | stack = (unsigned long *)task->thread.esp; | ||
166 | } | ||
167 | |||
168 | #ifdef CONFIG_FRAME_POINTER | ||
169 | if (!ebp) { | ||
170 | if (task == current) { | ||
171 | /* Grab ebp right from our regs */ | ||
172 | asm ("movl %%ebp, %0" : "=r" (ebp) : ); | ||
173 | } else { | ||
174 | /* ebp is the last reg pushed by switch_to */ | ||
175 | ebp = *(unsigned long *) task->thread.esp; | ||
176 | } | ||
177 | } | ||
178 | #endif | ||
179 | |||
180 | while (1) { | ||
181 | struct thread_info *context; | ||
182 | context = (struct thread_info *) | ||
183 | ((unsigned long)stack & (~(THREAD_SIZE - 1))); | ||
184 | ebp = print_context_stack(context, stack, ebp, ops, data); | ||
185 | /* Should be after the line below, but somewhere | ||
186 | in early boot context comes out corrupted and we | ||
187 | can't reference it -AK */ | ||
188 | if (ops->stack(data, "IRQ") < 0) | ||
189 | break; | ||
190 | stack = (unsigned long*)context->previous_esp; | ||
191 | if (!stack) | ||
192 | break; | ||
193 | touch_nmi_watchdog(); | ||
194 | } | ||
195 | } | ||
196 | EXPORT_SYMBOL(dump_trace); | ||
197 | |||
/*
 * stacktrace_ops ->warning_symbol() callback for the printing tracer.
 *
 * @data is the log-level prefix string, @msg a print_symbol() format
 * and @symbol the address to resolve.  The prefix is printed through an
 * explicit "%s" so it is never interpreted as a format string, matching
 * print_trace_warning().
 */
static void
print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	printk("%s", (char *)data);
	print_symbol(msg, symbol);
	printk("\n");
}
205 | |||
/*
 * stacktrace_ops ->warning() callback: print @msg on a line of its own,
 * prefixed with the log-level string passed in @data.
 */
static void print_trace_warning(void *data, char *msg)
{
	char *log_lvl = data;

	printk("%s%s\n", log_lvl, msg);
}
210 | |||
/*
 * stacktrace_ops ->stack() callback for the printing tracer.  It prints
 * nothing and always returns 0, so dump_trace() never stops on its
 * account.
 */
static int print_trace_stack(void *data, char *name)
{
	return 0;
}
215 | |||
/*
 * stacktrace_ops ->address() callback: print one address/symbol entry
 * per line, prefixed with the log-level string passed in @data, and
 * touch the NMI watchdog after each entry.
 */
static void print_trace_address(void *data, unsigned long addr)
{
	char *log_lvl = data;

	printk("%s [<%08lx>] ", log_lvl, addr);
	print_symbol("%s\n", addr);
	touch_nmi_watchdog();
}
225 | |||
226 | static struct stacktrace_ops print_trace_ops = { | ||
227 | .warning = print_trace_warning, | ||
228 | .warning_symbol = print_trace_warning_symbol, | ||
229 | .stack = print_trace_stack, | ||
230 | .address = print_trace_address, | ||
231 | }; | ||
232 | |||
233 | static void | ||
234 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
235 | unsigned long * stack, char *log_lvl) | ||
236 | { | ||
237 | dump_trace(task, regs, stack, &print_trace_ops, log_lvl); | ||
238 | printk("%s =======================\n", log_lvl); | ||
239 | } | ||
240 | |||
/* Print a call trace for @task with an empty log-level prefix. */
void show_trace(struct task_struct *task, struct pt_regs *regs,
		unsigned long * stack)
{
	char *no_prefix = "";

	show_trace_log_lvl(task, regs, stack, no_prefix);
}
246 | |||
247 | static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
248 | unsigned long *esp, char *log_lvl) | ||
249 | { | ||
250 | unsigned long *stack; | ||
251 | int i; | ||
252 | |||
253 | if (esp == NULL) { | ||
254 | if (task) | ||
255 | esp = (unsigned long*)task->thread.esp; | ||
256 | else | ||
257 | esp = (unsigned long *)&esp; | ||
258 | } | ||
259 | |||
260 | stack = esp; | ||
261 | for(i = 0; i < kstack_depth_to_print; i++) { | ||
262 | if (kstack_end(stack)) | ||
263 | break; | ||
264 | if (i && ((i % 8) == 0)) | ||
265 | printk("\n%s ", log_lvl); | ||
266 | printk("%08lx ", *stack++); | ||
267 | } | ||
268 | printk("\n%sCall Trace:\n", log_lvl); | ||
269 | show_trace_log_lvl(task, regs, esp, log_lvl); | ||
270 | } | ||
271 | |||
272 | void show_stack(struct task_struct *task, unsigned long *esp) | ||
273 | { | ||
274 | printk(" "); | ||
275 | show_stack_log_lvl(task, NULL, esp, ""); | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * The architecture-independent dump_stack generator | ||
280 | */ | ||
281 | void dump_stack(void) | ||
282 | { | ||
283 | unsigned long stack; | ||
284 | |||
285 | show_trace(current, NULL, &stack); | ||
286 | } | ||
287 | |||
288 | EXPORT_SYMBOL(dump_stack); | ||
289 | |||
290 | void show_registers(struct pt_regs *regs) | ||
291 | { | ||
292 | int i; | ||
293 | int in_kernel = 1; | ||
294 | unsigned long esp; | ||
295 | unsigned short ss, gs; | ||
296 | |||
297 | esp = (unsigned long) (®s->esp); | ||
298 | savesegment(ss, ss); | ||
299 | savesegment(gs, gs); | ||
300 | if (user_mode_vm(regs)) { | ||
301 | in_kernel = 0; | ||
302 | esp = regs->esp; | ||
303 | ss = regs->xss & 0xffff; | ||
304 | } | ||
305 | print_modules(); | ||
306 | printk(KERN_EMERG "CPU: %d\n" | ||
307 | KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n" | ||
308 | KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n", | ||
309 | smp_processor_id(), 0xffff & regs->xcs, regs->eip, | ||
310 | print_tainted(), regs->eflags, init_utsname()->release, | ||
311 | (int)strcspn(init_utsname()->version, " "), | ||
312 | init_utsname()->version); | ||
313 | print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip); | ||
314 | printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", | ||
315 | regs->eax, regs->ebx, regs->ecx, regs->edx); | ||
316 | printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", | ||
317 | regs->esi, regs->edi, regs->ebp, esp); | ||
318 | printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", | ||
319 | regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); | ||
320 | printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", | ||
321 | TASK_COMM_LEN, current->comm, current->pid, | ||
322 | current_thread_info(), current, task_thread_info(current)); | ||
323 | /* | ||
324 | * When in-kernel, we also print out the stack and code at the | ||
325 | * time of the fault.. | ||
326 | */ | ||
327 | if (in_kernel) { | ||
328 | u8 *eip; | ||
329 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
330 | unsigned int code_len = code_bytes; | ||
331 | unsigned char c; | ||
332 | |||
333 | printk("\n" KERN_EMERG "Stack: "); | ||
334 | show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG); | ||
335 | |||
336 | printk(KERN_EMERG "Code: "); | ||
337 | |||
338 | eip = (u8 *)regs->eip - code_prologue; | ||
339 | if (eip < (u8 *)PAGE_OFFSET || | ||
340 | probe_kernel_address(eip, c)) { | ||
341 | /* try starting at EIP */ | ||
342 | eip = (u8 *)regs->eip; | ||
343 | code_len = code_len - code_prologue + 1; | ||
344 | } | ||
345 | for (i = 0; i < code_len; i++, eip++) { | ||
346 | if (eip < (u8 *)PAGE_OFFSET || | ||
347 | probe_kernel_address(eip, c)) { | ||
348 | printk(" Bad EIP value."); | ||
349 | break; | ||
350 | } | ||
351 | if (eip == (u8 *)regs->eip) | ||
352 | printk("<%02x> ", c); | ||
353 | else | ||
354 | printk("%02x ", c); | ||
355 | } | ||
356 | } | ||
357 | printk("\n"); | ||
358 | } | ||
359 | |||
360 | int is_valid_bugaddr(unsigned long eip) | ||
361 | { | ||
362 | unsigned short ud2; | ||
363 | |||
364 | if (eip < PAGE_OFFSET) | ||
365 | return 0; | ||
366 | if (probe_kernel_address((unsigned short *)eip, ud2)) | ||
367 | return 0; | ||
368 | |||
369 | return ud2 == 0x0b0f; | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * This is gone through when something in the kernel has done something bad and | ||
374 | * is about to be terminated. | ||
375 | */ | ||
376 | void die(const char * str, struct pt_regs * regs, long err) | ||
377 | { | ||
378 | static struct { | ||
379 | spinlock_t lock; | ||
380 | u32 lock_owner; | ||
381 | int lock_owner_depth; | ||
382 | } die = { | ||
383 | .lock = __SPIN_LOCK_UNLOCKED(die.lock), | ||
384 | .lock_owner = -1, | ||
385 | .lock_owner_depth = 0 | ||
386 | }; | ||
387 | static int die_counter; | ||
388 | unsigned long flags; | ||
389 | |||
390 | oops_enter(); | ||
391 | |||
392 | if (die.lock_owner != raw_smp_processor_id()) { | ||
393 | console_verbose(); | ||
394 | spin_lock_irqsave(&die.lock, flags); | ||
395 | die.lock_owner = smp_processor_id(); | ||
396 | die.lock_owner_depth = 0; | ||
397 | bust_spinlocks(1); | ||
398 | } | ||
399 | else | ||
400 | local_save_flags(flags); | ||
401 | |||
402 | if (++die.lock_owner_depth < 3) { | ||
403 | int nl = 0; | ||
404 | unsigned long esp; | ||
405 | unsigned short ss; | ||
406 | |||
407 | report_bug(regs->eip, regs); | ||
408 | |||
409 | printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); | ||
410 | #ifdef CONFIG_PREEMPT | ||
411 | printk(KERN_EMERG "PREEMPT "); | ||
412 | nl = 1; | ||
413 | #endif | ||
414 | #ifdef CONFIG_SMP | ||
415 | if (!nl) | ||
416 | printk(KERN_EMERG); | ||
417 | printk("SMP "); | ||
418 | nl = 1; | ||
419 | #endif | ||
420 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
421 | if (!nl) | ||
422 | printk(KERN_EMERG); | ||
423 | printk("DEBUG_PAGEALLOC"); | ||
424 | nl = 1; | ||
425 | #endif | ||
426 | if (nl) | ||
427 | printk("\n"); | ||
428 | if (notify_die(DIE_OOPS, str, regs, err, | ||
429 | current->thread.trap_no, SIGSEGV) != | ||
430 | NOTIFY_STOP) { | ||
431 | show_registers(regs); | ||
432 | /* Executive summary in case the oops scrolled away */ | ||
433 | esp = (unsigned long) (®s->esp); | ||
434 | savesegment(ss, ss); | ||
435 | if (user_mode(regs)) { | ||
436 | esp = regs->esp; | ||
437 | ss = regs->xss & 0xffff; | ||
438 | } | ||
439 | printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); | ||
440 | print_symbol("%s", regs->eip); | ||
441 | printk(" SS:ESP %04x:%08lx\n", ss, esp); | ||
442 | } | ||
443 | else | ||
444 | regs = NULL; | ||
445 | } else | ||
446 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | ||
447 | |||
448 | bust_spinlocks(0); | ||
449 | die.lock_owner = -1; | ||
450 | add_taint(TAINT_DIE); | ||
451 | spin_unlock_irqrestore(&die.lock, flags); | ||
452 | |||
453 | if (!regs) | ||
454 | return; | ||
455 | |||
456 | if (kexec_should_crash(current)) | ||
457 | crash_kexec(regs); | ||
458 | |||
459 | if (in_interrupt()) | ||
460 | panic("Fatal exception in interrupt"); | ||
461 | |||
462 | if (panic_on_oops) | ||
463 | panic("Fatal exception"); | ||
464 | |||
465 | oops_exit(); | ||
466 | do_exit(SIGSEGV); | ||
467 | } | ||
468 | |||
/*
 * Call die() only when @regs was not saved in user mode (vm86 mode
 * counts as user mode here); otherwise do nothing.
 */
static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
{
	if (user_mode_vm(regs))
		return;

	die(str, regs, err);
}
474 | |||
475 | static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, | ||
476 | struct pt_regs * regs, long error_code, | ||
477 | siginfo_t *info) | ||
478 | { | ||
479 | struct task_struct *tsk = current; | ||
480 | |||
481 | if (regs->eflags & VM_MASK) { | ||
482 | if (vm86) | ||
483 | goto vm86_trap; | ||
484 | goto trap_signal; | ||
485 | } | ||
486 | |||
487 | if (!user_mode(regs)) | ||
488 | goto kernel_trap; | ||
489 | |||
490 | trap_signal: { | ||
491 | /* | ||
492 | * We want error_code and trap_no set for userspace faults and | ||
493 | * kernelspace faults which result in die(), but not | ||
494 | * kernelspace faults which are fixed up. die() gives the | ||
495 | * process no chance to handle the signal and notice the | ||
496 | * kernel fault information, so that won't result in polluting | ||
497 | * the information about previously queued, but not yet | ||
498 | * delivered, faults. See also do_general_protection below. | ||
499 | */ | ||
500 | tsk->thread.error_code = error_code; | ||
501 | tsk->thread.trap_no = trapnr; | ||
502 | |||
503 | if (info) | ||
504 | force_sig_info(signr, info, tsk); | ||
505 | else | ||
506 | force_sig(signr, tsk); | ||
507 | return; | ||
508 | } | ||
509 | |||
510 | kernel_trap: { | ||
511 | if (!fixup_exception(regs)) { | ||
512 | tsk->thread.error_code = error_code; | ||
513 | tsk->thread.trap_no = trapnr; | ||
514 | die(str, regs, error_code); | ||
515 | } | ||
516 | return; | ||
517 | } | ||
518 | |||
519 | vm86_trap: { | ||
520 | int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); | ||
521 | if (ret) goto trap_signal; | ||
522 | return; | ||
523 | } | ||
524 | } | ||
525 | |||
/*
 * Generate do_<name>(): give the die-notifier chain a chance to consume
 * the trap, otherwise hand it to do_trap() with no vm86 handling and no
 * siginfo.
 */
#define DO_ERROR(trapnr, signr, str, name) \
fastcall void do_##name(struct pt_regs * regs, long error_code) \
{ \
	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
						!= NOTIFY_STOP) \
		do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
}
534 | |||
/*
 * Like DO_ERROR, but the generated handler also builds a siginfo
 * (si_code = sicode, si_addr = siaddr) and, when @irq is non-zero,
 * enables local interrupts first.  The siginfo is filled in before the
 * notifier chain runs, so siaddr is sampled from the entry-time regs.
 */
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
fastcall void do_##name(struct pt_regs * regs, long error_code) \
{ \
	siginfo_t info; \
	\
	if (irq) \
		local_irq_enable(); \
	\
	info.si_signo = signr; \
	info.si_errno = 0; \
	info.si_code = sicode; \
	info.si_addr = (void __user *)siaddr; \
	\
	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
						!= NOTIFY_STOP) \
		do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
}
550 | |||
/*
 * Like DO_ERROR, but do_trap() is called with vm86 = 1 so that a trap
 * taken with VM_MASK set is first offered to handle_vm86_trap().
 */
#define DO_VM86_ERROR(trapnr, signr, str, name) \
fastcall void do_##name(struct pt_regs * regs, long error_code) \
{ \
	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
						!= NOTIFY_STOP) \
		do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
}
559 | |||
/*
 * Combination of DO_VM86_ERROR and DO_ERROR_INFO (without the irq
 * option): builds a siginfo from sicode/siaddr, then calls do_trap()
 * with vm86 = 1 unless a die notifier stops the trap.
 */
#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
fastcall void do_##name(struct pt_regs * regs, long error_code) \
{ \
	siginfo_t info; \
	\
	info.si_signo = signr; \
	info.si_errno = 0; \
	info.si_code = sicode; \
	info.si_addr = (void __user *)siaddr; \
	\
	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
						!= NOTIFY_STOP) \
		do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
}
573 | |||
574 | DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) | ||
575 | #ifndef CONFIG_KPROBES | ||
576 | DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) | ||
577 | #endif | ||
578 | DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) | ||
579 | DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) | ||
580 | DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) | ||
581 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
582 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | ||
583 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | ||
584 | DO_ERROR(12, SIGBUS, "stack segment", stack_segment) | ||
585 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) | ||
586 | DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) | ||
587 | |||
588 | fastcall void __kprobes do_general_protection(struct pt_regs * regs, | ||
589 | long error_code) | ||
590 | { | ||
591 | int cpu = get_cpu(); | ||
592 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
593 | struct thread_struct *thread = ¤t->thread; | ||
594 | |||
595 | /* | ||
596 | * Perform the lazy TSS's I/O bitmap copy. If the TSS has an | ||
597 | * invalid offset set (the LAZY one) and the faulting thread has | ||
598 | * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS | ||
599 | * and we set the offset field correctly. Then we let the CPU to | ||
600 | * restart the faulting instruction. | ||
601 | */ | ||
602 | if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && | ||
603 | thread->io_bitmap_ptr) { | ||
604 | memcpy(tss->io_bitmap, thread->io_bitmap_ptr, | ||
605 | thread->io_bitmap_max); | ||
606 | /* | ||
607 | * If the previously set map was extending to higher ports | ||
608 | * than the current one, pad extra space with 0xff (no access). | ||
609 | */ | ||
610 | if (thread->io_bitmap_max < tss->io_bitmap_max) | ||
611 | memset((char *) tss->io_bitmap + | ||
612 | thread->io_bitmap_max, 0xff, | ||
613 | tss->io_bitmap_max - thread->io_bitmap_max); | ||
614 | tss->io_bitmap_max = thread->io_bitmap_max; | ||
615 | tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; | ||
616 | tss->io_bitmap_owner = thread; | ||
617 | put_cpu(); | ||
618 | return; | ||
619 | } | ||
620 | put_cpu(); | ||
621 | |||
622 | if (regs->eflags & VM_MASK) | ||
623 | goto gp_in_vm86; | ||
624 | |||
625 | if (!user_mode(regs)) | ||
626 | goto gp_in_kernel; | ||
627 | |||
628 | current->thread.error_code = error_code; | ||
629 | current->thread.trap_no = 13; | ||
630 | if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && | ||
631 | printk_ratelimit()) | ||
632 | printk(KERN_INFO | ||
633 | "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", | ||
634 | current->comm, current->pid, | ||
635 | regs->eip, regs->esp, error_code); | ||
636 | |||
637 | force_sig(SIGSEGV, current); | ||
638 | return; | ||
639 | |||
640 | gp_in_vm86: | ||
641 | local_irq_enable(); | ||
642 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | ||
643 | return; | ||
644 | |||
645 | gp_in_kernel: | ||
646 | if (!fixup_exception(regs)) { | ||
647 | current->thread.error_code = error_code; | ||
648 | current->thread.trap_no = 13; | ||
649 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
650 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | ||
651 | return; | ||
652 | die("general protection fault", regs, error_code); | ||
653 | } | ||
654 | } | ||
655 | |||
656 | static __kprobes void | ||
657 | mem_parity_error(unsigned char reason, struct pt_regs * regs) | ||
658 | { | ||
659 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " | ||
660 | "CPU %d.\n", reason, smp_processor_id()); | ||
661 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | ||
662 | |||
663 | #if defined(CONFIG_EDAC) | ||
664 | if(edac_handler_set()) { | ||
665 | edac_atomic_assert_error(); | ||
666 | return; | ||
667 | } | ||
668 | #endif | ||
669 | |||
670 | if (panic_on_unrecovered_nmi) | ||
671 | panic("NMI: Not continuing"); | ||
672 | |||
673 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
674 | |||
675 | /* Clear and disable the memory parity error line. */ | ||
676 | clear_mem_error(reason); | ||
677 | } | ||
678 | |||
679 | static __kprobes void | ||
680 | io_check_error(unsigned char reason, struct pt_regs * regs) | ||
681 | { | ||
682 | unsigned long i; | ||
683 | |||
684 | printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); | ||
685 | show_registers(regs); | ||
686 | |||
687 | /* Re-enable the IOCK line, wait for a few seconds */ | ||
688 | reason = (reason & 0xf) | 8; | ||
689 | outb(reason, 0x61); | ||
690 | i = 2000; | ||
691 | while (--i) udelay(1000); | ||
692 | reason &= ~8; | ||
693 | outb(reason, 0x61); | ||
694 | } | ||
695 | |||
696 | static __kprobes void | ||
697 | unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | ||
698 | { | ||
699 | #ifdef CONFIG_MCA | ||
700 | /* Might actually be able to figure out what the guilty party | ||
701 | * is. */ | ||
702 | if( MCA_bus ) { | ||
703 | mca_handle_nmi(); | ||
704 | return; | ||
705 | } | ||
706 | #endif | ||
707 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " | ||
708 | "CPU %d.\n", reason, smp_processor_id()); | ||
709 | printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); | ||
710 | if (panic_on_unrecovered_nmi) | ||
711 | panic("NMI: Not continuing"); | ||
712 | |||
713 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
714 | } | ||
715 | |||
716 | static DEFINE_SPINLOCK(nmi_print_lock); | ||
717 | |||
718 | void __kprobes die_nmi(struct pt_regs *regs, const char *msg) | ||
719 | { | ||
720 | if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == | ||
721 | NOTIFY_STOP) | ||
722 | return; | ||
723 | |||
724 | spin_lock(&nmi_print_lock); | ||
725 | /* | ||
726 | * We are in trouble anyway, lets at least try | ||
727 | * to get a message out. | ||
728 | */ | ||
729 | bust_spinlocks(1); | ||
730 | printk(KERN_EMERG "%s", msg); | ||
731 | printk(" on CPU%d, eip %08lx, registers:\n", | ||
732 | smp_processor_id(), regs->eip); | ||
733 | show_registers(regs); | ||
734 | console_silent(); | ||
735 | spin_unlock(&nmi_print_lock); | ||
736 | bust_spinlocks(0); | ||
737 | |||
738 | /* If we are in kernel we are probably nested up pretty bad | ||
739 | * and might aswell get out now while we still can. | ||
740 | */ | ||
741 | if (!user_mode_vm(regs)) { | ||
742 | current->thread.trap_no = 2; | ||
743 | crash_kexec(regs); | ||
744 | } | ||
745 | |||
746 | do_exit(SIGSEGV); | ||
747 | } | ||
748 | |||
749 | static __kprobes void default_do_nmi(struct pt_regs * regs) | ||
750 | { | ||
751 | unsigned char reason = 0; | ||
752 | |||
753 | /* Only the BSP gets external NMIs from the system. */ | ||
754 | if (!smp_processor_id()) | ||
755 | reason = get_nmi_reason(); | ||
756 | |||
757 | if (!(reason & 0xc0)) { | ||
758 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | ||
759 | == NOTIFY_STOP) | ||
760 | return; | ||
761 | #ifdef CONFIG_X86_LOCAL_APIC | ||
762 | /* | ||
763 | * Ok, so this is none of the documented NMI sources, | ||
764 | * so it must be the NMI watchdog. | ||
765 | */ | ||
766 | if (nmi_watchdog_tick(regs, reason)) | ||
767 | return; | ||
768 | if (!do_nmi_callback(regs, smp_processor_id())) | ||
769 | #endif | ||
770 | unknown_nmi_error(reason, regs); | ||
771 | |||
772 | return; | ||
773 | } | ||
774 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | ||
775 | return; | ||
776 | if (reason & 0x80) | ||
777 | mem_parity_error(reason, regs); | ||
778 | if (reason & 0x40) | ||
779 | io_check_error(reason, regs); | ||
780 | /* | ||
781 | * Reassert NMI in case it became active meanwhile | ||
782 | * as it's edge-triggered. | ||
783 | */ | ||
784 | reassert_nmi(); | ||
785 | } | ||
786 | |||
787 | static int ignore_nmis; | ||
788 | |||
789 | fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) | ||
790 | { | ||
791 | int cpu; | ||
792 | |||
793 | nmi_enter(); | ||
794 | |||
795 | cpu = smp_processor_id(); | ||
796 | |||
797 | ++nmi_count(cpu); | ||
798 | |||
799 | if (!ignore_nmis) | ||
800 | default_do_nmi(regs); | ||
801 | |||
802 | nmi_exit(); | ||
803 | } | ||
804 | |||
805 | void stop_nmi(void) | ||
806 | { | ||
807 | acpi_nmi_disable(); | ||
808 | ignore_nmis++; | ||
809 | } | ||
810 | |||
811 | void restart_nmi(void) | ||
812 | { | ||
813 | ignore_nmis--; | ||
814 | acpi_nmi_enable(); | ||
815 | } | ||
816 | |||
#ifdef CONFIG_KPROBES
/*
 * int3 (trap 3) handler used when kprobes is configured.  The die
 * notifiers see the trap first; if none of them stops it, interrupts
 * are restored and the trap is processed like the macro-generated
 * handlers, with vm86 handling enabled.
 */
fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
							!= NOTIFY_STOP) {
		/*
		 * This is an interrupt gate, because kprobes wants interrupts
		 * disabled.  Normal trap handlers don't.
		 */
		restore_interrupts(regs);
		do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
	}
}
#endif
829 | |||
830 | /* | ||
831 | * Our handling of the processor debug registers is non-trivial. | ||
832 | * We do not clear them on entry and exit from the kernel. Therefore | ||
833 | * it is possible to get a watchpoint trap here from inside the kernel. | ||
834 | * However, the code in ./ptrace.c has ensured that the user can | ||
835 | * only set watchpoints on userspace addresses. Therefore the in-kernel | ||
836 | * watchpoint trap can only occur in code which is reading/writing | ||
837 | * from user space. Such code must not hold kernel locks (since it | ||
838 | * can equally take a page fault), therefore it is safe to call | ||
839 | * force_sig_info even though that claims and releases locks. | ||
840 | * | ||
841 | * Code in ./signal.c ensures that the debug control register | ||
842 | * is restored before we deliver any signal, and therefore that | ||
843 | * user code runs with the correct debug control register even though | ||
844 | * we clear it here. | ||
845 | * | ||
846 | * Being careful here means that we don't have to be as careful in a | ||
847 | * lot of more complicated places (task switching can be a bit lazy | ||
848 | * about restoring all the debug state, and ptrace doesn't have to | ||
849 | * find every occurrence of the TF bit that could be saved away even | ||
850 | * by user code) | ||
851 | */ | ||
852 | fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) | ||
853 | { | ||
854 | unsigned int condition; | ||
855 | struct task_struct *tsk = current; | ||
856 | |||
857 | get_debugreg(condition, 6); | ||
858 | |||
859 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | ||
860 | SIGTRAP) == NOTIFY_STOP) | ||
861 | return; | ||
862 | /* It's safe to allow irq's after DR6 has been saved */ | ||
863 | if (regs->eflags & X86_EFLAGS_IF) | ||
864 | local_irq_enable(); | ||
865 | |||
866 | /* Mask out spurious debug traps due to lazy DR7 setting */ | ||
867 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | ||
868 | if (!tsk->thread.debugreg[7]) | ||
869 | goto clear_dr7; | ||
870 | } | ||
871 | |||
872 | if (regs->eflags & VM_MASK) | ||
873 | goto debug_vm86; | ||
874 | |||
875 | /* Save debug status register where ptrace can see it */ | ||
876 | tsk->thread.debugreg[6] = condition; | ||
877 | |||
878 | /* | ||
879 | * Single-stepping through TF: make sure we ignore any events in | ||
880 | * kernel space (but re-enable TF when returning to user mode). | ||
881 | */ | ||
882 | if (condition & DR_STEP) { | ||
883 | /* | ||
884 | * We already checked v86 mode above, so we can | ||
885 | * check for kernel mode by just checking the CPL | ||
886 | * of CS. | ||
887 | */ | ||
888 | if (!user_mode(regs)) | ||
889 | goto clear_TF_reenable; | ||
890 | } | ||
891 | |||
892 | /* Ok, finally something we can handle */ | ||
893 | send_sigtrap(tsk, regs, error_code); | ||
894 | |||
895 | /* Disable additional traps. They'll be re-enabled when | ||
896 | * the signal is delivered. | ||
897 | */ | ||
898 | clear_dr7: | ||
899 | set_debugreg(0, 7); | ||
900 | return; | ||
901 | |||
902 | debug_vm86: | ||
903 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); | ||
904 | return; | ||
905 | |||
906 | clear_TF_reenable: | ||
907 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | ||
908 | regs->eflags &= ~TF_MASK; | ||
909 | return; | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Note that we play around with the 'TS' bit in an attempt to get | ||
914 | * the correct behaviour even in the presence of the asynchronous | ||
915 | * IRQ13 behaviour | ||
916 | */ | ||
917 | void math_error(void __user *eip) | ||
918 | { | ||
919 | struct task_struct * task; | ||
920 | siginfo_t info; | ||
921 | unsigned short cwd, swd; | ||
922 | |||
923 | /* | ||
924 | * Save the info for the exception handler and clear the error. | ||
925 | */ | ||
926 | task = current; | ||
927 | save_init_fpu(task); | ||
928 | task->thread.trap_no = 16; | ||
929 | task->thread.error_code = 0; | ||
930 | info.si_signo = SIGFPE; | ||
931 | info.si_errno = 0; | ||
932 | info.si_code = __SI_FAULT; | ||
933 | info.si_addr = eip; | ||
934 | /* | ||
935 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | ||
936 | * status. 0x3f is the exception bits in these regs, 0x200 is the | ||
937 | * C1 reg you need in case of a stack fault, 0x040 is the stack | ||
938 | * fault bit. We should only be taking one exception at a time, | ||
939 | * so if this combination doesn't produce any single exception, | ||
940 | * then we have a bad program that isn't syncronizing its FPU usage | ||
941 | * and it will suffer the consequences since we won't be able to | ||
942 | * fully reproduce the context of the exception | ||
943 | */ | ||
944 | cwd = get_fpu_cwd(task); | ||
945 | swd = get_fpu_swd(task); | ||
946 | switch (swd & ~cwd & 0x3f) { | ||
947 | case 0x000: /* No unmasked exception */ | ||
948 | return; | ||
949 | default: /* Multiple exceptions */ | ||
950 | break; | ||
951 | case 0x001: /* Invalid Op */ | ||
952 | /* | ||
953 | * swd & 0x240 == 0x040: Stack Underflow | ||
954 | * swd & 0x240 == 0x240: Stack Overflow | ||
955 | * User must clear the SF bit (0x40) if set | ||
956 | */ | ||
957 | info.si_code = FPE_FLTINV; | ||
958 | break; | ||
959 | case 0x002: /* Denormalize */ | ||
960 | case 0x010: /* Underflow */ | ||
961 | info.si_code = FPE_FLTUND; | ||
962 | break; | ||
963 | case 0x004: /* Zero Divide */ | ||
964 | info.si_code = FPE_FLTDIV; | ||
965 | break; | ||
966 | case 0x008: /* Overflow */ | ||
967 | info.si_code = FPE_FLTOVF; | ||
968 | break; | ||
969 | case 0x020: /* Precision */ | ||
970 | info.si_code = FPE_FLTRES; | ||
971 | break; | ||
972 | } | ||
973 | force_sig_info(SIGFPE, &info, task); | ||
974 | } | ||
975 | |||
976 | fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) | ||
977 | { | ||
978 | ignore_fpu_irq = 1; | ||
979 | math_error((void __user *)regs->eip); | ||
980 | } | ||
981 | |||
982 | static void simd_math_error(void __user *eip) | ||
983 | { | ||
984 | struct task_struct * task; | ||
985 | siginfo_t info; | ||
986 | unsigned short mxcsr; | ||
987 | |||
988 | /* | ||
989 | * Save the info for the exception handler and clear the error. | ||
990 | */ | ||
991 | task = current; | ||
992 | save_init_fpu(task); | ||
993 | task->thread.trap_no = 19; | ||
994 | task->thread.error_code = 0; | ||
995 | info.si_signo = SIGFPE; | ||
996 | info.si_errno = 0; | ||
997 | info.si_code = __SI_FAULT; | ||
998 | info.si_addr = eip; | ||
999 | /* | ||
1000 | * The SIMD FPU exceptions are handled a little differently, as there | ||
1001 | * is only a single status/control register. Thus, to determine which | ||
1002 | * unmasked exception was caught we must mask the exception mask bits | ||
1003 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
1004 | */ | ||
1005 | mxcsr = get_fpu_mxcsr(task); | ||
1006 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | ||
1007 | case 0x000: | ||
1008 | default: | ||
1009 | break; | ||
1010 | case 0x001: /* Invalid Op */ | ||
1011 | info.si_code = FPE_FLTINV; | ||
1012 | break; | ||
1013 | case 0x002: /* Denormalize */ | ||
1014 | case 0x010: /* Underflow */ | ||
1015 | info.si_code = FPE_FLTUND; | ||
1016 | break; | ||
1017 | case 0x004: /* Zero Divide */ | ||
1018 | info.si_code = FPE_FLTDIV; | ||
1019 | break; | ||
1020 | case 0x008: /* Overflow */ | ||
1021 | info.si_code = FPE_FLTOVF; | ||
1022 | break; | ||
1023 | case 0x020: /* Precision */ | ||
1024 | info.si_code = FPE_FLTRES; | ||
1025 | break; | ||
1026 | } | ||
1027 | force_sig_info(SIGFPE, &info, task); | ||
1028 | } | ||
1029 | |||
1030 | fastcall void do_simd_coprocessor_error(struct pt_regs * regs, | ||
1031 | long error_code) | ||
1032 | { | ||
1033 | if (cpu_has_xmm) { | ||
1034 | /* Handle SIMD FPU exceptions on PIII+ processors. */ | ||
1035 | ignore_fpu_irq = 1; | ||
1036 | simd_math_error((void __user *)regs->eip); | ||
1037 | } else { | ||
1038 | /* | ||
1039 | * Handle strange cache flush from user space exception | ||
1040 | * in all other cases. This is undocumented behaviour. | ||
1041 | */ | ||
1042 | if (regs->eflags & VM_MASK) { | ||
1043 | handle_vm86_fault((struct kernel_vm86_regs *)regs, | ||
1044 | error_code); | ||
1045 | return; | ||
1046 | } | ||
1047 | current->thread.trap_no = 19; | ||
1048 | current->thread.error_code = error_code; | ||
1049 | die_if_kernel("cache flush denied", regs, error_code); | ||
1050 | force_sig(SIGSEGV, current); | ||
1051 | } | ||
1052 | } | ||
1053 | |||
1054 | fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, | ||
1055 | long error_code) | ||
1056 | { | ||
1057 | #if 0 | ||
1058 | /* No need to warn about this any longer. */ | ||
1059 | printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); | ||
1060 | #endif | ||
1061 | } | ||
1062 | |||
1063 | fastcall unsigned long patch_espfix_desc(unsigned long uesp, | ||
1064 | unsigned long kesp) | ||
1065 | { | ||
1066 | struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; | ||
1067 | unsigned long base = (kesp - uesp) & -THREAD_SIZE; | ||
1068 | unsigned long new_kesp = kesp - base; | ||
1069 | unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; | ||
1070 | __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; | ||
1071 | /* Set up base for espfix segment */ | ||
1072 | desc &= 0x00f0ff0000000000ULL; | ||
1073 | desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | | ||
1074 | ((((__u64)base) << 32) & 0xff00000000000000ULL) | | ||
1075 | ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | | ||
1076 | (lim_pages & 0xffff); | ||
1077 | *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; | ||
1078 | return new_kesp; | ||
1079 | } | ||
1080 | |||
1081 | /* | ||
1082 | * 'math_state_restore()' saves the current math information in the | ||
1083 | * old math state array, and gets the new ones from the current task | ||
1084 | * | ||
1085 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | ||
1086 | * Don't touch unless you *really* know how it works. | ||
1087 | * | ||
1088 | * Must be called with kernel preemption disabled (in this case, | ||
1089 | * local interrupts are disabled at the call-site in entry.S). | ||
1090 | */ | ||
1091 | asmlinkage void math_state_restore(void) | ||
1092 | { | ||
1093 | struct thread_info *thread = current_thread_info(); | ||
1094 | struct task_struct *tsk = thread->task; | ||
1095 | |||
1096 | clts(); /* Allow maths ops (or we recurse) */ | ||
1097 | if (!tsk_used_math(tsk)) | ||
1098 | init_fpu(tsk); | ||
1099 | restore_fpu(tsk); | ||
1100 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | ||
1101 | tsk->fpu_counter++; | ||
1102 | } | ||
1103 | EXPORT_SYMBOL_GPL(math_state_restore); | ||
1104 | |||
1105 | #ifndef CONFIG_MATH_EMULATION | ||
1106 | |||
1107 | asmlinkage void math_emulate(long arg) | ||
1108 | { | ||
1109 | printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); | ||
1110 | printk(KERN_EMERG "killing %s.\n",current->comm); | ||
1111 | force_sig(SIGFPE,current); | ||
1112 | schedule(); | ||
1113 | } | ||
1114 | |||
1115 | #endif /* CONFIG_MATH_EMULATION */ | ||
1116 | |||
#ifdef CONFIG_X86_F00F_BUG
/*
 * Map idt_table read-only at the FIX_F00F_IDT fixmap slot and point the
 * IDT register at that mapping.
 */
void __init trap_init_f00f_bug(void)
{
	__set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);

	/*
	 * Update the IDT descriptor and reload the IDT so that
	 * it uses the read-only mapped virtual address.
	 */
	idt_descr.address = fix_to_virt(FIX_F00F_IDT);
	load_idt(&idt_descr);
}
#endif
1130 | |||
1131 | /* | ||
1132 | * This needs to use 'idt_table' rather than 'idt', and | ||
1133 | * thus use the _nonmapped_ version of the IDT, as the | ||
1134 | * Pentium F0 0F bugfix can have resulted in the mapped | ||
1135 | * IDT being write-protected. | ||
1136 | */ | ||
1137 | void set_intr_gate(unsigned int n, void *addr) | ||
1138 | { | ||
1139 | _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS); | ||
1140 | } | ||
1141 | |||
1142 | /* | ||
1143 | * This routine sets up an interrupt gate at directory privilege level 3. | ||
1144 | */ | ||
1145 | static inline void set_system_intr_gate(unsigned int n, void *addr) | ||
1146 | { | ||
1147 | _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS); | ||
1148 | } | ||
1149 | |||
1150 | static void __init set_trap_gate(unsigned int n, void *addr) | ||
1151 | { | ||
1152 | _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS); | ||
1153 | } | ||
1154 | |||
1155 | static void __init set_system_gate(unsigned int n, void *addr) | ||
1156 | { | ||
1157 | _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS); | ||
1158 | } | ||
1159 | |||
1160 | static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) | ||
1161 | { | ||
1162 | _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3)); | ||
1163 | } | ||
1164 | |||
1165 | |||
1166 | void __init trap_init(void) | ||
1167 | { | ||
1168 | #ifdef CONFIG_EISA | ||
1169 | void __iomem *p = ioremap(0x0FFFD9, 4); | ||
1170 | if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { | ||
1171 | EISA_bus = 1; | ||
1172 | } | ||
1173 | iounmap(p); | ||
1174 | #endif | ||
1175 | |||
1176 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1177 | init_apic_mappings(); | ||
1178 | #endif | ||
1179 | |||
1180 | set_trap_gate(0,÷_error); | ||
1181 | set_intr_gate(1,&debug); | ||
1182 | set_intr_gate(2,&nmi); | ||
1183 | set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ | ||
1184 | set_system_gate(4,&overflow); | ||
1185 | set_trap_gate(5,&bounds); | ||
1186 | set_trap_gate(6,&invalid_op); | ||
1187 | set_trap_gate(7,&device_not_available); | ||
1188 | set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS); | ||
1189 | set_trap_gate(9,&coprocessor_segment_overrun); | ||
1190 | set_trap_gate(10,&invalid_TSS); | ||
1191 | set_trap_gate(11,&segment_not_present); | ||
1192 | set_trap_gate(12,&stack_segment); | ||
1193 | set_trap_gate(13,&general_protection); | ||
1194 | set_intr_gate(14,&page_fault); | ||
1195 | set_trap_gate(15,&spurious_interrupt_bug); | ||
1196 | set_trap_gate(16,&coprocessor_error); | ||
1197 | set_trap_gate(17,&alignment_check); | ||
1198 | #ifdef CONFIG_X86_MCE | ||
1199 | set_trap_gate(18,&machine_check); | ||
1200 | #endif | ||
1201 | set_trap_gate(19,&simd_coprocessor_error); | ||
1202 | |||
1203 | if (cpu_has_fxsr) { | ||
1204 | /* | ||
1205 | * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. | ||
1206 | * Generates a compile-time "error: zero width for bit-field" if | ||
1207 | * the alignment is wrong. | ||
1208 | */ | ||
1209 | struct fxsrAlignAssert { | ||
1210 | int _:!(offsetof(struct task_struct, | ||
1211 | thread.i387.fxsave) & 15); | ||
1212 | }; | ||
1213 | |||
1214 | printk(KERN_INFO "Enabling fast FPU save and restore... "); | ||
1215 | set_in_cr4(X86_CR4_OSFXSR); | ||
1216 | printk("done.\n"); | ||
1217 | } | ||
1218 | if (cpu_has_xmm) { | ||
1219 | printk(KERN_INFO "Enabling unmasked SIMD FPU exception " | ||
1220 | "support... "); | ||
1221 | set_in_cr4(X86_CR4_OSXMMEXCPT); | ||
1222 | printk("done.\n"); | ||
1223 | } | ||
1224 | |||
1225 | set_system_gate(SYSCALL_VECTOR,&system_call); | ||
1226 | |||
1227 | /* | ||
1228 | * Should be a barrier for any external CPU state. | ||
1229 | */ | ||
1230 | cpu_init(); | ||
1231 | |||
1232 | trap_init_hook(); | ||
1233 | } | ||
1234 | |||
1235 | static int __init kstack_setup(char *s) | ||
1236 | { | ||
1237 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
1238 | return 1; | ||
1239 | } | ||
1240 | __setup("kstack=", kstack_setup); | ||
1241 | |||
1242 | static int __init code_bytes_setup(char *s) | ||
1243 | { | ||
1244 | code_bytes = simple_strtoul(s, NULL, 0); | ||
1245 | if (code_bytes > 8192) | ||
1246 | code_bytes = 8192; | ||
1247 | |||
1248 | return 1; | ||
1249 | } | ||
1250 | __setup("code_bytes=", code_bytes_setup); | ||
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c new file mode 100644 index 000000000000..a39280b4dd3a --- /dev/null +++ b/arch/x86/kernel/tsc_32.c | |||
@@ -0,0 +1,413 @@ | |||
1 | /* | ||
2 | * This code largely moved from arch/i386/kernel/timer/timer_tsc.c | ||
3 | * which was originally moved from arch/i386/kernel/time.c. | ||
4 | * See comments there for proper credits. | ||
5 | */ | ||
6 | |||
7 | #include <linux/sched.h> | ||
8 | #include <linux/clocksource.h> | ||
9 | #include <linux/workqueue.h> | ||
10 | #include <linux/cpufreq.h> | ||
11 | #include <linux/jiffies.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/dmi.h> | ||
14 | |||
15 | #include <asm/delay.h> | ||
16 | #include <asm/tsc.h> | ||
17 | #include <asm/io.h> | ||
18 | #include <asm/timer.h> | ||
19 | |||
20 | #include "mach_timer.h" | ||
21 | |||
22 | static int tsc_enabled; | ||
23 | |||
24 | /* | ||
25 | * On some systems the TSC frequency does not | ||
26 | * change with the cpu frequency. So we need | ||
27 | * an extra value to store the TSC freq | ||
28 | */ | ||
29 | unsigned int tsc_khz; | ||
30 | EXPORT_SYMBOL_GPL(tsc_khz); | ||
31 | |||
32 | int tsc_disable; | ||
33 | |||
34 | #ifdef CONFIG_X86_TSC | ||
35 | static int __init tsc_setup(char *str) | ||
36 | { | ||
37 | printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " | ||
38 | "cannot disable TSC.\n"); | ||
39 | return 1; | ||
40 | } | ||
41 | #else | ||
42 | /* | ||
43 | * disable flag for tsc. Takes effect by clearing the TSC cpu flag | ||
44 | * in cpu/common.c | ||
45 | */ | ||
46 | static int __init tsc_setup(char *str) | ||
47 | { | ||
48 | tsc_disable = 1; | ||
49 | |||
50 | return 1; | ||
51 | } | ||
52 | #endif | ||
53 | |||
54 | __setup("notsc", tsc_setup); | ||
55 | |||
56 | /* | ||
57 | * code to mark and check if the TSC is unstable | ||
58 | * due to cpufreq or due to unsynced TSCs | ||
59 | */ | ||
60 | static int tsc_unstable; | ||
61 | |||
62 | int check_tsc_unstable(void) | ||
63 | { | ||
64 | return tsc_unstable; | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(check_tsc_unstable); | ||
67 | |||
68 | /* Accelerators for sched_clock() | ||
69 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
70 | * basic equation: | ||
71 | * ns = cycles / (freq / ns_per_sec) | ||
72 | * ns = cycles * (ns_per_sec / freq) | ||
73 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
74 | * ns = cycles * (10^6 / cpu_khz) | ||
75 | * | ||
76 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
77 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
78 | * ns = cycles * cyc2ns_scale / SC | ||
79 | * | ||
80 | * And since SC is a constant power of two, we can convert the div | ||
81 | * into a shift. | ||
82 | * | ||
83 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
84 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
85 | * (mathieu.desnoyers@polymtl.ca) | ||
86 | * | ||
87 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
88 | */ | ||
89 | unsigned long cyc2ns_scale __read_mostly; | ||
90 | |||
91 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ | ||
92 | |||
93 | static inline void set_cyc2ns_scale(unsigned long cpu_khz) | ||
94 | { | ||
95 | cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; | ||
96 | } | ||
97 | |||
98 | /* | ||
99 | * Scheduler clock - returns current time in nanosec units. | ||
100 | */ | ||
101 | unsigned long long native_sched_clock(void) | ||
102 | { | ||
103 | unsigned long long this_offset; | ||
104 | |||
105 | /* | ||
106 | * Fall back to jiffies if there's no TSC available: | ||
107 | * ( But note that we still use it if the TSC is marked | ||
108 | * unstable. We do this because unlike Time Of Day, | ||
109 | * the scheduler clock tolerates small errors and it's | ||
110 | * very important for it to be as fast as the platform | ||
111 | * can achive it. ) | ||
112 | */ | ||
113 | if (unlikely(!tsc_enabled && !tsc_unstable)) | ||
114 | /* No locking but a rare wrong value is not a big deal: */ | ||
115 | return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); | ||
116 | |||
117 | /* read the Time Stamp Counter: */ | ||
118 | rdtscll(this_offset); | ||
119 | |||
120 | /* return the value in ns */ | ||
121 | return cycles_2_ns(this_offset); | ||
122 | } | ||
123 | |||
/*
 * A real sched_clock() must be defined here so that it overrides the
 * weak default version.  Without CONFIG_PARAVIRT it is simply an alias
 * of native_sched_clock(); with CONFIG_PARAVIRT it forwards to
 * paravirt_sched_clock().
 */
#ifndef CONFIG_PARAVIRT
unsigned long long sched_clock(void)
	__attribute__((alias("native_sched_clock")));
#else
unsigned long long sched_clock(void)
{
	return paravirt_sched_clock();
}
#endif
135 | |||
136 | unsigned long native_calculate_cpu_khz(void) | ||
137 | { | ||
138 | unsigned long long start, end; | ||
139 | unsigned long count; | ||
140 | u64 delta64; | ||
141 | int i; | ||
142 | unsigned long flags; | ||
143 | |||
144 | local_irq_save(flags); | ||
145 | |||
146 | /* run 3 times to ensure the cache is warm */ | ||
147 | for (i = 0; i < 3; i++) { | ||
148 | mach_prepare_counter(); | ||
149 | rdtscll(start); | ||
150 | mach_countup(&count); | ||
151 | rdtscll(end); | ||
152 | } | ||
153 | /* | ||
154 | * Error: ECTCNEVERSET | ||
155 | * The CTC wasn't reliable: we got a hit on the very first read, | ||
156 | * or the CPU was so fast/slow that the quotient wouldn't fit in | ||
157 | * 32 bits.. | ||
158 | */ | ||
159 | if (count <= 1) | ||
160 | goto err; | ||
161 | |||
162 | delta64 = end - start; | ||
163 | |||
164 | /* cpu freq too fast: */ | ||
165 | if (delta64 > (1ULL<<32)) | ||
166 | goto err; | ||
167 | |||
168 | /* cpu freq too slow: */ | ||
169 | if (delta64 <= CALIBRATE_TIME_MSEC) | ||
170 | goto err; | ||
171 | |||
172 | delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ | ||
173 | do_div(delta64,CALIBRATE_TIME_MSEC); | ||
174 | |||
175 | local_irq_restore(flags); | ||
176 | return (unsigned long)delta64; | ||
177 | err: | ||
178 | local_irq_restore(flags); | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | int recalibrate_cpu_khz(void) | ||
183 | { | ||
184 | #ifndef CONFIG_SMP | ||
185 | unsigned long cpu_khz_old = cpu_khz; | ||
186 | |||
187 | if (cpu_has_tsc) { | ||
188 | cpu_khz = calculate_cpu_khz(); | ||
189 | tsc_khz = cpu_khz; | ||
190 | cpu_data[0].loops_per_jiffy = | ||
191 | cpufreq_scale(cpu_data[0].loops_per_jiffy, | ||
192 | cpu_khz_old, cpu_khz); | ||
193 | return 0; | ||
194 | } else | ||
195 | return -ENODEV; | ||
196 | #else | ||
197 | return -ENODEV; | ||
198 | #endif | ||
199 | } | ||
200 | |||
201 | EXPORT_SYMBOL(recalibrate_cpu_khz); | ||
202 | |||
203 | #ifdef CONFIG_CPU_FREQ | ||
204 | |||
205 | /* | ||
206 | * if the CPU frequency is scaled, TSC-based delays will need a different | ||
207 | * loops_per_jiffy value to function properly. | ||
208 | */ | ||
209 | static unsigned int ref_freq = 0; | ||
210 | static unsigned long loops_per_jiffy_ref = 0; | ||
211 | static unsigned long cpu_khz_ref = 0; | ||
212 | |||
213 | static int | ||
214 | time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) | ||
215 | { | ||
216 | struct cpufreq_freqs *freq = data; | ||
217 | |||
218 | if (!ref_freq) { | ||
219 | if (!freq->old){ | ||
220 | ref_freq = freq->new; | ||
221 | return 0; | ||
222 | } | ||
223 | ref_freq = freq->old; | ||
224 | loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; | ||
225 | cpu_khz_ref = cpu_khz; | ||
226 | } | ||
227 | |||
228 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
229 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
230 | (val == CPUFREQ_RESUMECHANGE)) { | ||
231 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
232 | cpu_data[freq->cpu].loops_per_jiffy = | ||
233 | cpufreq_scale(loops_per_jiffy_ref, | ||
234 | ref_freq, freq->new); | ||
235 | |||
236 | if (cpu_khz) { | ||
237 | |||
238 | if (num_online_cpus() == 1) | ||
239 | cpu_khz = cpufreq_scale(cpu_khz_ref, | ||
240 | ref_freq, freq->new); | ||
241 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { | ||
242 | tsc_khz = cpu_khz; | ||
243 | set_cyc2ns_scale(cpu_khz); | ||
244 | /* | ||
245 | * TSC based sched_clock turns | ||
246 | * to junk w/ cpufreq | ||
247 | */ | ||
248 | mark_tsc_unstable("cpufreq changes"); | ||
249 | } | ||
250 | } | ||
251 | } | ||
252 | |||
253 | return 0; | ||
254 | } | ||
255 | |||
256 | static struct notifier_block time_cpufreq_notifier_block = { | ||
257 | .notifier_call = time_cpufreq_notifier | ||
258 | }; | ||
259 | |||
260 | static int __init cpufreq_tsc(void) | ||
261 | { | ||
262 | return cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
263 | CPUFREQ_TRANSITION_NOTIFIER); | ||
264 | } | ||
265 | core_initcall(cpufreq_tsc); | ||
266 | |||
267 | #endif | ||
268 | |||
269 | /* clock source code */ | ||
270 | |||
271 | static unsigned long current_tsc_khz = 0; | ||
272 | |||
273 | static cycle_t read_tsc(void) | ||
274 | { | ||
275 | cycle_t ret; | ||
276 | |||
277 | rdtscll(ret); | ||
278 | |||
279 | return ret; | ||
280 | } | ||
281 | |||
282 | static struct clocksource clocksource_tsc = { | ||
283 | .name = "tsc", | ||
284 | .rating = 300, | ||
285 | .read = read_tsc, | ||
286 | .mask = CLOCKSOURCE_MASK(64), | ||
287 | .mult = 0, /* to be set */ | ||
288 | .shift = 22, | ||
289 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | ||
290 | CLOCK_SOURCE_MUST_VERIFY, | ||
291 | }; | ||
292 | |||
293 | void mark_tsc_unstable(char *reason) | ||
294 | { | ||
295 | if (!tsc_unstable) { | ||
296 | tsc_unstable = 1; | ||
297 | tsc_enabled = 0; | ||
298 | printk("Marking TSC unstable due to: %s.\n", reason); | ||
299 | /* Can be called before registration */ | ||
300 | if (clocksource_tsc.mult) | ||
301 | clocksource_change_rating(&clocksource_tsc, 0); | ||
302 | else | ||
303 | clocksource_tsc.rating = 0; | ||
304 | } | ||
305 | } | ||
306 | EXPORT_SYMBOL_GPL(mark_tsc_unstable); | ||
307 | |||
308 | static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) | ||
309 | { | ||
310 | printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", | ||
311 | d->ident); | ||
312 | tsc_unstable = 1; | ||
313 | return 0; | ||
314 | } | ||
315 | |||
316 | /* List of systems that have known TSC problems */ | ||
317 | static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { | ||
318 | { | ||
319 | .callback = dmi_mark_tsc_unstable, | ||
320 | .ident = "IBM Thinkpad 380XD", | ||
321 | .matches = { | ||
322 | DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), | ||
323 | DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), | ||
324 | }, | ||
325 | }, | ||
326 | {} | ||
327 | }; | ||
328 | |||
329 | /* | ||
330 | * Make an educated guess if the TSC is trustworthy and synchronized | ||
331 | * over all CPUs. | ||
332 | */ | ||
333 | __cpuinit int unsynchronized_tsc(void) | ||
334 | { | ||
335 | if (!cpu_has_tsc || tsc_unstable) | ||
336 | return 1; | ||
337 | /* | ||
338 | * Intel systems are normally all synchronized. | ||
339 | * Exceptions must mark TSC as unstable: | ||
340 | */ | ||
341 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { | ||
342 | /* assume multi socket systems are not synchronized: */ | ||
343 | if (num_possible_cpus() > 1) | ||
344 | tsc_unstable = 1; | ||
345 | } | ||
346 | return tsc_unstable; | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * Geode_LX - the OLPC CPU possibly has a very reliable TSC | ||
351 | */ | ||
#ifndef CONFIG_MGEODE_LX
static inline void check_geode_tsc_reliable(void) { }
#else
/* RTSC counts during suspend */
#define RTSC_SUSP 0x100

/*
 * Read MSR_GEODE_BUSCONT_CONF0 and, when the RTSC_SUSP bit is set,
 * drop CLOCK_SOURCE_MUST_VERIFY from the TSC clocksource flags.
 */
static void __init check_geode_tsc_reliable(void)
{
	unsigned long busconf;

	rdmsrl(MSR_GEODE_BUSCONT_CONF0, busconf);
	if (busconf & RTSC_SUSP)
		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
#endif
367 | |||
368 | |||
369 | void __init tsc_init(void) | ||
370 | { | ||
371 | if (!cpu_has_tsc || tsc_disable) | ||
372 | goto out_no_tsc; | ||
373 | |||
374 | cpu_khz = calculate_cpu_khz(); | ||
375 | tsc_khz = cpu_khz; | ||
376 | |||
377 | if (!cpu_khz) | ||
378 | goto out_no_tsc; | ||
379 | |||
380 | printk("Detected %lu.%03lu MHz processor.\n", | ||
381 | (unsigned long)cpu_khz / 1000, | ||
382 | (unsigned long)cpu_khz % 1000); | ||
383 | |||
384 | set_cyc2ns_scale(cpu_khz); | ||
385 | use_tsc_delay(); | ||
386 | |||
387 | /* Check and install the TSC clocksource */ | ||
388 | dmi_check_system(bad_tsc_dmi_table); | ||
389 | |||
390 | unsynchronized_tsc(); | ||
391 | check_geode_tsc_reliable(); | ||
392 | current_tsc_khz = tsc_khz; | ||
393 | clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, | ||
394 | clocksource_tsc.shift); | ||
395 | /* lower the rating if we already know its unstable: */ | ||
396 | if (check_tsc_unstable()) { | ||
397 | clocksource_tsc.rating = 0; | ||
398 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; | ||
399 | } else | ||
400 | tsc_enabled = 1; | ||
401 | |||
402 | clocksource_register(&clocksource_tsc); | ||
403 | |||
404 | return; | ||
405 | |||
406 | out_no_tsc: | ||
407 | /* | ||
408 | * Set the tsc_disable flag if there's no TSC support, this | ||
409 | * makes it a fast flag for the kernel to see whether it | ||
410 | * should be using the TSC. | ||
411 | */ | ||
412 | tsc_disable = 1; | ||
413 | } | ||
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c new file mode 100644 index 000000000000..12424629af87 --- /dev/null +++ b/arch/x86/kernel/tsc_sync.c | |||
@@ -0,0 +1 @@ | |||
#include "../../x86_64/kernel/tsc_sync.c" | |||
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c new file mode 100644 index 000000000000..f2dcd1d27c0a --- /dev/null +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -0,0 +1,843 @@ | |||
1 | /* | ||
2 | * linux/kernel/vm86.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * | ||
6 | * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86 | ||
7 | * stack - Manfred Spraul <manfred@colorfullife.com> | ||
8 | * | ||
9 | * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle | ||
10 | * them correctly. Now the emulation will be in a | ||
11 | * consistent state after stackfaults - Kasper Dupont | ||
12 | * <kasperd@daimi.au.dk> | ||
13 | * | ||
14 | * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont | ||
15 | * <kasperd@daimi.au.dk> | ||
16 | * | ||
17 | * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault | ||
18 | * caused by Kasper Dupont's changes - Stas Sergeev | ||
19 | * | ||
20 | * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes. | ||
21 | * Kasper Dupont <kasperd@daimi.au.dk> | ||
22 | * | ||
23 | * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault. | ||
24 | * Kasper Dupont <kasperd@daimi.au.dk> | ||
25 | * | ||
26 | * 9 apr 2002 - Changed stack access macros to jump to a label | ||
27 | * instead of returning to userspace. This simplifies | ||
28 | * do_int, and is needed by handle_vm86_fault. Kasper | ||
29 | * Dupont <kasperd@daimi.au.dk> | ||
30 | * | ||
31 | */ | ||
32 | |||
33 | #include <linux/capability.h> | ||
34 | #include <linux/errno.h> | ||
35 | #include <linux/interrupt.h> | ||
36 | #include <linux/sched.h> | ||
37 | #include <linux/kernel.h> | ||
38 | #include <linux/signal.h> | ||
39 | #include <linux/string.h> | ||
40 | #include <linux/mm.h> | ||
41 | #include <linux/smp.h> | ||
42 | #include <linux/highmem.h> | ||
43 | #include <linux/ptrace.h> | ||
44 | #include <linux/audit.h> | ||
45 | #include <linux/stddef.h> | ||
46 | |||
47 | #include <asm/uaccess.h> | ||
48 | #include <asm/io.h> | ||
49 | #include <asm/tlbflush.h> | ||
50 | #include <asm/irq.h> | ||
51 | |||
52 | /* | ||
53 | * Known problems: | ||
54 | * | ||
55 | * Interrupt handling is not guaranteed: | ||
56 | * - a real x86 will disable all interrupts for one instruction | ||
57 | * after a "mov ss,xx" to make stack handling atomic even without | ||
58 | * the 'lss' instruction. We can't guarantee this in v86 mode, | ||
59 | * as the next instruction might result in a page fault or similar. | ||
60 | * - a real x86 will have interrupts disabled for one instruction | ||
61 | * past the 'sti' that enables them. We don't bother with all the | ||
62 | * details yet. | ||
63 | * | ||
64 | * Let's hope these problems do not actually matter for anything. | ||
65 | */ | ||
66 | |||
67 | |||
68 | #define KVM86 ((struct kernel_vm86_struct *)regs) | ||
69 | #define VMPI KVM86->vm86plus | ||
70 | |||
71 | |||
72 | /* | ||
73 | * 8- and 16-bit register defines.. | ||
74 | */ | ||
75 | #define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) | ||
76 | #define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) | ||
77 | #define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) | ||
78 | #define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) | ||
79 | |||
80 | /* | ||
81 | * virtual flags (16 and 32-bit versions) | ||
82 | */ | ||
83 | #define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) | ||
84 | #define VEFLAGS (current->thread.v86flags) | ||
85 | |||
86 | #define set_flags(X,new,mask) \ | ||
87 | ((X) = ((X) & ~(mask)) | ((new) & (mask))) | ||
88 | |||
89 | #define SAFE_MASK (0xDD5) | ||
90 | #define RETURN_MASK (0xDFF) | ||
91 | |||
92 | /* convert kernel_vm86_regs to vm86_regs */ | ||
93 | static int copy_vm86_regs_to_user(struct vm86_regs __user *user, | ||
94 | const struct kernel_vm86_regs *regs) | ||
95 | { | ||
96 | int ret = 0; | ||
97 | |||
98 | /* kernel_vm86_regs is missing xgs, so copy everything up to | ||
99 | (but not including) orig_eax, and then rest including orig_eax. */ | ||
100 | ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); | ||
101 | ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, | ||
102 | sizeof(struct kernel_vm86_regs) - | ||
103 | offsetof(struct kernel_vm86_regs, pt.orig_eax)); | ||
104 | |||
105 | return ret; | ||
106 | } | ||
107 | |||
108 | /* convert vm86_regs to kernel_vm86_regs */ | ||
109 | static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, | ||
110 | const struct vm86_regs __user *user, | ||
111 | unsigned extra) | ||
112 | { | ||
113 | int ret = 0; | ||
114 | |||
115 | /* copy eax-xfs inclusive */ | ||
116 | ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); | ||
117 | /* copy orig_eax-__gsh+extra */ | ||
118 | ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, | ||
119 | sizeof(struct kernel_vm86_regs) - | ||
120 | offsetof(struct kernel_vm86_regs, pt.orig_eax) + | ||
121 | extra); | ||
122 | return ret; | ||
123 | } | ||
124 | |||
125 | struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); | ||
126 | struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) | ||
127 | { | ||
128 | struct tss_struct *tss; | ||
129 | struct pt_regs *ret; | ||
130 | unsigned long tmp; | ||
131 | |||
132 | /* | ||
133 | * This gets called from entry.S with interrupts disabled, but | ||
134 | * from process context. Enable interrupts here, before trying | ||
135 | * to access user space. | ||
136 | */ | ||
137 | local_irq_enable(); | ||
138 | |||
139 | if (!current->thread.vm86_info) { | ||
140 | printk("no vm86_info: BAD\n"); | ||
141 | do_exit(SIGSEGV); | ||
142 | } | ||
143 | set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); | ||
144 | tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs,regs); | ||
145 | tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); | ||
146 | if (tmp) { | ||
147 | printk("vm86: could not access userspace vm86_info\n"); | ||
148 | do_exit(SIGSEGV); | ||
149 | } | ||
150 | |||
151 | tss = &per_cpu(init_tss, get_cpu()); | ||
152 | current->thread.esp0 = current->thread.saved_esp0; | ||
153 | current->thread.sysenter_cs = __KERNEL_CS; | ||
154 | load_esp0(tss, ¤t->thread); | ||
155 | current->thread.saved_esp0 = 0; | ||
156 | put_cpu(); | ||
157 | |||
158 | ret = KVM86->regs32; | ||
159 | |||
160 | ret->xfs = current->thread.saved_fs; | ||
161 | loadsegment(gs, current->thread.saved_gs); | ||
162 | |||
163 | return ret; | ||
164 | } | ||
165 | |||
166 | static void mark_screen_rdonly(struct mm_struct *mm) | ||
167 | { | ||
168 | pgd_t *pgd; | ||
169 | pud_t *pud; | ||
170 | pmd_t *pmd; | ||
171 | pte_t *pte; | ||
172 | spinlock_t *ptl; | ||
173 | int i; | ||
174 | |||
175 | pgd = pgd_offset(mm, 0xA0000); | ||
176 | if (pgd_none_or_clear_bad(pgd)) | ||
177 | goto out; | ||
178 | pud = pud_offset(pgd, 0xA0000); | ||
179 | if (pud_none_or_clear_bad(pud)) | ||
180 | goto out; | ||
181 | pmd = pmd_offset(pud, 0xA0000); | ||
182 | if (pmd_none_or_clear_bad(pmd)) | ||
183 | goto out; | ||
184 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); | ||
185 | for (i = 0; i < 32; i++) { | ||
186 | if (pte_present(*pte)) | ||
187 | set_pte(pte, pte_wrprotect(*pte)); | ||
188 | pte++; | ||
189 | } | ||
190 | pte_unmap_unlock(pte, ptl); | ||
191 | out: | ||
192 | flush_tlb(); | ||
193 | } | ||
194 | |||
195 | |||
196 | |||
197 | static int do_vm86_irq_handling(int subfunction, int irqnumber); | ||
198 | static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); | ||
199 | |||
200 | asmlinkage int sys_vm86old(struct pt_regs regs) | ||
201 | { | ||
202 | struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; | ||
203 | struct kernel_vm86_struct info; /* declare this _on top_, | ||
204 | * this avoids wasting of stack space. | ||
205 | * This remains on the stack until we | ||
206 | * return to 32 bit user space. | ||
207 | */ | ||
208 | struct task_struct *tsk; | ||
209 | int tmp, ret = -EPERM; | ||
210 | |||
211 | tsk = current; | ||
212 | if (tsk->thread.saved_esp0) | ||
213 | goto out; | ||
214 | tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, | ||
215 | offsetof(struct kernel_vm86_struct, vm86plus) - | ||
216 | sizeof(info.regs)); | ||
217 | ret = -EFAULT; | ||
218 | if (tmp) | ||
219 | goto out; | ||
220 | memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); | ||
221 | info.regs32 = ®s; | ||
222 | tsk->thread.vm86_info = v86; | ||
223 | do_sys_vm86(&info, tsk); | ||
224 | ret = 0; /* we never return here */ | ||
225 | out: | ||
226 | return ret; | ||
227 | } | ||
228 | |||
229 | |||
230 | asmlinkage int sys_vm86(struct pt_regs regs) | ||
231 | { | ||
232 | struct kernel_vm86_struct info; /* declare this _on top_, | ||
233 | * this avoids wasting of stack space. | ||
234 | * This remains on the stack until we | ||
235 | * return to 32 bit user space. | ||
236 | */ | ||
237 | struct task_struct *tsk; | ||
238 | int tmp, ret; | ||
239 | struct vm86plus_struct __user *v86; | ||
240 | |||
241 | tsk = current; | ||
242 | switch (regs.ebx) { | ||
243 | case VM86_REQUEST_IRQ: | ||
244 | case VM86_FREE_IRQ: | ||
245 | case VM86_GET_IRQ_BITS: | ||
246 | case VM86_GET_AND_RESET_IRQ: | ||
247 | ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); | ||
248 | goto out; | ||
249 | case VM86_PLUS_INSTALL_CHECK: | ||
250 | /* NOTE: on old vm86 stuff this will return the error | ||
251 | from access_ok(), because the subfunction is | ||
252 | interpreted as (invalid) address to vm86_struct. | ||
253 | So the installation check works. | ||
254 | */ | ||
255 | ret = 0; | ||
256 | goto out; | ||
257 | } | ||
258 | |||
259 | /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ | ||
260 | ret = -EPERM; | ||
261 | if (tsk->thread.saved_esp0) | ||
262 | goto out; | ||
263 | v86 = (struct vm86plus_struct __user *)regs.ecx; | ||
264 | tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, | ||
265 | offsetof(struct kernel_vm86_struct, regs32) - | ||
266 | sizeof(info.regs)); | ||
267 | ret = -EFAULT; | ||
268 | if (tmp) | ||
269 | goto out; | ||
270 | info.regs32 = ®s; | ||
271 | info.vm86plus.is_vm86pus = 1; | ||
272 | tsk->thread.vm86_info = (struct vm86_struct __user *)v86; | ||
273 | do_sys_vm86(&info, tsk); | ||
274 | ret = 0; /* we never return here */ | ||
275 | out: | ||
276 | return ret; | ||
277 | } | ||
278 | |||
279 | |||
280 | static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) | ||
281 | { | ||
282 | struct tss_struct *tss; | ||
283 | /* | ||
284 | * make sure the vm86() system call doesn't try to do anything silly | ||
285 | */ | ||
286 | info->regs.pt.xds = 0; | ||
287 | info->regs.pt.xes = 0; | ||
288 | info->regs.pt.xfs = 0; | ||
289 | |||
290 | /* we are clearing gs later just before "jmp resume_userspace", | ||
291 | * because it is not saved/restored. | ||
292 | */ | ||
293 | |||
294 | /* | ||
295 | * The eflags register is also special: we cannot trust that the user | ||
296 | * has set it up safely, so this makes sure interrupt etc flags are | ||
297 | * inherited from protected mode. | ||
298 | */ | ||
299 | VEFLAGS = info->regs.pt.eflags; | ||
300 | info->regs.pt.eflags &= SAFE_MASK; | ||
301 | info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; | ||
302 | info->regs.pt.eflags |= VM_MASK; | ||
303 | |||
304 | switch (info->cpu_type) { | ||
305 | case CPU_286: | ||
306 | tsk->thread.v86mask = 0; | ||
307 | break; | ||
308 | case CPU_386: | ||
309 | tsk->thread.v86mask = NT_MASK | IOPL_MASK; | ||
310 | break; | ||
311 | case CPU_486: | ||
312 | tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK; | ||
313 | break; | ||
314 | default: | ||
315 | tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; | ||
316 | break; | ||
317 | } | ||
318 | |||
319 | /* | ||
320 | * Save old state, set default return value (%eax) to 0 | ||
321 | */ | ||
322 | info->regs32->eax = 0; | ||
323 | tsk->thread.saved_esp0 = tsk->thread.esp0; | ||
324 | tsk->thread.saved_fs = info->regs32->xfs; | ||
325 | savesegment(gs, tsk->thread.saved_gs); | ||
326 | |||
327 | tss = &per_cpu(init_tss, get_cpu()); | ||
328 | tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; | ||
329 | if (cpu_has_sep) | ||
330 | tsk->thread.sysenter_cs = 0; | ||
331 | load_esp0(tss, &tsk->thread); | ||
332 | put_cpu(); | ||
333 | |||
334 | tsk->thread.screen_bitmap = info->screen_bitmap; | ||
335 | if (info->flags & VM86_SCREEN_BITMAP) | ||
336 | mark_screen_rdonly(tsk->mm); | ||
337 | |||
338 | /*call audit_syscall_exit since we do not exit via the normal paths */ | ||
339 | if (unlikely(current->audit_context)) | ||
340 | audit_syscall_exit(AUDITSC_RESULT(0), 0); | ||
341 | |||
342 | __asm__ __volatile__( | ||
343 | "movl %0,%%esp\n\t" | ||
344 | "movl %1,%%ebp\n\t" | ||
345 | "mov %2, %%gs\n\t" | ||
346 | "jmp resume_userspace" | ||
347 | : /* no outputs */ | ||
348 | :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); | ||
349 | /* we never return here */ | ||
350 | } | ||
351 | |||
352 | static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval) | ||
353 | { | ||
354 | struct pt_regs * regs32; | ||
355 | |||
356 | regs32 = save_v86_state(regs16); | ||
357 | regs32->eax = retval; | ||
358 | __asm__ __volatile__("movl %0,%%esp\n\t" | ||
359 | "movl %1,%%ebp\n\t" | ||
360 | "jmp resume_userspace" | ||
361 | : : "r" (regs32), "r" (current_thread_info())); | ||
362 | } | ||
363 | |||
364 | static inline void set_IF(struct kernel_vm86_regs * regs) | ||
365 | { | ||
366 | VEFLAGS |= VIF_MASK; | ||
367 | if (VEFLAGS & VIP_MASK) | ||
368 | return_to_32bit(regs, VM86_STI); | ||
369 | } | ||
370 | |||
371 | static inline void clear_IF(struct kernel_vm86_regs * regs) | ||
372 | { | ||
373 | VEFLAGS &= ~VIF_MASK; | ||
374 | } | ||
375 | |||
376 | static inline void clear_TF(struct kernel_vm86_regs * regs) | ||
377 | { | ||
378 | regs->pt.eflags &= ~TF_MASK; | ||
379 | } | ||
380 | |||
381 | static inline void clear_AC(struct kernel_vm86_regs * regs) | ||
382 | { | ||
383 | regs->pt.eflags &= ~AC_MASK; | ||
384 | } | ||
385 | |||
386 | /* It is correct to call set_IF(regs) from the set_vflags_* | ||
387 | * functions. However someone forgot to call clear_IF(regs) | ||
388 | * in the opposite case. | ||
389 | * After the command sequence CLI PUSHF STI POPF you should | ||
390 | * end up with interrupts disabled, but you ended up with | ||
391 | * interrupts enabled. | ||
392 | * ( I was testing my own changes, but the only bug I | ||
393 | * could find was in a function I had not changed. ) | ||
394 | * [KD] | ||
395 | */ | ||
396 | |||
397 | static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) | ||
398 | { | ||
399 | set_flags(VEFLAGS, eflags, current->thread.v86mask); | ||
400 | set_flags(regs->pt.eflags, eflags, SAFE_MASK); | ||
401 | if (eflags & IF_MASK) | ||
402 | set_IF(regs); | ||
403 | else | ||
404 | clear_IF(regs); | ||
405 | } | ||
406 | |||
407 | static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) | ||
408 | { | ||
409 | set_flags(VFLAGS, flags, current->thread.v86mask); | ||
410 | set_flags(regs->pt.eflags, flags, SAFE_MASK); | ||
411 | if (flags & IF_MASK) | ||
412 | set_IF(regs); | ||
413 | else | ||
414 | clear_IF(regs); | ||
415 | } | ||
416 | |||
417 | static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) | ||
418 | { | ||
419 | unsigned long flags = regs->pt.eflags & RETURN_MASK; | ||
420 | |||
421 | if (VEFLAGS & VIF_MASK) | ||
422 | flags |= IF_MASK; | ||
423 | flags |= IOPL_MASK; | ||
424 | return flags | (VEFLAGS & current->thread.v86mask); | ||
425 | } | ||
426 | |||
/*
 * Test bit @nr of @bitmap.  "btl" copies the bit into the carry flag
 * and "sbbl reg,reg" turns the carry into 0 (bit clear) or -1 (bit
 * set), so the result is non-zero exactly when the bit is set.
 */
static inline int is_revectored(int nr, struct revectored_struct * bitmap)
{
	int hit;

	__asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
		:"=r" (hit)
		:"m" (*bitmap),"r" (nr));
	return hit;
}
434 | |||
/* Byte n of the object val; val must be an lvalue. */
#define val_byte(val, n) (((__u8 *)&(val))[n])

/*
 * pushb/pushw/pushl: push an 8/16/32-bit value onto the user-space
 * stack at base + ptr, one byte at a time with put_user(), highest
 * byte first so the value ends up in little-endian order.  ptr is
 * decremented before each byte is stored and must be a modifiable
 * lvalue.  Control jumps to err_label as soon as a put_user() fails;
 * ptr keeps whatever decrements were already applied.
 *
 * Macro arguments are parenthesized so that expression arguments
 * expand safely.
 */
#define pushb(base, ptr, val, err_label) \
	do { \
		__u8 __val = (val); \
		(ptr)--; \
		if (put_user(__val, (base) + (ptr)) < 0) \
			goto err_label; \
	} while(0)

#define pushw(base, ptr, val, err_label) \
	do { \
		__u16 __val = (val); \
		(ptr)--; \
		if (put_user(val_byte(__val, 1), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)--; \
		if (put_user(val_byte(__val, 0), (base) + (ptr)) < 0) \
			goto err_label; \
	} while(0)

#define pushl(base, ptr, val, err_label) \
	do { \
		__u32 __val = (val); \
		(ptr)--; \
		if (put_user(val_byte(__val, 3), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)--; \
		if (put_user(val_byte(__val, 2), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)--; \
		if (put_user(val_byte(__val, 1), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)--; \
		if (put_user(val_byte(__val, 0), (base) + (ptr)) < 0) \
			goto err_label; \
	} while(0)
472 | |||
/*
 * popb/popw/popl: read an 8/16/32-bit little-endian value from user
 * space at base + ptr, one byte at a time with get_user(), lowest byte
 * first.  ptr is incremented after each byte that was read and must be
 * a modifiable lvalue.  Each macro is a statement expression whose
 * value is the assembled result; control jumps to err_label as soon as
 * a get_user() fails.
 *
 * Macro arguments are parenthesized so that expression arguments
 * expand safely.
 */
#define popb(base, ptr, err_label) \
	({ \
		__u8 __res; \
		if (get_user(__res, (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)++; \
		__res; \
	})

#define popw(base, ptr, err_label) \
	({ \
		__u16 __res; \
		if (get_user(val_byte(__res, 0), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)++; \
		if (get_user(val_byte(__res, 1), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)++; \
		__res; \
	})

#define popl(base, ptr, err_label) \
	({ \
		__u32 __res; \
		if (get_user(val_byte(__res, 0), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)++; \
		if (get_user(val_byte(__res, 1), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)++; \
		if (get_user(val_byte(__res, 2), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)++; \
		if (get_user(val_byte(__res, 3), (base) + (ptr)) < 0) \
			goto err_label; \
		(ptr)++; \
		__res; \
	})
511 | |||
512 | /* There are so many possible reasons for this function to return | ||
513 | * VM86_INTx, so adding another doesn't bother me. We can expect | ||
514 | * userspace programs to be able to handle it. (Getting a problem | ||
515 | * in userspace is always better than an Oops anyway.) [KD] | ||
516 | */ | ||
517 | static void do_int(struct kernel_vm86_regs *regs, int i, | ||
518 | unsigned char __user * ssp, unsigned short sp) | ||
519 | { | ||
520 | unsigned long __user *intr_ptr; | ||
521 | unsigned long segoffs; | ||
522 | |||
523 | if (regs->pt.xcs == BIOSSEG) | ||
524 | goto cannot_handle; | ||
525 | if (is_revectored(i, &KVM86->int_revectored)) | ||
526 | goto cannot_handle; | ||
527 | if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored)) | ||
528 | goto cannot_handle; | ||
529 | intr_ptr = (unsigned long __user *) (i << 2); | ||
530 | if (get_user(segoffs, intr_ptr)) | ||
531 | goto cannot_handle; | ||
532 | if ((segoffs >> 16) == BIOSSEG) | ||
533 | goto cannot_handle; | ||
534 | pushw(ssp, sp, get_vflags(regs), cannot_handle); | ||
535 | pushw(ssp, sp, regs->pt.xcs, cannot_handle); | ||
536 | pushw(ssp, sp, IP(regs), cannot_handle); | ||
537 | regs->pt.xcs = segoffs >> 16; | ||
538 | SP(regs) -= 6; | ||
539 | IP(regs) = segoffs & 0xffff; | ||
540 | clear_TF(regs); | ||
541 | clear_IF(regs); | ||
542 | clear_AC(regs); | ||
543 | return; | ||
544 | |||
545 | cannot_handle: | ||
546 | return_to_32bit(regs, VM86_INTx + (i << 8)); | ||
547 | } | ||
548 | |||
549 | int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno) | ||
550 | { | ||
551 | if (VMPI.is_vm86pus) { | ||
552 | if ( (trapno==3) || (trapno==1) ) | ||
553 | return_to_32bit(regs, VM86_TRAP + (trapno << 8)); | ||
554 | do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); | ||
555 | return 0; | ||
556 | } | ||
557 | if (trapno !=1) | ||
558 | return 1; /* we let this handle by the calling routine */ | ||
559 | if (current->ptrace & PT_PTRACED) { | ||
560 | unsigned long flags; | ||
561 | spin_lock_irqsave(¤t->sighand->siglock, flags); | ||
562 | sigdelset(¤t->blocked, SIGTRAP); | ||
563 | recalc_sigpending(); | ||
564 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); | ||
565 | } | ||
566 | send_sig(SIGTRAP, current, 1); | ||
567 | current->thread.trap_no = trapno; | ||
568 | current->thread.error_code = error_code; | ||
569 | return 0; | ||
570 | } | ||
571 | |||
572 | void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) | ||
573 | { | ||
574 | unsigned char opcode; | ||
575 | unsigned char __user *csp; | ||
576 | unsigned char __user *ssp; | ||
577 | unsigned short ip, sp, orig_flags; | ||
578 | int data32, pref_done; | ||
579 | |||
580 | #define CHECK_IF_IN_TRAP \ | ||
581 | if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ | ||
582 | newflags |= TF_MASK | ||
583 | #define VM86_FAULT_RETURN do { \ | ||
584 | if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \ | ||
585 | return_to_32bit(regs, VM86_PICRETURN); \ | ||
586 | if (orig_flags & TF_MASK) \ | ||
587 | handle_vm86_trap(regs, 0, 1); \ | ||
588 | return; } while (0) | ||
589 | |||
590 | orig_flags = *(unsigned short *)®s->pt.eflags; | ||
591 | |||
592 | csp = (unsigned char __user *) (regs->pt.xcs << 4); | ||
593 | ssp = (unsigned char __user *) (regs->pt.xss << 4); | ||
594 | sp = SP(regs); | ||
595 | ip = IP(regs); | ||
596 | |||
597 | data32 = 0; | ||
598 | pref_done = 0; | ||
599 | do { | ||
600 | switch (opcode = popb(csp, ip, simulate_sigsegv)) { | ||
601 | case 0x66: /* 32-bit data */ data32=1; break; | ||
602 | case 0x67: /* 32-bit address */ break; | ||
603 | case 0x2e: /* CS */ break; | ||
604 | case 0x3e: /* DS */ break; | ||
605 | case 0x26: /* ES */ break; | ||
606 | case 0x36: /* SS */ break; | ||
607 | case 0x65: /* GS */ break; | ||
608 | case 0x64: /* FS */ break; | ||
609 | case 0xf2: /* repnz */ break; | ||
610 | case 0xf3: /* rep */ break; | ||
611 | default: pref_done = 1; | ||
612 | } | ||
613 | } while (!pref_done); | ||
614 | |||
615 | switch (opcode) { | ||
616 | |||
617 | /* pushf */ | ||
618 | case 0x9c: | ||
619 | if (data32) { | ||
620 | pushl(ssp, sp, get_vflags(regs), simulate_sigsegv); | ||
621 | SP(regs) -= 4; | ||
622 | } else { | ||
623 | pushw(ssp, sp, get_vflags(regs), simulate_sigsegv); | ||
624 | SP(regs) -= 2; | ||
625 | } | ||
626 | IP(regs) = ip; | ||
627 | VM86_FAULT_RETURN; | ||
628 | |||
629 | /* popf */ | ||
630 | case 0x9d: | ||
631 | { | ||
632 | unsigned long newflags; | ||
633 | if (data32) { | ||
634 | newflags=popl(ssp, sp, simulate_sigsegv); | ||
635 | SP(regs) += 4; | ||
636 | } else { | ||
637 | newflags = popw(ssp, sp, simulate_sigsegv); | ||
638 | SP(regs) += 2; | ||
639 | } | ||
640 | IP(regs) = ip; | ||
641 | CHECK_IF_IN_TRAP; | ||
642 | if (data32) { | ||
643 | set_vflags_long(newflags, regs); | ||
644 | } else { | ||
645 | set_vflags_short(newflags, regs); | ||
646 | } | ||
647 | VM86_FAULT_RETURN; | ||
648 | } | ||
649 | |||
650 | /* int xx */ | ||
651 | case 0xcd: { | ||
652 | int intno=popb(csp, ip, simulate_sigsegv); | ||
653 | IP(regs) = ip; | ||
654 | if (VMPI.vm86dbg_active) { | ||
655 | if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] ) | ||
656 | return_to_32bit(regs, VM86_INTx + (intno << 8)); | ||
657 | } | ||
658 | do_int(regs, intno, ssp, sp); | ||
659 | return; | ||
660 | } | ||
661 | |||
662 | /* iret */ | ||
663 | case 0xcf: | ||
664 | { | ||
665 | unsigned long newip; | ||
666 | unsigned long newcs; | ||
667 | unsigned long newflags; | ||
668 | if (data32) { | ||
669 | newip=popl(ssp, sp, simulate_sigsegv); | ||
670 | newcs=popl(ssp, sp, simulate_sigsegv); | ||
671 | newflags=popl(ssp, sp, simulate_sigsegv); | ||
672 | SP(regs) += 12; | ||
673 | } else { | ||
674 | newip = popw(ssp, sp, simulate_sigsegv); | ||
675 | newcs = popw(ssp, sp, simulate_sigsegv); | ||
676 | newflags = popw(ssp, sp, simulate_sigsegv); | ||
677 | SP(regs) += 6; | ||
678 | } | ||
679 | IP(regs) = newip; | ||
680 | regs->pt.xcs = newcs; | ||
681 | CHECK_IF_IN_TRAP; | ||
682 | if (data32) { | ||
683 | set_vflags_long(newflags, regs); | ||
684 | } else { | ||
685 | set_vflags_short(newflags, regs); | ||
686 | } | ||
687 | VM86_FAULT_RETURN; | ||
688 | } | ||
689 | |||
690 | /* cli */ | ||
691 | case 0xfa: | ||
692 | IP(regs) = ip; | ||
693 | clear_IF(regs); | ||
694 | VM86_FAULT_RETURN; | ||
695 | |||
696 | /* sti */ | ||
697 | /* | ||
698 | * Damn. This is incorrect: the 'sti' instruction should actually | ||
699 | * enable interrupts after the /next/ instruction. Not good. | ||
700 | * | ||
701 | * Probably needs some horsing around with the TF flag. Aiee.. | ||
702 | */ | ||
703 | case 0xfb: | ||
704 | IP(regs) = ip; | ||
705 | set_IF(regs); | ||
706 | VM86_FAULT_RETURN; | ||
707 | |||
708 | default: | ||
709 | return_to_32bit(regs, VM86_UNKNOWN); | ||
710 | } | ||
711 | |||
712 | return; | ||
713 | |||
714 | simulate_sigsegv: | ||
715 | /* FIXME: After a long discussion with Stas we finally | ||
716 | * agreed, that this is wrong. Here we should | ||
717 | * really send a SIGSEGV to the user program. | ||
718 | * But how do we create the correct context? We | ||
719 | * are inside a general protection fault handler | ||
720 | * and has just returned from a page fault handler. | ||
721 | * The correct context for the signal handler | ||
722 | * should be a mixture of the two, but how do we | ||
723 | * get the information? [KD] | ||
724 | */ | ||
725 | return_to_32bit(regs, VM86_UNKNOWN); | ||
726 | } | ||
727 | |||
728 | /* ---------------- vm86 special IRQ passing stuff ----------------- */ | ||
729 | |||
/* Name under which vm86 IRQs are registered with request_irq(). */
#define VM86_IRQNAME		"vm86irq"

/* Per-IRQ owner: the task that requested it and the signal to send it. */
static struct vm86_irqs {
	struct task_struct *tsk;	/* owning task, NULL when unclaimed */
	int sig;			/* signal to raise, 0 = none */
} vm86_irqs[16];

/* irqbits holds one bit per IRQ that has fired and not been collected yet. */
static DEFINE_SPINLOCK(irqbits_lock);
static int irqbits;

/* Bitmask of signal numbers a task may ask to receive for an IRQ. */
#define ALLOWED_SIGS ( 1	/* 0 = don't send a signal */ \
	| (1 << SIGUSR1) | (1 << SIGUSR2) \
	| (1 << SIGIO)   | (1 << SIGURG) \
	| (1 << SIGUNUSED) )
743 | |||
744 | static irqreturn_t irq_handler(int intno, void *dev_id) | ||
745 | { | ||
746 | int irq_bit; | ||
747 | unsigned long flags; | ||
748 | |||
749 | spin_lock_irqsave(&irqbits_lock, flags); | ||
750 | irq_bit = 1 << intno; | ||
751 | if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk) | ||
752 | goto out; | ||
753 | irqbits |= irq_bit; | ||
754 | if (vm86_irqs[intno].sig) | ||
755 | send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1); | ||
756 | /* | ||
757 | * IRQ will be re-enabled when user asks for the irq (whether | ||
758 | * polling or as a result of the signal) | ||
759 | */ | ||
760 | disable_irq_nosync(intno); | ||
761 | spin_unlock_irqrestore(&irqbits_lock, flags); | ||
762 | return IRQ_HANDLED; | ||
763 | |||
764 | out: | ||
765 | spin_unlock_irqrestore(&irqbits_lock, flags); | ||
766 | return IRQ_NONE; | ||
767 | } | ||
768 | |||
769 | static inline void free_vm86_irq(int irqnumber) | ||
770 | { | ||
771 | unsigned long flags; | ||
772 | |||
773 | free_irq(irqnumber, NULL); | ||
774 | vm86_irqs[irqnumber].tsk = NULL; | ||
775 | |||
776 | spin_lock_irqsave(&irqbits_lock, flags); | ||
777 | irqbits &= ~(1 << irqnumber); | ||
778 | spin_unlock_irqrestore(&irqbits_lock, flags); | ||
779 | } | ||
780 | |||
781 | void release_vm86_irqs(struct task_struct *task) | ||
782 | { | ||
783 | int i; | ||
784 | for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++) | ||
785 | if (vm86_irqs[i].tsk == task) | ||
786 | free_vm86_irq(i); | ||
787 | } | ||
788 | |||
789 | static inline int get_and_reset_irq(int irqnumber) | ||
790 | { | ||
791 | int bit; | ||
792 | unsigned long flags; | ||
793 | int ret = 0; | ||
794 | |||
795 | if (invalid_vm86_irq(irqnumber)) return 0; | ||
796 | if (vm86_irqs[irqnumber].tsk != current) return 0; | ||
797 | spin_lock_irqsave(&irqbits_lock, flags); | ||
798 | bit = irqbits & (1 << irqnumber); | ||
799 | irqbits &= ~bit; | ||
800 | if (bit) { | ||
801 | enable_irq(irqnumber); | ||
802 | ret = 1; | ||
803 | } | ||
804 | |||
805 | spin_unlock_irqrestore(&irqbits_lock, flags); | ||
806 | return ret; | ||
807 | } | ||
808 | |||
809 | |||
810 | static int do_vm86_irq_handling(int subfunction, int irqnumber) | ||
811 | { | ||
812 | int ret; | ||
813 | switch (subfunction) { | ||
814 | case VM86_GET_AND_RESET_IRQ: { | ||
815 | return get_and_reset_irq(irqnumber); | ||
816 | } | ||
817 | case VM86_GET_IRQ_BITS: { | ||
818 | return irqbits; | ||
819 | } | ||
820 | case VM86_REQUEST_IRQ: { | ||
821 | int sig = irqnumber >> 8; | ||
822 | int irq = irqnumber & 255; | ||
823 | if (!capable(CAP_SYS_ADMIN)) return -EPERM; | ||
824 | if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM; | ||
825 | if (invalid_vm86_irq(irq)) return -EPERM; | ||
826 | if (vm86_irqs[irq].tsk) return -EPERM; | ||
827 | ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL); | ||
828 | if (ret) return ret; | ||
829 | vm86_irqs[irq].sig = sig; | ||
830 | vm86_irqs[irq].tsk = current; | ||
831 | return irq; | ||
832 | } | ||
833 | case VM86_FREE_IRQ: { | ||
834 | if (invalid_vm86_irq(irqnumber)) return -EPERM; | ||
835 | if (!vm86_irqs[irqnumber].tsk) return 0; | ||
836 | if (vm86_irqs[irqnumber].tsk != current) return -EPERM; | ||
837 | free_vm86_irq(irqnumber); | ||
838 | return 0; | ||
839 | } | ||
840 | } | ||
841 | return -EINVAL; | ||
842 | } | ||
843 | |||
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c new file mode 100644 index 000000000000..18673e0f193b --- /dev/null +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -0,0 +1,981 @@ | |||
1 | /* | ||
2 | * VMI specific paravirt-ops implementation | ||
3 | * | ||
4 | * Copyright (C) 2005, VMware, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | * Send feedback to zach@vmware.com | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/cpu.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <asm/vmi.h> | ||
32 | #include <asm/io.h> | ||
33 | #include <asm/fixmap.h> | ||
34 | #include <asm/apicdef.h> | ||
35 | #include <asm/apic.h> | ||
36 | #include <asm/processor.h> | ||
37 | #include <asm/timer.h> | ||
38 | #include <asm/vmi_time.h> | ||
39 | #include <asm/kmap_types.h> | ||
40 | |||
41 | /* Convenient for calling VMI functions indirectly in the ROM */ | ||
42 | typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); | ||
43 | typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); | ||
44 | |||
45 | #define call_vrom_func(rom,func) \ | ||
46 | (((VROMFUNC *)(rom->func))()) | ||
47 | |||
48 | #define call_vrom_long_func(rom,func,arg) \ | ||
49 | (((VROMLONGFUNC *)(rom->func)) (arg)) | ||
50 | |||
/* Header of the VMI ROM that ROM calls are made through. */
static struct vrom_header *vmi_rom;

/* When set, vmi_cpuid() hides the matching feature from CPUID leaf 1. */
static int disable_pge;
static int disable_pse;
static int disable_sep;
static int disable_tsc;
static int disable_mtrr;

/* NOTE(review): not referenced in this part of the file; presumably boot options. */
static int disable_noidle;
static int disable_vmi_timer;
59 | |||
60 | /* Cached VMI operations */ | ||
61 | static struct { | ||
62 | void (*cpuid)(void /* non-c */); | ||
63 | void (*_set_ldt)(u32 selector); | ||
64 | void (*set_tr)(u32 selector); | ||
65 | void (*set_kernel_stack)(u32 selector, u32 esp0); | ||
66 | void (*allocate_page)(u32, u32, u32, u32, u32); | ||
67 | void (*release_page)(u32, u32); | ||
68 | void (*set_pte)(pte_t, pte_t *, unsigned); | ||
69 | void (*update_pte)(pte_t *, unsigned); | ||
70 | void (*set_linear_mapping)(int, void *, u32, u32); | ||
71 | void (*_flush_tlb)(int); | ||
72 | void (*set_initial_ap_state)(int, int); | ||
73 | void (*halt)(void); | ||
74 | void (*set_lazy_mode)(int mode); | ||
75 | } vmi_ops; | ||
76 | |||
77 | /* Cached VMI operations */ | ||
78 | struct vmi_timer_ops vmi_timer_ops; | ||
79 | |||
/*
 * VMI patching routines.
 *
 * MNEM_* are the first bytes of the instructions written into patch sites.
 */
#define MNEM_CALL		0xe8
#define MNEM_JMP		0xe9
#define MNEM_RET		0xc3

#define IRQ_PATCH_INT_MASK	0
#define IRQ_PATCH_DISABLE	5
89 | |||
/*
 * Write the relative displacement of a 5-byte call/jmp into the instruction
 * buffer.  The displacement is stored one byte into insnbuf (after the
 * opcode) and is measured from the end of the instruction: dest - eip - 5.
 */
static inline void patch_offset(void *insnbuf,
				unsigned long eip, unsigned long dest)
{
	unsigned long *displacement = (unsigned long *)((char *)insnbuf + 1);

	*displacement = dest - eip - 5;
}
95 | |||
96 | static unsigned patch_internal(int call, unsigned len, void *insnbuf, | ||
97 | unsigned long eip) | ||
98 | { | ||
99 | u64 reloc; | ||
100 | struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; | ||
101 | reloc = call_vrom_long_func(vmi_rom, get_reloc, call); | ||
102 | switch(rel->type) { | ||
103 | case VMI_RELOCATION_CALL_REL: | ||
104 | BUG_ON(len < 5); | ||
105 | *(char *)insnbuf = MNEM_CALL; | ||
106 | patch_offset(insnbuf, eip, (unsigned long)rel->eip); | ||
107 | return 5; | ||
108 | |||
109 | case VMI_RELOCATION_JUMP_REL: | ||
110 | BUG_ON(len < 5); | ||
111 | *(char *)insnbuf = MNEM_JMP; | ||
112 | patch_offset(insnbuf, eip, (unsigned long)rel->eip); | ||
113 | return 5; | ||
114 | |||
115 | case VMI_RELOCATION_NOP: | ||
116 | /* obliterate the whole thing */ | ||
117 | return 0; | ||
118 | |||
119 | case VMI_RELOCATION_NONE: | ||
120 | /* leave native code in place */ | ||
121 | break; | ||
122 | |||
123 | default: | ||
124 | BUG(); | ||
125 | } | ||
126 | return len; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * Apply patch if appropriate, return length of new instruction | ||
131 | * sequence. The callee does nop padding for us. | ||
132 | */ | ||
133 | static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, | ||
134 | unsigned long eip, unsigned len) | ||
135 | { | ||
136 | switch (type) { | ||
137 | case PARAVIRT_PATCH(irq_disable): | ||
138 | return patch_internal(VMI_CALL_DisableInterrupts, len, | ||
139 | insns, eip); | ||
140 | case PARAVIRT_PATCH(irq_enable): | ||
141 | return patch_internal(VMI_CALL_EnableInterrupts, len, | ||
142 | insns, eip); | ||
143 | case PARAVIRT_PATCH(restore_fl): | ||
144 | return patch_internal(VMI_CALL_SetInterruptMask, len, | ||
145 | insns, eip); | ||
146 | case PARAVIRT_PATCH(save_fl): | ||
147 | return patch_internal(VMI_CALL_GetInterruptMask, len, | ||
148 | insns, eip); | ||
149 | case PARAVIRT_PATCH(iret): | ||
150 | return patch_internal(VMI_CALL_IRET, len, insns, eip); | ||
151 | case PARAVIRT_PATCH(irq_enable_sysexit): | ||
152 | return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); | ||
153 | default: | ||
154 | break; | ||
155 | } | ||
156 | return len; | ||
157 | } | ||
158 | |||
159 | /* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ | ||
160 | static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, | ||
161 | unsigned int *ecx, unsigned int *edx) | ||
162 | { | ||
163 | int override = 0; | ||
164 | if (*eax == 1) | ||
165 | override = 1; | ||
166 | asm volatile ("call *%6" | ||
167 | : "=a" (*eax), | ||
168 | "=b" (*ebx), | ||
169 | "=c" (*ecx), | ||
170 | "=d" (*edx) | ||
171 | : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); | ||
172 | if (override) { | ||
173 | if (disable_pse) | ||
174 | *edx &= ~X86_FEATURE_PSE; | ||
175 | if (disable_pge) | ||
176 | *edx &= ~X86_FEATURE_PGE; | ||
177 | if (disable_sep) | ||
178 | *edx &= ~X86_FEATURE_SEP; | ||
179 | if (disable_tsc) | ||
180 | *edx &= ~X86_FEATURE_TSC; | ||
181 | if (disable_mtrr) | ||
182 | *edx &= ~X86_FEATURE_MTRR; | ||
183 | } | ||
184 | } | ||
185 | |||
186 | static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) | ||
187 | { | ||
188 | if (gdt[nr].a != new->a || gdt[nr].b != new->b) | ||
189 | write_gdt_entry(gdt, nr, new->a, new->b); | ||
190 | } | ||
191 | |||
192 | static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) | ||
193 | { | ||
194 | struct desc_struct *gdt = get_cpu_gdt_table(cpu); | ||
195 | vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]); | ||
196 | vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]); | ||
197 | vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]); | ||
198 | } | ||
199 | |||
200 | static void vmi_set_ldt(const void *addr, unsigned entries) | ||
201 | { | ||
202 | unsigned cpu = smp_processor_id(); | ||
203 | u32 low, high; | ||
204 | |||
205 | pack_descriptor(&low, &high, (unsigned long)addr, | ||
206 | entries * sizeof(struct desc_struct) - 1, | ||
207 | DESCTYPE_LDT, 0); | ||
208 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); | ||
209 | vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); | ||
210 | } | ||
211 | |||
212 | static void vmi_set_tr(void) | ||
213 | { | ||
214 | vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); | ||
215 | } | ||
216 | |||
217 | static void vmi_load_esp0(struct tss_struct *tss, | ||
218 | struct thread_struct *thread) | ||
219 | { | ||
220 | tss->x86_tss.esp0 = thread->esp0; | ||
221 | |||
222 | /* This can only happen when SEP is enabled, no need to test "SEP"arately */ | ||
223 | if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { | ||
224 | tss->x86_tss.ss1 = thread->sysenter_cs; | ||
225 | wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); | ||
226 | } | ||
227 | vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); | ||
228 | } | ||
229 | |||
230 | static void vmi_flush_tlb_user(void) | ||
231 | { | ||
232 | vmi_ops._flush_tlb(VMI_FLUSH_TLB); | ||
233 | } | ||
234 | |||
235 | static void vmi_flush_tlb_kernel(void) | ||
236 | { | ||
237 | vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); | ||
238 | } | ||
239 | |||
/*
 * Deliberately empty: stands in for delays and for calls that have no
 * implementation.
 */
static void vmi_nop(void)
{
	/* nothing to do */
}
244 | |||
245 | #ifdef CONFIG_DEBUG_PAGE_TYPE | ||
246 | |||
247 | #ifdef CONFIG_X86_PAE | ||
248 | #define MAX_BOOT_PTS (2048+4+1) | ||
249 | #else | ||
250 | #define MAX_BOOT_PTS (1024+1) | ||
251 | #endif | ||
252 | |||
253 | /* | ||
254 | * During boot, mem_map is not yet available in paging_init, so stash | ||
255 | * all the boot page allocations here. | ||
256 | */ | ||
257 | static struct { | ||
258 | u32 pfn; | ||
259 | int type; | ||
260 | } boot_page_allocations[MAX_BOOT_PTS]; | ||
261 | static int num_boot_page_allocations; | ||
262 | static int boot_allocations_applied; | ||
263 | |||
264 | void vmi_apply_boot_page_allocations(void) | ||
265 | { | ||
266 | int i; | ||
267 | BUG_ON(!mem_map); | ||
268 | for (i = 0; i < num_boot_page_allocations; i++) { | ||
269 | struct page *page = pfn_to_page(boot_page_allocations[i].pfn); | ||
270 | page->type = boot_page_allocations[i].type; | ||
271 | page->type = boot_page_allocations[i].type & | ||
272 | ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); | ||
273 | } | ||
274 | boot_allocations_applied = 1; | ||
275 | } | ||
276 | |||
277 | static void record_page_type(u32 pfn, int type) | ||
278 | { | ||
279 | BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); | ||
280 | boot_page_allocations[num_boot_page_allocations].pfn = pfn; | ||
281 | boot_page_allocations[num_boot_page_allocations].type = type; | ||
282 | num_boot_page_allocations++; | ||
283 | } | ||
284 | |||
285 | static void check_zeroed_page(u32 pfn, int type, struct page *page) | ||
286 | { | ||
287 | u32 *ptr; | ||
288 | int i; | ||
289 | int limit = PAGE_SIZE / sizeof(int); | ||
290 | |||
291 | if (page_address(page)) | ||
292 | ptr = (u32 *)page_address(page); | ||
293 | else | ||
294 | ptr = (u32 *)__va(pfn << PAGE_SHIFT); | ||
295 | /* | ||
296 | * When cloning the root in non-PAE mode, only the userspace | ||
297 | * pdes need to be zeroed. | ||
298 | */ | ||
299 | if (type & VMI_PAGE_CLONE) | ||
300 | limit = USER_PTRS_PER_PGD; | ||
301 | for (i = 0; i < limit; i++) | ||
302 | BUG_ON(ptr[i]); | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * We stash the page type into struct page so we can verify the page | ||
307 | * types are used properly. | ||
308 | */ | ||
309 | static void vmi_set_page_type(u32 pfn, int type) | ||
310 | { | ||
311 | /* PAE can have multiple roots per page - don't track */ | ||
312 | if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) | ||
313 | return; | ||
314 | |||
315 | if (boot_allocations_applied) { | ||
316 | struct page *page = pfn_to_page(pfn); | ||
317 | if (type != VMI_PAGE_NORMAL) | ||
318 | BUG_ON(page->type); | ||
319 | else | ||
320 | BUG_ON(page->type == VMI_PAGE_NORMAL); | ||
321 | page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); | ||
322 | if (type & VMI_PAGE_ZEROED) | ||
323 | check_zeroed_page(pfn, type, page); | ||
324 | } else { | ||
325 | record_page_type(pfn, type); | ||
326 | } | ||
327 | } | ||
328 | |||
329 | static void vmi_check_page_type(u32 pfn, int type) | ||
330 | { | ||
331 | /* PAE can have multiple roots per page - skip checks */ | ||
332 | if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) | ||
333 | return; | ||
334 | |||
335 | type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); | ||
336 | if (boot_allocations_applied) { | ||
337 | struct page *page = pfn_to_page(pfn); | ||
338 | BUG_ON((page->type ^ type) & VMI_PAGE_PAE); | ||
339 | BUG_ON(type == VMI_PAGE_NORMAL && page->type); | ||
340 | BUG_ON((type & page->type) == 0); | ||
341 | } | ||
342 | } | ||
343 | #else | ||
/* Page type tracking compiled out: both hooks expand to empty statements. */
#define vmi_set_page_type(p,t)		do { } while (0)
#define vmi_check_page_type(p,t)	do { } while (0)
346 | #endif | ||
347 | |||
#ifdef CONFIG_HIGHPTE
/*
 * kmap_atomic() a page table page and register the resulting mapping with
 * the VMI layer.  Only KM_PTE0 and KM_PTE1 are valid types (BUG otherwise).
 * Returns the virtual address of the mapping.
 */
static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
{
	void *va = kmap_atomic(page, type);
	int slot;

	/*
	 * To process MMU updates the VMI ROM has to translate virtual
	 * addresses back to physical ones, and by the time the updates are
	 * issued that information is usually gone.  VMI therefore offers a
	 * small cache of mapping slots for active page tables.
	 *
	 * Slot zero covers the linear mapping of physical memory; HIGHPTE
	 * kernels use slots 1 and 2 for KM_PTE0 and KM_PTE1.
	 *
	 *  args:                 SLOT                 VA  COUNT PFN
	 */
	BUG_ON(type != KM_PTE0 && type != KM_PTE1);
	slot = (type - KM_PTE0)+1;
	vmi_ops.set_linear_mapping(slot, va, 1, page_to_pfn(page));

	return va;
}
#endif
371 | |||
372 | static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) | ||
373 | { | ||
374 | vmi_set_page_type(pfn, VMI_PAGE_L1); | ||
375 | vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); | ||
376 | } | ||
377 | |||
378 | static void vmi_allocate_pd(u32 pfn) | ||
379 | { | ||
380 | /* | ||
381 | * This call comes in very early, before mem_map is setup. | ||
382 | * It is called only for swapper_pg_dir, which already has | ||
383 | * data on it. | ||
384 | */ | ||
385 | vmi_set_page_type(pfn, VMI_PAGE_L2); | ||
386 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); | ||
387 | } | ||
388 | |||
389 | static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) | ||
390 | { | ||
391 | vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); | ||
392 | vmi_check_page_type(clonepfn, VMI_PAGE_L2); | ||
393 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); | ||
394 | } | ||
395 | |||
396 | static void vmi_release_pt(u32 pfn) | ||
397 | { | ||
398 | vmi_ops.release_page(pfn, VMI_PAGE_L1); | ||
399 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); | ||
400 | } | ||
401 | |||
402 | static void vmi_release_pd(u32 pfn) | ||
403 | { | ||
404 | vmi_ops.release_page(pfn, VMI_PAGE_L2); | ||
405 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); | ||
406 | } | ||
407 | |||
/*
 * Helper macros for MMU update flags.
 *
 * An update may only be deferred until a flush or page invalidation when it
 * targets the current address space; for any other address space no flush
 * will follow.  init_mm has to be considered too, because kernel updates
 * usually pass init_mm; callers that are known to deal with user mode PTEs
 * only can skip that part by passing a non-zero `user`/`mustbeuser`.
 * Passing current->active_mm from the kernel instead would be an
 * alternative, but it was unclear whether changing mm/highmem.c that way
 * would stay correct on other architectures.
 */
#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
				       (!mustbeuser && (mm) == &init_mm))

/* Address-space part of the flags; 0 unless mm is the current one. */
#define vmi_as_flags(mm, addr, user, extra) \
	(is_current_as(mm, user) ? \
		((extra) | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)

#define vmi_flags_addr(mm, addr, level, user) \
	((level) | vmi_as_flags(mm, addr, user, 0))
#define vmi_flags_addr_defer(mm, addr, level, user) \
	((level) | vmi_as_flags(mm, addr, user, VMI_PAGE_DEFER))
426 | |||
427 | static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
428 | { | ||
429 | vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); | ||
430 | vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); | ||
431 | } | ||
432 | |||
433 | static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
434 | { | ||
435 | vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); | ||
436 | vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); | ||
437 | } | ||
438 | |||
439 | static void vmi_set_pte(pte_t *ptep, pte_t pte) | ||
440 | { | ||
441 | /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ | ||
442 | vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); | ||
443 | vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); | ||
444 | } | ||
445 | |||
446 | static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) | ||
447 | { | ||
448 | vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); | ||
449 | vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); | ||
450 | } | ||
451 | |||
452 | static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) | ||
453 | { | ||
454 | #ifdef CONFIG_X86_PAE | ||
455 | const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; | ||
456 | vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); | ||
457 | #else | ||
458 | const pte_t pte = { pmdval.pud.pgd.pgd }; | ||
459 | vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); | ||
460 | #endif | ||
461 | vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); | ||
462 | } | ||
463 | |||
464 | #ifdef CONFIG_X86_PAE | ||
465 | |||
466 | static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) | ||
467 | { | ||
468 | /* | ||
469 | * XXX This is called from set_pmd_pte, but at both PT | ||
470 | * and PD layers so the VMI_PAGE_PT flag is wrong. But | ||
471 | * it is only called for large page mapping changes, | ||
472 | * the Xen backend, doesn't support large pages, and the | ||
473 | * ESX backend doesn't depend on the flag. | ||
474 | */ | ||
475 | set_64bit((unsigned long long *)ptep,pte_val(pteval)); | ||
476 | vmi_ops.update_pte(ptep, VMI_PAGE_PT); | ||
477 | } | ||
478 | |||
479 | static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) | ||
480 | { | ||
481 | vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); | ||
482 | vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); | ||
483 | } | ||
484 | |||
485 | static void vmi_set_pud(pud_t *pudp, pud_t pudval) | ||
486 | { | ||
487 | /* Um, eww */ | ||
488 | const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; | ||
489 | vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); | ||
490 | vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); | ||
491 | } | ||
492 | |||
493 | static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
494 | { | ||
495 | const pte_t pte = { 0 }; | ||
496 | vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); | ||
497 | vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); | ||
498 | } | ||
499 | |||
500 | static void vmi_pmd_clear(pmd_t *pmd) | ||
501 | { | ||
502 | const pte_t pte = { 0 }; | ||
503 | vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); | ||
504 | vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); | ||
505 | } | ||
506 | #endif | ||
507 | |||
#ifdef CONFIG_SMP
/*
 * Describe the initial register state of an application processor to the
 * VMI layer: descriptor tables, segment registers, entry point and stack
 * (start_eip / start_esp) and control registers.
 */
static void __devinit
vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
		     unsigned long start_esp)
{
	struct vmi_ap_state ap;

	/* Everything not set below starts out as zero, which suits most GPRs. */
	memset(&ap, 0, sizeof(ap));

	/* Descriptor tables */
	ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
	ap.gdtr_limit = GDT_SIZE - 1;
	ap.idtr_base = (unsigned long) idt_table;
	ap.idtr_limit = IDT_ENTRIES * 8 - 1;
	ap.ldtr = 0;

	/* Code and stack */
	ap.cs = __KERNEL_CS;
	ap.eip = start_eip;
	ap.ss = __KERNEL_DS;
	ap.esp = start_esp;

	/* Data segments and flags */
	ap.ds = __USER_DS;
	ap.es = __USER_DS;
	ap.fs = __KERNEL_PERCPU;
	ap.gs = 0;
	ap.eflags = 0;

#ifdef CONFIG_X86_PAE
	/* efer should match BSP efer. */
	if (cpu_has_nx) {
		unsigned lo, hi;

		rdmsr(MSR_EFER, lo, hi);
		ap.efer = (unsigned long long) hi << 32 | lo;
	}
#endif

	/* Control registers; cr0 = protected mode, paging, AM, WP, NE, MP. */
	ap.cr0 = 0x80050023;
	ap.cr3 = __pa(swapper_pg_dir);
	ap.cr4 = mmu_cr4_features;

	vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
}
#endif
554 | |||
555 | static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) | ||
556 | { | ||
557 | static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); | ||
558 | |||
559 | if (!vmi_ops.set_lazy_mode) | ||
560 | return; | ||
561 | |||
562 | /* Modes should never nest or overlap */ | ||
563 | BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || | ||
564 | mode == PARAVIRT_LAZY_FLUSH)); | ||
565 | |||
566 | if (mode == PARAVIRT_LAZY_FLUSH) { | ||
567 | vmi_ops.set_lazy_mode(0); | ||
568 | vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); | ||
569 | } else { | ||
570 | vmi_ops.set_lazy_mode(mode); | ||
571 | __get_cpu_var(lazy_mode) = mode; | ||
572 | } | ||
573 | } | ||
574 | |||
575 | static inline int __init check_vmi_rom(struct vrom_header *rom) | ||
576 | { | ||
577 | struct pci_header *pci; | ||
578 | struct pnp_header *pnp; | ||
579 | const char *manufacturer = "UNKNOWN"; | ||
580 | const char *product = "UNKNOWN"; | ||
581 | const char *license = "unspecified"; | ||
582 | |||
583 | if (rom->rom_signature != 0xaa55) | ||
584 | return 0; | ||
585 | if (rom->vrom_signature != VMI_SIGNATURE) | ||
586 | return 0; | ||
587 | if (rom->api_version_maj != VMI_API_REV_MAJOR || | ||
588 | rom->api_version_min+1 < VMI_API_REV_MINOR+1) { | ||
589 | printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", | ||
590 | rom->api_version_maj, | ||
591 | rom->api_version_min); | ||
592 | return 0; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * Relying on the VMI_SIGNATURE field is not 100% safe, so check | ||
597 | * the PCI header and device type to make sure this is really a | ||
598 | * VMI device. | ||
599 | */ | ||
600 | if (!rom->pci_header_offs) { | ||
601 | printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n"); | ||
602 | return 0; | ||
603 | } | ||
604 | |||
605 | pci = (struct pci_header *)((char *)rom+rom->pci_header_offs); | ||
606 | if (pci->vendorID != PCI_VENDOR_ID_VMWARE || | ||
607 | pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) { | ||
608 | /* Allow it to run... anyways, but warn */ | ||
609 | printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n"); | ||
610 | } | ||
611 | |||
612 | if (rom->pnp_header_offs) { | ||
613 | pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs); | ||
614 | if (pnp->manufacturer_offset) | ||
615 | manufacturer = (const char *)rom+pnp->manufacturer_offset; | ||
616 | if (pnp->product_offset) | ||
617 | product = (const char *)rom+pnp->product_offset; | ||
618 | } | ||
619 | |||
620 | if (rom->license_offs) | ||
621 | license = (char *)rom+rom->license_offs; | ||
622 | |||
623 | printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n", | ||
624 | manufacturer, product, | ||
625 | rom->api_version_maj, rom->api_version_min, | ||
626 | pci->rom_version_maj, pci->rom_version_min); | ||
627 | |||
628 | /* Don't allow BSD/MIT here for now because we don't want to end up | ||
629 | with any binary only shim layers */ | ||
630 | if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) { | ||
631 | printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n", | ||
632 | license); | ||
633 | return 0; | ||
634 | } | ||
635 | |||
636 | return 1; | ||
637 | } | ||
638 | |||
639 | /* | ||
640 | * Probe for the VMI option ROM | ||
641 | */ | ||
642 | static inline int __init probe_vmi_rom(void) | ||
643 | { | ||
644 | unsigned long base; | ||
645 | |||
646 | /* VMI ROM is in option ROM area, check signature */ | ||
647 | for (base = 0xC0000; base < 0xE0000; base += 2048) { | ||
648 | struct vrom_header *romstart; | ||
649 | romstart = (struct vrom_header *)isa_bus_to_virt(base); | ||
650 | if (check_vmi_rom(romstart)) { | ||
651 | vmi_rom = romstart; | ||
652 | return 1; | ||
653 | } | ||
654 | } | ||
655 | return 0; | ||
656 | } | ||
657 | |||
658 | /* | ||
659 | * VMI setup common to all processors | ||
660 | */ | ||
661 | void vmi_bringup(void) | ||
662 | { | ||
663 | /* We must establish the lowmem mapping for MMU ops to work */ | ||
664 | if (vmi_ops.set_linear_mapping) | ||
665 | vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); | ||
666 | } | ||
667 | |||
668 | /* | ||
669 | * Return a pointer to a VMI function or NULL if unimplemented | ||
670 | */ | ||
671 | static void *vmi_get_function(int vmicall) | ||
672 | { | ||
673 | u64 reloc; | ||
674 | const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; | ||
675 | reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall); | ||
676 | BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); | ||
677 | if (rel->type == VMI_RELOCATION_CALL_REL) | ||
678 | return (void *)rel->eip; | ||
679 | else | ||
680 | return NULL; | ||
681 | } | ||
682 | |||
/*
 * Helper macro for making the VMI paravirt-ops fill code readable.
 *
 * Queries the ROM for VMI_CALL_<vmicall> and, depending on the relocation
 * type returned:
 *   CALL_REL - point paravirt_ops.<opname> directly at the ROM routine
 *   NOP      - point it at vmi_nop
 *   NONE     - leave the existing (default) operation in place
 *   other    - leave it in place and warn
 *
 * Expects 'reloc' and 'rel' to be declared by the enclosing function.
 */
#define para_fill(opname, vmicall)					\
do {									\
	reloc = call_vrom_long_func(vmi_rom, get_reloc,			\
				    VMI_CALL_##vmicall);		\
	switch (rel->type) {						\
	case VMI_RELOCATION_CALL_REL:					\
		paravirt_ops.opname = (void *)rel->eip;			\
		break;							\
	case VMI_RELOCATION_NOP:					\
		paravirt_ops.opname = (void *)vmi_nop;			\
		break;							\
	case VMI_RELOCATION_NONE:					\
		break;							\
	default:							\
		printk(KERN_WARNING "VMI: Unknown relocation "		\
				    "type %d for " #vmicall"\n",	\
		       rel->type);					\
		break;							\
	}								\
} while (0)
701 | |||
/*
 * Helper macro for making the VMI paravirt-ops fill code readable.
 *
 * For cached operations which do not match the VMI ROM ABI and must go
 * through a translation stub: on a CALL_REL relocation the ROM entry point
 * is cached in vmi_ops.<cache> and paravirt_ops.<opname> is pointed at
 * <wrapper>.  NOPs are ignored, since it is not clear that a NOP VMI
 * function corresponds to a NOP paravirt-op when the functions are not in
 * 1-1 correspondence.  A JUMP_REL relocation is treated as a bug.
 *
 * Expects 'reloc' and 'rel' to be declared by the enclosing function.
 */
#define para_wrap(opname, wrapper, cache, vmicall)			\
do {									\
	reloc = call_vrom_long_func(vmi_rom, get_reloc,			\
				    VMI_CALL_##vmicall);		\
	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);			\
	if (rel->type == VMI_RELOCATION_CALL_REL) {			\
		vmi_ops.cache = (void *)rel->eip;			\
		paravirt_ops.opname = wrapper;				\
	}								\
} while (0)
719 | |||
720 | /* | ||
721 | * Activate the VMI interface and switch into paravirtualized mode | ||
722 | */ | ||
723 | static inline int __init activate_vmi(void) | ||
724 | { | ||
725 | short kernel_cs; | ||
726 | u64 reloc; | ||
727 | const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; | ||
728 | |||
729 | if (call_vrom_func(vmi_rom, vmi_init) != 0) { | ||
730 | printk(KERN_ERR "VMI ROM failed to initialize!"); | ||
731 | return 0; | ||
732 | } | ||
733 | savesegment(cs, kernel_cs); | ||
734 | |||
735 | paravirt_ops.paravirt_enabled = 1; | ||
736 | paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; | ||
737 | |||
738 | paravirt_ops.patch = vmi_patch; | ||
739 | paravirt_ops.name = "vmi"; | ||
740 | |||
741 | /* | ||
742 | * Many of these operations are ABI compatible with VMI. | ||
743 | * This means we can fill in the paravirt-ops with direct | ||
744 | * pointers into the VMI ROM. If the calling convention for | ||
745 | * these operations changes, this code needs to be updated. | ||
746 | * | ||
747 | * Exceptions | ||
748 | * CPUID paravirt-op uses pointers, not the native ISA | ||
749 | * halt has no VMI equivalent; all VMI halts are "safe" | ||
750 | * no MSR support yet - just trap and emulate. VMI uses the | ||
751 | * same ABI as the native ISA, but Linux wants exceptions | ||
752 | * from bogus MSR read / write handled | ||
753 | * rdpmc is not yet used in Linux | ||
754 | */ | ||
755 | |||
756 | /* CPUID is special, so very special it gets wrapped like a present */ | ||
757 | para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); | ||
758 | |||
759 | para_fill(clts, CLTS); | ||
760 | para_fill(get_debugreg, GetDR); | ||
761 | para_fill(set_debugreg, SetDR); | ||
762 | para_fill(read_cr0, GetCR0); | ||
763 | para_fill(read_cr2, GetCR2); | ||
764 | para_fill(read_cr3, GetCR3); | ||
765 | para_fill(read_cr4, GetCR4); | ||
766 | para_fill(write_cr0, SetCR0); | ||
767 | para_fill(write_cr2, SetCR2); | ||
768 | para_fill(write_cr3, SetCR3); | ||
769 | para_fill(write_cr4, SetCR4); | ||
770 | para_fill(save_fl, GetInterruptMask); | ||
771 | para_fill(restore_fl, SetInterruptMask); | ||
772 | para_fill(irq_disable, DisableInterrupts); | ||
773 | para_fill(irq_enable, EnableInterrupts); | ||
774 | |||
775 | para_fill(wbinvd, WBINVD); | ||
776 | para_fill(read_tsc, RDTSC); | ||
777 | |||
778 | /* The following we emulate with trap and emulate for now */ | ||
779 | /* paravirt_ops.read_msr = vmi_rdmsr */ | ||
780 | /* paravirt_ops.write_msr = vmi_wrmsr */ | ||
781 | /* paravirt_ops.rdpmc = vmi_rdpmc */ | ||
782 | |||
783 | /* TR interface doesn't pass TR value, wrap */ | ||
784 | para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); | ||
785 | |||
786 | /* LDT is special, too */ | ||
787 | para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); | ||
788 | |||
789 | para_fill(load_gdt, SetGDT); | ||
790 | para_fill(load_idt, SetIDT); | ||
791 | para_fill(store_gdt, GetGDT); | ||
792 | para_fill(store_idt, GetIDT); | ||
793 | para_fill(store_tr, GetTR); | ||
794 | paravirt_ops.load_tls = vmi_load_tls; | ||
795 | para_fill(write_ldt_entry, WriteLDTEntry); | ||
796 | para_fill(write_gdt_entry, WriteGDTEntry); | ||
797 | para_fill(write_idt_entry, WriteIDTEntry); | ||
798 | para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); | ||
799 | para_fill(set_iopl_mask, SetIOPLMask); | ||
800 | para_fill(io_delay, IODelay); | ||
801 | para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); | ||
802 | |||
803 | /* user and kernel flush are just handled with different flags to FlushTLB */ | ||
804 | para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); | ||
805 | para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); | ||
806 | para_fill(flush_tlb_single, InvalPage); | ||
807 | |||
808 | /* | ||
809 | * Until a standard flag format can be agreed on, we need to | ||
810 | * implement these as wrappers in Linux. Get the VMI ROM | ||
811 | * function pointers for the two backend calls. | ||
812 | */ | ||
813 | #ifdef CONFIG_X86_PAE | ||
814 | vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong); | ||
815 | vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong); | ||
816 | #else | ||
817 | vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE); | ||
818 | vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE); | ||
819 | #endif | ||
820 | |||
821 | if (vmi_ops.set_pte) { | ||
822 | paravirt_ops.set_pte = vmi_set_pte; | ||
823 | paravirt_ops.set_pte_at = vmi_set_pte_at; | ||
824 | paravirt_ops.set_pmd = vmi_set_pmd; | ||
825 | #ifdef CONFIG_X86_PAE | ||
826 | paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; | ||
827 | paravirt_ops.set_pte_present = vmi_set_pte_present; | ||
828 | paravirt_ops.set_pud = vmi_set_pud; | ||
829 | paravirt_ops.pte_clear = vmi_pte_clear; | ||
830 | paravirt_ops.pmd_clear = vmi_pmd_clear; | ||
831 | #endif | ||
832 | } | ||
833 | |||
834 | if (vmi_ops.update_pte) { | ||
835 | paravirt_ops.pte_update = vmi_update_pte; | ||
836 | paravirt_ops.pte_update_defer = vmi_update_pte_defer; | ||
837 | } | ||
838 | |||
839 | vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); | ||
840 | if (vmi_ops.allocate_page) { | ||
841 | paravirt_ops.alloc_pt = vmi_allocate_pt; | ||
842 | paravirt_ops.alloc_pd = vmi_allocate_pd; | ||
843 | paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; | ||
844 | } | ||
845 | |||
846 | vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); | ||
847 | if (vmi_ops.release_page) { | ||
848 | paravirt_ops.release_pt = vmi_release_pt; | ||
849 | paravirt_ops.release_pd = vmi_release_pd; | ||
850 | } | ||
851 | |||
852 | /* Set linear is needed in all cases */ | ||
853 | vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); | ||
854 | #ifdef CONFIG_HIGHPTE | ||
855 | if (vmi_ops.set_linear_mapping) | ||
856 | paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; | ||
857 | #endif | ||
858 | |||
859 | /* | ||
860 | * These MUST always be patched. Don't support indirect jumps | ||
861 | * through these operations, as the VMI interface may use either | ||
862 | * a jump or a call to get to these operations, depending on | ||
863 | * the backend. They are performance critical anyway, so requiring | ||
864 | * a patch is not a big problem. | ||
865 | */ | ||
866 | paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; | ||
867 | paravirt_ops.iret = (void *)0xbadbab0; | ||
868 | |||
869 | #ifdef CONFIG_SMP | ||
870 | para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); | ||
871 | #endif | ||
872 | |||
873 | #ifdef CONFIG_X86_LOCAL_APIC | ||
874 | para_fill(apic_read, APICRead); | ||
875 | para_fill(apic_write, APICWrite); | ||
876 | para_fill(apic_write_atomic, APICWrite); | ||
877 | #endif | ||
878 | |||
879 | /* | ||
880 | * Check for VMI timer functionality by probing for a cycle frequency method | ||
881 | */ | ||
882 | reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency); | ||
883 | if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) { | ||
884 | vmi_timer_ops.get_cycle_frequency = (void *)rel->eip; | ||
885 | vmi_timer_ops.get_cycle_counter = | ||
886 | vmi_get_function(VMI_CALL_GetCycleCounter); | ||
887 | vmi_timer_ops.get_wallclock = | ||
888 | vmi_get_function(VMI_CALL_GetWallclockTime); | ||
889 | vmi_timer_ops.wallclock_updated = | ||
890 | vmi_get_function(VMI_CALL_WallclockUpdated); | ||
891 | vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); | ||
892 | vmi_timer_ops.cancel_alarm = | ||
893 | vmi_get_function(VMI_CALL_CancelAlarm); | ||
894 | paravirt_ops.time_init = vmi_time_init; | ||
895 | paravirt_ops.get_wallclock = vmi_get_wallclock; | ||
896 | paravirt_ops.set_wallclock = vmi_set_wallclock; | ||
897 | #ifdef CONFIG_X86_LOCAL_APIC | ||
898 | paravirt_ops.setup_boot_clock = vmi_time_bsp_init; | ||
899 | paravirt_ops.setup_secondary_clock = vmi_time_ap_init; | ||
900 | #endif | ||
901 | paravirt_ops.sched_clock = vmi_sched_clock; | ||
902 | paravirt_ops.get_cpu_khz = vmi_cpu_khz; | ||
903 | |||
904 | /* We have true wallclock functions; disable CMOS clock sync */ | ||
905 | no_sync_cmos_clock = 1; | ||
906 | } else { | ||
907 | disable_noidle = 1; | ||
908 | disable_vmi_timer = 1; | ||
909 | } | ||
910 | |||
911 | para_fill(safe_halt, Halt); | ||
912 | |||
913 | /* | ||
914 | * Alternative instruction rewriting doesn't happen soon enough | ||
915 | * to convert VMI_IRET to a call instead of a jump; so we have | ||
916 | * to do this before IRQs get reenabled. Fortunately, it is | ||
917 | * idempotent. | ||
918 | */ | ||
919 | apply_paravirt(__parainstructions, __parainstructions_end); | ||
920 | |||
921 | vmi_bringup(); | ||
922 | |||
923 | return 1; | ||
924 | } | ||
925 | |||
926 | #undef para_fill | ||
927 | |||
928 | void __init vmi_init(void) | ||
929 | { | ||
930 | unsigned long flags; | ||
931 | |||
932 | if (!vmi_rom) | ||
933 | probe_vmi_rom(); | ||
934 | else | ||
935 | check_vmi_rom(vmi_rom); | ||
936 | |||
937 | /* In case probing for or validating the ROM failed, basil */ | ||
938 | if (!vmi_rom) | ||
939 | return; | ||
940 | |||
941 | reserve_top_address(-vmi_rom->virtual_top); | ||
942 | |||
943 | local_irq_save(flags); | ||
944 | activate_vmi(); | ||
945 | |||
946 | #ifdef CONFIG_X86_IO_APIC | ||
947 | /* This is virtual hardware; timer routing is wired correctly */ | ||
948 | no_timer_check = 1; | ||
949 | #endif | ||
950 | local_irq_restore(flags & X86_EFLAGS_IF); | ||
951 | } | ||
952 | |||
953 | static int __init parse_vmi(char *arg) | ||
954 | { | ||
955 | if (!arg) | ||
956 | return -EINVAL; | ||
957 | |||
958 | if (!strcmp(arg, "disable_pge")) { | ||
959 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | ||
960 | disable_pge = 1; | ||
961 | } else if (!strcmp(arg, "disable_pse")) { | ||
962 | clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); | ||
963 | disable_pse = 1; | ||
964 | } else if (!strcmp(arg, "disable_sep")) { | ||
965 | clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); | ||
966 | disable_sep = 1; | ||
967 | } else if (!strcmp(arg, "disable_tsc")) { | ||
968 | clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); | ||
969 | disable_tsc = 1; | ||
970 | } else if (!strcmp(arg, "disable_mtrr")) { | ||
971 | clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); | ||
972 | disable_mtrr = 1; | ||
973 | } else if (!strcmp(arg, "disable_timer")) { | ||
974 | disable_vmi_timer = 1; | ||
975 | disable_noidle = 1; | ||
976 | } else if (!strcmp(arg, "disable_noidle")) | ||
977 | disable_noidle = 1; | ||
978 | return 0; | ||
979 | } | ||
980 | |||
981 | early_param("vmi", parse_vmi); | ||
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c new file mode 100644 index 000000000000..b1b5ab08b26e --- /dev/null +++ b/arch/x86/kernel/vmiclock_32.c | |||
@@ -0,0 +1,320 @@ | |||
1 | /* | ||
2 | * VMI paravirtual timer support routines. | ||
3 | * | ||
4 | * Copyright (C) 2007, VMware, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/smp.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/cpumask.h> | ||
26 | #include <linux/clocksource.h> | ||
27 | #include <linux/clockchips.h> | ||
28 | |||
29 | #include <asm/vmi.h> | ||
30 | #include <asm/vmi_time.h> | ||
31 | #include <asm/arch_hooks.h> | ||
32 | #include <asm/apicdef.h> | ||
33 | #include <asm/apic.h> | ||
34 | #include <asm/timer.h> | ||
35 | #include <asm/i8253.h> | ||
36 | |||
37 | #include <irq_vectors.h> | ||
38 | #include "io_ports.h" | ||
39 | |||
40 | #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) | ||
41 | #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) | ||
42 | |||
43 | static DEFINE_PER_CPU(struct clock_event_device, local_events); | ||
44 | |||
45 | static inline u32 vmi_counter(u32 flags) | ||
46 | { | ||
47 | /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding | ||
48 | * cycle counter. */ | ||
49 | return flags & VMI_ALARM_COUNTER_MASK; | ||
50 | } | ||
51 | |||
52 | /* paravirt_ops.get_wallclock = vmi_get_wallclock */ | ||
53 | unsigned long vmi_get_wallclock(void) | ||
54 | { | ||
55 | unsigned long long wallclock; | ||
56 | wallclock = vmi_timer_ops.get_wallclock(); // nsec | ||
57 | (void)do_div(wallclock, 1000000000); // sec | ||
58 | |||
59 | return wallclock; | ||
60 | } | ||
61 | |||
/*
 * paravirt_ops.set_wallclock = vmi_set_wallclock
 *
 * The requested time is ignored; always returns 0.
 */
int vmi_set_wallclock(unsigned long now)
{
	(void)now;

	return 0;
}
67 | |||
68 | /* paravirt_ops.sched_clock = vmi_sched_clock */ | ||
69 | unsigned long long vmi_sched_clock(void) | ||
70 | { | ||
71 | return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); | ||
72 | } | ||
73 | |||
74 | /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ | ||
75 | unsigned long vmi_cpu_khz(void) | ||
76 | { | ||
77 | unsigned long long khz; | ||
78 | khz = vmi_timer_ops.get_cycle_frequency(); | ||
79 | (void)do_div(khz, 1000); | ||
80 | return khz; | ||
81 | } | ||
82 | |||
83 | static inline unsigned int vmi_get_timer_vector(void) | ||
84 | { | ||
85 | #ifdef CONFIG_X86_IO_APIC | ||
86 | return FIRST_DEVICE_VECTOR; | ||
87 | #else | ||
88 | return FIRST_EXTERNAL_VECTOR; | ||
89 | #endif | ||
90 | } | ||
91 | |||
92 | /** vmi clockchip */ | ||
93 | #ifdef CONFIG_X86_LOCAL_APIC | ||
94 | static unsigned int startup_timer_irq(unsigned int irq) | ||
95 | { | ||
96 | unsigned long val = apic_read(APIC_LVTT); | ||
97 | apic_write(APIC_LVTT, vmi_get_timer_vector()); | ||
98 | |||
99 | return (val & APIC_SEND_PENDING); | ||
100 | } | ||
101 | |||
102 | static void mask_timer_irq(unsigned int irq) | ||
103 | { | ||
104 | unsigned long val = apic_read(APIC_LVTT); | ||
105 | apic_write(APIC_LVTT, val | APIC_LVT_MASKED); | ||
106 | } | ||
107 | |||
108 | static void unmask_timer_irq(unsigned int irq) | ||
109 | { | ||
110 | unsigned long val = apic_read(APIC_LVTT); | ||
111 | apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); | ||
112 | } | ||
113 | |||
/* irq_chip .ack: acknowledge the interrupt at the local APIC. */
static void ack_timer_irq(unsigned int irq)
{
	(void)irq;

	ack_APIC_irq();
}
118 | |||
119 | static struct irq_chip vmi_chip __read_mostly = { | ||
120 | .name = "VMI-LOCAL", | ||
121 | .startup = startup_timer_irq, | ||
122 | .mask = mask_timer_irq, | ||
123 | .unmask = unmask_timer_irq, | ||
124 | .ack = ack_timer_irq | ||
125 | }; | ||
126 | #endif | ||
127 | |||
/** vmi clockevent */

/* Alarm wiring values; the current one is OR-ed into every alarm flag word. */
#define VMI_ALARM_WIRED_IRQ0	0x00000000
#define VMI_ALARM_WIRED_LVTT	0x00010000

/* Current wiring; starts out as IRQ0. */
static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;

/* Return the current alarm wiring value. */
static inline int vmi_get_alarm_wiring(void)
{
	return vmi_wiring;
}
137 | |||
138 | static void vmi_timer_set_mode(enum clock_event_mode mode, | ||
139 | struct clock_event_device *evt) | ||
140 | { | ||
141 | cycle_t now, cycles_per_hz; | ||
142 | BUG_ON(!irqs_disabled()); | ||
143 | |||
144 | switch (mode) { | ||
145 | case CLOCK_EVT_MODE_ONESHOT: | ||
146 | case CLOCK_EVT_MODE_RESUME: | ||
147 | break; | ||
148 | case CLOCK_EVT_MODE_PERIODIC: | ||
149 | cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); | ||
150 | (void)do_div(cycles_per_hz, HZ); | ||
151 | now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); | ||
152 | vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); | ||
153 | break; | ||
154 | case CLOCK_EVT_MODE_UNUSED: | ||
155 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
156 | switch (evt->mode) { | ||
157 | case CLOCK_EVT_MODE_ONESHOT: | ||
158 | vmi_timer_ops.cancel_alarm(VMI_ONESHOT); | ||
159 | break; | ||
160 | case CLOCK_EVT_MODE_PERIODIC: | ||
161 | vmi_timer_ops.cancel_alarm(VMI_PERIODIC); | ||
162 | break; | ||
163 | default: | ||
164 | break; | ||
165 | } | ||
166 | break; | ||
167 | default: | ||
168 | break; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | static int vmi_timer_next_event(unsigned long delta, | ||
173 | struct clock_event_device *evt) | ||
174 | { | ||
175 | /* Unfortunately, set_next_event interface only passes relative | ||
176 | * expiry, but we want absolute expiry. It'd be better if were | ||
177 | * were passed an aboslute expiry, since a bunch of time may | ||
178 | * have been stolen between the time the delta is computed and | ||
179 | * when we set the alarm below. */ | ||
180 | cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); | ||
181 | |||
182 | BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); | ||
183 | vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); | ||
184 | return 0; | ||
185 | } | ||
186 | |||
187 | static struct clock_event_device vmi_clockevent = { | ||
188 | .name = "vmi-timer", | ||
189 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, | ||
190 | .shift = 22, | ||
191 | .set_mode = vmi_timer_set_mode, | ||
192 | .set_next_event = vmi_timer_next_event, | ||
193 | .rating = 1000, | ||
194 | .irq = 0, | ||
195 | }; | ||
196 | |||
197 | static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) | ||
198 | { | ||
199 | struct clock_event_device *evt = &__get_cpu_var(local_events); | ||
200 | evt->event_handler(evt); | ||
201 | return IRQ_HANDLED; | ||
202 | } | ||
203 | |||
204 | static struct irqaction vmi_clock_action = { | ||
205 | .name = "vmi-timer", | ||
206 | .handler = vmi_timer_interrupt, | ||
207 | .flags = IRQF_DISABLED | IRQF_NOBALANCING, | ||
208 | .mask = CPU_MASK_ALL, | ||
209 | }; | ||
210 | |||
211 | static void __devinit vmi_time_init_clockevent(void) | ||
212 | { | ||
213 | cycle_t cycles_per_msec; | ||
214 | struct clock_event_device *evt; | ||
215 | |||
216 | int cpu = smp_processor_id(); | ||
217 | evt = &__get_cpu_var(local_events); | ||
218 | |||
219 | /* Use cycles_per_msec since div_sc params are 32-bits. */ | ||
220 | cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); | ||
221 | (void)do_div(cycles_per_msec, 1000); | ||
222 | |||
223 | memcpy(evt, &vmi_clockevent, sizeof(*evt)); | ||
224 | /* Must pick .shift such that .mult fits in 32-bits. Choosing | ||
225 | * .shift to be 22 allows 2^(32-22) cycles per nano-seconds | ||
226 | * before overflow. */ | ||
227 | evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); | ||
228 | /* Upper bound is clockevent's use of ulong for cycle deltas. */ | ||
229 | evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); | ||
230 | evt->min_delta_ns = clockevent_delta2ns(1, evt); | ||
231 | evt->cpumask = cpumask_of_cpu(cpu); | ||
232 | |||
233 | printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", | ||
234 | evt->name, evt->mult, evt->shift); | ||
235 | clockevents_register_device(evt); | ||
236 | } | ||
237 | |||
238 | void __init vmi_time_init(void) | ||
239 | { | ||
240 | /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ | ||
241 | outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ | ||
242 | |||
243 | vmi_time_init_clockevent(); | ||
244 | setup_irq(0, &vmi_clock_action); | ||
245 | } | ||
246 | |||
247 | #ifdef CONFIG_X86_LOCAL_APIC | ||
248 | void __devinit vmi_time_bsp_init(void) | ||
249 | { | ||
250 | /* | ||
251 | * On APIC systems, we want local timers to fire on each cpu. We do | ||
252 | * this by programming LVTT to deliver timer events to the IRQ handler | ||
253 | * for IRQ-0, since we can't re-use the APIC local timer handler | ||
254 | * without interfering with that code. | ||
255 | */ | ||
256 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | ||
257 | local_irq_disable(); | ||
258 | #ifdef CONFIG_X86_SMP | ||
259 | /* | ||
260 | * XXX handle_percpu_irq only defined for SMP; we need to switch over | ||
261 | * to using it, since this is a local interrupt, which each CPU must | ||
262 | * handle individually without locking out or dropping simultaneous | ||
263 | * local timers on other CPUs. We also don't want to trigger the | ||
264 | * quirk workaround code for interrupts which gets invoked from | ||
265 | * handle_percpu_irq via eoi, so we use our own IRQ chip. | ||
266 | */ | ||
267 | set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); | ||
268 | #else | ||
269 | set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); | ||
270 | #endif | ||
271 | vmi_wiring = VMI_ALARM_WIRED_LVTT; | ||
272 | apic_write(APIC_LVTT, vmi_get_timer_vector()); | ||
273 | local_irq_enable(); | ||
274 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | ||
275 | } | ||
276 | |||
277 | void __devinit vmi_time_ap_init(void) | ||
278 | { | ||
279 | vmi_time_init_clockevent(); | ||
280 | apic_write(APIC_LVTT, vmi_get_timer_vector()); | ||
281 | } | ||
282 | #endif | ||
283 | |||
284 | /** vmi clocksource */ | ||
285 | |||
286 | static cycle_t read_real_cycles(void) | ||
287 | { | ||
288 | return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); | ||
289 | } | ||
290 | |||
291 | static struct clocksource clocksource_vmi = { | ||
292 | .name = "vmi-timer", | ||
293 | .rating = 450, | ||
294 | .read = read_real_cycles, | ||
295 | .mask = CLOCKSOURCE_MASK(64), | ||
296 | .mult = 0, /* to be set */ | ||
297 | .shift = 22, | ||
298 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
299 | }; | ||
300 | |||
301 | static int __init init_vmi_clocksource(void) | ||
302 | { | ||
303 | cycle_t cycles_per_msec; | ||
304 | |||
305 | if (!vmi_timer_ops.get_cycle_frequency) | ||
306 | return 0; | ||
307 | /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ | ||
308 | cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); | ||
309 | (void)do_div(cycles_per_msec, 1000); | ||
310 | |||
311 | /* Note that clocksource.{mult, shift} converts in the opposite direction | ||
312 | * as clockevents. */ | ||
313 | clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, | ||
314 | clocksource_vmi.shift); | ||
315 | |||
316 | printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); | ||
317 | return clocksource_register(&clocksource_vmi); | ||
318 | |||
319 | } | ||
320 | module_init(init_vmi_clocksource); | ||
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S new file mode 100644 index 000000000000..849ee611f013 --- /dev/null +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -0,0 +1,5 @@ | |||
1 | #ifdef CONFIG_X86_32 | ||
2 | # include "vmlinux_32.lds.S" | ||
3 | #else | ||
4 | # include "vmlinux_64.lds.S" | ||
5 | #endif | ||
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S new file mode 100644 index 000000000000..7d72cce00529 --- /dev/null +++ b/arch/x86/kernel/vmlinux_32.lds.S | |||
@@ -0,0 +1,213 @@ | |||
1 | /* ld script to make i386 Linux kernel | ||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | ||
3 | * | ||
4 | * Don't define absolute symbols unless you know that the symbol's | ||
5 | * value should remain constant even if the kernel image is relocated | ||
6 | * at run time. Absolute symbols are not relocated. If a symbol's value should | ||
7 | * change when the kernel is relocated, make the symbol section-relative and | ||
8 | * put it inside the section definition. | ||
9 | */ | ||
10 | |||
11 | /* Don't define absolute symbols unless you know that the symbol's | ||
12 | * value should remain constant even if the kernel image is relocated | ||
13 | * at run time. Absolute symbols are not relocated. If a symbol's value should | ||
14 | * change when the kernel is relocated, make the symbol section-relative and | ||
15 | * put it inside the section definition. | ||
16 | */ | ||
17 | #define LOAD_OFFSET __PAGE_OFFSET | ||
18 | |||
19 | #include <asm-generic/vmlinux.lds.h> | ||
20 | #include <asm/thread_info.h> | ||
21 | #include <asm/page.h> | ||
22 | #include <asm/cache.h> | ||
23 | #include <asm/boot.h> | ||
24 | |||
25 | OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") | ||
26 | OUTPUT_ARCH(i386) | ||
27 | ENTRY(phys_startup_32) | ||
28 | jiffies = jiffies_64; | ||
29 | |||
30 | PHDRS { | ||
31 | text PT_LOAD FLAGS(5); /* R_E */ | ||
32 | data PT_LOAD FLAGS(7); /* RWE */ | ||
33 | note PT_NOTE FLAGS(0); /* ___ */ | ||
34 | } | ||
35 | SECTIONS | ||
36 | { | ||
37 | . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; | ||
38 | phys_startup_32 = startup_32 - LOAD_OFFSET; | ||
39 | |||
40 | .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { | ||
41 | _text = .; /* Text and read-only data */ | ||
42 | *(.text.head) | ||
43 | } :text = 0x9090 | ||
44 | |||
45 | /* read-only */ | ||
46 | .text : AT(ADDR(.text) - LOAD_OFFSET) { | ||
47 | TEXT_TEXT | ||
48 | SCHED_TEXT | ||
49 | LOCK_TEXT | ||
50 | KPROBES_TEXT | ||
51 | *(.fixup) | ||
52 | *(.gnu.warning) | ||
53 | _etext = .; /* End of text section */ | ||
54 | } :text = 0x9090 | ||
55 | |||
56 | . = ALIGN(16); /* Exception table */ | ||
57 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { | ||
58 | __start___ex_table = .; | ||
59 | *(__ex_table) | ||
60 | __stop___ex_table = .; | ||
61 | } | ||
62 | |||
63 | NOTES :text :note | ||
64 | |||
65 | BUG_TABLE :text | ||
66 | |||
67 | . = ALIGN(4); | ||
68 | .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { | ||
69 | __tracedata_start = .; | ||
70 | *(.tracedata) | ||
71 | __tracedata_end = .; | ||
72 | } | ||
73 | |||
74 | RODATA | ||
75 | |||
76 | /* writeable */ | ||
77 | . = ALIGN(4096); | ||
78 | .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ | ||
79 | DATA_DATA | ||
80 | CONSTRUCTORS | ||
81 | } :data | ||
82 | |||
83 | . = ALIGN(4096); | ||
84 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
85 | __nosave_begin = .; | ||
86 | *(.data.nosave) | ||
87 | . = ALIGN(4096); | ||
88 | __nosave_end = .; | ||
89 | } | ||
90 | |||
91 | . = ALIGN(4096); | ||
92 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
93 | *(.data.page_aligned) | ||
94 | *(.data.idt) | ||
95 | } | ||
96 | |||
97 | . = ALIGN(32); | ||
98 | .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
99 | *(.data.cacheline_aligned) | ||
100 | } | ||
101 | |||
102 | /* rarely changed data like cpu maps */ | ||
103 | . = ALIGN(32); | ||
104 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
105 | *(.data.read_mostly) | ||
106 | _edata = .; /* End of data section */ | ||
107 | } | ||
108 | |||
109 | . = ALIGN(THREAD_SIZE); /* init_task */ | ||
110 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | ||
111 | *(.data.init_task) | ||
112 | } | ||
113 | |||
114 | /* might get freed after init */ | ||
115 | . = ALIGN(4096); | ||
116 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
117 | __smp_locks = .; | ||
118 | *(.smp_locks) | ||
119 | __smp_locks_end = .; | ||
120 | } | ||
121 | /* .smp_locks (above) might be freed after init. | ||
122 | * The following ALIGN() is required to make sure no other data falls on the | ||
123 | * same page that __smp_locks_end points into, as that page might be freed | ||
124 | * after boot. Always make sure that an ALIGN() directive is present after | ||
125 | * the section which contains __smp_locks_end. | ||
126 | */ | ||
127 | . = ALIGN(4096); | ||
128 | |||
129 | /* will be freed after init */ | ||
130 | . = ALIGN(4096); /* Init code and data */ | ||
131 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | ||
132 | __init_begin = .; | ||
133 | _sinittext = .; | ||
134 | *(.init.text) | ||
135 | _einittext = .; | ||
136 | } | ||
137 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } | ||
138 | . = ALIGN(16); | ||
139 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { | ||
140 | __setup_start = .; | ||
141 | *(.init.setup) | ||
142 | __setup_end = .; | ||
143 | } | ||
144 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { | ||
145 | __initcall_start = .; | ||
146 | INITCALLS | ||
147 | __initcall_end = .; | ||
148 | } | ||
149 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { | ||
150 | __con_initcall_start = .; | ||
151 | *(.con_initcall.init) | ||
152 | __con_initcall_end = .; | ||
153 | } | ||
154 | SECURITY_INIT | ||
155 | . = ALIGN(4); | ||
156 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { | ||
157 | __alt_instructions = .; | ||
158 | *(.altinstructions) | ||
159 | __alt_instructions_end = .; | ||
160 | } | ||
161 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { | ||
162 | *(.altinstr_replacement) | ||
163 | } | ||
164 | . = ALIGN(4); | ||
165 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { | ||
166 | __parainstructions = .; | ||
167 | *(.parainstructions) | ||
168 | __parainstructions_end = .; | ||
169 | } | ||
170 | /* .exit.text is discarded at runtime, not link time, to deal with references | ||
171 | from .altinstructions and .eh_frame */ | ||
172 | .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } | ||
173 | .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } | ||
174 | #if defined(CONFIG_BLK_DEV_INITRD) | ||
175 | . = ALIGN(4096); | ||
176 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { | ||
177 | __initramfs_start = .; | ||
178 | *(.init.ramfs) | ||
179 | __initramfs_end = .; | ||
180 | } | ||
181 | #endif | ||
182 | . = ALIGN(4096); | ||
183 | .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { | ||
184 | __per_cpu_start = .; | ||
185 | *(.data.percpu) | ||
186 | *(.data.percpu.shared_aligned) | ||
187 | __per_cpu_end = .; | ||
188 | } | ||
189 | . = ALIGN(4096); | ||
190 | /* freed after init ends here */ | ||
191 | |||
192 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { | ||
193 | __init_end = .; | ||
194 | __bss_start = .; /* BSS */ | ||
195 | *(.bss.page_aligned) | ||
196 | *(.bss) | ||
197 | . = ALIGN(4); | ||
198 | __bss_stop = .; | ||
199 | _end = . ; | ||
200 | /* This is where the kernel creates the early boot page tables */ | ||
201 | . = ALIGN(4096); | ||
202 | pg0 = . ; | ||
203 | } | ||
204 | |||
205 | /* Sections to be discarded */ | ||
206 | /DISCARD/ : { | ||
207 | *(.exitcall.exit) | ||
208 | } | ||
209 | |||
210 | STABS_DEBUG | ||
211 | |||
212 | DWARF_DEBUG | ||
213 | } | ||
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S new file mode 100644 index 000000000000..103cab6aa7c0 --- /dev/null +++ b/arch/x86/kernel/vsyscall-int80_32.S | |||
@@ -0,0 +1,53 @@ | |||
1 | /* | ||
2 | * Code for the vsyscall page. This version uses the old int $0x80 method. | ||
3 | * | ||
4 | * NOTE: | ||
5 | * 1) __kernel_vsyscall _must_ be first in this page. | ||
6 | * 2) there are alignment constraints on this stub, see vsyscall-sigreturn_32.S | ||
7 | * for details. | ||
8 | */ | ||
9 | |||
10 | .text | ||
11 | .globl __kernel_vsyscall | ||
12 | .type __kernel_vsyscall,@function | ||
13 | __kernel_vsyscall: | ||
14 | .LSTART_vsyscall: | ||
15 | int $0x80 /* enter the kernel via the old int $0x80 method */ | ||
16 | ret | ||
17 | .LEND_vsyscall: | ||
18 | .size __kernel_vsyscall,.-.LSTART_vsyscall | ||
19 | .previous | ||
20 | |||
21 | .section .eh_frame,"a",@progbits | ||
22 | .LSTARTFRAMEDLSI: | ||
23 | .long .LENDCIEDLSI-.LSTARTCIEDLSI /* Length CIE */ | ||
24 | .LSTARTCIEDLSI: | ||
25 | .long 0 /* CIE ID */ | ||
26 | .byte 1 /* Version number */ | ||
27 | .string "zR" /* NUL-terminated augmentation string */ | ||
28 | .uleb128 1 /* Code alignment factor */ | ||
29 | .sleb128 -4 /* Data alignment factor */ | ||
30 | .byte 8 /* Return address register column */ | ||
31 | .uleb128 1 /* Augmentation value length */ | ||
32 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
33 | .byte 0x0c /* DW_CFA_def_cfa */ | ||
34 | .uleb128 4 /* CFA register operand: column 4 */ | ||
35 | .uleb128 4 /* CFA offset operand: 4 */ | ||
36 | .byte 0x88 /* DW_CFA_offset, column 0x8 */ | ||
37 | .uleb128 1 /* factored offset 1, scaled by the data alignment factor above */ | ||
38 | .align 4 | ||
39 | .LENDCIEDLSI: | ||
40 | .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ | ||
41 | .LSTARTFDEDLSI: | ||
42 | .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ | ||
43 | .long .LSTART_vsyscall-. /* PC-relative start address */ | ||
44 | .long .LEND_vsyscall-.LSTART_vsyscall /* Size of the code range covered */ | ||
45 | .uleb128 0 /* Augmentation */ | ||
46 | .align 4 | ||
47 | .LENDFDEDLSI: | ||
48 | .previous | ||
49 | |||
50 | /* | ||
51 | * Get the common code for the sigreturn entry points. | ||
52 | */ | ||
53 | #include "vsyscall-sigreturn_32.S" | ||
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S new file mode 100644 index 000000000000..fcf376a37f79 --- /dev/null +++ b/arch/x86/kernel/vsyscall-note_32.S | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. | ||
3 | * Here we can supply some information useful to userland. | ||
4 | */ | ||
5 | |||
6 | #include <linux/version.h> | ||
7 | #include <linux/elfnote.h> | ||
8 | |||
9 | /* Ideally this would use UTS_NAME, but using a quoted string here | ||
10 | doesn't work. Remember to change this when changing the | ||
11 | kernel's name. */ | ||
12 | ELFNOTE_START(Linux, 0, "a") | ||
13 | .long LINUX_VERSION_CODE | ||
14 | ELFNOTE_END | ||
15 | |||
16 | #ifdef CONFIG_XEN | ||
17 | /* | ||
18 | * Add a special note telling glibc's dynamic linker a fake hardware | ||
19 | * flavor that it will use to choose the search path for libraries in the | ||
20 | * same way it uses real hardware capabilities like "mmx". | ||
21 | * We supply "nosegneg" as the fake capability, to indicate that we | ||
22 | * do not like negative offsets in instructions using segment overrides, | ||
23 | * since we implement those inefficiently. This makes it possible to | ||
24 | * install libraries optimized to avoid those access patterns in someplace | ||
25 | * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file | ||
26 | * corresponding to the bits here is needed to make ldconfig work right. | ||
27 | * It should contain: | ||
28 | * hwcap 1 nosegneg | ||
29 | * to match the mapping of bit to name that we give here. | ||
30 | * | ||
31 | * At runtime, the fake hardware feature will be considered to be present | ||
32 | * if its bit is set in the mask word. So, we start with the mask 0, and | ||
33 | * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. | ||
34 | */ | ||
35 | |||
36 | #include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ | ||
37 | |||
38 | .globl VDSO_NOTE_MASK | ||
39 | ELFNOTE_START(GNU, 2, "a") | ||
40 | .long 1 /* ncaps */ | ||
41 | VDSO_NOTE_MASK: | ||
42 | .long 0 /* mask */ | ||
43 | .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ | ||
44 | ELFNOTE_END | ||
45 | #endif | ||
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S new file mode 100644 index 000000000000..a92262f41659 --- /dev/null +++ b/arch/x86/kernel/vsyscall-sigreturn_32.S | |||
@@ -0,0 +1,143 @@ | |||
1 | /* | ||
2 | * Common code for the sigreturn entry points on the vsyscall page. | ||
3 | * So far this code is the same for both int80 and sysenter versions. | ||
4 | * This file is #include'd by vsyscall-*.S to define them after the | ||
5 | * vsyscall entry point. The kernel assumes that the addresses of these | ||
6 | * routines are constant for all vsyscall implementations. | ||
7 | */ | ||
8 | |||
9 | #include <asm/unistd.h> | ||
10 | #include <asm/asm-offsets.h> | ||
11 | |||
12 | |||
13 | /* XXX | ||
14 | Should these be named "_sigtramp" or something? | ||
15 | */ | ||
16 | |||
17 | .text | ||
18 | .org __kernel_vsyscall+32,0x90 /* stub sits 32 bytes past __kernel_vsyscall; the gap is filled with 0x90 (nop), which the unwind info below counts on */ | ||
19 | .globl __kernel_sigreturn | ||
20 | .type __kernel_sigreturn,@function | ||
21 | __kernel_sigreturn: | ||
22 | .LSTART_sigreturn: | ||
23 | popl %eax /* XXX does this mean it needs unwind info? */ | ||
24 | movl $__NR_sigreturn, %eax /* system call number for the int $0x80 below */ | ||
25 | int $0x80 | ||
26 | .LEND_sigreturn: | ||
27 | .size __kernel_sigreturn,.-.LSTART_sigreturn | ||
28 | |||
29 | .balign 32 /* the rt stub starts on the next 32-byte boundary */ | ||
30 | .globl __kernel_rt_sigreturn | ||
31 | .type __kernel_rt_sigreturn,@function | ||
32 | __kernel_rt_sigreturn: | ||
33 | .LSTART_rt_sigreturn: | ||
34 | movl $__NR_rt_sigreturn, %eax /* system call number for the int $0x80 below */ | ||
35 | int $0x80 | ||
36 | .LEND_rt_sigreturn: | ||
37 | .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn | ||
38 | .balign 32 /* pad the end of the stub area to a 32-byte boundary */ | ||
39 | .previous | ||
40 | |||
41 | .section .eh_frame,"a",@progbits | ||
42 | .LSTARTFRAMEDLSI1: | ||
43 | .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 | ||
44 | .LSTARTCIEDLSI1: | ||
45 | .long 0 /* CIE ID */ | ||
46 | .byte 1 /* Version number */ | ||
47 | .string "zRS" /* NUL-terminated augmentation string */ | ||
48 | .uleb128 1 /* Code alignment factor */ | ||
49 | .sleb128 -4 /* Data alignment factor */ | ||
50 | .byte 8 /* Return address register column */ | ||
51 | .uleb128 1 /* Augmentation value length */ | ||
52 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
53 | .byte 0 /* DW_CFA_nop */ | ||
54 | .align 4 | ||
55 | .LENDCIEDLSI1: | ||
56 | .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ | ||
57 | .LSTARTFDEDLSI1: | ||
58 | .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ | ||
59 | /* HACK: The dwarf2 unwind routines will subtract 1 from the | ||
60 | return address to get an address in the middle of the | ||
61 | presumed call instruction. Since we didn't get here via | ||
62 | a call, we need to include the nop before the real start | ||
63 | to make up for it. */ | ||
64 | .long .LSTART_sigreturn-1-. /* PC-relative start address */ | ||
65 | .long .LEND_sigreturn-.LSTART_sigreturn+1 | ||
66 | .uleb128 0 /* Augmentation */ | ||
67 | /* What follows are the instructions for the table generation. | ||
68 | We record the locations of each register saved. This is | ||
69 | complicated by the fact that the "CFA" is always assumed to | ||
70 | be the value of the stack pointer in the caller. This means | ||
71 | that we must define the CFA of this body of code to be the | ||
72 | saved value of the stack pointer in the sigcontext. Which | ||
73 | also means that there is no fixed relation to the other | ||
74 | saved registers, which means that we must use DW_CFA_expression | ||
75 | to compute their addresses. It also means that when we | ||
76 | adjust the stack with the popl, we have to do it all over again. */ | ||
77 | |||
78 | #define do_cfa_expr(offset) \ | ||
79 | .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ | ||
80 | .uleb128 1f-0f; /* length */ \ | ||
81 | 0: .byte 0x74; /* DW_OP_breg4 */ \ | ||
82 | .sleb128 offset; /* offset */ \ | ||
83 | .byte 0x06; /* DW_OP_deref */ \ | ||
84 | 1: | ||
85 | |||
86 | #define do_expr(regno, offset) \ | ||
87 | .byte 0x10; /* DW_CFA_expression */ \ | ||
88 | .uleb128 regno; /* regno */ \ | ||
89 | .uleb128 1f-0f; /* length */ \ | ||
90 | 0: .byte 0x74; /* DW_OP_breg4 */ \ | ||
91 | .sleb128 offset; /* offset */ \ | ||
92 | 1: | ||
93 | |||
94 | do_cfa_expr(SIGCONTEXT_esp+4) | ||
95 | do_expr(0, SIGCONTEXT_eax+4) | ||
96 | do_expr(1, SIGCONTEXT_ecx+4) | ||
97 | do_expr(2, SIGCONTEXT_edx+4) | ||
98 | do_expr(3, SIGCONTEXT_ebx+4) | ||
99 | do_expr(5, SIGCONTEXT_ebp+4) | ||
100 | do_expr(6, SIGCONTEXT_esi+4) | ||
101 | do_expr(7, SIGCONTEXT_edi+4) | ||
102 | do_expr(8, SIGCONTEXT_eip+4) | ||
103 | |||
104 | .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ | ||
105 | |||
106 | do_cfa_expr(SIGCONTEXT_esp) | ||
107 | do_expr(0, SIGCONTEXT_eax) | ||
108 | do_expr(1, SIGCONTEXT_ecx) | ||
109 | do_expr(2, SIGCONTEXT_edx) | ||
110 | do_expr(3, SIGCONTEXT_ebx) | ||
111 | do_expr(5, SIGCONTEXT_ebp) | ||
112 | do_expr(6, SIGCONTEXT_esi) | ||
113 | do_expr(7, SIGCONTEXT_edi) | ||
114 | do_expr(8, SIGCONTEXT_eip) | ||
115 | |||
116 | .align 4 | ||
117 | .LENDFDEDLSI1: | ||
118 | |||
119 | .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ | ||
120 | .LSTARTFDEDLSI2: | ||
121 | .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ | ||
122 | /* HACK: See above wrt unwind library assumptions. */ | ||
123 | .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ | ||
124 | .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 | ||
125 | .uleb128 0 /* Augmentation */ | ||
126 | /* What follows are the instructions for the table generation. | ||
127 | We record the locations of each register saved. This is | ||
128 | slightly less complicated than the above, since we don't | ||
129 | modify the stack pointer in the process. */ | ||
130 | |||
131 | do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp) | ||
132 | do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax) | ||
133 | do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx) | ||
134 | do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx) | ||
135 | do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx) | ||
136 | do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp) | ||
137 | do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi) | ||
138 | do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi) | ||
139 | do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip) | ||
140 | |||
141 | .align 4 | ||
142 | .LENDFDEDLSI2: | ||
143 | .previous | ||
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S new file mode 100644 index 000000000000..ed879bf42995 --- /dev/null +++ b/arch/x86/kernel/vsyscall-sysenter_32.S | |||
@@ -0,0 +1,122 @@ | |||
1 | /* | ||
2 | * Code for the vsyscall page. This version uses the sysenter instruction. | ||
3 | * | ||
4 | * NOTE: | ||
5 | * 1) __kernel_vsyscall _must_ be first in this page. | ||
6 | * 2) there are alignment constraints on this stub, see vsyscall-sigreturn_32.S | ||
7 | * for details. | ||
8 | */ | ||
9 | |||
10 | /* | ||
11 | * The caller puts arg2 in %ecx, which gets pushed. The kernel will use | ||
12 | * %ecx itself for arg2. The pushing is because the sysexit instruction | ||
13 | * (found in entry_32.S) requires that we clobber %ecx with the desired %esp. | ||
14 | * User code might expect that %ecx is unclobbered though, as it would be | ||
15 | * for returning via the iret instruction, so we must push and pop. | ||
16 | * | ||
17 | * The caller puts arg3 in %edx, which the sysexit instruction requires | ||
18 | * for %eip. Thus, exactly as for arg2, we must push and pop. | ||
19 | * | ||
20 | * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter | ||
21 | * instruction clobbers %esp, the user's %esp won't even survive entry | ||
22 | * into the kernel. We store %esp in %ebp. Code in entry_32.S must fetch | ||
23 | * arg6 from the stack. | ||
24 | * | ||
25 | * You can not use this vsyscall for the clone() syscall because the | ||
26 | * three dwords on the parent stack do not get copied to the child. | ||
27 | */ | ||
28 | .text | ||
29 | .globl __kernel_vsyscall | ||
30 | .type __kernel_vsyscall,@function | ||
31 | __kernel_vsyscall: | ||
32 | .LSTART_vsyscall: | ||
33 | push %ecx | ||
34 | .Lpush_ecx: | ||
35 | push %edx | ||
36 | .Lpush_edx: | ||
37 | push %ebp | ||
38 | .Lenter_kernel: | ||
39 | movl %esp,%ebp | ||
40 | sysenter | ||
41 | |||
42 | /* 7: align return point with nop's to make disassembly easier */ | ||
43 | .space 7,0x90 | ||
44 | |||
45 | /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ | ||
46 | jmp .Lenter_kernel | ||
47 | /* 16: System call normal return point is here! */ | ||
48 | .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ | ||
49 | SYSENTER_RETURN: | ||
50 | pop %ebp | ||
51 | .Lpop_ebp: | ||
52 | pop %edx | ||
53 | .Lpop_edx: | ||
54 | pop %ecx | ||
55 | .Lpop_ecx: | ||
56 | ret | ||
57 | .LEND_vsyscall: | ||
58 | .size __kernel_vsyscall,.-.LSTART_vsyscall | ||
59 | .previous | ||
60 | |||
61 | .section .eh_frame,"a",@progbits | ||
62 | .LSTARTFRAMEDLSI: | ||
63 | .long .LENDCIEDLSI-.LSTARTCIEDLSI /* Length CIE */ | ||
64 | .LSTARTCIEDLSI: | ||
65 | .long 0 /* CIE ID */ | ||
66 | .byte 1 /* Version number */ | ||
67 | .string "zR" /* NUL-terminated augmentation string */ | ||
68 | .uleb128 1 /* Code alignment factor */ | ||
69 | .sleb128 -4 /* Data alignment factor */ | ||
70 | .byte 8 /* Return address register column */ | ||
71 | .uleb128 1 /* Augmentation value length */ | ||
72 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
73 | .byte 0x0c /* DW_CFA_def_cfa */ | ||
74 | .uleb128 4 /* CFA register operand: column 4 */ | ||
75 | .uleb128 4 /* CFA offset operand: 4 */ | ||
76 | .byte 0x88 /* DW_CFA_offset, column 0x8 */ | ||
77 | .uleb128 1 /* factored offset 1, scaled by the data alignment factor above */ | ||
78 | .align 4 | ||
79 | .LENDCIEDLSI: | ||
80 | .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ | ||
81 | .LSTARTFDEDLSI: | ||
82 | .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ | ||
83 | .long .LSTART_vsyscall-. /* PC-relative start address */ | ||
84 | .long .LEND_vsyscall-.LSTART_vsyscall /* Size of the code range covered */ | ||
85 | .uleb128 0 /* Augmentation */ | ||
86 | /* What follows are the instructions for the table generation. | ||
87 | We have to record all changes of the stack pointer. */ | ||
88 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
89 | .long .Lpush_ecx-.LSTART_vsyscall | ||
90 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
91 | .byte 0x08 /* RA at offset 8 now */ | ||
92 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
93 | .long .Lpush_edx-.Lpush_ecx | ||
94 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
95 | .byte 0x0c /* RA at offset 12 now */ | ||
96 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
97 | .long .Lenter_kernel-.Lpush_edx | ||
98 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
99 | .byte 0x10 /* RA at offset 16 now */ | ||
100 | .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ | ||
101 | /* Finally the epilogue. */ | ||
102 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
103 | .long .Lpop_ebp-.Lenter_kernel | ||
104 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
105 | .byte 0x0c /* RA at offset 12 now */ | ||
106 | .byte 0xc5 /* DW_CFA_restore %ebp */ | ||
107 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
108 | .long .Lpop_edx-.Lpop_ebp | ||
109 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
110 | .byte 0x08 /* RA at offset 8 now */ | ||
111 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
112 | .long .Lpop_ecx-.Lpop_edx | ||
113 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
114 | .byte 0x04 /* RA at offset 4 now */ | ||
115 | .align 4 | ||
116 | .LENDFDEDLSI: | ||
117 | .previous | ||
118 | |||
119 | /* | ||
120 | * Get the common code for the sigreturn entry points. | ||
121 | */ | ||
122 | #include "vsyscall-sigreturn_32.S" | ||
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S new file mode 100644 index 000000000000..a5ab3dc4fd25 --- /dev/null +++ b/arch/x86/kernel/vsyscall_32.S | |||
@@ -0,0 +1,15 @@ | |||
1 | #include <linux/init.h> | ||
2 | |||
3 | __INITDATA | ||
4 | |||
5 | .globl vsyscall_int80_start, vsyscall_int80_end | ||
6 | vsyscall_int80_start: /* first byte of the embedded vsyscall-int80_32.so image */ | ||
7 | .incbin "arch/x86/kernel/vsyscall-int80_32.so" | ||
8 | vsyscall_int80_end: /* one past the last byte of that image */ | ||
9 | |||
10 | .globl vsyscall_sysenter_start, vsyscall_sysenter_end | ||
11 | vsyscall_sysenter_start: /* first byte of the embedded vsyscall-sysenter_32.so image */ | ||
12 | .incbin "arch/x86/kernel/vsyscall-sysenter_32.so" | ||
13 | vsyscall_sysenter_end: /* one past the last byte of that image */ | ||
14 | |||
15 | __FINIT | ||
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S new file mode 100644 index 000000000000..4a8b0ed9b8fb --- /dev/null +++ b/arch/x86/kernel/vsyscall_32.lds.S | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Linker script for vsyscall DSO. The vsyscall page is an ELF shared | ||
3 | * object prelinked to its virtual address, and with only one read-only | ||
4 | * segment (that fits in one page). This script controls its layout. | ||
5 | */ | ||
6 | #include <asm/asm-offsets.h> | ||
7 | |||
8 | SECTIONS | ||
9 | { | ||
10 | . = VDSO_PRELINK_asm + SIZEOF_HEADERS; | ||
11 | |||
12 | .hash : { *(.hash) } :text | ||
13 | .gnu.hash : { *(.gnu.hash) } | ||
14 | .dynsym : { *(.dynsym) } | ||
15 | .dynstr : { *(.dynstr) } | ||
16 | .gnu.version : { *(.gnu.version) } | ||
17 | .gnu.version_d : { *(.gnu.version_d) } | ||
18 | .gnu.version_r : { *(.gnu.version_r) } | ||
19 | |||
20 | /* This linker script is used both with -r and with -shared. | ||
21 | For the layouts to match, we need to skip more than enough | ||
22 | space for the dynamic symbol table et al. If this amount | ||
23 | is insufficient, ld -shared will barf. Just increase it here. */ | ||
24 | . = VDSO_PRELINK_asm + 0x400; | ||
25 | |||
26 | .text : { *(.text) } :text =0x90909090 | ||
27 | .note : { *(.note.*) } :text :note | ||
28 | .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr | ||
29 | .eh_frame : { KEEP (*(.eh_frame)) } :text | ||
30 | .dynamic : { *(.dynamic) } :text :dynamic | ||
31 | .useless : { | ||
32 | *(.got.plt) *(.got) | ||
33 | *(.data .data.* .gnu.linkonce.d.*) | ||
34 | *(.dynbss) | ||
35 | *(.bss .bss.* .gnu.linkonce.b.*) | ||
36 | } :text | ||
37 | } | ||
38 | |||
39 | /* | ||
40 | * We must supply the ELF program headers explicitly to get just one | ||
41 | * PT_LOAD segment, and set the flags explicitly to make segments read-only. | ||
42 | */ | ||
43 | PHDRS | ||
44 | { | ||
45 | text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ | ||
46 | dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ | ||
47 | note PT_NOTE FLAGS(4); /* PF_R */ | ||
48 | eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ | ||
49 | } | ||
50 | |||
51 | /* | ||
52 | * This controls what symbols we export from the DSO. | ||
53 | */ | ||
54 | VERSION | ||
55 | { | ||
56 | LINUX_2.5 { | ||
57 | global: | ||
58 | __kernel_vsyscall; | ||
59 | __kernel_sigreturn; | ||
60 | __kernel_rt_sigreturn; | ||
61 | |||
62 | local: *; | ||
63 | }; | ||
64 | } | ||
65 | |||
66 | /* The ELF entry point can be used to set the AT_SYSINFO value. */ | ||
67 | ENTRY(__kernel_vsyscall); | ||