diff options
Diffstat (limited to 'arch/x86_64')
151 files changed, 33 insertions, 38930 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index b4d9089a6a06..eb80f5aca54e 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig | |||
@@ -36,6 +36,18 @@ config GENERIC_CMOS_UPDATE | |||
36 | bool | 36 | bool |
37 | default y | 37 | default y |
38 | 38 | ||
39 | config CLOCKSOURCE_WATCHDOG | ||
40 | bool | ||
41 | default y | ||
42 | |||
43 | config GENERIC_CLOCKEVENTS | ||
44 | bool | ||
45 | default y | ||
46 | |||
47 | config GENERIC_CLOCKEVENTS_BROADCAST | ||
48 | bool | ||
49 | default y | ||
50 | |||
39 | config ZONE_DMA32 | 51 | config ZONE_DMA32 |
40 | bool | 52 | bool |
41 | default y | 53 | default y |
@@ -130,6 +142,8 @@ source "init/Kconfig" | |||
130 | 142 | ||
131 | menu "Processor type and features" | 143 | menu "Processor type and features" |
132 | 144 | ||
145 | source "kernel/time/Kconfig" | ||
146 | |||
133 | choice | 147 | choice |
134 | prompt "Subarchitecture Type" | 148 | prompt "Subarchitecture Type" |
135 | default X86_PC | 149 | default X86_PC |
@@ -704,7 +718,7 @@ source kernel/power/Kconfig | |||
704 | 718 | ||
705 | source "drivers/acpi/Kconfig" | 719 | source "drivers/acpi/Kconfig" |
706 | 720 | ||
707 | source "arch/x86_64/kernel/cpufreq/Kconfig" | 721 | source "arch/x86/kernel/cpufreq/Kconfig" |
708 | 722 | ||
709 | endmenu | 723 | endmenu |
710 | 724 | ||
@@ -778,7 +792,7 @@ source fs/Kconfig | |||
778 | menu "Instrumentation Support" | 792 | menu "Instrumentation Support" |
779 | depends on EXPERIMENTAL | 793 | depends on EXPERIMENTAL |
780 | 794 | ||
781 | source "arch/x86_64/oprofile/Kconfig" | 795 | source "arch/x86/oprofile/Kconfig" |
782 | 796 | ||
783 | config KPROBES | 797 | config KPROBES |
784 | bool "Kprobes" | 798 | bool "Kprobes" |
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index b024e4a86895..8bffb94c71b5 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile | |||
@@ -21,6 +21,9 @@ | |||
21 | # | 21 | # |
22 | # $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ | 22 | # $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ |
23 | 23 | ||
24 | # Fill in SRCARCH | ||
25 | SRCARCH := x86 | ||
26 | |||
24 | LDFLAGS := -m elf_x86_64 | 27 | LDFLAGS := -m elf_x86_64 |
25 | OBJCOPYFLAGS := -O binary -R .note -R .comment -S | 28 | OBJCOPYFLAGS := -O binary -R .note -R .comment -S |
26 | LDFLAGS_vmlinux := | 29 | LDFLAGS_vmlinux := |
@@ -71,18 +74,18 @@ CFLAGS += $(cflags-y) | |||
71 | CFLAGS_KERNEL += $(cflags-kernel-y) | 74 | CFLAGS_KERNEL += $(cflags-kernel-y) |
72 | AFLAGS += -m64 | 75 | AFLAGS += -m64 |
73 | 76 | ||
74 | head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o | 77 | head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task_64.o |
75 | 78 | ||
76 | libs-y += arch/x86_64/lib/ | 79 | libs-y += arch/x86/lib/ |
77 | core-y += arch/x86_64/kernel/ \ | 80 | core-y += arch/x86/kernel/ \ |
78 | arch/x86_64/mm/ \ | 81 | arch/x86/mm/ \ |
79 | arch/x86_64/crypto/ \ | 82 | arch/x86/crypto/ \ |
80 | arch/x86_64/vdso/ | 83 | arch/x86/vdso/ |
81 | core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ | 84 | core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ |
82 | drivers-$(CONFIG_PCI) += arch/x86_64/pci/ | 85 | drivers-$(CONFIG_PCI) += arch/x86/pci/ |
83 | drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ | 86 | drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ |
84 | 87 | ||
85 | boot := arch/x86_64/boot | 88 | boot := arch/x86/boot |
86 | 89 | ||
87 | PHONY += bzImage bzlilo install archmrproper \ | 90 | PHONY += bzImage bzlilo install archmrproper \ |
88 | fdimage fdimage144 fdimage288 isoimage archclean | 91 | fdimage fdimage144 fdimage288 isoimage archclean |
@@ -90,10 +93,12 @@ PHONY += bzImage bzlilo install archmrproper \ | |||
90 | #Default target when executing "make" | 93 | #Default target when executing "make" |
91 | all: bzImage | 94 | all: bzImage |
92 | 95 | ||
93 | BOOTIMAGE := arch/x86_64/boot/bzImage | 96 | BOOTIMAGE := arch/x86/boot/bzImage |
94 | KBUILD_IMAGE := $(BOOTIMAGE) | 97 | KBUILD_IMAGE := $(BOOTIMAGE) |
95 | 98 | ||
96 | bzImage: vmlinux | 99 | bzImage: vmlinux |
100 | $(Q)mkdir -p $(objtree)/arch/x86_64/boot | ||
101 | $(Q)ln -fsn $(objtree)/arch/x86/boot/bzImage $(objtree)/arch/x86_64/boot/bzImage | ||
97 | $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE) | 102 | $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE) |
98 | 103 | ||
99 | bzlilo: vmlinux | 104 | bzlilo: vmlinux |
@@ -109,6 +114,7 @@ install: | |||
109 | $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ | 114 | $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ |
110 | 115 | ||
111 | archclean: | 116 | archclean: |
117 | $(Q)rm -rf $(objtree)/arch/x86_64/boot | ||
112 | $(Q)$(MAKE) $(clean)=$(boot) | 118 | $(Q)$(MAKE) $(clean)=$(boot) |
113 | 119 | ||
114 | define archhelp | 120 | define archhelp |
diff --git a/arch/x86_64/boot/.gitignore b/arch/x86_64/boot/.gitignore deleted file mode 100644 index 18465143cfa2..000000000000 --- a/arch/x86_64/boot/.gitignore +++ /dev/null | |||
@@ -1,5 +0,0 @@ | |||
1 | bootsect | ||
2 | bzImage | ||
3 | setup | ||
4 | setup.bin | ||
5 | setup.elf | ||
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile deleted file mode 100644 index 67096389de1f..000000000000 --- a/arch/x86_64/boot/Makefile +++ /dev/null | |||
@@ -1,9 +0,0 @@ | |||
1 | # | ||
2 | # arch/x86_64/boot/Makefile | ||
3 | # | ||
4 | # The actual boot code is shared with i386 including the Makefile. | ||
5 | # So tell kbuild that we fetch the code from i386 and include the | ||
6 | # Makefile from i386 too. | ||
7 | |||
8 | src := arch/i386/boot | ||
9 | include $(src)/Makefile | ||
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile deleted file mode 100644 index 877c0bdbbc67..000000000000 --- a/arch/x86_64/boot/compressed/Makefile +++ /dev/null | |||
@@ -1,30 +0,0 @@ | |||
1 | # | ||
2 | # linux/arch/x86_64/boot/compressed/Makefile | ||
3 | # | ||
4 | # create a compressed vmlinux image from the original vmlinux | ||
5 | # | ||
6 | |||
7 | targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o | ||
8 | |||
9 | CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \ | ||
10 | -fno-strict-aliasing -fPIC -mcmodel=small \ | ||
11 | $(call cc-option, -ffreestanding) \ | ||
12 | $(call cc-option, -fno-stack-protector) | ||
13 | AFLAGS := $(CFLAGS) -D__ASSEMBLY__ | ||
14 | LDFLAGS := -m elf_x86_64 | ||
15 | |||
16 | LDFLAGS_vmlinux := -T | ||
17 | $(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE | ||
18 | $(call if_changed,ld) | ||
19 | @: | ||
20 | |||
21 | $(obj)/vmlinux.bin: vmlinux FORCE | ||
22 | $(call if_changed,objcopy) | ||
23 | |||
24 | $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE | ||
25 | $(call if_changed,gzip) | ||
26 | |||
27 | LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T | ||
28 | |||
29 | $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE | ||
30 | $(call if_changed,ld) | ||
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S deleted file mode 100644 index 9fd8030cc54f..000000000000 --- a/arch/x86_64/boot/compressed/head.S +++ /dev/null | |||
@@ -1,311 +0,0 @@ | |||
1 | /* | ||
2 | * linux/boot/head.S | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992, 1993 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * head.S contains the 32-bit startup code. | ||
9 | * | ||
10 | * NOTE!!! Startup happens at absolute address 0x00001000, which is also where | ||
11 | * the page directory will exist. The startup code will be overwritten by | ||
12 | * the page directory. [According to comments etc elsewhere on a compressed | ||
13 | * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] | ||
14 | * | ||
15 | * Page 0 is deliberately kept safe, since System Management Mode code in | ||
16 | * laptops may need to access the BIOS data stored there. This is also | ||
17 | * useful for future device drivers that either access the BIOS via VM86 | ||
18 | * mode. | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | ||
23 | */ | ||
24 | .code32 | ||
25 | .text | ||
26 | |||
27 | #include <linux/linkage.h> | ||
28 | #include <asm/segment.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/page.h> | ||
31 | #include <asm/msr.h> | ||
32 | |||
33 | .section ".text.head" | ||
34 | .code32 | ||
35 | .globl startup_32 | ||
36 | |||
37 | startup_32: | ||
38 | cld | ||
39 | cli | ||
40 | movl $(__KERNEL_DS), %eax | ||
41 | movl %eax, %ds | ||
42 | movl %eax, %es | ||
43 | movl %eax, %ss | ||
44 | |||
45 | /* Calculate the delta between where we were compiled to run | ||
46 | * at and where we were actually loaded at. This can only be done | ||
47 | * with a short local call on x86. Nothing else will tell us what | ||
48 | * address we are running at. The reserved chunk of the real-mode | ||
49 | * data at 0x1e4 (defined as a scratch field) are used as the stack | ||
50 | * for this calculation. Only 4 bytes are needed. | ||
51 | */ | ||
52 | leal (0x1e4+4)(%esi), %esp | ||
53 | call 1f | ||
54 | 1: popl %ebp | ||
55 | subl $1b, %ebp | ||
56 | |||
57 | /* setup a stack and make sure cpu supports long mode. */ | ||
58 | movl $user_stack_end, %eax | ||
59 | addl %ebp, %eax | ||
60 | movl %eax, %esp | ||
61 | |||
62 | call verify_cpu | ||
63 | testl %eax, %eax | ||
64 | jnz no_longmode | ||
65 | |||
66 | /* Compute the delta between where we were compiled to run at | ||
67 | * and where the code will actually run at. | ||
68 | */ | ||
69 | /* %ebp contains the address we are loaded at by the boot loader and %ebx | ||
70 | * contains the address where we should move the kernel image temporarily | ||
71 | * for safe in-place decompression. | ||
72 | */ | ||
73 | |||
74 | #ifdef CONFIG_RELOCATABLE | ||
75 | movl %ebp, %ebx | ||
76 | addl $(LARGE_PAGE_SIZE -1), %ebx | ||
77 | andl $LARGE_PAGE_MASK, %ebx | ||
78 | #else | ||
79 | movl $CONFIG_PHYSICAL_START, %ebx | ||
80 | #endif | ||
81 | |||
82 | /* Replace the compressed data size with the uncompressed size */ | ||
83 | subl input_len(%ebp), %ebx | ||
84 | movl output_len(%ebp), %eax | ||
85 | addl %eax, %ebx | ||
86 | /* Add 8 bytes for every 32K input block */ | ||
87 | shrl $12, %eax | ||
88 | addl %eax, %ebx | ||
89 | /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ | ||
90 | addl $(32768 + 18 + 4095), %ebx | ||
91 | andl $~4095, %ebx | ||
92 | |||
93 | /* | ||
94 | * Prepare for entering 64 bit mode | ||
95 | */ | ||
96 | |||
97 | /* Load new GDT with the 64bit segments using 32bit descriptor */ | ||
98 | leal gdt(%ebp), %eax | ||
99 | movl %eax, gdt+2(%ebp) | ||
100 | lgdt gdt(%ebp) | ||
101 | |||
102 | /* Enable PAE mode */ | ||
103 | xorl %eax, %eax | ||
104 | orl $(1 << 5), %eax | ||
105 | movl %eax, %cr4 | ||
106 | |||
107 | /* | ||
108 | * Build early 4G boot pagetable | ||
109 | */ | ||
110 | /* Initialize Page tables to 0*/ | ||
111 | leal pgtable(%ebx), %edi | ||
112 | xorl %eax, %eax | ||
113 | movl $((4096*6)/4), %ecx | ||
114 | rep stosl | ||
115 | |||
116 | /* Build Level 4 */ | ||
117 | leal pgtable + 0(%ebx), %edi | ||
118 | leal 0x1007 (%edi), %eax | ||
119 | movl %eax, 0(%edi) | ||
120 | |||
121 | /* Build Level 3 */ | ||
122 | leal pgtable + 0x1000(%ebx), %edi | ||
123 | leal 0x1007(%edi), %eax | ||
124 | movl $4, %ecx | ||
125 | 1: movl %eax, 0x00(%edi) | ||
126 | addl $0x00001000, %eax | ||
127 | addl $8, %edi | ||
128 | decl %ecx | ||
129 | jnz 1b | ||
130 | |||
131 | /* Build Level 2 */ | ||
132 | leal pgtable + 0x2000(%ebx), %edi | ||
133 | movl $0x00000183, %eax | ||
134 | movl $2048, %ecx | ||
135 | 1: movl %eax, 0(%edi) | ||
136 | addl $0x00200000, %eax | ||
137 | addl $8, %edi | ||
138 | decl %ecx | ||
139 | jnz 1b | ||
140 | |||
141 | /* Enable the boot page tables */ | ||
142 | leal pgtable(%ebx), %eax | ||
143 | movl %eax, %cr3 | ||
144 | |||
145 | /* Enable Long mode in EFER (Extended Feature Enable Register) */ | ||
146 | movl $MSR_EFER, %ecx | ||
147 | rdmsr | ||
148 | btsl $_EFER_LME, %eax | ||
149 | wrmsr | ||
150 | |||
151 | /* Setup for the jump to 64bit mode | ||
152 | * | ||
153 | * When the jump is performend we will be in long mode but | ||
154 | * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 | ||
155 | * (and in turn EFER.LMA = 1). To jump into 64bit mode we use | ||
156 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
157 | * We place all of the values on our mini stack so lret can | ||
158 | * used to perform that far jump. | ||
159 | */ | ||
160 | pushl $__KERNEL_CS | ||
161 | leal startup_64(%ebp), %eax | ||
162 | pushl %eax | ||
163 | |||
164 | /* Enter paged protected Mode, activating Long Mode */ | ||
165 | movl $0x80000001, %eax /* Enable Paging and Protected mode */ | ||
166 | movl %eax, %cr0 | ||
167 | |||
168 | /* Jump from 32bit compatibility mode into 64bit mode. */ | ||
169 | lret | ||
170 | |||
171 | no_longmode: | ||
172 | /* This isn't an x86-64 CPU so hang */ | ||
173 | 1: | ||
174 | hlt | ||
175 | jmp 1b | ||
176 | |||
177 | #include "../../kernel/verify_cpu.S" | ||
178 | |||
179 | /* Be careful here startup_64 needs to be at a predictable | ||
180 | * address so I can export it in an ELF header. Bootloaders | ||
181 | * should look at the ELF header to find this address, as | ||
182 | * it may change in the future. | ||
183 | */ | ||
184 | .code64 | ||
185 | .org 0x200 | ||
186 | ENTRY(startup_64) | ||
187 | /* We come here either from startup_32 or directly from a | ||
188 | * 64bit bootloader. If we come here from a bootloader we depend on | ||
189 | * an identity mapped page table being provied that maps our | ||
190 | * entire text+data+bss and hopefully all of memory. | ||
191 | */ | ||
192 | |||
193 | /* Setup data segments. */ | ||
194 | xorl %eax, %eax | ||
195 | movl %eax, %ds | ||
196 | movl %eax, %es | ||
197 | movl %eax, %ss | ||
198 | movl %eax, %fs | ||
199 | movl %eax, %gs | ||
200 | lldt %ax | ||
201 | movl $0x20, %eax | ||
202 | ltr %ax | ||
203 | |||
204 | /* Compute the decompressed kernel start address. It is where | ||
205 | * we were loaded at aligned to a 2M boundary. %rbp contains the | ||
206 | * decompressed kernel start address. | ||
207 | * | ||
208 | * If it is a relocatable kernel then decompress and run the kernel | ||
209 | * from load address aligned to 2MB addr, otherwise decompress and | ||
210 | * run the kernel from CONFIG_PHYSICAL_START | ||
211 | */ | ||
212 | |||
213 | /* Start with the delta to where the kernel will run at. */ | ||
214 | #ifdef CONFIG_RELOCATABLE | ||
215 | leaq startup_32(%rip) /* - $startup_32 */, %rbp | ||
216 | addq $(LARGE_PAGE_SIZE - 1), %rbp | ||
217 | andq $LARGE_PAGE_MASK, %rbp | ||
218 | movq %rbp, %rbx | ||
219 | #else | ||
220 | movq $CONFIG_PHYSICAL_START, %rbp | ||
221 | movq %rbp, %rbx | ||
222 | #endif | ||
223 | |||
224 | /* Replace the compressed data size with the uncompressed size */ | ||
225 | movl input_len(%rip), %eax | ||
226 | subq %rax, %rbx | ||
227 | movl output_len(%rip), %eax | ||
228 | addq %rax, %rbx | ||
229 | /* Add 8 bytes for every 32K input block */ | ||
230 | shrq $12, %rax | ||
231 | addq %rax, %rbx | ||
232 | /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ | ||
233 | addq $(32768 + 18 + 4095), %rbx | ||
234 | andq $~4095, %rbx | ||
235 | |||
236 | /* Copy the compressed kernel to the end of our buffer | ||
237 | * where decompression in place becomes safe. | ||
238 | */ | ||
239 | leaq _end(%rip), %r8 | ||
240 | leaq _end(%rbx), %r9 | ||
241 | movq $_end /* - $startup_32 */, %rcx | ||
242 | 1: subq $8, %r8 | ||
243 | subq $8, %r9 | ||
244 | movq 0(%r8), %rax | ||
245 | movq %rax, 0(%r9) | ||
246 | subq $8, %rcx | ||
247 | jnz 1b | ||
248 | |||
249 | /* | ||
250 | * Jump to the relocated address. | ||
251 | */ | ||
252 | leaq relocated(%rbx), %rax | ||
253 | jmp *%rax | ||
254 | |||
255 | .section ".text" | ||
256 | relocated: | ||
257 | |||
258 | /* | ||
259 | * Clear BSS | ||
260 | */ | ||
261 | xorq %rax, %rax | ||
262 | leaq _edata(%rbx), %rdi | ||
263 | leaq _end(%rbx), %rcx | ||
264 | subq %rdi, %rcx | ||
265 | cld | ||
266 | rep | ||
267 | stosb | ||
268 | |||
269 | /* Setup the stack */ | ||
270 | leaq user_stack_end(%rip), %rsp | ||
271 | |||
272 | /* zero EFLAGS after setting rsp */ | ||
273 | pushq $0 | ||
274 | popfq | ||
275 | |||
276 | /* | ||
277 | * Do the decompression, and jump to the new kernel.. | ||
278 | */ | ||
279 | pushq %rsi # Save the real mode argument | ||
280 | movq %rsi, %rdi # real mode address | ||
281 | leaq _heap(%rip), %rsi # _heap | ||
282 | leaq input_data(%rip), %rdx # input_data | ||
283 | movl input_len(%rip), %eax | ||
284 | movq %rax, %rcx # input_len | ||
285 | movq %rbp, %r8 # output | ||
286 | call decompress_kernel | ||
287 | popq %rsi | ||
288 | |||
289 | |||
290 | /* | ||
291 | * Jump to the decompressed kernel. | ||
292 | */ | ||
293 | jmp *%rbp | ||
294 | |||
295 | .data | ||
296 | gdt: | ||
297 | .word gdt_end - gdt | ||
298 | .long gdt | ||
299 | .word 0 | ||
300 | .quad 0x0000000000000000 /* NULL descriptor */ | ||
301 | .quad 0x00af9a000000ffff /* __KERNEL_CS */ | ||
302 | .quad 0x00cf92000000ffff /* __KERNEL_DS */ | ||
303 | .quad 0x0080890000000000 /* TS descriptor */ | ||
304 | .quad 0x0000000000000000 /* TS continued */ | ||
305 | gdt_end: | ||
306 | .bss | ||
307 | /* Stack for uncompression */ | ||
308 | .balign 4 | ||
309 | user_stack: | ||
310 | .fill 4096,4,0 | ||
311 | user_stack_end: | ||
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c deleted file mode 100644 index f932b0e89096..000000000000 --- a/arch/x86_64/boot/compressed/misc.c +++ /dev/null | |||
@@ -1,371 +0,0 @@ | |||
1 | /* | ||
2 | * misc.c | ||
3 | * | ||
4 | * This is a collection of several routines from gzip-1.0.3 | ||
5 | * adapted for Linux. | ||
6 | * | ||
7 | * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 | ||
8 | * puts by Nick Holloway 1993, better puts by Martin Mares 1995 | ||
9 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | ||
10 | */ | ||
11 | |||
12 | #define _LINUX_STRING_H_ 1 | ||
13 | #define __LINUX_BITMAP_H 1 | ||
14 | |||
15 | #include <linux/linkage.h> | ||
16 | #include <linux/screen_info.h> | ||
17 | #include <asm/io.h> | ||
18 | #include <asm/page.h> | ||
19 | |||
20 | /* WARNING!! | ||
21 | * This code is compiled with -fPIC and it is relocated dynamically | ||
22 | * at run time, but no relocation processing is performed. | ||
23 | * This means that it is not safe to place pointers in static structures. | ||
24 | */ | ||
25 | |||
26 | /* | ||
27 | * Getting to provable safe in place decompression is hard. | ||
28 | * Worst case behaviours need to be analized. | ||
29 | * Background information: | ||
30 | * | ||
31 | * The file layout is: | ||
32 | * magic[2] | ||
33 | * method[1] | ||
34 | * flags[1] | ||
35 | * timestamp[4] | ||
36 | * extraflags[1] | ||
37 | * os[1] | ||
38 | * compressed data blocks[N] | ||
39 | * crc[4] orig_len[4] | ||
40 | * | ||
41 | * resulting in 18 bytes of non compressed data overhead. | ||
42 | * | ||
43 | * Files divided into blocks | ||
44 | * 1 bit (last block flag) | ||
45 | * 2 bits (block type) | ||
46 | * | ||
47 | * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. | ||
48 | * The smallest block type encoding is always used. | ||
49 | * | ||
50 | * stored: | ||
51 | * 32 bits length in bytes. | ||
52 | * | ||
53 | * fixed: | ||
54 | * magic fixed tree. | ||
55 | * symbols. | ||
56 | * | ||
57 | * dynamic: | ||
58 | * dynamic tree encoding. | ||
59 | * symbols. | ||
60 | * | ||
61 | * | ||
62 | * The buffer for decompression in place is the length of the | ||
63 | * uncompressed data, plus a small amount extra to keep the algorithm safe. | ||
64 | * The compressed data is placed at the end of the buffer. The output | ||
65 | * pointer is placed at the start of the buffer and the input pointer | ||
66 | * is placed where the compressed data starts. Problems will occur | ||
67 | * when the output pointer overruns the input pointer. | ||
68 | * | ||
69 | * The output pointer can only overrun the input pointer if the input | ||
70 | * pointer is moving faster than the output pointer. A condition only | ||
71 | * triggered by data whose compressed form is larger than the uncompressed | ||
72 | * form. | ||
73 | * | ||
74 | * The worst case at the block level is a growth of the compressed data | ||
75 | * of 5 bytes per 32767 bytes. | ||
76 | * | ||
77 | * The worst case internal to a compressed block is very hard to figure. | ||
78 | * The worst case can at least be boundined by having one bit that represents | ||
79 | * 32764 bytes and then all of the rest of the bytes representing the very | ||
80 | * very last byte. | ||
81 | * | ||
82 | * All of which is enough to compute an amount of extra data that is required | ||
83 | * to be safe. To avoid problems at the block level allocating 5 extra bytes | ||
84 | * per 32767 bytes of data is sufficient. To avoind problems internal to a block | ||
85 | * adding an extra 32767 bytes (the worst case uncompressed block size) is | ||
86 | * sufficient, to ensure that in the worst case the decompressed data for | ||
87 | * block will stop the byte before the compressed data for a block begins. | ||
88 | * To avoid problems with the compressed data's meta information an extra 18 | ||
89 | * bytes are needed. Leading to the formula: | ||
90 | * | ||
91 | * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. | ||
92 | * | ||
93 | * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. | ||
94 | * Adding 32768 instead of 32767 just makes for round numbers. | ||
95 | * Adding the decompressor_size is necessary as it musht live after all | ||
96 | * of the data as well. Last I measured the decompressor is about 14K. | ||
97 | * 10K of actuall data and 4K of bss. | ||
98 | * | ||
99 | */ | ||
100 | |||
101 | /* | ||
102 | * gzip declarations | ||
103 | */ | ||
104 | |||
105 | #define OF(args) args | ||
106 | #define STATIC static | ||
107 | |||
108 | #undef memset | ||
109 | #undef memcpy | ||
110 | #define memzero(s, n) memset ((s), 0, (n)) | ||
111 | |||
112 | typedef unsigned char uch; | ||
113 | typedef unsigned short ush; | ||
114 | typedef unsigned long ulg; | ||
115 | |||
116 | #define WSIZE 0x80000000 /* Window size must be at least 32k, | ||
117 | * and a power of two | ||
118 | * We don't actually have a window just | ||
119 | * a huge output buffer so I report | ||
120 | * a 2G windows size, as that should | ||
121 | * always be larger than our output buffer. | ||
122 | */ | ||
123 | |||
124 | static uch *inbuf; /* input buffer */ | ||
125 | static uch *window; /* Sliding window buffer, (and final output buffer) */ | ||
126 | |||
127 | static unsigned insize; /* valid bytes in inbuf */ | ||
128 | static unsigned inptr; /* index of next byte to be processed in inbuf */ | ||
129 | static unsigned outcnt; /* bytes in output buffer */ | ||
130 | |||
131 | /* gzip flag byte */ | ||
132 | #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ | ||
133 | #define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ | ||
134 | #define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ | ||
135 | #define ORIG_NAME 0x08 /* bit 3 set: original file name present */ | ||
136 | #define COMMENT 0x10 /* bit 4 set: file comment present */ | ||
137 | #define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ | ||
138 | #define RESERVED 0xC0 /* bit 6,7: reserved */ | ||
139 | |||
140 | #define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) | ||
141 | |||
142 | /* Diagnostic functions */ | ||
143 | #ifdef DEBUG | ||
144 | # define Assert(cond,msg) {if(!(cond)) error(msg);} | ||
145 | # define Trace(x) fprintf x | ||
146 | # define Tracev(x) {if (verbose) fprintf x ;} | ||
147 | # define Tracevv(x) {if (verbose>1) fprintf x ;} | ||
148 | # define Tracec(c,x) {if (verbose && (c)) fprintf x ;} | ||
149 | # define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} | ||
150 | #else | ||
151 | # define Assert(cond,msg) | ||
152 | # define Trace(x) | ||
153 | # define Tracev(x) | ||
154 | # define Tracevv(x) | ||
155 | # define Tracec(c,x) | ||
156 | # define Tracecv(c,x) | ||
157 | #endif | ||
158 | |||
159 | static int fill_inbuf(void); | ||
160 | static void flush_window(void); | ||
161 | static void error(char *m); | ||
162 | static void gzip_mark(void **); | ||
163 | static void gzip_release(void **); | ||
164 | |||
165 | /* | ||
166 | * This is set up by the setup-routine at boot-time | ||
167 | */ | ||
168 | static unsigned char *real_mode; /* Pointer to real-mode data */ | ||
169 | |||
170 | #define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) | ||
171 | #ifndef STANDARD_MEMORY_BIOS_CALL | ||
172 | #define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) | ||
173 | #endif | ||
174 | #define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) | ||
175 | |||
176 | extern unsigned char input_data[]; | ||
177 | extern int input_len; | ||
178 | |||
179 | static long bytes_out = 0; | ||
180 | |||
181 | static void *malloc(int size); | ||
182 | static void free(void *where); | ||
183 | |||
184 | static void *memset(void *s, int c, unsigned n); | ||
185 | static void *memcpy(void *dest, const void *src, unsigned n); | ||
186 | |||
187 | static void putstr(const char *); | ||
188 | |||
189 | static long free_mem_ptr; | ||
190 | static long free_mem_end_ptr; | ||
191 | |||
192 | #define HEAP_SIZE 0x7000 | ||
193 | |||
194 | static char *vidmem = (char *)0xb8000; | ||
195 | static int vidport; | ||
196 | static int lines, cols; | ||
197 | |||
198 | #include "../../../../lib/inflate.c" | ||
199 | |||
200 | static void *malloc(int size) | ||
201 | { | ||
202 | void *p; | ||
203 | |||
204 | if (size <0) error("Malloc error"); | ||
205 | if (free_mem_ptr <= 0) error("Memory error"); | ||
206 | |||
207 | free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ | ||
208 | |||
209 | p = (void *)free_mem_ptr; | ||
210 | free_mem_ptr += size; | ||
211 | |||
212 | if (free_mem_ptr >= free_mem_end_ptr) | ||
213 | error("Out of memory"); | ||
214 | |||
215 | return p; | ||
216 | } | ||
217 | |||
218 | static void free(void *where) | ||
219 | { /* Don't care */ | ||
220 | } | ||
221 | |||
222 | static void gzip_mark(void **ptr) | ||
223 | { | ||
224 | *ptr = (void *) free_mem_ptr; | ||
225 | } | ||
226 | |||
227 | static void gzip_release(void **ptr) | ||
228 | { | ||
229 | free_mem_ptr = (long) *ptr; | ||
230 | } | ||
231 | |||
232 | static void scroll(void) | ||
233 | { | ||
234 | int i; | ||
235 | |||
236 | memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); | ||
237 | for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 ) | ||
238 | vidmem[i] = ' '; | ||
239 | } | ||
240 | |||
241 | static void putstr(const char *s) | ||
242 | { | ||
243 | int x,y,pos; | ||
244 | char c; | ||
245 | |||
246 | x = RM_SCREEN_INFO.orig_x; | ||
247 | y = RM_SCREEN_INFO.orig_y; | ||
248 | |||
249 | while ( ( c = *s++ ) != '\0' ) { | ||
250 | if ( c == '\n' ) { | ||
251 | x = 0; | ||
252 | if ( ++y >= lines ) { | ||
253 | scroll(); | ||
254 | y--; | ||
255 | } | ||
256 | } else { | ||
257 | vidmem [ ( x + cols * y ) * 2 ] = c; | ||
258 | if ( ++x >= cols ) { | ||
259 | x = 0; | ||
260 | if ( ++y >= lines ) { | ||
261 | scroll(); | ||
262 | y--; | ||
263 | } | ||
264 | } | ||
265 | } | ||
266 | } | ||
267 | |||
268 | RM_SCREEN_INFO.orig_x = x; | ||
269 | RM_SCREEN_INFO.orig_y = y; | ||
270 | |||
271 | pos = (x + cols * y) * 2; /* Update cursor position */ | ||
272 | outb_p(14, vidport); | ||
273 | outb_p(0xff & (pos >> 9), vidport+1); | ||
274 | outb_p(15, vidport); | ||
275 | outb_p(0xff & (pos >> 1), vidport+1); | ||
276 | } | ||
277 | |||
278 | static void* memset(void* s, int c, unsigned n) | ||
279 | { | ||
280 | int i; | ||
281 | char *ss = (char*)s; | ||
282 | |||
283 | for (i=0;i<n;i++) ss[i] = c; | ||
284 | return s; | ||
285 | } | ||
286 | |||
287 | static void* memcpy(void* dest, const void* src, unsigned n) | ||
288 | { | ||
289 | int i; | ||
290 | char *d = (char *)dest, *s = (char *)src; | ||
291 | |||
292 | for (i=0;i<n;i++) d[i] = s[i]; | ||
293 | return dest; | ||
294 | } | ||
295 | |||
296 | /* =========================================================================== | ||
297 | * Fill the input buffer. This is called only when the buffer is empty | ||
298 | * and at least one byte is really needed. | ||
299 | */ | ||
300 | static int fill_inbuf(void) | ||
301 | { | ||
302 | error("ran out of input data"); | ||
303 | return 0; | ||
304 | } | ||
305 | |||
306 | /* =========================================================================== | ||
307 | * Write the output window window[0..outcnt-1] and update crc and bytes_out. | ||
308 | * (Used for the decompressed data only.) | ||
309 | */ | ||
310 | static void flush_window(void) | ||
311 | { | ||
312 | /* With my window equal to my output buffer | ||
313 | * I only need to compute the crc here. | ||
314 | */ | ||
315 | ulg c = crc; /* temporary variable */ | ||
316 | unsigned n; | ||
317 | uch *in, ch; | ||
318 | |||
319 | in = window; | ||
320 | for (n = 0; n < outcnt; n++) { | ||
321 | ch = *in++; | ||
322 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); | ||
323 | } | ||
324 | crc = c; | ||
325 | bytes_out += (ulg)outcnt; | ||
326 | outcnt = 0; | ||
327 | } | ||
328 | |||
329 | static void error(char *x) | ||
330 | { | ||
331 | putstr("\n\n"); | ||
332 | putstr(x); | ||
333 | putstr("\n\n -- System halted"); | ||
334 | |||
335 | while(1); /* Halt */ | ||
336 | } | ||
337 | |||
338 | asmlinkage void decompress_kernel(void *rmode, unsigned long heap, | ||
339 | uch *input_data, unsigned long input_len, uch *output) | ||
340 | { | ||
341 | real_mode = rmode; | ||
342 | |||
343 | if (RM_SCREEN_INFO.orig_video_mode == 7) { | ||
344 | vidmem = (char *) 0xb0000; | ||
345 | vidport = 0x3b4; | ||
346 | } else { | ||
347 | vidmem = (char *) 0xb8000; | ||
348 | vidport = 0x3d4; | ||
349 | } | ||
350 | |||
351 | lines = RM_SCREEN_INFO.orig_video_lines; | ||
352 | cols = RM_SCREEN_INFO.orig_video_cols; | ||
353 | |||
354 | window = output; /* Output buffer (Normally at 1M) */ | ||
355 | free_mem_ptr = heap; /* Heap */ | ||
356 | free_mem_end_ptr = heap + HEAP_SIZE; | ||
357 | inbuf = input_data; /* Input buffer */ | ||
358 | insize = input_len; | ||
359 | inptr = 0; | ||
360 | |||
361 | if ((ulg)output & (__KERNEL_ALIGN - 1)) | ||
362 | error("Destination address not 2M aligned"); | ||
363 | if ((ulg)output >= 0xffffffffffUL) | ||
364 | error("Destination address too large"); | ||
365 | |||
366 | makecrc(); | ||
367 | putstr(".\nDecompressing Linux..."); | ||
368 | gunzip(); | ||
369 | putstr("done.\nBooting the kernel.\n"); | ||
370 | return; | ||
371 | } | ||
diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds deleted file mode 100644 index 94c13e557fb4..000000000000 --- a/arch/x86_64/boot/compressed/vmlinux.lds +++ /dev/null | |||
@@ -1,44 +0,0 @@ | |||
1 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | ||
2 | OUTPUT_ARCH(i386:x86-64) | ||
3 | ENTRY(startup_64) | ||
4 | SECTIONS | ||
5 | { | ||
6 | /* Be careful parts of head.S assume startup_32 is at | ||
7 | * address 0. | ||
8 | */ | ||
9 | . = 0; | ||
10 | .text : { | ||
11 | _head = . ; | ||
12 | *(.text.head) | ||
13 | _ehead = . ; | ||
14 | *(.text.compressed) | ||
15 | _text = .; /* Text */ | ||
16 | *(.text) | ||
17 | *(.text.*) | ||
18 | _etext = . ; | ||
19 | } | ||
20 | .rodata : { | ||
21 | _rodata = . ; | ||
22 | *(.rodata) /* read-only data */ | ||
23 | *(.rodata.*) | ||
24 | _erodata = . ; | ||
25 | } | ||
26 | .data : { | ||
27 | _data = . ; | ||
28 | *(.data) | ||
29 | *(.data.*) | ||
30 | _edata = . ; | ||
31 | } | ||
32 | .bss : { | ||
33 | _bss = . ; | ||
34 | *(.bss) | ||
35 | *(.bss.*) | ||
36 | *(COMMON) | ||
37 | . = ALIGN(8); | ||
38 | _end = . ; | ||
39 | . = ALIGN(4096); | ||
40 | pgtable = . ; | ||
41 | . = . + 4096 * 6; | ||
42 | _heap = .; | ||
43 | } | ||
44 | } | ||
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr deleted file mode 100644 index bd1429ce193e..000000000000 --- a/arch/x86_64/boot/compressed/vmlinux.scr +++ /dev/null | |||
@@ -1,10 +0,0 @@ | |||
1 | SECTIONS | ||
2 | { | ||
3 | .text.compressed : { | ||
4 | input_len = .; | ||
5 | LONG(input_data_end - input_data) input_data = .; | ||
6 | *(.data) | ||
7 | output_len = . - 4; | ||
8 | input_data_end = .; | ||
9 | } | ||
10 | } | ||
diff --git a/arch/x86_64/boot/tools/.gitignore b/arch/x86_64/boot/tools/.gitignore deleted file mode 100644 index 378eac25d311..000000000000 --- a/arch/x86_64/boot/tools/.gitignore +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | build | ||
diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile deleted file mode 100644 index 15b538a8b7f7..000000000000 --- a/arch/x86_64/crypto/Makefile +++ /dev/null | |||
@@ -1,12 +0,0 @@ | |||
1 | # | ||
2 | # x86_64/crypto/Makefile | ||
3 | # | ||
4 | # Arch-specific CryptoAPI modules. | ||
5 | # | ||
6 | |||
7 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o | ||
8 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o | ||
9 | |||
10 | aes-x86_64-y := aes-x86_64-asm.o aes.o | ||
11 | twofish-x86_64-y := twofish-x86_64-asm.o twofish.o | ||
12 | |||
diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S deleted file mode 100644 index 26b40de4d0b0..000000000000 --- a/arch/x86_64/crypto/aes-x86_64-asm.S +++ /dev/null | |||
@@ -1,190 +0,0 @@ | |||
1 | /* AES (Rijndael) implementation (FIPS PUB 197) for x86_64 | ||
2 | * | ||
3 | * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de> | ||
4 | * | ||
5 | * License: | ||
6 | * This code can be distributed under the terms of the GNU General Public | ||
7 | * License (GPL) Version 2 provided that the above header down to and | ||
8 | * including this sentence is retained in full. | ||
9 | */ | ||
10 | |||
11 | .extern aes_ft_tab | ||
12 | .extern aes_it_tab | ||
13 | .extern aes_fl_tab | ||
14 | .extern aes_il_tab | ||
15 | |||
16 | .text | ||
17 | |||
18 | #include <asm/asm-offsets.h> | ||
19 | |||
20 | #define BASE crypto_tfm_ctx_offset | ||
21 | |||
22 | #define R1 %rax | ||
23 | #define R1E %eax | ||
24 | #define R1X %ax | ||
25 | #define R1H %ah | ||
26 | #define R1L %al | ||
27 | #define R2 %rbx | ||
28 | #define R2E %ebx | ||
29 | #define R2X %bx | ||
30 | #define R2H %bh | ||
31 | #define R2L %bl | ||
32 | #define R3 %rcx | ||
33 | #define R3E %ecx | ||
34 | #define R3X %cx | ||
35 | #define R3H %ch | ||
36 | #define R3L %cl | ||
37 | #define R4 %rdx | ||
38 | #define R4E %edx | ||
39 | #define R4X %dx | ||
40 | #define R4H %dh | ||
41 | #define R4L %dl | ||
42 | #define R5 %rsi | ||
43 | #define R5E %esi | ||
44 | #define R6 %rdi | ||
45 | #define R6E %edi | ||
46 | #define R7 %rbp | ||
47 | #define R7E %ebp | ||
48 | #define R8 %r8 | ||
49 | #define R9 %r9 | ||
50 | #define R10 %r10 | ||
51 | #define R11 %r11 | ||
52 | |||
53 | #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ | ||
54 | .global FUNC; \ | ||
55 | .type FUNC,@function; \ | ||
56 | .align 8; \ | ||
57 | FUNC: movq r1,r2; \ | ||
58 | movq r3,r4; \ | ||
59 | leaq BASE+KEY+52(r8),r9; \ | ||
60 | movq r10,r11; \ | ||
61 | movl (r7),r5 ## E; \ | ||
62 | movl 4(r7),r1 ## E; \ | ||
63 | movl 8(r7),r6 ## E; \ | ||
64 | movl 12(r7),r7 ## E; \ | ||
65 | movl BASE(r8),r10 ## E; \ | ||
66 | xorl -48(r9),r5 ## E; \ | ||
67 | xorl -44(r9),r1 ## E; \ | ||
68 | xorl -40(r9),r6 ## E; \ | ||
69 | xorl -36(r9),r7 ## E; \ | ||
70 | cmpl $24,r10 ## E; \ | ||
71 | jb B128; \ | ||
72 | leaq 32(r9),r9; \ | ||
73 | je B192; \ | ||
74 | leaq 32(r9),r9; | ||
75 | |||
76 | #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \ | ||
77 | movq r1,r2; \ | ||
78 | movq r3,r4; \ | ||
79 | movl r5 ## E,(r9); \ | ||
80 | movl r6 ## E,4(r9); \ | ||
81 | movl r7 ## E,8(r9); \ | ||
82 | movl r8 ## E,12(r9); \ | ||
83 | ret; | ||
84 | |||
85 | #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ | ||
86 | movzbl r2 ## H,r5 ## E; \ | ||
87 | movzbl r2 ## L,r6 ## E; \ | ||
88 | movl TAB+1024(,r5,4),r5 ## E;\ | ||
89 | movw r4 ## X,r2 ## X; \ | ||
90 | movl TAB(,r6,4),r6 ## E; \ | ||
91 | roll $16,r2 ## E; \ | ||
92 | shrl $16,r4 ## E; \ | ||
93 | movzbl r4 ## H,r7 ## E; \ | ||
94 | movzbl r4 ## L,r4 ## E; \ | ||
95 | xorl OFFSET(r8),ra ## E; \ | ||
96 | xorl OFFSET+4(r8),rb ## E; \ | ||
97 | xorl TAB+3072(,r7,4),r5 ## E;\ | ||
98 | xorl TAB+2048(,r4,4),r6 ## E;\ | ||
99 | movzbl r1 ## L,r7 ## E; \ | ||
100 | movzbl r1 ## H,r4 ## E; \ | ||
101 | movl TAB+1024(,r4,4),r4 ## E;\ | ||
102 | movw r3 ## X,r1 ## X; \ | ||
103 | roll $16,r1 ## E; \ | ||
104 | shrl $16,r3 ## E; \ | ||
105 | xorl TAB(,r7,4),r5 ## E; \ | ||
106 | movzbl r3 ## H,r7 ## E; \ | ||
107 | movzbl r3 ## L,r3 ## E; \ | ||
108 | xorl TAB+3072(,r7,4),r4 ## E;\ | ||
109 | xorl TAB+2048(,r3,4),r5 ## E;\ | ||
110 | movzbl r1 ## H,r7 ## E; \ | ||
111 | movzbl r1 ## L,r3 ## E; \ | ||
112 | shrl $16,r1 ## E; \ | ||
113 | xorl TAB+3072(,r7,4),r6 ## E;\ | ||
114 | movl TAB+2048(,r3,4),r3 ## E;\ | ||
115 | movzbl r1 ## H,r7 ## E; \ | ||
116 | movzbl r1 ## L,r1 ## E; \ | ||
117 | xorl TAB+1024(,r7,4),r6 ## E;\ | ||
118 | xorl TAB(,r1,4),r3 ## E; \ | ||
119 | movzbl r2 ## H,r1 ## E; \ | ||
120 | movzbl r2 ## L,r7 ## E; \ | ||
121 | shrl $16,r2 ## E; \ | ||
122 | xorl TAB+3072(,r1,4),r3 ## E;\ | ||
123 | xorl TAB+2048(,r7,4),r4 ## E;\ | ||
124 | movzbl r2 ## H,r1 ## E; \ | ||
125 | movzbl r2 ## L,r2 ## E; \ | ||
126 | xorl OFFSET+8(r8),rc ## E; \ | ||
127 | xorl OFFSET+12(r8),rd ## E; \ | ||
128 | xorl TAB+1024(,r1,4),r3 ## E;\ | ||
129 | xorl TAB(,r2,4),r4 ## E; | ||
130 | |||
131 | #define move_regs(r1,r2,r3,r4) \ | ||
132 | movl r3 ## E,r1 ## E; \ | ||
133 | movl r4 ## E,r2 ## E; | ||
134 | |||
135 | #define entry(FUNC,KEY,B128,B192) \ | ||
136 | prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) | ||
137 | |||
138 | #define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) | ||
139 | |||
140 | #define encrypt_round(TAB,OFFSET) \ | ||
141 | round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ | ||
142 | move_regs(R1,R2,R5,R6) | ||
143 | |||
144 | #define encrypt_final(TAB,OFFSET) \ | ||
145 | round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) | ||
146 | |||
147 | #define decrypt_round(TAB,OFFSET) \ | ||
148 | round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \ | ||
149 | move_regs(R1,R2,R5,R6) | ||
150 | |||
151 | #define decrypt_final(TAB,OFFSET) \ | ||
152 | round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) | ||
153 | |||
154 | /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ | ||
155 | |||
156 | entry(aes_enc_blk,0,enc128,enc192) | ||
157 | encrypt_round(aes_ft_tab,-96) | ||
158 | encrypt_round(aes_ft_tab,-80) | ||
159 | enc192: encrypt_round(aes_ft_tab,-64) | ||
160 | encrypt_round(aes_ft_tab,-48) | ||
161 | enc128: encrypt_round(aes_ft_tab,-32) | ||
162 | encrypt_round(aes_ft_tab,-16) | ||
163 | encrypt_round(aes_ft_tab, 0) | ||
164 | encrypt_round(aes_ft_tab, 16) | ||
165 | encrypt_round(aes_ft_tab, 32) | ||
166 | encrypt_round(aes_ft_tab, 48) | ||
167 | encrypt_round(aes_ft_tab, 64) | ||
168 | encrypt_round(aes_ft_tab, 80) | ||
169 | encrypt_round(aes_ft_tab, 96) | ||
170 | encrypt_final(aes_fl_tab,112) | ||
171 | return | ||
172 | |||
173 | /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ | ||
174 | |||
175 | entry(aes_dec_blk,240,dec128,dec192) | ||
176 | decrypt_round(aes_it_tab,-96) | ||
177 | decrypt_round(aes_it_tab,-80) | ||
178 | dec192: decrypt_round(aes_it_tab,-64) | ||
179 | decrypt_round(aes_it_tab,-48) | ||
180 | dec128: decrypt_round(aes_it_tab,-32) | ||
181 | decrypt_round(aes_it_tab,-16) | ||
182 | decrypt_round(aes_it_tab, 0) | ||
183 | decrypt_round(aes_it_tab, 16) | ||
184 | decrypt_round(aes_it_tab, 32) | ||
185 | decrypt_round(aes_it_tab, 48) | ||
186 | decrypt_round(aes_it_tab, 64) | ||
187 | decrypt_round(aes_it_tab, 80) | ||
188 | decrypt_round(aes_it_tab, 96) | ||
189 | decrypt_final(aes_il_tab,112) | ||
190 | return | ||
diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c deleted file mode 100644 index 5cdb13ea5cc2..000000000000 --- a/arch/x86_64/crypto/aes.c +++ /dev/null | |||
@@ -1,336 +0,0 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * AES Cipher Algorithm. | ||
5 | * | ||
6 | * Based on Brian Gladman's code. | ||
7 | * | ||
8 | * Linux developers: | ||
9 | * Alexander Kjeldaas <astor@fast.no> | ||
10 | * Herbert Valerio Riedel <hvr@hvrlab.org> | ||
11 | * Kyle McMartin <kyle@debian.org> | ||
12 | * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API). | ||
13 | * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler) | ||
14 | * | ||
15 | * This program is free software; you can redistribute it and/or modify | ||
16 | * it under the terms of the GNU General Public License as published by | ||
17 | * the Free Software Foundation; either version 2 of the License, or | ||
18 | * (at your option) any later version. | ||
19 | * | ||
20 | * --------------------------------------------------------------------------- | ||
21 | * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK. | ||
22 | * All rights reserved. | ||
23 | * | ||
24 | * LICENSE TERMS | ||
25 | * | ||
26 | * The free distribution and use of this software in both source and binary | ||
27 | * form is allowed (with or without changes) provided that: | ||
28 | * | ||
29 | * 1. distributions of this source code include the above copyright | ||
30 | * notice, this list of conditions and the following disclaimer; | ||
31 | * | ||
32 | * 2. distributions in binary form include the above copyright | ||
33 | * notice, this list of conditions and the following disclaimer | ||
34 | * in the documentation and/or other associated materials; | ||
35 | * | ||
36 | * 3. the copyright holder's name is not used to endorse products | ||
37 | * built using this software without specific written permission. | ||
38 | * | ||
39 | * ALTERNATIVELY, provided that this notice is retained in full, this product | ||
40 | * may be distributed under the terms of the GNU General Public License (GPL), | ||
41 | * in which case the provisions of the GPL apply INSTEAD OF those given above. | ||
42 | * | ||
43 | * DISCLAIMER | ||
44 | * | ||
45 | * This software is provided 'as is' with no explicit or implied warranties | ||
46 | * in respect of its properties, including, but not limited to, correctness | ||
47 | * and/or fitness for purpose. | ||
48 | * --------------------------------------------------------------------------- | ||
49 | */ | ||
50 | |||
51 | /* Some changes from the Gladman version: | ||
52 | s/RIJNDAEL(e_key)/E_KEY/g | ||
53 | s/RIJNDAEL(d_key)/D_KEY/g | ||
54 | */ | ||
55 | |||
56 | #include <asm/byteorder.h> | ||
57 | #include <linux/bitops.h> | ||
58 | #include <linux/crypto.h> | ||
59 | #include <linux/errno.h> | ||
60 | #include <linux/init.h> | ||
61 | #include <linux/module.h> | ||
62 | #include <linux/types.h> | ||
63 | |||
64 | #define AES_MIN_KEY_SIZE 16 | ||
65 | #define AES_MAX_KEY_SIZE 32 | ||
66 | |||
67 | #define AES_BLOCK_SIZE 16 | ||
68 | |||
69 | /* | ||
70 | * #define byte(x, nr) ((unsigned char)((x) >> (nr*8))) | ||
71 | */ | ||
72 | static inline u8 byte(const u32 x, const unsigned n) | ||
73 | { | ||
74 | return x >> (n << 3); | ||
75 | } | ||
76 | |||
77 | struct aes_ctx | ||
78 | { | ||
79 | u32 key_length; | ||
80 | u32 buf[120]; | ||
81 | }; | ||
82 | |||
83 | #define E_KEY (&ctx->buf[0]) | ||
84 | #define D_KEY (&ctx->buf[60]) | ||
85 | |||
86 | static u8 pow_tab[256] __initdata; | ||
87 | static u8 log_tab[256] __initdata; | ||
88 | static u8 sbx_tab[256] __initdata; | ||
89 | static u8 isb_tab[256] __initdata; | ||
90 | static u32 rco_tab[10]; | ||
91 | u32 aes_ft_tab[4][256]; | ||
92 | u32 aes_it_tab[4][256]; | ||
93 | |||
94 | u32 aes_fl_tab[4][256]; | ||
95 | u32 aes_il_tab[4][256]; | ||
96 | |||
97 | static inline u8 f_mult(u8 a, u8 b) | ||
98 | { | ||
99 | u8 aa = log_tab[a], cc = aa + log_tab[b]; | ||
100 | |||
101 | return pow_tab[cc + (cc < aa ? 1 : 0)]; | ||
102 | } | ||
103 | |||
104 | #define ff_mult(a, b) (a && b ? f_mult(a, b) : 0) | ||
105 | |||
106 | #define ls_box(x) \ | ||
107 | (aes_fl_tab[0][byte(x, 0)] ^ \ | ||
108 | aes_fl_tab[1][byte(x, 1)] ^ \ | ||
109 | aes_fl_tab[2][byte(x, 2)] ^ \ | ||
110 | aes_fl_tab[3][byte(x, 3)]) | ||
111 | |||
112 | static void __init gen_tabs(void) | ||
113 | { | ||
114 | u32 i, t; | ||
115 | u8 p, q; | ||
116 | |||
117 | /* log and power tables for GF(2**8) finite field with | ||
118 | 0x011b as modular polynomial - the simplest primitive | ||
119 | root is 0x03, used here to generate the tables */ | ||
120 | |||
121 | for (i = 0, p = 1; i < 256; ++i) { | ||
122 | pow_tab[i] = (u8)p; | ||
123 | log_tab[p] = (u8)i; | ||
124 | |||
125 | p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0); | ||
126 | } | ||
127 | |||
128 | log_tab[1] = 0; | ||
129 | |||
130 | for (i = 0, p = 1; i < 10; ++i) { | ||
131 | rco_tab[i] = p; | ||
132 | |||
133 | p = (p << 1) ^ (p & 0x80 ? 0x01b : 0); | ||
134 | } | ||
135 | |||
136 | for (i = 0; i < 256; ++i) { | ||
137 | p = (i ? pow_tab[255 - log_tab[i]] : 0); | ||
138 | q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2)); | ||
139 | p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2)); | ||
140 | sbx_tab[i] = p; | ||
141 | isb_tab[p] = (u8)i; | ||
142 | } | ||
143 | |||
144 | for (i = 0; i < 256; ++i) { | ||
145 | p = sbx_tab[i]; | ||
146 | |||
147 | t = p; | ||
148 | aes_fl_tab[0][i] = t; | ||
149 | aes_fl_tab[1][i] = rol32(t, 8); | ||
150 | aes_fl_tab[2][i] = rol32(t, 16); | ||
151 | aes_fl_tab[3][i] = rol32(t, 24); | ||
152 | |||
153 | t = ((u32)ff_mult(2, p)) | | ||
154 | ((u32)p << 8) | | ||
155 | ((u32)p << 16) | ((u32)ff_mult(3, p) << 24); | ||
156 | |||
157 | aes_ft_tab[0][i] = t; | ||
158 | aes_ft_tab[1][i] = rol32(t, 8); | ||
159 | aes_ft_tab[2][i] = rol32(t, 16); | ||
160 | aes_ft_tab[3][i] = rol32(t, 24); | ||
161 | |||
162 | p = isb_tab[i]; | ||
163 | |||
164 | t = p; | ||
165 | aes_il_tab[0][i] = t; | ||
166 | aes_il_tab[1][i] = rol32(t, 8); | ||
167 | aes_il_tab[2][i] = rol32(t, 16); | ||
168 | aes_il_tab[3][i] = rol32(t, 24); | ||
169 | |||
170 | t = ((u32)ff_mult(14, p)) | | ||
171 | ((u32)ff_mult(9, p) << 8) | | ||
172 | ((u32)ff_mult(13, p) << 16) | | ||
173 | ((u32)ff_mult(11, p) << 24); | ||
174 | |||
175 | aes_it_tab[0][i] = t; | ||
176 | aes_it_tab[1][i] = rol32(t, 8); | ||
177 | aes_it_tab[2][i] = rol32(t, 16); | ||
178 | aes_it_tab[3][i] = rol32(t, 24); | ||
179 | } | ||
180 | } | ||
181 | |||
182 | #define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) | ||
183 | |||
184 | #define imix_col(y, x) \ | ||
185 | u = star_x(x); \ | ||
186 | v = star_x(u); \ | ||
187 | w = star_x(v); \ | ||
188 | t = w ^ (x); \ | ||
189 | (y) = u ^ v ^ w; \ | ||
190 | (y) ^= ror32(u ^ t, 8) ^ \ | ||
191 | ror32(v ^ t, 16) ^ \ | ||
192 | ror32(t, 24) | ||
193 | |||
194 | /* initialise the key schedule from the user supplied key */ | ||
195 | |||
196 | #define loop4(i) \ | ||
197 | { \ | ||
198 | t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ | ||
199 | t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \ | ||
200 | t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \ | ||
201 | t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \ | ||
202 | t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \ | ||
203 | } | ||
204 | |||
205 | #define loop6(i) \ | ||
206 | { \ | ||
207 | t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ | ||
208 | t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \ | ||
209 | t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \ | ||
210 | t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \ | ||
211 | t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \ | ||
212 | t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \ | ||
213 | t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \ | ||
214 | } | ||
215 | |||
216 | #define loop8(i) \ | ||
217 | { \ | ||
218 | t = ror32(t, 8); ; t = ls_box(t) ^ rco_tab[i]; \ | ||
219 | t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \ | ||
220 | t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \ | ||
221 | t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \ | ||
222 | t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \ | ||
223 | t = E_KEY[8 * i + 4] ^ ls_box(t); \ | ||
224 | E_KEY[8 * i + 12] = t; \ | ||
225 | t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \ | ||
226 | t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \ | ||
227 | t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \ | ||
228 | } | ||
229 | |||
230 | static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
231 | unsigned int key_len) | ||
232 | { | ||
233 | struct aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
234 | const __le32 *key = (const __le32 *)in_key; | ||
235 | u32 *flags = &tfm->crt_flags; | ||
236 | u32 i, j, t, u, v, w; | ||
237 | |||
238 | if (key_len % 8) { | ||
239 | *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
240 | return -EINVAL; | ||
241 | } | ||
242 | |||
243 | ctx->key_length = key_len; | ||
244 | |||
245 | D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]); | ||
246 | D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]); | ||
247 | D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]); | ||
248 | D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]); | ||
249 | |||
250 | switch (key_len) { | ||
251 | case 16: | ||
252 | t = E_KEY[3]; | ||
253 | for (i = 0; i < 10; ++i) | ||
254 | loop4(i); | ||
255 | break; | ||
256 | |||
257 | case 24: | ||
258 | E_KEY[4] = le32_to_cpu(key[4]); | ||
259 | t = E_KEY[5] = le32_to_cpu(key[5]); | ||
260 | for (i = 0; i < 8; ++i) | ||
261 | loop6 (i); | ||
262 | break; | ||
263 | |||
264 | case 32: | ||
265 | E_KEY[4] = le32_to_cpu(key[4]); | ||
266 | E_KEY[5] = le32_to_cpu(key[5]); | ||
267 | E_KEY[6] = le32_to_cpu(key[6]); | ||
268 | t = E_KEY[7] = le32_to_cpu(key[7]); | ||
269 | for (i = 0; i < 7; ++i) | ||
270 | loop8(i); | ||
271 | break; | ||
272 | } | ||
273 | |||
274 | D_KEY[0] = E_KEY[key_len + 24]; | ||
275 | D_KEY[1] = E_KEY[key_len + 25]; | ||
276 | D_KEY[2] = E_KEY[key_len + 26]; | ||
277 | D_KEY[3] = E_KEY[key_len + 27]; | ||
278 | |||
279 | for (i = 4; i < key_len + 24; ++i) { | ||
280 | j = key_len + 24 - (i & ~3) + (i & 3); | ||
281 | imix_col(D_KEY[j], E_KEY[i]); | ||
282 | } | ||
283 | |||
284 | return 0; | ||
285 | } | ||
286 | |||
287 | asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); | ||
288 | asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); | ||
289 | |||
290 | static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | ||
291 | { | ||
292 | aes_enc_blk(tfm, dst, src); | ||
293 | } | ||
294 | |||
295 | static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | ||
296 | { | ||
297 | aes_dec_blk(tfm, dst, src); | ||
298 | } | ||
299 | |||
300 | static struct crypto_alg aes_alg = { | ||
301 | .cra_name = "aes", | ||
302 | .cra_driver_name = "aes-x86_64", | ||
303 | .cra_priority = 200, | ||
304 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER, | ||
305 | .cra_blocksize = AES_BLOCK_SIZE, | ||
306 | .cra_ctxsize = sizeof(struct aes_ctx), | ||
307 | .cra_module = THIS_MODULE, | ||
308 | .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), | ||
309 | .cra_u = { | ||
310 | .cipher = { | ||
311 | .cia_min_keysize = AES_MIN_KEY_SIZE, | ||
312 | .cia_max_keysize = AES_MAX_KEY_SIZE, | ||
313 | .cia_setkey = aes_set_key, | ||
314 | .cia_encrypt = aes_encrypt, | ||
315 | .cia_decrypt = aes_decrypt | ||
316 | } | ||
317 | } | ||
318 | }; | ||
319 | |||
320 | static int __init aes_init(void) | ||
321 | { | ||
322 | gen_tabs(); | ||
323 | return crypto_register_alg(&aes_alg); | ||
324 | } | ||
325 | |||
326 | static void __exit aes_fini(void) | ||
327 | { | ||
328 | crypto_unregister_alg(&aes_alg); | ||
329 | } | ||
330 | |||
331 | module_init(aes_init); | ||
332 | module_exit(aes_fini); | ||
333 | |||
334 | MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); | ||
335 | MODULE_LICENSE("GPL"); | ||
336 | MODULE_ALIAS("aes"); | ||
diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S deleted file mode 100644 index 35974a586615..000000000000 --- a/arch/x86_64/crypto/twofish-x86_64-asm.S +++ /dev/null | |||
@@ -1,324 +0,0 @@ | |||
1 | /*************************************************************************** | ||
2 | * Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> * | ||
3 | * * | ||
4 | * This program is free software; you can redistribute it and/or modify * | ||
5 | * it under the terms of the GNU General Public License as published by * | ||
6 | * the Free Software Foundation; either version 2 of the License, or * | ||
7 | * (at your option) any later version. * | ||
8 | * * | ||
9 | * This program is distributed in the hope that it will be useful, * | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of * | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * | ||
12 | * GNU General Public License for more details. * | ||
13 | * * | ||
14 | * You should have received a copy of the GNU General Public License * | ||
15 | * along with this program; if not, write to the * | ||
16 | * Free Software Foundation, Inc., * | ||
17 | * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * | ||
18 | ***************************************************************************/ | ||
19 | |||
20 | .file "twofish-x86_64-asm.S" | ||
21 | .text | ||
22 | |||
23 | #include <asm/asm-offsets.h> | ||
24 | |||
25 | #define a_offset 0 | ||
26 | #define b_offset 4 | ||
27 | #define c_offset 8 | ||
28 | #define d_offset 12 | ||
29 | |||
30 | /* Structure of the crypto context struct*/ | ||
31 | |||
32 | #define s0 0 /* S0 Array 256 Words each */ | ||
33 | #define s1 1024 /* S1 Array */ | ||
34 | #define s2 2048 /* S2 Array */ | ||
35 | #define s3 3072 /* S3 Array */ | ||
36 | #define w 4096 /* 8 whitening keys (word) */ | ||
37 | #define k 4128 /* key 1-32 ( word ) */ | ||
38 | |||
39 | /* define a few register aliases to allow macro substitution */ | ||
40 | |||
41 | #define R0 %rax | ||
42 | #define R0D %eax | ||
43 | #define R0B %al | ||
44 | #define R0H %ah | ||
45 | |||
46 | #define R1 %rbx | ||
47 | #define R1D %ebx | ||
48 | #define R1B %bl | ||
49 | #define R1H %bh | ||
50 | |||
51 | #define R2 %rcx | ||
52 | #define R2D %ecx | ||
53 | #define R2B %cl | ||
54 | #define R2H %ch | ||
55 | |||
56 | #define R3 %rdx | ||
57 | #define R3D %edx | ||
58 | #define R3B %dl | ||
59 | #define R3H %dh | ||
60 | |||
61 | |||
62 | /* performs input whitening */ | ||
63 | #define input_whitening(src,context,offset)\ | ||
64 | xor w+offset(context), src; | ||
65 | |||
66 | /* performs input whitening */ | ||
67 | #define output_whitening(src,context,offset)\ | ||
68 | xor w+16+offset(context), src; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * a input register containing a (rotated 16) | ||
73 | * b input register containing b | ||
74 | * c input register containing c | ||
75 | * d input register containing d (already rol $1) | ||
76 | * operations on a and b are interleaved to increase performance | ||
77 | */ | ||
78 | #define encrypt_round(a,b,c,d,round)\ | ||
79 | movzx b ## B, %edi;\ | ||
80 | mov s1(%r11,%rdi,4),%r8d;\ | ||
81 | movzx a ## B, %edi;\ | ||
82 | mov s2(%r11,%rdi,4),%r9d;\ | ||
83 | movzx b ## H, %edi;\ | ||
84 | ror $16, b ## D;\ | ||
85 | xor s2(%r11,%rdi,4),%r8d;\ | ||
86 | movzx a ## H, %edi;\ | ||
87 | ror $16, a ## D;\ | ||
88 | xor s3(%r11,%rdi,4),%r9d;\ | ||
89 | movzx b ## B, %edi;\ | ||
90 | xor s3(%r11,%rdi,4),%r8d;\ | ||
91 | movzx a ## B, %edi;\ | ||
92 | xor (%r11,%rdi,4), %r9d;\ | ||
93 | movzx b ## H, %edi;\ | ||
94 | ror $15, b ## D;\ | ||
95 | xor (%r11,%rdi,4), %r8d;\ | ||
96 | movzx a ## H, %edi;\ | ||
97 | xor s1(%r11,%rdi,4),%r9d;\ | ||
98 | add %r8d, %r9d;\ | ||
99 | add %r9d, %r8d;\ | ||
100 | add k+round(%r11), %r9d;\ | ||
101 | xor %r9d, c ## D;\ | ||
102 | rol $15, c ## D;\ | ||
103 | add k+4+round(%r11),%r8d;\ | ||
104 | xor %r8d, d ## D; | ||
105 | |||
106 | /* | ||
107 | * a input register containing a(rotated 16) | ||
108 | * b input register containing b | ||
109 | * c input register containing c | ||
110 | * d input register containing d (already rol $1) | ||
111 | * operations on a and b are interleaved to increase performance | ||
112 | * during the round a and b are prepared for the output whitening | ||
113 | */ | ||
114 | #define encrypt_last_round(a,b,c,d,round)\ | ||
115 | mov b ## D, %r10d;\ | ||
116 | shl $32, %r10;\ | ||
117 | movzx b ## B, %edi;\ | ||
118 | mov s1(%r11,%rdi,4),%r8d;\ | ||
119 | movzx a ## B, %edi;\ | ||
120 | mov s2(%r11,%rdi,4),%r9d;\ | ||
121 | movzx b ## H, %edi;\ | ||
122 | ror $16, b ## D;\ | ||
123 | xor s2(%r11,%rdi,4),%r8d;\ | ||
124 | movzx a ## H, %edi;\ | ||
125 | ror $16, a ## D;\ | ||
126 | xor s3(%r11,%rdi,4),%r9d;\ | ||
127 | movzx b ## B, %edi;\ | ||
128 | xor s3(%r11,%rdi,4),%r8d;\ | ||
129 | movzx a ## B, %edi;\ | ||
130 | xor (%r11,%rdi,4), %r9d;\ | ||
131 | xor a, %r10;\ | ||
132 | movzx b ## H, %edi;\ | ||
133 | xor (%r11,%rdi,4), %r8d;\ | ||
134 | movzx a ## H, %edi;\ | ||
135 | xor s1(%r11,%rdi,4),%r9d;\ | ||
136 | add %r8d, %r9d;\ | ||
137 | add %r9d, %r8d;\ | ||
138 | add k+round(%r11), %r9d;\ | ||
139 | xor %r9d, c ## D;\ | ||
140 | ror $1, c ## D;\ | ||
141 | add k+4+round(%r11),%r8d;\ | ||
142 | xor %r8d, d ## D | ||
143 | |||
144 | /* | ||
145 | * a input register containing a | ||
146 | * b input register containing b (rotated 16) | ||
147 | * c input register containing c (already rol $1) | ||
148 | * d input register containing d | ||
149 | * operations on a and b are interleaved to increase performance | ||
150 | */ | ||
151 | #define decrypt_round(a,b,c,d,round)\ | ||
152 | movzx a ## B, %edi;\ | ||
153 | mov (%r11,%rdi,4), %r9d;\ | ||
154 | movzx b ## B, %edi;\ | ||
155 | mov s3(%r11,%rdi,4),%r8d;\ | ||
156 | movzx a ## H, %edi;\ | ||
157 | ror $16, a ## D;\ | ||
158 | xor s1(%r11,%rdi,4),%r9d;\ | ||
159 | movzx b ## H, %edi;\ | ||
160 | ror $16, b ## D;\ | ||
161 | xor (%r11,%rdi,4), %r8d;\ | ||
162 | movzx a ## B, %edi;\ | ||
163 | xor s2(%r11,%rdi,4),%r9d;\ | ||
164 | movzx b ## B, %edi;\ | ||
165 | xor s1(%r11,%rdi,4),%r8d;\ | ||
166 | movzx a ## H, %edi;\ | ||
167 | ror $15, a ## D;\ | ||
168 | xor s3(%r11,%rdi,4),%r9d;\ | ||
169 | movzx b ## H, %edi;\ | ||
170 | xor s2(%r11,%rdi,4),%r8d;\ | ||
171 | add %r8d, %r9d;\ | ||
172 | add %r9d, %r8d;\ | ||
173 | add k+round(%r11), %r9d;\ | ||
174 | xor %r9d, c ## D;\ | ||
175 | add k+4+round(%r11),%r8d;\ | ||
176 | xor %r8d, d ## D;\ | ||
177 | rol $15, d ## D; | ||
178 | |||
179 | /* | ||
180 | * a input register containing a | ||
181 | * b input register containing b | ||
182 | * c input register containing c (already rol $1) | ||
183 | * d input register containing d | ||
184 | * operations on a and b are interleaved to increase performance | ||
185 | * during the round a and b are prepared for the output whitening | ||
186 | */ | ||
187 | #define decrypt_last_round(a,b,c,d,round)\ | ||
188 | movzx a ## B, %edi;\ | ||
189 | mov (%r11,%rdi,4), %r9d;\ | ||
190 | movzx b ## B, %edi;\ | ||
191 | mov s3(%r11,%rdi,4),%r8d;\ | ||
192 | movzx b ## H, %edi;\ | ||
193 | ror $16, b ## D;\ | ||
194 | xor (%r11,%rdi,4), %r8d;\ | ||
195 | movzx a ## H, %edi;\ | ||
196 | mov b ## D, %r10d;\ | ||
197 | shl $32, %r10;\ | ||
198 | xor a, %r10;\ | ||
199 | ror $16, a ## D;\ | ||
200 | xor s1(%r11,%rdi,4),%r9d;\ | ||
201 | movzx b ## B, %edi;\ | ||
202 | xor s1(%r11,%rdi,4),%r8d;\ | ||
203 | movzx a ## B, %edi;\ | ||
204 | xor s2(%r11,%rdi,4),%r9d;\ | ||
205 | movzx b ## H, %edi;\ | ||
206 | xor s2(%r11,%rdi,4),%r8d;\ | ||
207 | movzx a ## H, %edi;\ | ||
208 | xor s3(%r11,%rdi,4),%r9d;\ | ||
209 | add %r8d, %r9d;\ | ||
210 | add %r9d, %r8d;\ | ||
211 | add k+round(%r11), %r9d;\ | ||
212 | xor %r9d, c ## D;\ | ||
213 | add k+4+round(%r11),%r8d;\ | ||
214 | xor %r8d, d ## D;\ | ||
215 | ror $1, d ## D; | ||
216 | |||
217 | .align 8 | ||
218 | .global twofish_enc_blk | ||
219 | .global twofish_dec_blk | ||
220 | |||
221 | twofish_enc_blk: | ||
222 | pushq R1 | ||
223 | |||
224 | /* %rdi contains the crypto tfm adress */ | ||
225 | /* %rsi contains the output adress */ | ||
226 | /* %rdx contains the input adress */ | ||
227 | add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ | ||
228 | /* ctx adress is moved to free one non-rex register | ||
229 | as target for the 8bit high operations */ | ||
230 | mov %rdi, %r11 | ||
231 | |||
232 | movq (R3), R1 | ||
233 | movq 8(R3), R3 | ||
234 | input_whitening(R1,%r11,a_offset) | ||
235 | input_whitening(R3,%r11,c_offset) | ||
236 | mov R1D, R0D | ||
237 | rol $16, R0D | ||
238 | shr $32, R1 | ||
239 | mov R3D, R2D | ||
240 | shr $32, R3 | ||
241 | rol $1, R3D | ||
242 | |||
243 | encrypt_round(R0,R1,R2,R3,0); | ||
244 | encrypt_round(R2,R3,R0,R1,8); | ||
245 | encrypt_round(R0,R1,R2,R3,2*8); | ||
246 | encrypt_round(R2,R3,R0,R1,3*8); | ||
247 | encrypt_round(R0,R1,R2,R3,4*8); | ||
248 | encrypt_round(R2,R3,R0,R1,5*8); | ||
249 | encrypt_round(R0,R1,R2,R3,6*8); | ||
250 | encrypt_round(R2,R3,R0,R1,7*8); | ||
251 | encrypt_round(R0,R1,R2,R3,8*8); | ||
252 | encrypt_round(R2,R3,R0,R1,9*8); | ||
253 | encrypt_round(R0,R1,R2,R3,10*8); | ||
254 | encrypt_round(R2,R3,R0,R1,11*8); | ||
255 | encrypt_round(R0,R1,R2,R3,12*8); | ||
256 | encrypt_round(R2,R3,R0,R1,13*8); | ||
257 | encrypt_round(R0,R1,R2,R3,14*8); | ||
258 | encrypt_last_round(R2,R3,R0,R1,15*8); | ||
259 | |||
260 | |||
261 | output_whitening(%r10,%r11,a_offset) | ||
262 | movq %r10, (%rsi) | ||
263 | |||
264 | shl $32, R1 | ||
265 | xor R0, R1 | ||
266 | |||
267 | output_whitening(R1,%r11,c_offset) | ||
268 | movq R1, 8(%rsi) | ||
269 | |||
270 | popq R1 | ||
271 | movq $1,%rax | ||
272 | ret | ||
273 | |||
274 | twofish_dec_blk: | ||
275 | pushq R1 | ||
276 | |||
277 | /* %rdi contains the crypto tfm adress */ | ||
278 | /* %rsi contains the output adress */ | ||
279 | /* %rdx contains the input adress */ | ||
280 | add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ | ||
281 | /* ctx adress is moved to free one non-rex register | ||
282 | as target for the 8bit high operations */ | ||
283 | mov %rdi, %r11 | ||
284 | |||
285 | movq (R3), R1 | ||
286 | movq 8(R3), R3 | ||
287 | output_whitening(R1,%r11,a_offset) | ||
288 | output_whitening(R3,%r11,c_offset) | ||
289 | mov R1D, R0D | ||
290 | shr $32, R1 | ||
291 | rol $16, R1D | ||
292 | mov R3D, R2D | ||
293 | shr $32, R3 | ||
294 | rol $1, R2D | ||
295 | |||
296 | decrypt_round(R0,R1,R2,R3,15*8); | ||
297 | decrypt_round(R2,R3,R0,R1,14*8); | ||
298 | decrypt_round(R0,R1,R2,R3,13*8); | ||
299 | decrypt_round(R2,R3,R0,R1,12*8); | ||
300 | decrypt_round(R0,R1,R2,R3,11*8); | ||
301 | decrypt_round(R2,R3,R0,R1,10*8); | ||
302 | decrypt_round(R0,R1,R2,R3,9*8); | ||
303 | decrypt_round(R2,R3,R0,R1,8*8); | ||
304 | decrypt_round(R0,R1,R2,R3,7*8); | ||
305 | decrypt_round(R2,R3,R0,R1,6*8); | ||
306 | decrypt_round(R0,R1,R2,R3,5*8); | ||
307 | decrypt_round(R2,R3,R0,R1,4*8); | ||
308 | decrypt_round(R0,R1,R2,R3,3*8); | ||
309 | decrypt_round(R2,R3,R0,R1,2*8); | ||
310 | decrypt_round(R0,R1,R2,R3,1*8); | ||
311 | decrypt_last_round(R2,R3,R0,R1,0); | ||
312 | |||
313 | input_whitening(%r10,%r11,a_offset) | ||
314 | movq %r10, (%rsi) | ||
315 | |||
316 | shl $32, R1 | ||
317 | xor R0, R1 | ||
318 | |||
319 | input_whitening(R1,%r11,c_offset) | ||
320 | movq R1, 8(%rsi) | ||
321 | |||
322 | popq R1 | ||
323 | movq $1,%rax | ||
324 | ret | ||
diff --git a/arch/x86_64/crypto/twofish.c b/arch/x86_64/crypto/twofish.c deleted file mode 100644 index 182d91d5cfb9..000000000000 --- a/arch/x86_64/crypto/twofish.c +++ /dev/null | |||
@@ -1,97 +0,0 @@ | |||
1 | /* | ||
2 | * Glue Code for optimized x86_64 assembler version of TWOFISH | ||
3 | * | ||
4 | * Originally Twofish for GPG | ||
5 | * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 | ||
6 | * 256-bit key length added March 20, 1999 | ||
7 | * Some modifications to reduce the text size by Werner Koch, April, 1998 | ||
8 | * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com> | ||
9 | * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net> | ||
10 | * | ||
11 | * The original author has disclaimed all copyright interest in this | ||
12 | * code and thus put it in the public domain. The subsequent authors | ||
13 | * have put this under the GNU General Public License. | ||
14 | * | ||
15 | * This program is free software; you can redistribute it and/or modify | ||
16 | * it under the terms of the GNU General Public License as published by | ||
17 | * the Free Software Foundation; either version 2 of the License, or | ||
18 | * (at your option) any later version. | ||
19 | * | ||
20 | * This program is distributed in the hope that it will be useful, | ||
21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
23 | * GNU General Public License for more details. | ||
24 | * | ||
25 | * You should have received a copy of the GNU General Public License | ||
26 | * along with this program; if not, write to the Free Software | ||
27 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | ||
28 | * USA | ||
29 | * | ||
30 | * This code is a "clean room" implementation, written from the paper | ||
31 | * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, | ||
32 | * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available | ||
33 | * through http://www.counterpane.com/twofish.html | ||
34 | * | ||
35 | * For background information on multiplication in finite fields, used for | ||
36 | * the matrix operations in the key schedule, see the book _Contemporary | ||
37 | * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the | ||
38 | * Third Edition. | ||
39 | */ | ||
40 | |||
41 | #include <crypto/twofish.h> | ||
42 | #include <linux/crypto.h> | ||
43 | #include <linux/init.h> | ||
44 | #include <linux/kernel.h> | ||
45 | #include <linux/module.h> | ||
46 | #include <linux/types.h> | ||
47 | |||
48 | asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); | ||
49 | asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); | ||
50 | |||
51 | static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | ||
52 | { | ||
53 | twofish_enc_blk(tfm, dst, src); | ||
54 | } | ||
55 | |||
56 | static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | ||
57 | { | ||
58 | twofish_dec_blk(tfm, dst, src); | ||
59 | } | ||
60 | |||
61 | static struct crypto_alg alg = { | ||
62 | .cra_name = "twofish", | ||
63 | .cra_driver_name = "twofish-x86_64", | ||
64 | .cra_priority = 200, | ||
65 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER, | ||
66 | .cra_blocksize = TF_BLOCK_SIZE, | ||
67 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
68 | .cra_alignmask = 3, | ||
69 | .cra_module = THIS_MODULE, | ||
70 | .cra_list = LIST_HEAD_INIT(alg.cra_list), | ||
71 | .cra_u = { | ||
72 | .cipher = { | ||
73 | .cia_min_keysize = TF_MIN_KEY_SIZE, | ||
74 | .cia_max_keysize = TF_MAX_KEY_SIZE, | ||
75 | .cia_setkey = twofish_setkey, | ||
76 | .cia_encrypt = twofish_encrypt, | ||
77 | .cia_decrypt = twofish_decrypt | ||
78 | } | ||
79 | } | ||
80 | }; | ||
81 | |||
82 | static int __init init(void) | ||
83 | { | ||
84 | return crypto_register_alg(&alg); | ||
85 | } | ||
86 | |||
87 | static void __exit fini(void) | ||
88 | { | ||
89 | crypto_unregister_alg(&alg); | ||
90 | } | ||
91 | |||
92 | module_init(init); | ||
93 | module_exit(fini); | ||
94 | |||
95 | MODULE_LICENSE("GPL"); | ||
96 | MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized"); | ||
97 | MODULE_ALIAS("twofish"); | ||
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile deleted file mode 100644 index cdae36435e21..000000000000 --- a/arch/x86_64/ia32/Makefile +++ /dev/null | |||
@@ -1,35 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for the ia32 kernel emulation subsystem. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \ | ||
6 | ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \ | ||
7 | mmap32.o | ||
8 | |||
9 | sysv-$(CONFIG_SYSVIPC) := ipc32.o | ||
10 | obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) | ||
11 | |||
12 | obj-$(CONFIG_IA32_AOUT) += ia32_aout.o | ||
13 | |||
14 | audit-class-$(CONFIG_AUDIT) := audit.o | ||
15 | obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) | ||
16 | |||
17 | $(obj)/syscall32_syscall.o: \ | ||
18 | $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) | ||
19 | |||
20 | # Teach kbuild about targets | ||
21 | targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so) | ||
22 | |||
23 | # The DSO images are built using a special linker script | ||
24 | quiet_cmd_syscall = SYSCALL $@ | ||
25 | cmd_syscall = $(CC) -m32 -nostdlib -shared -s \ | ||
26 | $(call ld-option, -Wl$(comma)--hash-style=sysv) \ | ||
27 | -Wl,-soname=linux-gate.so.1 -o $@ \ | ||
28 | -Wl,-T,$(filter-out FORCE,$^) | ||
29 | |||
30 | $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \ | ||
31 | $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE | ||
32 | $(call if_changed,syscall) | ||
33 | |||
34 | AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 | ||
35 | AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 | ||
diff --git a/arch/x86_64/ia32/audit.c b/arch/x86_64/ia32/audit.c deleted file mode 100644 index 8850fe40ea34..000000000000 --- a/arch/x86_64/ia32/audit.c +++ /dev/null | |||
@@ -1,42 +0,0 @@ | |||
1 | #include <asm-i386/unistd.h> | ||
2 | |||
3 | unsigned ia32_dir_class[] = { | ||
4 | #include <asm-generic/audit_dir_write.h> | ||
5 | ~0U | ||
6 | }; | ||
7 | |||
8 | unsigned ia32_chattr_class[] = { | ||
9 | #include <asm-generic/audit_change_attr.h> | ||
10 | ~0U | ||
11 | }; | ||
12 | |||
13 | unsigned ia32_write_class[] = { | ||
14 | #include <asm-generic/audit_write.h> | ||
15 | ~0U | ||
16 | }; | ||
17 | |||
18 | unsigned ia32_read_class[] = { | ||
19 | #include <asm-generic/audit_read.h> | ||
20 | ~0U | ||
21 | }; | ||
22 | |||
23 | unsigned ia32_signal_class[] = { | ||
24 | #include <asm-generic/audit_signal.h> | ||
25 | ~0U | ||
26 | }; | ||
27 | |||
28 | int ia32_classify_syscall(unsigned syscall) | ||
29 | { | ||
30 | switch(syscall) { | ||
31 | case __NR_open: | ||
32 | return 2; | ||
33 | case __NR_openat: | ||
34 | return 3; | ||
35 | case __NR_socketcall: | ||
36 | return 4; | ||
37 | case __NR_execve: | ||
38 | return 5; | ||
39 | default: | ||
40 | return 1; | ||
41 | } | ||
42 | } | ||
diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c deleted file mode 100644 index 2c8209a3605a..000000000000 --- a/arch/x86_64/ia32/fpu32.c +++ /dev/null | |||
@@ -1,183 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
3 | * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. | ||
4 | * This is used for ptrace, signals and coredumps in 32bit emulation. | ||
5 | */ | ||
6 | |||
7 | #include <linux/sched.h> | ||
8 | #include <asm/sigcontext32.h> | ||
9 | #include <asm/processor.h> | ||
10 | #include <asm/uaccess.h> | ||
11 | #include <asm/i387.h> | ||
12 | |||
13 | static inline unsigned short twd_i387_to_fxsr(unsigned short twd) | ||
14 | { | ||
15 | unsigned int tmp; /* to avoid 16 bit prefixes in the code */ | ||
16 | |||
17 | /* Transform each pair of bits into 01 (valid) or 00 (empty) */ | ||
18 | tmp = ~twd; | ||
19 | tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ | ||
20 | /* and move the valid bits to the lower byte. */ | ||
21 | tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ | ||
22 | tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ | ||
23 | tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ | ||
24 | return tmp; | ||
25 | } | ||
26 | |||
27 | static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) | ||
28 | { | ||
29 | struct _fpxreg *st = NULL; | ||
30 | unsigned long tos = (fxsave->swd >> 11) & 7; | ||
31 | unsigned long twd = (unsigned long) fxsave->twd; | ||
32 | unsigned long tag; | ||
33 | unsigned long ret = 0xffff0000; | ||
34 | int i; | ||
35 | |||
36 | #define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); | ||
37 | |||
38 | for (i = 0 ; i < 8 ; i++) { | ||
39 | if (twd & 0x1) { | ||
40 | st = FPREG_ADDR( fxsave, (i - tos) & 7 ); | ||
41 | |||
42 | switch (st->exponent & 0x7fff) { | ||
43 | case 0x7fff: | ||
44 | tag = 2; /* Special */ | ||
45 | break; | ||
46 | case 0x0000: | ||
47 | if ( !st->significand[0] && | ||
48 | !st->significand[1] && | ||
49 | !st->significand[2] && | ||
50 | !st->significand[3] ) { | ||
51 | tag = 1; /* Zero */ | ||
52 | } else { | ||
53 | tag = 2; /* Special */ | ||
54 | } | ||
55 | break; | ||
56 | default: | ||
57 | if (st->significand[3] & 0x8000) { | ||
58 | tag = 0; /* Valid */ | ||
59 | } else { | ||
60 | tag = 2; /* Special */ | ||
61 | } | ||
62 | break; | ||
63 | } | ||
64 | } else { | ||
65 | tag = 3; /* Empty */ | ||
66 | } | ||
67 | ret |= (tag << (2 * i)); | ||
68 | twd = twd >> 1; | ||
69 | } | ||
70 | return ret; | ||
71 | } | ||
72 | |||
73 | |||
74 | static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave, | ||
75 | struct _fpstate_ia32 __user *buf) | ||
76 | { | ||
77 | struct _fpxreg *to; | ||
78 | struct _fpreg __user *from; | ||
79 | int i; | ||
80 | u32 v; | ||
81 | int err = 0; | ||
82 | |||
83 | #define G(num,val) err |= __get_user(val, num + (u32 __user *)buf) | ||
84 | G(0, fxsave->cwd); | ||
85 | G(1, fxsave->swd); | ||
86 | G(2, fxsave->twd); | ||
87 | fxsave->twd = twd_i387_to_fxsr(fxsave->twd); | ||
88 | G(3, fxsave->rip); | ||
89 | G(4, v); | ||
90 | fxsave->fop = v>>16; /* cs ignored */ | ||
91 | G(5, fxsave->rdp); | ||
92 | /* 6: ds ignored */ | ||
93 | #undef G | ||
94 | if (err) | ||
95 | return -1; | ||
96 | |||
97 | to = (struct _fpxreg *)&fxsave->st_space[0]; | ||
98 | from = &buf->_st[0]; | ||
99 | for (i = 0 ; i < 8 ; i++, to++, from++) { | ||
100 | if (__copy_from_user(to, from, sizeof(*from))) | ||
101 | return -1; | ||
102 | } | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | |||
107 | static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf, | ||
108 | struct i387_fxsave_struct *fxsave, | ||
109 | struct pt_regs *regs, | ||
110 | struct task_struct *tsk) | ||
111 | { | ||
112 | struct _fpreg __user *to; | ||
113 | struct _fpxreg *from; | ||
114 | int i; | ||
115 | u16 cs,ds; | ||
116 | int err = 0; | ||
117 | |||
118 | if (tsk == current) { | ||
119 | /* should be actually ds/cs at fpu exception time, | ||
120 | but that information is not available in 64bit mode. */ | ||
121 | asm("movw %%ds,%0 " : "=r" (ds)); | ||
122 | asm("movw %%cs,%0 " : "=r" (cs)); | ||
123 | } else { /* ptrace. task has stopped. */ | ||
124 | ds = tsk->thread.ds; | ||
125 | cs = regs->cs; | ||
126 | } | ||
127 | |||
128 | #define P(num,val) err |= __put_user(val, num + (u32 __user *)buf) | ||
129 | P(0, (u32)fxsave->cwd | 0xffff0000); | ||
130 | P(1, (u32)fxsave->swd | 0xffff0000); | ||
131 | P(2, twd_fxsr_to_i387(fxsave)); | ||
132 | P(3, (u32)fxsave->rip); | ||
133 | P(4, cs | ((u32)fxsave->fop) << 16); | ||
134 | P(5, fxsave->rdp); | ||
135 | P(6, 0xffff0000 | ds); | ||
136 | #undef P | ||
137 | |||
138 | if (err) | ||
139 | return -1; | ||
140 | |||
141 | to = &buf->_st[0]; | ||
142 | from = (struct _fpxreg *) &fxsave->st_space[0]; | ||
143 | for ( i = 0 ; i < 8 ; i++, to++, from++ ) { | ||
144 | if (__copy_to_user(to, from, sizeof(*to))) | ||
145 | return -1; | ||
146 | } | ||
147 | return 0; | ||
148 | } | ||
149 | |||
150 | int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) | ||
151 | { | ||
152 | clear_fpu(tsk); | ||
153 | if (!fsave) { | ||
154 | if (__copy_from_user(&tsk->thread.i387.fxsave, | ||
155 | &buf->_fxsr_env[0], | ||
156 | sizeof(struct i387_fxsave_struct))) | ||
157 | return -1; | ||
158 | tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; | ||
159 | set_stopped_child_used_math(tsk); | ||
160 | } | ||
161 | return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf); | ||
162 | } | ||
163 | |||
164 | int save_i387_ia32(struct task_struct *tsk, | ||
165 | struct _fpstate_ia32 __user *buf, | ||
166 | struct pt_regs *regs, | ||
167 | int fsave) | ||
168 | { | ||
169 | int err = 0; | ||
170 | |||
171 | init_fpu(tsk); | ||
172 | if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk)) | ||
173 | return -1; | ||
174 | if (fsave) | ||
175 | return 0; | ||
176 | err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); | ||
177 | if (fsave) | ||
178 | return err ? -1 : 1; | ||
179 | err |= __put_user(X86_FXSR_MAGIC, &buf->magic); | ||
180 | err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, | ||
181 | sizeof(struct i387_fxsave_struct)); | ||
182 | return err ? -1 : 1; | ||
183 | } | ||
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c deleted file mode 100644 index 08781370256d..000000000000 --- a/arch/x86_64/ia32/ia32_aout.c +++ /dev/null | |||
@@ -1,528 +0,0 @@ | |||
1 | /* | ||
2 | * a.out loader for x86-64 | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992, 1996 Linus Torvalds | ||
5 | * Hacked together by Andi Kleen | ||
6 | */ | ||
7 | |||
8 | #include <linux/module.h> | ||
9 | |||
10 | #include <linux/time.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/mman.h> | ||
14 | #include <linux/a.out.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/signal.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/fs.h> | ||
19 | #include <linux/file.h> | ||
20 | #include <linux/stat.h> | ||
21 | #include <linux/fcntl.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/user.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/binfmts.h> | ||
26 | #include <linux/personality.h> | ||
27 | #include <linux/init.h> | ||
28 | |||
29 | #include <asm/system.h> | ||
30 | #include <asm/uaccess.h> | ||
31 | #include <asm/pgalloc.h> | ||
32 | #include <asm/cacheflush.h> | ||
33 | #include <asm/user32.h> | ||
34 | #include <asm/ia32.h> | ||
35 | |||
36 | #undef WARN_OLD | ||
37 | #undef CORE_DUMP /* probably broken */ | ||
38 | |||
39 | static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); | ||
40 | static int load_aout_library(struct file*); | ||
41 | |||
42 | #ifdef CORE_DUMP | ||
43 | static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); | ||
44 | |||
45 | /* | ||
46 | * fill in the user structure for a core dump.. | ||
47 | */ | ||
48 | static void dump_thread32(struct pt_regs * regs, struct user32 * dump) | ||
49 | { | ||
50 | u32 fs,gs; | ||
51 | |||
52 | /* changed the size calculations - should hopefully work better. lbt */ | ||
53 | dump->magic = CMAGIC; | ||
54 | dump->start_code = 0; | ||
55 | dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); | ||
56 | dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; | ||
57 | dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; | ||
58 | dump->u_dsize -= dump->u_tsize; | ||
59 | dump->u_ssize = 0; | ||
60 | dump->u_debugreg[0] = current->thread.debugreg0; | ||
61 | dump->u_debugreg[1] = current->thread.debugreg1; | ||
62 | dump->u_debugreg[2] = current->thread.debugreg2; | ||
63 | dump->u_debugreg[3] = current->thread.debugreg3; | ||
64 | dump->u_debugreg[4] = 0; | ||
65 | dump->u_debugreg[5] = 0; | ||
66 | dump->u_debugreg[6] = current->thread.debugreg6; | ||
67 | dump->u_debugreg[7] = current->thread.debugreg7; | ||
68 | |||
69 | if (dump->start_stack < 0xc0000000) | ||
70 | dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; | ||
71 | |||
72 | dump->regs.ebx = regs->rbx; | ||
73 | dump->regs.ecx = regs->rcx; | ||
74 | dump->regs.edx = regs->rdx; | ||
75 | dump->regs.esi = regs->rsi; | ||
76 | dump->regs.edi = regs->rdi; | ||
77 | dump->regs.ebp = regs->rbp; | ||
78 | dump->regs.eax = regs->rax; | ||
79 | dump->regs.ds = current->thread.ds; | ||
80 | dump->regs.es = current->thread.es; | ||
81 | asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; | ||
82 | asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; | ||
83 | dump->regs.orig_eax = regs->orig_rax; | ||
84 | dump->regs.eip = regs->rip; | ||
85 | dump->regs.cs = regs->cs; | ||
86 | dump->regs.eflags = regs->eflags; | ||
87 | dump->regs.esp = regs->rsp; | ||
88 | dump->regs.ss = regs->ss; | ||
89 | |||
90 | #if 1 /* FIXME */ | ||
91 | dump->u_fpvalid = 0; | ||
92 | #else | ||
93 | dump->u_fpvalid = dump_fpu (regs, &dump->i387); | ||
94 | #endif | ||
95 | } | ||
96 | |||
97 | #endif | ||
98 | |||
99 | static struct linux_binfmt aout_format = { | ||
100 | .module = THIS_MODULE, | ||
101 | .load_binary = load_aout_binary, | ||
102 | .load_shlib = load_aout_library, | ||
103 | #ifdef CORE_DUMP | ||
104 | .core_dump = aout_core_dump, | ||
105 | #endif | ||
106 | .min_coredump = PAGE_SIZE | ||
107 | }; | ||
108 | |||
109 | static void set_brk(unsigned long start, unsigned long end) | ||
110 | { | ||
111 | start = PAGE_ALIGN(start); | ||
112 | end = PAGE_ALIGN(end); | ||
113 | if (end <= start) | ||
114 | return; | ||
115 | down_write(¤t->mm->mmap_sem); | ||
116 | do_brk(start, end - start); | ||
117 | up_write(¤t->mm->mmap_sem); | ||
118 | } | ||
119 | |||
120 | #ifdef CORE_DUMP | ||
121 | /* | ||
122 | * These are the only things you should do on a core-file: use only these | ||
123 | * macros to write out all the necessary info. | ||
124 | */ | ||
125 | |||
126 | static int dump_write(struct file *file, const void *addr, int nr) | ||
127 | { | ||
128 | return file->f_op->write(file, addr, nr, &file->f_pos) == nr; | ||
129 | } | ||
130 | |||
131 | #define DUMP_WRITE(addr, nr) \ | ||
132 | if (!dump_write(file, (void *)(addr), (nr))) \ | ||
133 | goto end_coredump; | ||
134 | |||
135 | #define DUMP_SEEK(offset) \ | ||
136 | if (file->f_op->llseek) { \ | ||
137 | if (file->f_op->llseek(file,(offset),0) != (offset)) \ | ||
138 | goto end_coredump; \ | ||
139 | } else file->f_pos = (offset) | ||
140 | |||
141 | /* | ||
142 | * Routine writes a core dump image in the current directory. | ||
143 | * Currently only a stub-function. | ||
144 | * | ||
145 | * Note that setuid/setgid files won't make a core-dump if the uid/gid | ||
146 | * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" | ||
147 | * field, which also makes sure the core-dumps won't be recursive if the | ||
148 | * dumping of the process results in another error.. | ||
149 | */ | ||
150 | |||
151 | static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) | ||
152 | { | ||
153 | mm_segment_t fs; | ||
154 | int has_dumped = 0; | ||
155 | unsigned long dump_start, dump_size; | ||
156 | struct user32 dump; | ||
157 | # define START_DATA(u) (u.u_tsize << PAGE_SHIFT) | ||
158 | # define START_STACK(u) (u.start_stack) | ||
159 | |||
160 | fs = get_fs(); | ||
161 | set_fs(KERNEL_DS); | ||
162 | has_dumped = 1; | ||
163 | current->flags |= PF_DUMPCORE; | ||
164 | strncpy(dump.u_comm, current->comm, sizeof(current->comm)); | ||
165 | dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump))); | ||
166 | dump.signal = signr; | ||
167 | dump_thread32(regs, &dump); | ||
168 | |||
169 | /* If the size of the dump file exceeds the rlimit, then see what would happen | ||
170 | if we wrote the stack, but not the data area. */ | ||
171 | if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > | ||
172 | current->signal->rlim[RLIMIT_CORE].rlim_cur) | ||
173 | dump.u_dsize = 0; | ||
174 | |||
175 | /* Make sure we have enough room to write the stack and data areas. */ | ||
176 | if ((dump.u_ssize+1) * PAGE_SIZE > | ||
177 | current->signal->rlim[RLIMIT_CORE].rlim_cur) | ||
178 | dump.u_ssize = 0; | ||
179 | |||
180 | /* make sure we actually have a data and stack area to dump */ | ||
181 | set_fs(USER_DS); | ||
182 | if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) | ||
183 | dump.u_dsize = 0; | ||
184 | if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) | ||
185 | dump.u_ssize = 0; | ||
186 | |||
187 | set_fs(KERNEL_DS); | ||
188 | /* struct user */ | ||
189 | DUMP_WRITE(&dump,sizeof(dump)); | ||
190 | /* Now dump all of the user data. Include malloced stuff as well */ | ||
191 | DUMP_SEEK(PAGE_SIZE); | ||
192 | /* now we start writing out the user space info */ | ||
193 | set_fs(USER_DS); | ||
194 | /* Dump the data area */ | ||
195 | if (dump.u_dsize != 0) { | ||
196 | dump_start = START_DATA(dump); | ||
197 | dump_size = dump.u_dsize << PAGE_SHIFT; | ||
198 | DUMP_WRITE(dump_start,dump_size); | ||
199 | } | ||
200 | /* Now prepare to dump the stack area */ | ||
201 | if (dump.u_ssize != 0) { | ||
202 | dump_start = START_STACK(dump); | ||
203 | dump_size = dump.u_ssize << PAGE_SHIFT; | ||
204 | DUMP_WRITE(dump_start,dump_size); | ||
205 | } | ||
206 | /* Finally dump the task struct. Not be used by gdb, but could be useful */ | ||
207 | set_fs(KERNEL_DS); | ||
208 | DUMP_WRITE(current,sizeof(*current)); | ||
209 | end_coredump: | ||
210 | set_fs(fs); | ||
211 | return has_dumped; | ||
212 | } | ||
213 | #endif | ||
214 | |||
215 | /* | ||
216 | * create_aout_tables() parses the env- and arg-strings in new user | ||
217 | * memory and creates the pointer tables from them, and puts their | ||
218 | * addresses on the "stack", returning the new stack pointer value. | ||
219 | */ | ||
220 | static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) | ||
221 | { | ||
222 | u32 __user *argv; | ||
223 | u32 __user *envp; | ||
224 | u32 __user *sp; | ||
225 | int argc = bprm->argc; | ||
226 | int envc = bprm->envc; | ||
227 | |||
228 | sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); | ||
229 | sp -= envc+1; | ||
230 | envp = sp; | ||
231 | sp -= argc+1; | ||
232 | argv = sp; | ||
233 | put_user((unsigned long) envp,--sp); | ||
234 | put_user((unsigned long) argv,--sp); | ||
235 | put_user(argc,--sp); | ||
236 | current->mm->arg_start = (unsigned long) p; | ||
237 | while (argc-->0) { | ||
238 | char c; | ||
239 | put_user((u32)(unsigned long)p,argv++); | ||
240 | do { | ||
241 | get_user(c,p++); | ||
242 | } while (c); | ||
243 | } | ||
244 | put_user(0, argv); | ||
245 | current->mm->arg_end = current->mm->env_start = (unsigned long) p; | ||
246 | while (envc-->0) { | ||
247 | char c; | ||
248 | put_user((u32)(unsigned long)p,envp++); | ||
249 | do { | ||
250 | get_user(c,p++); | ||
251 | } while (c); | ||
252 | } | ||
253 | put_user(0, envp); | ||
254 | current->mm->env_end = (unsigned long) p; | ||
255 | return sp; | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * These are the functions used to load a.out style executables and shared | ||
260 | * libraries. There is no binary dependent code anywhere else. | ||
261 | */ | ||
262 | |||
263 | static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) | ||
264 | { | ||
265 | struct exec ex; | ||
266 | unsigned long error; | ||
267 | unsigned long fd_offset; | ||
268 | unsigned long rlim; | ||
269 | int retval; | ||
270 | |||
271 | ex = *((struct exec *) bprm->buf); /* exec-header */ | ||
272 | if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && | ||
273 | N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || | ||
274 | N_TRSIZE(ex) || N_DRSIZE(ex) || | ||
275 | i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { | ||
276 | return -ENOEXEC; | ||
277 | } | ||
278 | |||
279 | fd_offset = N_TXTOFF(ex); | ||
280 | |||
281 | /* Check initial limits. This avoids letting people circumvent | ||
282 | * size limits imposed on them by creating programs with large | ||
283 | * arrays in the data or bss. | ||
284 | */ | ||
285 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | ||
286 | if (rlim >= RLIM_INFINITY) | ||
287 | rlim = ~0; | ||
288 | if (ex.a_data + ex.a_bss > rlim) | ||
289 | return -ENOMEM; | ||
290 | |||
291 | /* Flush all traces of the currently running executable */ | ||
292 | retval = flush_old_exec(bprm); | ||
293 | if (retval) | ||
294 | return retval; | ||
295 | |||
296 | regs->cs = __USER32_CS; | ||
297 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = | ||
298 | regs->r13 = regs->r14 = regs->r15 = 0; | ||
299 | |||
300 | /* OK, This is the point of no return */ | ||
301 | set_personality(PER_LINUX); | ||
302 | set_thread_flag(TIF_IA32); | ||
303 | clear_thread_flag(TIF_ABI_PENDING); | ||
304 | |||
305 | current->mm->end_code = ex.a_text + | ||
306 | (current->mm->start_code = N_TXTADDR(ex)); | ||
307 | current->mm->end_data = ex.a_data + | ||
308 | (current->mm->start_data = N_DATADDR(ex)); | ||
309 | current->mm->brk = ex.a_bss + | ||
310 | (current->mm->start_brk = N_BSSADDR(ex)); | ||
311 | current->mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
312 | current->mm->cached_hole_size = 0; | ||
313 | |||
314 | current->mm->mmap = NULL; | ||
315 | compute_creds(bprm); | ||
316 | current->flags &= ~PF_FORKNOEXEC; | ||
317 | |||
318 | if (N_MAGIC(ex) == OMAGIC) { | ||
319 | unsigned long text_addr, map_size; | ||
320 | loff_t pos; | ||
321 | |||
322 | text_addr = N_TXTADDR(ex); | ||
323 | |||
324 | pos = 32; | ||
325 | map_size = ex.a_text+ex.a_data; | ||
326 | |||
327 | down_write(¤t->mm->mmap_sem); | ||
328 | error = do_brk(text_addr & PAGE_MASK, map_size); | ||
329 | up_write(¤t->mm->mmap_sem); | ||
330 | |||
331 | if (error != (text_addr & PAGE_MASK)) { | ||
332 | send_sig(SIGKILL, current, 0); | ||
333 | return error; | ||
334 | } | ||
335 | |||
336 | error = bprm->file->f_op->read(bprm->file, | ||
337 | (char __user *)text_addr, | ||
338 | ex.a_text+ex.a_data, &pos); | ||
339 | if ((signed long)error < 0) { | ||
340 | send_sig(SIGKILL, current, 0); | ||
341 | return error; | ||
342 | } | ||
343 | |||
344 | flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); | ||
345 | } else { | ||
346 | #ifdef WARN_OLD | ||
347 | static unsigned long error_time, error_time2; | ||
348 | if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && | ||
349 | (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) | ||
350 | { | ||
351 | printk(KERN_NOTICE "executable not page aligned\n"); | ||
352 | error_time2 = jiffies; | ||
353 | } | ||
354 | |||
355 | if ((fd_offset & ~PAGE_MASK) != 0 && | ||
356 | (jiffies-error_time) > 5*HZ) | ||
357 | { | ||
358 | printk(KERN_WARNING | ||
359 | "fd_offset is not page aligned. Please convert program: %s\n", | ||
360 | bprm->file->f_path.dentry->d_name.name); | ||
361 | error_time = jiffies; | ||
362 | } | ||
363 | #endif | ||
364 | |||
365 | if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { | ||
366 | loff_t pos = fd_offset; | ||
367 | down_write(¤t->mm->mmap_sem); | ||
368 | do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); | ||
369 | up_write(¤t->mm->mmap_sem); | ||
370 | bprm->file->f_op->read(bprm->file, | ||
371 | (char __user *)N_TXTADDR(ex), | ||
372 | ex.a_text+ex.a_data, &pos); | ||
373 | flush_icache_range((unsigned long) N_TXTADDR(ex), | ||
374 | (unsigned long) N_TXTADDR(ex) + | ||
375 | ex.a_text+ex.a_data); | ||
376 | goto beyond_if; | ||
377 | } | ||
378 | |||
379 | down_write(¤t->mm->mmap_sem); | ||
380 | error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, | ||
381 | PROT_READ | PROT_EXEC, | ||
382 | MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, | ||
383 | fd_offset); | ||
384 | up_write(¤t->mm->mmap_sem); | ||
385 | |||
386 | if (error != N_TXTADDR(ex)) { | ||
387 | send_sig(SIGKILL, current, 0); | ||
388 | return error; | ||
389 | } | ||
390 | |||
391 | down_write(¤t->mm->mmap_sem); | ||
392 | error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, | ||
393 | PROT_READ | PROT_WRITE | PROT_EXEC, | ||
394 | MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, | ||
395 | fd_offset + ex.a_text); | ||
396 | up_write(¤t->mm->mmap_sem); | ||
397 | if (error != N_DATADDR(ex)) { | ||
398 | send_sig(SIGKILL, current, 0); | ||
399 | return error; | ||
400 | } | ||
401 | } | ||
402 | beyond_if: | ||
403 | set_binfmt(&aout_format); | ||
404 | |||
405 | set_brk(current->mm->start_brk, current->mm->brk); | ||
406 | |||
407 | retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); | ||
408 | if (retval < 0) { | ||
409 | /* Someone check-me: is this error path enough? */ | ||
410 | send_sig(SIGKILL, current, 0); | ||
411 | return retval; | ||
412 | } | ||
413 | |||
414 | current->mm->start_stack = | ||
415 | (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); | ||
416 | /* start thread */ | ||
417 | asm volatile("movl %0,%%fs" :: "r" (0)); \ | ||
418 | asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); | ||
419 | load_gs_index(0); | ||
420 | (regs)->rip = ex.a_entry; | ||
421 | (regs)->rsp = current->mm->start_stack; | ||
422 | (regs)->eflags = 0x200; | ||
423 | (regs)->cs = __USER32_CS; | ||
424 | (regs)->ss = __USER32_DS; | ||
425 | set_fs(USER_DS); | ||
426 | if (unlikely(current->ptrace & PT_PTRACED)) { | ||
427 | if (current->ptrace & PT_TRACE_EXEC) | ||
428 | ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); | ||
429 | else | ||
430 | send_sig(SIGTRAP, current, 0); | ||
431 | } | ||
432 | return 0; | ||
433 | } | ||
434 | |||
435 | static int load_aout_library(struct file *file) | ||
436 | { | ||
437 | struct inode * inode; | ||
438 | unsigned long bss, start_addr, len; | ||
439 | unsigned long error; | ||
440 | int retval; | ||
441 | struct exec ex; | ||
442 | |||
443 | inode = file->f_path.dentry->d_inode; | ||
444 | |||
445 | retval = -ENOEXEC; | ||
446 | error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); | ||
447 | if (error != sizeof(ex)) | ||
448 | goto out; | ||
449 | |||
450 | /* We come in here for the regular a.out style of shared libraries */ | ||
451 | if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || | ||
452 | N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || | ||
453 | i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { | ||
454 | goto out; | ||
455 | } | ||
456 | |||
457 | if (N_FLAGS(ex)) | ||
458 | goto out; | ||
459 | |||
460 | /* For QMAGIC, the starting address is 0x20 into the page. We mask | ||
461 | this off to get the starting address for the page */ | ||
462 | |||
463 | start_addr = ex.a_entry & 0xfffff000; | ||
464 | |||
465 | if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { | ||
466 | loff_t pos = N_TXTOFF(ex); | ||
467 | |||
468 | #ifdef WARN_OLD | ||
469 | static unsigned long error_time; | ||
470 | if ((jiffies-error_time) > 5*HZ) | ||
471 | { | ||
472 | printk(KERN_WARNING | ||
473 | "N_TXTOFF is not page aligned. Please convert library: %s\n", | ||
474 | file->f_path.dentry->d_name.name); | ||
475 | error_time = jiffies; | ||
476 | } | ||
477 | #endif | ||
478 | down_write(¤t->mm->mmap_sem); | ||
479 | do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); | ||
480 | up_write(¤t->mm->mmap_sem); | ||
481 | |||
482 | file->f_op->read(file, (char __user *)start_addr, | ||
483 | ex.a_text + ex.a_data, &pos); | ||
484 | flush_icache_range((unsigned long) start_addr, | ||
485 | (unsigned long) start_addr + ex.a_text + ex.a_data); | ||
486 | |||
487 | retval = 0; | ||
488 | goto out; | ||
489 | } | ||
490 | /* Now use mmap to map the library into memory. */ | ||
491 | down_write(¤t->mm->mmap_sem); | ||
492 | error = do_mmap(file, start_addr, ex.a_text + ex.a_data, | ||
493 | PROT_READ | PROT_WRITE | PROT_EXEC, | ||
494 | MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT, | ||
495 | N_TXTOFF(ex)); | ||
496 | up_write(¤t->mm->mmap_sem); | ||
497 | retval = error; | ||
498 | if (error != start_addr) | ||
499 | goto out; | ||
500 | |||
501 | len = PAGE_ALIGN(ex.a_text + ex.a_data); | ||
502 | bss = ex.a_text + ex.a_data + ex.a_bss; | ||
503 | if (bss > len) { | ||
504 | down_write(¤t->mm->mmap_sem); | ||
505 | error = do_brk(start_addr + len, bss - len); | ||
506 | up_write(¤t->mm->mmap_sem); | ||
507 | retval = error; | ||
508 | if (error != start_addr + len) | ||
509 | goto out; | ||
510 | } | ||
511 | retval = 0; | ||
512 | out: | ||
513 | return retval; | ||
514 | } | ||
515 | |||
516 | static int __init init_aout_binfmt(void) | ||
517 | { | ||
518 | return register_binfmt(&aout_format); | ||
519 | } | ||
520 | |||
521 | static void __exit exit_aout_binfmt(void) | ||
522 | { | ||
523 | unregister_binfmt(&aout_format); | ||
524 | } | ||
525 | |||
526 | module_init(init_aout_binfmt); | ||
527 | module_exit(exit_aout_binfmt); | ||
528 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c deleted file mode 100644 index dffd2ac72747..000000000000 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ /dev/null | |||
@@ -1,320 +0,0 @@ | |||
1 | /* | ||
2 | * Written 2000,2002 by Andi Kleen. | ||
3 | * | ||
4 | * Loosely based on the sparc64 and IA64 32bit emulation loaders. | ||
5 | * This tricks binfmt_elf.c into loading 32bit binaries using lots | ||
6 | * of ugly preprocessor tricks. Talk about very very poor man's inheritance. | ||
7 | */ | ||
8 | #define __ASM_X86_64_ELF_H 1 | ||
9 | |||
10 | #undef ELF_CLASS | ||
11 | #define ELF_CLASS ELFCLASS32 | ||
12 | |||
13 | #include <linux/types.h> | ||
14 | #include <linux/stddef.h> | ||
15 | #include <linux/rwsem.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/compat.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/binfmts.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/security.h> | ||
22 | |||
23 | #include <asm/segment.h> | ||
24 | #include <asm/ptrace.h> | ||
25 | #include <asm/processor.h> | ||
26 | #include <asm/user32.h> | ||
27 | #include <asm/sigcontext32.h> | ||
28 | #include <asm/fpu32.h> | ||
29 | #include <asm/i387.h> | ||
30 | #include <asm/uaccess.h> | ||
31 | #include <asm/ia32.h> | ||
32 | #include <asm/vsyscall32.h> | ||
33 | |||
34 | #define ELF_NAME "elf/i386" | ||
35 | |||
36 | #define AT_SYSINFO 32 | ||
37 | #define AT_SYSINFO_EHDR 33 | ||
38 | |||
39 | int sysctl_vsyscall32 = 1; | ||
40 | |||
41 | #undef ARCH_DLINFO | ||
42 | #define ARCH_DLINFO do { \ | ||
43 | if (sysctl_vsyscall32) { \ | ||
44 | current->mm->context.vdso = (void *)VSYSCALL32_BASE; \ | ||
45 | NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ | ||
46 | NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \ | ||
47 | } \ | ||
48 | } while(0) | ||
49 | |||
50 | struct file; | ||
51 | struct elf_phdr; | ||
52 | |||
53 | #define IA32_EMULATOR 1 | ||
54 | |||
55 | #define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) | ||
56 | |||
57 | #undef ELF_ARCH | ||
58 | #define ELF_ARCH EM_386 | ||
59 | |||
60 | #define ELF_DATA ELFDATA2LSB | ||
61 | |||
62 | #define USE_ELF_CORE_DUMP 1 | ||
63 | |||
64 | /* Override elfcore.h */ | ||
65 | #define _LINUX_ELFCORE_H 1 | ||
66 | typedef unsigned int elf_greg_t; | ||
67 | |||
68 | #define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t)) | ||
69 | typedef elf_greg_t elf_gregset_t[ELF_NGREG]; | ||
70 | |||
71 | struct elf_siginfo | ||
72 | { | ||
73 | int si_signo; /* signal number */ | ||
74 | int si_code; /* extra code */ | ||
75 | int si_errno; /* errno */ | ||
76 | }; | ||
77 | |||
78 | #define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) | ||
79 | |||
80 | struct elf_prstatus | ||
81 | { | ||
82 | struct elf_siginfo pr_info; /* Info associated with signal */ | ||
83 | short pr_cursig; /* Current signal */ | ||
84 | unsigned int pr_sigpend; /* Set of pending signals */ | ||
85 | unsigned int pr_sighold; /* Set of held signals */ | ||
86 | pid_t pr_pid; | ||
87 | pid_t pr_ppid; | ||
88 | pid_t pr_pgrp; | ||
89 | pid_t pr_sid; | ||
90 | struct compat_timeval pr_utime; /* User time */ | ||
91 | struct compat_timeval pr_stime; /* System time */ | ||
92 | struct compat_timeval pr_cutime; /* Cumulative user time */ | ||
93 | struct compat_timeval pr_cstime; /* Cumulative system time */ | ||
94 | elf_gregset_t pr_reg; /* GP registers */ | ||
95 | int pr_fpvalid; /* True if math co-processor being used. */ | ||
96 | }; | ||
97 | |||
98 | #define ELF_PRARGSZ (80) /* Number of chars for args */ | ||
99 | |||
100 | struct elf_prpsinfo | ||
101 | { | ||
102 | char pr_state; /* numeric process state */ | ||
103 | char pr_sname; /* char for pr_state */ | ||
104 | char pr_zomb; /* zombie */ | ||
105 | char pr_nice; /* nice val */ | ||
106 | unsigned int pr_flag; /* flags */ | ||
107 | __u16 pr_uid; | ||
108 | __u16 pr_gid; | ||
109 | pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; | ||
110 | /* Lots missing */ | ||
111 | char pr_fname[16]; /* filename of executable */ | ||
112 | char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ | ||
113 | }; | ||
114 | |||
115 | #define __STR(x) #x | ||
116 | #define STR(x) __STR(x) | ||
117 | |||
118 | #define _GET_SEG(x) \ | ||
119 | ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; }) | ||
120 | |||
121 | /* Assumes current==process to be dumped */ | ||
122 | #define ELF_CORE_COPY_REGS(pr_reg, regs) \ | ||
123 | pr_reg[0] = regs->rbx; \ | ||
124 | pr_reg[1] = regs->rcx; \ | ||
125 | pr_reg[2] = regs->rdx; \ | ||
126 | pr_reg[3] = regs->rsi; \ | ||
127 | pr_reg[4] = regs->rdi; \ | ||
128 | pr_reg[5] = regs->rbp; \ | ||
129 | pr_reg[6] = regs->rax; \ | ||
130 | pr_reg[7] = _GET_SEG(ds); \ | ||
131 | pr_reg[8] = _GET_SEG(es); \ | ||
132 | pr_reg[9] = _GET_SEG(fs); \ | ||
133 | pr_reg[10] = _GET_SEG(gs); \ | ||
134 | pr_reg[11] = regs->orig_rax; \ | ||
135 | pr_reg[12] = regs->rip; \ | ||
136 | pr_reg[13] = regs->cs; \ | ||
137 | pr_reg[14] = regs->eflags; \ | ||
138 | pr_reg[15] = regs->rsp; \ | ||
139 | pr_reg[16] = regs->ss; | ||
140 | |||
141 | #define user user32 | ||
142 | |||
143 | #undef elf_read_implies_exec | ||
144 | #define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) | ||
145 | //#include <asm/ia32.h> | ||
146 | #include <linux/elf.h> | ||
147 | |||
148 | typedef struct user_i387_ia32_struct elf_fpregset_t; | ||
149 | typedef struct user32_fxsr_struct elf_fpxregset_t; | ||
150 | |||
151 | |||
152 | static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) | ||
153 | { | ||
154 | ELF_CORE_COPY_REGS((*elfregs), regs) | ||
155 | } | ||
156 | |||
157 | static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) | ||
158 | { | ||
159 | struct pt_regs *pp = task_pt_regs(t); | ||
160 | ELF_CORE_COPY_REGS((*elfregs), pp); | ||
161 | /* fix wrong segments */ | ||
162 | (*elfregs)[7] = t->thread.ds; | ||
163 | (*elfregs)[9] = t->thread.fsindex; | ||
164 | (*elfregs)[10] = t->thread.gsindex; | ||
165 | (*elfregs)[8] = t->thread.es; | ||
166 | return 1; | ||
167 | } | ||
168 | |||
169 | static inline int | ||
170 | elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu) | ||
171 | { | ||
172 | struct _fpstate_ia32 *fpstate = (void*)fpu; | ||
173 | mm_segment_t oldfs = get_fs(); | ||
174 | |||
175 | if (!tsk_used_math(tsk)) | ||
176 | return 0; | ||
177 | if (!regs) | ||
178 | regs = task_pt_regs(tsk); | ||
179 | if (tsk == current) | ||
180 | unlazy_fpu(tsk); | ||
181 | set_fs(KERNEL_DS); | ||
182 | save_i387_ia32(tsk, fpstate, regs, 1); | ||
183 | /* Correct for i386 bug. It puts the fop into the upper 16bits of | ||
184 | the tag word (like FXSAVE), not into the fcs*/ | ||
185 | fpstate->cssel |= fpstate->tag & 0xffff0000; | ||
186 | set_fs(oldfs); | ||
187 | return 1; | ||
188 | } | ||
189 | |||
190 | #define ELF_CORE_COPY_XFPREGS 1 | ||
191 | static inline int | ||
192 | elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) | ||
193 | { | ||
194 | struct pt_regs *regs = task_pt_regs(t); | ||
195 | if (!tsk_used_math(t)) | ||
196 | return 0; | ||
197 | if (t == current) | ||
198 | unlazy_fpu(t); | ||
199 | memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t)); | ||
200 | xfpu->fcs = regs->cs; | ||
201 | xfpu->fos = t->thread.ds; /* right? */ | ||
202 | return 1; | ||
203 | } | ||
204 | |||
205 | #undef elf_check_arch | ||
206 | #define elf_check_arch(x) \ | ||
207 | ((x)->e_machine == EM_386) | ||
208 | |||
209 | extern int force_personality32; | ||
210 | |||
211 | #define ELF_EXEC_PAGESIZE PAGE_SIZE | ||
212 | #define ELF_HWCAP (boot_cpu_data.x86_capability[0]) | ||
213 | #define ELF_PLATFORM ("i686") | ||
214 | #define SET_PERSONALITY(ex, ibcs2) \ | ||
215 | do { \ | ||
216 | unsigned long new_flags = 0; \ | ||
217 | if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ | ||
218 | new_flags = _TIF_IA32; \ | ||
219 | if ((current_thread_info()->flags & _TIF_IA32) \ | ||
220 | != new_flags) \ | ||
221 | set_thread_flag(TIF_ABI_PENDING); \ | ||
222 | else \ | ||
223 | clear_thread_flag(TIF_ABI_PENDING); \ | ||
224 | /* XXX This overwrites the user set personality */ \ | ||
225 | current->personality |= force_personality32; \ | ||
226 | } while (0) | ||
227 | |||
228 | /* Override some function names */ | ||
229 | #define elf_format elf32_format | ||
230 | |||
231 | #define init_elf_binfmt init_elf32_binfmt | ||
232 | #define exit_elf_binfmt exit_elf32_binfmt | ||
233 | |||
234 | #define load_elf_binary load_elf32_binary | ||
235 | |||
236 | #define ELF_PLAT_INIT(r, load_addr) elf32_init(r) | ||
237 | |||
238 | #undef start_thread | ||
239 | #define start_thread(regs,new_rip,new_rsp) do { \ | ||
240 | asm volatile("movl %0,%%fs" :: "r" (0)); \ | ||
241 | asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \ | ||
242 | load_gs_index(0); \ | ||
243 | (regs)->rip = (new_rip); \ | ||
244 | (regs)->rsp = (new_rsp); \ | ||
245 | (regs)->eflags = 0x200; \ | ||
246 | (regs)->cs = __USER32_CS; \ | ||
247 | (regs)->ss = __USER32_DS; \ | ||
248 | set_fs(USER_DS); \ | ||
249 | } while(0) | ||
250 | |||
251 | |||
252 | #include <linux/module.h> | ||
253 | |||
254 | MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); | ||
255 | MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); | ||
256 | |||
257 | #undef MODULE_DESCRIPTION | ||
258 | #undef MODULE_AUTHOR | ||
259 | |||
260 | static void elf32_init(struct pt_regs *); | ||
261 | |||
262 | #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 | ||
263 | #define arch_setup_additional_pages syscall32_setup_pages | ||
264 | extern int syscall32_setup_pages(struct linux_binprm *, int exstack); | ||
265 | |||
266 | #include "../../../fs/binfmt_elf.c" | ||
267 | |||
268 | static void elf32_init(struct pt_regs *regs) | ||
269 | { | ||
270 | struct task_struct *me = current; | ||
271 | regs->rdi = 0; | ||
272 | regs->rsi = 0; | ||
273 | regs->rdx = 0; | ||
274 | regs->rcx = 0; | ||
275 | regs->rax = 0; | ||
276 | regs->rbx = 0; | ||
277 | regs->rbp = 0; | ||
278 | regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = | ||
279 | regs->r13 = regs->r14 = regs->r15 = 0; | ||
280 | me->thread.fs = 0; | ||
281 | me->thread.gs = 0; | ||
282 | me->thread.fsindex = 0; | ||
283 | me->thread.gsindex = 0; | ||
284 | me->thread.ds = __USER_DS; | ||
285 | me->thread.es = __USER_DS; | ||
286 | } | ||
287 | |||
288 | #ifdef CONFIG_SYSCTL | ||
289 | /* Register vsyscall32 into the ABI table */ | ||
290 | #include <linux/sysctl.h> | ||
291 | |||
292 | static ctl_table abi_table2[] = { | ||
293 | { | ||
294 | .ctl_name = 99, | ||
295 | .procname = "vsyscall32", | ||
296 | .data = &sysctl_vsyscall32, | ||
297 | .maxlen = sizeof(int), | ||
298 | .mode = 0644, | ||
299 | .proc_handler = proc_dointvec | ||
300 | }, | ||
301 | {} | ||
302 | }; | ||
303 | |||
304 | static ctl_table abi_root_table2[] = { | ||
305 | { | ||
306 | .ctl_name = CTL_ABI, | ||
307 | .procname = "abi", | ||
308 | .mode = 0555, | ||
309 | .child = abi_table2 | ||
310 | }, | ||
311 | {} | ||
312 | }; | ||
313 | |||
314 | static __init int ia32_binfmt_init(void) | ||
315 | { | ||
316 | register_sysctl_table(abi_root_table2); | ||
317 | return 0; | ||
318 | } | ||
319 | __initcall(ia32_binfmt_init); | ||
320 | #endif | ||
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c deleted file mode 100644 index 6ea19c25f90d..000000000000 --- a/arch/x86_64/ia32/ia32_signal.c +++ /dev/null | |||
@@ -1,617 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/ia32/ia32_signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson | ||
7 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | ||
8 | * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen | ||
9 | */ | ||
10 | |||
11 | #include <linux/sched.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/signal.h> | ||
16 | #include <linux/errno.h> | ||
17 | #include <linux/wait.h> | ||
18 | #include <linux/ptrace.h> | ||
19 | #include <linux/unistd.h> | ||
20 | #include <linux/stddef.h> | ||
21 | #include <linux/personality.h> | ||
22 | #include <linux/compat.h> | ||
23 | #include <linux/binfmts.h> | ||
24 | #include <asm/ucontext.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/i387.h> | ||
27 | #include <asm/ia32.h> | ||
28 | #include <asm/ptrace.h> | ||
29 | #include <asm/ia32_unistd.h> | ||
30 | #include <asm/user32.h> | ||
31 | #include <asm/sigcontext32.h> | ||
32 | #include <asm/fpu32.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm/vsyscall32.h> | ||
35 | |||
36 | #define DEBUG_SIG 0 | ||
37 | |||
38 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | ||
39 | |||
40 | asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); | ||
41 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where); | ||
42 | |||
43 | int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) | ||
44 | { | ||
45 | int err; | ||
46 | if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) | ||
47 | return -EFAULT; | ||
48 | |||
49 | /* If you change siginfo_t structure, please make sure that | ||
50 | this code is fixed accordingly. | ||
51 | It should never copy any pad contained in the structure | ||
52 | to avoid security leaks, but must copy the generic | ||
53 | 3 ints plus the relevant union member. */ | ||
54 | err = __put_user(from->si_signo, &to->si_signo); | ||
55 | err |= __put_user(from->si_errno, &to->si_errno); | ||
56 | err |= __put_user((short)from->si_code, &to->si_code); | ||
57 | |||
58 | if (from->si_code < 0) { | ||
59 | err |= __put_user(from->si_pid, &to->si_pid); | ||
60 | err |= __put_user(from->si_uid, &to->si_uid); | ||
61 | err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); | ||
62 | } else { | ||
63 | /* First 32bits of unions are always present: | ||
64 | * si_pid === si_band === si_tid === si_addr(LS half) */ | ||
65 | err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); | ||
66 | switch (from->si_code >> 16) { | ||
67 | case __SI_FAULT >> 16: | ||
68 | break; | ||
69 | case __SI_CHLD >> 16: | ||
70 | err |= __put_user(from->si_utime, &to->si_utime); | ||
71 | err |= __put_user(from->si_stime, &to->si_stime); | ||
72 | err |= __put_user(from->si_status, &to->si_status); | ||
73 | /* FALL THROUGH */ | ||
74 | default: | ||
75 | case __SI_KILL >> 16: | ||
76 | err |= __put_user(from->si_uid, &to->si_uid); | ||
77 | break; | ||
78 | case __SI_POLL >> 16: | ||
79 | err |= __put_user(from->si_fd, &to->si_fd); | ||
80 | break; | ||
81 | case __SI_TIMER >> 16: | ||
82 | err |= __put_user(from->si_overrun, &to->si_overrun); | ||
83 | err |= __put_user(ptr_to_compat(from->si_ptr), | ||
84 | &to->si_ptr); | ||
85 | break; | ||
86 | case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ | ||
87 | case __SI_MESGQ >> 16: | ||
88 | err |= __put_user(from->si_uid, &to->si_uid); | ||
89 | err |= __put_user(from->si_int, &to->si_int); | ||
90 | break; | ||
91 | } | ||
92 | } | ||
93 | return err; | ||
94 | } | ||
95 | |||
96 | int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) | ||
97 | { | ||
98 | int err; | ||
99 | u32 ptr32; | ||
100 | if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) | ||
101 | return -EFAULT; | ||
102 | |||
103 | err = __get_user(to->si_signo, &from->si_signo); | ||
104 | err |= __get_user(to->si_errno, &from->si_errno); | ||
105 | err |= __get_user(to->si_code, &from->si_code); | ||
106 | |||
107 | err |= __get_user(to->si_pid, &from->si_pid); | ||
108 | err |= __get_user(to->si_uid, &from->si_uid); | ||
109 | err |= __get_user(ptr32, &from->si_ptr); | ||
110 | to->si_ptr = compat_ptr(ptr32); | ||
111 | |||
112 | return err; | ||
113 | } | ||
114 | |||
115 | asmlinkage long | ||
116 | sys32_sigsuspend(int history0, int history1, old_sigset_t mask) | ||
117 | { | ||
118 | mask &= _BLOCKABLE; | ||
119 | spin_lock_irq(¤t->sighand->siglock); | ||
120 | current->saved_sigmask = current->blocked; | ||
121 | siginitset(¤t->blocked, mask); | ||
122 | recalc_sigpending(); | ||
123 | spin_unlock_irq(¤t->sighand->siglock); | ||
124 | |||
125 | current->state = TASK_INTERRUPTIBLE; | ||
126 | schedule(); | ||
127 | set_thread_flag(TIF_RESTORE_SIGMASK); | ||
128 | return -ERESTARTNOHAND; | ||
129 | } | ||
130 | |||
131 | asmlinkage long | ||
132 | sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, | ||
133 | stack_ia32_t __user *uoss_ptr, | ||
134 | struct pt_regs *regs) | ||
135 | { | ||
136 | stack_t uss,uoss; | ||
137 | int ret; | ||
138 | mm_segment_t seg; | ||
139 | if (uss_ptr) { | ||
140 | u32 ptr; | ||
141 | memset(&uss,0,sizeof(stack_t)); | ||
142 | if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || | ||
143 | __get_user(ptr, &uss_ptr->ss_sp) || | ||
144 | __get_user(uss.ss_flags, &uss_ptr->ss_flags) || | ||
145 | __get_user(uss.ss_size, &uss_ptr->ss_size)) | ||
146 | return -EFAULT; | ||
147 | uss.ss_sp = compat_ptr(ptr); | ||
148 | } | ||
149 | seg = get_fs(); | ||
150 | set_fs(KERNEL_DS); | ||
151 | ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); | ||
152 | set_fs(seg); | ||
153 | if (ret >= 0 && uoss_ptr) { | ||
154 | if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || | ||
155 | __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || | ||
156 | __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || | ||
157 | __put_user(uoss.ss_size, &uoss_ptr->ss_size)) | ||
158 | ret = -EFAULT; | ||
159 | } | ||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * Do a signal return; undo the signal stack. | ||
165 | */ | ||
166 | |||
167 | struct sigframe | ||
168 | { | ||
169 | u32 pretcode; | ||
170 | int sig; | ||
171 | struct sigcontext_ia32 sc; | ||
172 | struct _fpstate_ia32 fpstate; | ||
173 | unsigned int extramask[_COMPAT_NSIG_WORDS-1]; | ||
174 | char retcode[8]; | ||
175 | }; | ||
176 | |||
177 | struct rt_sigframe | ||
178 | { | ||
179 | u32 pretcode; | ||
180 | int sig; | ||
181 | u32 pinfo; | ||
182 | u32 puc; | ||
183 | compat_siginfo_t info; | ||
184 | struct ucontext_ia32 uc; | ||
185 | struct _fpstate_ia32 fpstate; | ||
186 | char retcode[8]; | ||
187 | }; | ||
188 | |||
189 | static int | ||
190 | ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) | ||
191 | { | ||
192 | unsigned int err = 0; | ||
193 | |||
194 | /* Always make any pending restarted system calls return -EINTR */ | ||
195 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
196 | |||
197 | #if DEBUG_SIG | ||
198 | printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", | ||
199 | sc, sc->err, sc->eip, sc->cs, sc->eflags); | ||
200 | #endif | ||
201 | #define COPY(x) { \ | ||
202 | unsigned int reg; \ | ||
203 | err |= __get_user(reg, &sc->e ##x); \ | ||
204 | regs->r ## x = reg; \ | ||
205 | } | ||
206 | |||
207 | #define RELOAD_SEG(seg,mask) \ | ||
208 | { unsigned int cur; \ | ||
209 | unsigned short pre; \ | ||
210 | err |= __get_user(pre, &sc->seg); \ | ||
211 | asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ | ||
212 | pre |= mask; \ | ||
213 | if (pre != cur) loadsegment(seg,pre); } | ||
214 | |||
215 | /* Reload fs and gs if they have changed in the signal handler. | ||
216 | This does not handle long fs/gs base changes in the handler, but | ||
217 | does not clobber them at least in the normal case. */ | ||
218 | |||
219 | { | ||
220 | unsigned gs, oldgs; | ||
221 | err |= __get_user(gs, &sc->gs); | ||
222 | gs |= 3; | ||
223 | asm("movl %%gs,%0" : "=r" (oldgs)); | ||
224 | if (gs != oldgs) | ||
225 | load_gs_index(gs); | ||
226 | } | ||
227 | RELOAD_SEG(fs,3); | ||
228 | RELOAD_SEG(ds,3); | ||
229 | RELOAD_SEG(es,3); | ||
230 | |||
231 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); | ||
232 | COPY(dx); COPY(cx); COPY(ip); | ||
233 | /* Don't touch extended registers */ | ||
234 | |||
235 | err |= __get_user(regs->cs, &sc->cs); | ||
236 | regs->cs |= 3; | ||
237 | err |= __get_user(regs->ss, &sc->ss); | ||
238 | regs->ss |= 3; | ||
239 | |||
240 | { | ||
241 | unsigned int tmpflags; | ||
242 | err |= __get_user(tmpflags, &sc->eflags); | ||
243 | regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); | ||
244 | regs->orig_rax = -1; /* disable syscall checks */ | ||
245 | } | ||
246 | |||
247 | { | ||
248 | u32 tmp; | ||
249 | struct _fpstate_ia32 __user * buf; | ||
250 | err |= __get_user(tmp, &sc->fpstate); | ||
251 | buf = compat_ptr(tmp); | ||
252 | if (buf) { | ||
253 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
254 | goto badframe; | ||
255 | err |= restore_i387_ia32(current, buf, 0); | ||
256 | } else { | ||
257 | struct task_struct *me = current; | ||
258 | if (used_math()) { | ||
259 | clear_fpu(me); | ||
260 | clear_used_math(); | ||
261 | } | ||
262 | } | ||
263 | } | ||
264 | |||
265 | { | ||
266 | u32 tmp; | ||
267 | err |= __get_user(tmp, &sc->eax); | ||
268 | *peax = tmp; | ||
269 | } | ||
270 | return err; | ||
271 | |||
272 | badframe: | ||
273 | return 1; | ||
274 | } | ||
275 | |||
276 | asmlinkage long sys32_sigreturn(struct pt_regs *regs) | ||
277 | { | ||
278 | struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); | ||
279 | sigset_t set; | ||
280 | unsigned int eax; | ||
281 | |||
282 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | ||
283 | goto badframe; | ||
284 | if (__get_user(set.sig[0], &frame->sc.oldmask) | ||
285 | || (_COMPAT_NSIG_WORDS > 1 | ||
286 | && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, | ||
287 | sizeof(frame->extramask)))) | ||
288 | goto badframe; | ||
289 | |||
290 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
291 | spin_lock_irq(¤t->sighand->siglock); | ||
292 | current->blocked = set; | ||
293 | recalc_sigpending(); | ||
294 | spin_unlock_irq(¤t->sighand->siglock); | ||
295 | |||
296 | if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) | ||
297 | goto badframe; | ||
298 | return eax; | ||
299 | |||
300 | badframe: | ||
301 | signal_fault(regs, frame, "32bit sigreturn"); | ||
302 | return 0; | ||
303 | } | ||
304 | |||
305 | asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) | ||
306 | { | ||
307 | struct rt_sigframe __user *frame; | ||
308 | sigset_t set; | ||
309 | unsigned int eax; | ||
310 | struct pt_regs tregs; | ||
311 | |||
312 | frame = (struct rt_sigframe __user *)(regs->rsp - 4); | ||
313 | |||
314 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | ||
315 | goto badframe; | ||
316 | if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) | ||
317 | goto badframe; | ||
318 | |||
319 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
320 | spin_lock_irq(¤t->sighand->siglock); | ||
321 | current->blocked = set; | ||
322 | recalc_sigpending(); | ||
323 | spin_unlock_irq(¤t->sighand->siglock); | ||
324 | |||
325 | if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) | ||
326 | goto badframe; | ||
327 | |||
328 | tregs = *regs; | ||
329 | if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) | ||
330 | goto badframe; | ||
331 | |||
332 | return eax; | ||
333 | |||
334 | badframe: | ||
335 | signal_fault(regs,frame,"32bit rt sigreturn"); | ||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Set up a signal frame. | ||
341 | */ | ||
342 | |||
343 | static int | ||
344 | ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, | ||
345 | struct pt_regs *regs, unsigned int mask) | ||
346 | { | ||
347 | int tmp, err = 0; | ||
348 | |||
349 | tmp = 0; | ||
350 | __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); | ||
351 | err |= __put_user(tmp, (unsigned int __user *)&sc->gs); | ||
352 | __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); | ||
353 | err |= __put_user(tmp, (unsigned int __user *)&sc->fs); | ||
354 | __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); | ||
355 | err |= __put_user(tmp, (unsigned int __user *)&sc->ds); | ||
356 | __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); | ||
357 | err |= __put_user(tmp, (unsigned int __user *)&sc->es); | ||
358 | |||
359 | err |= __put_user((u32)regs->rdi, &sc->edi); | ||
360 | err |= __put_user((u32)regs->rsi, &sc->esi); | ||
361 | err |= __put_user((u32)regs->rbp, &sc->ebp); | ||
362 | err |= __put_user((u32)regs->rsp, &sc->esp); | ||
363 | err |= __put_user((u32)regs->rbx, &sc->ebx); | ||
364 | err |= __put_user((u32)regs->rdx, &sc->edx); | ||
365 | err |= __put_user((u32)regs->rcx, &sc->ecx); | ||
366 | err |= __put_user((u32)regs->rax, &sc->eax); | ||
367 | err |= __put_user((u32)regs->cs, &sc->cs); | ||
368 | err |= __put_user((u32)regs->ss, &sc->ss); | ||
369 | err |= __put_user(current->thread.trap_no, &sc->trapno); | ||
370 | err |= __put_user(current->thread.error_code, &sc->err); | ||
371 | err |= __put_user((u32)regs->rip, &sc->eip); | ||
372 | err |= __put_user((u32)regs->eflags, &sc->eflags); | ||
373 | err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); | ||
374 | |||
375 | tmp = save_i387_ia32(current, fpstate, regs, 0); | ||
376 | if (tmp < 0) | ||
377 | err = -EFAULT; | ||
378 | else { | ||
379 | clear_used_math(); | ||
380 | stts(); | ||
381 | err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), | ||
382 | &sc->fpstate); | ||
383 | } | ||
384 | |||
385 | /* non-iBCS2 extensions.. */ | ||
386 | err |= __put_user(mask, &sc->oldmask); | ||
387 | err |= __put_user(current->thread.cr2, &sc->cr2); | ||
388 | |||
389 | return err; | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * Determine which stack to use.. | ||
394 | */ | ||
395 | static void __user * | ||
396 | get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) | ||
397 | { | ||
398 | unsigned long rsp; | ||
399 | |||
400 | /* Default to using normal stack */ | ||
401 | rsp = regs->rsp; | ||
402 | |||
403 | /* This is the X/Open sanctioned signal stack switching. */ | ||
404 | if (ka->sa.sa_flags & SA_ONSTACK) { | ||
405 | if (sas_ss_flags(rsp) == 0) | ||
406 | rsp = current->sas_ss_sp + current->sas_ss_size; | ||
407 | } | ||
408 | |||
409 | /* This is the legacy signal stack switching. */ | ||
410 | else if ((regs->ss & 0xffff) != __USER_DS && | ||
411 | !(ka->sa.sa_flags & SA_RESTORER) && | ||
412 | ka->sa.sa_restorer) { | ||
413 | rsp = (unsigned long) ka->sa.sa_restorer; | ||
414 | } | ||
415 | |||
416 | rsp -= frame_size; | ||
417 | /* Align the stack pointer according to the i386 ABI, | ||
418 | * i.e. so that on function entry ((sp + 4) & 15) == 0. */ | ||
419 | rsp = ((rsp + 4) & -16ul) - 4; | ||
420 | return (void __user *) rsp; | ||
421 | } | ||
422 | |||
423 | int ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
424 | compat_sigset_t *set, struct pt_regs * regs) | ||
425 | { | ||
426 | struct sigframe __user *frame; | ||
427 | int err = 0; | ||
428 | |||
429 | frame = get_sigframe(ka, regs, sizeof(*frame)); | ||
430 | |||
431 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
432 | goto give_sigsegv; | ||
433 | |||
434 | err |= __put_user(sig, &frame->sig); | ||
435 | if (err) | ||
436 | goto give_sigsegv; | ||
437 | |||
438 | err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, | ||
439 | set->sig[0]); | ||
440 | if (err) | ||
441 | goto give_sigsegv; | ||
442 | |||
443 | if (_COMPAT_NSIG_WORDS > 1) { | ||
444 | err |= __copy_to_user(frame->extramask, &set->sig[1], | ||
445 | sizeof(frame->extramask)); | ||
446 | } | ||
447 | if (err) | ||
448 | goto give_sigsegv; | ||
449 | |||
450 | /* Return stub is in 32bit vsyscall page */ | ||
451 | { | ||
452 | void __user *restorer; | ||
453 | if (current->binfmt->hasvdso) | ||
454 | restorer = VSYSCALL32_SIGRETURN; | ||
455 | else | ||
456 | restorer = (void *)&frame->retcode; | ||
457 | if (ka->sa.sa_flags & SA_RESTORER) | ||
458 | restorer = ka->sa.sa_restorer; | ||
459 | err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); | ||
460 | } | ||
461 | /* These are actually not used anymore, but left because some | ||
462 | gdb versions depend on them as a marker. */ | ||
463 | { | ||
464 | /* copy_to_user optimizes that into a single 8 byte store */ | ||
465 | static const struct { | ||
466 | u16 poplmovl; | ||
467 | u32 val; | ||
468 | u16 int80; | ||
469 | u16 pad; | ||
470 | } __attribute__((packed)) code = { | ||
471 | 0xb858, /* popl %eax ; movl $...,%eax */ | ||
472 | __NR_ia32_sigreturn, | ||
473 | 0x80cd, /* int $0x80 */ | ||
474 | 0, | ||
475 | }; | ||
476 | err |= __copy_to_user(frame->retcode, &code, 8); | ||
477 | } | ||
478 | if (err) | ||
479 | goto give_sigsegv; | ||
480 | |||
481 | /* Set up registers for signal handler */ | ||
482 | regs->rsp = (unsigned long) frame; | ||
483 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
484 | |||
485 | /* Make -mregparm=3 work */ | ||
486 | regs->rax = sig; | ||
487 | regs->rdx = 0; | ||
488 | regs->rcx = 0; | ||
489 | |||
490 | asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); | ||
491 | asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); | ||
492 | |||
493 | regs->cs = __USER32_CS; | ||
494 | regs->ss = __USER32_DS; | ||
495 | |||
496 | set_fs(USER_DS); | ||
497 | regs->eflags &= ~TF_MASK; | ||
498 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
499 | ptrace_notify(SIGTRAP); | ||
500 | |||
501 | #if DEBUG_SIG | ||
502 | printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", | ||
503 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
504 | #endif | ||
505 | |||
506 | return 0; | ||
507 | |||
508 | give_sigsegv: | ||
509 | force_sigsegv(sig, current); | ||
510 | return -EFAULT; | ||
511 | } | ||
512 | |||
513 | int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
514 | compat_sigset_t *set, struct pt_regs * regs) | ||
515 | { | ||
516 | struct rt_sigframe __user *frame; | ||
517 | int err = 0; | ||
518 | |||
519 | frame = get_sigframe(ka, regs, sizeof(*frame)); | ||
520 | |||
521 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
522 | goto give_sigsegv; | ||
523 | |||
524 | { | ||
525 | struct exec_domain *ed = current_thread_info()->exec_domain; | ||
526 | err |= __put_user((ed | ||
527 | && ed->signal_invmap | ||
528 | && sig < 32 | ||
529 | ? ed->signal_invmap[sig] | ||
530 | : sig), | ||
531 | &frame->sig); | ||
532 | } | ||
533 | err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); | ||
534 | err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); | ||
535 | err |= copy_siginfo_to_user32(&frame->info, info); | ||
536 | if (err) | ||
537 | goto give_sigsegv; | ||
538 | |||
539 | /* Create the ucontext. */ | ||
540 | err |= __put_user(0, &frame->uc.uc_flags); | ||
541 | err |= __put_user(0, &frame->uc.uc_link); | ||
542 | err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | ||
543 | err |= __put_user(sas_ss_flags(regs->rsp), | ||
544 | &frame->uc.uc_stack.ss_flags); | ||
545 | err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
546 | err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, | ||
547 | regs, set->sig[0]); | ||
548 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | ||
549 | if (err) | ||
550 | goto give_sigsegv; | ||
551 | |||
552 | |||
553 | { | ||
554 | void __user *restorer = VSYSCALL32_RTSIGRETURN; | ||
555 | if (ka->sa.sa_flags & SA_RESTORER) | ||
556 | restorer = ka->sa.sa_restorer; | ||
557 | err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); | ||
558 | } | ||
559 | |||
560 | /* This is movl $,%eax ; int $0x80 */ | ||
561 | /* Not actually used anymore, but left because some gdb versions | ||
562 | need it. */ | ||
563 | { | ||
564 | /* __copy_to_user optimizes that into a single 8 byte store */ | ||
565 | static const struct { | ||
566 | u8 movl; | ||
567 | u32 val; | ||
568 | u16 int80; | ||
569 | u16 pad; | ||
570 | u8 pad2; | ||
571 | } __attribute__((packed)) code = { | ||
572 | 0xb8, | ||
573 | __NR_ia32_rt_sigreturn, | ||
574 | 0x80cd, | ||
575 | 0, | ||
576 | }; | ||
577 | err |= __copy_to_user(frame->retcode, &code, 8); | ||
578 | } | ||
579 | if (err) | ||
580 | goto give_sigsegv; | ||
581 | |||
582 | /* Set up registers for signal handler */ | ||
583 | regs->rsp = (unsigned long) frame; | ||
584 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
585 | |||
586 | /* Make -mregparm=3 work */ | ||
587 | regs->rax = sig; | ||
588 | regs->rdx = (unsigned long) &frame->info; | ||
589 | regs->rcx = (unsigned long) &frame->uc; | ||
590 | |||
591 | /* Make -mregparm=3 work */ | ||
592 | regs->rax = sig; | ||
593 | regs->rdx = (unsigned long) &frame->info; | ||
594 | regs->rcx = (unsigned long) &frame->uc; | ||
595 | |||
596 | asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); | ||
597 | asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); | ||
598 | |||
599 | regs->cs = __USER32_CS; | ||
600 | regs->ss = __USER32_DS; | ||
601 | |||
602 | set_fs(USER_DS); | ||
603 | regs->eflags &= ~TF_MASK; | ||
604 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
605 | ptrace_notify(SIGTRAP); | ||
606 | |||
607 | #if DEBUG_SIG | ||
608 | printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", | ||
609 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
610 | #endif | ||
611 | |||
612 | return 0; | ||
613 | |||
614 | give_sigsegv: | ||
615 | force_sigsegv(sig, current); | ||
616 | return -EFAULT; | ||
617 | } | ||
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S deleted file mode 100644 index 18b231810908..000000000000 --- a/arch/x86_64/ia32/ia32entry.S +++ /dev/null | |||
@@ -1,736 +0,0 @@ | |||
1 | /* | ||
2 | * Compatibility mode system call entry point for x86-64. | ||
3 | * | ||
4 | * Copyright 2000-2002 Andi Kleen, SuSE Labs. | ||
5 | */ | ||
6 | |||
7 | #include <asm/dwarf2.h> | ||
8 | #include <asm/calling.h> | ||
9 | #include <asm/asm-offsets.h> | ||
10 | #include <asm/current.h> | ||
11 | #include <asm/errno.h> | ||
12 | #include <asm/ia32_unistd.h> | ||
13 | #include <asm/thread_info.h> | ||
14 | #include <asm/segment.h> | ||
15 | #include <asm/vsyscall32.h> | ||
16 | #include <asm/irqflags.h> | ||
17 | #include <linux/linkage.h> | ||
18 | |||
19 | #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) | ||
20 | |||
21 | .macro IA32_ARG_FIXUP noebp=0 | ||
22 | movl %edi,%r8d | ||
23 | .if \noebp | ||
24 | .else | ||
25 | movl %ebp,%r9d | ||
26 | .endif | ||
27 | xchg %ecx,%esi | ||
28 | movl %ebx,%edi | ||
29 | movl %edx,%edx /* zero extension */ | ||
30 | .endm | ||
31 | |||
32 | /* clobbers %eax */ | ||
33 | .macro CLEAR_RREGS | ||
34 | xorl %eax,%eax | ||
35 | movq %rax,R11(%rsp) | ||
36 | movq %rax,R10(%rsp) | ||
37 | movq %rax,R9(%rsp) | ||
38 | movq %rax,R8(%rsp) | ||
39 | .endm | ||
40 | |||
41 | .macro LOAD_ARGS32 offset | ||
42 | movl \offset(%rsp),%r11d | ||
43 | movl \offset+8(%rsp),%r10d | ||
44 | movl \offset+16(%rsp),%r9d | ||
45 | movl \offset+24(%rsp),%r8d | ||
46 | movl \offset+40(%rsp),%ecx | ||
47 | movl \offset+48(%rsp),%edx | ||
48 | movl \offset+56(%rsp),%esi | ||
49 | movl \offset+64(%rsp),%edi | ||
50 | movl \offset+72(%rsp),%eax | ||
51 | .endm | ||
52 | |||
53 | .macro CFI_STARTPROC32 simple | ||
54 | CFI_STARTPROC \simple | ||
55 | CFI_UNDEFINED r8 | ||
56 | CFI_UNDEFINED r9 | ||
57 | CFI_UNDEFINED r10 | ||
58 | CFI_UNDEFINED r11 | ||
59 | CFI_UNDEFINED r12 | ||
60 | CFI_UNDEFINED r13 | ||
61 | CFI_UNDEFINED r14 | ||
62 | CFI_UNDEFINED r15 | ||
63 | .endm | ||
64 | |||
65 | /* | ||
66 | * 32bit SYSENTER instruction entry. | ||
67 | * | ||
68 | * Arguments: | ||
69 | * %eax System call number. | ||
70 | * %ebx Arg1 | ||
71 | * %ecx Arg2 | ||
72 | * %edx Arg3 | ||
73 | * %esi Arg4 | ||
74 | * %edi Arg5 | ||
75 | * %ebp user stack | ||
76 | * 0(%ebp) Arg6 | ||
77 | * | ||
78 | * Interrupts off. | ||
79 | * | ||
80 | * This is purely a fast path. For anything complicated we use the int 0x80 | ||
81 | * path below. Set up a complete hardware stack frame to share code | ||
82 | * with the int 0x80 path. | ||
83 | */ | ||
84 | ENTRY(ia32_sysenter_target) | ||
85 | CFI_STARTPROC32 simple | ||
86 | CFI_SIGNAL_FRAME | ||
87 | CFI_DEF_CFA rsp,0 | ||
88 | CFI_REGISTER rsp,rbp | ||
89 | swapgs | ||
90 | movq %gs:pda_kernelstack, %rsp | ||
91 | addq $(PDA_STACKOFFSET),%rsp | ||
92 | /* | ||
93 | * No need to follow this irqs on/off section: the syscall | ||
94 | * disabled irqs, here we enable it straight after entry: | ||
95 | */ | ||
96 | sti | ||
97 | movl %ebp,%ebp /* zero extension */ | ||
98 | pushq $__USER32_DS | ||
99 | CFI_ADJUST_CFA_OFFSET 8 | ||
100 | /*CFI_REL_OFFSET ss,0*/ | ||
101 | pushq %rbp | ||
102 | CFI_ADJUST_CFA_OFFSET 8 | ||
103 | CFI_REL_OFFSET rsp,0 | ||
104 | pushfq | ||
105 | CFI_ADJUST_CFA_OFFSET 8 | ||
106 | /*CFI_REL_OFFSET rflags,0*/ | ||
107 | movl $VSYSCALL32_SYSEXIT, %r10d | ||
108 | CFI_REGISTER rip,r10 | ||
109 | pushq $__USER32_CS | ||
110 | CFI_ADJUST_CFA_OFFSET 8 | ||
111 | /*CFI_REL_OFFSET cs,0*/ | ||
112 | movl %eax, %eax | ||
113 | pushq %r10 | ||
114 | CFI_ADJUST_CFA_OFFSET 8 | ||
115 | CFI_REL_OFFSET rip,0 | ||
116 | pushq %rax | ||
117 | CFI_ADJUST_CFA_OFFSET 8 | ||
118 | cld | ||
119 | SAVE_ARGS 0,0,1 | ||
120 | /* no need to do an access_ok check here because rbp has been | ||
121 | 32bit zero extended */ | ||
122 | 1: movl (%rbp),%r9d | ||
123 | .section __ex_table,"a" | ||
124 | .quad 1b,ia32_badarg | ||
125 | .previous | ||
126 | GET_THREAD_INFO(%r10) | ||
127 | orl $TS_COMPAT,threadinfo_status(%r10) | ||
128 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | ||
129 | CFI_REMEMBER_STATE | ||
130 | jnz sysenter_tracesys | ||
131 | sysenter_do_call: | ||
132 | cmpl $(IA32_NR_syscalls-1),%eax | ||
133 | ja ia32_badsys | ||
134 | IA32_ARG_FIXUP 1 | ||
135 | call *ia32_sys_call_table(,%rax,8) | ||
136 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
137 | GET_THREAD_INFO(%r10) | ||
138 | cli | ||
139 | TRACE_IRQS_OFF | ||
140 | testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) | ||
141 | jnz int_ret_from_sys_call | ||
142 | andl $~TS_COMPAT,threadinfo_status(%r10) | ||
143 | /* clear IF, that popfq doesn't enable interrupts early */ | ||
144 | andl $~0x200,EFLAGS-R11(%rsp) | ||
145 | RESTORE_ARGS 1,24,1,1,1,1 | ||
146 | popfq | ||
147 | CFI_ADJUST_CFA_OFFSET -8 | ||
148 | /*CFI_RESTORE rflags*/ | ||
149 | popq %rcx /* User %esp */ | ||
150 | CFI_ADJUST_CFA_OFFSET -8 | ||
151 | CFI_REGISTER rsp,rcx | ||
152 | movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ | ||
153 | CFI_REGISTER rip,rdx | ||
154 | TRACE_IRQS_ON | ||
155 | swapgs | ||
156 | sti /* sti only takes effect after the next instruction */ | ||
157 | /* sysexit */ | ||
158 | .byte 0xf, 0x35 | ||
159 | |||
160 | sysenter_tracesys: | ||
161 | CFI_RESTORE_STATE | ||
162 | SAVE_REST | ||
163 | CLEAR_RREGS | ||
164 | movq $-ENOSYS,RAX(%rsp) /* really needed? */ | ||
165 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | ||
166 | call syscall_trace_enter | ||
167 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
168 | RESTORE_REST | ||
169 | movl %ebp, %ebp | ||
170 | /* no need to do an access_ok check here because rbp has been | ||
171 | 32bit zero extended */ | ||
172 | 1: movl (%rbp),%r9d | ||
173 | .section __ex_table,"a" | ||
174 | .quad 1b,ia32_badarg | ||
175 | .previous | ||
176 | jmp sysenter_do_call | ||
177 | CFI_ENDPROC | ||
178 | ENDPROC(ia32_sysenter_target) | ||
179 | |||
180 | /* | ||
181 | * 32bit SYSCALL instruction entry. | ||
182 | * | ||
183 | * Arguments: | ||
184 | * %eax System call number. | ||
185 | * %ebx Arg1 | ||
186 | * %ecx return EIP | ||
187 | * %edx Arg3 | ||
188 | * %esi Arg4 | ||
189 | * %edi Arg5 | ||
190 | * %ebp Arg2 [note: not saved in the stack frame, should not be touched] | ||
191 | * %esp user stack | ||
192 | * 0(%esp) Arg6 | ||
193 | * | ||
194 | * Interrupts off. | ||
195 | * | ||
196 | * This is purely a fast path. For anything complicated we use the int 0x80 | ||
197 | * path below. Set up a complete hardware stack frame to share code | ||
198 | * with the int 0x80 path. | ||
199 | */ | ||
200 | ENTRY(ia32_cstar_target) | ||
201 | CFI_STARTPROC32 simple | ||
202 | CFI_SIGNAL_FRAME | ||
203 | CFI_DEF_CFA rsp,PDA_STACKOFFSET | ||
204 | CFI_REGISTER rip,rcx | ||
205 | /*CFI_REGISTER rflags,r11*/ | ||
206 | swapgs | ||
207 | movl %esp,%r8d | ||
208 | CFI_REGISTER rsp,r8 | ||
209 | movq %gs:pda_kernelstack,%rsp | ||
210 | /* | ||
211 | * No need to follow this irqs on/off section: the syscall | ||
212 | * disabled irqs and here we enable it straight after entry: | ||
213 | */ | ||
214 | sti | ||
215 | SAVE_ARGS 8,1,1 | ||
216 | movl %eax,%eax /* zero extension */ | ||
217 | movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | ||
218 | movq %rcx,RIP-ARGOFFSET(%rsp) | ||
219 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | ||
220 | movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ | ||
221 | movl %ebp,%ecx | ||
222 | movq $__USER32_CS,CS-ARGOFFSET(%rsp) | ||
223 | movq $__USER32_DS,SS-ARGOFFSET(%rsp) | ||
224 | movq %r11,EFLAGS-ARGOFFSET(%rsp) | ||
225 | /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ | ||
226 | movq %r8,RSP-ARGOFFSET(%rsp) | ||
227 | CFI_REL_OFFSET rsp,RSP-ARGOFFSET | ||
228 | /* no need to do an access_ok check here because r8 has been | ||
229 | 32bit zero extended */ | ||
230 | /* hardware stack frame is complete now */ | ||
231 | 1: movl (%r8),%r9d | ||
232 | .section __ex_table,"a" | ||
233 | .quad 1b,ia32_badarg | ||
234 | .previous | ||
235 | GET_THREAD_INFO(%r10) | ||
236 | orl $TS_COMPAT,threadinfo_status(%r10) | ||
237 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | ||
238 | CFI_REMEMBER_STATE | ||
239 | jnz cstar_tracesys | ||
240 | cstar_do_call: | ||
241 | cmpl $IA32_NR_syscalls-1,%eax | ||
242 | ja ia32_badsys | ||
243 | IA32_ARG_FIXUP 1 | ||
244 | call *ia32_sys_call_table(,%rax,8) | ||
245 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
246 | GET_THREAD_INFO(%r10) | ||
247 | cli | ||
248 | TRACE_IRQS_OFF | ||
249 | testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) | ||
250 | jnz int_ret_from_sys_call | ||
251 | andl $~TS_COMPAT,threadinfo_status(%r10) | ||
252 | RESTORE_ARGS 1,-ARG_SKIP,1,1,1 | ||
253 | movl RIP-ARGOFFSET(%rsp),%ecx | ||
254 | CFI_REGISTER rip,rcx | ||
255 | movl EFLAGS-ARGOFFSET(%rsp),%r11d | ||
256 | /*CFI_REGISTER rflags,r11*/ | ||
257 | TRACE_IRQS_ON | ||
258 | movl RSP-ARGOFFSET(%rsp),%esp | ||
259 | CFI_RESTORE rsp | ||
260 | swapgs | ||
261 | sysretl | ||
262 | |||
263 | cstar_tracesys: | ||
264 | CFI_RESTORE_STATE | ||
265 | SAVE_REST | ||
266 | CLEAR_RREGS | ||
267 | movq $-ENOSYS,RAX(%rsp) /* really needed? */ | ||
268 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | ||
269 | call syscall_trace_enter | ||
270 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
271 | RESTORE_REST | ||
272 | movl RSP-ARGOFFSET(%rsp), %r8d | ||
273 | /* no need to do an access_ok check here because r8 has been | ||
274 | 32bit zero extended */ | ||
275 | 1: movl (%r8),%r9d | ||
276 | .section __ex_table,"a" | ||
277 | .quad 1b,ia32_badarg | ||
278 | .previous | ||
279 | jmp cstar_do_call | ||
280 | END(ia32_cstar_target) | ||
281 | |||
282 | ia32_badarg: | ||
283 | movq $-EFAULT,%rax | ||
284 | jmp ia32_sysret | ||
285 | CFI_ENDPROC | ||
286 | |||
287 | /* | ||
288 | * Emulated IA32 system calls via int 0x80. | ||
289 | * | ||
290 | * Arguments: | ||
291 | * %eax System call number. | ||
292 | * %ebx Arg1 | ||
293 | * %ecx Arg2 | ||
294 | * %edx Arg3 | ||
295 | * %esi Arg4 | ||
296 | * %edi Arg5 | ||
297 | * %ebp Arg6 [note: not saved in the stack frame, should not be touched] | ||
298 | * | ||
299 | * Notes: | ||
300 | * Uses the same stack frame as the x86-64 version. | ||
301 | * All registers except %eax must be saved (but ptrace may violate that) | ||
302 | * Arguments are zero extended. For system calls that want sign extension and | ||
303 | * take long arguments a wrapper is needed. Most calls can just be called | ||
304 | * directly. | ||
305 | * Assumes it is only called from user space and entered with interrupts off. | ||
306 | */ | ||
307 | |||
308 | ENTRY(ia32_syscall) | ||
309 | CFI_STARTPROC32 simple | ||
310 | CFI_SIGNAL_FRAME | ||
311 | CFI_DEF_CFA rsp,SS+8-RIP | ||
312 | /*CFI_REL_OFFSET ss,SS-RIP*/ | ||
313 | CFI_REL_OFFSET rsp,RSP-RIP | ||
314 | /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ | ||
315 | /*CFI_REL_OFFSET cs,CS-RIP*/ | ||
316 | CFI_REL_OFFSET rip,RIP-RIP | ||
317 | swapgs | ||
318 | /* | ||
319 | * No need to follow this irqs on/off section: the syscall | ||
320 | * disabled irqs and here we enable it straight after entry: | ||
321 | */ | ||
322 | sti | ||
323 | movl %eax,%eax | ||
324 | pushq %rax | ||
325 | CFI_ADJUST_CFA_OFFSET 8 | ||
326 | cld | ||
327 | /* note the registers are not zero extended to the sf. | ||
328 | this could be a problem. */ | ||
329 | SAVE_ARGS 0,0,1 | ||
330 | GET_THREAD_INFO(%r10) | ||
331 | orl $TS_COMPAT,threadinfo_status(%r10) | ||
332 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) | ||
333 | jnz ia32_tracesys | ||
334 | ia32_do_syscall: | ||
335 | cmpl $(IA32_NR_syscalls-1),%eax | ||
336 | ja ia32_badsys | ||
337 | IA32_ARG_FIXUP | ||
338 | call *ia32_sys_call_table(,%rax,8) # xxx: rip relative | ||
339 | ia32_sysret: | ||
340 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
341 | jmp int_ret_from_sys_call | ||
342 | |||
343 | ia32_tracesys: | ||
344 | SAVE_REST | ||
345 | CLEAR_RREGS | ||
346 | movq $-ENOSYS,RAX(%rsp) /* really needed? */ | ||
347 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | ||
348 | call syscall_trace_enter | ||
349 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
350 | RESTORE_REST | ||
351 | jmp ia32_do_syscall | ||
352 | END(ia32_syscall) | ||
353 | |||
354 | ia32_badsys: | ||
355 | movq $0,ORIG_RAX-ARGOFFSET(%rsp) | ||
356 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | ||
357 | jmp int_ret_from_sys_call | ||
358 | |||
359 | quiet_ni_syscall: | ||
360 | movq $-ENOSYS,%rax | ||
361 | ret | ||
362 | CFI_ENDPROC | ||
363 | |||
364 | .macro PTREGSCALL label, func, arg | ||
365 | .globl \label | ||
366 | \label: | ||
367 | leaq \func(%rip),%rax | ||
368 | leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | ||
369 | jmp ia32_ptregs_common | ||
370 | .endm | ||
371 | |||
372 | CFI_STARTPROC32 | ||
373 | |||
374 | PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi | ||
375 | PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi | ||
376 | PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx | ||
377 | PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx | ||
378 | PTREGSCALL stub32_execve, sys32_execve, %rcx | ||
379 | PTREGSCALL stub32_fork, sys_fork, %rdi | ||
380 | PTREGSCALL stub32_clone, sys32_clone, %rdx | ||
381 | PTREGSCALL stub32_vfork, sys_vfork, %rdi | ||
382 | PTREGSCALL stub32_iopl, sys_iopl, %rsi | ||
383 | PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx | ||
384 | |||
385 | ENTRY(ia32_ptregs_common) | ||
386 | popq %r11 | ||
387 | CFI_ENDPROC | ||
388 | CFI_STARTPROC32 simple | ||
389 | CFI_SIGNAL_FRAME | ||
390 | CFI_DEF_CFA rsp,SS+8-ARGOFFSET | ||
391 | CFI_REL_OFFSET rax,RAX-ARGOFFSET | ||
392 | CFI_REL_OFFSET rcx,RCX-ARGOFFSET | ||
393 | CFI_REL_OFFSET rdx,RDX-ARGOFFSET | ||
394 | CFI_REL_OFFSET rsi,RSI-ARGOFFSET | ||
395 | CFI_REL_OFFSET rdi,RDI-ARGOFFSET | ||
396 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | ||
397 | /* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ | ||
398 | /* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ | ||
399 | CFI_REL_OFFSET rsp,RSP-ARGOFFSET | ||
400 | /* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ | ||
401 | SAVE_REST | ||
402 | call *%rax | ||
403 | RESTORE_REST | ||
404 | jmp ia32_sysret /* misbalances the return cache */ | ||
405 | CFI_ENDPROC | ||
406 | END(ia32_ptregs_common) | ||
407 | |||
408 | .section .rodata,"a" | ||
409 | .align 8 | ||
410 | ia32_sys_call_table: | ||
411 | .quad sys_restart_syscall | ||
412 | .quad sys_exit | ||
413 | .quad stub32_fork | ||
414 | .quad sys_read | ||
415 | .quad sys_write | ||
416 | .quad compat_sys_open /* 5 */ | ||
417 | .quad sys_close | ||
418 | .quad sys32_waitpid | ||
419 | .quad sys_creat | ||
420 | .quad sys_link | ||
421 | .quad sys_unlink /* 10 */ | ||
422 | .quad stub32_execve | ||
423 | .quad sys_chdir | ||
424 | .quad compat_sys_time | ||
425 | .quad sys_mknod | ||
426 | .quad sys_chmod /* 15 */ | ||
427 | .quad sys_lchown16 | ||
428 | .quad quiet_ni_syscall /* old break syscall holder */ | ||
429 | .quad sys_stat | ||
430 | .quad sys32_lseek | ||
431 | .quad sys_getpid /* 20 */ | ||
432 | .quad compat_sys_mount /* mount */ | ||
433 | .quad sys_oldumount /* old_umount */ | ||
434 | .quad sys_setuid16 | ||
435 | .quad sys_getuid16 | ||
436 | .quad compat_sys_stime /* stime */ /* 25 */ | ||
437 | .quad sys32_ptrace /* ptrace */ | ||
438 | .quad sys_alarm | ||
439 | .quad sys_fstat /* (old)fstat */ | ||
440 | .quad sys_pause | ||
441 | .quad compat_sys_utime /* 30 */ | ||
442 | .quad quiet_ni_syscall /* old stty syscall holder */ | ||
443 | .quad quiet_ni_syscall /* old gtty syscall holder */ | ||
444 | .quad sys_access | ||
445 | .quad sys_nice | ||
446 | .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ | ||
447 | .quad sys_sync | ||
448 | .quad sys32_kill | ||
449 | .quad sys_rename | ||
450 | .quad sys_mkdir | ||
451 | .quad sys_rmdir /* 40 */ | ||
452 | .quad sys_dup | ||
453 | .quad sys32_pipe | ||
454 | .quad compat_sys_times | ||
455 | .quad quiet_ni_syscall /* old prof syscall holder */ | ||
456 | .quad sys_brk /* 45 */ | ||
457 | .quad sys_setgid16 | ||
458 | .quad sys_getgid16 | ||
459 | .quad sys_signal | ||
460 | .quad sys_geteuid16 | ||
461 | .quad sys_getegid16 /* 50 */ | ||
462 | .quad sys_acct | ||
463 | .quad sys_umount /* new_umount */ | ||
464 | .quad quiet_ni_syscall /* old lock syscall holder */ | ||
465 | .quad compat_sys_ioctl | ||
466 | .quad compat_sys_fcntl64 /* 55 */ | ||
467 | .quad quiet_ni_syscall /* old mpx syscall holder */ | ||
468 | .quad sys_setpgid | ||
469 | .quad quiet_ni_syscall /* old ulimit syscall holder */ | ||
470 | .quad sys32_olduname | ||
471 | .quad sys_umask /* 60 */ | ||
472 | .quad sys_chroot | ||
473 | .quad sys32_ustat | ||
474 | .quad sys_dup2 | ||
475 | .quad sys_getppid | ||
476 | .quad sys_getpgrp /* 65 */ | ||
477 | .quad sys_setsid | ||
478 | .quad sys32_sigaction | ||
479 | .quad sys_sgetmask | ||
480 | .quad sys_ssetmask | ||
481 | .quad sys_setreuid16 /* 70 */ | ||
482 | .quad sys_setregid16 | ||
483 | .quad stub32_sigsuspend | ||
484 | .quad compat_sys_sigpending | ||
485 | .quad sys_sethostname | ||
486 | .quad compat_sys_setrlimit /* 75 */ | ||
487 | .quad compat_sys_old_getrlimit /* old_getrlimit */ | ||
488 | .quad compat_sys_getrusage | ||
489 | .quad sys32_gettimeofday | ||
490 | .quad sys32_settimeofday | ||
491 | .quad sys_getgroups16 /* 80 */ | ||
492 | .quad sys_setgroups16 | ||
493 | .quad sys32_old_select | ||
494 | .quad sys_symlink | ||
495 | .quad sys_lstat | ||
496 | .quad sys_readlink /* 85 */ | ||
497 | .quad sys_uselib | ||
498 | .quad sys_swapon | ||
499 | .quad sys_reboot | ||
500 | .quad compat_sys_old_readdir | ||
501 | .quad sys32_mmap /* 90 */ | ||
502 | .quad sys_munmap | ||
503 | .quad sys_truncate | ||
504 | .quad sys_ftruncate | ||
505 | .quad sys_fchmod | ||
506 | .quad sys_fchown16 /* 95 */ | ||
507 | .quad sys_getpriority | ||
508 | .quad sys_setpriority | ||
509 | .quad quiet_ni_syscall /* old profil syscall holder */ | ||
510 | .quad compat_sys_statfs | ||
511 | .quad compat_sys_fstatfs /* 100 */ | ||
512 | .quad sys_ioperm | ||
513 | .quad compat_sys_socketcall | ||
514 | .quad sys_syslog | ||
515 | .quad compat_sys_setitimer | ||
516 | .quad compat_sys_getitimer /* 105 */ | ||
517 | .quad compat_sys_newstat | ||
518 | .quad compat_sys_newlstat | ||
519 | .quad compat_sys_newfstat | ||
520 | .quad sys32_uname | ||
521 | .quad stub32_iopl /* 110 */ | ||
522 | .quad sys_vhangup | ||
523 | .quad quiet_ni_syscall /* old "idle" system call */ | ||
524 | .quad sys32_vm86_warning /* vm86old */ | ||
525 | .quad compat_sys_wait4 | ||
526 | .quad sys_swapoff /* 115 */ | ||
527 | .quad compat_sys_sysinfo | ||
528 | .quad sys32_ipc | ||
529 | .quad sys_fsync | ||
530 | .quad stub32_sigreturn | ||
531 | .quad stub32_clone /* 120 */ | ||
532 | .quad sys_setdomainname | ||
533 | .quad sys_uname | ||
534 | .quad sys_modify_ldt | ||
535 | .quad compat_sys_adjtimex | ||
536 | .quad sys32_mprotect /* 125 */ | ||
537 | .quad compat_sys_sigprocmask | ||
538 | .quad quiet_ni_syscall /* create_module */ | ||
539 | .quad sys_init_module | ||
540 | .quad sys_delete_module | ||
541 | .quad quiet_ni_syscall /* 130 get_kernel_syms */ | ||
542 | .quad sys32_quotactl | ||
543 | .quad sys_getpgid | ||
544 | .quad sys_fchdir | ||
545 | .quad quiet_ni_syscall /* bdflush */ | ||
546 | .quad sys_sysfs /* 135 */ | ||
547 | .quad sys_personality | ||
548 | .quad quiet_ni_syscall /* for afs_syscall */ | ||
549 | .quad sys_setfsuid16 | ||
550 | .quad sys_setfsgid16 | ||
551 | .quad sys_llseek /* 140 */ | ||
552 | .quad compat_sys_getdents | ||
553 | .quad compat_sys_select | ||
554 | .quad sys_flock | ||
555 | .quad sys_msync | ||
556 | .quad compat_sys_readv /* 145 */ | ||
557 | .quad compat_sys_writev | ||
558 | .quad sys_getsid | ||
559 | .quad sys_fdatasync | ||
560 | .quad sys32_sysctl /* sysctl */ | ||
561 | .quad sys_mlock /* 150 */ | ||
562 | .quad sys_munlock | ||
563 | .quad sys_mlockall | ||
564 | .quad sys_munlockall | ||
565 | .quad sys_sched_setparam | ||
566 | .quad sys_sched_getparam /* 155 */ | ||
567 | .quad sys_sched_setscheduler | ||
568 | .quad sys_sched_getscheduler | ||
569 | .quad sys_sched_yield | ||
570 | .quad sys_sched_get_priority_max | ||
571 | .quad sys_sched_get_priority_min /* 160 */ | ||
572 | .quad sys32_sched_rr_get_interval | ||
573 | .quad compat_sys_nanosleep | ||
574 | .quad sys_mremap | ||
575 | .quad sys_setresuid16 | ||
576 | .quad sys_getresuid16 /* 165 */ | ||
577 | .quad sys32_vm86_warning /* vm86 */ | ||
578 | .quad quiet_ni_syscall /* query_module */ | ||
579 | .quad sys_poll | ||
580 | .quad compat_sys_nfsservctl | ||
581 | .quad sys_setresgid16 /* 170 */ | ||
582 | .quad sys_getresgid16 | ||
583 | .quad sys_prctl | ||
584 | .quad stub32_rt_sigreturn | ||
585 | .quad sys32_rt_sigaction | ||
586 | .quad sys32_rt_sigprocmask /* 175 */ | ||
587 | .quad sys32_rt_sigpending | ||
588 | .quad compat_sys_rt_sigtimedwait | ||
589 | .quad sys32_rt_sigqueueinfo | ||
590 | .quad stub32_rt_sigsuspend | ||
591 | .quad sys32_pread /* 180 */ | ||
592 | .quad sys32_pwrite | ||
593 | .quad sys_chown16 | ||
594 | .quad sys_getcwd | ||
595 | .quad sys_capget | ||
596 | .quad sys_capset | ||
597 | .quad stub32_sigaltstack | ||
598 | .quad sys32_sendfile | ||
599 | .quad quiet_ni_syscall /* streams1 */ | ||
600 | .quad quiet_ni_syscall /* streams2 */ | ||
601 | .quad stub32_vfork /* 190 */ | ||
602 | .quad compat_sys_getrlimit | ||
603 | .quad sys32_mmap2 | ||
604 | .quad sys32_truncate64 | ||
605 | .quad sys32_ftruncate64 | ||
606 | .quad sys32_stat64 /* 195 */ | ||
607 | .quad sys32_lstat64 | ||
608 | .quad sys32_fstat64 | ||
609 | .quad sys_lchown | ||
610 | .quad sys_getuid | ||
611 | .quad sys_getgid /* 200 */ | ||
612 | .quad sys_geteuid | ||
613 | .quad sys_getegid | ||
614 | .quad sys_setreuid | ||
615 | .quad sys_setregid | ||
616 | .quad sys_getgroups /* 205 */ | ||
617 | .quad sys_setgroups | ||
618 | .quad sys_fchown | ||
619 | .quad sys_setresuid | ||
620 | .quad sys_getresuid | ||
621 | .quad sys_setresgid /* 210 */ | ||
622 | .quad sys_getresgid | ||
623 | .quad sys_chown | ||
624 | .quad sys_setuid | ||
625 | .quad sys_setgid | ||
626 | .quad sys_setfsuid /* 215 */ | ||
627 | .quad sys_setfsgid | ||
628 | .quad sys_pivot_root | ||
629 | .quad sys_mincore | ||
630 | .quad sys_madvise | ||
631 | .quad compat_sys_getdents64 /* 220 getdents64 */ | ||
632 | .quad compat_sys_fcntl64 | ||
633 | .quad quiet_ni_syscall /* tux */ | ||
634 | .quad quiet_ni_syscall /* security */ | ||
635 | .quad sys_gettid | ||
636 | .quad sys32_readahead /* 225 */ | ||
637 | .quad sys_setxattr | ||
638 | .quad sys_lsetxattr | ||
639 | .quad sys_fsetxattr | ||
640 | .quad sys_getxattr | ||
641 | .quad sys_lgetxattr /* 230 */ | ||
642 | .quad sys_fgetxattr | ||
643 | .quad sys_listxattr | ||
644 | .quad sys_llistxattr | ||
645 | .quad sys_flistxattr | ||
646 | .quad sys_removexattr /* 235 */ | ||
647 | .quad sys_lremovexattr | ||
648 | .quad sys_fremovexattr | ||
649 | .quad sys_tkill | ||
650 | .quad sys_sendfile64 | ||
651 | .quad compat_sys_futex /* 240 */ | ||
652 | .quad compat_sys_sched_setaffinity | ||
653 | .quad compat_sys_sched_getaffinity | ||
654 | .quad sys32_set_thread_area | ||
655 | .quad sys32_get_thread_area | ||
656 | .quad compat_sys_io_setup /* 245 */ | ||
657 | .quad sys_io_destroy | ||
658 | .quad compat_sys_io_getevents | ||
659 | .quad compat_sys_io_submit | ||
660 | .quad sys_io_cancel | ||
661 | .quad sys32_fadvise64 /* 250 */ | ||
662 | .quad quiet_ni_syscall /* free_huge_pages */ | ||
663 | .quad sys_exit_group | ||
664 | .quad sys32_lookup_dcookie | ||
665 | .quad sys_epoll_create | ||
666 | .quad sys_epoll_ctl /* 255 */ | ||
667 | .quad sys_epoll_wait | ||
668 | .quad sys_remap_file_pages | ||
669 | .quad sys_set_tid_address | ||
670 | .quad compat_sys_timer_create | ||
671 | .quad compat_sys_timer_settime /* 260 */ | ||
672 | .quad compat_sys_timer_gettime | ||
673 | .quad sys_timer_getoverrun | ||
674 | .quad sys_timer_delete | ||
675 | .quad compat_sys_clock_settime | ||
676 | .quad compat_sys_clock_gettime /* 265 */ | ||
677 | .quad compat_sys_clock_getres | ||
678 | .quad compat_sys_clock_nanosleep | ||
679 | .quad compat_sys_statfs64 | ||
680 | .quad compat_sys_fstatfs64 | ||
681 | .quad sys_tgkill /* 270 */ | ||
682 | .quad compat_sys_utimes | ||
683 | .quad sys32_fadvise64_64 | ||
684 | .quad quiet_ni_syscall /* sys_vserver */ | ||
685 | .quad sys_mbind | ||
686 | .quad compat_sys_get_mempolicy /* 275 */ | ||
687 | .quad sys_set_mempolicy | ||
688 | .quad compat_sys_mq_open | ||
689 | .quad sys_mq_unlink | ||
690 | .quad compat_sys_mq_timedsend | ||
691 | .quad compat_sys_mq_timedreceive /* 280 */ | ||
692 | .quad compat_sys_mq_notify | ||
693 | .quad compat_sys_mq_getsetattr | ||
694 | .quad compat_sys_kexec_load /* reserved for kexec */ | ||
695 | .quad compat_sys_waitid | ||
696 | .quad quiet_ni_syscall /* 285: sys_altroot */ | ||
697 | .quad sys_add_key | ||
698 | .quad sys_request_key | ||
699 | .quad sys_keyctl | ||
700 | .quad sys_ioprio_set | ||
701 | .quad sys_ioprio_get /* 290 */ | ||
702 | .quad sys_inotify_init | ||
703 | .quad sys_inotify_add_watch | ||
704 | .quad sys_inotify_rm_watch | ||
705 | .quad sys_migrate_pages | ||
706 | .quad compat_sys_openat /* 295 */ | ||
707 | .quad sys_mkdirat | ||
708 | .quad sys_mknodat | ||
709 | .quad sys_fchownat | ||
710 | .quad compat_sys_futimesat | ||
711 | .quad sys32_fstatat /* 300 */ | ||
712 | .quad sys_unlinkat | ||
713 | .quad sys_renameat | ||
714 | .quad sys_linkat | ||
715 | .quad sys_symlinkat | ||
716 | .quad sys_readlinkat /* 305 */ | ||
717 | .quad sys_fchmodat | ||
718 | .quad sys_faccessat | ||
719 | .quad compat_sys_pselect6 | ||
720 | .quad compat_sys_ppoll | ||
721 | .quad sys_unshare /* 310 */ | ||
722 | .quad compat_sys_set_robust_list | ||
723 | .quad compat_sys_get_robust_list | ||
724 | .quad sys_splice | ||
725 | .quad sys32_sync_file_range | ||
726 | .quad sys_tee /* 315 */ | ||
727 | .quad compat_sys_vmsplice | ||
728 | .quad compat_sys_move_pages | ||
729 | .quad sys_getcpu | ||
730 | .quad sys_epoll_pwait | ||
731 | .quad compat_sys_utimensat /* 320 */ | ||
732 | .quad compat_sys_signalfd | ||
733 | .quad compat_sys_timerfd | ||
734 | .quad sys_eventfd | ||
735 | .quad sys32_fallocate | ||
736 | ia32_syscall_end: | ||
diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c deleted file mode 100644 index 369151dc3213..000000000000 --- a/arch/x86_64/ia32/ipc32.c +++ /dev/null | |||
@@ -1,57 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/spinlock.h> | ||
3 | #include <linux/list.h> | ||
4 | #include <linux/syscalls.h> | ||
5 | #include <linux/time.h> | ||
6 | #include <linux/sem.h> | ||
7 | #include <linux/msg.h> | ||
8 | #include <linux/shm.h> | ||
9 | #include <linux/ipc.h> | ||
10 | #include <linux/compat.h> | ||
11 | |||
12 | #include <asm-i386/ipc.h> | ||
13 | |||
14 | asmlinkage long | ||
15 | sys32_ipc(u32 call, int first, int second, int third, | ||
16 | compat_uptr_t ptr, u32 fifth) | ||
17 | { | ||
18 | int version; | ||
19 | |||
20 | version = call >> 16; /* hack for backward compatibility */ | ||
21 | call &= 0xffff; | ||
22 | |||
23 | switch (call) { | ||
24 | case SEMOP: | ||
25 | /* struct sembuf is the same on 32 and 64bit :)) */ | ||
26 | return sys_semtimedop(first, compat_ptr(ptr), second, NULL); | ||
27 | case SEMTIMEDOP: | ||
28 | return compat_sys_semtimedop(first, compat_ptr(ptr), second, | ||
29 | compat_ptr(fifth)); | ||
30 | case SEMGET: | ||
31 | return sys_semget(first, second, third); | ||
32 | case SEMCTL: | ||
33 | return compat_sys_semctl(first, second, third, compat_ptr(ptr)); | ||
34 | |||
35 | case MSGSND: | ||
36 | return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); | ||
37 | case MSGRCV: | ||
38 | return compat_sys_msgrcv(first, second, fifth, third, | ||
39 | version, compat_ptr(ptr)); | ||
40 | case MSGGET: | ||
41 | return sys_msgget((key_t) first, second); | ||
42 | case MSGCTL: | ||
43 | return compat_sys_msgctl(first, second, compat_ptr(ptr)); | ||
44 | |||
45 | case SHMAT: | ||
46 | return compat_sys_shmat(first, second, third, version, | ||
47 | compat_ptr(ptr)); | ||
48 | break; | ||
49 | case SHMDT: | ||
50 | return sys_shmdt(compat_ptr(ptr)); | ||
51 | case SHMGET: | ||
52 | return sys_shmget(first, (unsigned)second, third); | ||
53 | case SHMCTL: | ||
54 | return compat_sys_shmctl(first, second, compat_ptr(ptr)); | ||
55 | } | ||
56 | return -ENOSYS; | ||
57 | } | ||
diff --git a/arch/x86_64/ia32/mmap32.c b/arch/x86_64/ia32/mmap32.c deleted file mode 100644 index e4b84b4a417a..000000000000 --- a/arch/x86_64/ia32/mmap32.c +++ /dev/null | |||
@@ -1,79 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/ia32/mm/mmap.c | ||
3 | * | ||
4 | * flexible mmap layout support | ||
5 | * | ||
6 | * Based on the i386 version which was | ||
7 | * | ||
8 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | ||
9 | * All Rights Reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to the Free Software | ||
23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
24 | * | ||
25 | * | ||
26 | * Started by Ingo Molnar <mingo@elte.hu> | ||
27 | */ | ||
28 | |||
29 | #include <linux/personality.h> | ||
30 | #include <linux/mm.h> | ||
31 | #include <linux/random.h> | ||
32 | #include <linux/sched.h> | ||
33 | |||
34 | /* | ||
35 | * Top of mmap area (just below the process stack). | ||
36 | * | ||
37 | * Leave an at least ~128 MB hole. | ||
38 | */ | ||
39 | #define MIN_GAP (128*1024*1024) | ||
40 | #define MAX_GAP (TASK_SIZE/6*5) | ||
41 | |||
42 | static inline unsigned long mmap_base(struct mm_struct *mm) | ||
43 | { | ||
44 | unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; | ||
45 | unsigned long random_factor = 0; | ||
46 | |||
47 | if (current->flags & PF_RANDOMIZE) | ||
48 | random_factor = get_random_int() % (1024*1024); | ||
49 | |||
50 | if (gap < MIN_GAP) | ||
51 | gap = MIN_GAP; | ||
52 | else if (gap > MAX_GAP) | ||
53 | gap = MAX_GAP; | ||
54 | |||
55 | return PAGE_ALIGN(TASK_SIZE - gap - random_factor); | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * This function, called very early during the creation of a new | ||
60 | * process VM image, sets up which VM layout function to use: | ||
61 | */ | ||
62 | void ia32_pick_mmap_layout(struct mm_struct *mm) | ||
63 | { | ||
64 | /* | ||
65 | * Fall back to the standard layout if the personality | ||
66 | * bit is set, or if the expected stack growth is unlimited: | ||
67 | */ | ||
68 | if (sysctl_legacy_va_layout || | ||
69 | (current->personality & ADDR_COMPAT_LAYOUT) || | ||
70 | current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { | ||
71 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
72 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
73 | mm->unmap_area = arch_unmap_area; | ||
74 | } else { | ||
75 | mm->mmap_base = mmap_base(mm); | ||
76 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
77 | mm->unmap_area = arch_unmap_area_topdown; | ||
78 | } | ||
79 | } | ||
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c deleted file mode 100644 index 4a233ad6269c..000000000000 --- a/arch/x86_64/ia32/ptrace32.c +++ /dev/null | |||
@@ -1,404 +0,0 @@ | |||
1 | /* | ||
2 | * 32bit ptrace for x86-64. | ||
3 | * | ||
4 | * Copyright 2001,2002 Andi Kleen, SuSE Labs. | ||
5 | * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier | ||
6 | * copyright. | ||
7 | * | ||
8 | * This allows to access 64bit processes too; but there is no way to see the extended | ||
9 | * register contents. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/stddef.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/syscalls.h> | ||
16 | #include <linux/unistd.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/err.h> | ||
19 | #include <linux/ptrace.h> | ||
20 | #include <asm/ptrace.h> | ||
21 | #include <asm/compat.h> | ||
22 | #include <asm/uaccess.h> | ||
23 | #include <asm/user32.h> | ||
24 | #include <asm/user.h> | ||
25 | #include <asm/errno.h> | ||
26 | #include <asm/debugreg.h> | ||
27 | #include <asm/i387.h> | ||
28 | #include <asm/fpu32.h> | ||
29 | #include <asm/ia32.h> | ||
30 | |||
31 | /* | ||
32 | * Determines which flags the user has access to [1 = access, 0 = no access]. | ||
33 | * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). | ||
34 | * Also masks reserved bits (31-22, 15, 5, 3, 1). | ||
35 | */ | ||
36 | #define FLAG_MASK 0x54dd5UL | ||
37 | |||
38 | #define R32(l,q) \ | ||
39 | case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break | ||
40 | |||
41 | static int putreg32(struct task_struct *child, unsigned regno, u32 val) | ||
42 | { | ||
43 | int i; | ||
44 | __u64 *stack = (__u64 *)task_pt_regs(child); | ||
45 | |||
46 | switch (regno) { | ||
47 | case offsetof(struct user32, regs.fs): | ||
48 | if (val && (val & 3) != 3) return -EIO; | ||
49 | child->thread.fsindex = val & 0xffff; | ||
50 | break; | ||
51 | case offsetof(struct user32, regs.gs): | ||
52 | if (val && (val & 3) != 3) return -EIO; | ||
53 | child->thread.gsindex = val & 0xffff; | ||
54 | break; | ||
55 | case offsetof(struct user32, regs.ds): | ||
56 | if (val && (val & 3) != 3) return -EIO; | ||
57 | child->thread.ds = val & 0xffff; | ||
58 | break; | ||
59 | case offsetof(struct user32, regs.es): | ||
60 | child->thread.es = val & 0xffff; | ||
61 | break; | ||
62 | case offsetof(struct user32, regs.ss): | ||
63 | if ((val & 3) != 3) return -EIO; | ||
64 | stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff; | ||
65 | break; | ||
66 | case offsetof(struct user32, regs.cs): | ||
67 | if ((val & 3) != 3) return -EIO; | ||
68 | stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff; | ||
69 | break; | ||
70 | |||
71 | R32(ebx, rbx); | ||
72 | R32(ecx, rcx); | ||
73 | R32(edx, rdx); | ||
74 | R32(edi, rdi); | ||
75 | R32(esi, rsi); | ||
76 | R32(ebp, rbp); | ||
77 | R32(eax, rax); | ||
78 | R32(orig_eax, orig_rax); | ||
79 | R32(eip, rip); | ||
80 | R32(esp, rsp); | ||
81 | |||
82 | case offsetof(struct user32, regs.eflags): { | ||
83 | __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; | ||
84 | val &= FLAG_MASK; | ||
85 | *flags = val | (*flags & ~FLAG_MASK); | ||
86 | break; | ||
87 | } | ||
88 | |||
89 | case offsetof(struct user32, u_debugreg[4]): | ||
90 | case offsetof(struct user32, u_debugreg[5]): | ||
91 | return -EIO; | ||
92 | |||
93 | case offsetof(struct user32, u_debugreg[0]): | ||
94 | child->thread.debugreg0 = val; | ||
95 | break; | ||
96 | |||
97 | case offsetof(struct user32, u_debugreg[1]): | ||
98 | child->thread.debugreg1 = val; | ||
99 | break; | ||
100 | |||
101 | case offsetof(struct user32, u_debugreg[2]): | ||
102 | child->thread.debugreg2 = val; | ||
103 | break; | ||
104 | |||
105 | case offsetof(struct user32, u_debugreg[3]): | ||
106 | child->thread.debugreg3 = val; | ||
107 | break; | ||
108 | |||
109 | case offsetof(struct user32, u_debugreg[6]): | ||
110 | child->thread.debugreg6 = val; | ||
111 | break; | ||
112 | |||
113 | case offsetof(struct user32, u_debugreg[7]): | ||
114 | val &= ~DR_CONTROL_RESERVED; | ||
115 | /* See arch/i386/kernel/ptrace.c for an explanation of | ||
116 | * this awkward check.*/ | ||
117 | for(i=0; i<4; i++) | ||
118 | if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) | ||
119 | return -EIO; | ||
120 | child->thread.debugreg7 = val; | ||
121 | if (val) | ||
122 | set_tsk_thread_flag(child, TIF_DEBUG); | ||
123 | else | ||
124 | clear_tsk_thread_flag(child, TIF_DEBUG); | ||
125 | break; | ||
126 | |||
127 | default: | ||
128 | if (regno > sizeof(struct user32) || (regno & 3)) | ||
129 | return -EIO; | ||
130 | |||
131 | /* Other dummy fields in the virtual user structure are ignored */ | ||
132 | break; | ||
133 | } | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | #undef R32 | ||
138 | |||
139 | #define R32(l,q) \ | ||
140 | case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break | ||
141 | |||
142 | static int getreg32(struct task_struct *child, unsigned regno, u32 *val) | ||
143 | { | ||
144 | __u64 *stack = (__u64 *)task_pt_regs(child); | ||
145 | |||
146 | switch (regno) { | ||
147 | case offsetof(struct user32, regs.fs): | ||
148 | *val = child->thread.fsindex; | ||
149 | break; | ||
150 | case offsetof(struct user32, regs.gs): | ||
151 | *val = child->thread.gsindex; | ||
152 | break; | ||
153 | case offsetof(struct user32, regs.ds): | ||
154 | *val = child->thread.ds; | ||
155 | break; | ||
156 | case offsetof(struct user32, regs.es): | ||
157 | *val = child->thread.es; | ||
158 | break; | ||
159 | |||
160 | R32(cs, cs); | ||
161 | R32(ss, ss); | ||
162 | R32(ebx, rbx); | ||
163 | R32(ecx, rcx); | ||
164 | R32(edx, rdx); | ||
165 | R32(edi, rdi); | ||
166 | R32(esi, rsi); | ||
167 | R32(ebp, rbp); | ||
168 | R32(eax, rax); | ||
169 | R32(orig_eax, orig_rax); | ||
170 | R32(eip, rip); | ||
171 | R32(eflags, eflags); | ||
172 | R32(esp, rsp); | ||
173 | |||
174 | case offsetof(struct user32, u_debugreg[0]): | ||
175 | *val = child->thread.debugreg0; | ||
176 | break; | ||
177 | case offsetof(struct user32, u_debugreg[1]): | ||
178 | *val = child->thread.debugreg1; | ||
179 | break; | ||
180 | case offsetof(struct user32, u_debugreg[2]): | ||
181 | *val = child->thread.debugreg2; | ||
182 | break; | ||
183 | case offsetof(struct user32, u_debugreg[3]): | ||
184 | *val = child->thread.debugreg3; | ||
185 | break; | ||
186 | case offsetof(struct user32, u_debugreg[6]): | ||
187 | *val = child->thread.debugreg6; | ||
188 | break; | ||
189 | case offsetof(struct user32, u_debugreg[7]): | ||
190 | *val = child->thread.debugreg7; | ||
191 | break; | ||
192 | |||
193 | default: | ||
194 | if (regno > sizeof(struct user32) || (regno & 3)) | ||
195 | return -EIO; | ||
196 | |||
197 | /* Other dummy fields in the virtual user structure are ignored */ | ||
198 | *val = 0; | ||
199 | break; | ||
200 | } | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | #undef R32 | ||
205 | |||
206 | static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) | ||
207 | { | ||
208 | int ret; | ||
209 | compat_siginfo_t __user *si32 = compat_ptr(data); | ||
210 | siginfo_t ssi; | ||
211 | siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); | ||
212 | if (request == PTRACE_SETSIGINFO) { | ||
213 | memset(&ssi, 0, sizeof(siginfo_t)); | ||
214 | ret = copy_siginfo_from_user32(&ssi, si32); | ||
215 | if (ret) | ||
216 | return ret; | ||
217 | if (copy_to_user(si, &ssi, sizeof(siginfo_t))) | ||
218 | return -EFAULT; | ||
219 | } | ||
220 | ret = sys_ptrace(request, pid, addr, (unsigned long)si); | ||
221 | if (ret) | ||
222 | return ret; | ||
223 | if (request == PTRACE_GETSIGINFO) { | ||
224 | if (copy_from_user(&ssi, si, sizeof(siginfo_t))) | ||
225 | return -EFAULT; | ||
226 | ret = copy_siginfo_to_user32(si32, &ssi); | ||
227 | } | ||
228 | return ret; | ||
229 | } | ||
230 | |||
231 | asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) | ||
232 | { | ||
233 | struct task_struct *child; | ||
234 | struct pt_regs *childregs; | ||
235 | void __user *datap = compat_ptr(data); | ||
236 | int ret; | ||
237 | __u32 val; | ||
238 | |||
239 | switch (request) { | ||
240 | case PTRACE_TRACEME: | ||
241 | case PTRACE_ATTACH: | ||
242 | case PTRACE_KILL: | ||
243 | case PTRACE_CONT: | ||
244 | case PTRACE_SINGLESTEP: | ||
245 | case PTRACE_DETACH: | ||
246 | case PTRACE_SYSCALL: | ||
247 | case PTRACE_OLDSETOPTIONS: | ||
248 | case PTRACE_SETOPTIONS: | ||
249 | case PTRACE_SET_THREAD_AREA: | ||
250 | case PTRACE_GET_THREAD_AREA: | ||
251 | return sys_ptrace(request, pid, addr, data); | ||
252 | |||
253 | default: | ||
254 | return -EINVAL; | ||
255 | |||
256 | case PTRACE_PEEKTEXT: | ||
257 | case PTRACE_PEEKDATA: | ||
258 | case PTRACE_POKEDATA: | ||
259 | case PTRACE_POKETEXT: | ||
260 | case PTRACE_POKEUSR: | ||
261 | case PTRACE_PEEKUSR: | ||
262 | case PTRACE_GETREGS: | ||
263 | case PTRACE_SETREGS: | ||
264 | case PTRACE_SETFPREGS: | ||
265 | case PTRACE_GETFPREGS: | ||
266 | case PTRACE_SETFPXREGS: | ||
267 | case PTRACE_GETFPXREGS: | ||
268 | case PTRACE_GETEVENTMSG: | ||
269 | break; | ||
270 | |||
271 | case PTRACE_SETSIGINFO: | ||
272 | case PTRACE_GETSIGINFO: | ||
273 | return ptrace32_siginfo(request, pid, addr, data); | ||
274 | } | ||
275 | |||
276 | child = ptrace_get_task_struct(pid); | ||
277 | if (IS_ERR(child)) | ||
278 | return PTR_ERR(child); | ||
279 | |||
280 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
281 | if (ret < 0) | ||
282 | goto out; | ||
283 | |||
284 | childregs = task_pt_regs(child); | ||
285 | |||
286 | switch (request) { | ||
287 | case PTRACE_PEEKDATA: | ||
288 | case PTRACE_PEEKTEXT: | ||
289 | ret = 0; | ||
290 | if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32)) | ||
291 | ret = -EIO; | ||
292 | else | ||
293 | ret = put_user(val, (unsigned int __user *)datap); | ||
294 | break; | ||
295 | |||
296 | case PTRACE_POKEDATA: | ||
297 | case PTRACE_POKETEXT: | ||
298 | ret = 0; | ||
299 | if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32)) | ||
300 | ret = -EIO; | ||
301 | break; | ||
302 | |||
303 | case PTRACE_PEEKUSR: | ||
304 | ret = getreg32(child, addr, &val); | ||
305 | if (ret == 0) | ||
306 | ret = put_user(val, (__u32 __user *)datap); | ||
307 | break; | ||
308 | |||
309 | case PTRACE_POKEUSR: | ||
310 | ret = putreg32(child, addr, data); | ||
311 | break; | ||
312 | |||
313 | case PTRACE_GETREGS: { /* Get all gp regs from the child. */ | ||
314 | int i; | ||
315 | if (!access_ok(VERIFY_WRITE, datap, 16*4)) { | ||
316 | ret = -EIO; | ||
317 | break; | ||
318 | } | ||
319 | ret = 0; | ||
320 | for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) { | ||
321 | getreg32(child, i, &val); | ||
322 | ret |= __put_user(val,(u32 __user *)datap); | ||
323 | datap += sizeof(u32); | ||
324 | } | ||
325 | break; | ||
326 | } | ||
327 | |||
328 | case PTRACE_SETREGS: { /* Set all gp regs in the child. */ | ||
329 | unsigned long tmp; | ||
330 | int i; | ||
331 | if (!access_ok(VERIFY_READ, datap, 16*4)) { | ||
332 | ret = -EIO; | ||
333 | break; | ||
334 | } | ||
335 | ret = 0; | ||
336 | for ( i = 0; i <= 16*4; i += sizeof(u32) ) { | ||
337 | ret |= __get_user(tmp, (u32 __user *)datap); | ||
338 | putreg32(child, i, tmp); | ||
339 | datap += sizeof(u32); | ||
340 | } | ||
341 | break; | ||
342 | } | ||
343 | |||
344 | case PTRACE_GETFPREGS: | ||
345 | ret = -EIO; | ||
346 | if (!access_ok(VERIFY_READ, compat_ptr(data), | ||
347 | sizeof(struct user_i387_struct))) | ||
348 | break; | ||
349 | save_i387_ia32(child, datap, childregs, 1); | ||
350 | ret = 0; | ||
351 | break; | ||
352 | |||
353 | case PTRACE_SETFPREGS: | ||
354 | ret = -EIO; | ||
355 | if (!access_ok(VERIFY_WRITE, datap, | ||
356 | sizeof(struct user_i387_struct))) | ||
357 | break; | ||
358 | ret = 0; | ||
359 | /* don't check EFAULT to be bug-to-bug compatible to i386 */ | ||
360 | restore_i387_ia32(child, datap, 1); | ||
361 | break; | ||
362 | |||
363 | case PTRACE_GETFPXREGS: { | ||
364 | struct user32_fxsr_struct __user *u = datap; | ||
365 | init_fpu(child); | ||
366 | ret = -EIO; | ||
367 | if (!access_ok(VERIFY_WRITE, u, sizeof(*u))) | ||
368 | break; | ||
369 | ret = -EFAULT; | ||
370 | if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u))) | ||
371 | break; | ||
372 | ret = __put_user(childregs->cs, &u->fcs); | ||
373 | ret |= __put_user(child->thread.ds, &u->fos); | ||
374 | break; | ||
375 | } | ||
376 | case PTRACE_SETFPXREGS: { | ||
377 | struct user32_fxsr_struct __user *u = datap; | ||
378 | unlazy_fpu(child); | ||
379 | ret = -EIO; | ||
380 | if (!access_ok(VERIFY_READ, u, sizeof(*u))) | ||
381 | break; | ||
382 | /* no checking to be bug-to-bug compatible with i386. */ | ||
383 | /* but silence warning */ | ||
384 | if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u))) | ||
385 | ; | ||
386 | set_stopped_child_used_math(child); | ||
387 | child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; | ||
388 | ret = 0; | ||
389 | break; | ||
390 | } | ||
391 | |||
392 | case PTRACE_GETEVENTMSG: | ||
393 | ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data)); | ||
394 | break; | ||
395 | |||
396 | default: | ||
397 | BUG(); | ||
398 | } | ||
399 | |||
400 | out: | ||
401 | put_task_struct(child); | ||
402 | return ret; | ||
403 | } | ||
404 | |||
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c deleted file mode 100644 index bee96d614432..000000000000 --- a/arch/x86_64/ia32/sys_ia32.c +++ /dev/null | |||
@@ -1,889 +0,0 @@ | |||
1 | /* | ||
2 | * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on | ||
3 | * sys_sparc32 | ||
4 | * | ||
5 | * Copyright (C) 2000 VA Linux Co | ||
6 | * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> | ||
7 | * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> | ||
8 | * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) | ||
9 | * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) | ||
10 | * Copyright (C) 2000 Hewlett-Packard Co. | ||
11 | * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
12 | * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) | ||
13 | * | ||
14 | * These routines maintain argument size conversion between 32bit and 64bit | ||
15 | * environment. In 2.5 most of this should be moved to a generic directory. | ||
16 | * | ||
17 | * This file assumes that there is a hole at the end of user address space. | ||
18 | * | ||
19 | * Some of the functions are LE specific currently. These are hopefully all marked. | ||
20 | * This should be fixed. | ||
21 | */ | ||
22 | |||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/sched.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/file.h> | ||
27 | #include <linux/signal.h> | ||
28 | #include <linux/syscalls.h> | ||
29 | #include <linux/resource.h> | ||
30 | #include <linux/times.h> | ||
31 | #include <linux/utsname.h> | ||
32 | #include <linux/smp.h> | ||
33 | #include <linux/smp_lock.h> | ||
34 | #include <linux/sem.h> | ||
35 | #include <linux/msg.h> | ||
36 | #include <linux/mm.h> | ||
37 | #include <linux/shm.h> | ||
38 | #include <linux/slab.h> | ||
39 | #include <linux/uio.h> | ||
40 | #include <linux/nfs_fs.h> | ||
41 | #include <linux/quota.h> | ||
42 | #include <linux/module.h> | ||
43 | #include <linux/sunrpc/svc.h> | ||
44 | #include <linux/nfsd/nfsd.h> | ||
45 | #include <linux/nfsd/cache.h> | ||
46 | #include <linux/nfsd/xdr.h> | ||
47 | #include <linux/nfsd/syscall.h> | ||
48 | #include <linux/poll.h> | ||
49 | #include <linux/personality.h> | ||
50 | #include <linux/stat.h> | ||
51 | #include <linux/ipc.h> | ||
52 | #include <linux/rwsem.h> | ||
53 | #include <linux/binfmts.h> | ||
54 | #include <linux/init.h> | ||
55 | #include <linux/aio_abi.h> | ||
56 | #include <linux/aio.h> | ||
57 | #include <linux/compat.h> | ||
58 | #include <linux/vfs.h> | ||
59 | #include <linux/ptrace.h> | ||
60 | #include <linux/highuid.h> | ||
61 | #include <linux/vmalloc.h> | ||
62 | #include <linux/fsnotify.h> | ||
63 | #include <linux/sysctl.h> | ||
64 | #include <asm/mman.h> | ||
65 | #include <asm/types.h> | ||
66 | #include <asm/uaccess.h> | ||
67 | #include <asm/semaphore.h> | ||
68 | #include <asm/atomic.h> | ||
69 | #include <asm/ldt.h> | ||
70 | |||
71 | #include <net/scm.h> | ||
72 | #include <net/sock.h> | ||
73 | #include <asm/ia32.h> | ||
74 | |||
75 | #define AA(__x) ((unsigned long)(__x)) | ||
76 | |||
77 | int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) | ||
78 | { | ||
79 | compat_ino_t ino; | ||
80 | |||
81 | typeof(ubuf->st_uid) uid = 0; | ||
82 | typeof(ubuf->st_gid) gid = 0; | ||
83 | SET_UID(uid, kbuf->uid); | ||
84 | SET_GID(gid, kbuf->gid); | ||
85 | if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) | ||
86 | return -EOVERFLOW; | ||
87 | if (kbuf->size >= 0x7fffffff) | ||
88 | return -EOVERFLOW; | ||
89 | ino = kbuf->ino; | ||
90 | if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) | ||
91 | return -EOVERFLOW; | ||
92 | if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || | ||
93 | __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || | ||
94 | __put_user (ino, &ubuf->st_ino) || | ||
95 | __put_user (kbuf->mode, &ubuf->st_mode) || | ||
96 | __put_user (kbuf->nlink, &ubuf->st_nlink) || | ||
97 | __put_user (uid, &ubuf->st_uid) || | ||
98 | __put_user (gid, &ubuf->st_gid) || | ||
99 | __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || | ||
100 | __put_user (kbuf->size, &ubuf->st_size) || | ||
101 | __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || | ||
102 | __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || | ||
103 | __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || | ||
104 | __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || | ||
105 | __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || | ||
106 | __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || | ||
107 | __put_user (kbuf->blksize, &ubuf->st_blksize) || | ||
108 | __put_user (kbuf->blocks, &ubuf->st_blocks)) | ||
109 | return -EFAULT; | ||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | asmlinkage long | ||
114 | sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) | ||
115 | { | ||
116 | return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); | ||
117 | } | ||
118 | |||
119 | asmlinkage long | ||
120 | sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) | ||
121 | { | ||
122 | return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); | ||
123 | } | ||
124 | |||
125 | /* Another set for IA32/LFS -- x86_64 struct stat is different due to | ||
126 | support for 64bit inode numbers. */ | ||
127 | |||
128 | static int | ||
129 | cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) | ||
130 | { | ||
131 | typeof(ubuf->st_uid) uid = 0; | ||
132 | typeof(ubuf->st_gid) gid = 0; | ||
133 | SET_UID(uid, stat->uid); | ||
134 | SET_GID(gid, stat->gid); | ||
135 | if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || | ||
136 | __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || | ||
137 | __put_user (stat->ino, &ubuf->__st_ino) || | ||
138 | __put_user (stat->ino, &ubuf->st_ino) || | ||
139 | __put_user (stat->mode, &ubuf->st_mode) || | ||
140 | __put_user (stat->nlink, &ubuf->st_nlink) || | ||
141 | __put_user (uid, &ubuf->st_uid) || | ||
142 | __put_user (gid, &ubuf->st_gid) || | ||
143 | __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || | ||
144 | __put_user (stat->size, &ubuf->st_size) || | ||
145 | __put_user (stat->atime.tv_sec, &ubuf->st_atime) || | ||
146 | __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || | ||
147 | __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || | ||
148 | __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || | ||
149 | __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || | ||
150 | __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || | ||
151 | __put_user (stat->blksize, &ubuf->st_blksize) || | ||
152 | __put_user (stat->blocks, &ubuf->st_blocks)) | ||
153 | return -EFAULT; | ||
154 | return 0; | ||
155 | } | ||
156 | |||
157 | asmlinkage long | ||
158 | sys32_stat64(char __user * filename, struct stat64 __user *statbuf) | ||
159 | { | ||
160 | struct kstat stat; | ||
161 | int ret = vfs_stat(filename, &stat); | ||
162 | if (!ret) | ||
163 | ret = cp_stat64(statbuf, &stat); | ||
164 | return ret; | ||
165 | } | ||
166 | |||
167 | asmlinkage long | ||
168 | sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) | ||
169 | { | ||
170 | struct kstat stat; | ||
171 | int ret = vfs_lstat(filename, &stat); | ||
172 | if (!ret) | ||
173 | ret = cp_stat64(statbuf, &stat); | ||
174 | return ret; | ||
175 | } | ||
176 | |||
177 | asmlinkage long | ||
178 | sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) | ||
179 | { | ||
180 | struct kstat stat; | ||
181 | int ret = vfs_fstat(fd, &stat); | ||
182 | if (!ret) | ||
183 | ret = cp_stat64(statbuf, &stat); | ||
184 | return ret; | ||
185 | } | ||
186 | |||
187 | asmlinkage long | ||
188 | sys32_fstatat(unsigned int dfd, char __user *filename, | ||
189 | struct stat64 __user* statbuf, int flag) | ||
190 | { | ||
191 | struct kstat stat; | ||
192 | int error = -EINVAL; | ||
193 | |||
194 | if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) | ||
195 | goto out; | ||
196 | |||
197 | if (flag & AT_SYMLINK_NOFOLLOW) | ||
198 | error = vfs_lstat_fd(dfd, filename, &stat); | ||
199 | else | ||
200 | error = vfs_stat_fd(dfd, filename, &stat); | ||
201 | |||
202 | if (!error) | ||
203 | error = cp_stat64(statbuf, &stat); | ||
204 | |||
205 | out: | ||
206 | return error; | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * Linux/i386 didn't use to be able to handle more than | ||
211 | * 4 system call parameters, so these system calls used a memory | ||
212 | * block for parameter passing.. | ||
213 | */ | ||
214 | |||
215 | struct mmap_arg_struct { | ||
216 | unsigned int addr; | ||
217 | unsigned int len; | ||
218 | unsigned int prot; | ||
219 | unsigned int flags; | ||
220 | unsigned int fd; | ||
221 | unsigned int offset; | ||
222 | }; | ||
223 | |||
224 | asmlinkage long | ||
225 | sys32_mmap(struct mmap_arg_struct __user *arg) | ||
226 | { | ||
227 | struct mmap_arg_struct a; | ||
228 | struct file *file = NULL; | ||
229 | unsigned long retval; | ||
230 | struct mm_struct *mm ; | ||
231 | |||
232 | if (copy_from_user(&a, arg, sizeof(a))) | ||
233 | return -EFAULT; | ||
234 | |||
235 | if (a.offset & ~PAGE_MASK) | ||
236 | return -EINVAL; | ||
237 | |||
238 | if (!(a.flags & MAP_ANONYMOUS)) { | ||
239 | file = fget(a.fd); | ||
240 | if (!file) | ||
241 | return -EBADF; | ||
242 | } | ||
243 | |||
244 | mm = current->mm; | ||
245 | down_write(&mm->mmap_sem); | ||
246 | retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); | ||
247 | if (file) | ||
248 | fput(file); | ||
249 | |||
250 | up_write(&mm->mmap_sem); | ||
251 | |||
252 | return retval; | ||
253 | } | ||
254 | |||
255 | asmlinkage long | ||
256 | sys32_mprotect(unsigned long start, size_t len, unsigned long prot) | ||
257 | { | ||
258 | return sys_mprotect(start,len,prot); | ||
259 | } | ||
260 | |||
261 | asmlinkage long | ||
262 | sys32_pipe(int __user *fd) | ||
263 | { | ||
264 | int retval; | ||
265 | int fds[2]; | ||
266 | |||
267 | retval = do_pipe(fds); | ||
268 | if (retval) | ||
269 | goto out; | ||
270 | if (copy_to_user(fd, fds, sizeof(fds))) | ||
271 | retval = -EFAULT; | ||
272 | out: | ||
273 | return retval; | ||
274 | } | ||
275 | |||
276 | asmlinkage long | ||
277 | sys32_rt_sigaction(int sig, struct sigaction32 __user *act, | ||
278 | struct sigaction32 __user *oact, unsigned int sigsetsize) | ||
279 | { | ||
280 | struct k_sigaction new_ka, old_ka; | ||
281 | int ret; | ||
282 | compat_sigset_t set32; | ||
283 | |||
284 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
285 | if (sigsetsize != sizeof(compat_sigset_t)) | ||
286 | return -EINVAL; | ||
287 | |||
288 | if (act) { | ||
289 | compat_uptr_t handler, restorer; | ||
290 | |||
291 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
292 | __get_user(handler, &act->sa_handler) || | ||
293 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
294 | __get_user(restorer, &act->sa_restorer)|| | ||
295 | __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) | ||
296 | return -EFAULT; | ||
297 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
298 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
299 | /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ | ||
300 | switch (_NSIG_WORDS) { | ||
301 | case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] | ||
302 | | (((long)set32.sig[7]) << 32); | ||
303 | case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4] | ||
304 | | (((long)set32.sig[5]) << 32); | ||
305 | case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2] | ||
306 | | (((long)set32.sig[3]) << 32); | ||
307 | case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0] | ||
308 | | (((long)set32.sig[1]) << 32); | ||
309 | } | ||
310 | } | ||
311 | |||
312 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
313 | |||
314 | if (!ret && oact) { | ||
315 | /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ | ||
316 | switch (_NSIG_WORDS) { | ||
317 | case 4: | ||
318 | set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); | ||
319 | set32.sig[6] = old_ka.sa.sa_mask.sig[3]; | ||
320 | case 3: | ||
321 | set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32); | ||
322 | set32.sig[4] = old_ka.sa.sa_mask.sig[2]; | ||
323 | case 2: | ||
324 | set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32); | ||
325 | set32.sig[2] = old_ka.sa.sa_mask.sig[1]; | ||
326 | case 1: | ||
327 | set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32); | ||
328 | set32.sig[0] = old_ka.sa.sa_mask.sig[0]; | ||
329 | } | ||
330 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
331 | __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || | ||
332 | __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || | ||
333 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
334 | __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) | ||
335 | return -EFAULT; | ||
336 | } | ||
337 | |||
338 | return ret; | ||
339 | } | ||
340 | |||
341 | asmlinkage long | ||
342 | sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) | ||
343 | { | ||
344 | struct k_sigaction new_ka, old_ka; | ||
345 | int ret; | ||
346 | |||
347 | if (act) { | ||
348 | compat_old_sigset_t mask; | ||
349 | compat_uptr_t handler, restorer; | ||
350 | |||
351 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
352 | __get_user(handler, &act->sa_handler) || | ||
353 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
354 | __get_user(restorer, &act->sa_restorer) || | ||
355 | __get_user(mask, &act->sa_mask)) | ||
356 | return -EFAULT; | ||
357 | |||
358 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
359 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
360 | |||
361 | siginitset(&new_ka.sa.sa_mask, mask); | ||
362 | } | ||
363 | |||
364 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
365 | |||
366 | if (!ret && oact) { | ||
367 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
368 | __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || | ||
369 | __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || | ||
370 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
371 | __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) | ||
372 | return -EFAULT; | ||
373 | } | ||
374 | |||
375 | return ret; | ||
376 | } | ||
377 | |||
378 | asmlinkage long | ||
379 | sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, | ||
380 | compat_sigset_t __user *oset, unsigned int sigsetsize) | ||
381 | { | ||
382 | sigset_t s; | ||
383 | compat_sigset_t s32; | ||
384 | int ret; | ||
385 | mm_segment_t old_fs = get_fs(); | ||
386 | |||
387 | if (set) { | ||
388 | if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) | ||
389 | return -EFAULT; | ||
390 | switch (_NSIG_WORDS) { | ||
391 | case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); | ||
392 | case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32); | ||
393 | case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32); | ||
394 | case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); | ||
395 | } | ||
396 | } | ||
397 | set_fs (KERNEL_DS); | ||
398 | ret = sys_rt_sigprocmask(how, | ||
399 | set ? (sigset_t __user *)&s : NULL, | ||
400 | oset ? (sigset_t __user *)&s : NULL, | ||
401 | sigsetsize); | ||
402 | set_fs (old_fs); | ||
403 | if (ret) return ret; | ||
404 | if (oset) { | ||
405 | switch (_NSIG_WORDS) { | ||
406 | case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; | ||
407 | case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; | ||
408 | case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; | ||
409 | case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; | ||
410 | } | ||
411 | if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) | ||
412 | return -EFAULT; | ||
413 | } | ||
414 | return 0; | ||
415 | } | ||
416 | |||
417 | static inline long | ||
418 | get_tv32(struct timeval *o, struct compat_timeval __user *i) | ||
419 | { | ||
420 | int err = -EFAULT; | ||
421 | if (access_ok(VERIFY_READ, i, sizeof(*i))) { | ||
422 | err = __get_user(o->tv_sec, &i->tv_sec); | ||
423 | err |= __get_user(o->tv_usec, &i->tv_usec); | ||
424 | } | ||
425 | return err; | ||
426 | } | ||
427 | |||
428 | static inline long | ||
429 | put_tv32(struct compat_timeval __user *o, struct timeval *i) | ||
430 | { | ||
431 | int err = -EFAULT; | ||
432 | if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { | ||
433 | err = __put_user(i->tv_sec, &o->tv_sec); | ||
434 | err |= __put_user(i->tv_usec, &o->tv_usec); | ||
435 | } | ||
436 | return err; | ||
437 | } | ||
438 | |||
439 | extern unsigned int alarm_setitimer(unsigned int seconds); | ||
440 | |||
441 | asmlinkage long | ||
442 | sys32_alarm(unsigned int seconds) | ||
443 | { | ||
444 | return alarm_setitimer(seconds); | ||
445 | } | ||
446 | |||
447 | /* Translations due to time_t size differences. Which affects all | ||
448 | sorts of things, like timeval and itimerval. */ | ||
449 | |||
450 | extern struct timezone sys_tz; | ||
451 | |||
452 | asmlinkage long | ||
453 | sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) | ||
454 | { | ||
455 | if (tv) { | ||
456 | struct timeval ktv; | ||
457 | do_gettimeofday(&ktv); | ||
458 | if (put_tv32(tv, &ktv)) | ||
459 | return -EFAULT; | ||
460 | } | ||
461 | if (tz) { | ||
462 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | ||
463 | return -EFAULT; | ||
464 | } | ||
465 | return 0; | ||
466 | } | ||
467 | |||
468 | asmlinkage long | ||
469 | sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) | ||
470 | { | ||
471 | struct timeval ktv; | ||
472 | struct timespec kts; | ||
473 | struct timezone ktz; | ||
474 | |||
475 | if (tv) { | ||
476 | if (get_tv32(&ktv, tv)) | ||
477 | return -EFAULT; | ||
478 | kts.tv_sec = ktv.tv_sec; | ||
479 | kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; | ||
480 | } | ||
481 | if (tz) { | ||
482 | if (copy_from_user(&ktz, tz, sizeof(ktz))) | ||
483 | return -EFAULT; | ||
484 | } | ||
485 | |||
486 | return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); | ||
487 | } | ||
488 | |||
489 | struct sel_arg_struct { | ||
490 | unsigned int n; | ||
491 | unsigned int inp; | ||
492 | unsigned int outp; | ||
493 | unsigned int exp; | ||
494 | unsigned int tvp; | ||
495 | }; | ||
496 | |||
497 | asmlinkage long | ||
498 | sys32_old_select(struct sel_arg_struct __user *arg) | ||
499 | { | ||
500 | struct sel_arg_struct a; | ||
501 | |||
502 | if (copy_from_user(&a, arg, sizeof(a))) | ||
503 | return -EFAULT; | ||
504 | return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), | ||
505 | compat_ptr(a.exp), compat_ptr(a.tvp)); | ||
506 | } | ||
507 | |||
508 | extern asmlinkage long | ||
509 | compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, | ||
510 | struct compat_rusage *ru); | ||
511 | |||
512 | asmlinkage long | ||
513 | sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) | ||
514 | { | ||
515 | return compat_sys_wait4(pid, stat_addr, options, NULL); | ||
516 | } | ||
517 | |||
518 | /* 32-bit timeval and related flotsam. */ | ||
519 | |||
520 | asmlinkage long | ||
521 | sys32_sysfs(int option, u32 arg1, u32 arg2) | ||
522 | { | ||
523 | return sys_sysfs(option, arg1, arg2); | ||
524 | } | ||
525 | |||
526 | asmlinkage long | ||
527 | sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) | ||
528 | { | ||
529 | struct timespec t; | ||
530 | int ret; | ||
531 | mm_segment_t old_fs = get_fs (); | ||
532 | |||
533 | set_fs (KERNEL_DS); | ||
534 | ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); | ||
535 | set_fs (old_fs); | ||
536 | if (put_compat_timespec(&t, interval)) | ||
537 | return -EFAULT; | ||
538 | return ret; | ||
539 | } | ||
540 | |||
541 | asmlinkage long | ||
542 | sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) | ||
543 | { | ||
544 | sigset_t s; | ||
545 | compat_sigset_t s32; | ||
546 | int ret; | ||
547 | mm_segment_t old_fs = get_fs(); | ||
548 | |||
549 | set_fs (KERNEL_DS); | ||
550 | ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); | ||
551 | set_fs (old_fs); | ||
552 | if (!ret) { | ||
553 | switch (_NSIG_WORDS) { | ||
554 | case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; | ||
555 | case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; | ||
556 | case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; | ||
557 | case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; | ||
558 | } | ||
559 | if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) | ||
560 | return -EFAULT; | ||
561 | } | ||
562 | return ret; | ||
563 | } | ||
564 | |||
565 | asmlinkage long | ||
566 | sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) | ||
567 | { | ||
568 | siginfo_t info; | ||
569 | int ret; | ||
570 | mm_segment_t old_fs = get_fs(); | ||
571 | |||
572 | if (copy_siginfo_from_user32(&info, uinfo)) | ||
573 | return -EFAULT; | ||
574 | set_fs (KERNEL_DS); | ||
575 | ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); | ||
576 | set_fs (old_fs); | ||
577 | return ret; | ||
578 | } | ||
579 | |||
580 | /* These are here just in case some old ia32 binary calls it. */ | ||
581 | asmlinkage long | ||
582 | sys32_pause(void) | ||
583 | { | ||
584 | current->state = TASK_INTERRUPTIBLE; | ||
585 | schedule(); | ||
586 | return -ERESTARTNOHAND; | ||
587 | } | ||
588 | |||
589 | |||
590 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
591 | struct sysctl_ia32 { | ||
592 | unsigned int name; | ||
593 | int nlen; | ||
594 | unsigned int oldval; | ||
595 | unsigned int oldlenp; | ||
596 | unsigned int newval; | ||
597 | unsigned int newlen; | ||
598 | unsigned int __unused[4]; | ||
599 | }; | ||
600 | |||
601 | |||
602 | asmlinkage long | ||
603 | sys32_sysctl(struct sysctl_ia32 __user *args32) | ||
604 | { | ||
605 | struct sysctl_ia32 a32; | ||
606 | mm_segment_t old_fs = get_fs (); | ||
607 | void __user *oldvalp, *newvalp; | ||
608 | size_t oldlen; | ||
609 | int __user *namep; | ||
610 | long ret; | ||
611 | |||
612 | if (copy_from_user(&a32, args32, sizeof (a32))) | ||
613 | return -EFAULT; | ||
614 | |||
615 | /* | ||
616 | * We need to pre-validate these because we have to disable address checking | ||
617 | * before calling do_sysctl() because of OLDLEN but we can't run the risk of the | ||
618 | * user specifying bad addresses here. Well, since we're dealing with 32 bit | ||
619 | * addresses, we KNOW that access_ok() will always succeed, so this is an | ||
620 | * expensive NOP, but so what... | ||
621 | */ | ||
622 | namep = compat_ptr(a32.name); | ||
623 | oldvalp = compat_ptr(a32.oldval); | ||
624 | newvalp = compat_ptr(a32.newval); | ||
625 | |||
626 | if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp))) | ||
627 | || !access_ok(VERIFY_WRITE, namep, 0) | ||
628 | || !access_ok(VERIFY_WRITE, oldvalp, 0) | ||
629 | || !access_ok(VERIFY_WRITE, newvalp, 0)) | ||
630 | return -EFAULT; | ||
631 | |||
632 | set_fs(KERNEL_DS); | ||
633 | lock_kernel(); | ||
634 | ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen, | ||
635 | newvalp, (size_t) a32.newlen); | ||
636 | unlock_kernel(); | ||
637 | set_fs(old_fs); | ||
638 | |||
639 | if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp))) | ||
640 | return -EFAULT; | ||
641 | |||
642 | return ret; | ||
643 | } | ||
644 | #endif | ||
645 | |||
646 | /* warning: next two assume little endian */ | ||
647 | asmlinkage long | ||
648 | sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) | ||
649 | { | ||
650 | return sys_pread64(fd, ubuf, count, | ||
651 | ((loff_t)AA(poshi) << 32) | AA(poslo)); | ||
652 | } | ||
653 | |||
654 | asmlinkage long | ||
655 | sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) | ||
656 | { | ||
657 | return sys_pwrite64(fd, ubuf, count, | ||
658 | ((loff_t)AA(poshi) << 32) | AA(poslo)); | ||
659 | } | ||
660 | |||
661 | |||
662 | asmlinkage long | ||
663 | sys32_personality(unsigned long personality) | ||
664 | { | ||
665 | int ret; | ||
666 | if (personality(current->personality) == PER_LINUX32 && | ||
667 | personality == PER_LINUX) | ||
668 | personality = PER_LINUX32; | ||
669 | ret = sys_personality(personality); | ||
670 | if (ret == PER_LINUX32) | ||
671 | ret = PER_LINUX; | ||
672 | return ret; | ||
673 | } | ||
674 | |||
675 | asmlinkage long | ||
676 | sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) | ||
677 | { | ||
678 | mm_segment_t old_fs = get_fs(); | ||
679 | int ret; | ||
680 | off_t of; | ||
681 | |||
682 | if (offset && get_user(of, offset)) | ||
683 | return -EFAULT; | ||
684 | |||
685 | set_fs(KERNEL_DS); | ||
686 | ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, | ||
687 | count); | ||
688 | set_fs(old_fs); | ||
689 | |||
690 | if (offset && put_user(of, offset)) | ||
691 | return -EFAULT; | ||
692 | |||
693 | return ret; | ||
694 | } | ||
695 | |||
696 | asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, | ||
697 | unsigned long prot, unsigned long flags, | ||
698 | unsigned long fd, unsigned long pgoff) | ||
699 | { | ||
700 | struct mm_struct *mm = current->mm; | ||
701 | unsigned long error; | ||
702 | struct file * file = NULL; | ||
703 | |||
704 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
705 | if (!(flags & MAP_ANONYMOUS)) { | ||
706 | file = fget(fd); | ||
707 | if (!file) | ||
708 | return -EBADF; | ||
709 | } | ||
710 | |||
711 | down_write(&mm->mmap_sem); | ||
712 | error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
713 | up_write(&mm->mmap_sem); | ||
714 | |||
715 | if (file) | ||
716 | fput(file); | ||
717 | return error; | ||
718 | } | ||
719 | |||
720 | asmlinkage long sys32_olduname(struct oldold_utsname __user * name) | ||
721 | { | ||
722 | int err; | ||
723 | |||
724 | if (!name) | ||
725 | return -EFAULT; | ||
726 | if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) | ||
727 | return -EFAULT; | ||
728 | |||
729 | down_read(&uts_sem); | ||
730 | |||
731 | err = __copy_to_user(&name->sysname,&utsname()->sysname, | ||
732 | __OLD_UTS_LEN); | ||
733 | err |= __put_user(0,name->sysname+__OLD_UTS_LEN); | ||
734 | err |= __copy_to_user(&name->nodename,&utsname()->nodename, | ||
735 | __OLD_UTS_LEN); | ||
736 | err |= __put_user(0,name->nodename+__OLD_UTS_LEN); | ||
737 | err |= __copy_to_user(&name->release,&utsname()->release, | ||
738 | __OLD_UTS_LEN); | ||
739 | err |= __put_user(0,name->release+__OLD_UTS_LEN); | ||
740 | err |= __copy_to_user(&name->version,&utsname()->version, | ||
741 | __OLD_UTS_LEN); | ||
742 | err |= __put_user(0,name->version+__OLD_UTS_LEN); | ||
743 | { | ||
744 | char *arch = "x86_64"; | ||
745 | if (personality(current->personality) == PER_LINUX32) | ||
746 | arch = "i686"; | ||
747 | |||
748 | err |= __copy_to_user(&name->machine, arch, strlen(arch)+1); | ||
749 | } | ||
750 | |||
751 | up_read(&uts_sem); | ||
752 | |||
753 | err = err ? -EFAULT : 0; | ||
754 | |||
755 | return err; | ||
756 | } | ||
757 | |||
758 | long sys32_uname(struct old_utsname __user * name) | ||
759 | { | ||
760 | int err; | ||
761 | if (!name) | ||
762 | return -EFAULT; | ||
763 | down_read(&uts_sem); | ||
764 | err = copy_to_user(name, utsname(), sizeof (*name)); | ||
765 | up_read(&uts_sem); | ||
766 | if (personality(current->personality) == PER_LINUX32) | ||
767 | err |= copy_to_user(&name->machine, "i686", 5); | ||
768 | return err?-EFAULT:0; | ||
769 | } | ||
770 | |||
771 | long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) | ||
772 | { | ||
773 | struct ustat u; | ||
774 | mm_segment_t seg; | ||
775 | int ret; | ||
776 | |||
777 | seg = get_fs(); | ||
778 | set_fs(KERNEL_DS); | ||
779 | ret = sys_ustat(dev, (struct ustat __user *)&u); | ||
780 | set_fs(seg); | ||
781 | if (ret >= 0) { | ||
782 | if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || | ||
783 | __put_user((__u32) u.f_tfree, &u32p->f_tfree) || | ||
784 | __put_user((__u32) u.f_tinode, &u32p->f_tfree) || | ||
785 | __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || | ||
786 | __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) | ||
787 | ret = -EFAULT; | ||
788 | } | ||
789 | return ret; | ||
790 | } | ||
791 | |||
792 | asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, | ||
793 | compat_uptr_t __user *envp, struct pt_regs *regs) | ||
794 | { | ||
795 | long error; | ||
796 | char * filename; | ||
797 | |||
798 | filename = getname(name); | ||
799 | error = PTR_ERR(filename); | ||
800 | if (IS_ERR(filename)) | ||
801 | return error; | ||
802 | error = compat_do_execve(filename, argv, envp, regs); | ||
803 | if (error == 0) { | ||
804 | task_lock(current); | ||
805 | current->ptrace &= ~PT_DTRACE; | ||
806 | task_unlock(current); | ||
807 | } | ||
808 | putname(filename); | ||
809 | return error; | ||
810 | } | ||
811 | |||
812 | asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, | ||
813 | struct pt_regs *regs) | ||
814 | { | ||
815 | void __user *parent_tid = (void __user *)regs->rdx; | ||
816 | void __user *child_tid = (void __user *)regs->rdi; | ||
817 | if (!newsp) | ||
818 | newsp = regs->rsp; | ||
819 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | ||
820 | } | ||
821 | |||
822 | /* | ||
823 | * Some system calls that need sign extended arguments. This could be done by a generic wrapper. | ||
824 | */ | ||
825 | |||
826 | long sys32_lseek (unsigned int fd, int offset, unsigned int whence) | ||
827 | { | ||
828 | return sys_lseek(fd, offset, whence); | ||
829 | } | ||
830 | |||
831 | long sys32_kill(int pid, int sig) | ||
832 | { | ||
833 | return sys_kill(pid, sig); | ||
834 | } | ||
835 | |||
836 | long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, | ||
837 | __u32 len_low, __u32 len_high, int advice) | ||
838 | { | ||
839 | return sys_fadvise64_64(fd, | ||
840 | (((u64)offset_high)<<32) | offset_low, | ||
841 | (((u64)len_high)<<32) | len_low, | ||
842 | advice); | ||
843 | } | ||
844 | |||
845 | long sys32_vm86_warning(void) | ||
846 | { | ||
847 | struct task_struct *me = current; | ||
848 | static char lastcomm[sizeof(me->comm)]; | ||
849 | if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { | ||
850 | compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", | ||
851 | me->comm); | ||
852 | strncpy(lastcomm, me->comm, sizeof(lastcomm)); | ||
853 | } | ||
854 | return -ENOSYS; | ||
855 | } | ||
856 | |||
857 | long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, | ||
858 | char __user * buf, size_t len) | ||
859 | { | ||
860 | return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); | ||
861 | } | ||
862 | |||
863 | asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) | ||
864 | { | ||
865 | return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); | ||
866 | } | ||
867 | |||
868 | asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, | ||
869 | unsigned n_low, unsigned n_hi, int flags) | ||
870 | { | ||
871 | return sys_sync_file_range(fd, | ||
872 | ((u64)off_hi << 32) | off_low, | ||
873 | ((u64)n_hi << 32) | n_low, flags); | ||
874 | } | ||
875 | |||
876 | asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len, | ||
877 | int advice) | ||
878 | { | ||
879 | return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, | ||
880 | len, advice); | ||
881 | } | ||
882 | |||
883 | asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, | ||
884 | unsigned offset_hi, unsigned len_lo, | ||
885 | unsigned len_hi) | ||
886 | { | ||
887 | return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, | ||
888 | ((u64)len_hi << 32) | len_lo); | ||
889 | } | ||
diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c deleted file mode 100644 index 15013bac181c..000000000000 --- a/arch/x86_64/ia32/syscall32.c +++ /dev/null | |||
@@ -1,83 +0,0 @@ | |||
1 | /* Copyright 2002,2003 Andi Kleen, SuSE Labs */ | ||
2 | |||
3 | /* vsyscall handling for 32bit processes. Map a stub page into it | ||
4 | on demand because 32bit cannot reach the kernel's fixmaps */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/gfp.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/stringify.h> | ||
12 | #include <linux/security.h> | ||
13 | #include <asm/proto.h> | ||
14 | #include <asm/tlbflush.h> | ||
15 | #include <asm/ia32_unistd.h> | ||
16 | #include <asm/vsyscall32.h> | ||
17 | |||
18 | extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; | ||
19 | extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; | ||
20 | extern int sysctl_vsyscall32; | ||
21 | |||
22 | static struct page *syscall32_pages[1]; | ||
23 | static int use_sysenter = -1; | ||
24 | |||
25 | struct linux_binprm; | ||
26 | |||
27 | /* Setup a VMA at program startup for the vsyscall page */ | ||
28 | int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) | ||
29 | { | ||
30 | struct mm_struct *mm = current->mm; | ||
31 | int ret; | ||
32 | |||
33 | down_write(&mm->mmap_sem); | ||
34 | /* | ||
35 | * MAYWRITE to allow gdb to COW and set breakpoints | ||
36 | * | ||
37 | * Make sure the vDSO gets into every core dump. | ||
38 | * Dumping its contents makes post-mortem fully interpretable later | ||
39 | * without matching up the same kernel and hardware config to see | ||
40 | * what PC values meant. | ||
41 | */ | ||
42 | /* Could randomize here */ | ||
43 | ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE, | ||
44 | VM_READ|VM_EXEC| | ||
45 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| | ||
46 | VM_ALWAYSDUMP, | ||
47 | syscall32_pages); | ||
48 | up_write(&mm->mmap_sem); | ||
49 | return ret; | ||
50 | } | ||
51 | |||
52 | static int __init init_syscall32(void) | ||
53 | { | ||
54 | char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); | ||
55 | if (!syscall32_page) | ||
56 | panic("Cannot allocate syscall32 page"); | ||
57 | syscall32_pages[0] = virt_to_page(syscall32_page); | ||
58 | if (use_sysenter > 0) { | ||
59 | memcpy(syscall32_page, syscall32_sysenter, | ||
60 | syscall32_sysenter_end - syscall32_sysenter); | ||
61 | } else { | ||
62 | memcpy(syscall32_page, syscall32_syscall, | ||
63 | syscall32_syscall_end - syscall32_syscall); | ||
64 | } | ||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | __initcall(init_syscall32); | ||
69 | |||
70 | /* May not be __init: called during resume */ | ||
71 | void syscall32_cpu_init(void) | ||
72 | { | ||
73 | if (use_sysenter < 0) | ||
74 | use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); | ||
75 | |||
76 | /* Load these always in case some future AMD CPU supports | ||
77 | SYSENTER from compat mode too. */ | ||
78 | checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); | ||
79 | checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); | ||
80 | checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); | ||
81 | |||
82 | wrmsrl(MSR_CSTAR, ia32_cstar_target); | ||
83 | } | ||
diff --git a/arch/x86_64/ia32/syscall32_syscall.S b/arch/x86_64/ia32/syscall32_syscall.S deleted file mode 100644 index 8f8271bdf135..000000000000 --- a/arch/x86_64/ia32/syscall32_syscall.S +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | /* 32bit VDSOs mapped into user space. */ | ||
2 | |||
3 | .section ".init.data","aw" | ||
4 | |||
5 | .globl syscall32_syscall | ||
6 | .globl syscall32_syscall_end | ||
7 | |||
8 | syscall32_syscall: | ||
9 | .incbin "arch/x86_64/ia32/vsyscall-syscall.so" | ||
10 | syscall32_syscall_end: | ||
11 | |||
12 | .globl syscall32_sysenter | ||
13 | .globl syscall32_sysenter_end | ||
14 | |||
15 | syscall32_sysenter: | ||
16 | .incbin "arch/x86_64/ia32/vsyscall-sysenter.so" | ||
17 | syscall32_sysenter_end: | ||
diff --git a/arch/x86_64/ia32/tls32.c b/arch/x86_64/ia32/tls32.c deleted file mode 100644 index 1cc4340de3ca..000000000000 --- a/arch/x86_64/ia32/tls32.c +++ /dev/null | |||
@@ -1,163 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/user.h> | ||
5 | |||
6 | #include <asm/uaccess.h> | ||
7 | #include <asm/desc.h> | ||
8 | #include <asm/system.h> | ||
9 | #include <asm/ldt.h> | ||
10 | #include <asm/processor.h> | ||
11 | #include <asm/proto.h> | ||
12 | |||
13 | /* | ||
14 | * sys_alloc_thread_area: get a yet unused TLS descriptor index. | ||
15 | */ | ||
16 | static int get_free_idx(void) | ||
17 | { | ||
18 | struct thread_struct *t = ¤t->thread; | ||
19 | int idx; | ||
20 | |||
21 | for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) | ||
22 | if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx)) | ||
23 | return idx + GDT_ENTRY_TLS_MIN; | ||
24 | return -ESRCH; | ||
25 | } | ||
26 | |||
27 | /* | ||
28 | * Set a given TLS descriptor: | ||
29 | * When you want addresses > 32bit use arch_prctl() | ||
30 | */ | ||
31 | int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info) | ||
32 | { | ||
33 | struct user_desc info; | ||
34 | struct n_desc_struct *desc; | ||
35 | int cpu, idx; | ||
36 | |||
37 | if (copy_from_user(&info, u_info, sizeof(info))) | ||
38 | return -EFAULT; | ||
39 | |||
40 | idx = info.entry_number; | ||
41 | |||
42 | /* | ||
43 | * index -1 means the kernel should try to find and | ||
44 | * allocate an empty descriptor: | ||
45 | */ | ||
46 | if (idx == -1) { | ||
47 | idx = get_free_idx(); | ||
48 | if (idx < 0) | ||
49 | return idx; | ||
50 | if (put_user(idx, &u_info->entry_number)) | ||
51 | return -EFAULT; | ||
52 | } | ||
53 | |||
54 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
55 | return -EINVAL; | ||
56 | |||
57 | desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; | ||
58 | |||
59 | /* | ||
60 | * We must not get preempted while modifying the TLS. | ||
61 | */ | ||
62 | cpu = get_cpu(); | ||
63 | |||
64 | if (LDT_empty(&info)) { | ||
65 | desc->a = 0; | ||
66 | desc->b = 0; | ||
67 | } else { | ||
68 | desc->a = LDT_entry_a(&info); | ||
69 | desc->b = LDT_entry_b(&info); | ||
70 | } | ||
71 | if (t == ¤t->thread) | ||
72 | load_TLS(t, cpu); | ||
73 | |||
74 | put_cpu(); | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info) | ||
79 | { | ||
80 | return do_set_thread_area(¤t->thread, u_info); | ||
81 | } | ||
82 | |||
83 | |||
84 | /* | ||
85 | * Get the current Thread-Local Storage area: | ||
86 | */ | ||
87 | |||
88 | #define GET_BASE(desc) ( \ | ||
89 | (((desc)->a >> 16) & 0x0000ffff) | \ | ||
90 | (((desc)->b << 16) & 0x00ff0000) | \ | ||
91 | ( (desc)->b & 0xff000000) ) | ||
92 | |||
93 | #define GET_LIMIT(desc) ( \ | ||
94 | ((desc)->a & 0x0ffff) | \ | ||
95 | ((desc)->b & 0xf0000) ) | ||
96 | |||
97 | #define GET_32BIT(desc) (((desc)->b >> 22) & 1) | ||
98 | #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) | ||
99 | #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) | ||
100 | #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) | ||
101 | #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) | ||
102 | #define GET_USEABLE(desc) (((desc)->b >> 20) & 1) | ||
103 | #define GET_LONGMODE(desc) (((desc)->b >> 21) & 1) | ||
104 | |||
105 | int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info) | ||
106 | { | ||
107 | struct user_desc info; | ||
108 | struct n_desc_struct *desc; | ||
109 | int idx; | ||
110 | |||
111 | if (get_user(idx, &u_info->entry_number)) | ||
112 | return -EFAULT; | ||
113 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
114 | return -EINVAL; | ||
115 | |||
116 | desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; | ||
117 | |||
118 | memset(&info, 0, sizeof(struct user_desc)); | ||
119 | info.entry_number = idx; | ||
120 | info.base_addr = GET_BASE(desc); | ||
121 | info.limit = GET_LIMIT(desc); | ||
122 | info.seg_32bit = GET_32BIT(desc); | ||
123 | info.contents = GET_CONTENTS(desc); | ||
124 | info.read_exec_only = !GET_WRITABLE(desc); | ||
125 | info.limit_in_pages = GET_LIMIT_PAGES(desc); | ||
126 | info.seg_not_present = !GET_PRESENT(desc); | ||
127 | info.useable = GET_USEABLE(desc); | ||
128 | info.lm = GET_LONGMODE(desc); | ||
129 | |||
130 | if (copy_to_user(u_info, &info, sizeof(info))) | ||
131 | return -EFAULT; | ||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info) | ||
136 | { | ||
137 | return do_get_thread_area(¤t->thread, u_info); | ||
138 | } | ||
139 | |||
140 | |||
141 | int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs) | ||
142 | { | ||
143 | struct n_desc_struct *desc; | ||
144 | struct user_desc info; | ||
145 | struct user_desc __user *cp; | ||
146 | int idx; | ||
147 | |||
148 | cp = (void __user *)childregs->rsi; | ||
149 | if (copy_from_user(&info, cp, sizeof(info))) | ||
150 | return -EFAULT; | ||
151 | if (LDT_empty(&info)) | ||
152 | return -EINVAL; | ||
153 | |||
154 | idx = info.entry_number; | ||
155 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | ||
156 | return -EINVAL; | ||
157 | |||
158 | desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN; | ||
159 | desc->a = LDT_entry_a(&info); | ||
160 | desc->b = LDT_entry_b(&info); | ||
161 | |||
162 | return 0; | ||
163 | } | ||
diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S deleted file mode 100644 index 1384367cdbe1..000000000000 --- a/arch/x86_64/ia32/vsyscall-sigreturn.S +++ /dev/null | |||
@@ -1,143 +0,0 @@ | |||
1 | /* | ||
2 | * Common code for the sigreturn entry points on the vsyscall page. | ||
3 | * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80) | ||
4 | * to enter the kernel. | ||
5 | * This file is #include'd by vsyscall-*.S to define them after the | ||
6 | * vsyscall entry point. The addresses we get for these entry points | ||
7 | * by doing ".balign 32" must match in both versions of the page. | ||
8 | */ | ||
9 | |||
10 | .code32 | ||
11 | .section .text.sigreturn,"ax" | ||
12 | .balign 32 | ||
13 | .globl __kernel_sigreturn | ||
14 | .type __kernel_sigreturn,@function | ||
15 | __kernel_sigreturn: | ||
16 | .LSTART_sigreturn: | ||
17 | popl %eax | ||
18 | movl $__NR_ia32_sigreturn, %eax | ||
19 | SYSCALL_ENTER_KERNEL | ||
20 | .LEND_sigreturn: | ||
21 | .size __kernel_sigreturn,.-.LSTART_sigreturn | ||
22 | |||
23 | .section .text.rtsigreturn,"ax" | ||
24 | .balign 32 | ||
25 | .globl __kernel_rt_sigreturn | ||
26 | .type __kernel_rt_sigreturn,@function | ||
27 | __kernel_rt_sigreturn: | ||
28 | .LSTART_rt_sigreturn: | ||
29 | movl $__NR_ia32_rt_sigreturn, %eax | ||
30 | SYSCALL_ENTER_KERNEL | ||
31 | .LEND_rt_sigreturn: | ||
32 | .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn | ||
33 | |||
34 | .section .eh_frame,"a",@progbits | ||
35 | .LSTARTFRAMES: | ||
36 | .long .LENDCIES-.LSTARTCIES | ||
37 | .LSTARTCIES: | ||
38 | .long 0 /* CIE ID */ | ||
39 | .byte 1 /* Version number */ | ||
40 | .string "zRS" /* NUL-terminated augmentation string */ | ||
41 | .uleb128 1 /* Code alignment factor */ | ||
42 | .sleb128 -4 /* Data alignment factor */ | ||
43 | .byte 8 /* Return address register column */ | ||
44 | .uleb128 1 /* Augmentation value length */ | ||
45 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
46 | .byte 0x0c /* DW_CFA_def_cfa */ | ||
47 | .uleb128 4 | ||
48 | .uleb128 4 | ||
49 | .byte 0x88 /* DW_CFA_offset, column 0x8 */ | ||
50 | .uleb128 1 | ||
51 | .align 4 | ||
52 | .LENDCIES: | ||
53 | |||
54 | .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ | ||
55 | .LSTARTFDE2: | ||
56 | .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */ | ||
57 | /* HACK: The dwarf2 unwind routines will subtract 1 from the | ||
58 | return address to get an address in the middle of the | ||
59 | presumed call instruction. Since we didn't get here via | ||
60 | a call, we need to include the nop before the real start | ||
61 | to make up for it. */ | ||
62 | .long .LSTART_sigreturn-1-. /* PC-relative start address */ | ||
63 | .long .LEND_sigreturn-.LSTART_sigreturn+1 | ||
64 | .uleb128 0 /* Augmentation length */ | ||
65 | /* What follows are the instructions for the table generation. | ||
66 | We record the locations of each register saved. This is | ||
67 | complicated by the fact that the "CFA" is always assumed to | ||
68 | be the value of the stack pointer in the caller. This means | ||
69 | that we must define the CFA of this body of code to be the | ||
70 | saved value of the stack pointer in the sigcontext. Which | ||
71 | also means that there is no fixed relation to the other | ||
72 | saved registers, which means that we must use DW_CFA_expression | ||
73 | to compute their addresses. It also means that when we | ||
74 | adjust the stack with the popl, we have to do it all over again. */ | ||
75 | |||
76 | #define do_cfa_expr(offset) \ | ||
77 | .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ | ||
78 | .uleb128 1f-0f; /* length */ \ | ||
79 | 0: .byte 0x74; /* DW_OP_breg4 */ \ | ||
80 | .sleb128 offset; /* offset */ \ | ||
81 | .byte 0x06; /* DW_OP_deref */ \ | ||
82 | 1: | ||
83 | |||
84 | #define do_expr(regno, offset) \ | ||
85 | .byte 0x10; /* DW_CFA_expression */ \ | ||
86 | .uleb128 regno; /* regno */ \ | ||
87 | .uleb128 1f-0f; /* length */ \ | ||
88 | 0: .byte 0x74; /* DW_OP_breg4 */ \ | ||
89 | .sleb128 offset; /* offset */ \ | ||
90 | 1: | ||
91 | |||
92 | do_cfa_expr(IA32_SIGCONTEXT_esp+4) | ||
93 | do_expr(0, IA32_SIGCONTEXT_eax+4) | ||
94 | do_expr(1, IA32_SIGCONTEXT_ecx+4) | ||
95 | do_expr(2, IA32_SIGCONTEXT_edx+4) | ||
96 | do_expr(3, IA32_SIGCONTEXT_ebx+4) | ||
97 | do_expr(5, IA32_SIGCONTEXT_ebp+4) | ||
98 | do_expr(6, IA32_SIGCONTEXT_esi+4) | ||
99 | do_expr(7, IA32_SIGCONTEXT_edi+4) | ||
100 | do_expr(8, IA32_SIGCONTEXT_eip+4) | ||
101 | |||
102 | .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ | ||
103 | |||
104 | do_cfa_expr(IA32_SIGCONTEXT_esp) | ||
105 | do_expr(0, IA32_SIGCONTEXT_eax) | ||
106 | do_expr(1, IA32_SIGCONTEXT_ecx) | ||
107 | do_expr(2, IA32_SIGCONTEXT_edx) | ||
108 | do_expr(3, IA32_SIGCONTEXT_ebx) | ||
109 | do_expr(5, IA32_SIGCONTEXT_ebp) | ||
110 | do_expr(6, IA32_SIGCONTEXT_esi) | ||
111 | do_expr(7, IA32_SIGCONTEXT_edi) | ||
112 | do_expr(8, IA32_SIGCONTEXT_eip) | ||
113 | |||
114 | .align 4 | ||
115 | .LENDFDE2: | ||
116 | |||
117 | .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ | ||
118 | .LSTARTFDE3: | ||
119 | .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */ | ||
120 | /* HACK: See above wrt unwind library assumptions. */ | ||
121 | .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ | ||
122 | .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 | ||
123 | .uleb128 0 /* Augmentation */ | ||
124 | /* What follows are the instructions for the table generation. | ||
125 | We record the locations of each register saved. This is | ||
126 | slightly less complicated than the above, since we don't | ||
127 | modify the stack pointer in the process. */ | ||
128 | |||
129 | do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp) | ||
130 | do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax) | ||
131 | do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx) | ||
132 | do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx) | ||
133 | do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx) | ||
134 | do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp) | ||
135 | do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi) | ||
136 | do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi) | ||
137 | do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip) | ||
138 | |||
139 | .align 4 | ||
140 | .LENDFDE3: | ||
141 | |||
142 | #include "../../i386/kernel/vsyscall-note.S" | ||
143 | |||
diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S deleted file mode 100644 index cf9ef678de3e..000000000000 --- a/arch/x86_64/ia32/vsyscall-syscall.S +++ /dev/null | |||
@@ -1,69 +0,0 @@ | |||
1 | /* | ||
2 | * Code for the vsyscall page. This version uses the syscall instruction. | ||
3 | */ | ||
4 | |||
5 | #include <asm/ia32_unistd.h> | ||
6 | #include <asm/asm-offsets.h> | ||
7 | #include <asm/segment.h> | ||
8 | |||
9 | .code32 | ||
10 | .text | ||
11 | .section .text.vsyscall,"ax" | ||
12 | .globl __kernel_vsyscall | ||
13 | .type __kernel_vsyscall,@function | ||
14 | __kernel_vsyscall: | ||
15 | .LSTART_vsyscall: | ||
16 | push %ebp | ||
17 | .Lpush_ebp: | ||
18 | movl %ecx, %ebp | ||
19 | syscall | ||
20 | movl $__USER32_DS, %ecx | ||
21 | movl %ecx, %ss | ||
22 | movl %ebp, %ecx | ||
23 | popl %ebp | ||
24 | .Lpop_ebp: | ||
25 | ret | ||
26 | .LEND_vsyscall: | ||
27 | .size __kernel_vsyscall,.-.LSTART_vsyscall | ||
28 | |||
29 | .section .eh_frame,"a",@progbits | ||
30 | .LSTARTFRAME: | ||
31 | .long .LENDCIE-.LSTARTCIE | ||
32 | .LSTARTCIE: | ||
33 | .long 0 /* CIE ID */ | ||
34 | .byte 1 /* Version number */ | ||
35 | .string "zR" /* NUL-terminated augmentation string */ | ||
36 | .uleb128 1 /* Code alignment factor */ | ||
37 | .sleb128 -4 /* Data alignment factor */ | ||
38 | .byte 8 /* Return address register column */ | ||
39 | .uleb128 1 /* Augmentation value length */ | ||
40 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
41 | .byte 0x0c /* DW_CFA_def_cfa */ | ||
42 | .uleb128 4 | ||
43 | .uleb128 4 | ||
44 | .byte 0x88 /* DW_CFA_offset, column 0x8 */ | ||
45 | .uleb128 1 | ||
46 | .align 4 | ||
47 | .LENDCIE: | ||
48 | |||
49 | .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ | ||
50 | .LSTARTFDE1: | ||
51 | .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ | ||
52 | .long .LSTART_vsyscall-. /* PC-relative start address */ | ||
53 | .long .LEND_vsyscall-.LSTART_vsyscall | ||
54 | .uleb128 0 /* Augmentation length */ | ||
55 | /* What follows are the instructions for the table generation. | ||
56 | We have to record all changes of the stack pointer. */ | ||
57 | .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ | ||
58 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
59 | .uleb128 8 | ||
60 | .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ | ||
61 | .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ | ||
62 | .byte 0xc5 /* DW_CFA_restore %ebp */ | ||
63 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
64 | .uleb128 4 | ||
65 | .align 4 | ||
66 | .LENDFDE1: | ||
67 | |||
68 | #define SYSCALL_ENTER_KERNEL syscall | ||
69 | #include "vsyscall-sigreturn.S" | ||
diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S deleted file mode 100644 index ae056e553d13..000000000000 --- a/arch/x86_64/ia32/vsyscall-sysenter.S +++ /dev/null | |||
@@ -1,95 +0,0 @@ | |||
1 | /* | ||
2 | * Code for the vsyscall page. This version uses the sysenter instruction. | ||
3 | */ | ||
4 | |||
5 | #include <asm/ia32_unistd.h> | ||
6 | #include <asm/asm-offsets.h> | ||
7 | |||
8 | .code32 | ||
9 | .text | ||
10 | .section .text.vsyscall,"ax" | ||
11 | .globl __kernel_vsyscall | ||
12 | .type __kernel_vsyscall,@function | ||
13 | __kernel_vsyscall: | ||
14 | .LSTART_vsyscall: | ||
15 | push %ecx | ||
16 | .Lpush_ecx: | ||
17 | push %edx | ||
18 | .Lpush_edx: | ||
19 | push %ebp | ||
20 | .Lenter_kernel: | ||
21 | movl %esp,%ebp | ||
22 | sysenter | ||
23 | .space 7,0x90 | ||
24 | jmp .Lenter_kernel | ||
25 | /* 16: System call normal return point is here! */ | ||
26 | pop %ebp | ||
27 | .Lpop_ebp: | ||
28 | pop %edx | ||
29 | .Lpop_edx: | ||
30 | pop %ecx | ||
31 | .Lpop_ecx: | ||
32 | ret | ||
33 | .LEND_vsyscall: | ||
34 | .size __kernel_vsyscall,.-.LSTART_vsyscall | ||
35 | |||
36 | .section .eh_frame,"a",@progbits | ||
37 | .LSTARTFRAME: | ||
38 | .long .LENDCIE-.LSTARTCIE | ||
39 | .LSTARTCIE: | ||
40 | .long 0 /* CIE ID */ | ||
41 | .byte 1 /* Version number */ | ||
42 | .string "zR" /* NUL-terminated augmentation string */ | ||
43 | .uleb128 1 /* Code alignment factor */ | ||
44 | .sleb128 -4 /* Data alignment factor */ | ||
45 | .byte 8 /* Return address register column */ | ||
46 | .uleb128 1 /* Augmentation value length */ | ||
47 | .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ | ||
48 | .byte 0x0c /* DW_CFA_def_cfa */ | ||
49 | .uleb128 4 | ||
50 | .uleb128 4 | ||
51 | .byte 0x88 /* DW_CFA_offset, column 0x8 */ | ||
52 | .uleb128 1 | ||
53 | .align 4 | ||
54 | .LENDCIE: | ||
55 | |||
56 | .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ | ||
57 | .LSTARTFDE1: | ||
58 | .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ | ||
59 | .long .LSTART_vsyscall-. /* PC-relative start address */ | ||
60 | .long .LEND_vsyscall-.LSTART_vsyscall | ||
61 | .uleb128 0 /* Augmentation length */ | ||
62 | /* What follows are the instructions for the table generation. | ||
63 | We have to record all changes of the stack pointer. */ | ||
64 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
65 | .long .Lpush_ecx-.LSTART_vsyscall | ||
66 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
67 | .byte 0x08 /* RA at offset 8 now */ | ||
68 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
69 | .long .Lpush_edx-.Lpush_ecx | ||
70 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
71 | .byte 0x0c /* RA at offset 12 now */ | ||
72 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
73 | .long .Lenter_kernel-.Lpush_edx | ||
74 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
75 | .byte 0x10 /* RA at offset 16 now */ | ||
76 | .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ | ||
77 | /* Finally the epilogue. */ | ||
78 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
79 | .long .Lpop_ebp-.Lenter_kernel | ||
80 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
81 | .byte 0x12 /* RA at offset 12 now */ | ||
82 | .byte 0xc5 /* DW_CFA_restore %ebp */ | ||
83 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
84 | .long .Lpop_edx-.Lpop_ebp | ||
85 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
86 | .byte 0x08 /* RA at offset 8 now */ | ||
87 | .byte 0x04 /* DW_CFA_advance_loc4 */ | ||
88 | .long .Lpop_ecx-.Lpop_edx | ||
89 | .byte 0x0e /* DW_CFA_def_cfa_offset */ | ||
90 | .byte 0x04 /* RA at offset 4 now */ | ||
91 | .align 4 | ||
92 | .LENDFDE1: | ||
93 | |||
94 | #define SYSCALL_ENTER_KERNEL int $0x80 | ||
95 | #include "vsyscall-sigreturn.S" | ||
diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds deleted file mode 100644 index 1dc86ff5bcb9..000000000000 --- a/arch/x86_64/ia32/vsyscall.lds +++ /dev/null | |||
@@ -1,80 +0,0 @@ | |||
1 | /* | ||
2 | * Linker script for vsyscall DSO. The vsyscall page is an ELF shared | ||
3 | * object prelinked to its virtual address. This script controls its layout. | ||
4 | */ | ||
5 | |||
6 | /* This must match <asm/fixmap.h>. */ | ||
7 | VSYSCALL_BASE = 0xffffe000; | ||
8 | |||
9 | SECTIONS | ||
10 | { | ||
11 | . = VSYSCALL_BASE + SIZEOF_HEADERS; | ||
12 | |||
13 | .hash : { *(.hash) } :text | ||
14 | .gnu.hash : { *(.gnu.hash) } | ||
15 | .dynsym : { *(.dynsym) } | ||
16 | .dynstr : { *(.dynstr) } | ||
17 | .gnu.version : { *(.gnu.version) } | ||
18 | .gnu.version_d : { *(.gnu.version_d) } | ||
19 | .gnu.version_r : { *(.gnu.version_r) } | ||
20 | |||
21 | /* This linker script is used both with -r and with -shared. | ||
22 | For the layouts to match, we need to skip more than enough | ||
23 | space for the dynamic symbol table et al. If this amount | ||
24 | is insufficient, ld -shared will barf. Just increase it here. */ | ||
25 | . = VSYSCALL_BASE + 0x400; | ||
26 | |||
27 | .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090 | ||
28 | |||
29 | /* This is an 32bit object and we cannot easily get the offsets | ||
30 | into the 64bit kernel. Just hardcode them here. This assumes | ||
31 | that all the stubs don't need more than 0x100 bytes. */ | ||
32 | . = VSYSCALL_BASE + 0x500; | ||
33 | |||
34 | .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090 | ||
35 | |||
36 | . = VSYSCALL_BASE + 0x600; | ||
37 | |||
38 | .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090 | ||
39 | |||
40 | .note : { *(.note.*) } :text :note | ||
41 | .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr | ||
42 | .eh_frame : { KEEP (*(.eh_frame)) } :text | ||
43 | .dynamic : { *(.dynamic) } :text :dynamic | ||
44 | .useless : { | ||
45 | *(.got.plt) *(.got) | ||
46 | *(.data .data.* .gnu.linkonce.d.*) | ||
47 | *(.dynbss) | ||
48 | *(.bss .bss.* .gnu.linkonce.b.*) | ||
49 | } :text | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * We must supply the ELF program headers explicitly to get just one | ||
54 | * PT_LOAD segment, and set the flags explicitly to make segments read-only. | ||
55 | */ | ||
56 | PHDRS | ||
57 | { | ||
58 | text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ | ||
59 | dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ | ||
60 | note PT_NOTE FLAGS(4); /* PF_R */ | ||
61 | eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * This controls what symbols we export from the DSO. | ||
66 | */ | ||
67 | VERSION | ||
68 | { | ||
69 | LINUX_2.5 { | ||
70 | global: | ||
71 | __kernel_vsyscall; | ||
72 | __kernel_sigreturn; | ||
73 | __kernel_rt_sigreturn; | ||
74 | |||
75 | local: *; | ||
76 | }; | ||
77 | } | ||
78 | |||
79 | /* The ELF entry point can be used to set the AT_SYSINFO value. */ | ||
80 | ENTRY(__kernel_vsyscall); | ||
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile deleted file mode 100644 index ff5d8c9b96d9..000000000000 --- a/arch/x86_64/kernel/Makefile +++ /dev/null | |||
@@ -1,63 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
5 | extra-y := head.o head64.o init_task.o vmlinux.lds | ||
6 | EXTRA_AFLAGS := -traditional | ||
7 | obj-y := process.o signal.o entry.o traps.o irq.o \ | ||
8 | ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ | ||
9 | x8664_ksyms.o i387.o syscall.o vsyscall.o \ | ||
10 | setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ | ||
11 | pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \ | ||
12 | perfctr-watchdog.o | ||
13 | |||
14 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | ||
15 | obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o | ||
16 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o | ||
17 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o | ||
18 | obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ | ||
19 | obj-$(CONFIG_ACPI) += acpi/ | ||
20 | obj-$(CONFIG_X86_MSR) += msr.o | ||
21 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
22 | obj-$(CONFIG_X86_CPUID) += cpuid.o | ||
23 | obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o | ||
24 | obj-y += apic.o nmi.o | ||
25 | obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o | ||
26 | obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o | ||
27 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | ||
28 | obj-$(CONFIG_PM) += suspend.o | ||
29 | obj-$(CONFIG_HIBERNATION) += suspend_asm.o | ||
30 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | ||
31 | obj-$(CONFIG_EARLY_PRINTK) += early_printk.o | ||
32 | obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o | ||
33 | obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o | ||
34 | obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o | ||
35 | obj-$(CONFIG_KPROBES) += kprobes.o | ||
36 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o | ||
37 | obj-$(CONFIG_X86_VSMP) += vsmp.o | ||
38 | obj-$(CONFIG_K8_NB) += k8.o | ||
39 | obj-$(CONFIG_AUDIT) += audit.o | ||
40 | |||
41 | obj-$(CONFIG_MODULES) += module.o | ||
42 | obj-$(CONFIG_PCI) += early-quirks.o | ||
43 | |||
44 | obj-y += topology.o | ||
45 | obj-y += intel_cacheinfo.o | ||
46 | obj-y += addon_cpuid_features.o | ||
47 | obj-y += pcspeaker.o | ||
48 | |||
49 | CFLAGS_vsyscall.o := $(PROFILING) -g0 | ||
50 | |||
51 | therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o | ||
52 | bootflag-y += ../../i386/kernel/bootflag.o | ||
53 | cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o | ||
54 | topology-y += ../../i386/kernel/topology.o | ||
55 | microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o | ||
56 | intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o | ||
57 | addon_cpuid_features-y += ../../i386/kernel/cpu/addon_cpuid_features.o | ||
58 | quirks-y += ../../i386/kernel/quirks.o | ||
59 | i8237-y += ../../i386/kernel/i8237.o | ||
60 | msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o | ||
61 | alternative-y += ../../i386/kernel/alternative.o | ||
62 | pcspeaker-y += ../../i386/kernel/pcspeaker.o | ||
63 | perfctr-watchdog-y += ../../i386/kernel/cpu/perfctr-watchdog.o | ||
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile deleted file mode 100644 index 080b9963f1bc..000000000000 --- a/arch/x86_64/kernel/acpi/Makefile +++ /dev/null | |||
@@ -1,9 +0,0 @@ | |||
1 | obj-y := boot.o | ||
2 | boot-y := ../../../i386/kernel/acpi/boot.o | ||
3 | obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o | ||
4 | |||
5 | ifneq ($(CONFIG_ACPI_PROCESSOR),) | ||
6 | obj-y += processor.o | ||
7 | processor-y := ../../../i386/kernel/acpi/processor.o ../../../i386/kernel/acpi/cstate.o | ||
8 | endif | ||
9 | |||
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c deleted file mode 100644 index 79475d237071..000000000000 --- a/arch/x86_64/kernel/acpi/sleep.c +++ /dev/null | |||
@@ -1,120 +0,0 @@ | |||
1 | /* | ||
2 | * acpi.c - Architecture-Specific Low-Level ACPI Support | ||
3 | * | ||
4 | * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> | ||
5 | * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> | ||
6 | * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> | ||
7 | * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) | ||
8 | * Copyright (C) 2003 Pavel Machek, SuSE Labs | ||
9 | * | ||
10 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License as published by | ||
14 | * the Free Software Foundation; either version 2 of the License, or | ||
15 | * (at your option) any later version. | ||
16 | * | ||
17 | * This program is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
20 | * GNU General Public License for more details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU General Public License | ||
23 | * along with this program; if not, write to the Free Software | ||
24 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
25 | * | ||
26 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
27 | */ | ||
28 | |||
29 | #include <linux/kernel.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <linux/types.h> | ||
32 | #include <linux/stddef.h> | ||
33 | #include <linux/slab.h> | ||
34 | #include <linux/pci.h> | ||
35 | #include <linux/bootmem.h> | ||
36 | #include <linux/acpi.h> | ||
37 | #include <linux/cpumask.h> | ||
38 | |||
39 | #include <asm/mpspec.h> | ||
40 | #include <asm/io.h> | ||
41 | #include <asm/apic.h> | ||
42 | #include <asm/apicdef.h> | ||
43 | #include <asm/page.h> | ||
44 | #include <asm/pgtable.h> | ||
45 | #include <asm/pgalloc.h> | ||
46 | #include <asm/io_apic.h> | ||
47 | #include <asm/proto.h> | ||
48 | #include <asm/tlbflush.h> | ||
49 | |||
50 | /* -------------------------------------------------------------------------- | ||
51 | Low-Level Sleep Support | ||
52 | -------------------------------------------------------------------------- */ | ||
53 | |||
54 | /* address in low memory of the wakeup routine. */ | ||
55 | unsigned long acpi_wakeup_address = 0; | ||
56 | unsigned long acpi_realmode_flags; | ||
57 | extern char wakeup_start, wakeup_end; | ||
58 | |||
59 | extern unsigned long acpi_copy_wakeup_routine(unsigned long); | ||
60 | |||
61 | /** | ||
62 | * acpi_save_state_mem - save kernel state | ||
63 | * | ||
64 | * Create an identity mapped page table and copy the wakeup routine to | ||
65 | * low memory. | ||
66 | */ | ||
67 | int acpi_save_state_mem(void) | ||
68 | { | ||
69 | memcpy((void *)acpi_wakeup_address, &wakeup_start, | ||
70 | &wakeup_end - &wakeup_start); | ||
71 | acpi_copy_wakeup_routine(acpi_wakeup_address); | ||
72 | |||
73 | return 0; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * acpi_restore_state | ||
78 | */ | ||
79 | void acpi_restore_state_mem(void) | ||
80 | { | ||
81 | } | ||
82 | |||
83 | /** | ||
84 | * acpi_reserve_bootmem - do _very_ early ACPI initialisation | ||
85 | * | ||
86 | * We allocate a page in low memory for the wakeup | ||
87 | * routine for when we come back from a sleep state. The | ||
88 | * runtime allocator allows specification of <16M pages, but not | ||
89 | * <1M pages. | ||
90 | */ | ||
91 | void __init acpi_reserve_bootmem(void) | ||
92 | { | ||
93 | acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); | ||
94 | if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) | ||
95 | printk(KERN_CRIT | ||
96 | "ACPI: Wakeup code way too big, will crash on attempt" | ||
97 | " to suspend\n"); | ||
98 | } | ||
99 | |||
100 | static int __init acpi_sleep_setup(char *str) | ||
101 | { | ||
102 | while ((str != NULL) && (*str != '\0')) { | ||
103 | if (strncmp(str, "s3_bios", 7) == 0) | ||
104 | acpi_realmode_flags |= 1; | ||
105 | if (strncmp(str, "s3_mode", 7) == 0) | ||
106 | acpi_realmode_flags |= 2; | ||
107 | if (strncmp(str, "s3_beep", 7) == 0) | ||
108 | acpi_realmode_flags |= 4; | ||
109 | str = strchr(str, ','); | ||
110 | if (str != NULL) | ||
111 | str += strspn(str, ", \t"); | ||
112 | } | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | __setup("acpi_sleep=", acpi_sleep_setup); | ||
117 | |||
118 | void acpi_pci_link_exit(void) | ||
119 | { | ||
120 | } | ||
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S deleted file mode 100644 index a06f2bcabef9..000000000000 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ /dev/null | |||
@@ -1,456 +0,0 @@ | |||
1 | .text | ||
2 | #include <linux/linkage.h> | ||
3 | #include <asm/segment.h> | ||
4 | #include <asm/pgtable.h> | ||
5 | #include <asm/page.h> | ||
6 | #include <asm/msr.h> | ||
7 | |||
8 | # Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 | ||
9 | # | ||
10 | # wakeup_code runs in real mode, and at unknown address (determined at run-time). | ||
11 | # Therefore it must only use relative jumps/calls. | ||
12 | # | ||
13 | # Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled | ||
14 | # | ||
15 | # If physical address of wakeup_code is 0x12345, BIOS should call us with | ||
16 | # cs = 0x1234, eip = 0x05 | ||
17 | # | ||
18 | |||
19 | #define BEEP \ | ||
20 | inb $97, %al; \ | ||
21 | outb %al, $0x80; \ | ||
22 | movb $3, %al; \ | ||
23 | outb %al, $97; \ | ||
24 | outb %al, $0x80; \ | ||
25 | movb $-74, %al; \ | ||
26 | outb %al, $67; \ | ||
27 | outb %al, $0x80; \ | ||
28 | movb $-119, %al; \ | ||
29 | outb %al, $66; \ | ||
30 | outb %al, $0x80; \ | ||
31 | movb $15, %al; \ | ||
32 | outb %al, $66; | ||
33 | |||
34 | |||
35 | ALIGN | ||
36 | .align 16 | ||
37 | ENTRY(wakeup_start) | ||
38 | wakeup_code: | ||
39 | wakeup_code_start = . | ||
40 | .code16 | ||
41 | |||
42 | # Running in *copy* of this code, somewhere in low 1MB. | ||
43 | |||
44 | movb $0xa1, %al ; outb %al, $0x80 | ||
45 | cli | ||
46 | cld | ||
47 | # setup data segment | ||
48 | movw %cs, %ax | ||
49 | movw %ax, %ds # Make ds:0 point to wakeup_start | ||
50 | movw %ax, %ss | ||
51 | |||
52 | # Data segment must be set up before we can see whether to beep. | ||
53 | testl $4, realmode_flags - wakeup_code | ||
54 | jz 1f | ||
55 | BEEP | ||
56 | 1: | ||
57 | |||
58 | # Private stack is needed for ASUS board | ||
59 | mov $(wakeup_stack - wakeup_code), %sp | ||
60 | |||
61 | pushl $0 # Kill any dangerous flags | ||
62 | popfl | ||
63 | |||
64 | movl real_magic - wakeup_code, %eax | ||
65 | cmpl $0x12345678, %eax | ||
66 | jne bogus_real_magic | ||
67 | |||
68 | call verify_cpu # Verify the cpu supports long | ||
69 | # mode | ||
70 | testl %eax, %eax | ||
71 | jnz no_longmode | ||
72 | |||
73 | testl $1, realmode_flags - wakeup_code | ||
74 | jz 1f | ||
75 | lcall $0xc000,$3 | ||
76 | movw %cs, %ax | ||
77 | movw %ax, %ds # Bios might have played with that | ||
78 | movw %ax, %ss | ||
79 | 1: | ||
80 | |||
81 | testl $2, realmode_flags - wakeup_code | ||
82 | jz 1f | ||
83 | mov video_mode - wakeup_code, %ax | ||
84 | call mode_set | ||
85 | 1: | ||
86 | |||
87 | movw $0xb800, %ax | ||
88 | movw %ax,%fs | ||
89 | movw $0x0e00 + 'L', %fs:(0x10) | ||
90 | |||
91 | movb $0xa2, %al ; outb %al, $0x80 | ||
92 | |||
93 | mov %ds, %ax # Find 32bit wakeup_code addr | ||
94 | movzx %ax, %esi # (Convert %ds:gdt to a liner ptr) | ||
95 | shll $4, %esi | ||
96 | # Fix up the vectors | ||
97 | addl %esi, wakeup_32_vector - wakeup_code | ||
98 | addl %esi, wakeup_long64_vector - wakeup_code | ||
99 | addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer | ||
100 | |||
101 | lidtl %ds:idt_48a - wakeup_code | ||
102 | lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is | ||
103 | # appropriate | ||
104 | |||
105 | movl $1, %eax # protected mode (PE) bit | ||
106 | lmsw %ax # This is it! | ||
107 | jmp 1f | ||
108 | 1: | ||
109 | |||
110 | ljmpl *(wakeup_32_vector - wakeup_code) | ||
111 | |||
112 | .balign 4 | ||
113 | wakeup_32_vector: | ||
114 | .long wakeup_32 - wakeup_code | ||
115 | .word __KERNEL32_CS, 0 | ||
116 | |||
117 | .code32 | ||
118 | wakeup_32: | ||
119 | # Running in this code, but at low address; paging is not yet turned on. | ||
120 | movb $0xa5, %al ; outb %al, $0x80 | ||
121 | |||
122 | movl $__KERNEL_DS, %eax | ||
123 | movl %eax, %ds | ||
124 | |||
125 | movw $0x0e00 + 'i', %ds:(0xb8012) | ||
126 | movb $0xa8, %al ; outb %al, $0x80; | ||
127 | |||
128 | /* | ||
129 | * Prepare for entering 64bits mode | ||
130 | */ | ||
131 | |||
132 | /* Enable PAE */ | ||
133 | xorl %eax, %eax | ||
134 | btsl $5, %eax | ||
135 | movl %eax, %cr4 | ||
136 | |||
137 | /* Setup early boot stage 4 level pagetables */ | ||
138 | leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax | ||
139 | movl %eax, %cr3 | ||
140 | |||
141 | /* Check if nx is implemented */ | ||
142 | movl $0x80000001, %eax | ||
143 | cpuid | ||
144 | movl %edx,%edi | ||
145 | |||
146 | /* Enable Long Mode */ | ||
147 | xorl %eax, %eax | ||
148 | btsl $_EFER_LME, %eax | ||
149 | |||
150 | /* No Execute supported? */ | ||
151 | btl $20,%edi | ||
152 | jnc 1f | ||
153 | btsl $_EFER_NX, %eax | ||
154 | |||
155 | /* Make changes effective */ | ||
156 | 1: movl $MSR_EFER, %ecx | ||
157 | xorl %edx, %edx | ||
158 | wrmsr | ||
159 | |||
160 | xorl %eax, %eax | ||
161 | btsl $31, %eax /* Enable paging and in turn activate Long Mode */ | ||
162 | btsl $0, %eax /* Enable protected mode */ | ||
163 | |||
164 | /* Make changes effective */ | ||
165 | movl %eax, %cr0 | ||
166 | |||
167 | /* At this point: | ||
168 | CR4.PAE must be 1 | ||
169 | CS.L must be 0 | ||
170 | CR3 must point to PML4 | ||
171 | Next instruction must be a branch | ||
172 | This must be on identity-mapped page | ||
173 | */ | ||
174 | /* | ||
175 | * At this point we're in long mode but in 32bit compatibility mode | ||
176 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn | ||
177 | * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load | ||
178 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
179 | */ | ||
180 | |||
181 | /* Finally jump in 64bit mode */ | ||
182 | ljmp *(wakeup_long64_vector - wakeup_code)(%esi) | ||
183 | |||
184 | .balign 4 | ||
185 | wakeup_long64_vector: | ||
186 | .long wakeup_long64 - wakeup_code | ||
187 | .word __KERNEL_CS, 0 | ||
188 | |||
189 | .code64 | ||
190 | |||
191 | /* Hooray, we are in Long 64-bit mode (but still running in | ||
192 | * low memory) | ||
193 | */ | ||
194 | wakeup_long64: | ||
195 | /* | ||
196 | * We must switch to a new descriptor in kernel space for the GDT | ||
197 | * because soon the kernel won't have access anymore to the userspace | ||
198 | * addresses where we're currently running on. We have to do that here | ||
199 | * because in 32bit we couldn't load a 64bit linear address. | ||
200 | */ | ||
201 | lgdt cpu_gdt_descr | ||
202 | |||
203 | movw $0x0e00 + 'n', %ds:(0xb8014) | ||
204 | movb $0xa9, %al ; outb %al, $0x80 | ||
205 | |||
206 | movq saved_magic, %rax | ||
207 | movq $0x123456789abcdef0, %rdx | ||
208 | cmpq %rdx, %rax | ||
209 | jne bogus_64_magic | ||
210 | |||
211 | movw $0x0e00 + 'u', %ds:(0xb8016) | ||
212 | |||
213 | nop | ||
214 | nop | ||
215 | movw $__KERNEL_DS, %ax | ||
216 | movw %ax, %ss | ||
217 | movw %ax, %ds | ||
218 | movw %ax, %es | ||
219 | movw %ax, %fs | ||
220 | movw %ax, %gs | ||
221 | movq saved_rsp, %rsp | ||
222 | |||
223 | movw $0x0e00 + 'x', %ds:(0xb8018) | ||
224 | movq saved_rbx, %rbx | ||
225 | movq saved_rdi, %rdi | ||
226 | movq saved_rsi, %rsi | ||
227 | movq saved_rbp, %rbp | ||
228 | |||
229 | movw $0x0e00 + '!', %ds:(0xb801a) | ||
230 | movq saved_rip, %rax | ||
231 | jmp *%rax | ||
232 | |||
233 | .code32 | ||
234 | |||
235 | .align 64 | ||
236 | gdta: | ||
237 | /* Its good to keep gdt in sync with one in trampoline.S */ | ||
238 | .word 0, 0, 0, 0 # dummy | ||
239 | /* ??? Why I need the accessed bit set in order for this to work? */ | ||
240 | .quad 0x00cf9b000000ffff # __KERNEL32_CS | ||
241 | .quad 0x00af9b000000ffff # __KERNEL_CS | ||
242 | .quad 0x00cf93000000ffff # __KERNEL_DS | ||
243 | |||
244 | idt_48a: | ||
245 | .word 0 # idt limit = 0 | ||
246 | .word 0, 0 # idt base = 0L | ||
247 | |||
248 | gdt_48a: | ||
249 | .word 0x800 # gdt limit=2048, | ||
250 | # 256 GDT entries | ||
251 | .long gdta - wakeup_code # gdt base (relocated in later) | ||
252 | |||
253 | real_magic: .quad 0 | ||
254 | video_mode: .quad 0 | ||
255 | realmode_flags: .quad 0 | ||
256 | |||
257 | .code16 | ||
258 | bogus_real_magic: | ||
259 | movb $0xba,%al ; outb %al,$0x80 | ||
260 | jmp bogus_real_magic | ||
261 | |||
262 | .code64 | ||
263 | bogus_64_magic: | ||
264 | movb $0xb3,%al ; outb %al,$0x80 | ||
265 | jmp bogus_64_magic | ||
266 | |||
267 | .code16 | ||
268 | no_longmode: | ||
269 | movb $0xbc,%al ; outb %al,$0x80 | ||
270 | jmp no_longmode | ||
271 | |||
272 | #include "../verify_cpu.S" | ||
273 | |||
274 | /* This code uses an extended set of video mode numbers. These include: | ||
275 | * Aliases for standard modes | ||
276 | * NORMAL_VGA (-1) | ||
277 | * EXTENDED_VGA (-2) | ||
278 | * ASK_VGA (-3) | ||
279 | * Video modes numbered by menu position -- NOT RECOMMENDED because of lack | ||
280 | * of compatibility when extending the table. These are between 0x00 and 0xff. | ||
281 | */ | ||
282 | #define VIDEO_FIRST_MENU 0x0000 | ||
283 | |||
284 | /* Standard BIOS video modes (BIOS number + 0x0100) */ | ||
285 | #define VIDEO_FIRST_BIOS 0x0100 | ||
286 | |||
287 | /* VESA BIOS video modes (VESA number + 0x0200) */ | ||
288 | #define VIDEO_FIRST_VESA 0x0200 | ||
289 | |||
290 | /* Video7 special modes (BIOS number + 0x0900) */ | ||
291 | #define VIDEO_FIRST_V7 0x0900 | ||
292 | |||
293 | # Setting of user mode (AX=mode ID) => CF=success | ||
294 | |||
295 | # For now, we only handle VESA modes (0x0200..0x03ff). To handle other | ||
296 | # modes, we should probably compile in the video code from the boot | ||
297 | # directory. | ||
298 | .code16 | ||
299 | mode_set: | ||
300 | movw %ax, %bx | ||
301 | subb $VIDEO_FIRST_VESA>>8, %bh | ||
302 | cmpb $2, %bh | ||
303 | jb check_vesa | ||
304 | |||
305 | setbad: | ||
306 | clc | ||
307 | ret | ||
308 | |||
309 | check_vesa: | ||
310 | orw $0x4000, %bx # Use linear frame buffer | ||
311 | movw $0x4f02, %ax # VESA BIOS mode set call | ||
312 | int $0x10 | ||
313 | cmpw $0x004f, %ax # AL=4f if implemented | ||
314 | jnz setbad # AH=0 if OK | ||
315 | |||
316 | stc | ||
317 | ret | ||
318 | |||
319 | wakeup_stack_begin: # Stack grows down | ||
320 | |||
321 | .org 0xff0 | ||
322 | wakeup_stack: # Just below end of page | ||
323 | |||
324 | .org 0x1000 | ||
325 | ENTRY(wakeup_level4_pgt) | ||
326 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
327 | .fill 510,8,0 | ||
328 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | ||
329 | .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
330 | |||
331 | ENTRY(wakeup_end) | ||
332 | |||
333 | ## | ||
334 | # acpi_copy_wakeup_routine | ||
335 | # | ||
336 | # Copy the above routine to low memory. | ||
337 | # | ||
338 | # Parameters: | ||
339 | # %rdi: place to copy wakeup routine to | ||
340 | # | ||
341 | # Returned address is location of code in low memory (past data and stack) | ||
342 | # | ||
343 | .code64 | ||
344 | ENTRY(acpi_copy_wakeup_routine) | ||
345 | pushq %rax | ||
346 | pushq %rdx | ||
347 | |||
348 | movl saved_video_mode, %edx | ||
349 | movl %edx, video_mode - wakeup_start (,%rdi) | ||
350 | movl acpi_realmode_flags, %edx | ||
351 | movl %edx, realmode_flags - wakeup_start (,%rdi) | ||
352 | movq $0x12345678, real_magic - wakeup_start (,%rdi) | ||
353 | movq $0x123456789abcdef0, %rdx | ||
354 | movq %rdx, saved_magic | ||
355 | |||
356 | movq saved_magic, %rax | ||
357 | movq $0x123456789abcdef0, %rdx | ||
358 | cmpq %rdx, %rax | ||
359 | jne bogus_64_magic | ||
360 | |||
361 | # restore the regs we used | ||
362 | popq %rdx | ||
363 | popq %rax | ||
364 | ENTRY(do_suspend_lowlevel_s4bios) | ||
365 | ret | ||
366 | |||
367 | .align 2 | ||
368 | .p2align 4,,15 | ||
369 | .globl do_suspend_lowlevel | ||
370 | .type do_suspend_lowlevel,@function | ||
371 | do_suspend_lowlevel: | ||
372 | .LFB5: | ||
373 | subq $8, %rsp | ||
374 | xorl %eax, %eax | ||
375 | call save_processor_state | ||
376 | |||
377 | movq %rsp, saved_context_esp(%rip) | ||
378 | movq %rax, saved_context_eax(%rip) | ||
379 | movq %rbx, saved_context_ebx(%rip) | ||
380 | movq %rcx, saved_context_ecx(%rip) | ||
381 | movq %rdx, saved_context_edx(%rip) | ||
382 | movq %rbp, saved_context_ebp(%rip) | ||
383 | movq %rsi, saved_context_esi(%rip) | ||
384 | movq %rdi, saved_context_edi(%rip) | ||
385 | movq %r8, saved_context_r08(%rip) | ||
386 | movq %r9, saved_context_r09(%rip) | ||
387 | movq %r10, saved_context_r10(%rip) | ||
388 | movq %r11, saved_context_r11(%rip) | ||
389 | movq %r12, saved_context_r12(%rip) | ||
390 | movq %r13, saved_context_r13(%rip) | ||
391 | movq %r14, saved_context_r14(%rip) | ||
392 | movq %r15, saved_context_r15(%rip) | ||
393 | pushfq ; popq saved_context_eflags(%rip) | ||
394 | |||
395 | movq $.L97, saved_rip(%rip) | ||
396 | |||
397 | movq %rsp,saved_rsp | ||
398 | movq %rbp,saved_rbp | ||
399 | movq %rbx,saved_rbx | ||
400 | movq %rdi,saved_rdi | ||
401 | movq %rsi,saved_rsi | ||
402 | |||
403 | addq $8, %rsp | ||
404 | movl $3, %edi | ||
405 | xorl %eax, %eax | ||
406 | jmp acpi_enter_sleep_state | ||
407 | .L97: | ||
408 | .p2align 4,,7 | ||
409 | .L99: | ||
410 | .align 4 | ||
411 | movl $24, %eax | ||
412 | movw %ax, %ds | ||
413 | movq saved_context+58(%rip), %rax | ||
414 | movq %rax, %cr4 | ||
415 | movq saved_context+50(%rip), %rax | ||
416 | movq %rax, %cr3 | ||
417 | movq saved_context+42(%rip), %rax | ||
418 | movq %rax, %cr2 | ||
419 | movq saved_context+34(%rip), %rax | ||
420 | movq %rax, %cr0 | ||
421 | pushq saved_context_eflags(%rip) ; popfq | ||
422 | movq saved_context_esp(%rip), %rsp | ||
423 | movq saved_context_ebp(%rip), %rbp | ||
424 | movq saved_context_eax(%rip), %rax | ||
425 | movq saved_context_ebx(%rip), %rbx | ||
426 | movq saved_context_ecx(%rip), %rcx | ||
427 | movq saved_context_edx(%rip), %rdx | ||
428 | movq saved_context_esi(%rip), %rsi | ||
429 | movq saved_context_edi(%rip), %rdi | ||
430 | movq saved_context_r08(%rip), %r8 | ||
431 | movq saved_context_r09(%rip), %r9 | ||
432 | movq saved_context_r10(%rip), %r10 | ||
433 | movq saved_context_r11(%rip), %r11 | ||
434 | movq saved_context_r12(%rip), %r12 | ||
435 | movq saved_context_r13(%rip), %r13 | ||
436 | movq saved_context_r14(%rip), %r14 | ||
437 | movq saved_context_r15(%rip), %r15 | ||
438 | |||
439 | xorl %eax, %eax | ||
440 | addq $8, %rsp | ||
441 | jmp restore_processor_state | ||
442 | .LFE5: | ||
443 | .Lfe5: | ||
444 | .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel | ||
445 | |||
446 | .data | ||
447 | ALIGN | ||
448 | ENTRY(saved_rbp) .quad 0 | ||
449 | ENTRY(saved_rsi) .quad 0 | ||
450 | ENTRY(saved_rdi) .quad 0 | ||
451 | ENTRY(saved_rbx) .quad 0 | ||
452 | |||
453 | ENTRY(saved_rip) .quad 0 | ||
454 | ENTRY(saved_rsp) .quad 0 | ||
455 | |||
456 | ENTRY(saved_magic) .quad 0 | ||
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c deleted file mode 100644 index 8f681cae7bf7..000000000000 --- a/arch/x86_64/kernel/aperture.c +++ /dev/null | |||
@@ -1,298 +0,0 @@ | |||
1 | /* | ||
2 | * Firmware replacement code. | ||
3 | * | ||
4 | * Work around broken BIOSes that don't set an aperture or only set the | ||
5 | * aperture in the AGP bridge. | ||
6 | * If all fails map the aperture over some low memory. This is cheaper than | ||
7 | * doing bounce buffering. The memory is lost. This is done at early boot | ||
8 | * because only the bootmem allocator can allocate 32+MB. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | */ | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/types.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/bootmem.h> | ||
16 | #include <linux/mmzone.h> | ||
17 | #include <linux/pci_ids.h> | ||
18 | #include <linux/pci.h> | ||
19 | #include <linux/bitops.h> | ||
20 | #include <linux/ioport.h> | ||
21 | #include <asm/e820.h> | ||
22 | #include <asm/io.h> | ||
23 | #include <asm/iommu.h> | ||
24 | #include <asm/pci-direct.h> | ||
25 | #include <asm/dma.h> | ||
26 | #include <asm/k8.h> | ||
27 | |||
28 | int iommu_aperture; | ||
29 | int iommu_aperture_disabled __initdata = 0; | ||
30 | int iommu_aperture_allowed __initdata = 0; | ||
31 | |||
32 | int fallback_aper_order __initdata = 1; /* 64MB */ | ||
33 | int fallback_aper_force __initdata = 0; | ||
34 | |||
35 | int fix_aperture __initdata = 1; | ||
36 | |||
37 | static struct resource gart_resource = { | ||
38 | .name = "GART", | ||
39 | .flags = IORESOURCE_MEM, | ||
40 | }; | ||
41 | |||
42 | static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) | ||
43 | { | ||
44 | gart_resource.start = aper_base; | ||
45 | gart_resource.end = aper_base + aper_size - 1; | ||
46 | insert_resource(&iomem_resource, &gart_resource); | ||
47 | } | ||
48 | |||
49 | /* This code runs before the PCI subsystem is initialized, so just | ||
50 | access the northbridge directly. */ | ||
51 | |||
52 | static u32 __init allocate_aperture(void) | ||
53 | { | ||
54 | u32 aper_size; | ||
55 | void *p; | ||
56 | |||
57 | if (fallback_aper_order > 7) | ||
58 | fallback_aper_order = 7; | ||
59 | aper_size = (32 * 1024 * 1024) << fallback_aper_order; | ||
60 | |||
61 | /* | ||
62 | * Aperture has to be naturally aligned. This means an 2GB aperture won't | ||
63 | * have much chance of finding a place in the lower 4GB of memory. | ||
64 | * Unfortunately we cannot move it up because that would make the | ||
65 | * IOMMU useless. | ||
66 | */ | ||
67 | p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); | ||
68 | if (!p || __pa(p)+aper_size > 0xffffffff) { | ||
69 | printk("Cannot allocate aperture memory hole (%p,%uK)\n", | ||
70 | p, aper_size>>10); | ||
71 | if (p) | ||
72 | free_bootmem(__pa(p), aper_size); | ||
73 | return 0; | ||
74 | } | ||
75 | printk("Mapping aperture over %d KB of RAM @ %lx\n", | ||
76 | aper_size >> 10, __pa(p)); | ||
77 | insert_aperture_resource((u32)__pa(p), aper_size); | ||
78 | return (u32)__pa(p); | ||
79 | } | ||
80 | |||
81 | static int __init aperture_valid(u64 aper_base, u32 aper_size) | ||
82 | { | ||
83 | if (!aper_base) | ||
84 | return 0; | ||
85 | if (aper_size < 64*1024*1024) { | ||
86 | printk("Aperture too small (%d MB)\n", aper_size>>20); | ||
87 | return 0; | ||
88 | } | ||
89 | if (aper_base + aper_size > 0x100000000UL) { | ||
90 | printk("Aperture beyond 4GB. Ignoring.\n"); | ||
91 | return 0; | ||
92 | } | ||
93 | if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { | ||
94 | printk("Aperture pointing to e820 RAM. Ignoring.\n"); | ||
95 | return 0; | ||
96 | } | ||
97 | return 1; | ||
98 | } | ||
99 | |||
100 | /* Find a PCI capability */ | ||
101 | static __u32 __init find_cap(int num, int slot, int func, int cap) | ||
102 | { | ||
103 | u8 pos; | ||
104 | int bytes; | ||
105 | if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) | ||
106 | return 0; | ||
107 | pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); | ||
108 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { | ||
109 | u8 id; | ||
110 | pos &= ~3; | ||
111 | id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); | ||
112 | if (id == 0xff) | ||
113 | break; | ||
114 | if (id == cap) | ||
115 | return pos; | ||
116 | pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); | ||
117 | } | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | /* Read a standard AGPv3 bridge header */ | ||
122 | static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) | ||
123 | { | ||
124 | u32 apsize; | ||
125 | u32 apsizereg; | ||
126 | int nbits; | ||
127 | u32 aper_low, aper_hi; | ||
128 | u64 aper; | ||
129 | |||
130 | printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); | ||
131 | apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); | ||
132 | if (apsizereg == 0xffffffff) { | ||
133 | printk("APSIZE in AGP bridge unreadable\n"); | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | apsize = apsizereg & 0xfff; | ||
138 | /* Some BIOS use weird encodings not in the AGPv3 table. */ | ||
139 | if (apsize & 0xff) | ||
140 | apsize |= 0xf00; | ||
141 | nbits = hweight16(apsize); | ||
142 | *order = 7 - nbits; | ||
143 | if ((int)*order < 0) /* < 32MB */ | ||
144 | *order = 0; | ||
145 | |||
146 | aper_low = read_pci_config(num,slot,func, 0x10); | ||
147 | aper_hi = read_pci_config(num,slot,func,0x14); | ||
148 | aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); | ||
149 | |||
150 | printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", | ||
151 | aper, 32 << *order, apsizereg); | ||
152 | |||
153 | if (!aperture_valid(aper, (32*1024*1024) << *order)) | ||
154 | return 0; | ||
155 | return (u32)aper; | ||
156 | } | ||
157 | |||
158 | /* Look for an AGP bridge. Windows only expects the aperture in the | ||
159 | AGP bridge and some BIOS forget to initialize the Northbridge too. | ||
160 | Work around this here. | ||
161 | |||
162 | Do an PCI bus scan by hand because we're running before the PCI | ||
163 | subsystem. | ||
164 | |||
165 | All K8 AGP bridges are AGPv3 compliant, so we can do this scan | ||
166 | generically. It's probably overkill to always scan all slots because | ||
167 | the AGP bridges should be always an own bus on the HT hierarchy, | ||
168 | but do it here for future safety. */ | ||
169 | static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) | ||
170 | { | ||
171 | int num, slot, func; | ||
172 | |||
173 | /* Poor man's PCI discovery */ | ||
174 | for (num = 0; num < 256; num++) { | ||
175 | for (slot = 0; slot < 32; slot++) { | ||
176 | for (func = 0; func < 8; func++) { | ||
177 | u32 class, cap; | ||
178 | u8 type; | ||
179 | class = read_pci_config(num,slot,func, | ||
180 | PCI_CLASS_REVISION); | ||
181 | if (class == 0xffffffff) | ||
182 | break; | ||
183 | |||
184 | switch (class >> 16) { | ||
185 | case PCI_CLASS_BRIDGE_HOST: | ||
186 | case PCI_CLASS_BRIDGE_OTHER: /* needed? */ | ||
187 | /* AGP bridge? */ | ||
188 | cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); | ||
189 | if (!cap) | ||
190 | break; | ||
191 | *valid_agp = 1; | ||
192 | return read_agp(num,slot,func,cap,order); | ||
193 | } | ||
194 | |||
195 | /* No multi-function device? */ | ||
196 | type = read_pci_config_byte(num,slot,func, | ||
197 | PCI_HEADER_TYPE); | ||
198 | if (!(type & 0x80)) | ||
199 | break; | ||
200 | } | ||
201 | } | ||
202 | } | ||
203 | printk("No AGP bridge found\n"); | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | void __init iommu_hole_init(void) | ||
208 | { | ||
209 | int fix, num; | ||
210 | u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; | ||
211 | u64 aper_base, last_aper_base = 0; | ||
212 | int valid_agp = 0; | ||
213 | |||
214 | if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) | ||
215 | return; | ||
216 | |||
217 | printk(KERN_INFO "Checking aperture...\n"); | ||
218 | |||
219 | fix = 0; | ||
220 | for (num = 24; num < 32; num++) { | ||
221 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | ||
222 | continue; | ||
223 | |||
224 | iommu_detected = 1; | ||
225 | iommu_aperture = 1; | ||
226 | |||
227 | aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; | ||
228 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
229 | aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; | ||
230 | aper_base <<= 25; | ||
231 | |||
232 | printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, | ||
233 | aper_base, aper_size>>20); | ||
234 | |||
235 | if (!aperture_valid(aper_base, aper_size)) { | ||
236 | fix = 1; | ||
237 | break; | ||
238 | } | ||
239 | |||
240 | if ((last_aper_order && aper_order != last_aper_order) || | ||
241 | (last_aper_base && aper_base != last_aper_base)) { | ||
242 | fix = 1; | ||
243 | break; | ||
244 | } | ||
245 | last_aper_order = aper_order; | ||
246 | last_aper_base = aper_base; | ||
247 | } | ||
248 | |||
249 | if (!fix && !fallback_aper_force) { | ||
250 | if (last_aper_base) { | ||
251 | unsigned long n = (32 * 1024 * 1024) << last_aper_order; | ||
252 | insert_aperture_resource((u32)last_aper_base, n); | ||
253 | } | ||
254 | return; | ||
255 | } | ||
256 | |||
257 | if (!fallback_aper_force) | ||
258 | aper_alloc = search_agp_bridge(&aper_order, &valid_agp); | ||
259 | |||
260 | if (aper_alloc) { | ||
261 | /* Got the aperture from the AGP bridge */ | ||
262 | } else if (swiotlb && !valid_agp) { | ||
263 | /* Do nothing */ | ||
264 | } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || | ||
265 | force_iommu || | ||
266 | valid_agp || | ||
267 | fallback_aper_force) { | ||
268 | printk("Your BIOS doesn't leave a aperture memory hole\n"); | ||
269 | printk("Please enable the IOMMU option in the BIOS setup\n"); | ||
270 | printk("This costs you %d MB of RAM\n", | ||
271 | 32 << fallback_aper_order); | ||
272 | |||
273 | aper_order = fallback_aper_order; | ||
274 | aper_alloc = allocate_aperture(); | ||
275 | if (!aper_alloc) { | ||
276 | /* Could disable AGP and IOMMU here, but it's probably | ||
277 | not worth it. But the later users cannot deal with | ||
278 | bad apertures and turning on the aperture over memory | ||
279 | causes very strange problems, so it's better to | ||
280 | panic early. */ | ||
281 | panic("Not enough memory for aperture"); | ||
282 | } | ||
283 | } else { | ||
284 | return; | ||
285 | } | ||
286 | |||
287 | /* Fix up the north bridges */ | ||
288 | for (num = 24; num < 32; num++) { | ||
289 | if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) | ||
290 | continue; | ||
291 | |||
292 | /* Don't enable translation yet. That is done later. | ||
293 | Assume this BIOS didn't initialise the GART so | ||
294 | just overwrite all previous bits */ | ||
295 | write_pci_config(0, num, 3, 0x90, aper_order<<1); | ||
296 | write_pci_config(0, num, 3, 0x94, aper_alloc>>25); | ||
297 | } | ||
298 | } | ||
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c deleted file mode 100644 index 925758dbca0c..000000000000 --- a/arch/x86_64/kernel/apic.c +++ /dev/null | |||
@@ -1,1253 +0,0 @@ | |||
1 | /* | ||
2 | * Local APIC handling, local APIC timers | ||
3 | * | ||
4 | * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes | ||
7 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
8 | * thanks to Eric Gilmore | ||
9 | * and Rolf G. Tews | ||
10 | * for testing these extensively. | ||
11 | * Maciej W. Rozycki : Various updates and fixes. | ||
12 | * Mikael Pettersson : Power Management for UP-APIC. | ||
13 | * Pavel Machek and | ||
14 | * Mikael Pettersson : PM converted to driver model. | ||
15 | */ | ||
16 | |||
17 | #include <linux/init.h> | ||
18 | |||
19 | #include <linux/mm.h> | ||
20 | #include <linux/delay.h> | ||
21 | #include <linux/bootmem.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | #include <linux/mc146818rtc.h> | ||
24 | #include <linux/kernel_stat.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/ioport.h> | ||
28 | |||
29 | #include <asm/atomic.h> | ||
30 | #include <asm/smp.h> | ||
31 | #include <asm/mtrr.h> | ||
32 | #include <asm/mpspec.h> | ||
33 | #include <asm/pgalloc.h> | ||
34 | #include <asm/mach_apic.h> | ||
35 | #include <asm/nmi.h> | ||
36 | #include <asm/idle.h> | ||
37 | #include <asm/proto.h> | ||
38 | #include <asm/timex.h> | ||
39 | #include <asm/hpet.h> | ||
40 | #include <asm/apic.h> | ||
41 | |||
42 | int apic_mapped; | ||
43 | int apic_verbosity; | ||
44 | int apic_runs_main_timer; | ||
45 | int apic_calibrate_pmtmr __initdata; | ||
46 | |||
47 | int disable_apic_timer __initdata; | ||
48 | |||
49 | /* Local APIC timer works in C2? */ | ||
50 | int local_apic_timer_c2_ok; | ||
51 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); | ||
52 | |||
53 | static struct resource *ioapic_resources; | ||
54 | static struct resource lapic_resource = { | ||
55 | .name = "Local APIC", | ||
56 | .flags = IORESOURCE_MEM | IORESOURCE_BUSY, | ||
57 | }; | ||
58 | |||
59 | /* | ||
60 | * cpu_mask that denotes the CPUs that needs timer interrupt coming in as | ||
61 | * IPIs in place of local APIC timers | ||
62 | */ | ||
63 | static cpumask_t timer_interrupt_broadcast_ipi_mask; | ||
64 | |||
65 | /* Using APIC to generate smp_local_timer_interrupt? */ | ||
66 | int using_apic_timer __read_mostly = 0; | ||
67 | |||
68 | static void apic_pm_activate(void); | ||
69 | |||
70 | void apic_wait_icr_idle(void) | ||
71 | { | ||
72 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) | ||
73 | cpu_relax(); | ||
74 | } | ||
75 | |||
76 | unsigned int safe_apic_wait_icr_idle(void) | ||
77 | { | ||
78 | unsigned int send_status; | ||
79 | int timeout; | ||
80 | |||
81 | timeout = 0; | ||
82 | do { | ||
83 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
84 | if (!send_status) | ||
85 | break; | ||
86 | udelay(100); | ||
87 | } while (timeout++ < 1000); | ||
88 | |||
89 | return send_status; | ||
90 | } | ||
91 | |||
92 | void enable_NMI_through_LVT0 (void * dummy) | ||
93 | { | ||
94 | unsigned int v; | ||
95 | |||
96 | /* unmask and set to NMI */ | ||
97 | v = APIC_DM_NMI; | ||
98 | apic_write(APIC_LVT0, v); | ||
99 | } | ||
100 | |||
101 | int get_maxlvt(void) | ||
102 | { | ||
103 | unsigned int v, maxlvt; | ||
104 | |||
105 | v = apic_read(APIC_LVR); | ||
106 | maxlvt = GET_APIC_MAXLVT(v); | ||
107 | return maxlvt; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
112 | * each architecture has to answer this themselves. | ||
113 | */ | ||
114 | void ack_bad_irq(unsigned int irq) | ||
115 | { | ||
116 | printk("unexpected IRQ trap at vector %02x\n", irq); | ||
117 | /* | ||
118 | * Currently unexpected vectors happen only on SMP and APIC. | ||
119 | * We _must_ ack these because every local APIC has only N | ||
120 | * irq slots per priority level, and a 'hanging, unacked' IRQ | ||
121 | * holds up an irq slot - in excessive cases (when multiple | ||
122 | * unexpected vectors occur) that might lock up the APIC | ||
123 | * completely. | ||
124 | * But don't ack when the APIC is disabled. -AK | ||
125 | */ | ||
126 | if (!disable_apic) | ||
127 | ack_APIC_irq(); | ||
128 | } | ||
129 | |||
130 | void clear_local_APIC(void) | ||
131 | { | ||
132 | int maxlvt; | ||
133 | unsigned int v; | ||
134 | |||
135 | maxlvt = get_maxlvt(); | ||
136 | |||
137 | /* | ||
138 | * Masking an LVT entry can trigger a local APIC error | ||
139 | * if the vector is zero. Mask LVTERR first to prevent this. | ||
140 | */ | ||
141 | if (maxlvt >= 3) { | ||
142 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ | ||
143 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); | ||
144 | } | ||
145 | /* | ||
146 | * Careful: we have to set masks only first to deassert | ||
147 | * any level-triggered sources. | ||
148 | */ | ||
149 | v = apic_read(APIC_LVTT); | ||
150 | apic_write(APIC_LVTT, v | APIC_LVT_MASKED); | ||
151 | v = apic_read(APIC_LVT0); | ||
152 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | ||
153 | v = apic_read(APIC_LVT1); | ||
154 | apic_write(APIC_LVT1, v | APIC_LVT_MASKED); | ||
155 | if (maxlvt >= 4) { | ||
156 | v = apic_read(APIC_LVTPC); | ||
157 | apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Clean APIC state for other OSs: | ||
162 | */ | ||
163 | apic_write(APIC_LVTT, APIC_LVT_MASKED); | ||
164 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
165 | apic_write(APIC_LVT1, APIC_LVT_MASKED); | ||
166 | if (maxlvt >= 3) | ||
167 | apic_write(APIC_LVTERR, APIC_LVT_MASKED); | ||
168 | if (maxlvt >= 4) | ||
169 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); | ||
170 | apic_write(APIC_ESR, 0); | ||
171 | apic_read(APIC_ESR); | ||
172 | } | ||
173 | |||
174 | void disconnect_bsp_APIC(int virt_wire_setup) | ||
175 | { | ||
176 | /* Go back to Virtual Wire compatibility mode */ | ||
177 | unsigned long value; | ||
178 | |||
179 | /* For the spurious interrupt use vector F, and enable it */ | ||
180 | value = apic_read(APIC_SPIV); | ||
181 | value &= ~APIC_VECTOR_MASK; | ||
182 | value |= APIC_SPIV_APIC_ENABLED; | ||
183 | value |= 0xf; | ||
184 | apic_write(APIC_SPIV, value); | ||
185 | |||
186 | if (!virt_wire_setup) { | ||
187 | /* For LVT0 make it edge triggered, active high, external and enabled */ | ||
188 | value = apic_read(APIC_LVT0); | ||
189 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
190 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
191 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); | ||
192 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
193 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); | ||
194 | apic_write(APIC_LVT0, value); | ||
195 | } else { | ||
196 | /* Disable LVT0 */ | ||
197 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
198 | } | ||
199 | |||
200 | /* For LVT1 make it edge triggered, active high, nmi and enabled */ | ||
201 | value = apic_read(APIC_LVT1); | ||
202 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
203 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
204 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
205 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
206 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | ||
207 | apic_write(APIC_LVT1, value); | ||
208 | } | ||
209 | |||
210 | void disable_local_APIC(void) | ||
211 | { | ||
212 | unsigned int value; | ||
213 | |||
214 | clear_local_APIC(); | ||
215 | |||
216 | /* | ||
217 | * Disable APIC (implies clearing of registers | ||
218 | * for 82489DX!). | ||
219 | */ | ||
220 | value = apic_read(APIC_SPIV); | ||
221 | value &= ~APIC_SPIV_APIC_ENABLED; | ||
222 | apic_write(APIC_SPIV, value); | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * This is to verify that we're looking at a real local APIC. | ||
227 | * Check these against your board if the CPUs aren't getting | ||
228 | * started for no apparent reason. | ||
229 | */ | ||
230 | int __init verify_local_APIC(void) | ||
231 | { | ||
232 | unsigned int reg0, reg1; | ||
233 | |||
234 | /* | ||
235 | * The version register is read-only in a real APIC. | ||
236 | */ | ||
237 | reg0 = apic_read(APIC_LVR); | ||
238 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); | ||
239 | apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); | ||
240 | reg1 = apic_read(APIC_LVR); | ||
241 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); | ||
242 | |||
243 | /* | ||
244 | * The two version reads above should print the same | ||
245 | * numbers. If the second one is different, then we | ||
246 | * poke at a non-APIC. | ||
247 | */ | ||
248 | if (reg1 != reg0) | ||
249 | return 0; | ||
250 | |||
251 | /* | ||
252 | * Check if the version looks reasonably. | ||
253 | */ | ||
254 | reg1 = GET_APIC_VERSION(reg0); | ||
255 | if (reg1 == 0x00 || reg1 == 0xff) | ||
256 | return 0; | ||
257 | reg1 = get_maxlvt(); | ||
258 | if (reg1 < 0x02 || reg1 == 0xff) | ||
259 | return 0; | ||
260 | |||
261 | /* | ||
262 | * The ID register is read/write in a real APIC. | ||
263 | */ | ||
264 | reg0 = apic_read(APIC_ID); | ||
265 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | ||
266 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); | ||
267 | reg1 = apic_read(APIC_ID); | ||
268 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | ||
269 | apic_write(APIC_ID, reg0); | ||
270 | if (reg1 != (reg0 ^ APIC_ID_MASK)) | ||
271 | return 0; | ||
272 | |||
273 | /* | ||
274 | * The next two are just to see if we have sane values. | ||
275 | * They're only really relevant if we're in Virtual Wire | ||
276 | * compatibility mode, but most boxes are anymore. | ||
277 | */ | ||
278 | reg0 = apic_read(APIC_LVT0); | ||
279 | apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); | ||
280 | reg1 = apic_read(APIC_LVT1); | ||
281 | apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); | ||
282 | |||
283 | return 1; | ||
284 | } | ||
285 | |||
286 | void __init sync_Arb_IDs(void) | ||
287 | { | ||
288 | /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ | ||
289 | unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
290 | if (ver >= 0x14) /* P4 or higher */ | ||
291 | return; | ||
292 | |||
293 | /* | ||
294 | * Wait for idle. | ||
295 | */ | ||
296 | apic_wait_icr_idle(); | ||
297 | |||
298 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | ||
299 | apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | ||
300 | | APIC_DM_INIT); | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * An initial setup of the virtual wire mode. | ||
305 | */ | ||
306 | void __init init_bsp_APIC(void) | ||
307 | { | ||
308 | unsigned int value; | ||
309 | |||
310 | /* | ||
311 | * Don't do the setup now if we have a SMP BIOS as the | ||
312 | * through-I/O-APIC virtual wire mode might be active. | ||
313 | */ | ||
314 | if (smp_found_config || !cpu_has_apic) | ||
315 | return; | ||
316 | |||
317 | value = apic_read(APIC_LVR); | ||
318 | |||
319 | /* | ||
320 | * Do not trust the local APIC being empty at bootup. | ||
321 | */ | ||
322 | clear_local_APIC(); | ||
323 | |||
324 | /* | ||
325 | * Enable APIC. | ||
326 | */ | ||
327 | value = apic_read(APIC_SPIV); | ||
328 | value &= ~APIC_VECTOR_MASK; | ||
329 | value |= APIC_SPIV_APIC_ENABLED; | ||
330 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
331 | value |= SPURIOUS_APIC_VECTOR; | ||
332 | apic_write(APIC_SPIV, value); | ||
333 | |||
334 | /* | ||
335 | * Set up the virtual wire mode. | ||
336 | */ | ||
337 | apic_write(APIC_LVT0, APIC_DM_EXTINT); | ||
338 | value = APIC_DM_NMI; | ||
339 | apic_write(APIC_LVT1, value); | ||
340 | } | ||
341 | |||
342 | void __cpuinit setup_local_APIC (void) | ||
343 | { | ||
344 | unsigned int value, maxlvt; | ||
345 | int i, j; | ||
346 | |||
347 | value = apic_read(APIC_LVR); | ||
348 | |||
349 | BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); | ||
350 | |||
351 | /* | ||
352 | * Double-check whether this APIC is really registered. | ||
353 | * This is meaningless in clustered apic mode, so we skip it. | ||
354 | */ | ||
355 | if (!apic_id_registered()) | ||
356 | BUG(); | ||
357 | |||
358 | /* | ||
359 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
360 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
361 | * document number 292116). So here it goes... | ||
362 | */ | ||
363 | init_apic_ldr(); | ||
364 | |||
365 | /* | ||
366 | * Set Task Priority to 'accept all'. We never change this | ||
367 | * later on. | ||
368 | */ | ||
369 | value = apic_read(APIC_TASKPRI); | ||
370 | value &= ~APIC_TPRI_MASK; | ||
371 | apic_write(APIC_TASKPRI, value); | ||
372 | |||
373 | /* | ||
374 | * After a crash, we no longer service the interrupts and a pending | ||
375 | * interrupt from previous kernel might still have ISR bit set. | ||
376 | * | ||
377 | * Most probably by now CPU has serviced that pending interrupt and | ||
378 | * it might not have done the ack_APIC_irq() because it thought, | ||
379 | * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it | ||
380 | * does not clear the ISR bit and cpu thinks it has already serivced | ||
381 | * the interrupt. Hence a vector might get locked. It was noticed | ||
382 | * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. | ||
383 | */ | ||
384 | for (i = APIC_ISR_NR - 1; i >= 0; i--) { | ||
385 | value = apic_read(APIC_ISR + i*0x10); | ||
386 | for (j = 31; j >= 0; j--) { | ||
387 | if (value & (1<<j)) | ||
388 | ack_APIC_irq(); | ||
389 | } | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * Now that we are all set up, enable the APIC | ||
394 | */ | ||
395 | value = apic_read(APIC_SPIV); | ||
396 | value &= ~APIC_VECTOR_MASK; | ||
397 | /* | ||
398 | * Enable APIC | ||
399 | */ | ||
400 | value |= APIC_SPIV_APIC_ENABLED; | ||
401 | |||
402 | /* We always use processor focus */ | ||
403 | |||
404 | /* | ||
405 | * Set spurious IRQ vector | ||
406 | */ | ||
407 | value |= SPURIOUS_APIC_VECTOR; | ||
408 | apic_write(APIC_SPIV, value); | ||
409 | |||
410 | /* | ||
411 | * Set up LVT0, LVT1: | ||
412 | * | ||
413 | * set up through-local-APIC on the BP's LINT0. This is not | ||
414 | * strictly necessary in pure symmetric-IO mode, but sometimes | ||
415 | * we delegate interrupts to the 8259A. | ||
416 | */ | ||
417 | /* | ||
418 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro | ||
419 | */ | ||
420 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; | ||
421 | if (!smp_processor_id() && !value) { | ||
422 | value = APIC_DM_EXTINT; | ||
423 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); | ||
424 | } else { | ||
425 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; | ||
426 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); | ||
427 | } | ||
428 | apic_write(APIC_LVT0, value); | ||
429 | |||
430 | /* | ||
431 | * only the BP should see the LINT1 NMI signal, obviously. | ||
432 | */ | ||
433 | if (!smp_processor_id()) | ||
434 | value = APIC_DM_NMI; | ||
435 | else | ||
436 | value = APIC_DM_NMI | APIC_LVT_MASKED; | ||
437 | apic_write(APIC_LVT1, value); | ||
438 | |||
439 | { | ||
440 | unsigned oldvalue; | ||
441 | maxlvt = get_maxlvt(); | ||
442 | oldvalue = apic_read(APIC_ESR); | ||
443 | value = ERROR_APIC_VECTOR; // enables sending errors | ||
444 | apic_write(APIC_LVTERR, value); | ||
445 | /* | ||
446 | * spec says clear errors after enabling vector. | ||
447 | */ | ||
448 | if (maxlvt > 3) | ||
449 | apic_write(APIC_ESR, 0); | ||
450 | value = apic_read(APIC_ESR); | ||
451 | if (value != oldvalue) | ||
452 | apic_printk(APIC_VERBOSE, | ||
453 | "ESR value after enabling vector: %08x, after %08x\n", | ||
454 | oldvalue, value); | ||
455 | } | ||
456 | |||
457 | nmi_watchdog_default(); | ||
458 | setup_apic_nmi_watchdog(NULL); | ||
459 | apic_pm_activate(); | ||
460 | } | ||
461 | |||
462 | #ifdef CONFIG_PM | ||
463 | |||
464 | static struct { | ||
465 | /* 'active' is true if the local APIC was enabled by us and | ||
466 | not the BIOS; this signifies that we are also responsible | ||
467 | for disabling it before entering apm/acpi suspend */ | ||
468 | int active; | ||
469 | /* r/w apic fields */ | ||
470 | unsigned int apic_id; | ||
471 | unsigned int apic_taskpri; | ||
472 | unsigned int apic_ldr; | ||
473 | unsigned int apic_dfr; | ||
474 | unsigned int apic_spiv; | ||
475 | unsigned int apic_lvtt; | ||
476 | unsigned int apic_lvtpc; | ||
477 | unsigned int apic_lvt0; | ||
478 | unsigned int apic_lvt1; | ||
479 | unsigned int apic_lvterr; | ||
480 | unsigned int apic_tmict; | ||
481 | unsigned int apic_tdcr; | ||
482 | unsigned int apic_thmr; | ||
483 | } apic_pm_state; | ||
484 | |||
485 | static int lapic_suspend(struct sys_device *dev, pm_message_t state) | ||
486 | { | ||
487 | unsigned long flags; | ||
488 | int maxlvt; | ||
489 | |||
490 | if (!apic_pm_state.active) | ||
491 | return 0; | ||
492 | |||
493 | maxlvt = get_maxlvt(); | ||
494 | |||
495 | apic_pm_state.apic_id = apic_read(APIC_ID); | ||
496 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); | ||
497 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); | ||
498 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); | ||
499 | apic_pm_state.apic_spiv = apic_read(APIC_SPIV); | ||
500 | apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); | ||
501 | if (maxlvt >= 4) | ||
502 | apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); | ||
503 | apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); | ||
504 | apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); | ||
505 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | ||
506 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | ||
507 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | ||
508 | #ifdef CONFIG_X86_MCE_INTEL | ||
509 | if (maxlvt >= 5) | ||
510 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | ||
511 | #endif | ||
512 | local_irq_save(flags); | ||
513 | disable_local_APIC(); | ||
514 | local_irq_restore(flags); | ||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | static int lapic_resume(struct sys_device *dev) | ||
519 | { | ||
520 | unsigned int l, h; | ||
521 | unsigned long flags; | ||
522 | int maxlvt; | ||
523 | |||
524 | if (!apic_pm_state.active) | ||
525 | return 0; | ||
526 | |||
527 | maxlvt = get_maxlvt(); | ||
528 | |||
529 | local_irq_save(flags); | ||
530 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
531 | l &= ~MSR_IA32_APICBASE_BASE; | ||
532 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | ||
533 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
534 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | ||
535 | apic_write(APIC_ID, apic_pm_state.apic_id); | ||
536 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); | ||
537 | apic_write(APIC_LDR, apic_pm_state.apic_ldr); | ||
538 | apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); | ||
539 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | ||
540 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | ||
541 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | ||
542 | #ifdef CONFIG_X86_MCE_INTEL | ||
543 | if (maxlvt >= 5) | ||
544 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | ||
545 | #endif | ||
546 | if (maxlvt >= 4) | ||
547 | apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); | ||
548 | apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); | ||
549 | apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); | ||
550 | apic_write(APIC_TMICT, apic_pm_state.apic_tmict); | ||
551 | apic_write(APIC_ESR, 0); | ||
552 | apic_read(APIC_ESR); | ||
553 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | ||
554 | apic_write(APIC_ESR, 0); | ||
555 | apic_read(APIC_ESR); | ||
556 | local_irq_restore(flags); | ||
557 | return 0; | ||
558 | } | ||
559 | |||
560 | static struct sysdev_class lapic_sysclass = { | ||
561 | set_kset_name("lapic"), | ||
562 | .resume = lapic_resume, | ||
563 | .suspend = lapic_suspend, | ||
564 | }; | ||
565 | |||
566 | static struct sys_device device_lapic = { | ||
567 | .id = 0, | ||
568 | .cls = &lapic_sysclass, | ||
569 | }; | ||
570 | |||
571 | static void __cpuinit apic_pm_activate(void) | ||
572 | { | ||
573 | apic_pm_state.active = 1; | ||
574 | } | ||
575 | |||
576 | static int __init init_lapic_sysfs(void) | ||
577 | { | ||
578 | int error; | ||
579 | if (!cpu_has_apic) | ||
580 | return 0; | ||
581 | /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ | ||
582 | error = sysdev_class_register(&lapic_sysclass); | ||
583 | if (!error) | ||
584 | error = sysdev_register(&device_lapic); | ||
585 | return error; | ||
586 | } | ||
587 | device_initcall(init_lapic_sysfs); | ||
588 | |||
589 | #else /* CONFIG_PM */ | ||
590 | |||
591 | static void apic_pm_activate(void) { } | ||
592 | |||
593 | #endif /* CONFIG_PM */ | ||
594 | |||
/*
 * Parse the early "apic" boot parameter.  A bare "apic" forces the
 * IO-APIC on; "apic=verbose" / "apic=debug" raise APIC log verbosity.
 * Any other value is rejected with -EINVAL.
 */
static int __init apic_set_verbosity(char *str)
{
	if (str == NULL) {
		/* "apic" with no argument: force IO-APIC setup on. */
		skip_ioapic_setup = 0;
		ioapic_force = 1;
		return 0;
	}
	if (strcmp("debug", str) == 0)
		apic_verbosity = APIC_DEBUG;
	else if (strcmp("verbose", str) == 0)
		apic_verbosity = APIC_VERBOSE;
	else {
		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
			" use apic=verbose or apic=debug\n", str);
		return -EINVAL;
	}

	return 0;
}
early_param("apic", apic_set_verbosity);
615 | |||
616 | /* | ||
617 | * Detect and enable local APICs on non-SMP boards. | ||
618 | * Original code written by Keir Fraser. | ||
619 | * On AMD64 we trust the BIOS - if it says no APIC it is likely | ||
620 | * not correctly set up (usually the APIC timer won't work etc.) | ||
621 | */ | ||
622 | |||
/*
 * Check for a local APIC on a non-SMP board.  On x86-64 we trust
 * CPUID: if it reports no APIC, give up (-1).  On success record the
 * architectural default APIC base and BSP id 0 and return 0.
 */
static int __init detect_init_APIC (void)
{
	if (!cpu_has_apic) {
		printk(KERN_INFO "No local APIC present\n");
		return -1;
	}

	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
	boot_cpu_id = 0;
	return 0;
}
634 | |||
635 | #ifdef CONFIG_X86_IO_APIC | ||
636 | static struct resource * __init ioapic_setup_resources(void) | ||
637 | { | ||
638 | #define IOAPIC_RESOURCE_NAME_SIZE 11 | ||
639 | unsigned long n; | ||
640 | struct resource *res; | ||
641 | char *mem; | ||
642 | int i; | ||
643 | |||
644 | if (nr_ioapics <= 0) | ||
645 | return NULL; | ||
646 | |||
647 | n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); | ||
648 | n *= nr_ioapics; | ||
649 | |||
650 | mem = alloc_bootmem(n); | ||
651 | res = (void *)mem; | ||
652 | |||
653 | if (mem != NULL) { | ||
654 | memset(mem, 0, n); | ||
655 | mem += sizeof(struct resource) * nr_ioapics; | ||
656 | |||
657 | for (i = 0; i < nr_ioapics; i++) { | ||
658 | res[i].name = mem; | ||
659 | res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
660 | sprintf(mem, "IOAPIC %u", i); | ||
661 | mem += IOAPIC_RESOURCE_NAME_SIZE; | ||
662 | } | ||
663 | } | ||
664 | |||
665 | ioapic_resources = res; | ||
666 | |||
667 | return res; | ||
668 | } | ||
669 | |||
670 | static int __init ioapic_insert_resources(void) | ||
671 | { | ||
672 | int i; | ||
673 | struct resource *r = ioapic_resources; | ||
674 | |||
675 | if (!r) { | ||
676 | printk("IO APIC resources could be not be allocated.\n"); | ||
677 | return -1; | ||
678 | } | ||
679 | |||
680 | for (i = 0; i < nr_ioapics; i++) { | ||
681 | insert_resource(&iomem_resource, r); | ||
682 | r++; | ||
683 | } | ||
684 | |||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | /* Insert the IO APIC resources after PCI initialization has occured to handle | ||
689 | * IO APICS that are mapped in on a BAR in PCI space. */ | ||
690 | late_initcall(ioapic_insert_resources); | ||
691 | #endif | ||
692 | |||
/*
 * Map the local APIC and every IO-APIC into fixmap virtual space and
 * register their MMIO windows as resources.  If neither an MP table
 * nor a local APIC was found, zeroed bootmem pages stand in for the
 * hardware so the fixmap accesses stay harmless.
 */
void __init init_apic_mappings(void)
{
	unsigned long apic_phys;

	/*
	 * If no local APIC can be found then set up a fake all
	 * zeroes page to simulate the local APIC and another
	 * one for the IO-APIC.
	 */
	if (!smp_found_config && detect_init_APIC()) {
		apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
		apic_phys = __pa(apic_phys);
	} else
		apic_phys = mp_lapic_addr;

	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
	apic_mapped = 1;
	apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);

	/* Put local APIC into the resource map. */
	lapic_resource.start = apic_phys;
	lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
	insert_resource(&iomem_resource, &lapic_resource);

	/*
	 * Fetch the APIC ID of the BSP in case we have a
	 * default configuration (or the MP table is broken).
	 */
	boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));

	{
		unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
		int i;
		struct resource *ioapic_res;

		ioapic_res = ioapic_setup_resources();
		for (i = 0; i < nr_ioapics; i++) {
			if (smp_found_config) {
				ioapic_phys = mp_ioapics[i].mpc_apicaddr;
			} else {
				/* No MP table: fake this IO-APIC too. */
				ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
				ioapic_phys = __pa(ioapic_phys);
			}
			set_fixmap_nocache(idx, ioapic_phys);
			apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
					__fix_to_virt(idx), ioapic_phys);
			idx++;

			if (ioapic_res != NULL) {
				/* Each IO-APIC gets a 4K MMIO window. */
				ioapic_res->start = ioapic_phys;
				ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
				ioapic_res++;
			}
		}
	}
}
749 | |||
750 | /* | ||
751 | * This function sets up the local APIC timer, with a timeout of | ||
752 | * 'clocks' APIC bus clock. During calibration we actually call | ||
753 | * this function twice on the boot CPU, once with a bogus timeout | ||
754 | * value, second time for real. The other (noncalibrating) CPUs | ||
755 | * call this function only once, with the real, calibrated value. | ||
756 | * | ||
757 | * We do reads before writes even if unnecessary, to get around the | ||
758 | * P5 APIC double write bug. | ||
759 | */ | ||
760 | |||
761 | #define APIC_DIVISOR 16 | ||
762 | |||
/*
 * Program this CPU's APIC timer: periodic mode on LOCAL_TIMER_VECTOR,
 * divide-by-16, then load the initial count.  CPUs that receive their
 * ticks via broadcast IPI keep the LVT entry masked.
 */
static void __setup_APIC_LVTT(unsigned int clocks)
{
	unsigned int lvtt_value, tmp_value;
	int cpu = smp_processor_id();

	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;

	if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
		lvtt_value |= APIC_LVT_MASKED;

	apic_write(APIC_LVTT, lvtt_value);

	/*
	 * Divide PICLK by 16
	 */
	tmp_value = apic_read(APIC_TDCR);
	apic_write(APIC_TDCR, (tmp_value
				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
				| APIC_TDR_DIV_16);

	apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
}
785 | |||
/*
 * Arm the local APIC timer with the given count.  First spin until an
 * external reference edge (HPET comparator if present, else PIT
 * channel 0) so CPUs start their timers roughly in phase, then program
 * the LVT.  On the boot CPU this may also take over the main timer.
 */
static void setup_APIC_timer(unsigned int clocks)
{
	unsigned long flags;

	local_irq_save(flags);

	/* wait for irq slice */
	if (hpet_address && hpet_use_timer) {
		u32 trigger = hpet_readl(HPET_T0_CMP);
		while (hpet_readl(HPET_T0_CMP) == trigger)
			/* do nothing */ ;
	} else {
		/* Latch PIT channel 0 and wait until the count moves. */
		int c1, c2;
		outb_p(0x00, 0x43);
		c2 = inb_p(0x40);
		c2 |= inb_p(0x40) << 8;
		do {
			c1 = c2;
			outb_p(0x00, 0x43);
			c2 = inb_p(0x40);
			c2 |= inb_p(0x40) << 8;
		} while (c2 - c1 < 300);
	}
	__setup_APIC_LVTT(clocks);
	/* Turn off PIT interrupt if we use APIC timer as main timer.
	   Only works with the PM timer right now
	   TBD fix it for HPET too. */
	if ((pmtmr_ioport != 0) &&
		smp_processor_id() == boot_cpu_id &&
		apic_runs_main_timer == 1 &&
		!cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
		stop_timer_interrupt();
		/* > 1 signals "APIC is now the main timer" elsewhere. */
		apic_runs_main_timer++;
	}
	local_irq_restore(flags);
}
822 | |||
823 | /* | ||
824 | * In this function we calibrate APIC bus clocks to the external | ||
825 | * timer. Unfortunately we cannot use jiffies and the timer irq | ||
826 | * to calibrate, since some later bootup code depends on getting | ||
827 | * the first irq? Ugh. | ||
828 | * | ||
829 | * We want to do the calibration only once since we | ||
830 | * want to have local timer irqs syncron. CPUs connected | ||
831 | * by the same APIC bus have the very same bus frequency. | ||
832 | * And we want to have irqs off anyways, no accidental | ||
833 | * APIC irq that way. | ||
834 | */ | ||
835 | |||
836 | #define TICK_COUNT 100000000 | ||
837 | |||
838 | static int __init calibrate_APIC_clock(void) | ||
839 | { | ||
840 | unsigned apic, apic_start; | ||
841 | unsigned long tsc, tsc_start; | ||
842 | int result; | ||
843 | /* | ||
844 | * Put whatever arbitrary (but long enough) timeout | ||
845 | * value into the APIC clock, we just want to get the | ||
846 | * counter running for calibration. | ||
847 | */ | ||
848 | __setup_APIC_LVTT(4000000000); | ||
849 | |||
850 | apic_start = apic_read(APIC_TMCCT); | ||
851 | #ifdef CONFIG_X86_PM_TIMER | ||
852 | if (apic_calibrate_pmtmr && pmtmr_ioport) { | ||
853 | pmtimer_wait(5000); /* 5ms wait */ | ||
854 | apic = apic_read(APIC_TMCCT); | ||
855 | result = (apic_start - apic) * 1000L / 5; | ||
856 | } else | ||
857 | #endif | ||
858 | { | ||
859 | rdtscll(tsc_start); | ||
860 | |||
861 | do { | ||
862 | apic = apic_read(APIC_TMCCT); | ||
863 | rdtscll(tsc); | ||
864 | } while ((tsc - tsc_start) < TICK_COUNT && | ||
865 | (apic_start - apic) < TICK_COUNT); | ||
866 | |||
867 | result = (apic_start - apic) * 1000L * tsc_khz / | ||
868 | (tsc - tsc_start); | ||
869 | } | ||
870 | printk("result %d\n", result); | ||
871 | |||
872 | |||
873 | printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", | ||
874 | result / 1000 / 1000, result / 1000 % 1000); | ||
875 | |||
876 | return result * APIC_DIVISOR / HZ; | ||
877 | } | ||
878 | |||
879 | static unsigned int calibration_result; | ||
880 | |||
/*
 * Boot-CPU path: calibrate the APIC timer once (unless "noapictimer"
 * was given) and then arm it for real.  Secondary CPUs reuse the
 * stored calibration_result.
 */
void __init setup_boot_APIC_clock (void)
{
	if (disable_apic_timer) {
		printk(KERN_INFO "Disabling APIC timer\n");
		return;
	}

	printk(KERN_INFO "Using local APIC timer interrupts.\n");
	using_apic_timer = 1;

	/* Calibration must not be disturbed by interrupts. */
	local_irq_disable();

	calibration_result = calibrate_APIC_clock();
	/*
	 * Now set up the timer for real.
	 */
	setup_APIC_timer(calibration_result);

	local_irq_enable();
}
901 | |||
/* Arm a secondary CPU's APIC timer with the boot-CPU calibration result. */
void __cpuinit setup_secondary_APIC_clock(void)
{
	local_irq_disable(); /* FIXME: Do we need this? --RR */
	setup_APIC_timer(calibration_result);
	local_irq_enable();
}
908 | |||
/* Mask this CPU's APIC timer LVT entry (no-op if the timer is unused). */
void disable_APIC_timer(void)
{
	if (using_apic_timer) {
		unsigned long v;

		v = apic_read(APIC_LVTT);
		/*
		 * When an illegal vector value (0-15) is written to an LVT
		 * entry and delivery mode is Fixed, the APIC may signal an
		 * illegal vector error, with out regard to whether the mask
		 * bit is set or whether an interrupt is actually seen on input.
		 *
		 * Boot sequence might call this function when the LVTT has
		 * '0' vector value. So make sure vector field is set to
		 * valid value.
		 */
		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
		apic_write(APIC_LVTT, v);
	}
}
929 | |||
/*
 * Unmask this CPU's APIC timer LVT entry, unless the CPU receives its
 * ticks via broadcast IPI (then the entry must stay masked).
 */
void enable_APIC_timer(void)
{
	int cpu = smp_processor_id();

	if (using_apic_timer &&
	    !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
		unsigned long v;

		v = apic_read(APIC_LVTT);
		apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED);
	}
}
942 | |||
/*
 * If this CPU is in *cpumask, stop its local APIC timer and mark it
 * for broadcast-IPI ticks instead.  Intended to run on each affected
 * CPU (cpumask is passed as void* for use with on-each-cpu helpers).
 */
void switch_APIC_timer_to_ipi(void *cpumask)
{
	cpumask_t mask = *(cpumask_t *)cpumask;
	int cpu = smp_processor_id();

	if (cpu_isset(cpu, mask) &&
	    !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
		disable_APIC_timer();
		cpu_set(cpu, timer_interrupt_broadcast_ipi_mask);
	}
}
EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
955 | |||
/*
 * Deliver a timer tick to every online CPU whose APIC timer is
 * stopped: the local CPU is serviced directly, the rest receive
 * LOCAL_TIMER_VECTOR as an IPI.
 */
void smp_send_timer_broadcast_ipi(void)
{
	int cpu = smp_processor_id();
	cpumask_t mask;

	cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);

	if (cpu_isset(cpu, mask)) {
		/* Handle our own tick locally instead of self-IPI. */
		cpu_clear(cpu, mask);
		add_pda(apic_timer_irqs, 1);
		smp_local_timer_interrupt();
	}

	if (!cpus_empty(mask)) {
		send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
	}
}
973 | |||
/*
 * Inverse of switch_APIC_timer_to_ipi(): if this CPU is in *cpumask
 * and currently ticks via broadcast IPI, re-enable its local APIC
 * timer and drop it from the broadcast set.
 */
void switch_ipi_to_APIC_timer(void *cpumask)
{
	cpumask_t mask = *(cpumask_t *)cpumask;
	int cpu = smp_processor_id();

	if (cpu_isset(cpu, mask) &&
	    cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
		cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask);
		enable_APIC_timer();
	}
}
EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
986 | |||
/*
 * Profiling-multiplier hook: changing the profiling multiplier is not
 * supported on this architecture, so every request fails.
 */
int setup_profiling_timer(unsigned int multiplier)
{
	/* No multiplier value is acceptable here. */
	return -EINVAL;
}
991 | |||
/*
 * Program one K8 extended LVT register: entry lvt_off lives 16 bytes
 * apart starting at K8_APIC_EXT_LVT_BASE, packing mask<<16 |
 * msg_type<<8 | vector.
 */
void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
			     unsigned char msg_type, unsigned char mask)
{
	unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
	unsigned int v = (mask << 16) | (msg_type << 8) | vector;
	apic_write(reg, v);
}
999 | |||
1000 | #undef APIC_DIVISOR | ||
1001 | |||
1002 | /* | ||
1003 | * Local timer interrupt handler. It does both profiling and | ||
1004 | * process statistics/rescheduling. | ||
1005 | * | ||
1006 | * We do profiling in every local tick, statistics/rescheduling | ||
1007 | * happen only every 'profiling multiplier' ticks. The default | ||
1008 | * multiplier is 1 and it can be changed by writing the new multiplier | ||
1009 | * value into /proc/profile. | ||
1010 | */ | ||
1011 | |||
/*
 * Per-CPU tick work: profiling every tick, process accounting on SMP,
 * and — when the APIC has taken over as main timer — the global timer
 * handler on the boot CPU.
 */
void smp_local_timer_interrupt(void)
{
	profile_tick(CPU_PROFILING);
#ifdef CONFIG_SMP
	update_process_times(user_mode(get_irq_regs()));
#endif
	/* apic_runs_main_timer > 1 means the PIT has been stopped. */
	if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
		main_timer_handler();
	/*
	 * We take the 'long' return path, and there every subsystem
	 * grabs the appropriate locks (kernel lock/ irq lock).
	 *
	 * We might want to decouple profiling from the 'long path',
	 * and do the profiling totally in assembly.
	 *
	 * Currently this isn't too much of an issue (performance wise),
	 * we can take more than 100K local irqs per second on a 100 MHz P5.
	 */
}
1031 | |||
1032 | /* | ||
1033 | * Local APIC timer interrupt. This is the most natural way for doing | ||
1034 | * local interrupts, but local timer interrupts can be emulated by | ||
1035 | * broadcast interrupts too. [in case the hw doesn't support APIC timers] | ||
1036 | * | ||
1037 | * [ if a single-CPU system runs an SMP kernel then we call the local | ||
1038 | * interrupt as well. Thus we cannot inline the local irq ... ] | ||
1039 | */ | ||
/* APIC timer interrupt entry: ack early, then do tick work in irq context. */
void smp_apic_timer_interrupt(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	/*
	 * the NMI deadlock-detector uses this.
	 */
	add_pda(apic_timer_irqs, 1);

	/*
	 * NOTE! We'd better ACK the irq immediately,
	 * because timer handling can be slow.
	 */
	ack_APIC_irq();
	/*
	 * update_process_times() expects us to have done irq_enter().
	 * Besides, if we don't timer interrupts ignore the global
	 * interrupt lock, which is the WrongThing (tm) to do.
	 */
	exit_idle();
	irq_enter();
	smp_local_timer_interrupt();
	irq_exit();
	set_irq_regs(old_regs);
}
1065 | |||
1066 | /* | ||
1067 | * apic_is_clustered_box() -- Check if we can expect good TSC | ||
1068 | * | ||
1069 | * Thus far, the major user of this is IBM's Summit2 series: | ||
1070 | * | ||
1071 | * Clustered boxes may have unsynced TSC problems if they are | ||
1072 | * multi-chassis. Use available data to take a good guess. | ||
1073 | * If in doubt, go HPET. | ||
1074 | */ | ||
/*
 * Heuristic: count the APIC clusters populated by present CPUs; more
 * than two suggests a multi-chassis (clustered) box whose TSCs may be
 * unsynchronized.  Returns nonzero when clustered.
 */
__cpuinit int apic_is_clustered_box(void)
{
	int i, clusters, zeros;
	unsigned id;
	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);

	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);

	/* Mark every cluster that contains a BIOS-reported CPU. */
	for (i = 0; i < NR_CPUS; i++) {
		id = bios_cpu_apicid[i];
		if (id != BAD_APICID)
			__set_bit(APIC_CLUSTERID(id), clustermap);
	}

	/* Problem:  Partially populated chassis may not have CPUs in some of
	 * the APIC clusters they have been allocated.  Only present CPUs have
	 * bios_cpu_apicid entries, thus causing zeroes in the bitmap.  Since
	 * clusters are allocated sequentially, count zeros only if they are
	 * bounded by ones.
	 */
	clusters = 0;
	zeros = 0;
	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
		if (test_bit(i, clustermap)) {
			/* Interior gaps count as allocated clusters. */
			clusters += 1 + zeros;
			zeros = 0;
		} else
			++zeros;
	}

	/*
	 * If clusters > 2, then should be multi-chassis.
	 * May have to revisit this when multi-core + hyperthreaded CPUs come
	 * out, but AFAIK this will work even for them.
	 */
	return (clusters > 2);
}
1112 | |||
1113 | /* | ||
1114 | * This interrupt should _never_ happen with our APIC/SMP architecture | ||
1115 | */ | ||
/* Handler for the spurious APIC vector; should never fire normally. */
asmlinkage void smp_spurious_interrupt(void)
{
	unsigned int v;
	exit_idle();
	irq_enter();
	/*
	 * Check if this really is a spurious interrupt and ACK it
	 * if it is a vectored one.  Just in case...
	 * Spurious interrupts should not be ACKed.
	 */
	/* Index the 32-bit ISR word holding our vector's bit. */
	v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
		ack_APIC_irq();

	irq_exit();
}
1132 | |||
1133 | /* | ||
1134 | * This interrupt should never happen with our APIC/SMP architecture | ||
1135 | */ | ||
1136 | |||
/* Handler for the APIC error vector: read, clear and log the ESR. */
asmlinkage void smp_error_interrupt(void)
{
	unsigned int v, v1;

	exit_idle();
	irq_enter();
	/* First tickle the hardware, only then report what went on. -- REW */
	v = apic_read(APIC_ESR);
	apic_write(APIC_ESR, 0);
	v1 = apic_read(APIC_ESR);
	ack_APIC_irq();
	atomic_inc(&irq_err_count);

	/* Here is what the APIC error bits mean:
	   0: Send CS error
	   1: Receive CS error
	   2: Send accept error
	   3: Receive accept error
	   4: Reserved
	   5: Send illegal vector
	   6: Received illegal vector
	   7: Illegal register address
	*/
	printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
	        smp_processor_id(), v , v1);
	irq_exit();
}
1164 | |||
1165 | int disable_apic; | ||
1166 | |||
1167 | /* | ||
1168 | * This initializes the IO-APIC and APIC hardware if this is | ||
1169 | * a UP kernel. | ||
1170 | */ | ||
/*
 * UP bring-up: verify and program the local APIC (and the IO-APIC
 * when an MP table described one), then start the APIC timer and the
 * NMI watchdog.  Returns -1 if the APIC is disabled or absent.
 */
int __init APIC_init_uniprocessor (void)
{
	if (disable_apic) {
		printk(KERN_INFO "Apic disabled\n");
		return -1;
	}
	if (!cpu_has_apic) {
		disable_apic = 1;
		printk(KERN_INFO "Apic disabled by BIOS\n");
		return -1;
	}

	verify_local_APIC();

	/* Only the boot CPU is physically present on UP. */
	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
	apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));

	setup_local_APIC();

	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
		setup_IO_APIC();
	else
		nr_ioapics = 0;
	setup_boot_APIC_clock();
	check_nmi_watchdog();
	return 0;
}
1198 | |||
/* "disableapic": turn off the local APIC and clear its CPUID feature bit. */
static __init int setup_disableapic(char *str)
{
	disable_apic = 1;
	clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
	return 0;
}
early_param("disableapic", setup_disableapic);

/* same as disableapic, for compatibility */
static __init int setup_nolapic(char *str)
{
	return setup_disableapic(str);
}
early_param("nolapic", setup_nolapic);

/* "lapic_timer_c2_ok": trust the APIC timer to keep running in C2. */
static int __init parse_lapic_timer_c2_ok(char *arg)
{
	local_apic_timer_c2_ok = 1;
	return 0;
}
early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);

/*
 * "noapictimer": disable the local APIC timer.  Only a bare flag is
 * accepted (next char must be a space or NUL).
 */
static __init int setup_noapictimer(char *str)
{
	if (str[0] != ' ' && str[0] != 0)
		return 0;
	disable_apic_timer = 1;
	return 1;
}

/* "apicmaintimer": drive the main system timer from the APIC; no HPET. */
static __init int setup_apicmaintimer(char *str)
{
	apic_runs_main_timer = 1;
	nohpet = 1;
	return 1;
}
__setup("apicmaintimer", setup_apicmaintimer);

/* "noapicmaintimer": never let the APIC timer become the main timer. */
static __init int setup_noapicmaintimer(char *str)
{
	apic_runs_main_timer = -1;
	return 1;
}
__setup("noapicmaintimer", setup_noapicmaintimer);

/*
 * "apicpmtimer": calibrate the APIC against the ACPI PM timer,
 * disable the TSC, and imply apicmaintimer.
 */
static __init int setup_apicpmtimer(char *s)
{
	apic_calibrate_pmtmr = 1;
	notsc_setup(NULL);
	return setup_apicmaintimer(NULL);
}
__setup("apicpmtimer", setup_apicpmtimer);

__setup("noapictimer", setup_noapictimer);
1253 | |||
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c deleted file mode 100644 index 778953bc636c..000000000000 --- a/arch/x86_64/kernel/asm-offsets.c +++ /dev/null | |||
@@ -1,85 +0,0 @@ | |||
1 | /* | ||
2 | * Generate definitions needed by assembly language modules. | ||
3 | * This code generates raw asm output which is post-processed to extract | ||
4 | * and format the required data. | ||
5 | */ | ||
6 | |||
7 | #include <linux/crypto.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/stddef.h> | ||
10 | #include <linux/errno.h> | ||
11 | #include <linux/hardirq.h> | ||
12 | #include <linux/suspend.h> | ||
13 | #include <asm/pda.h> | ||
14 | #include <asm/processor.h> | ||
15 | #include <asm/segment.h> | ||
16 | #include <asm/thread_info.h> | ||
17 | #include <asm/ia32.h> | ||
18 | |||
19 | #define DEFINE(sym, val) \ | ||
20 | asm volatile("\n->" #sym " %0 " #val : : "i" (val)) | ||
21 | |||
22 | #define BLANK() asm volatile("\n->" : : ) | ||
23 | |||
24 | #define __NO_STUBS 1 | ||
25 | #undef __SYSCALL | ||
26 | #undef _ASM_X86_64_UNISTD_H_ | ||
27 | #define __SYSCALL(nr, sym) [nr] = 1, | ||
28 | static char syscalls[] = { | ||
29 | #include <asm/unistd.h> | ||
30 | }; | ||
31 | |||
/*
 * Emits "->SYM VALUE" markers that the build post-processes into
 * asm-offsets.h: structure offsets and constants needed by assembly.
 */
int main(void)
{
#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
	ENTRY(state);
	ENTRY(flags);
	ENTRY(thread);
	ENTRY(pid);
	BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
	ENTRY(flags);
	ENTRY(addr_limit);
	ENTRY(preempt_count);
	ENTRY(status);
	BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
	ENTRY(kernelstack);
	ENTRY(oldrsp);
	ENTRY(pcurrent);
	ENTRY(irqcount);
	ENTRY(cpunumber);
	ENTRY(irqstackptr);
	ENTRY(data_offset);
	BLANK();
#undef ENTRY
#ifdef CONFIG_IA32_EMULATION
#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
	ENTRY(eax);
	ENTRY(ebx);
	ENTRY(ecx);
	ENTRY(edx);
	ENTRY(esi);
	ENTRY(edi);
	ENTRY(ebp);
	ENTRY(esp);
	ENTRY(eip);
	BLANK();
#undef ENTRY
	DEFINE(IA32_RT_SIGFRAME_sigcontext,
	       offsetof (struct rt_sigframe32, uc.uc_mcontext));
	BLANK();
#endif
	/* Swsusp page-backup entry offsets. */
	DEFINE(pbe_address, offsetof(struct pbe, address));
	DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
	DEFINE(pbe_next, offsetof(struct pbe, next));
	BLANK();
	DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
	BLANK();
	DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
	BLANK();
	/* Highest syscall number, derived from the syscalls[] table. */
	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
	return 0;
}
diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c deleted file mode 100644 index 06d3e5a14d9d..000000000000 --- a/arch/x86_64/kernel/audit.c +++ /dev/null | |||
@@ -1,81 +0,0 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <linux/audit.h> | ||
4 | #include <asm/unistd.h> | ||
5 | |||
6 | static unsigned dir_class[] = { | ||
7 | #include <asm-generic/audit_dir_write.h> | ||
8 | ~0U | ||
9 | }; | ||
10 | |||
11 | static unsigned read_class[] = { | ||
12 | #include <asm-generic/audit_read.h> | ||
13 | ~0U | ||
14 | }; | ||
15 | |||
16 | static unsigned write_class[] = { | ||
17 | #include <asm-generic/audit_write.h> | ||
18 | ~0U | ||
19 | }; | ||
20 | |||
21 | static unsigned chattr_class[] = { | ||
22 | #include <asm-generic/audit_change_attr.h> | ||
23 | ~0U | ||
24 | }; | ||
25 | |||
26 | static unsigned signal_class[] = { | ||
27 | #include <asm-generic/audit_signal.h> | ||
28 | ~0U | ||
29 | }; | ||
30 | |||
/*
 * Tell the audit core whether an arch value uses the compat (32-bit)
 * syscall tables: 1 for i386 under IA32 emulation, otherwise 0.
 */
int audit_classify_arch(int arch)
{
#ifdef CONFIG_IA32_EMULATION
	if (arch == AUDIT_ARCH_I386)
		return 1;
#endif
	return 0;
}
39 | |||
/*
 * Map a syscall number to its audit class: 2 for open, 3 for openat,
 * 5 for execve, 0 for everything else.  32-bit (ia32) syscalls are
 * delegated to the compat classifier.
 */
int audit_classify_syscall(int abi, unsigned syscall)
{
#ifdef CONFIG_IA32_EMULATION
	extern int ia32_classify_syscall(unsigned);
	if (abi == AUDIT_ARCH_I386)
		return ia32_classify_syscall(syscall);
#endif
	if (syscall == __NR_open)
		return 2;
	if (syscall == __NR_openat)
		return 3;
	if (syscall == __NR_execve)
		return 5;
	return 0;
}
58 | |||
/*
 * Register the syscall classification tables with the audit core at
 * boot (plus the ia32 compat tables when IA32 emulation is built in).
 */
static int __init audit_classes_init(void)
{
#ifdef CONFIG_IA32_EMULATION
	extern __u32 ia32_dir_class[];
	extern __u32 ia32_write_class[];
	extern __u32 ia32_read_class[];
	extern __u32 ia32_chattr_class[];
	extern __u32 ia32_signal_class[];
	audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
	audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
	audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
	audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
	audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
#endif
	audit_register_class(AUDIT_CLASS_WRITE, write_class);
	audit_register_class(AUDIT_CLASS_READ, read_class);
	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
	audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
	return 0;
}

__initcall(audit_classes_init);
diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c deleted file mode 100644 index 4e5e9d364d63..000000000000 --- a/arch/x86_64/kernel/bugs.c +++ /dev/null | |||
@@ -1,24 +0,0 @@ | |||
1 | /* | ||
2 | * arch/x86_64/kernel/bugs.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * Copyright (C) 2000 SuSE | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <asm/alternative.h> | ||
11 | #include <asm/bugs.h> | ||
12 | #include <asm/processor.h> | ||
13 | #include <asm/mtrr.h> | ||
14 | |||
/*
 * Late boot-CPU checks for x86-64: finish identifying the boot CPU,
 * initialize MTRRs, and patch in the alternative instructions.
 */
void __init check_bugs(void)
{
	identify_cpu(&boot_cpu_data);
	mtrr_bp_init();
#if !defined(CONFIG_SMP)
	printk("CPU: ");
	print_cpu_info(&boot_cpu_data);
#endif
	alternative_instructions();
}
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig deleted file mode 100644 index a3fd51926cbd..000000000000 --- a/arch/x86_64/kernel/cpufreq/Kconfig +++ /dev/null | |||
@@ -1,108 +0,0 @@ | |||
1 | # | ||
2 | # CPU Frequency scaling | ||
3 | # | ||
4 | |||
5 | menu "CPU Frequency scaling" | ||
6 | |||
7 | source "drivers/cpufreq/Kconfig" | ||
8 | |||
9 | if CPU_FREQ | ||
10 | |||
11 | comment "CPUFreq processor drivers" | ||
12 | |||
13 | config X86_POWERNOW_K8 | ||
14 | tristate "AMD Opteron/Athlon64 PowerNow!" | ||
15 | select CPU_FREQ_TABLE | ||
16 | help | ||
17 | This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. | ||
18 | |||
19 | To compile this driver as a module, choose M here: the | ||
20 | module will be called powernow-k8. | ||
21 | |||
22 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
23 | |||
24 | If in doubt, say N. | ||
25 | |||
26 | config X86_POWERNOW_K8_ACPI | ||
27 | bool | ||
28 | depends on X86_POWERNOW_K8 && ACPI_PROCESSOR | ||
29 | depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) | ||
30 | default y | ||
31 | |||
32 | config X86_SPEEDSTEP_CENTRINO | ||
33 | tristate "Intel Enhanced SpeedStep (deprecated)" | ||
34 | select CPU_FREQ_TABLE | ||
35 | depends on ACPI_PROCESSOR | ||
36 | help | ||
37 | This is deprecated and this functionality is now merged into | ||
38 | acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of | ||
39 | speedstep_centrino. | ||
40 | This adds the CPUFreq driver for Enhanced SpeedStep enabled | ||
41 | mobile CPUs. This means Intel Pentium M (Centrino) CPUs | ||
42 | or 64bit enabled Intel Xeons. | ||
43 | |||
44 | To compile this driver as a module, choose M here: the | ||
45 | module will be called speedstep-centrino. | ||
46 | |||
47 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
48 | |||
49 | If in doubt, say N. | ||
50 | |||
51 | config X86_ACPI_CPUFREQ | ||
52 | tristate "ACPI Processor P-States driver" | ||
53 | select CPU_FREQ_TABLE | ||
54 | depends on ACPI_PROCESSOR | ||
55 | help | ||
56 | This driver adds a CPUFreq driver which utilizes the ACPI | ||
57 | Processor Performance States. | ||
58 | This driver also supports Intel Enhanced Speedstep. | ||
59 | |||
60 | To compile this driver as a module, choose M here: the | ||
61 | module will be called acpi-cpufreq. | ||
62 | |||
63 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
64 | |||
65 | If in doubt, say N. | ||
66 | |||
67 | comment "shared options" | ||
68 | |||
69 | config X86_ACPI_CPUFREQ_PROC_INTF | ||
70 | bool "/proc/acpi/processor/../performance interface (deprecated)" | ||
71 | depends on PROC_FS | ||
72 | depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K8_ACPI | ||
73 | help | ||
74 | This enables the deprecated /proc/acpi/processor/../performance | ||
75 | interface. While it is helpful for debugging, the generic, | ||
76 | cross-architecture cpufreq interfaces should be used. | ||
77 | |||
78 | If in doubt, say N. | ||
79 | |||
80 | config X86_P4_CLOCKMOD | ||
81 | tristate "Intel Pentium 4 clock modulation" | ||
82 | depends on EMBEDDED | ||
83 | select CPU_FREQ_TABLE | ||
84 | help | ||
85 | This adds the clock modulation driver for Intel Pentium 4 / XEON | ||
86 | processors. When enabled it will lower CPU temperature by skipping | ||
87 | clocks. | ||
88 | |||
89 | This driver should be only used in exceptional | ||
90 | circumstances when very low power is needed because it causes severe | ||
91 | slowdowns and noticeable latencies. Normally Speedstep should be used | ||
92 | instead. | ||
93 | |||
94 | To compile this driver as a module, choose M here: the | ||
95 | module will be called p4-clockmod. | ||
96 | |||
97 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
98 | |||
99 | Unless you are absolutely sure say N. | ||
100 | |||
101 | |||
102 | config X86_SPEEDSTEP_LIB | ||
103 | tristate | ||
104 | default X86_P4_CLOCKMOD | ||
105 | |||
106 | endif | ||
107 | |||
108 | endmenu | ||
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile deleted file mode 100644 index 753ce1dd418e..000000000000 --- a/arch/x86_64/kernel/cpufreq/Makefile +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | # | ||
2 | # Reuse the i386 cpufreq drivers | ||
3 | # | ||
4 | |||
5 | SRCDIR := ../../../i386/kernel/cpu/cpufreq | ||
6 | |||
7 | obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o | ||
8 | obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o | ||
9 | obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o | ||
10 | obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o | ||
11 | obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o | ||
12 | |||
13 | powernow-k8-objs := ${SRCDIR}/powernow-k8.o | ||
14 | speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o | ||
15 | acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o | ||
16 | p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o | ||
17 | speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o | ||
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c deleted file mode 100644 index 13432a1ae904..000000000000 --- a/arch/x86_64/kernel/crash.c +++ /dev/null | |||
@@ -1,135 +0,0 @@ | |||
1 | /* | ||
2 | * Architecture specific (x86_64) functions for kexec based crash dumps. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * | ||
6 | * Copyright (C) IBM Corporation, 2004. All rights reserved. | ||
7 | * | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/irq.h> | ||
15 | #include <linux/reboot.h> | ||
16 | #include <linux/kexec.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/elf.h> | ||
19 | #include <linux/elfcore.h> | ||
20 | #include <linux/kdebug.h> | ||
21 | |||
22 | #include <asm/processor.h> | ||
23 | #include <asm/hardirq.h> | ||
24 | #include <asm/nmi.h> | ||
25 | #include <asm/hw_irq.h> | ||
26 | #include <asm/mach_apic.h> | ||
27 | |||
28 | /* This keeps a track of which one is crashing cpu. */ | ||
29 | static int crashing_cpu; | ||
30 | |||
31 | #ifdef CONFIG_SMP | ||
32 | static atomic_t waiting_for_crash_ipi; | ||
33 | |||
34 | static int crash_nmi_callback(struct notifier_block *self, | ||
35 | unsigned long val, void *data) | ||
36 | { | ||
37 | struct pt_regs *regs; | ||
38 | int cpu; | ||
39 | |||
40 | if (val != DIE_NMI_IPI) | ||
41 | return NOTIFY_OK; | ||
42 | |||
43 | regs = ((struct die_args *)data)->regs; | ||
44 | cpu = raw_smp_processor_id(); | ||
45 | |||
46 | /* | ||
47 | * Don't do anything if this handler is invoked on crashing cpu. | ||
48 | * Otherwise, system will completely hang. Crashing cpu can get | ||
49 | * an NMI if system was initially booted with nmi_watchdog parameter. | ||
50 | */ | ||
51 | if (cpu == crashing_cpu) | ||
52 | return NOTIFY_STOP; | ||
53 | local_irq_disable(); | ||
54 | |||
55 | crash_save_cpu(regs, cpu); | ||
56 | disable_local_APIC(); | ||
57 | atomic_dec(&waiting_for_crash_ipi); | ||
58 | /* Assume hlt works */ | ||
59 | for(;;) | ||
60 | halt(); | ||
61 | |||
62 | return 1; | ||
63 | } | ||
64 | |||
65 | static void smp_send_nmi_allbutself(void) | ||
66 | { | ||
67 | send_IPI_allbutself(NMI_VECTOR); | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * This code is a best effort heuristic to get the | ||
72 | * other cpus to stop executing. So races with | ||
73 | * cpu hotplug shouldn't matter. | ||
74 | */ | ||
75 | |||
76 | static struct notifier_block crash_nmi_nb = { | ||
77 | .notifier_call = crash_nmi_callback, | ||
78 | }; | ||
79 | |||
80 | static void nmi_shootdown_cpus(void) | ||
81 | { | ||
82 | unsigned long msecs; | ||
83 | |||
84 | atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); | ||
85 | if (register_die_notifier(&crash_nmi_nb)) | ||
86 | return; /* return what? */ | ||
87 | |||
88 | /* | ||
89 | * Ensure the new callback function is set before sending | ||
90 | * out the NMI | ||
91 | */ | ||
92 | wmb(); | ||
93 | |||
94 | smp_send_nmi_allbutself(); | ||
95 | |||
96 | msecs = 1000; /* Wait at most a second for the other cpus to stop */ | ||
97 | while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { | ||
98 | mdelay(1); | ||
99 | msecs--; | ||
100 | } | ||
101 | /* Leave the nmi callback set */ | ||
102 | disable_local_APIC(); | ||
103 | } | ||
104 | #else | ||
105 | static void nmi_shootdown_cpus(void) | ||
106 | { | ||
107 | /* There are no cpus to shootdown */ | ||
108 | } | ||
109 | #endif | ||
110 | |||
111 | void machine_crash_shutdown(struct pt_regs *regs) | ||
112 | { | ||
113 | /* | ||
114 | * This function is only called after the system | ||
115 | * has panicked or is otherwise in a critical state. | ||
116 | * The minimum amount of code to allow a kexec'd kernel | ||
117 | * to run successfully needs to happen here. | ||
118 | * | ||
119 | * In practice this means shooting down the other cpus in | ||
120 | * an SMP system. | ||
121 | */ | ||
122 | /* The kernel is broken so disable interrupts */ | ||
123 | local_irq_disable(); | ||
124 | |||
125 | /* Make a note of crashing cpu. Will be used in NMI callback.*/ | ||
126 | crashing_cpu = smp_processor_id(); | ||
127 | nmi_shootdown_cpus(); | ||
128 | |||
129 | if(cpu_has_apic) | ||
130 | disable_local_APIC(); | ||
131 | |||
132 | disable_IO_APIC(); | ||
133 | |||
134 | crash_save_cpu(regs, smp_processor_id()); | ||
135 | } | ||
diff --git a/arch/x86_64/kernel/crash_dump.c b/arch/x86_64/kernel/crash_dump.c deleted file mode 100644 index 942deac4d43a..000000000000 --- a/arch/x86_64/kernel/crash_dump.c +++ /dev/null | |||
@@ -1,47 +0,0 @@ | |||
1 | /* | ||
2 | * kernel/crash_dump.c - Memory preserving reboot related code. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * Copyright (C) IBM Corporation, 2004. All rights reserved | ||
6 | */ | ||
7 | |||
8 | #include <linux/errno.h> | ||
9 | #include <linux/crash_dump.h> | ||
10 | |||
11 | #include <asm/uaccess.h> | ||
12 | #include <asm/io.h> | ||
13 | |||
14 | /** | ||
15 | * copy_oldmem_page - copy one page from "oldmem" | ||
16 | * @pfn: page frame number to be copied | ||
17 | * @buf: target memory address for the copy; this can be in kernel address | ||
18 | * space or user address space (see @userbuf) | ||
19 | * @csize: number of bytes to copy | ||
20 | * @offset: offset in bytes into the page (based on pfn) to begin the copy | ||
21 | * @userbuf: if set, @buf is in user address space, use copy_to_user(), | ||
22 | * otherwise @buf is in kernel address space, use memcpy(). | ||
23 | * | ||
24 | * Copy a page from "oldmem". For this page, there is no pte mapped | ||
25 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
26 | */ | ||
27 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | ||
28 | size_t csize, unsigned long offset, int userbuf) | ||
29 | { | ||
30 | void *vaddr; | ||
31 | |||
32 | if (!csize) | ||
33 | return 0; | ||
34 | |||
35 | vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); | ||
36 | |||
37 | if (userbuf) { | ||
38 | if (copy_to_user(buf, (vaddr + offset), csize)) { | ||
39 | iounmap(vaddr); | ||
40 | return -EFAULT; | ||
41 | } | ||
42 | } else | ||
43 | memcpy(buf, (vaddr + offset), csize); | ||
44 | |||
45 | iounmap(vaddr); | ||
46 | return csize; | ||
47 | } | ||
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c deleted file mode 100644 index 0f4d5e209e9b..000000000000 --- a/arch/x86_64/kernel/e820.c +++ /dev/null | |||
@@ -1,725 +0,0 @@ | |||
1 | /* | ||
2 | * Handle the memory map. | ||
3 | * The functions here do the job until bootmem takes over. | ||
4 | * | ||
5 | * Getting sanitize_e820_map() in sync with i386 version by applying change: | ||
6 | * - Provisions for empty E820 memory regions (reported by certain BIOSes). | ||
7 | * Alex Achenbach <xela@slit.de>, December 2002. | ||
8 | * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/bootmem.h> | ||
15 | #include <linux/ioport.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/kexec.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/suspend.h> | ||
21 | #include <linux/pfn.h> | ||
22 | |||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/page.h> | ||
25 | #include <asm/e820.h> | ||
26 | #include <asm/proto.h> | ||
27 | #include <asm/bootsetup.h> | ||
28 | #include <asm/sections.h> | ||
29 | |||
30 | struct e820map e820; | ||
31 | |||
32 | /* | ||
33 | * PFN of last memory page. | ||
34 | */ | ||
35 | unsigned long end_pfn; | ||
36 | EXPORT_SYMBOL(end_pfn); | ||
37 | |||
38 | /* | ||
39 | * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. | ||
40 | * The direct mapping extends to end_pfn_map, so that we can directly access | ||
41 | * apertures, ACPI and other tables without having to play with fixmaps. | ||
42 | */ | ||
43 | unsigned long end_pfn_map; | ||
44 | |||
45 | /* | ||
46 | * Last pfn which the user wants to use. | ||
47 | */ | ||
48 | static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; | ||
49 | |||
50 | extern struct resource code_resource, data_resource; | ||
51 | |||
52 | /* Check for some hardcoded bad areas that early boot is not allowed to touch */ | ||
53 | static inline int bad_addr(unsigned long *addrp, unsigned long size) | ||
54 | { | ||
55 | unsigned long addr = *addrp, last = addr + size; | ||
56 | |||
57 | /* various gunk below that needed for SMP startup */ | ||
58 | if (addr < 0x8000) { | ||
59 | *addrp = PAGE_ALIGN(0x8000); | ||
60 | return 1; | ||
61 | } | ||
62 | |||
63 | /* direct mapping tables of the kernel */ | ||
64 | if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { | ||
65 | *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | /* initrd */ | ||
70 | #ifdef CONFIG_BLK_DEV_INITRD | ||
71 | if (LOADER_TYPE && INITRD_START && last >= INITRD_START && | ||
72 | addr < INITRD_START+INITRD_SIZE) { | ||
73 | *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); | ||
74 | return 1; | ||
75 | } | ||
76 | #endif | ||
77 | /* kernel code */ | ||
78 | if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { | ||
79 | *addrp = PAGE_ALIGN(__pa_symbol(&_end)); | ||
80 | return 1; | ||
81 | } | ||
82 | |||
83 | if (last >= ebda_addr && addr < ebda_addr + ebda_size) { | ||
84 | *addrp = PAGE_ALIGN(ebda_addr + ebda_size); | ||
85 | return 1; | ||
86 | } | ||
87 | |||
88 | #ifdef CONFIG_NUMA | ||
89 | /* NUMA memory to node map */ | ||
90 | if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { | ||
91 | *addrp = nodemap_addr + nodemap_size; | ||
92 | return 1; | ||
93 | } | ||
94 | #endif | ||
95 | /* XXX ramdisk image here? */ | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * This function checks if any part of the range <start,end> is mapped | ||
101 | * with type. | ||
102 | */ | ||
103 | int | ||
104 | e820_any_mapped(unsigned long start, unsigned long end, unsigned type) | ||
105 | { | ||
106 | int i; | ||
107 | for (i = 0; i < e820.nr_map; i++) { | ||
108 | struct e820entry *ei = &e820.map[i]; | ||
109 | if (type && ei->type != type) | ||
110 | continue; | ||
111 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
112 | continue; | ||
113 | return 1; | ||
114 | } | ||
115 | return 0; | ||
116 | } | ||
117 | EXPORT_SYMBOL_GPL(e820_any_mapped); | ||
118 | |||
119 | /* | ||
120 | * This function checks if the entire range <start,end> is mapped with type. | ||
121 | * | ||
122 | * Note: this function only works correct if the e820 table is sorted and | ||
123 | * not-overlapping, which is the case | ||
124 | */ | ||
125 | int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) | ||
126 | { | ||
127 | int i; | ||
128 | for (i = 0; i < e820.nr_map; i++) { | ||
129 | struct e820entry *ei = &e820.map[i]; | ||
130 | if (type && ei->type != type) | ||
131 | continue; | ||
132 | /* is the region (part) in overlap with the current region ?*/ | ||
133 | if (ei->addr >= end || ei->addr + ei->size <= start) | ||
134 | continue; | ||
135 | |||
136 | /* if the region is at the beginning of <start,end> we move | ||
137 | * start to the end of the region since it's ok until there | ||
138 | */ | ||
139 | if (ei->addr <= start) | ||
140 | start = ei->addr + ei->size; | ||
141 | /* if start is now at or beyond end, we're done, full coverage */ | ||
142 | if (start >= end) | ||
143 | return 1; /* we're done */ | ||
144 | } | ||
145 | return 0; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Find a free area in a specific range. | ||
150 | */ | ||
151 | unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) | ||
152 | { | ||
153 | int i; | ||
154 | for (i = 0; i < e820.nr_map; i++) { | ||
155 | struct e820entry *ei = &e820.map[i]; | ||
156 | unsigned long addr = ei->addr, last; | ||
157 | if (ei->type != E820_RAM) | ||
158 | continue; | ||
159 | if (addr < start) | ||
160 | addr = start; | ||
161 | if (addr > ei->addr + ei->size) | ||
162 | continue; | ||
163 | while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) | ||
164 | ; | ||
165 | last = PAGE_ALIGN(addr) + size; | ||
166 | if (last > ei->addr + ei->size) | ||
167 | continue; | ||
168 | if (last > end) | ||
169 | continue; | ||
170 | return addr; | ||
171 | } | ||
172 | return -1UL; | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * Find the highest page frame number we have available | ||
177 | */ | ||
178 | unsigned long __init e820_end_of_ram(void) | ||
179 | { | ||
180 | unsigned long end_pfn = 0; | ||
181 | end_pfn = find_max_pfn_with_active_regions(); | ||
182 | |||
183 | if (end_pfn > end_pfn_map) | ||
184 | end_pfn_map = end_pfn; | ||
185 | if (end_pfn_map > MAXMEM>>PAGE_SHIFT) | ||
186 | end_pfn_map = MAXMEM>>PAGE_SHIFT; | ||
187 | if (end_pfn > end_user_pfn) | ||
188 | end_pfn = end_user_pfn; | ||
189 | if (end_pfn > end_pfn_map) | ||
190 | end_pfn = end_pfn_map; | ||
191 | |||
192 | printk("end_pfn_map = %lu\n", end_pfn_map); | ||
193 | return end_pfn; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Mark e820 reserved areas as busy for the resource manager. | ||
198 | */ | ||
199 | void __init e820_reserve_resources(void) | ||
200 | { | ||
201 | int i; | ||
202 | for (i = 0; i < e820.nr_map; i++) { | ||
203 | struct resource *res; | ||
204 | res = alloc_bootmem_low(sizeof(struct resource)); | ||
205 | switch (e820.map[i].type) { | ||
206 | case E820_RAM: res->name = "System RAM"; break; | ||
207 | case E820_ACPI: res->name = "ACPI Tables"; break; | ||
208 | case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | ||
209 | default: res->name = "reserved"; | ||
210 | } | ||
211 | res->start = e820.map[i].addr; | ||
212 | res->end = res->start + e820.map[i].size - 1; | ||
213 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
214 | request_resource(&iomem_resource, res); | ||
215 | if (e820.map[i].type == E820_RAM) { | ||
216 | /* | ||
217 | * We don't know which RAM region contains kernel data, | ||
218 | * so we try it repeatedly and let the resource manager | ||
219 | * test it. | ||
220 | */ | ||
221 | request_resource(res, &code_resource); | ||
222 | request_resource(res, &data_resource); | ||
223 | #ifdef CONFIG_KEXEC | ||
224 | request_resource(res, &crashk_res); | ||
225 | #endif | ||
226 | } | ||
227 | } | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * Find the ranges of physical addresses that do not correspond to | ||
232 | * e820 RAM areas and mark the corresponding pages as nosave for software | ||
233 | * suspend and suspend to RAM. | ||
234 | * | ||
235 | * This function requires the e820 map to be sorted and without any | ||
236 | * overlapping entries and assumes the first e820 area to be RAM. | ||
237 | */ | ||
238 | void __init e820_mark_nosave_regions(void) | ||
239 | { | ||
240 | int i; | ||
241 | unsigned long paddr; | ||
242 | |||
243 | paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); | ||
244 | for (i = 1; i < e820.nr_map; i++) { | ||
245 | struct e820entry *ei = &e820.map[i]; | ||
246 | |||
247 | if (paddr < ei->addr) | ||
248 | register_nosave_region(PFN_DOWN(paddr), | ||
249 | PFN_UP(ei->addr)); | ||
250 | |||
251 | paddr = round_down(ei->addr + ei->size, PAGE_SIZE); | ||
252 | if (ei->type != E820_RAM) | ||
253 | register_nosave_region(PFN_UP(ei->addr), | ||
254 | PFN_DOWN(paddr)); | ||
255 | |||
256 | if (paddr >= (end_pfn << PAGE_SHIFT)) | ||
257 | break; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * Finds an active region in the address range from start_pfn to end_pfn and | ||
263 | * returns its range in ei_startpfn and ei_endpfn for the e820 entry. | ||
264 | */ | ||
265 | static int __init e820_find_active_region(const struct e820entry *ei, | ||
266 | unsigned long start_pfn, | ||
267 | unsigned long end_pfn, | ||
268 | unsigned long *ei_startpfn, | ||
269 | unsigned long *ei_endpfn) | ||
270 | { | ||
271 | *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; | ||
272 | *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; | ||
273 | |||
274 | /* Skip map entries smaller than a page */ | ||
275 | if (*ei_startpfn >= *ei_endpfn) | ||
276 | return 0; | ||
277 | |||
278 | /* Check if end_pfn_map should be updated */ | ||
279 | if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) | ||
280 | end_pfn_map = *ei_endpfn; | ||
281 | |||
282 | /* Skip if map is outside the node */ | ||
283 | if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || | ||
284 | *ei_startpfn >= end_pfn) | ||
285 | return 0; | ||
286 | |||
287 | /* Check for overlaps */ | ||
288 | if (*ei_startpfn < start_pfn) | ||
289 | *ei_startpfn = start_pfn; | ||
290 | if (*ei_endpfn > end_pfn) | ||
291 | *ei_endpfn = end_pfn; | ||
292 | |||
293 | /* Obey end_user_pfn to save on memmap */ | ||
294 | if (*ei_startpfn >= end_user_pfn) | ||
295 | return 0; | ||
296 | if (*ei_endpfn > end_user_pfn) | ||
297 | *ei_endpfn = end_user_pfn; | ||
298 | |||
299 | return 1; | ||
300 | } | ||
301 | |||
302 | /* Walk the e820 map and register active regions within a node */ | ||
303 | void __init | ||
304 | e820_register_active_regions(int nid, unsigned long start_pfn, | ||
305 | unsigned long end_pfn) | ||
306 | { | ||
307 | unsigned long ei_startpfn; | ||
308 | unsigned long ei_endpfn; | ||
309 | int i; | ||
310 | |||
311 | for (i = 0; i < e820.nr_map; i++) | ||
312 | if (e820_find_active_region(&e820.map[i], | ||
313 | start_pfn, end_pfn, | ||
314 | &ei_startpfn, &ei_endpfn)) | ||
315 | add_active_range(nid, ei_startpfn, ei_endpfn); | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * Add a memory region to the kernel e820 map. | ||
320 | */ | ||
321 | void __init add_memory_region(unsigned long start, unsigned long size, int type) | ||
322 | { | ||
323 | int x = e820.nr_map; | ||
324 | |||
325 | if (x == E820MAX) { | ||
326 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
327 | return; | ||
328 | } | ||
329 | |||
330 | e820.map[x].addr = start; | ||
331 | e820.map[x].size = size; | ||
332 | e820.map[x].type = type; | ||
333 | e820.nr_map++; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Find the hole size (in bytes) in the memory range. | ||
338 | * @start: starting address of the memory range to scan | ||
339 | * @end: ending address of the memory range to scan | ||
340 | */ | ||
341 | unsigned long __init e820_hole_size(unsigned long start, unsigned long end) | ||
342 | { | ||
343 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
344 | unsigned long end_pfn = end >> PAGE_SHIFT; | ||
345 | unsigned long ei_startpfn; | ||
346 | unsigned long ei_endpfn; | ||
347 | unsigned long ram = 0; | ||
348 | int i; | ||
349 | |||
350 | for (i = 0; i < e820.nr_map; i++) { | ||
351 | if (e820_find_active_region(&e820.map[i], | ||
352 | start_pfn, end_pfn, | ||
353 | &ei_startpfn, &ei_endpfn)) | ||
354 | ram += ei_endpfn - ei_startpfn; | ||
355 | } | ||
356 | return end - start - (ram << PAGE_SHIFT); | ||
357 | } | ||
358 | |||
359 | void __init e820_print_map(char *who) | ||
360 | { | ||
361 | int i; | ||
362 | |||
363 | for (i = 0; i < e820.nr_map; i++) { | ||
364 | printk(KERN_INFO " %s: %016Lx - %016Lx ", who, | ||
365 | (unsigned long long) e820.map[i].addr, | ||
366 | (unsigned long long) (e820.map[i].addr + e820.map[i].size)); | ||
367 | switch (e820.map[i].type) { | ||
368 | case E820_RAM: printk("(usable)\n"); | ||
369 | break; | ||
370 | case E820_RESERVED: | ||
371 | printk("(reserved)\n"); | ||
372 | break; | ||
373 | case E820_ACPI: | ||
374 | printk("(ACPI data)\n"); | ||
375 | break; | ||
376 | case E820_NVS: | ||
377 | printk("(ACPI NVS)\n"); | ||
378 | break; | ||
379 | default: printk("type %u\n", e820.map[i].type); | ||
380 | break; | ||
381 | } | ||
382 | } | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | * Sanitize the BIOS e820 map. | ||
387 | * | ||
388 | * Some e820 responses include overlapping entries. The following | ||
389 | * replaces the original e820 map with a new one, removing overlaps. | ||
390 | * | ||
391 | */ | ||
392 | static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | ||
393 | { | ||
394 | struct change_member { | ||
395 | struct e820entry *pbios; /* pointer to original bios entry */ | ||
396 | unsigned long long addr; /* address for this change point */ | ||
397 | }; | ||
398 | static struct change_member change_point_list[2*E820MAX] __initdata; | ||
399 | static struct change_member *change_point[2*E820MAX] __initdata; | ||
400 | static struct e820entry *overlap_list[E820MAX] __initdata; | ||
401 | static struct e820entry new_bios[E820MAX] __initdata; | ||
402 | struct change_member *change_tmp; | ||
403 | unsigned long current_type, last_type; | ||
404 | unsigned long long last_addr; | ||
405 | int chgidx, still_changing; | ||
406 | int overlap_entries; | ||
407 | int new_bios_entry; | ||
408 | int old_nr, new_nr, chg_nr; | ||
409 | int i; | ||
410 | |||
411 | /* | ||
412 | Visually we're performing the following (1,2,3,4 = memory types)... | ||
413 | |||
414 | Sample memory map (w/overlaps): | ||
415 | ____22__________________ | ||
416 | ______________________4_ | ||
417 | ____1111________________ | ||
418 | _44_____________________ | ||
419 | 11111111________________ | ||
420 | ____________________33__ | ||
421 | ___________44___________ | ||
422 | __________33333_________ | ||
423 | ______________22________ | ||
424 | ___________________2222_ | ||
425 | _________111111111______ | ||
426 | _____________________11_ | ||
427 | _________________4______ | ||
428 | |||
429 | Sanitized equivalent (no overlap): | ||
430 | 1_______________________ | ||
431 | _44_____________________ | ||
432 | ___1____________________ | ||
433 | ____22__________________ | ||
434 | ______11________________ | ||
435 | _________1______________ | ||
436 | __________3_____________ | ||
437 | ___________44___________ | ||
438 | _____________33_________ | ||
439 | _______________2________ | ||
440 | ________________1_______ | ||
441 | _________________4______ | ||
442 | ___________________2____ | ||
443 | ____________________33__ | ||
444 | ______________________4_ | ||
445 | */ | ||
446 | |||
447 | /* if there's only one memory region, don't bother */ | ||
448 | if (*pnr_map < 2) | ||
449 | return -1; | ||
450 | |||
451 | old_nr = *pnr_map; | ||
452 | |||
453 | /* bail out if we find any unreasonable addresses in bios map */ | ||
454 | for (i=0; i<old_nr; i++) | ||
455 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | ||
456 | return -1; | ||
457 | |||
458 | /* create pointers for initial change-point information (for sorting) */ | ||
459 | for (i=0; i < 2*old_nr; i++) | ||
460 | change_point[i] = &change_point_list[i]; | ||
461 | |||
462 | /* record all known change-points (starting and ending addresses), | ||
463 | omitting those that are for empty memory regions */ | ||
464 | chgidx = 0; | ||
465 | for (i=0; i < old_nr; i++) { | ||
466 | if (biosmap[i].size != 0) { | ||
467 | change_point[chgidx]->addr = biosmap[i].addr; | ||
468 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
469 | change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | ||
470 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
471 | } | ||
472 | } | ||
473 | chg_nr = chgidx; | ||
474 | |||
475 | /* sort change-point list by memory addresses (low -> high) */ | ||
476 | still_changing = 1; | ||
477 | while (still_changing) { | ||
478 | still_changing = 0; | ||
479 | for (i=1; i < chg_nr; i++) { | ||
480 | /* if <current_addr> > <last_addr>, swap */ | ||
481 | /* or, if current=<start_addr> & last=<end_addr>, swap */ | ||
482 | if ((change_point[i]->addr < change_point[i-1]->addr) || | ||
483 | ((change_point[i]->addr == change_point[i-1]->addr) && | ||
484 | (change_point[i]->addr == change_point[i]->pbios->addr) && | ||
485 | (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | ||
486 | ) | ||
487 | { | ||
488 | change_tmp = change_point[i]; | ||
489 | change_point[i] = change_point[i-1]; | ||
490 | change_point[i-1] = change_tmp; | ||
491 | still_changing=1; | ||
492 | } | ||
493 | } | ||
494 | } | ||
495 | |||
496 | /* create a new bios memory map, removing overlaps */ | ||
497 | overlap_entries=0; /* number of entries in the overlap table */ | ||
498 | new_bios_entry=0; /* index for creating new bios map entries */ | ||
499 | last_type = 0; /* start with undefined memory type */ | ||
500 | last_addr = 0; /* start with 0 as last starting address */ | ||
501 | /* loop through change-points, determining affect on the new bios map */ | ||
502 | for (chgidx=0; chgidx < chg_nr; chgidx++) | ||
503 | { | ||
504 | /* keep track of all overlapping bios entries */ | ||
505 | if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | ||
506 | { | ||
507 | /* add map entry to overlap list (> 1 entry implies an overlap) */ | ||
508 | overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | ||
509 | } | ||
510 | else | ||
511 | { | ||
512 | /* remove entry from list (order independent, so swap with last) */ | ||
513 | for (i=0; i<overlap_entries; i++) | ||
514 | { | ||
515 | if (overlap_list[i] == change_point[chgidx]->pbios) | ||
516 | overlap_list[i] = overlap_list[overlap_entries-1]; | ||
517 | } | ||
518 | overlap_entries--; | ||
519 | } | ||
520 | /* if there are overlapping entries, decide which "type" to use */ | ||
521 | /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | ||
522 | current_type = 0; | ||
523 | for (i=0; i<overlap_entries; i++) | ||
524 | if (overlap_list[i]->type > current_type) | ||
525 | current_type = overlap_list[i]->type; | ||
526 | /* continue building up new bios map based on this information */ | ||
527 | if (current_type != last_type) { | ||
528 | if (last_type != 0) { | ||
529 | new_bios[new_bios_entry].size = | ||
530 | change_point[chgidx]->addr - last_addr; | ||
531 | /* move forward only if the new size was non-zero */ | ||
532 | if (new_bios[new_bios_entry].size != 0) | ||
533 | if (++new_bios_entry >= E820MAX) | ||
534 | break; /* no more space left for new bios entries */ | ||
535 | } | ||
536 | if (current_type != 0) { | ||
537 | new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | ||
538 | new_bios[new_bios_entry].type = current_type; | ||
539 | last_addr=change_point[chgidx]->addr; | ||
540 | } | ||
541 | last_type = current_type; | ||
542 | } | ||
543 | } | ||
544 | new_nr = new_bios_entry; /* retain count for new bios entries */ | ||
545 | |||
546 | /* copy new bios mapping into original location */ | ||
547 | memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | ||
548 | *pnr_map = new_nr; | ||
549 | |||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | /* | ||
554 | * Copy the BIOS e820 map into a safe place. | ||
555 | * | ||
556 | * Sanity-check it while we're at it.. | ||
557 | * | ||
558 | * If we're lucky and live on a modern system, the setup code | ||
559 | * will have given us a memory map that we can use to properly | ||
560 | * set up memory. If we aren't, we'll fake a memory map. | ||
561 | */ | ||
562 | static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | ||
563 | { | ||
564 | /* Only one memory region (or negative)? Ignore it */ | ||
565 | if (nr_map < 2) | ||
566 | return -1; | ||
567 | |||
568 | do { | ||
569 | unsigned long start = biosmap->addr; | ||
570 | unsigned long size = biosmap->size; | ||
571 | unsigned long end = start + size; | ||
572 | unsigned long type = biosmap->type; | ||
573 | |||
574 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
575 | if (start > end) | ||
576 | return -1; | ||
577 | |||
578 | add_memory_region(start, size, type); | ||
579 | } while (biosmap++,--nr_map); | ||
580 | return 0; | ||
581 | } | ||
582 | |||
583 | void early_panic(char *msg) | ||
584 | { | ||
585 | early_printk(msg); | ||
586 | panic(msg); | ||
587 | } | ||
588 | |||
589 | void __init setup_memory_region(void) | ||
590 | { | ||
591 | /* | ||
592 | * Try to copy the BIOS-supplied E820-map. | ||
593 | * | ||
594 | * Otherwise fake a memory map; one section from 0k->640k, | ||
595 | * the next section from 1mb->appropriate_mem_k | ||
596 | */ | ||
597 | sanitize_e820_map(E820_MAP, &E820_MAP_NR); | ||
598 | if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) | ||
599 | early_panic("Cannot find a valid memory map"); | ||
600 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
601 | e820_print_map("BIOS-e820"); | ||
602 | } | ||
603 | |||
604 | static int __init parse_memopt(char *p) | ||
605 | { | ||
606 | if (!p) | ||
607 | return -EINVAL; | ||
608 | end_user_pfn = memparse(p, &p); | ||
609 | end_user_pfn >>= PAGE_SHIFT; | ||
610 | return 0; | ||
611 | } | ||
612 | early_param("mem", parse_memopt); | ||
613 | |||
614 | static int userdef __initdata; | ||
615 | |||
616 | static int __init parse_memmap_opt(char *p) | ||
617 | { | ||
618 | char *oldp; | ||
619 | unsigned long long start_at, mem_size; | ||
620 | |||
621 | if (!strcmp(p, "exactmap")) { | ||
622 | #ifdef CONFIG_CRASH_DUMP | ||
623 | /* If we are doing a crash dump, we | ||
624 | * still need to know the real mem | ||
625 | * size before original memory map is | ||
626 | * reset. | ||
627 | */ | ||
628 | e820_register_active_regions(0, 0, -1UL); | ||
629 | saved_max_pfn = e820_end_of_ram(); | ||
630 | remove_all_active_ranges(); | ||
631 | #endif | ||
632 | end_pfn_map = 0; | ||
633 | e820.nr_map = 0; | ||
634 | userdef = 1; | ||
635 | return 0; | ||
636 | } | ||
637 | |||
638 | oldp = p; | ||
639 | mem_size = memparse(p, &p); | ||
640 | if (p == oldp) | ||
641 | return -EINVAL; | ||
642 | if (*p == '@') { | ||
643 | start_at = memparse(p+1, &p); | ||
644 | add_memory_region(start_at, mem_size, E820_RAM); | ||
645 | } else if (*p == '#') { | ||
646 | start_at = memparse(p+1, &p); | ||
647 | add_memory_region(start_at, mem_size, E820_ACPI); | ||
648 | } else if (*p == '$') { | ||
649 | start_at = memparse(p+1, &p); | ||
650 | add_memory_region(start_at, mem_size, E820_RESERVED); | ||
651 | } else { | ||
652 | end_user_pfn = (mem_size >> PAGE_SHIFT); | ||
653 | } | ||
654 | return *p == '\0' ? 0 : -EINVAL; | ||
655 | } | ||
656 | early_param("memmap", parse_memmap_opt); | ||
657 | |||
658 | void __init finish_e820_parsing(void) | ||
659 | { | ||
660 | if (userdef) { | ||
661 | printk(KERN_INFO "user-defined physical RAM map:\n"); | ||
662 | e820_print_map("user"); | ||
663 | } | ||
664 | } | ||
665 | |||
666 | unsigned long pci_mem_start = 0xaeedbabe; | ||
667 | EXPORT_SYMBOL(pci_mem_start); | ||
668 | |||
669 | /* | ||
670 | * Search for the biggest gap in the low 32 bits of the e820 | ||
671 | * memory space. We pass this space to PCI to assign MMIO resources | ||
672 | * for hotplug or unconfigured devices in. | ||
673 | * Hopefully the BIOS let enough space left. | ||
674 | */ | ||
675 | __init void e820_setup_gap(void) | ||
676 | { | ||
677 | unsigned long gapstart, gapsize, round; | ||
678 | unsigned long last; | ||
679 | int i; | ||
680 | int found = 0; | ||
681 | |||
682 | last = 0x100000000ull; | ||
683 | gapstart = 0x10000000; | ||
684 | gapsize = 0x400000; | ||
685 | i = e820.nr_map; | ||
686 | while (--i >= 0) { | ||
687 | unsigned long long start = e820.map[i].addr; | ||
688 | unsigned long long end = start + e820.map[i].size; | ||
689 | |||
690 | /* | ||
691 | * Since "last" is at most 4GB, we know we'll | ||
692 | * fit in 32 bits if this condition is true | ||
693 | */ | ||
694 | if (last > end) { | ||
695 | unsigned long gap = last - end; | ||
696 | |||
697 | if (gap > gapsize) { | ||
698 | gapsize = gap; | ||
699 | gapstart = end; | ||
700 | found = 1; | ||
701 | } | ||
702 | } | ||
703 | if (start < last) | ||
704 | last = start; | ||
705 | } | ||
706 | |||
707 | if (!found) { | ||
708 | gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; | ||
709 | printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" | ||
710 | KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); | ||
711 | } | ||
712 | |||
713 | /* | ||
714 | * See how much we want to round up: start off with | ||
715 | * rounding to the next 1MB area. | ||
716 | */ | ||
717 | round = 0x100000; | ||
718 | while ((gapsize >> 4) > round) | ||
719 | round += round; | ||
720 | /* Fun with two's complement */ | ||
721 | pci_mem_start = (gapstart + round) & -round; | ||
722 | |||
723 | printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", | ||
724 | pci_mem_start, gapstart, gapsize); | ||
725 | } | ||
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c deleted file mode 100644 index 13aa4fd728f3..000000000000 --- a/arch/x86_64/kernel/early-quirks.c +++ /dev/null | |||
@@ -1,127 +0,0 @@ | |||
1 | /* Various workarounds for chipset bugs. | ||
2 | This code runs very early and can't use the regular PCI subsystem | ||
3 | The entries are keyed to PCI bridges which usually identify chipsets | ||
4 | uniquely. | ||
5 | This is only for whole classes of chipsets with specific problems which | ||
6 | need early invasive action (e.g. before the timers are initialized). | ||
7 | Most PCI device specific workarounds can be done later and should be | ||
8 | in standard PCI quirks | ||
9 | Mainboard specific bugs should be handled by DMI entries. | ||
10 | CPU specific bugs in setup.c */ | ||
11 | |||
12 | #include <linux/pci.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/pci_ids.h> | ||
15 | #include <asm/pci-direct.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/iommu.h> | ||
18 | #include <asm/dma.h> | ||
19 | |||
20 | static void __init via_bugs(void) | ||
21 | { | ||
22 | #ifdef CONFIG_IOMMU | ||
23 | if ((end_pfn > MAX_DMA32_PFN || force_iommu) && | ||
24 | !iommu_aperture_allowed) { | ||
25 | printk(KERN_INFO | ||
26 | "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); | ||
27 | iommu_aperture_disabled = 1; | ||
28 | } | ||
29 | #endif | ||
30 | } | ||
31 | |||
32 | #ifdef CONFIG_ACPI | ||
33 | |||
34 | static int __init nvidia_hpet_check(struct acpi_table_header *header) | ||
35 | { | ||
36 | return 0; | ||
37 | } | ||
38 | #endif | ||
39 | |||
40 | static void __init nvidia_bugs(void) | ||
41 | { | ||
42 | #ifdef CONFIG_ACPI | ||
43 | /* | ||
44 | * All timer overrides on Nvidia are | ||
45 | * wrong unless HPET is enabled. | ||
46 | * Unfortunately that's not true on many Asus boards. | ||
47 | * We don't know yet how to detect this automatically, but | ||
48 | * at least allow a command line override. | ||
49 | */ | ||
50 | if (acpi_use_timer_override) | ||
51 | return; | ||
52 | |||
53 | if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) { | ||
54 | acpi_skip_timer_override = 1; | ||
55 | printk(KERN_INFO "Nvidia board " | ||
56 | "detected. Ignoring ACPI " | ||
57 | "timer override.\n"); | ||
58 | printk(KERN_INFO "If you got timer trouble " | ||
59 | "try acpi_use_timer_override\n"); | ||
60 | } | ||
61 | #endif | ||
62 | /* RED-PEN skip them on mptables too? */ | ||
63 | |||
64 | } | ||
65 | |||
66 | static void __init ati_bugs(void) | ||
67 | { | ||
68 | if (timer_over_8254 == 1) { | ||
69 | timer_over_8254 = 0; | ||
70 | printk(KERN_INFO | ||
71 | "ATI board detected. Disabling timer routing over 8254.\n"); | ||
72 | } | ||
73 | } | ||
74 | |||
75 | struct chipset { | ||
76 | u16 vendor; | ||
77 | void (*f)(void); | ||
78 | }; | ||
79 | |||
80 | static struct chipset early_qrk[] __initdata = { | ||
81 | { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, | ||
82 | { PCI_VENDOR_ID_VIA, via_bugs }, | ||
83 | { PCI_VENDOR_ID_ATI, ati_bugs }, | ||
84 | {} | ||
85 | }; | ||
86 | |||
87 | void __init early_quirks(void) | ||
88 | { | ||
89 | int num, slot, func; | ||
90 | |||
91 | if (!early_pci_allowed()) | ||
92 | return; | ||
93 | |||
94 | /* Poor man's PCI discovery */ | ||
95 | for (num = 0; num < 32; num++) { | ||
96 | for (slot = 0; slot < 32; slot++) { | ||
97 | for (func = 0; func < 8; func++) { | ||
98 | u32 class; | ||
99 | u32 vendor; | ||
100 | u8 type; | ||
101 | int i; | ||
102 | class = read_pci_config(num,slot,func, | ||
103 | PCI_CLASS_REVISION); | ||
104 | if (class == 0xffffffff) | ||
105 | break; | ||
106 | |||
107 | if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) | ||
108 | continue; | ||
109 | |||
110 | vendor = read_pci_config(num, slot, func, | ||
111 | PCI_VENDOR_ID); | ||
112 | vendor &= 0xffff; | ||
113 | |||
114 | for (i = 0; early_qrk[i].f; i++) | ||
115 | if (early_qrk[i].vendor == vendor) { | ||
116 | early_qrk[i].f(); | ||
117 | return; | ||
118 | } | ||
119 | |||
120 | type = read_pci_config_byte(num, slot, func, | ||
121 | PCI_HEADER_TYPE); | ||
122 | if (!(type & 0x80)) | ||
123 | break; | ||
124 | } | ||
125 | } | ||
126 | } | ||
127 | } | ||
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c deleted file mode 100644 index fd9aff3f3890..000000000000 --- a/arch/x86_64/kernel/early_printk.c +++ /dev/null | |||
@@ -1,259 +0,0 @@ | |||
1 | #include <linux/console.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <linux/screen_info.h> | ||
6 | #include <asm/io.h> | ||
7 | #include <asm/processor.h> | ||
8 | #include <asm/fcntl.h> | ||
9 | #include <xen/hvc-console.h> | ||
10 | |||
11 | /* Simple VGA output */ | ||
12 | |||
13 | #ifdef __i386__ | ||
14 | #include <asm/setup.h> | ||
15 | #else | ||
16 | #include <asm/bootsetup.h> | ||
17 | #endif | ||
18 | #define VGABASE (__ISA_IO_base + 0xb8000) | ||
19 | |||
20 | static int max_ypos = 25, max_xpos = 80; | ||
21 | static int current_ypos = 25, current_xpos = 0; | ||
22 | |||
23 | static void early_vga_write(struct console *con, const char *str, unsigned n) | ||
24 | { | ||
25 | char c; | ||
26 | int i, k, j; | ||
27 | |||
28 | while ((c = *str++) != '\0' && n-- > 0) { | ||
29 | if (current_ypos >= max_ypos) { | ||
30 | /* scroll 1 line up */ | ||
31 | for (k = 1, j = 0; k < max_ypos; k++, j++) { | ||
32 | for (i = 0; i < max_xpos; i++) { | ||
33 | writew(readw(VGABASE+2*(max_xpos*k+i)), | ||
34 | VGABASE + 2*(max_xpos*j + i)); | ||
35 | } | ||
36 | } | ||
37 | for (i = 0; i < max_xpos; i++) | ||
38 | writew(0x720, VGABASE + 2*(max_xpos*j + i)); | ||
39 | current_ypos = max_ypos-1; | ||
40 | } | ||
41 | if (c == '\n') { | ||
42 | current_xpos = 0; | ||
43 | current_ypos++; | ||
44 | } else if (c != '\r') { | ||
45 | writew(((0x7 << 8) | (unsigned short) c), | ||
46 | VGABASE + 2*(max_xpos*current_ypos + | ||
47 | current_xpos++)); | ||
48 | if (current_xpos >= max_xpos) { | ||
49 | current_xpos = 0; | ||
50 | current_ypos++; | ||
51 | } | ||
52 | } | ||
53 | } | ||
54 | } | ||
55 | |||
56 | static struct console early_vga_console = { | ||
57 | .name = "earlyvga", | ||
58 | .write = early_vga_write, | ||
59 | .flags = CON_PRINTBUFFER, | ||
60 | .index = -1, | ||
61 | }; | ||
62 | |||
63 | /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ | ||
64 | |||
65 | static int early_serial_base = 0x3f8; /* ttyS0 */ | ||
66 | |||
67 | #define XMTRDY 0x20 | ||
68 | |||
69 | #define DLAB 0x80 | ||
70 | |||
71 | #define TXR 0 /* Transmit register (WRITE) */ | ||
72 | #define RXR 0 /* Receive register (READ) */ | ||
73 | #define IER 1 /* Interrupt Enable */ | ||
74 | #define IIR 2 /* Interrupt ID */ | ||
75 | #define FCR 2 /* FIFO control */ | ||
76 | #define LCR 3 /* Line control */ | ||
77 | #define MCR 4 /* Modem control */ | ||
78 | #define LSR 5 /* Line Status */ | ||
79 | #define MSR 6 /* Modem Status */ | ||
80 | #define DLL 0 /* Divisor Latch Low */ | ||
81 | #define DLH 1 /* Divisor latch High */ | ||
82 | |||
83 | static int early_serial_putc(unsigned char ch) | ||
84 | { | ||
85 | unsigned timeout = 0xffff; | ||
86 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) | ||
87 | cpu_relax(); | ||
88 | outb(ch, early_serial_base + TXR); | ||
89 | return timeout ? 0 : -1; | ||
90 | } | ||
91 | |||
92 | static void early_serial_write(struct console *con, const char *s, unsigned n) | ||
93 | { | ||
94 | while (*s && n-- > 0) { | ||
95 | if (*s == '\n') | ||
96 | early_serial_putc('\r'); | ||
97 | early_serial_putc(*s); | ||
98 | s++; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | #define DEFAULT_BAUD 9600 | ||
103 | |||
104 | static __init void early_serial_init(char *s) | ||
105 | { | ||
106 | unsigned char c; | ||
107 | unsigned divisor; | ||
108 | unsigned baud = DEFAULT_BAUD; | ||
109 | char *e; | ||
110 | |||
111 | if (*s == ',') | ||
112 | ++s; | ||
113 | |||
114 | if (*s) { | ||
115 | unsigned port; | ||
116 | if (!strncmp(s,"0x",2)) { | ||
117 | early_serial_base = simple_strtoul(s, &e, 16); | ||
118 | } else { | ||
119 | static int bases[] = { 0x3f8, 0x2f8 }; | ||
120 | |||
121 | if (!strncmp(s,"ttyS",4)) | ||
122 | s += 4; | ||
123 | port = simple_strtoul(s, &e, 10); | ||
124 | if (port > 1 || s == e) | ||
125 | port = 0; | ||
126 | early_serial_base = bases[port]; | ||
127 | } | ||
128 | s += strcspn(s, ","); | ||
129 | if (*s == ',') | ||
130 | s++; | ||
131 | } | ||
132 | |||
133 | outb(0x3, early_serial_base + LCR); /* 8n1 */ | ||
134 | outb(0, early_serial_base + IER); /* no interrupt */ | ||
135 | outb(0, early_serial_base + FCR); /* no fifo */ | ||
136 | outb(0x3, early_serial_base + MCR); /* DTR + RTS */ | ||
137 | |||
138 | if (*s) { | ||
139 | baud = simple_strtoul(s, &e, 0); | ||
140 | if (baud == 0 || s == e) | ||
141 | baud = DEFAULT_BAUD; | ||
142 | } | ||
143 | |||
144 | divisor = 115200 / baud; | ||
145 | c = inb(early_serial_base + LCR); | ||
146 | outb(c | DLAB, early_serial_base + LCR); | ||
147 | outb(divisor & 0xff, early_serial_base + DLL); | ||
148 | outb((divisor >> 8) & 0xff, early_serial_base + DLH); | ||
149 | outb(c & ~DLAB, early_serial_base + LCR); | ||
150 | } | ||
151 | |||
152 | static struct console early_serial_console = { | ||
153 | .name = "earlyser", | ||
154 | .write = early_serial_write, | ||
155 | .flags = CON_PRINTBUFFER, | ||
156 | .index = -1, | ||
157 | }; | ||
158 | |||
159 | /* Console interface to a host file on AMD's SimNow! */ | ||
160 | |||
161 | static int simnow_fd; | ||
162 | |||
163 | enum { | ||
164 | MAGIC1 = 0xBACCD00A, | ||
165 | MAGIC2 = 0xCA110000, | ||
166 | XOPEN = 5, | ||
167 | XWRITE = 4, | ||
168 | }; | ||
169 | |||
170 | static noinline long simnow(long cmd, long a, long b, long c) | ||
171 | { | ||
172 | long ret; | ||
173 | asm volatile("cpuid" : | ||
174 | "=a" (ret) : | ||
175 | "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); | ||
176 | return ret; | ||
177 | } | ||
178 | |||
179 | static void __init simnow_init(char *str) | ||
180 | { | ||
181 | char *fn = "klog"; | ||
182 | if (*str == '=') | ||
183 | fn = ++str; | ||
184 | /* error ignored */ | ||
185 | simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); | ||
186 | } | ||
187 | |||
188 | static void simnow_write(struct console *con, const char *s, unsigned n) | ||
189 | { | ||
190 | simnow(XWRITE, simnow_fd, (unsigned long)s, n); | ||
191 | } | ||
192 | |||
193 | static struct console simnow_console = { | ||
194 | .name = "simnow", | ||
195 | .write = simnow_write, | ||
196 | .flags = CON_PRINTBUFFER, | ||
197 | .index = -1, | ||
198 | }; | ||
199 | |||
200 | /* Direct interface for emergencies */ | ||
201 | struct console *early_console = &early_vga_console; | ||
202 | static int early_console_initialized = 0; | ||
203 | |||
204 | void early_printk(const char *fmt, ...) | ||
205 | { | ||
206 | char buf[512]; | ||
207 | int n; | ||
208 | va_list ap; | ||
209 | |||
210 | va_start(ap,fmt); | ||
211 | n = vscnprintf(buf,512,fmt,ap); | ||
212 | early_console->write(early_console,buf,n); | ||
213 | va_end(ap); | ||
214 | } | ||
215 | |||
216 | static int __initdata keep_early; | ||
217 | |||
218 | static int __init setup_early_printk(char *buf) | ||
219 | { | ||
220 | if (!buf) | ||
221 | return 0; | ||
222 | |||
223 | if (early_console_initialized) | ||
224 | return 0; | ||
225 | early_console_initialized = 1; | ||
226 | |||
227 | if (strstr(buf, "keep")) | ||
228 | keep_early = 1; | ||
229 | |||
230 | if (!strncmp(buf, "serial", 6)) { | ||
231 | early_serial_init(buf + 6); | ||
232 | early_console = &early_serial_console; | ||
233 | } else if (!strncmp(buf, "ttyS", 4)) { | ||
234 | early_serial_init(buf); | ||
235 | early_console = &early_serial_console; | ||
236 | } else if (!strncmp(buf, "vga", 3) | ||
237 | && SCREEN_INFO.orig_video_isVGA == 1) { | ||
238 | max_xpos = SCREEN_INFO.orig_video_cols; | ||
239 | max_ypos = SCREEN_INFO.orig_video_lines; | ||
240 | current_ypos = SCREEN_INFO.orig_y; | ||
241 | early_console = &early_vga_console; | ||
242 | } else if (!strncmp(buf, "simnow", 6)) { | ||
243 | simnow_init(buf + 6); | ||
244 | early_console = &simnow_console; | ||
245 | keep_early = 1; | ||
246 | #ifdef CONFIG_HVC_XEN | ||
247 | } else if (!strncmp(buf, "xen", 3)) { | ||
248 | early_console = &xenboot_console; | ||
249 | #endif | ||
250 | } | ||
251 | |||
252 | if (keep_early) | ||
253 | early_console->flags &= ~CON_BOOT; | ||
254 | else | ||
255 | early_console->flags |= CON_BOOT; | ||
256 | register_console(early_console); | ||
257 | return 0; | ||
258 | } | ||
259 | early_param("earlyprintk", setup_early_printk); | ||
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S deleted file mode 100644 index 1d232e5f5658..000000000000 --- a/arch/x86_64/kernel/entry.S +++ /dev/null | |||
@@ -1,1172 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/entry.S | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * entry.S contains the system-call and fault low-level handling routines. | ||
11 | * | ||
12 | * NOTE: This code handles signal-recognition, which happens every time | ||
13 | * after an interrupt and after each system call. | ||
14 | * | ||
15 | * Normal syscalls and interrupts don't save a full stack frame, this is | ||
16 | * only done for syscall tracing, signals or fork/exec et.al. | ||
17 | * | ||
18 | * A note on terminology: | ||
19 | * - top of stack: Architecture defined interrupt frame from SS to RIP | ||
20 | * at the top of the kernel process stack. | ||
21 | * - partial stack frame: partially saved registers upto R11. | ||
22 | * - full stack frame: Like partial stack frame, but all register saved. | ||
23 | * | ||
24 | * Some macro usage: | ||
25 | * - CFI macros are used to generate dwarf2 unwind information for better | ||
26 | * backtraces. They don't change any code. | ||
27 | * - SAVE_ALL/RESTORE_ALL - Save/restore all registers | ||
28 | * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. | ||
29 | * There are unfortunately lots of special cases where some registers | ||
30 | * not touched. The macro is a big mess that should be cleaned up. | ||
31 | * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. | ||
32 | * Gives a full stack frame. | ||
33 | * - ENTRY/END Define functions in the symbol table. | ||
34 | * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack | ||
35 | * frame that is otherwise undefined after a SYSCALL | ||
36 | * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. | ||
37 | * - errorentry/paranoidentry/zeroentry - Define exception entry points. | ||
38 | */ | ||
39 | |||
40 | #include <linux/linkage.h> | ||
41 | #include <asm/segment.h> | ||
42 | #include <asm/cache.h> | ||
43 | #include <asm/errno.h> | ||
44 | #include <asm/dwarf2.h> | ||
45 | #include <asm/calling.h> | ||
46 | #include <asm/asm-offsets.h> | ||
47 | #include <asm/msr.h> | ||
48 | #include <asm/unistd.h> | ||
49 | #include <asm/thread_info.h> | ||
50 | #include <asm/hw_irq.h> | ||
51 | #include <asm/page.h> | ||
52 | #include <asm/irqflags.h> | ||
53 | |||
54 | .code64 | ||
55 | |||
56 | #ifndef CONFIG_PREEMPT | ||
57 | #define retint_kernel retint_restore_args | ||
58 | #endif | ||
59 | |||
60 | |||
61 | .macro TRACE_IRQS_IRETQ offset=ARGOFFSET | ||
62 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
63 | bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ | ||
64 | jnc 1f | ||
65 | TRACE_IRQS_ON | ||
66 | 1: | ||
67 | #endif | ||
68 | .endm | ||
69 | |||
70 | /* | ||
71 | * C code is not supposed to know about undefined top of stack. Every time | ||
72 | * a C function with an pt_regs argument is called from the SYSCALL based | ||
73 | * fast path FIXUP_TOP_OF_STACK is needed. | ||
74 | * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | ||
75 | * manipulation. | ||
76 | */ | ||
77 | |||
78 | /* %rsp:at FRAMEEND */ | ||
79 | .macro FIXUP_TOP_OF_STACK tmp | ||
80 | movq %gs:pda_oldrsp,\tmp | ||
81 | movq \tmp,RSP(%rsp) | ||
82 | movq $__USER_DS,SS(%rsp) | ||
83 | movq $__USER_CS,CS(%rsp) | ||
84 | movq $-1,RCX(%rsp) | ||
85 | movq R11(%rsp),\tmp /* get eflags */ | ||
86 | movq \tmp,EFLAGS(%rsp) | ||
87 | .endm | ||
88 | |||
89 | .macro RESTORE_TOP_OF_STACK tmp,offset=0 | ||
90 | movq RSP-\offset(%rsp),\tmp | ||
91 | movq \tmp,%gs:pda_oldrsp | ||
92 | movq EFLAGS-\offset(%rsp),\tmp | ||
93 | movq \tmp,R11-\offset(%rsp) | ||
94 | .endm | ||
95 | |||
96 | .macro FAKE_STACK_FRAME child_rip | ||
97 | /* push in order ss, rsp, eflags, cs, rip */ | ||
98 | xorl %eax, %eax | ||
99 | pushq %rax /* ss */ | ||
100 | CFI_ADJUST_CFA_OFFSET 8 | ||
101 | /*CFI_REL_OFFSET ss,0*/ | ||
102 | pushq %rax /* rsp */ | ||
103 | CFI_ADJUST_CFA_OFFSET 8 | ||
104 | CFI_REL_OFFSET rsp,0 | ||
105 | pushq $(1<<9) /* eflags - interrupts on */ | ||
106 | CFI_ADJUST_CFA_OFFSET 8 | ||
107 | /*CFI_REL_OFFSET rflags,0*/ | ||
108 | pushq $__KERNEL_CS /* cs */ | ||
109 | CFI_ADJUST_CFA_OFFSET 8 | ||
110 | /*CFI_REL_OFFSET cs,0*/ | ||
111 | pushq \child_rip /* rip */ | ||
112 | CFI_ADJUST_CFA_OFFSET 8 | ||
113 | CFI_REL_OFFSET rip,0 | ||
114 | pushq %rax /* orig rax */ | ||
115 | CFI_ADJUST_CFA_OFFSET 8 | ||
116 | .endm | ||
117 | |||
118 | .macro UNFAKE_STACK_FRAME | ||
119 | addq $8*6, %rsp | ||
120 | CFI_ADJUST_CFA_OFFSET -(6*8) | ||
121 | .endm | ||
122 | |||
123 | .macro CFI_DEFAULT_STACK start=1 | ||
124 | .if \start | ||
125 | CFI_STARTPROC simple | ||
126 | CFI_SIGNAL_FRAME | ||
127 | CFI_DEF_CFA rsp,SS+8 | ||
128 | .else | ||
129 | CFI_DEF_CFA_OFFSET SS+8 | ||
130 | .endif | ||
131 | CFI_REL_OFFSET r15,R15 | ||
132 | CFI_REL_OFFSET r14,R14 | ||
133 | CFI_REL_OFFSET r13,R13 | ||
134 | CFI_REL_OFFSET r12,R12 | ||
135 | CFI_REL_OFFSET rbp,RBP | ||
136 | CFI_REL_OFFSET rbx,RBX | ||
137 | CFI_REL_OFFSET r11,R11 | ||
138 | CFI_REL_OFFSET r10,R10 | ||
139 | CFI_REL_OFFSET r9,R9 | ||
140 | CFI_REL_OFFSET r8,R8 | ||
141 | CFI_REL_OFFSET rax,RAX | ||
142 | CFI_REL_OFFSET rcx,RCX | ||
143 | CFI_REL_OFFSET rdx,RDX | ||
144 | CFI_REL_OFFSET rsi,RSI | ||
145 | CFI_REL_OFFSET rdi,RDI | ||
146 | CFI_REL_OFFSET rip,RIP | ||
147 | /*CFI_REL_OFFSET cs,CS*/ | ||
148 | /*CFI_REL_OFFSET rflags,EFLAGS*/ | ||
149 | CFI_REL_OFFSET rsp,RSP | ||
150 | /*CFI_REL_OFFSET ss,SS*/ | ||
151 | .endm | ||
152 | /* | ||
153 | * A newly forked process directly context switches into this. | ||
154 | */ | ||
155 | /* rdi: prev */ | ||
156 | ENTRY(ret_from_fork) | ||
157 | CFI_DEFAULT_STACK | ||
158 | push kernel_eflags(%rip) | ||
159 | CFI_ADJUST_CFA_OFFSET 4 | ||
160 | popf # reset kernel eflags | ||
161 | CFI_ADJUST_CFA_OFFSET -4 | ||
162 | call schedule_tail | ||
163 | GET_THREAD_INFO(%rcx) | ||
164 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) | ||
165 | jnz rff_trace | ||
166 | rff_action: | ||
167 | RESTORE_REST | ||
168 | testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? | ||
169 | je int_ret_from_sys_call | ||
170 | testl $_TIF_IA32,threadinfo_flags(%rcx) | ||
171 | jnz int_ret_from_sys_call | ||
172 | RESTORE_TOP_OF_STACK %rdi,ARGOFFSET | ||
173 | jmp ret_from_sys_call | ||
174 | rff_trace: | ||
175 | movq %rsp,%rdi | ||
176 | call syscall_trace_leave | ||
177 | GET_THREAD_INFO(%rcx) | ||
178 | jmp rff_action | ||
179 | CFI_ENDPROC | ||
180 | END(ret_from_fork) | ||
181 | |||
182 | /* | ||
183 | * System call entry. Upto 6 arguments in registers are supported. | ||
184 | * | ||
185 | * SYSCALL does not save anything on the stack and does not change the | ||
186 | * stack pointer. | ||
187 | */ | ||
188 | |||
189 | /* | ||
190 | * Register setup: | ||
191 | * rax system call number | ||
192 | * rdi arg0 | ||
193 | * rcx return address for syscall/sysret, C arg3 | ||
194 | * rsi arg1 | ||
195 | * rdx arg2 | ||
196 | * r10 arg3 (--> moved to rcx for C) | ||
197 | * r8 arg4 | ||
198 | * r9 arg5 | ||
199 | * r11 eflags for syscall/sysret, temporary for C | ||
200 | * r12-r15,rbp,rbx saved by C code, not touched. | ||
201 | * | ||
202 | * Interrupts are off on entry. | ||
203 | * Only called from user space. | ||
204 | * | ||
205 | * XXX if we had a free scratch register we could save the RSP into the stack frame | ||
206 | * and report it properly in ps. Unfortunately we haven't. | ||
207 | * | ||
208 | * When user can change the frames always force IRET. That is because | ||
209 | * it deals with uncanonical addresses better. SYSRET has trouble | ||
210 | * with them due to bugs in both AMD and Intel CPUs. | ||
211 | */ | ||
212 | |||
213 | ENTRY(system_call) | ||
214 | CFI_STARTPROC simple | ||
215 | CFI_SIGNAL_FRAME | ||
216 | CFI_DEF_CFA rsp,PDA_STACKOFFSET | ||
217 | CFI_REGISTER rip,rcx | ||
218 | /*CFI_REGISTER rflags,r11*/ | ||
219 | swapgs | ||
220 | movq %rsp,%gs:pda_oldrsp | ||
221 | movq %gs:pda_kernelstack,%rsp | ||
222 | /* | ||
223 | * No need to follow this irqs off/on section - it's straight | ||
224 | * and short: | ||
225 | */ | ||
226 | sti | ||
227 | SAVE_ARGS 8,1 | ||
228 | movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | ||
229 | movq %rcx,RIP-ARGOFFSET(%rsp) | ||
230 | CFI_REL_OFFSET rip,RIP-ARGOFFSET | ||
231 | GET_THREAD_INFO(%rcx) | ||
232 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) | ||
233 | jnz tracesys | ||
234 | cmpq $__NR_syscall_max,%rax | ||
235 | ja badsys | ||
236 | movq %r10,%rcx | ||
237 | call *sys_call_table(,%rax,8) # XXX: rip relative | ||
238 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
239 | /* | ||
240 | * Syscall return path ending with SYSRET (fast path) | ||
241 | * Has incomplete stack frame and undefined top of stack. | ||
242 | */ | ||
243 | ret_from_sys_call: | ||
244 | movl $_TIF_ALLWORK_MASK,%edi | ||
245 | /* edi: flagmask */ | ||
246 | sysret_check: | ||
247 | GET_THREAD_INFO(%rcx) | ||
248 | cli | ||
249 | TRACE_IRQS_OFF | ||
250 | movl threadinfo_flags(%rcx),%edx | ||
251 | andl %edi,%edx | ||
252 | jnz sysret_careful | ||
253 | CFI_REMEMBER_STATE | ||
254 | /* | ||
255 | * sysretq will re-enable interrupts: | ||
256 | */ | ||
257 | TRACE_IRQS_ON | ||
258 | movq RIP-ARGOFFSET(%rsp),%rcx | ||
259 | CFI_REGISTER rip,rcx | ||
260 | RESTORE_ARGS 0,-ARG_SKIP,1 | ||
261 | /*CFI_REGISTER rflags,r11*/ | ||
262 | movq %gs:pda_oldrsp,%rsp | ||
263 | swapgs | ||
264 | sysretq | ||
265 | |||
266 | CFI_RESTORE_STATE | ||
267 | /* Handle reschedules */ | ||
268 | /* edx: work, edi: workmask */ | ||
269 | sysret_careful: | ||
270 | bt $TIF_NEED_RESCHED,%edx | ||
271 | jnc sysret_signal | ||
272 | TRACE_IRQS_ON | ||
273 | sti | ||
274 | pushq %rdi | ||
275 | CFI_ADJUST_CFA_OFFSET 8 | ||
276 | call schedule | ||
277 | popq %rdi | ||
278 | CFI_ADJUST_CFA_OFFSET -8 | ||
279 | jmp sysret_check | ||
280 | |||
281 | /* Handle a signal */ | ||
282 | sysret_signal: | ||
283 | TRACE_IRQS_ON | ||
284 | sti | ||
285 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | ||
286 | jz 1f | ||
287 | |||
288 | /* Really a signal */ | ||
289 | /* edx: work flags (arg3) */ | ||
290 | leaq do_notify_resume(%rip),%rax | ||
291 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | ||
292 | xorl %esi,%esi # oldset -> arg2 | ||
293 | call ptregscall_common | ||
294 | 1: movl $_TIF_NEED_RESCHED,%edi | ||
295 | /* Use IRET because user could have changed frame. This | ||
296 | works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ | ||
297 | cli | ||
298 | TRACE_IRQS_OFF | ||
299 | jmp int_with_check | ||
300 | |||
301 | badsys: | ||
302 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | ||
303 | jmp ret_from_sys_call | ||
304 | |||
305 | /* Do syscall tracing */ | ||
306 | tracesys: | ||
307 | SAVE_REST | ||
308 | movq $-ENOSYS,RAX(%rsp) | ||
309 | FIXUP_TOP_OF_STACK %rdi | ||
310 | movq %rsp,%rdi | ||
311 | call syscall_trace_enter | ||
312 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
313 | RESTORE_REST | ||
314 | cmpq $__NR_syscall_max,%rax | ||
315 | movq $-ENOSYS,%rcx | ||
316 | cmova %rcx,%rax | ||
317 | ja 1f | ||
318 | movq %r10,%rcx /* fixup for C */ | ||
319 | call *sys_call_table(,%rax,8) | ||
320 | 1: movq %rax,RAX-ARGOFFSET(%rsp) | ||
321 | /* Use IRET because user could have changed frame */ | ||
322 | |||
323 | /* | ||
324 | * Syscall return path ending with IRET. | ||
325 | * Has correct top of stack, but partial stack frame. | ||
326 | */ | ||
327 | .globl int_ret_from_sys_call | ||
328 | int_ret_from_sys_call: | ||
329 | cli | ||
330 | TRACE_IRQS_OFF | ||
331 | testl $3,CS-ARGOFFSET(%rsp) | ||
332 | je retint_restore_args | ||
333 | movl $_TIF_ALLWORK_MASK,%edi | ||
334 | /* edi: mask to check */ | ||
335 | int_with_check: | ||
336 | GET_THREAD_INFO(%rcx) | ||
337 | movl threadinfo_flags(%rcx),%edx | ||
338 | andl %edi,%edx | ||
339 | jnz int_careful | ||
340 | andl $~TS_COMPAT,threadinfo_status(%rcx) | ||
341 | jmp retint_swapgs | ||
342 | |||
343 | /* Either reschedule or signal or syscall exit tracking needed. */ | ||
344 | /* First do a reschedule test. */ | ||
345 | /* edx: work, edi: workmask */ | ||
346 | int_careful: | ||
347 | bt $TIF_NEED_RESCHED,%edx | ||
348 | jnc int_very_careful | ||
349 | TRACE_IRQS_ON | ||
350 | sti | ||
351 | pushq %rdi | ||
352 | CFI_ADJUST_CFA_OFFSET 8 | ||
353 | call schedule | ||
354 | popq %rdi | ||
355 | CFI_ADJUST_CFA_OFFSET -8 | ||
356 | cli | ||
357 | TRACE_IRQS_OFF | ||
358 | jmp int_with_check | ||
359 | |||
360 | /* handle signals and tracing -- both require a full stack frame */ | ||
361 | int_very_careful: | ||
362 | TRACE_IRQS_ON | ||
363 | sti | ||
364 | SAVE_REST | ||
365 | /* Check for syscall exit trace */ | ||
366 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx | ||
367 | jz int_signal | ||
368 | pushq %rdi | ||
369 | CFI_ADJUST_CFA_OFFSET 8 | ||
370 | leaq 8(%rsp),%rdi # &ptregs -> arg1 | ||
371 | call syscall_trace_leave | ||
372 | popq %rdi | ||
373 | CFI_ADJUST_CFA_OFFSET -8 | ||
374 | andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi | ||
375 | jmp int_restore_rest | ||
376 | |||
377 | int_signal: | ||
378 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | ||
379 | jz 1f | ||
380 | movq %rsp,%rdi # &ptregs -> arg1 | ||
381 | xorl %esi,%esi # oldset -> arg2 | ||
382 | call do_notify_resume | ||
383 | 1: movl $_TIF_NEED_RESCHED,%edi | ||
384 | int_restore_rest: | ||
385 | RESTORE_REST | ||
386 | cli | ||
387 | TRACE_IRQS_OFF | ||
388 | jmp int_with_check | ||
389 | CFI_ENDPROC | ||
390 | END(system_call) | ||
391 | |||
392 | /* | ||
393 | * Certain special system calls that need to save a complete full stack frame. | ||
394 | */ | ||
395 | |||
396 | .macro PTREGSCALL label,func,arg | ||
397 | .globl \label | ||
398 | \label: | ||
399 | leaq \func(%rip),%rax | ||
400 | leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | ||
401 | jmp ptregscall_common | ||
402 | END(\label) | ||
403 | .endm | ||
404 | |||
405 | CFI_STARTPROC | ||
406 | |||
407 | PTREGSCALL stub_clone, sys_clone, %r8 | ||
408 | PTREGSCALL stub_fork, sys_fork, %rdi | ||
409 | PTREGSCALL stub_vfork, sys_vfork, %rdi | ||
410 | PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx | ||
411 | PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx | ||
412 | PTREGSCALL stub_iopl, sys_iopl, %rsi | ||
413 | |||
414 | ENTRY(ptregscall_common) | ||
415 | popq %r11 | ||
416 | CFI_ADJUST_CFA_OFFSET -8 | ||
417 | CFI_REGISTER rip, r11 | ||
418 | SAVE_REST | ||
419 | movq %r11, %r15 | ||
420 | CFI_REGISTER rip, r15 | ||
421 | FIXUP_TOP_OF_STACK %r11 | ||
422 | call *%rax | ||
423 | RESTORE_TOP_OF_STACK %r11 | ||
424 | movq %r15, %r11 | ||
425 | CFI_REGISTER rip, r11 | ||
426 | RESTORE_REST | ||
427 | pushq %r11 | ||
428 | CFI_ADJUST_CFA_OFFSET 8 | ||
429 | CFI_REL_OFFSET rip, 0 | ||
430 | ret | ||
431 | CFI_ENDPROC | ||
432 | END(ptregscall_common) | ||
433 | |||
434 | ENTRY(stub_execve) | ||
435 | CFI_STARTPROC | ||
436 | popq %r11 | ||
437 | CFI_ADJUST_CFA_OFFSET -8 | ||
438 | CFI_REGISTER rip, r11 | ||
439 | SAVE_REST | ||
440 | FIXUP_TOP_OF_STACK %r11 | ||
441 | call sys_execve | ||
442 | RESTORE_TOP_OF_STACK %r11 | ||
443 | movq %rax,RAX(%rsp) | ||
444 | RESTORE_REST | ||
445 | jmp int_ret_from_sys_call | ||
446 | CFI_ENDPROC | ||
447 | END(stub_execve) | ||
448 | |||
449 | /* | ||
450 | * sigreturn is special because it needs to restore all registers on return. | ||
451 | * This cannot be done with SYSRET, so use the IRET return path instead. | ||
452 | */ | ||
453 | ENTRY(stub_rt_sigreturn) | ||
454 | CFI_STARTPROC | ||
455 | addq $8, %rsp | ||
456 | CFI_ADJUST_CFA_OFFSET -8 | ||
457 | SAVE_REST | ||
458 | movq %rsp,%rdi | ||
459 | FIXUP_TOP_OF_STACK %r11 | ||
460 | call sys_rt_sigreturn | ||
461 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | ||
462 | RESTORE_REST | ||
463 | jmp int_ret_from_sys_call | ||
464 | CFI_ENDPROC | ||
465 | END(stub_rt_sigreturn) | ||
466 | |||
467 | /* | ||
468 | * initial frame state for interrupts and exceptions | ||
469 | */ | ||
470 | .macro _frame ref | ||
471 | CFI_STARTPROC simple | ||
472 | CFI_SIGNAL_FRAME | ||
473 | CFI_DEF_CFA rsp,SS+8-\ref | ||
474 | /*CFI_REL_OFFSET ss,SS-\ref*/ | ||
475 | CFI_REL_OFFSET rsp,RSP-\ref | ||
476 | /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ | ||
477 | /*CFI_REL_OFFSET cs,CS-\ref*/ | ||
478 | CFI_REL_OFFSET rip,RIP-\ref | ||
479 | .endm | ||
480 | |||
481 | /* initial frame state for interrupts (and exceptions without error code) */ | ||
482 | #define INTR_FRAME _frame RIP | ||
483 | /* initial frame state for exceptions with error code (and interrupts with | ||
484 | vector already pushed) */ | ||
485 | #define XCPT_FRAME _frame ORIG_RAX | ||
486 | |||
487 | /* | ||
488 | * Interrupt entry/exit. | ||
489 | * | ||
490 | * Interrupt entry points save only callee clobbered registers in fast path. | ||
491 | * | ||
492 | * Entry runs with interrupts off. | ||
493 | */ | ||
494 | |||
495 | /* 0(%rsp): interrupt number */ | ||
496 | .macro interrupt func | ||
497 | cld | ||
498 | SAVE_ARGS | ||
499 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler | ||
500 | pushq %rbp | ||
501 | CFI_ADJUST_CFA_OFFSET 8 | ||
502 | CFI_REL_OFFSET rbp, 0 | ||
503 | movq %rsp,%rbp | ||
504 | CFI_DEF_CFA_REGISTER rbp | ||
505 | testl $3,CS(%rdi) | ||
506 | je 1f | ||
507 | swapgs | ||
508 | /* irqcount is used to check if a CPU is already on an interrupt | ||
509 | stack or not. While this is essentially redundant with preempt_count | ||
510 | it is a little cheaper to use a separate counter in the PDA | ||
511 | (short of moving irq_enter into assembly, which would be too | ||
512 | much work) */ | ||
513 | 1: incl %gs:pda_irqcount | ||
514 | cmoveq %gs:pda_irqstackptr,%rsp | ||
515 | push %rbp # backlink for old unwinder | ||
516 | /* | ||
517 | * We entered an interrupt context - irqs are off: | ||
518 | */ | ||
519 | TRACE_IRQS_OFF | ||
520 | call \func | ||
521 | .endm | ||
522 | |||
523 | ENTRY(common_interrupt) | ||
524 | XCPT_FRAME | ||
525 | interrupt do_IRQ | ||
526 | /* 0(%rsp): oldrsp-ARGOFFSET */ | ||
527 | ret_from_intr: | ||
528 | cli | ||
529 | TRACE_IRQS_OFF | ||
530 | decl %gs:pda_irqcount | ||
531 | leaveq | ||
532 | CFI_DEF_CFA_REGISTER rsp | ||
533 | CFI_ADJUST_CFA_OFFSET -8 | ||
534 | exit_intr: | ||
535 | GET_THREAD_INFO(%rcx) | ||
536 | testl $3,CS-ARGOFFSET(%rsp) | ||
537 | je retint_kernel | ||
538 | |||
539 | /* Interrupt came from user space */ | ||
540 | /* | ||
541 | * Has a correct top of stack, but a partial stack frame | ||
542 | * %rcx: thread info. Interrupts off. | ||
543 | */ | ||
544 | retint_with_reschedule: | ||
545 | movl $_TIF_WORK_MASK,%edi | ||
546 | retint_check: | ||
547 | movl threadinfo_flags(%rcx),%edx | ||
548 | andl %edi,%edx | ||
549 | CFI_REMEMBER_STATE | ||
550 | jnz retint_careful | ||
551 | retint_swapgs: | ||
552 | /* | ||
553 | * The iretq could re-enable interrupts: | ||
554 | */ | ||
555 | cli | ||
556 | TRACE_IRQS_IRETQ | ||
557 | swapgs | ||
558 | jmp restore_args | ||
559 | |||
560 | retint_restore_args: | ||
561 | cli | ||
562 | /* | ||
563 | * The iretq could re-enable interrupts: | ||
564 | */ | ||
565 | TRACE_IRQS_IRETQ | ||
566 | restore_args: | ||
567 | RESTORE_ARGS 0,8,0 | ||
568 | iret_label: | ||
569 | iretq | ||
570 | |||
571 | .section __ex_table,"a" | ||
572 | .quad iret_label,bad_iret | ||
573 | .previous | ||
574 | .section .fixup,"ax" | ||
575 | /* force a signal here? this matches i386 behaviour */ | ||
576 | /* running with kernel gs */ | ||
577 | bad_iret: | ||
578 | movq $11,%rdi /* SIGSEGV */ | ||
579 | TRACE_IRQS_ON | ||
580 | sti | ||
581 | jmp do_exit | ||
582 | .previous | ||
583 | |||
584 | /* edi: workmask, edx: work */ | ||
585 | retint_careful: | ||
586 | CFI_RESTORE_STATE | ||
587 | bt $TIF_NEED_RESCHED,%edx | ||
588 | jnc retint_signal | ||
589 | TRACE_IRQS_ON | ||
590 | sti | ||
591 | pushq %rdi | ||
592 | CFI_ADJUST_CFA_OFFSET 8 | ||
593 | call schedule | ||
594 | popq %rdi | ||
595 | CFI_ADJUST_CFA_OFFSET -8 | ||
596 | GET_THREAD_INFO(%rcx) | ||
597 | cli | ||
598 | TRACE_IRQS_OFF | ||
599 | jmp retint_check | ||
600 | |||
601 | retint_signal: | ||
602 | testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx | ||
603 | jz retint_swapgs | ||
604 | TRACE_IRQS_ON | ||
605 | sti | ||
606 | SAVE_REST | ||
607 | movq $-1,ORIG_RAX(%rsp) | ||
608 | xorl %esi,%esi # oldset | ||
609 | movq %rsp,%rdi # &pt_regs | ||
610 | call do_notify_resume | ||
611 | RESTORE_REST | ||
612 | cli | ||
613 | TRACE_IRQS_OFF | ||
614 | movl $_TIF_NEED_RESCHED,%edi | ||
615 | GET_THREAD_INFO(%rcx) | ||
616 | jmp retint_check | ||
617 | |||
618 | #ifdef CONFIG_PREEMPT | ||
619 | /* Returning to kernel space. Check if we need preemption */ | ||
620 | /* rcx: threadinfo. interrupts off. */ | ||
621 | ENTRY(retint_kernel) | ||
622 | cmpl $0,threadinfo_preempt_count(%rcx) | ||
623 | jnz retint_restore_args | ||
624 | bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) | ||
625 | jnc retint_restore_args | ||
626 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | ||
627 | jnc retint_restore_args | ||
628 | call preempt_schedule_irq | ||
629 | jmp exit_intr | ||
630 | #endif | ||
631 | |||
632 | CFI_ENDPROC | ||
633 | END(common_interrupt) | ||
634 | |||
635 | /* | ||
636 | * APIC interrupts. | ||
637 | */ | ||
638 | .macro apicinterrupt num,func | ||
639 | INTR_FRAME | ||
640 | pushq $~(\num) | ||
641 | CFI_ADJUST_CFA_OFFSET 8 | ||
642 | interrupt \func | ||
643 | jmp ret_from_intr | ||
644 | CFI_ENDPROC | ||
645 | .endm | ||
646 | |||
647 | ENTRY(thermal_interrupt) | ||
648 | apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt | ||
649 | END(thermal_interrupt) | ||
650 | |||
651 | ENTRY(threshold_interrupt) | ||
652 | apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt | ||
653 | END(threshold_interrupt) | ||
654 | |||
655 | #ifdef CONFIG_SMP | ||
656 | ENTRY(reschedule_interrupt) | ||
657 | apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt | ||
658 | END(reschedule_interrupt) | ||
659 | |||
660 | .macro INVALIDATE_ENTRY num | ||
661 | ENTRY(invalidate_interrupt\num) | ||
662 | apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt | ||
663 | END(invalidate_interrupt\num) | ||
664 | .endm | ||
665 | |||
666 | INVALIDATE_ENTRY 0 | ||
667 | INVALIDATE_ENTRY 1 | ||
668 | INVALIDATE_ENTRY 2 | ||
669 | INVALIDATE_ENTRY 3 | ||
670 | INVALIDATE_ENTRY 4 | ||
671 | INVALIDATE_ENTRY 5 | ||
672 | INVALIDATE_ENTRY 6 | ||
673 | INVALIDATE_ENTRY 7 | ||
674 | |||
675 | ENTRY(call_function_interrupt) | ||
676 | apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt | ||
677 | END(call_function_interrupt) | ||
678 | ENTRY(irq_move_cleanup_interrupt) | ||
679 | apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt | ||
680 | END(irq_move_cleanup_interrupt) | ||
681 | #endif | ||
682 | |||
683 | ENTRY(apic_timer_interrupt) | ||
684 | apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt | ||
685 | END(apic_timer_interrupt) | ||
686 | |||
687 | ENTRY(error_interrupt) | ||
688 | apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt | ||
689 | END(error_interrupt) | ||
690 | |||
691 | ENTRY(spurious_interrupt) | ||
692 | apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt | ||
693 | END(spurious_interrupt) | ||
694 | |||
695 | /* | ||
696 | * Exception entry points. | ||
697 | */ | ||
698 | .macro zeroentry sym | ||
699 | INTR_FRAME | ||
700 | pushq $0 /* push error code/oldrax */ | ||
701 | CFI_ADJUST_CFA_OFFSET 8 | ||
702 | pushq %rax /* push real oldrax to the rdi slot */ | ||
703 | CFI_ADJUST_CFA_OFFSET 8 | ||
704 | CFI_REL_OFFSET rax,0 | ||
705 | leaq \sym(%rip),%rax | ||
706 | jmp error_entry | ||
707 | CFI_ENDPROC | ||
708 | .endm | ||
709 | |||
710 | .macro errorentry sym | ||
711 | XCPT_FRAME | ||
712 | pushq %rax | ||
713 | CFI_ADJUST_CFA_OFFSET 8 | ||
714 | CFI_REL_OFFSET rax,0 | ||
715 | leaq \sym(%rip),%rax | ||
716 | jmp error_entry | ||
717 | CFI_ENDPROC | ||
718 | .endm | ||
719 | |||
720 | /* error code is on the stack already */ | ||
721 | /* handle NMI like exceptions that can happen everywhere */ | ||
722 | .macro paranoidentry sym, ist=0, irqtrace=1 | ||
723 | SAVE_ALL | ||
724 | cld | ||
725 | movl $1,%ebx | ||
726 | movl $MSR_GS_BASE,%ecx | ||
727 | rdmsr | ||
728 | testl %edx,%edx | ||
729 | js 1f | ||
730 | swapgs | ||
731 | xorl %ebx,%ebx | ||
732 | 1: | ||
733 | .if \ist | ||
734 | movq %gs:pda_data_offset, %rbp | ||
735 | .endif | ||
736 | movq %rsp,%rdi | ||
737 | movq ORIG_RAX(%rsp),%rsi | ||
738 | movq $-1,ORIG_RAX(%rsp) | ||
739 | .if \ist | ||
740 | subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) | ||
741 | .endif | ||
742 | call \sym | ||
743 | .if \ist | ||
744 | addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) | ||
745 | .endif | ||
746 | cli | ||
747 | .if \irqtrace | ||
748 | TRACE_IRQS_OFF | ||
749 | .endif | ||
750 | .endm | ||
751 | |||
752 | /* | ||
753 | * "Paranoid" exit path from exception stack. | ||
754 | * Paranoid because this is used by NMIs and cannot take | ||
755 | * any kernel state for granted. | ||
756 | * We don't do kernel preemption checks here, because only | ||
757 | * NMI should be common and it does not enable IRQs and | ||
758 | * cannot get reschedule ticks. | ||
759 | * | ||
760 | * "trace" is 0 for the NMI handler only, because irq-tracing | ||
761 | * is fundamentally NMI-unsafe. (we cannot change the soft and | ||
762 | * hard flags at once, atomically) | ||
763 | */ | ||
764 | .macro paranoidexit trace=1 | ||
765 | /* ebx: no swapgs flag */ | ||
766 | paranoid_exit\trace: | ||
767 | testl %ebx,%ebx /* swapgs needed? */ | ||
768 | jnz paranoid_restore\trace | ||
769 | testl $3,CS(%rsp) | ||
770 | jnz paranoid_userspace\trace | ||
771 | paranoid_swapgs\trace: | ||
772 | .if \trace | ||
773 | TRACE_IRQS_IRETQ 0 | ||
774 | .endif | ||
775 | swapgs | ||
776 | paranoid_restore\trace: | ||
777 | RESTORE_ALL 8 | ||
778 | iretq | ||
779 | paranoid_userspace\trace: | ||
780 | GET_THREAD_INFO(%rcx) | ||
781 | movl threadinfo_flags(%rcx),%ebx | ||
782 | andl $_TIF_WORK_MASK,%ebx | ||
783 | jz paranoid_swapgs\trace | ||
784 | movq %rsp,%rdi /* &pt_regs */ | ||
785 | call sync_regs | ||
786 | movq %rax,%rsp /* switch stack for scheduling */ | ||
787 | testl $_TIF_NEED_RESCHED,%ebx | ||
788 | jnz paranoid_schedule\trace | ||
789 | movl %ebx,%edx /* arg3: thread flags */ | ||
790 | .if \trace | ||
791 | TRACE_IRQS_ON | ||
792 | .endif | ||
793 | sti | ||
794 | xorl %esi,%esi /* arg2: oldset */ | ||
795 | movq %rsp,%rdi /* arg1: &pt_regs */ | ||
796 | call do_notify_resume | ||
797 | cli | ||
798 | .if \trace | ||
799 | TRACE_IRQS_OFF | ||
800 | .endif | ||
801 | jmp paranoid_userspace\trace | ||
802 | paranoid_schedule\trace: | ||
803 | .if \trace | ||
804 | TRACE_IRQS_ON | ||
805 | .endif | ||
806 | sti | ||
807 | call schedule | ||
808 | cli | ||
809 | .if \trace | ||
810 | TRACE_IRQS_OFF | ||
811 | .endif | ||
812 | jmp paranoid_userspace\trace | ||
813 | CFI_ENDPROC | ||
814 | .endm | ||
815 | |||
816 | /* | ||
817 | * Exception entry point. This expects an error code/orig_rax on the stack | ||
818 | * and the exception handler in %rax. | ||
819 | */ | ||
820 | KPROBE_ENTRY(error_entry) | ||
821 | _frame RDI | ||
822 | CFI_REL_OFFSET rax,0 | ||
823 | /* rdi slot contains rax, oldrax contains error code */ | ||
824 | cld | ||
825 | subq $14*8,%rsp | ||
826 | CFI_ADJUST_CFA_OFFSET (14*8) | ||
827 | movq %rsi,13*8(%rsp) | ||
828 | CFI_REL_OFFSET rsi,RSI | ||
829 | movq 14*8(%rsp),%rsi /* load rax from rdi slot */ | ||
830 | CFI_REGISTER rax,rsi | ||
831 | movq %rdx,12*8(%rsp) | ||
832 | CFI_REL_OFFSET rdx,RDX | ||
833 | movq %rcx,11*8(%rsp) | ||
834 | CFI_REL_OFFSET rcx,RCX | ||
835 | movq %rsi,10*8(%rsp) /* store rax */ | ||
836 | CFI_REL_OFFSET rax,RAX | ||
837 | movq %r8, 9*8(%rsp) | ||
838 | CFI_REL_OFFSET r8,R8 | ||
839 | movq %r9, 8*8(%rsp) | ||
840 | CFI_REL_OFFSET r9,R9 | ||
841 | movq %r10,7*8(%rsp) | ||
842 | CFI_REL_OFFSET r10,R10 | ||
843 | movq %r11,6*8(%rsp) | ||
844 | CFI_REL_OFFSET r11,R11 | ||
845 | movq %rbx,5*8(%rsp) | ||
846 | CFI_REL_OFFSET rbx,RBX | ||
847 | movq %rbp,4*8(%rsp) | ||
848 | CFI_REL_OFFSET rbp,RBP | ||
849 | movq %r12,3*8(%rsp) | ||
850 | CFI_REL_OFFSET r12,R12 | ||
851 | movq %r13,2*8(%rsp) | ||
852 | CFI_REL_OFFSET r13,R13 | ||
853 | movq %r14,1*8(%rsp) | ||
854 | CFI_REL_OFFSET r14,R14 | ||
855 | movq %r15,(%rsp) | ||
856 | CFI_REL_OFFSET r15,R15 | ||
857 | xorl %ebx,%ebx | ||
858 | testl $3,CS(%rsp) | ||
859 | je error_kernelspace | ||
860 | error_swapgs: | ||
861 | swapgs | ||
862 | error_sti: | ||
863 | movq %rdi,RDI(%rsp) | ||
864 | CFI_REL_OFFSET rdi,RDI | ||
865 | movq %rsp,%rdi | ||
866 | movq ORIG_RAX(%rsp),%rsi /* get error code */ | ||
867 | movq $-1,ORIG_RAX(%rsp) | ||
868 | call *%rax | ||
869 | /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ | ||
870 | error_exit: | ||
871 | movl %ebx,%eax | ||
872 | RESTORE_REST | ||
873 | cli | ||
874 | TRACE_IRQS_OFF | ||
875 | GET_THREAD_INFO(%rcx) | ||
876 | testl %eax,%eax | ||
877 | jne retint_kernel | ||
878 | movl threadinfo_flags(%rcx),%edx | ||
879 | movl $_TIF_WORK_MASK,%edi | ||
880 | andl %edi,%edx | ||
881 | jnz retint_careful | ||
882 | /* | ||
883 | * The iret might restore flags: | ||
884 | */ | ||
885 | TRACE_IRQS_IRETQ | ||
886 | swapgs | ||
887 | RESTORE_ARGS 0,8,0 | ||
888 | jmp iret_label | ||
889 | CFI_ENDPROC | ||
890 | |||
891 | error_kernelspace: | ||
892 | incl %ebx | ||
893 | /* There are two places in the kernel that can potentially fault with | ||
894 | usergs. Handle them here. The exception handlers after | ||
895 | iret run with kernel gs again, so don't set the user space flag. | ||
896 | B stepping K8s sometimes report an truncated RIP for IRET | ||
897 | exceptions returning to compat mode. Check for these here too. */ | ||
898 | leaq iret_label(%rip),%rbp | ||
899 | cmpq %rbp,RIP(%rsp) | ||
900 | je error_swapgs | ||
901 | movl %ebp,%ebp /* zero extend */ | ||
902 | cmpq %rbp,RIP(%rsp) | ||
903 | je error_swapgs | ||
904 | cmpq $gs_change,RIP(%rsp) | ||
905 | je error_swapgs | ||
906 | jmp error_sti | ||
907 | KPROBE_END(error_entry) | ||
908 | |||
909 | /* Reload gs selector with exception handling */ | ||
910 | /* edi: new selector */ | ||
911 | ENTRY(load_gs_index) | ||
912 | CFI_STARTPROC | ||
913 | pushf | ||
914 | CFI_ADJUST_CFA_OFFSET 8 | ||
915 | cli | ||
916 | swapgs | ||
917 | gs_change: | ||
918 | movl %edi,%gs | ||
919 | 2: mfence /* workaround */ | ||
920 | swapgs | ||
921 | popf | ||
922 | CFI_ADJUST_CFA_OFFSET -8 | ||
923 | ret | ||
924 | CFI_ENDPROC | ||
925 | ENDPROC(load_gs_index) | ||
926 | |||
927 | .section __ex_table,"a" | ||
928 | .align 8 | ||
929 | .quad gs_change,bad_gs | ||
930 | .previous | ||
931 | .section .fixup,"ax" | ||
932 | /* running with kernelgs */ | ||
933 | bad_gs: | ||
934 | swapgs /* switch back to user gs */ | ||
935 | xorl %eax,%eax | ||
936 | movl %eax,%gs | ||
937 | jmp 2b | ||
938 | .previous | ||
939 | |||
940 | /* | ||
941 | * Create a kernel thread. | ||
942 | * | ||
943 | * C extern interface: | ||
944 | * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | ||
945 | * | ||
946 | * asm input arguments: | ||
947 | * rdi: fn, rsi: arg, rdx: flags | ||
948 | */ | ||
949 | ENTRY(kernel_thread) | ||
950 | CFI_STARTPROC | ||
951 | FAKE_STACK_FRAME $child_rip | ||
952 | SAVE_ALL | ||
953 | |||
954 | # rdi: flags, rsi: usp, rdx: will be &pt_regs | ||
955 | movq %rdx,%rdi | ||
956 | orq kernel_thread_flags(%rip),%rdi | ||
957 | movq $-1, %rsi | ||
958 | movq %rsp, %rdx | ||
959 | |||
960 | xorl %r8d,%r8d | ||
961 | xorl %r9d,%r9d | ||
962 | |||
963 | # clone now | ||
964 | call do_fork | ||
965 | movq %rax,RAX(%rsp) | ||
966 | xorl %edi,%edi | ||
967 | |||
968 | /* | ||
969 | * It isn't worth to check for reschedule here, | ||
970 | * so internally to the x86_64 port you can rely on kernel_thread() | ||
971 | * not to reschedule the child before returning, this avoids the need | ||
972 | * of hacks for example to fork off the per-CPU idle tasks. | ||
973 | * [Hopefully no generic code relies on the reschedule -AK] | ||
974 | */ | ||
975 | RESTORE_ALL | ||
976 | UNFAKE_STACK_FRAME | ||
977 | ret | ||
978 | CFI_ENDPROC | ||
979 | ENDPROC(kernel_thread) | ||
980 | |||
981 | child_rip: | ||
982 | pushq $0 # fake return address | ||
983 | CFI_STARTPROC | ||
984 | /* | ||
985 | * Here we are in the child and the registers are set as they were | ||
986 | * at kernel_thread() invocation in the parent. | ||
987 | */ | ||
988 | movq %rdi, %rax | ||
989 | movq %rsi, %rdi | ||
990 | call *%rax | ||
991 | # exit | ||
992 | xorl %edi, %edi | ||
993 | call do_exit | ||
994 | CFI_ENDPROC | ||
995 | ENDPROC(child_rip) | ||
996 | |||
997 | /* | ||
998 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. | ||
999 | * | ||
1000 | * C extern interface: | ||
1001 | * extern long execve(char *name, char **argv, char **envp) | ||
1002 | * | ||
1003 | * asm input arguments: | ||
1004 | * rdi: name, rsi: argv, rdx: envp | ||
1005 | * | ||
1006 | * We want to fallback into: | ||
1007 | * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) | ||
1008 | * | ||
1009 | * do_sys_execve asm fallback arguments: | ||
1010 | * rdi: name, rsi: argv, rdx: envp, fake frame on the stack | ||
1011 | */ | ||
1012 | ENTRY(kernel_execve) | ||
1013 | CFI_STARTPROC | ||
1014 | FAKE_STACK_FRAME $0 | ||
1015 | SAVE_ALL | ||
1016 | call sys_execve | ||
1017 | movq %rax, RAX(%rsp) | ||
1018 | RESTORE_REST | ||
1019 | testq %rax,%rax | ||
1020 | je int_ret_from_sys_call | ||
1021 | RESTORE_ARGS | ||
1022 | UNFAKE_STACK_FRAME | ||
1023 | ret | ||
1024 | CFI_ENDPROC | ||
1025 | ENDPROC(kernel_execve) | ||
1026 | |||
1027 | KPROBE_ENTRY(page_fault) | ||
1028 | errorentry do_page_fault | ||
1029 | KPROBE_END(page_fault) | ||
1030 | |||
1031 | ENTRY(coprocessor_error) | ||
1032 | zeroentry do_coprocessor_error | ||
1033 | END(coprocessor_error) | ||
1034 | |||
1035 | ENTRY(simd_coprocessor_error) | ||
1036 | zeroentry do_simd_coprocessor_error | ||
1037 | END(simd_coprocessor_error) | ||
1038 | |||
1039 | ENTRY(device_not_available) | ||
1040 | zeroentry math_state_restore | ||
1041 | END(device_not_available) | ||
1042 | |||
1043 | /* runs on exception stack */ | ||
1044 | KPROBE_ENTRY(debug) | ||
1045 | INTR_FRAME | ||
1046 | pushq $0 | ||
1047 | CFI_ADJUST_CFA_OFFSET 8 | ||
1048 | paranoidentry do_debug, DEBUG_STACK | ||
1049 | paranoidexit | ||
1050 | KPROBE_END(debug) | ||
1051 | |||
1052 | /* runs on exception stack */ | ||
1053 | KPROBE_ENTRY(nmi) | ||
1054 | INTR_FRAME | ||
1055 | pushq $-1 | ||
1056 | CFI_ADJUST_CFA_OFFSET 8 | ||
1057 | paranoidentry do_nmi, 0, 0 | ||
1058 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1059 | paranoidexit 0 | ||
1060 | #else | ||
1061 | jmp paranoid_exit1 | ||
1062 | CFI_ENDPROC | ||
1063 | #endif | ||
1064 | KPROBE_END(nmi) | ||
1065 | |||
1066 | KPROBE_ENTRY(int3) | ||
1067 | INTR_FRAME | ||
1068 | pushq $0 | ||
1069 | CFI_ADJUST_CFA_OFFSET 8 | ||
1070 | paranoidentry do_int3, DEBUG_STACK | ||
1071 | jmp paranoid_exit1 | ||
1072 | CFI_ENDPROC | ||
1073 | KPROBE_END(int3) | ||
1074 | |||
1075 | ENTRY(overflow) | ||
1076 | zeroentry do_overflow | ||
1077 | END(overflow) | ||
1078 | |||
1079 | ENTRY(bounds) | ||
1080 | zeroentry do_bounds | ||
1081 | END(bounds) | ||
1082 | |||
1083 | ENTRY(invalid_op) | ||
1084 | zeroentry do_invalid_op | ||
1085 | END(invalid_op) | ||
1086 | |||
1087 | ENTRY(coprocessor_segment_overrun) | ||
1088 | zeroentry do_coprocessor_segment_overrun | ||
1089 | END(coprocessor_segment_overrun) | ||
1090 | |||
1091 | ENTRY(reserved) | ||
1092 | zeroentry do_reserved | ||
1093 | END(reserved) | ||
1094 | |||
1095 | /* runs on exception stack */ | ||
1096 | ENTRY(double_fault) | ||
1097 | XCPT_FRAME | ||
1098 | paranoidentry do_double_fault | ||
1099 | jmp paranoid_exit1 | ||
1100 | CFI_ENDPROC | ||
1101 | END(double_fault) | ||
1102 | |||
1103 | ENTRY(invalid_TSS) | ||
1104 | errorentry do_invalid_TSS | ||
1105 | END(invalid_TSS) | ||
1106 | |||
1107 | ENTRY(segment_not_present) | ||
1108 | errorentry do_segment_not_present | ||
1109 | END(segment_not_present) | ||
1110 | |||
1111 | /* runs on exception stack */ | ||
1112 | ENTRY(stack_segment) | ||
1113 | XCPT_FRAME | ||
1114 | paranoidentry do_stack_segment | ||
1115 | jmp paranoid_exit1 | ||
1116 | CFI_ENDPROC | ||
1117 | END(stack_segment) | ||
1118 | |||
1119 | KPROBE_ENTRY(general_protection) | ||
1120 | errorentry do_general_protection | ||
1121 | KPROBE_END(general_protection) | ||
1122 | |||
1123 | ENTRY(alignment_check) | ||
1124 | errorentry do_alignment_check | ||
1125 | END(alignment_check) | ||
1126 | |||
1127 | ENTRY(divide_error) | ||
1128 | zeroentry do_divide_error | ||
1129 | END(divide_error) | ||
1130 | |||
1131 | ENTRY(spurious_interrupt_bug) | ||
1132 | zeroentry do_spurious_interrupt_bug | ||
1133 | END(spurious_interrupt_bug) | ||
1134 | |||
1135 | #ifdef CONFIG_X86_MCE | ||
1136 | /* runs on exception stack */ | ||
1137 | ENTRY(machine_check) | ||
1138 | INTR_FRAME | ||
1139 | pushq $0 | ||
1140 | CFI_ADJUST_CFA_OFFSET 8 | ||
1141 | paranoidentry do_machine_check | ||
1142 | jmp paranoid_exit1 | ||
1143 | CFI_ENDPROC | ||
1144 | END(machine_check) | ||
1145 | #endif | ||
1146 | |||
1147 | /* Call softirq on interrupt stack. Interrupts are off. */ | ||
1148 | ENTRY(call_softirq) | ||
1149 | CFI_STARTPROC | ||
1150 | push %rbp | ||
1151 | CFI_ADJUST_CFA_OFFSET 8 | ||
1152 | CFI_REL_OFFSET rbp,0 | ||
1153 | mov %rsp,%rbp | ||
1154 | CFI_DEF_CFA_REGISTER rbp | ||
1155 | incl %gs:pda_irqcount | ||
1156 | cmove %gs:pda_irqstackptr,%rsp | ||
1157 | push %rbp # backlink for old unwinder | ||
1158 | call __do_softirq | ||
1159 | leaveq | ||
1160 | CFI_DEF_CFA_REGISTER rsp | ||
1161 | CFI_ADJUST_CFA_OFFSET -8 | ||
1162 | decl %gs:pda_irqcount | ||
1163 | ret | ||
1164 | CFI_ENDPROC | ||
1165 | ENDPROC(call_softirq) | ||
1166 | |||
1167 | KPROBE_ENTRY(ignore_sysret) | ||
1168 | CFI_STARTPROC | ||
1169 | mov $-ENOSYS,%eax | ||
1170 | sysret | ||
1171 | CFI_ENDPROC | ||
1172 | ENDPROC(ignore_sysret) | ||
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c deleted file mode 100644 index 47496a40e84f..000000000000 --- a/arch/x86_64/kernel/genapic.c +++ /dev/null | |||
@@ -1,66 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Generic APIC sub-arch probe layer. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/threads.h> | ||
12 | #include <linux/cpumask.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | |||
19 | #include <asm/smp.h> | ||
20 | #include <asm/ipi.h> | ||
21 | #include <asm/genapic.h> | ||
22 | |||
23 | #ifdef CONFIG_ACPI | ||
24 | #include <acpi/acpi_bus.h> | ||
25 | #endif | ||
26 | |||
27 | /* which logical CPU number maps to which CPU (physical APIC ID) */ | ||
28 | u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly | ||
29 | = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
30 | EXPORT_SYMBOL(x86_cpu_to_apicid); | ||
31 | |||
32 | u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
33 | |||
34 | struct genapic __read_mostly *genapic = &apic_flat; | ||
35 | |||
36 | /* | ||
37 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | ||
38 | */ | ||
39 | void __init setup_apic_routing(void) | ||
40 | { | ||
41 | #ifdef CONFIG_ACPI | ||
42 | /* | ||
43 | * Quirk: some x86_64 machines can only use physical APIC mode | ||
44 | * regardless of how many processors are present (x86_64 ES7000 | ||
45 | * is an example). | ||
46 | */ | ||
47 | if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && | ||
48 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) | ||
49 | genapic = &apic_physflat; | ||
50 | else | ||
51 | #endif | ||
52 | |||
53 | if (cpus_weight(cpu_possible_map) <= 8) | ||
54 | genapic = &apic_flat; | ||
55 | else | ||
56 | genapic = &apic_physflat; | ||
57 | |||
58 | printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); | ||
59 | } | ||
60 | |||
61 | /* Same for both flat and physical. */ | ||
62 | |||
63 | void send_IPI_self(int vector) | ||
64 | { | ||
65 | __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); | ||
66 | } | ||
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c deleted file mode 100644 index ecb01eefdd27..000000000000 --- a/arch/x86_64/kernel/genapic_flat.c +++ /dev/null | |||
@@ -1,194 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Flat APIC subarch code. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/threads.h> | ||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <asm/smp.h> | ||
19 | #include <asm/ipi.h> | ||
20 | #include <asm/genapic.h> | ||
21 | |||
22 | static cpumask_t flat_target_cpus(void) | ||
23 | { | ||
24 | return cpu_online_map; | ||
25 | } | ||
26 | |||
27 | static cpumask_t flat_vector_allocation_domain(int cpu) | ||
28 | { | ||
29 | /* Careful. Some cpus do not strictly honor the set of cpus | ||
30 | * specified in the interrupt destination when using lowest | ||
31 | * priority interrupt delivery mode. | ||
32 | * | ||
33 | * In particular there was a hyperthreading cpu observed to | ||
34 | * deliver interrupts to the wrong hyperthread when only one | ||
35 | * hyperthread was specified in the interrupt desitination. | ||
36 | */ | ||
37 | cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; | ||
38 | return domain; | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Set up the logical destination ID. | ||
43 | * | ||
44 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
45 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
46 | * document number 292116). So here it goes... | ||
47 | */ | ||
48 | static void flat_init_apic_ldr(void) | ||
49 | { | ||
50 | unsigned long val; | ||
51 | unsigned long num, id; | ||
52 | |||
53 | num = smp_processor_id(); | ||
54 | id = 1UL << num; | ||
55 | x86_cpu_to_log_apicid[num] = id; | ||
56 | apic_write(APIC_DFR, APIC_DFR_FLAT); | ||
57 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; | ||
58 | val |= SET_APIC_LOGICAL_ID(id); | ||
59 | apic_write(APIC_LDR, val); | ||
60 | } | ||
61 | |||
62 | static void flat_send_IPI_mask(cpumask_t cpumask, int vector) | ||
63 | { | ||
64 | unsigned long mask = cpus_addr(cpumask)[0]; | ||
65 | unsigned long flags; | ||
66 | |||
67 | local_irq_save(flags); | ||
68 | __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); | ||
69 | local_irq_restore(flags); | ||
70 | } | ||
71 | |||
72 | static void flat_send_IPI_allbutself(int vector) | ||
73 | { | ||
74 | #ifdef CONFIG_HOTPLUG_CPU | ||
75 | int hotplug = 1; | ||
76 | #else | ||
77 | int hotplug = 0; | ||
78 | #endif | ||
79 | if (hotplug || vector == NMI_VECTOR) { | ||
80 | cpumask_t allbutme = cpu_online_map; | ||
81 | |||
82 | cpu_clear(smp_processor_id(), allbutme); | ||
83 | |||
84 | if (!cpus_empty(allbutme)) | ||
85 | flat_send_IPI_mask(allbutme, vector); | ||
86 | } else if (num_online_cpus() > 1) { | ||
87 | __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); | ||
88 | } | ||
89 | } | ||
90 | |||
91 | static void flat_send_IPI_all(int vector) | ||
92 | { | ||
93 | if (vector == NMI_VECTOR) | ||
94 | flat_send_IPI_mask(cpu_online_map, vector); | ||
95 | else | ||
96 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | ||
97 | } | ||
98 | |||
99 | static int flat_apic_id_registered(void) | ||
100 | { | ||
101 | return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); | ||
102 | } | ||
103 | |||
104 | static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) | ||
105 | { | ||
106 | return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; | ||
107 | } | ||
108 | |||
109 | static unsigned int phys_pkg_id(int index_msb) | ||
110 | { | ||
111 | return hard_smp_processor_id() >> index_msb; | ||
112 | } | ||
113 | |||
114 | struct genapic apic_flat = { | ||
115 | .name = "flat", | ||
116 | .int_delivery_mode = dest_LowestPrio, | ||
117 | .int_dest_mode = (APIC_DEST_LOGICAL != 0), | ||
118 | .target_cpus = flat_target_cpus, | ||
119 | .vector_allocation_domain = flat_vector_allocation_domain, | ||
120 | .apic_id_registered = flat_apic_id_registered, | ||
121 | .init_apic_ldr = flat_init_apic_ldr, | ||
122 | .send_IPI_all = flat_send_IPI_all, | ||
123 | .send_IPI_allbutself = flat_send_IPI_allbutself, | ||
124 | .send_IPI_mask = flat_send_IPI_mask, | ||
125 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, | ||
126 | .phys_pkg_id = phys_pkg_id, | ||
127 | }; | ||
128 | |||
129 | /* | ||
130 | * Physflat mode is used when there are more than 8 CPUs on a AMD system. | ||
131 | * We cannot use logical delivery in this case because the mask | ||
132 | * overflows, so use physical mode. | ||
133 | */ | ||
134 | |||
135 | static cpumask_t physflat_target_cpus(void) | ||
136 | { | ||
137 | return cpu_online_map; | ||
138 | } | ||
139 | |||
140 | static cpumask_t physflat_vector_allocation_domain(int cpu) | ||
141 | { | ||
142 | cpumask_t domain = CPU_MASK_NONE; | ||
143 | cpu_set(cpu, domain); | ||
144 | return domain; | ||
145 | } | ||
146 | |||
147 | |||
148 | static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) | ||
149 | { | ||
150 | send_IPI_mask_sequence(cpumask, vector); | ||
151 | } | ||
152 | |||
153 | static void physflat_send_IPI_allbutself(int vector) | ||
154 | { | ||
155 | cpumask_t allbutme = cpu_online_map; | ||
156 | |||
157 | cpu_clear(smp_processor_id(), allbutme); | ||
158 | physflat_send_IPI_mask(allbutme, vector); | ||
159 | } | ||
160 | |||
161 | static void physflat_send_IPI_all(int vector) | ||
162 | { | ||
163 | physflat_send_IPI_mask(cpu_online_map, vector); | ||
164 | } | ||
165 | |||
166 | static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) | ||
167 | { | ||
168 | int cpu; | ||
169 | |||
170 | /* | ||
171 | * We're using fixed IRQ delivery, can only return one phys APIC ID. | ||
172 | * May as well be the first. | ||
173 | */ | ||
174 | cpu = first_cpu(cpumask); | ||
175 | if ((unsigned)cpu < NR_CPUS) | ||
176 | return x86_cpu_to_apicid[cpu]; | ||
177 | else | ||
178 | return BAD_APICID; | ||
179 | } | ||
180 | |||
181 | struct genapic apic_physflat = { | ||
182 | .name = "physical flat", | ||
183 | .int_delivery_mode = dest_Fixed, | ||
184 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | ||
185 | .target_cpus = physflat_target_cpus, | ||
186 | .vector_allocation_domain = physflat_vector_allocation_domain, | ||
187 | .apic_id_registered = flat_apic_id_registered, | ||
188 | .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ | ||
189 | .send_IPI_all = physflat_send_IPI_all, | ||
190 | .send_IPI_allbutself = physflat_send_IPI_allbutself, | ||
191 | .send_IPI_mask = physflat_send_IPI_mask, | ||
192 | .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, | ||
193 | .phys_pkg_id = phys_pkg_id, | ||
194 | }; | ||
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S deleted file mode 100644 index b6167fe3330e..000000000000 --- a/arch/x86_64/kernel/head.S +++ /dev/null | |||
@@ -1,416 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> | ||
7 | * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> | ||
8 | * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> | ||
9 | */ | ||
10 | |||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | #include <linux/threads.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <asm/desc.h> | ||
16 | #include <asm/segment.h> | ||
17 | #include <asm/pgtable.h> | ||
18 | #include <asm/page.h> | ||
19 | #include <asm/msr.h> | ||
20 | #include <asm/cache.h> | ||
21 | |||
22 | /* we are not able to switch in one step to the final KERNEL ADRESS SPACE | ||
23 | * because we need identity-mapped pages. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | .text | ||
28 | .section .text.head | ||
29 | .code64 | ||
30 | .globl startup_64 | ||
31 | startup_64: | ||
32 | |||
33 | /* | ||
34 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, | ||
35 | * and someone has loaded an identity mapped page table | ||
36 | * for us. These identity mapped page tables map all of the | ||
37 | * kernel pages and possibly all of memory. | ||
38 | * | ||
39 | * %esi holds a physical pointer to real_mode_data. | ||
40 | * | ||
41 | * We come here either directly from a 64bit bootloader, or from | ||
42 | * arch/x86_64/boot/compressed/head.S. | ||
43 | * | ||
44 | * We only come here initially at boot nothing else comes here. | ||
45 | * | ||
46 | * Since we may be loaded at an address different from what we were | ||
47 | * compiled to run at we first fixup the physical addresses in our page | ||
48 | * tables and then reload them. | ||
49 | */ | ||
50 | |||
51 | /* Compute the delta between the address I am compiled to run at and the | ||
52 | * address I am actually running at. | ||
53 | */ | ||
54 | leaq _text(%rip), %rbp | ||
55 | subq $_text - __START_KERNEL_map, %rbp | ||
56 | |||
57 | /* Is the address not 2M aligned? */ | ||
58 | movq %rbp, %rax | ||
59 | andl $~LARGE_PAGE_MASK, %eax | ||
60 | testl %eax, %eax | ||
61 | jnz bad_address | ||
62 | |||
63 | /* Is the address too large? */ | ||
64 | leaq _text(%rip), %rdx | ||
65 | movq $PGDIR_SIZE, %rax | ||
66 | cmpq %rax, %rdx | ||
67 | jae bad_address | ||
68 | |||
69 | /* Fixup the physical addresses in the page table | ||
70 | */ | ||
71 | addq %rbp, init_level4_pgt + 0(%rip) | ||
72 | addq %rbp, init_level4_pgt + (258*8)(%rip) | ||
73 | addq %rbp, init_level4_pgt + (511*8)(%rip) | ||
74 | |||
75 | addq %rbp, level3_ident_pgt + 0(%rip) | ||
76 | |||
77 | addq %rbp, level3_kernel_pgt + (510*8)(%rip) | ||
78 | addq %rbp, level3_kernel_pgt + (511*8)(%rip) | ||
79 | |||
80 | addq %rbp, level2_fixmap_pgt + (506*8)(%rip) | ||
81 | |||
82 | /* Add an Identity mapping if I am above 1G */ | ||
83 | leaq _text(%rip), %rdi | ||
84 | andq $LARGE_PAGE_MASK, %rdi | ||
85 | |||
86 | movq %rdi, %rax | ||
87 | shrq $PUD_SHIFT, %rax | ||
88 | andq $(PTRS_PER_PUD - 1), %rax | ||
89 | jz ident_complete | ||
90 | |||
91 | leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx | ||
92 | leaq level3_ident_pgt(%rip), %rbx | ||
93 | movq %rdx, 0(%rbx, %rax, 8) | ||
94 | |||
95 | movq %rdi, %rax | ||
96 | shrq $PMD_SHIFT, %rax | ||
97 | andq $(PTRS_PER_PMD - 1), %rax | ||
98 | leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx | ||
99 | leaq level2_spare_pgt(%rip), %rbx | ||
100 | movq %rdx, 0(%rbx, %rax, 8) | ||
101 | ident_complete: | ||
102 | |||
103 | /* Fixup the kernel text+data virtual addresses | ||
104 | */ | ||
105 | leaq level2_kernel_pgt(%rip), %rdi | ||
106 | leaq 4096(%rdi), %r8 | ||
107 | /* See if it is a valid page table entry */ | ||
108 | 1: testq $1, 0(%rdi) | ||
109 | jz 2f | ||
110 | addq %rbp, 0(%rdi) | ||
111 | /* Go to the next page */ | ||
112 | 2: addq $8, %rdi | ||
113 | cmp %r8, %rdi | ||
114 | jne 1b | ||
115 | |||
116 | /* Fixup phys_base */ | ||
117 | addq %rbp, phys_base(%rip) | ||
118 | |||
119 | #ifdef CONFIG_SMP | ||
120 | addq %rbp, trampoline_level4_pgt + 0(%rip) | ||
121 | addq %rbp, trampoline_level4_pgt + (511*8)(%rip) | ||
122 | #endif | ||
123 | #ifdef CONFIG_ACPI_SLEEP | ||
124 | addq %rbp, wakeup_level4_pgt + 0(%rip) | ||
125 | addq %rbp, wakeup_level4_pgt + (511*8)(%rip) | ||
126 | #endif | ||
127 | |||
128 | /* Due to ENTRY(), sometimes the empty space gets filled with | ||
129 | * zeros. Better take a jmp than relying on empty space being | ||
130 | * filled with 0x90 (nop) | ||
131 | */ | ||
132 | jmp secondary_startup_64 | ||
133 | ENTRY(secondary_startup_64) | ||
134 | /* | ||
135 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, | ||
136 | * and someone has loaded a mapped page table. | ||
137 | * | ||
138 | * %esi holds a physical pointer to real_mode_data. | ||
139 | * | ||
140 | * We come here either from startup_64 (using physical addresses) | ||
141 | * or from trampoline.S (using virtual addresses). | ||
142 | * | ||
143 | * Using virtual addresses from trampoline.S removes the need | ||
144 | * to have any identity mapped pages in the kernel page table | ||
145 | * after the boot processor executes this code. | ||
146 | */ | ||
147 | |||
148 | /* Enable PAE mode and PGE */ | ||
149 | xorq %rax, %rax | ||
150 | btsq $5, %rax | ||
151 | btsq $7, %rax | ||
152 | movq %rax, %cr4 | ||
153 | |||
154 | /* Setup early boot stage 4 level pagetables. */ | ||
155 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | ||
156 | addq phys_base(%rip), %rax | ||
157 | movq %rax, %cr3 | ||
158 | |||
159 | /* Ensure I am executing from virtual addresses */ | ||
160 | movq $1f, %rax | ||
161 | jmp *%rax | ||
162 | 1: | ||
163 | |||
164 | /* Check if nx is implemented */ | ||
165 | movl $0x80000001, %eax | ||
166 | cpuid | ||
167 | movl %edx,%edi | ||
168 | |||
169 | /* Setup EFER (Extended Feature Enable Register) */ | ||
170 | movl $MSR_EFER, %ecx | ||
171 | rdmsr | ||
172 | btsl $_EFER_SCE, %eax /* Enable System Call */ | ||
173 | btl $20,%edi /* No Execute supported? */ | ||
174 | jnc 1f | ||
175 | btsl $_EFER_NX, %eax | ||
176 | 1: wrmsr /* Make changes effective */ | ||
177 | |||
178 | /* Setup cr0 */ | ||
179 | #define CR0_PM 1 /* protected mode */ | ||
180 | #define CR0_MP (1<<1) | ||
181 | #define CR0_ET (1<<4) | ||
182 | #define CR0_NE (1<<5) | ||
183 | #define CR0_WP (1<<16) | ||
184 | #define CR0_AM (1<<18) | ||
185 | #define CR0_PAGING (1<<31) | ||
186 | movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax | ||
187 | /* Make changes effective */ | ||
188 | movq %rax, %cr0 | ||
189 | |||
190 | /* Setup a boot time stack */ | ||
191 | movq init_rsp(%rip),%rsp | ||
192 | |||
193 | /* zero EFLAGS after setting rsp */ | ||
194 | pushq $0 | ||
195 | popfq | ||
196 | |||
197 | /* | ||
198 | * We must switch to a new descriptor in kernel space for the GDT | ||
199 | * because soon the kernel won't have access anymore to the userspace | ||
200 | * addresses where we're currently running on. We have to do that here | ||
201 | * because in 32bit we couldn't load a 64bit linear address. | ||
202 | */ | ||
203 | lgdt cpu_gdt_descr(%rip) | ||
204 | |||
205 | /* set up data segments. actually 0 would do too */ | ||
206 | movl $__KERNEL_DS,%eax | ||
207 | movl %eax,%ds | ||
208 | movl %eax,%ss | ||
209 | movl %eax,%es | ||
210 | |||
211 | /* | ||
212 | * We don't really need to load %fs or %gs, but load them anyway | ||
213 | * to kill any stale realmode selectors. This allows execution | ||
214 | * under VT hardware. | ||
215 | */ | ||
216 | movl %eax,%fs | ||
217 | movl %eax,%gs | ||
218 | |||
219 | /* | ||
220 | * Setup up a dummy PDA. this is just for some early bootup code | ||
221 | * that does in_interrupt() | ||
222 | */ | ||
223 | movl $MSR_GS_BASE,%ecx | ||
224 | movq $empty_zero_page,%rax | ||
225 | movq %rax,%rdx | ||
226 | shrq $32,%rdx | ||
227 | wrmsr | ||
228 | |||
229 | /* esi is pointer to real mode structure with interesting info. | ||
230 | pass it to C */ | ||
231 | movl %esi, %edi | ||
232 | |||
233 | /* Finally jump to run C code and to be on real kernel address | ||
234 | * Since we are running on identity-mapped space we have to jump | ||
235 | * to the full 64bit address, this is only possible as indirect | ||
236 | * jump. In addition we need to ensure %cs is set so we make this | ||
237 | * a far return. | ||
238 | */ | ||
239 | movq initial_code(%rip),%rax | ||
240 | pushq $0 # fake return address to stop unwinder | ||
241 | pushq $__KERNEL_CS # set correct cs | ||
242 | pushq %rax # target address in negative space | ||
243 | lretq | ||
244 | |||
245 | /* SMP bootup changes these two */ | ||
246 | #ifndef CONFIG_HOTPLUG_CPU | ||
247 | .pushsection .init.data | ||
248 | #endif | ||
249 | .align 8 | ||
250 | .globl initial_code | ||
251 | initial_code: | ||
252 | .quad x86_64_start_kernel | ||
253 | #ifndef CONFIG_HOTPLUG_CPU | ||
254 | .popsection | ||
255 | #endif | ||
256 | .globl init_rsp | ||
257 | init_rsp: | ||
258 | .quad init_thread_union+THREAD_SIZE-8 | ||
259 | |||
260 | bad_address: | ||
261 | jmp bad_address | ||
262 | |||
263 | ENTRY(early_idt_handler) | ||
264 | cmpl $2,early_recursion_flag(%rip) | ||
265 | jz 1f | ||
266 | incl early_recursion_flag(%rip) | ||
267 | xorl %eax,%eax | ||
268 | movq 8(%rsp),%rsi # get rip | ||
269 | movq (%rsp),%rdx | ||
270 | movq %cr2,%rcx | ||
271 | leaq early_idt_msg(%rip),%rdi | ||
272 | call early_printk | ||
273 | cmpl $2,early_recursion_flag(%rip) | ||
274 | jz 1f | ||
275 | call dump_stack | ||
276 | #ifdef CONFIG_KALLSYMS | ||
277 | leaq early_idt_ripmsg(%rip),%rdi | ||
278 | movq 8(%rsp),%rsi # get rip again | ||
279 | call __print_symbol | ||
280 | #endif | ||
281 | 1: hlt | ||
282 | jmp 1b | ||
283 | early_recursion_flag: | ||
284 | .long 0 | ||
285 | |||
286 | early_idt_msg: | ||
287 | .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" | ||
288 | early_idt_ripmsg: | ||
289 | .asciz "RIP %s\n" | ||
290 | |||
291 | .balign PAGE_SIZE | ||
292 | |||
293 | #define NEXT_PAGE(name) \ | ||
294 | .balign PAGE_SIZE; \ | ||
295 | ENTRY(name) | ||
296 | |||
297 | /* Automate the creation of 1 to 1 mapping pmd entries */ | ||
298 | #define PMDS(START, PERM, COUNT) \ | ||
299 | i = 0 ; \ | ||
300 | .rept (COUNT) ; \ | ||
301 | .quad (START) + (i << 21) + (PERM) ; \ | ||
302 | i = i + 1 ; \ | ||
303 | .endr | ||
304 | |||
305 | /* | ||
306 | * This default setting generates an ident mapping at address 0x100000 | ||
307 | * and a mapping for the kernel that precisely maps virtual address | ||
308 | * 0xffffffff80000000 to physical address 0x000000. (always using | ||
309 | * 2Mbyte large pages provided by PAE mode) | ||
310 | */ | ||
311 | NEXT_PAGE(init_level4_pgt) | ||
312 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
313 | .fill 257,8,0 | ||
314 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
315 | .fill 252,8,0 | ||
316 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | ||
317 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
318 | |||
319 | NEXT_PAGE(level3_ident_pgt) | ||
320 | .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
321 | .fill 511,8,0 | ||
322 | |||
323 | NEXT_PAGE(level3_kernel_pgt) | ||
324 | .fill 510,8,0 | ||
325 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ | ||
326 | .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
327 | .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
328 | |||
329 | NEXT_PAGE(level2_fixmap_pgt) | ||
330 | .fill 506,8,0 | ||
331 | .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE | ||
332 | /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ | ||
333 | .fill 5,8,0 | ||
334 | |||
335 | NEXT_PAGE(level1_fixmap_pgt) | ||
336 | .fill 512,8,0 | ||
337 | |||
338 | NEXT_PAGE(level2_ident_pgt) | ||
339 | /* Since I easily can, map the first 1G. | ||
340 | * Don't set NX because code runs from these pages. | ||
341 | */ | ||
342 | PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) | ||
343 | |||
344 | NEXT_PAGE(level2_kernel_pgt) | ||
345 | /* 40MB kernel mapping. The kernel code cannot be bigger than that. | ||
346 | When you change this change KERNEL_TEXT_SIZE in page.h too. */ | ||
347 | /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ | ||
348 | PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) | ||
349 | /* Module mapping starts here */ | ||
350 | .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 | ||
351 | |||
352 | NEXT_PAGE(level2_spare_pgt) | ||
353 | .fill 512,8,0 | ||
354 | |||
355 | #undef PMDS | ||
356 | #undef NEXT_PAGE | ||
357 | |||
358 | .data | ||
359 | .align 16 | ||
360 | .globl cpu_gdt_descr | ||
361 | cpu_gdt_descr: | ||
362 | .word gdt_end-cpu_gdt_table-1 | ||
363 | gdt: | ||
364 | .quad cpu_gdt_table | ||
365 | #ifdef CONFIG_SMP | ||
366 | .rept NR_CPUS-1 | ||
367 | .word 0 | ||
368 | .quad 0 | ||
369 | .endr | ||
370 | #endif | ||
371 | |||
372 | ENTRY(phys_base) | ||
373 | /* This must match the first entry in level2_kernel_pgt */ | ||
374 | .quad 0x0000000000000000 | ||
375 | |||
376 | /* We need valid kernel segments for data and code in long mode too | ||
377 | * IRET will check the segment types kkeil 2000/10/28 | ||
378 | * Also sysret mandates a special GDT layout | ||
379 | */ | ||
380 | |||
381 | .section .data.page_aligned, "aw" | ||
382 | .align PAGE_SIZE | ||
383 | |||
384 | /* The TLS descriptors are currently at a different place compared to i386. | ||
385 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
386 | |||
387 | ENTRY(cpu_gdt_table) | ||
388 | .quad 0x0000000000000000 /* NULL descriptor */ | ||
389 | .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ | ||
390 | .quad 0x00af9b000000ffff /* __KERNEL_CS */ | ||
391 | .quad 0x00cf93000000ffff /* __KERNEL_DS */ | ||
392 | .quad 0x00cffb000000ffff /* __USER32_CS */ | ||
393 | .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ | ||
394 | .quad 0x00affb000000ffff /* __USER_CS */ | ||
395 | .quad 0x0 /* unused */ | ||
396 | .quad 0,0 /* TSS */ | ||
397 | .quad 0,0 /* LDT */ | ||
398 | .quad 0,0,0 /* three TLS descriptors */ | ||
399 | .quad 0x0000f40000000000 /* node/CPU stored in limit */ | ||
400 | gdt_end: | ||
401 | /* asm/segment.h:GDT_ENTRIES must match this */ | ||
402 | /* This should be a multiple of the cache line size */ | ||
403 | /* GDTs of other CPUs are now dynamically allocated */ | ||
404 | |||
405 | /* zero the remaining page */ | ||
406 | .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 | ||
407 | |||
408 | .section .bss, "aw", @nobits | ||
409 | .align L1_CACHE_BYTES | ||
410 | ENTRY(idt_table) | ||
411 | .skip 256 * 16 | ||
412 | |||
413 | .section .bss.page_aligned, "aw", @nobits | ||
414 | .align PAGE_SIZE | ||
415 | ENTRY(empty_zero_page) | ||
416 | .skip PAGE_SIZE | ||
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c deleted file mode 100644 index 6c34bdd22e26..000000000000 --- a/arch/x86_64/kernel/head64.c +++ /dev/null | |||
@@ -1,86 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head64.c -- prepare to run common code | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/linkage.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/percpu.h> | ||
13 | |||
14 | #include <asm/processor.h> | ||
15 | #include <asm/proto.h> | ||
16 | #include <asm/smp.h> | ||
17 | #include <asm/bootsetup.h> | ||
18 | #include <asm/setup.h> | ||
19 | #include <asm/desc.h> | ||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/tlbflush.h> | ||
22 | #include <asm/sections.h> | ||
23 | |||
24 | static void __init zap_identity_mappings(void) | ||
25 | { | ||
26 | pgd_t *pgd = pgd_offset_k(0UL); | ||
27 | pgd_clear(pgd); | ||
28 | __flush_tlb(); | ||
29 | } | ||
30 | |||
31 | /* Don't add a printk in there. printk relies on the PDA which is not initialized | ||
32 | yet. */ | ||
33 | static void __init clear_bss(void) | ||
34 | { | ||
35 | memset(__bss_start, 0, | ||
36 | (unsigned long) __bss_stop - (unsigned long) __bss_start); | ||
37 | } | ||
38 | |||
39 | #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ | ||
40 | #define OLD_CL_MAGIC_ADDR 0x20 | ||
41 | #define OLD_CL_MAGIC 0xA33F | ||
42 | #define OLD_CL_OFFSET 0x22 | ||
43 | |||
44 | static void __init copy_bootdata(char *real_mode_data) | ||
45 | { | ||
46 | unsigned long new_data; | ||
47 | char * command_line; | ||
48 | |||
49 | memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); | ||
50 | new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); | ||
51 | if (!new_data) { | ||
52 | if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { | ||
53 | return; | ||
54 | } | ||
55 | new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); | ||
56 | } | ||
57 | command_line = __va(new_data); | ||
58 | memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); | ||
59 | } | ||
60 | |||
61 | void __init x86_64_start_kernel(char * real_mode_data) | ||
62 | { | ||
63 | int i; | ||
64 | |||
65 | /* clear bss before set_intr_gate with early_idt_handler */ | ||
66 | clear_bss(); | ||
67 | |||
68 | /* Make NULL pointers segfault */ | ||
69 | zap_identity_mappings(); | ||
70 | |||
71 | for (i = 0; i < IDT_ENTRIES; i++) | ||
72 | set_intr_gate(i, early_idt_handler); | ||
73 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
74 | |||
75 | early_printk("Kernel alive\n"); | ||
76 | |||
77 | for (i = 0; i < NR_CPUS; i++) | ||
78 | cpu_pda(i) = &boot_cpu_pda[i]; | ||
79 | |||
80 | pda_init(0); | ||
81 | copy_bootdata(__va(real_mode_data)); | ||
82 | #ifdef CONFIG_SMP | ||
83 | cpu_set(0, cpu_online_map); | ||
84 | #endif | ||
85 | start_kernel(); | ||
86 | } | ||
diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c deleted file mode 100644 index e2d1b912e154..000000000000 --- a/arch/x86_64/kernel/hpet.c +++ /dev/null | |||
@@ -1,493 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/mc146818rtc.h> | ||
5 | #include <linux/time.h> | ||
6 | #include <linux/clocksource.h> | ||
7 | #include <linux/ioport.h> | ||
8 | #include <linux/acpi.h> | ||
9 | #include <linux/hpet.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/vsyscall.h> | ||
12 | #include <asm/timex.h> | ||
13 | #include <asm/hpet.h> | ||
14 | |||
15 | #define HPET_MASK 0xFFFFFFFF | ||
16 | #define HPET_SHIFT 22 | ||
17 | |||
18 | /* FSEC = 10^-15 NSEC = 10^-9 */ | ||
19 | #define FSEC_PER_NSEC 1000000 | ||
20 | |||
21 | int nohpet __initdata; | ||
22 | |||
23 | unsigned long hpet_address; | ||
24 | unsigned long hpet_period; /* fsecs / HPET clock */ | ||
25 | unsigned long hpet_tick; /* HPET clocks / interrupt */ | ||
26 | |||
27 | int hpet_use_timer; /* Use counter of hpet for time keeping, | ||
28 | * otherwise PIT | ||
29 | */ | ||
30 | |||
31 | #ifdef CONFIG_HPET | ||
32 | static __init int late_hpet_init(void) | ||
33 | { | ||
34 | struct hpet_data hd; | ||
35 | unsigned int ntimer; | ||
36 | |||
37 | if (!hpet_address) | ||
38 | return 0; | ||
39 | |||
40 | memset(&hd, 0, sizeof(hd)); | ||
41 | |||
42 | ntimer = hpet_readl(HPET_ID); | ||
43 | ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; | ||
44 | ntimer++; | ||
45 | |||
46 | /* | ||
47 | * Register with driver. | ||
48 | * Timer0 and Timer1 is used by platform. | ||
49 | */ | ||
50 | hd.hd_phys_address = hpet_address; | ||
51 | hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); | ||
52 | hd.hd_nirqs = ntimer; | ||
53 | hd.hd_flags = HPET_DATA_PLATFORM; | ||
54 | hpet_reserve_timer(&hd, 0); | ||
55 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
56 | hpet_reserve_timer(&hd, 1); | ||
57 | #endif | ||
58 | hd.hd_irq[0] = HPET_LEGACY_8254; | ||
59 | hd.hd_irq[1] = HPET_LEGACY_RTC; | ||
60 | if (ntimer > 2) { | ||
61 | struct hpet *hpet; | ||
62 | struct hpet_timer *timer; | ||
63 | int i; | ||
64 | |||
65 | hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); | ||
66 | timer = &hpet->hpet_timers[2]; | ||
67 | for (i = 2; i < ntimer; timer++, i++) | ||
68 | hd.hd_irq[i] = (timer->hpet_config & | ||
69 | Tn_INT_ROUTE_CNF_MASK) >> | ||
70 | Tn_INT_ROUTE_CNF_SHIFT; | ||
71 | |||
72 | } | ||
73 | |||
74 | hpet_alloc(&hd); | ||
75 | return 0; | ||
76 | } | ||
77 | fs_initcall(late_hpet_init); | ||
78 | #endif | ||
79 | |||
80 | int hpet_timer_stop_set_go(unsigned long tick) | ||
81 | { | ||
82 | unsigned int cfg; | ||
83 | |||
84 | /* | ||
85 | * Stop the timers and reset the main counter. | ||
86 | */ | ||
87 | |||
88 | cfg = hpet_readl(HPET_CFG); | ||
89 | cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); | ||
90 | hpet_writel(cfg, HPET_CFG); | ||
91 | hpet_writel(0, HPET_COUNTER); | ||
92 | hpet_writel(0, HPET_COUNTER + 4); | ||
93 | |||
94 | /* | ||
95 | * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, | ||
96 | * and period also hpet_tick. | ||
97 | */ | ||
98 | if (hpet_use_timer) { | ||
99 | hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | | ||
100 | HPET_TN_32BIT, HPET_T0_CFG); | ||
101 | hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ | ||
102 | hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ | ||
103 | cfg |= HPET_CFG_LEGACY; | ||
104 | } | ||
105 | /* | ||
106 | * Go! | ||
107 | */ | ||
108 | |||
109 | cfg |= HPET_CFG_ENABLE; | ||
110 | hpet_writel(cfg, HPET_CFG); | ||
111 | |||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static cycle_t read_hpet(void) | ||
116 | { | ||
117 | return (cycle_t)hpet_readl(HPET_COUNTER); | ||
118 | } | ||
119 | |||
120 | static cycle_t __vsyscall_fn vread_hpet(void) | ||
121 | { | ||
122 | return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); | ||
123 | } | ||
124 | |||
125 | struct clocksource clocksource_hpet = { | ||
126 | .name = "hpet", | ||
127 | .rating = 250, | ||
128 | .read = read_hpet, | ||
129 | .mask = (cycle_t)HPET_MASK, | ||
130 | .mult = 0, /* set below */ | ||
131 | .shift = HPET_SHIFT, | ||
132 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
133 | .vread = vread_hpet, | ||
134 | }; | ||
135 | |||
136 | int __init hpet_arch_init(void) | ||
137 | { | ||
138 | unsigned int id; | ||
139 | u64 tmp; | ||
140 | |||
141 | if (!hpet_address) | ||
142 | return -1; | ||
143 | set_fixmap_nocache(FIX_HPET_BASE, hpet_address); | ||
144 | __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); | ||
145 | |||
146 | /* | ||
147 | * Read the period, compute tick and quotient. | ||
148 | */ | ||
149 | |||
150 | id = hpet_readl(HPET_ID); | ||
151 | |||
152 | if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) | ||
153 | return -1; | ||
154 | |||
155 | hpet_period = hpet_readl(HPET_PERIOD); | ||
156 | if (hpet_period < 100000 || hpet_period > 100000000) | ||
157 | return -1; | ||
158 | |||
159 | hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; | ||
160 | |||
161 | hpet_use_timer = (id & HPET_ID_LEGSUP); | ||
162 | |||
163 | /* | ||
164 | * hpet period is in femto seconds per cycle | ||
165 | * so we need to convert this to ns/cyc units | ||
166 | * aproximated by mult/2^shift | ||
167 | * | ||
168 | * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift | ||
169 | * fsec/cyc * 1ns/1000000fsec * 2^shift = mult | ||
170 | * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult | ||
171 | * (fsec/cyc << shift)/1000000 = mult | ||
172 | * (hpet_period << shift)/FSEC_PER_NSEC = mult | ||
173 | */ | ||
174 | tmp = (u64)hpet_period << HPET_SHIFT; | ||
175 | do_div(tmp, FSEC_PER_NSEC); | ||
176 | clocksource_hpet.mult = (u32)tmp; | ||
177 | clocksource_register(&clocksource_hpet); | ||
178 | |||
179 | return hpet_timer_stop_set_go(hpet_tick); | ||
180 | } | ||
181 | |||
182 | int hpet_reenable(void) | ||
183 | { | ||
184 | return hpet_timer_stop_set_go(hpet_tick); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing | ||
189 | * it to the HPET timer of known frequency. | ||
190 | */ | ||
191 | |||
192 | #define TICK_COUNT 100000000 | ||
193 | #define SMI_THRESHOLD 50000 | ||
194 | #define MAX_TRIES 5 | ||
195 | |||
196 | /* | ||
197 | * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none | ||
198 | * occurs between the reads of the hpet & TSC. | ||
199 | */ | ||
200 | static void __init read_hpet_tsc(int *hpet, int *tsc) | ||
201 | { | ||
202 | int tsc1, tsc2, hpet1, i; | ||
203 | |||
204 | for (i = 0; i < MAX_TRIES; i++) { | ||
205 | tsc1 = get_cycles_sync(); | ||
206 | hpet1 = hpet_readl(HPET_COUNTER); | ||
207 | tsc2 = get_cycles_sync(); | ||
208 | if ((tsc2 - tsc1) < SMI_THRESHOLD) | ||
209 | break; | ||
210 | } | ||
211 | *hpet = hpet1; | ||
212 | *tsc = tsc2; | ||
213 | } | ||
214 | |||
215 | unsigned int __init hpet_calibrate_tsc(void) | ||
216 | { | ||
217 | int tsc_start, hpet_start; | ||
218 | int tsc_now, hpet_now; | ||
219 | unsigned long flags; | ||
220 | |||
221 | local_irq_save(flags); | ||
222 | |||
223 | read_hpet_tsc(&hpet_start, &tsc_start); | ||
224 | |||
225 | do { | ||
226 | local_irq_disable(); | ||
227 | read_hpet_tsc(&hpet_now, &tsc_now); | ||
228 | local_irq_restore(flags); | ||
229 | } while ((tsc_now - tsc_start) < TICK_COUNT && | ||
230 | (hpet_now - hpet_start) < TICK_COUNT); | ||
231 | |||
232 | return (tsc_now - tsc_start) * 1000000000L | ||
233 | / ((hpet_now - hpet_start) * hpet_period / 1000); | ||
234 | } | ||
235 | |||
236 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
237 | /* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET | ||
238 | * is enabled, we support RTC interrupt functionality in software. | ||
239 | * RTC has 3 kinds of interrupts: | ||
240 | * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock | ||
241 | * is updated | ||
242 | * 2) Alarm Interrupt - generate an interrupt at a specific time of day | ||
243 | * 3) Periodic Interrupt - generate periodic interrupt, with frequencies | ||
244 | * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) | ||
245 | * (1) and (2) above are implemented using polling at a frequency of | ||
246 | * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt | ||
247 | * overhead. (DEFAULT_RTC_INT_FREQ) | ||
248 | * For (3), we use interrupts at 64Hz or user specified periodic | ||
249 | * frequency, whichever is higher. | ||
250 | */ | ||
251 | #include <linux/rtc.h> | ||
252 | |||
253 | #define DEFAULT_RTC_INT_FREQ 64 | ||
254 | #define RTC_NUM_INTS 1 | ||
255 | |||
256 | static unsigned long UIE_on; | ||
257 | static unsigned long prev_update_sec; | ||
258 | |||
259 | static unsigned long AIE_on; | ||
260 | static struct rtc_time alarm_time; | ||
261 | |||
262 | static unsigned long PIE_on; | ||
263 | static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; | ||
264 | static unsigned long PIE_count; | ||
265 | |||
266 | static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ | ||
267 | static unsigned int hpet_t1_cmp; /* cached comparator register */ | ||
268 | |||
269 | int is_hpet_enabled(void) | ||
270 | { | ||
271 | return hpet_address != 0; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * Timer 1 for RTC, we do not use periodic interrupt feature, | ||
276 | * even if HPET supports periodic interrupts on Timer 1. | ||
277 | * The reason being, to set up a periodic interrupt in HPET, we need to | ||
278 | * stop the main counter. And if we do that everytime someone diables/enables | ||
279 | * RTC, we will have adverse effect on main kernel timer running on Timer 0. | ||
280 | * So, for the time being, simulate the periodic interrupt in software. | ||
281 | * | ||
282 | * hpet_rtc_timer_init() is called for the first time and during subsequent | ||
283 | * interuppts reinit happens through hpet_rtc_timer_reinit(). | ||
284 | */ | ||
285 | int hpet_rtc_timer_init(void) | ||
286 | { | ||
287 | unsigned int cfg, cnt; | ||
288 | unsigned long flags; | ||
289 | |||
290 | if (!is_hpet_enabled()) | ||
291 | return 0; | ||
292 | /* | ||
293 | * Set the counter 1 and enable the interrupts. | ||
294 | */ | ||
295 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
296 | hpet_rtc_int_freq = PIE_freq; | ||
297 | else | ||
298 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
299 | |||
300 | local_irq_save(flags); | ||
301 | |||
302 | cnt = hpet_readl(HPET_COUNTER); | ||
303 | cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); | ||
304 | hpet_writel(cnt, HPET_T1_CMP); | ||
305 | hpet_t1_cmp = cnt; | ||
306 | |||
307 | cfg = hpet_readl(HPET_T1_CFG); | ||
308 | cfg &= ~HPET_TN_PERIODIC; | ||
309 | cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; | ||
310 | hpet_writel(cfg, HPET_T1_CFG); | ||
311 | |||
312 | local_irq_restore(flags); | ||
313 | |||
314 | return 1; | ||
315 | } | ||
316 | |||
317 | static void hpet_rtc_timer_reinit(void) | ||
318 | { | ||
319 | unsigned int cfg, cnt, ticks_per_int, lost_ints; | ||
320 | |||
321 | if (unlikely(!(PIE_on | AIE_on | UIE_on))) { | ||
322 | cfg = hpet_readl(HPET_T1_CFG); | ||
323 | cfg &= ~HPET_TN_ENABLE; | ||
324 | hpet_writel(cfg, HPET_T1_CFG); | ||
325 | return; | ||
326 | } | ||
327 | |||
328 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
329 | hpet_rtc_int_freq = PIE_freq; | ||
330 | else | ||
331 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
332 | |||
333 | /* It is more accurate to use the comparator value than current count.*/ | ||
334 | ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; | ||
335 | hpet_t1_cmp += ticks_per_int; | ||
336 | hpet_writel(hpet_t1_cmp, HPET_T1_CMP); | ||
337 | |||
338 | /* | ||
339 | * If the interrupt handler was delayed too long, the write above tries | ||
340 | * to schedule the next interrupt in the past and the hardware would | ||
341 | * not interrupt until the counter had wrapped around. | ||
342 | * So we have to check that the comparator wasn't set to a past time. | ||
343 | */ | ||
344 | cnt = hpet_readl(HPET_COUNTER); | ||
345 | if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { | ||
346 | lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; | ||
347 | /* Make sure that, even with the time needed to execute | ||
348 | * this code, the next scheduled interrupt has been moved | ||
349 | * back to the future: */ | ||
350 | lost_ints++; | ||
351 | |||
352 | hpet_t1_cmp += lost_ints * ticks_per_int; | ||
353 | hpet_writel(hpet_t1_cmp, HPET_T1_CMP); | ||
354 | |||
355 | if (PIE_on) | ||
356 | PIE_count += lost_ints; | ||
357 | |||
358 | if (printk_ratelimit()) | ||
359 | printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", | ||
360 | hpet_rtc_int_freq); | ||
361 | } | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * The functions below are called from rtc driver. | ||
366 | * Return 0 if HPET is not being used. | ||
367 | * Otherwise do the necessary changes and return 1. | ||
368 | */ | ||
369 | int hpet_mask_rtc_irq_bit(unsigned long bit_mask) | ||
370 | { | ||
371 | if (!is_hpet_enabled()) | ||
372 | return 0; | ||
373 | |||
374 | if (bit_mask & RTC_UIE) | ||
375 | UIE_on = 0; | ||
376 | if (bit_mask & RTC_PIE) | ||
377 | PIE_on = 0; | ||
378 | if (bit_mask & RTC_AIE) | ||
379 | AIE_on = 0; | ||
380 | |||
381 | return 1; | ||
382 | } | ||
383 | |||
384 | int hpet_set_rtc_irq_bit(unsigned long bit_mask) | ||
385 | { | ||
386 | int timer_init_reqd = 0; | ||
387 | |||
388 | if (!is_hpet_enabled()) | ||
389 | return 0; | ||
390 | |||
391 | if (!(PIE_on | AIE_on | UIE_on)) | ||
392 | timer_init_reqd = 1; | ||
393 | |||
394 | if (bit_mask & RTC_UIE) { | ||
395 | UIE_on = 1; | ||
396 | } | ||
397 | if (bit_mask & RTC_PIE) { | ||
398 | PIE_on = 1; | ||
399 | PIE_count = 0; | ||
400 | } | ||
401 | if (bit_mask & RTC_AIE) { | ||
402 | AIE_on = 1; | ||
403 | } | ||
404 | |||
405 | if (timer_init_reqd) | ||
406 | hpet_rtc_timer_init(); | ||
407 | |||
408 | return 1; | ||
409 | } | ||
410 | |||
411 | int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) | ||
412 | { | ||
413 | if (!is_hpet_enabled()) | ||
414 | return 0; | ||
415 | |||
416 | alarm_time.tm_hour = hrs; | ||
417 | alarm_time.tm_min = min; | ||
418 | alarm_time.tm_sec = sec; | ||
419 | |||
420 | return 1; | ||
421 | } | ||
422 | |||
423 | int hpet_set_periodic_freq(unsigned long freq) | ||
424 | { | ||
425 | if (!is_hpet_enabled()) | ||
426 | return 0; | ||
427 | |||
428 | PIE_freq = freq; | ||
429 | PIE_count = 0; | ||
430 | |||
431 | return 1; | ||
432 | } | ||
433 | |||
434 | int hpet_rtc_dropped_irq(void) | ||
435 | { | ||
436 | if (!is_hpet_enabled()) | ||
437 | return 0; | ||
438 | |||
439 | return 1; | ||
440 | } | ||
441 | |||
442 | irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) | ||
443 | { | ||
444 | struct rtc_time curr_time; | ||
445 | unsigned long rtc_int_flag = 0; | ||
446 | int call_rtc_interrupt = 0; | ||
447 | |||
448 | hpet_rtc_timer_reinit(); | ||
449 | |||
450 | if (UIE_on | AIE_on) { | ||
451 | rtc_get_rtc_time(&curr_time); | ||
452 | } | ||
453 | if (UIE_on) { | ||
454 | if (curr_time.tm_sec != prev_update_sec) { | ||
455 | /* Set update int info, call real rtc int routine */ | ||
456 | call_rtc_interrupt = 1; | ||
457 | rtc_int_flag = RTC_UF; | ||
458 | prev_update_sec = curr_time.tm_sec; | ||
459 | } | ||
460 | } | ||
461 | if (PIE_on) { | ||
462 | PIE_count++; | ||
463 | if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { | ||
464 | /* Set periodic int info, call real rtc int routine */ | ||
465 | call_rtc_interrupt = 1; | ||
466 | rtc_int_flag |= RTC_PF; | ||
467 | PIE_count = 0; | ||
468 | } | ||
469 | } | ||
470 | if (AIE_on) { | ||
471 | if ((curr_time.tm_sec == alarm_time.tm_sec) && | ||
472 | (curr_time.tm_min == alarm_time.tm_min) && | ||
473 | (curr_time.tm_hour == alarm_time.tm_hour)) { | ||
474 | /* Set alarm int info, call real rtc int routine */ | ||
475 | call_rtc_interrupt = 1; | ||
476 | rtc_int_flag |= RTC_AF; | ||
477 | } | ||
478 | } | ||
479 | if (call_rtc_interrupt) { | ||
480 | rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); | ||
481 | rtc_interrupt(rtc_int_flag, dev_id); | ||
482 | } | ||
483 | return IRQ_HANDLED; | ||
484 | } | ||
485 | #endif | ||
486 | |||
487 | static int __init nohpet_setup(char *s) | ||
488 | { | ||
489 | nohpet = 1; | ||
490 | return 1; | ||
491 | } | ||
492 | |||
493 | __setup("nohpet", nohpet_setup); | ||
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c deleted file mode 100644 index 1d58c13bc6bc..000000000000 --- a/arch/x86_64/kernel/i387.c +++ /dev/null | |||
@@ -1,151 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/i387.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * Copyright (C) 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * General FPU state handling cleanups | ||
9 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
10 | * | ||
11 | * x86-64 rework 2002 Andi Kleen. | ||
12 | * Does direct fxsave in and out of user space now for signal handlers. | ||
13 | * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, | ||
14 | * the 64bit user space sees a FXSAVE frame directly. | ||
15 | */ | ||
16 | |||
17 | #include <linux/sched.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <asm/processor.h> | ||
20 | #include <asm/i387.h> | ||
21 | #include <asm/sigcontext.h> | ||
22 | #include <asm/user.h> | ||
23 | #include <asm/ptrace.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | |||
26 | unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff; | ||
27 | |||
28 | void mxcsr_feature_mask_init(void) | ||
29 | { | ||
30 | unsigned int mask; | ||
31 | clts(); | ||
32 | memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
33 | asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); | ||
34 | mask = current->thread.i387.fxsave.mxcsr_mask; | ||
35 | if (mask == 0) mask = 0x0000ffbf; | ||
36 | mxcsr_feature_mask &= mask; | ||
37 | stts(); | ||
38 | } | ||
39 | |||
40 | /* | ||
41 | * Called at bootup to set up the initial FPU state that is later cloned | ||
42 | * into all processes. | ||
43 | */ | ||
44 | void __cpuinit fpu_init(void) | ||
45 | { | ||
46 | unsigned long oldcr0 = read_cr0(); | ||
47 | extern void __bad_fxsave_alignment(void); | ||
48 | |||
49 | if (offsetof(struct task_struct, thread.i387.fxsave) & 15) | ||
50 | __bad_fxsave_alignment(); | ||
51 | set_in_cr4(X86_CR4_OSFXSR); | ||
52 | set_in_cr4(X86_CR4_OSXMMEXCPT); | ||
53 | |||
54 | write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ | ||
55 | |||
56 | mxcsr_feature_mask_init(); | ||
57 | /* clean state in init */ | ||
58 | current_thread_info()->status = 0; | ||
59 | clear_used_math(); | ||
60 | } | ||
61 | |||
62 | void init_fpu(struct task_struct *child) | ||
63 | { | ||
64 | if (tsk_used_math(child)) { | ||
65 | if (child == current) | ||
66 | unlazy_fpu(child); | ||
67 | return; | ||
68 | } | ||
69 | memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
70 | child->thread.i387.fxsave.cwd = 0x37f; | ||
71 | child->thread.i387.fxsave.mxcsr = 0x1f80; | ||
72 | /* only the device not available exception or ptrace can call init_fpu */ | ||
73 | set_stopped_child_used_math(child); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Signal frame handlers. | ||
78 | */ | ||
79 | |||
80 | int save_i387(struct _fpstate __user *buf) | ||
81 | { | ||
82 | struct task_struct *tsk = current; | ||
83 | int err = 0; | ||
84 | |||
85 | BUILD_BUG_ON(sizeof(struct user_i387_struct) != | ||
86 | sizeof(tsk->thread.i387.fxsave)); | ||
87 | |||
88 | if ((unsigned long)buf % 16) | ||
89 | printk("save_i387: bad fpstate %p\n",buf); | ||
90 | |||
91 | if (!used_math()) | ||
92 | return 0; | ||
93 | clear_used_math(); /* trigger finit */ | ||
94 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | ||
95 | err = save_i387_checking((struct i387_fxsave_struct __user *)buf); | ||
96 | if (err) return err; | ||
97 | stts(); | ||
98 | } else { | ||
99 | if (__copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
100 | sizeof(struct i387_fxsave_struct))) | ||
101 | return -1; | ||
102 | } | ||
103 | return 1; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * ptrace request handlers. | ||
108 | */ | ||
109 | |||
110 | int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) | ||
111 | { | ||
112 | init_fpu(tsk); | ||
113 | return __copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
114 | sizeof(struct user_i387_struct)) ? -EFAULT : 0; | ||
115 | } | ||
116 | |||
117 | int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) | ||
118 | { | ||
119 | if (__copy_from_user(&tsk->thread.i387.fxsave, buf, | ||
120 | sizeof(struct user_i387_struct))) | ||
121 | return -EFAULT; | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * FPU state for core dumps. | ||
127 | */ | ||
128 | |||
129 | int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) | ||
130 | { | ||
131 | struct task_struct *tsk = current; | ||
132 | |||
133 | if (!used_math()) | ||
134 | return 0; | ||
135 | |||
136 | unlazy_fpu(tsk); | ||
137 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
138 | return 1; | ||
139 | } | ||
140 | |||
141 | int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) | ||
142 | { | ||
143 | int fpvalid = !!tsk_used_math(tsk); | ||
144 | |||
145 | if (fpvalid) { | ||
146 | if (tsk == current) | ||
147 | unlazy_fpu(tsk); | ||
148 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
149 | } | ||
150 | return fpvalid; | ||
151 | } | ||
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c deleted file mode 100644 index 948cae646099..000000000000 --- a/arch/x86_64/kernel/i8259.c +++ /dev/null | |||
@@ -1,544 +0,0 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/signal.h> | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/ioport.h> | ||
6 | #include <linux/interrupt.h> | ||
7 | #include <linux/timex.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/random.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/kernel_stat.h> | ||
12 | #include <linux/sysdev.h> | ||
13 | #include <linux/bitops.h> | ||
14 | |||
15 | #include <asm/acpi.h> | ||
16 | #include <asm/atomic.h> | ||
17 | #include <asm/system.h> | ||
18 | #include <asm/io.h> | ||
19 | #include <asm/hw_irq.h> | ||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/delay.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/apic.h> | ||
24 | |||
25 | /* | ||
26 | * Common place to define all x86 IRQ vectors | ||
27 | * | ||
28 | * This builds up the IRQ handler stubs using some ugly macros in irq.h | ||
29 | * | ||
30 | * These macros create the low-level assembly IRQ routines that save | ||
31 | * register context and call do_IRQ(). do_IRQ() then does all the | ||
32 | * operations that are needed to keep the AT (or SMP IOAPIC) | ||
33 | * interrupt-controller happy. | ||
34 | */ | ||
35 | |||
36 | #define BI(x,y) \ | ||
37 | BUILD_IRQ(x##y) | ||
38 | |||
39 | #define BUILD_16_IRQS(x) \ | ||
40 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
41 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
42 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
43 | BI(x,c) BI(x,d) BI(x,e) BI(x,f) | ||
44 | |||
45 | /* | ||
46 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
47 | * (these are usually mapped to vectors 0x30-0x3f) | ||
48 | */ | ||
49 | |||
50 | /* | ||
51 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
52 | * are unused but an SMP system is supposed to have enough memory ... | ||
53 | * sometimes (mostly wrt. hw bugs) we get corrupted vectors all | ||
54 | * across the spectrum, so we really want to be prepared to get all | ||
55 | * of these. Plus, more powerful systems might have more than 64 | ||
56 | * IO-APIC registers. | ||
57 | * | ||
58 | * (these are usually mapped into the 0x30-0xff vector range) | ||
59 | */ | ||
60 | BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) | ||
61 | BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) | ||
62 | BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) | ||
63 | BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) | ||
64 | |||
65 | #undef BUILD_16_IRQS | ||
66 | #undef BI | ||
67 | |||
68 | |||
69 | #define IRQ(x,y) \ | ||
70 | IRQ##x##y##_interrupt | ||
71 | |||
72 | #define IRQLIST_16(x) \ | ||
73 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
74 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
75 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
76 | IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) | ||
77 | |||
78 | /* for the irq vectors */ | ||
79 | static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { | ||
80 | IRQLIST_16(0x2), IRQLIST_16(0x3), | ||
81 | IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), | ||
82 | IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), | ||
83 | IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) | ||
84 | }; | ||
85 | |||
86 | #undef IRQ | ||
87 | #undef IRQLIST_16 | ||
88 | |||
89 | /* | ||
90 | * This is the 'legacy' 8259A Programmable Interrupt Controller, | ||
91 | * present in the majority of PC/AT boxes. | ||
92 | * plus some generic x86 specific things if generic specifics makes | ||
93 | * any sense at all. | ||
94 | * this file should become arch/i386/kernel/irq.c when the old irq.c | ||
95 | * moves to arch independent land | ||
96 | */ | ||
97 | |||
98 | static int i8259A_auto_eoi; | ||
99 | DEFINE_SPINLOCK(i8259A_lock); | ||
100 | static void mask_and_ack_8259A(unsigned int); | ||
101 | |||
102 | static struct irq_chip i8259A_chip = { | ||
103 | .name = "XT-PIC", | ||
104 | .mask = disable_8259A_irq, | ||
105 | .disable = disable_8259A_irq, | ||
106 | .unmask = enable_8259A_irq, | ||
107 | .mask_ack = mask_and_ack_8259A, | ||
108 | }; | ||
109 | |||
110 | /* | ||
111 | * 8259A PIC functions to handle ISA devices: | ||
112 | */ | ||
113 | |||
114 | /* | ||
115 | * This contains the irq mask for both 8259A irq controllers, | ||
116 | */ | ||
117 | static unsigned int cached_irq_mask = 0xffff; | ||
118 | |||
119 | #define __byte(x,y) (((unsigned char *)&(y))[x]) | ||
120 | #define cached_21 (__byte(0,cached_irq_mask)) | ||
121 | #define cached_A1 (__byte(1,cached_irq_mask)) | ||
122 | |||
123 | /* | ||
124 | * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) | ||
125 | * boards the timer interrupt is not really connected to any IO-APIC pin, | ||
126 | * it's fed to the master 8259A's IR0 line only. | ||
127 | * | ||
128 | * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. | ||
129 | * this 'mixed mode' IRQ handling costs nothing because it's only used | ||
130 | * at IRQ setup time. | ||
131 | */ | ||
132 | unsigned long io_apic_irqs; | ||
133 | |||
134 | void disable_8259A_irq(unsigned int irq) | ||
135 | { | ||
136 | unsigned int mask = 1 << irq; | ||
137 | unsigned long flags; | ||
138 | |||
139 | spin_lock_irqsave(&i8259A_lock, flags); | ||
140 | cached_irq_mask |= mask; | ||
141 | if (irq & 8) | ||
142 | outb(cached_A1,0xA1); | ||
143 | else | ||
144 | outb(cached_21,0x21); | ||
145 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
146 | } | ||
147 | |||
148 | void enable_8259A_irq(unsigned int irq) | ||
149 | { | ||
150 | unsigned int mask = ~(1 << irq); | ||
151 | unsigned long flags; | ||
152 | |||
153 | spin_lock_irqsave(&i8259A_lock, flags); | ||
154 | cached_irq_mask &= mask; | ||
155 | if (irq & 8) | ||
156 | outb(cached_A1,0xA1); | ||
157 | else | ||
158 | outb(cached_21,0x21); | ||
159 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
160 | } | ||
161 | |||
162 | int i8259A_irq_pending(unsigned int irq) | ||
163 | { | ||
164 | unsigned int mask = 1<<irq; | ||
165 | unsigned long flags; | ||
166 | int ret; | ||
167 | |||
168 | spin_lock_irqsave(&i8259A_lock, flags); | ||
169 | if (irq < 8) | ||
170 | ret = inb(0x20) & mask; | ||
171 | else | ||
172 | ret = inb(0xA0) & (mask >> 8); | ||
173 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
174 | |||
175 | return ret; | ||
176 | } | ||
177 | |||
178 | void make_8259A_irq(unsigned int irq) | ||
179 | { | ||
180 | disable_irq_nosync(irq); | ||
181 | io_apic_irqs &= ~(1<<irq); | ||
182 | set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, | ||
183 | "XT"); | ||
184 | enable_irq(irq); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * This function assumes to be called rarely. Switching between | ||
189 | * 8259A registers is slow. | ||
190 | * This has to be protected by the irq controller spinlock | ||
191 | * before being called. | ||
192 | */ | ||
193 | static inline int i8259A_irq_real(unsigned int irq) | ||
194 | { | ||
195 | int value; | ||
196 | int irqmask = 1<<irq; | ||
197 | |||
198 | if (irq < 8) { | ||
199 | outb(0x0B,0x20); /* ISR register */ | ||
200 | value = inb(0x20) & irqmask; | ||
201 | outb(0x0A,0x20); /* back to the IRR register */ | ||
202 | return value; | ||
203 | } | ||
204 | outb(0x0B,0xA0); /* ISR register */ | ||
205 | value = inb(0xA0) & (irqmask >> 8); | ||
206 | outb(0x0A,0xA0); /* back to the IRR register */ | ||
207 | return value; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Careful! The 8259A is a fragile beast, it pretty | ||
212 | * much _has_ to be done exactly like this (mask it | ||
213 | * first, _then_ send the EOI, and the order of EOI | ||
214 | * to the two 8259s is important! | ||
215 | */ | ||
216 | static void mask_and_ack_8259A(unsigned int irq) | ||
217 | { | ||
218 | unsigned int irqmask = 1 << irq; | ||
219 | unsigned long flags; | ||
220 | |||
221 | spin_lock_irqsave(&i8259A_lock, flags); | ||
222 | /* | ||
223 | * Lightweight spurious IRQ detection. We do not want | ||
224 | * to overdo spurious IRQ handling - it's usually a sign | ||
225 | * of hardware problems, so we only do the checks we can | ||
226 | * do without slowing down good hardware unnecessarily. | ||
227 | * | ||
228 | * Note that IRQ7 and IRQ15 (the two spurious IRQs | ||
229 | * usually resulting from the 8259A-1|2 PICs) occur | ||
230 | * even if the IRQ is masked in the 8259A. Thus we | ||
231 | * can check spurious 8259A IRQs without doing the | ||
232 | * quite slow i8259A_irq_real() call for every IRQ. | ||
233 | * This does not cover 100% of spurious interrupts, | ||
234 | * but should be enough to warn the user that there | ||
235 | * is something bad going on ... | ||
236 | */ | ||
237 | if (cached_irq_mask & irqmask) | ||
238 | goto spurious_8259A_irq; | ||
239 | cached_irq_mask |= irqmask; | ||
240 | |||
241 | handle_real_irq: | ||
242 | if (irq & 8) { | ||
243 | inb(0xA1); /* DUMMY - (do we need this?) */ | ||
244 | outb(cached_A1,0xA1); | ||
245 | outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ | ||
246 | outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ | ||
247 | } else { | ||
248 | inb(0x21); /* DUMMY - (do we need this?) */ | ||
249 | outb(cached_21,0x21); | ||
250 | outb(0x60+irq,0x20); /* 'Specific EOI' to master */ | ||
251 | } | ||
252 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
253 | return; | ||
254 | |||
255 | spurious_8259A_irq: | ||
256 | /* | ||
257 | * this is the slow path - should happen rarely. | ||
258 | */ | ||
259 | if (i8259A_irq_real(irq)) | ||
260 | /* | ||
261 | * oops, the IRQ _is_ in service according to the | ||
262 | * 8259A - not spurious, go handle it. | ||
263 | */ | ||
264 | goto handle_real_irq; | ||
265 | |||
266 | { | ||
267 | static int spurious_irq_mask; | ||
268 | /* | ||
269 | * At this point we can be sure the IRQ is spurious, | ||
270 | * lets ACK and report it. [once per IRQ] | ||
271 | */ | ||
272 | if (!(spurious_irq_mask & irqmask)) { | ||
273 | printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); | ||
274 | spurious_irq_mask |= irqmask; | ||
275 | } | ||
276 | atomic_inc(&irq_err_count); | ||
277 | /* | ||
278 | * Theoretically we do not have to handle this IRQ, | ||
279 | * but in Linux this does not cause problems and is | ||
280 | * simpler for us. | ||
281 | */ | ||
282 | goto handle_real_irq; | ||
283 | } | ||
284 | } | ||
285 | |||
286 | void init_8259A(int auto_eoi) | ||
287 | { | ||
288 | unsigned long flags; | ||
289 | |||
290 | i8259A_auto_eoi = auto_eoi; | ||
291 | |||
292 | spin_lock_irqsave(&i8259A_lock, flags); | ||
293 | |||
294 | outb(0xff, 0x21); /* mask all of 8259A-1 */ | ||
295 | outb(0xff, 0xA1); /* mask all of 8259A-2 */ | ||
296 | |||
297 | /* | ||
298 | * outb_p - this has to work on a wide range of PC hardware. | ||
299 | */ | ||
300 | outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ | ||
301 | outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ | ||
302 | outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ | ||
303 | if (auto_eoi) | ||
304 | outb_p(0x03, 0x21); /* master does Auto EOI */ | ||
305 | else | ||
306 | outb_p(0x01, 0x21); /* master expects normal EOI */ | ||
307 | |||
308 | outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ | ||
309 | outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ | ||
310 | outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ | ||
311 | outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode | ||
312 | is to be investigated) */ | ||
313 | |||
314 | if (auto_eoi) | ||
315 | /* | ||
316 | * in AEOI mode we just have to mask the interrupt | ||
317 | * when acking. | ||
318 | */ | ||
319 | i8259A_chip.mask_ack = disable_8259A_irq; | ||
320 | else | ||
321 | i8259A_chip.mask_ack = mask_and_ack_8259A; | ||
322 | |||
323 | udelay(100); /* wait for 8259A to initialize */ | ||
324 | |||
325 | outb(cached_21, 0x21); /* restore master IRQ mask */ | ||
326 | outb(cached_A1, 0xA1); /* restore slave IRQ mask */ | ||
327 | |||
328 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
329 | } | ||
330 | |||
331 | static char irq_trigger[2]; | ||
332 | /** | ||
333 | * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ | ||
334 | */ | ||
335 | static void restore_ELCR(char *trigger) | ||
336 | { | ||
337 | outb(trigger[0], 0x4d0); | ||
338 | outb(trigger[1], 0x4d1); | ||
339 | } | ||
340 | |||
341 | static void save_ELCR(char *trigger) | ||
342 | { | ||
343 | /* IRQ 0,1,2,8,13 are marked as reserved */ | ||
344 | trigger[0] = inb(0x4d0) & 0xF8; | ||
345 | trigger[1] = inb(0x4d1) & 0xDE; | ||
346 | } | ||
347 | |||
348 | static int i8259A_resume(struct sys_device *dev) | ||
349 | { | ||
350 | init_8259A(i8259A_auto_eoi); | ||
351 | restore_ELCR(irq_trigger); | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | static int i8259A_suspend(struct sys_device *dev, pm_message_t state) | ||
356 | { | ||
357 | save_ELCR(irq_trigger); | ||
358 | return 0; | ||
359 | } | ||
360 | |||
361 | static int i8259A_shutdown(struct sys_device *dev) | ||
362 | { | ||
363 | /* Put the i8259A into a quiescent state that | ||
364 | * the kernel initialization code can get it | ||
365 | * out of. | ||
366 | */ | ||
367 | outb(0xff, 0x21); /* mask all of 8259A-1 */ | ||
368 | outb(0xff, 0xA1); /* mask all of 8259A-1 */ | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | static struct sysdev_class i8259_sysdev_class = { | ||
373 | set_kset_name("i8259"), | ||
374 | .suspend = i8259A_suspend, | ||
375 | .resume = i8259A_resume, | ||
376 | .shutdown = i8259A_shutdown, | ||
377 | }; | ||
378 | |||
379 | static struct sys_device device_i8259A = { | ||
380 | .id = 0, | ||
381 | .cls = &i8259_sysdev_class, | ||
382 | }; | ||
383 | |||
384 | static int __init i8259A_init_sysfs(void) | ||
385 | { | ||
386 | int error = sysdev_class_register(&i8259_sysdev_class); | ||
387 | if (!error) | ||
388 | error = sysdev_register(&device_i8259A); | ||
389 | return error; | ||
390 | } | ||
391 | |||
392 | device_initcall(i8259A_init_sysfs); | ||
393 | |||
394 | /* | ||
395 | * IRQ2 is cascade interrupt to second interrupt controller | ||
396 | */ | ||
397 | |||
398 | static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; | ||
399 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | ||
400 | [0 ... IRQ0_VECTOR - 1] = -1, | ||
401 | [IRQ0_VECTOR] = 0, | ||
402 | [IRQ1_VECTOR] = 1, | ||
403 | [IRQ2_VECTOR] = 2, | ||
404 | [IRQ3_VECTOR] = 3, | ||
405 | [IRQ4_VECTOR] = 4, | ||
406 | [IRQ5_VECTOR] = 5, | ||
407 | [IRQ6_VECTOR] = 6, | ||
408 | [IRQ7_VECTOR] = 7, | ||
409 | [IRQ8_VECTOR] = 8, | ||
410 | [IRQ9_VECTOR] = 9, | ||
411 | [IRQ10_VECTOR] = 10, | ||
412 | [IRQ11_VECTOR] = 11, | ||
413 | [IRQ12_VECTOR] = 12, | ||
414 | [IRQ13_VECTOR] = 13, | ||
415 | [IRQ14_VECTOR] = 14, | ||
416 | [IRQ15_VECTOR] = 15, | ||
417 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | ||
418 | }; | ||
419 | |||
420 | void __init init_ISA_irqs (void) | ||
421 | { | ||
422 | int i; | ||
423 | |||
424 | init_bsp_APIC(); | ||
425 | init_8259A(0); | ||
426 | |||
427 | for (i = 0; i < NR_IRQS; i++) { | ||
428 | irq_desc[i].status = IRQ_DISABLED; | ||
429 | irq_desc[i].action = NULL; | ||
430 | irq_desc[i].depth = 1; | ||
431 | |||
432 | if (i < 16) { | ||
433 | /* | ||
434 | * 16 old-style INTA-cycle interrupts: | ||
435 | */ | ||
436 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
437 | handle_level_irq, "XT"); | ||
438 | } else { | ||
439 | /* | ||
440 | * 'high' PCI IRQs filled in on demand | ||
441 | */ | ||
442 | irq_desc[i].chip = &no_irq_chip; | ||
443 | } | ||
444 | } | ||
445 | } | ||
446 | |||
447 | static void setup_timer_hardware(void) | ||
448 | { | ||
449 | outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ | ||
450 | udelay(10); | ||
451 | outb_p(LATCH & 0xff , 0x40); /* LSB */ | ||
452 | udelay(10); | ||
453 | outb(LATCH >> 8 , 0x40); /* MSB */ | ||
454 | } | ||
455 | |||
456 | static int timer_resume(struct sys_device *dev) | ||
457 | { | ||
458 | setup_timer_hardware(); | ||
459 | return 0; | ||
460 | } | ||
461 | |||
462 | void i8254_timer_resume(void) | ||
463 | { | ||
464 | setup_timer_hardware(); | ||
465 | } | ||
466 | |||
467 | static struct sysdev_class timer_sysclass = { | ||
468 | set_kset_name("timer_pit"), | ||
469 | .resume = timer_resume, | ||
470 | }; | ||
471 | |||
472 | static struct sys_device device_timer = { | ||
473 | .id = 0, | ||
474 | .cls = &timer_sysclass, | ||
475 | }; | ||
476 | |||
477 | static int __init init_timer_sysfs(void) | ||
478 | { | ||
479 | int error = sysdev_class_register(&timer_sysclass); | ||
480 | if (!error) | ||
481 | error = sysdev_register(&device_timer); | ||
482 | return error; | ||
483 | } | ||
484 | |||
485 | device_initcall(init_timer_sysfs); | ||
486 | |||
487 | void __init init_IRQ(void) | ||
488 | { | ||
489 | int i; | ||
490 | |||
491 | init_ISA_irqs(); | ||
492 | /* | ||
493 | * Cover the whole vector space, no vector can escape | ||
494 | * us. (some of these will be overridden and become | ||
495 | * 'special' SMP interrupts) | ||
496 | */ | ||
497 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
498 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
499 | if (vector != IA32_SYSCALL_VECTOR) | ||
500 | set_intr_gate(vector, interrupt[i]); | ||
501 | } | ||
502 | |||
503 | #ifdef CONFIG_SMP | ||
504 | /* | ||
505 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
506 | * IPI, driven by wakeup. | ||
507 | */ | ||
508 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
509 | |||
510 | /* IPIs for invalidation */ | ||
511 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); | ||
512 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); | ||
513 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); | ||
514 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); | ||
515 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); | ||
516 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); | ||
517 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); | ||
518 | set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); | ||
519 | |||
520 | /* IPI for generic function call */ | ||
521 | set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
522 | |||
523 | /* Low priority IPI to cleanup after moving an irq */ | ||
524 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | ||
525 | #endif | ||
526 | set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
527 | set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | ||
528 | |||
529 | /* self generated IPI for local APIC timer */ | ||
530 | set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
531 | |||
532 | /* IPI vectors for APIC spurious and error interrupts */ | ||
533 | set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
534 | set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
535 | |||
536 | /* | ||
537 | * Set the clock to HZ Hz, we already have a valid | ||
538 | * vector now: | ||
539 | */ | ||
540 | setup_timer_hardware(); | ||
541 | |||
542 | if (!acpi_ioapic) | ||
543 | setup_irq(2, &irq2); | ||
544 | } | ||
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c deleted file mode 100644 index 4ff33d4f8551..000000000000 --- a/arch/x86_64/kernel/init_task.c +++ /dev/null | |||
@@ -1,54 +0,0 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/init_task.h> | ||
6 | #include <linux/fs.h> | ||
7 | #include <linux/mqueue.h> | ||
8 | |||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/desc.h> | ||
12 | |||
13 | static struct fs_struct init_fs = INIT_FS; | ||
14 | static struct files_struct init_files = INIT_FILES; | ||
15 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | ||
16 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | ||
17 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
18 | |||
19 | EXPORT_SYMBOL(init_mm); | ||
20 | |||
21 | /* | ||
22 | * Initial task structure. | ||
23 | * | ||
24 | * We need to make sure that this is 8192-byte aligned due to the | ||
25 | * way process stacks are handled. This is done by having a special | ||
26 | * "init_task" linker map entry.. | ||
27 | */ | ||
28 | union thread_union init_thread_union | ||
29 | __attribute__((__section__(".data.init_task"))) = | ||
30 | { INIT_THREAD_INFO(init_task) }; | ||
31 | |||
32 | /* | ||
33 | * Initial task structure. | ||
34 | * | ||
35 | * All other task structs will be allocated on slabs in fork.c | ||
36 | */ | ||
37 | struct task_struct init_task = INIT_TASK(init_task); | ||
38 | |||
39 | EXPORT_SYMBOL(init_task); | ||
40 | /* | ||
41 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
42 | * no more per-task TSS's. The TSS size is kept cacheline-aligned | ||
43 | * so they are allowed to end up in the .data.cacheline_aligned | ||
44 | * section. Since TSS's are completely CPU-local, we want them | ||
45 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | ||
46 | */ | ||
47 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; | ||
48 | |||
49 | /* Copies of the original ist values from the tss are only accessed during | ||
50 | * debugging, no special alignment required. | ||
51 | */ | ||
52 | DEFINE_PER_CPU(struct orig_ist, orig_ist); | ||
53 | |||
54 | #define ALIGN_TO_4K __attribute__((section(".data.init_task"))) | ||
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c deleted file mode 100644 index 966fa1062491..000000000000 --- a/arch/x86_64/kernel/io_apic.c +++ /dev/null | |||
@@ -1,2202 +0,0 @@ | |||
1 | /* | ||
2 | * Intel IO-APIC support for multi-Pentium hosts. | ||
3 | * | ||
4 | * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | ||
5 | * | ||
6 | * Many thanks to Stig Venaas for trying out countless experimental | ||
7 | * patches and reporting/debugging problems patiently! | ||
8 | * | ||
9 | * (c) 1999, Multiple IO-APIC support, developed by | ||
10 | * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | ||
11 | * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | ||
12 | * further tested and cleaned up by Zach Brown <zab@redhat.com> | ||
13 | * and Ingo Molnar <mingo@redhat.com> | ||
14 | * | ||
15 | * Fixes | ||
16 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
17 | * thanks to Eric Gilmore | ||
18 | * and Rolf G. Tews | ||
19 | * for testing these extensively | ||
20 | * Paul Diefenbaugh : Added full ACPI support | ||
21 | */ | ||
22 | |||
23 | #include <linux/mm.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/pci.h> | ||
29 | #include <linux/mc146818rtc.h> | ||
30 | #include <linux/acpi.h> | ||
31 | #include <linux/sysdev.h> | ||
32 | #include <linux/msi.h> | ||
33 | #include <linux/htirq.h> | ||
34 | #ifdef CONFIG_ACPI | ||
35 | #include <acpi/acpi_bus.h> | ||
36 | #endif | ||
37 | |||
38 | #include <asm/idle.h> | ||
39 | #include <asm/io.h> | ||
40 | #include <asm/smp.h> | ||
41 | #include <asm/desc.h> | ||
42 | #include <asm/proto.h> | ||
43 | #include <asm/mach_apic.h> | ||
44 | #include <asm/acpi.h> | ||
45 | #include <asm/dma.h> | ||
46 | #include <asm/nmi.h> | ||
47 | #include <asm/msidef.h> | ||
48 | #include <asm/hypertransport.h> | ||
49 | |||
50 | struct irq_cfg { | ||
51 | cpumask_t domain; | ||
52 | cpumask_t old_domain; | ||
53 | unsigned move_cleanup_count; | ||
54 | u8 vector; | ||
55 | u8 move_in_progress : 1; | ||
56 | }; | ||
57 | |||
58 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ | ||
59 | struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { | ||
60 | [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, | ||
61 | [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, | ||
62 | [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, | ||
63 | [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, | ||
64 | [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, | ||
65 | [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, | ||
66 | [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, | ||
67 | [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, | ||
68 | [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, | ||
69 | [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, | ||
70 | [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, | ||
71 | [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, | ||
72 | [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, | ||
73 | [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, | ||
74 | [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, | ||
75 | [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, | ||
76 | }; | ||
77 | |||
78 | static int assign_irq_vector(int irq, cpumask_t mask); | ||
79 | |||
80 | #define __apicdebuginit __init | ||
81 | |||
82 | int sis_apic_bug; /* not actually supported, dummy for compile */ | ||
83 | |||
84 | static int no_timer_check; | ||
85 | |||
86 | static int disable_timer_pin_1 __initdata; | ||
87 | |||
88 | int timer_over_8254 __initdata = 1; | ||
89 | |||
90 | /* Where if anywhere is the i8259 connect in external int mode */ | ||
91 | static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | ||
92 | |||
93 | static DEFINE_SPINLOCK(ioapic_lock); | ||
94 | DEFINE_SPINLOCK(vector_lock); | ||
95 | |||
96 | /* | ||
97 | * # of IRQ routing registers | ||
98 | */ | ||
99 | int nr_ioapic_registers[MAX_IO_APICS]; | ||
100 | |||
101 | /* | ||
102 | * Rough estimation of how many shared IRQs there are, can | ||
103 | * be changed anytime. | ||
104 | */ | ||
105 | #define MAX_PLUS_SHARED_IRQS NR_IRQS | ||
106 | #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | ||
107 | |||
108 | /* | ||
109 | * This is performance-critical, we want to do it O(1) | ||
110 | * | ||
111 | * the indexing order of this array favors 1:1 mappings | ||
112 | * between pins and IRQs. | ||
113 | */ | ||
114 | |||
115 | static struct irq_pin_list { | ||
116 | short apic, pin, next; | ||
117 | } irq_2_pin[PIN_MAP_SIZE]; | ||
118 | |||
119 | struct io_apic { | ||
120 | unsigned int index; | ||
121 | unsigned int unused[3]; | ||
122 | unsigned int data; | ||
123 | }; | ||
124 | |||
125 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) | ||
126 | { | ||
127 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) | ||
128 | + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); | ||
129 | } | ||
130 | |||
131 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) | ||
132 | { | ||
133 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
134 | writel(reg, &io_apic->index); | ||
135 | return readl(&io_apic->data); | ||
136 | } | ||
137 | |||
138 | static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) | ||
139 | { | ||
140 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
141 | writel(reg, &io_apic->index); | ||
142 | writel(value, &io_apic->data); | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Re-write a value: to be used for read-modify-write | ||
147 | * cycles where the read already set up the index register. | ||
148 | */ | ||
149 | static inline void io_apic_modify(unsigned int apic, unsigned int value) | ||
150 | { | ||
151 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
152 | writel(value, &io_apic->data); | ||
153 | } | ||
154 | |||
155 | static int io_apic_level_ack_pending(unsigned int irq) | ||
156 | { | ||
157 | struct irq_pin_list *entry; | ||
158 | unsigned long flags; | ||
159 | int pending = 0; | ||
160 | |||
161 | spin_lock_irqsave(&ioapic_lock, flags); | ||
162 | entry = irq_2_pin + irq; | ||
163 | for (;;) { | ||
164 | unsigned int reg; | ||
165 | int pin; | ||
166 | |||
167 | pin = entry->pin; | ||
168 | if (pin == -1) | ||
169 | break; | ||
170 | reg = io_apic_read(entry->apic, 0x10 + pin*2); | ||
171 | /* Is the remote IRR bit set? */ | ||
172 | pending |= (reg >> 14) & 1; | ||
173 | if (!entry->next) | ||
174 | break; | ||
175 | entry = irq_2_pin + entry->next; | ||
176 | } | ||
177 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
178 | return pending; | ||
179 | } | ||
180 | |||
181 | /* | ||
182 | * Synchronize the IO-APIC and the CPU by doing | ||
183 | * a dummy read from the IO-APIC | ||
184 | */ | ||
185 | static inline void io_apic_sync(unsigned int apic) | ||
186 | { | ||
187 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
188 | readl(&io_apic->data); | ||
189 | } | ||
190 | |||
191 | #define __DO_ACTION(R, ACTION, FINAL) \ | ||
192 | \ | ||
193 | { \ | ||
194 | int pin; \ | ||
195 | struct irq_pin_list *entry = irq_2_pin + irq; \ | ||
196 | \ | ||
197 | BUG_ON(irq >= NR_IRQS); \ | ||
198 | for (;;) { \ | ||
199 | unsigned int reg; \ | ||
200 | pin = entry->pin; \ | ||
201 | if (pin == -1) \ | ||
202 | break; \ | ||
203 | reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ | ||
204 | reg ACTION; \ | ||
205 | io_apic_modify(entry->apic, reg); \ | ||
206 | FINAL; \ | ||
207 | if (!entry->next) \ | ||
208 | break; \ | ||
209 | entry = irq_2_pin + entry->next; \ | ||
210 | } \ | ||
211 | } | ||
212 | |||
213 | union entry_union { | ||
214 | struct { u32 w1, w2; }; | ||
215 | struct IO_APIC_route_entry entry; | ||
216 | }; | ||
217 | |||
218 | static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) | ||
219 | { | ||
220 | union entry_union eu; | ||
221 | unsigned long flags; | ||
222 | spin_lock_irqsave(&ioapic_lock, flags); | ||
223 | eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); | ||
224 | eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); | ||
225 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
226 | return eu.entry; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * When we write a new IO APIC routing entry, we need to write the high | ||
231 | * word first! If the mask bit in the low word is clear, we will enable | ||
232 | * the interrupt, and we need to make sure the entry is fully populated | ||
233 | * before that happens. | ||
234 | */ | ||
235 | static void | ||
236 | __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
237 | { | ||
238 | union entry_union eu; | ||
239 | eu.entry = e; | ||
240 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
241 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
242 | } | ||
243 | |||
244 | static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
245 | { | ||
246 | unsigned long flags; | ||
247 | spin_lock_irqsave(&ioapic_lock, flags); | ||
248 | __ioapic_write_entry(apic, pin, e); | ||
249 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * When we mask an IO APIC routing entry, we need to write the low | ||
254 | * word first, in order to set the mask bit before we change the | ||
255 | * high bits! | ||
256 | */ | ||
257 | static void ioapic_mask_entry(int apic, int pin) | ||
258 | { | ||
259 | unsigned long flags; | ||
260 | union entry_union eu = { .entry.mask = 1 }; | ||
261 | |||
262 | spin_lock_irqsave(&ioapic_lock, flags); | ||
263 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
264 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
265 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
266 | } | ||
267 | |||
268 | #ifdef CONFIG_SMP | ||
269 | static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) | ||
270 | { | ||
271 | int apic, pin; | ||
272 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
273 | |||
274 | BUG_ON(irq >= NR_IRQS); | ||
275 | for (;;) { | ||
276 | unsigned int reg; | ||
277 | apic = entry->apic; | ||
278 | pin = entry->pin; | ||
279 | if (pin == -1) | ||
280 | break; | ||
281 | io_apic_write(apic, 0x11 + pin*2, dest); | ||
282 | reg = io_apic_read(apic, 0x10 + pin*2); | ||
283 | reg &= ~0x000000ff; | ||
284 | reg |= vector; | ||
285 | io_apic_modify(apic, reg); | ||
286 | if (!entry->next) | ||
287 | break; | ||
288 | entry = irq_2_pin + entry->next; | ||
289 | } | ||
290 | } | ||
291 | |||
292 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | ||
293 | { | ||
294 | struct irq_cfg *cfg = irq_cfg + irq; | ||
295 | unsigned long flags; | ||
296 | unsigned int dest; | ||
297 | cpumask_t tmp; | ||
298 | |||
299 | cpus_and(tmp, mask, cpu_online_map); | ||
300 | if (cpus_empty(tmp)) | ||
301 | return; | ||
302 | |||
303 | if (assign_irq_vector(irq, mask)) | ||
304 | return; | ||
305 | |||
306 | cpus_and(tmp, cfg->domain, mask); | ||
307 | dest = cpu_mask_to_apicid(tmp); | ||
308 | |||
309 | /* | ||
310 | * Only the high 8 bits are valid. | ||
311 | */ | ||
312 | dest = SET_APIC_LOGICAL_ID(dest); | ||
313 | |||
314 | spin_lock_irqsave(&ioapic_lock, flags); | ||
315 | __target_IO_APIC_irq(irq, dest, cfg->vector); | ||
316 | irq_desc[irq].affinity = mask; | ||
317 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
318 | } | ||
319 | #endif | ||
320 | |||
321 | /* | ||
322 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | ||
323 | * shared ISA-space IRQs, so we have to support them. We are super | ||
324 | * fast in the common case, and fast for shared ISA-space IRQs. | ||
325 | */ | ||
326 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) | ||
327 | { | ||
328 | static int first_free_entry = NR_IRQS; | ||
329 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
330 | |||
331 | BUG_ON(irq >= NR_IRQS); | ||
332 | while (entry->next) | ||
333 | entry = irq_2_pin + entry->next; | ||
334 | |||
335 | if (entry->pin != -1) { | ||
336 | entry->next = first_free_entry; | ||
337 | entry = irq_2_pin + entry->next; | ||
338 | if (++first_free_entry >= PIN_MAP_SIZE) | ||
339 | panic("io_apic.c: ran out of irq_2_pin entries!"); | ||
340 | } | ||
341 | entry->apic = apic; | ||
342 | entry->pin = pin; | ||
343 | } | ||
344 | |||
345 | |||
346 | #define DO_ACTION(name,R,ACTION, FINAL) \ | ||
347 | \ | ||
348 | static void name##_IO_APIC_irq (unsigned int irq) \ | ||
349 | __DO_ACTION(R, ACTION, FINAL) | ||
350 | |||
351 | DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) | ||
352 | /* mask = 1 */ | ||
353 | DO_ACTION( __unmask, 0, &= 0xfffeffff, ) | ||
354 | /* mask = 0 */ | ||
355 | |||
356 | static void mask_IO_APIC_irq (unsigned int irq) | ||
357 | { | ||
358 | unsigned long flags; | ||
359 | |||
360 | spin_lock_irqsave(&ioapic_lock, flags); | ||
361 | __mask_IO_APIC_irq(irq); | ||
362 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
363 | } | ||
364 | |||
365 | static void unmask_IO_APIC_irq (unsigned int irq) | ||
366 | { | ||
367 | unsigned long flags; | ||
368 | |||
369 | spin_lock_irqsave(&ioapic_lock, flags); | ||
370 | __unmask_IO_APIC_irq(irq); | ||
371 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
372 | } | ||
373 | |||
374 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | ||
375 | { | ||
376 | struct IO_APIC_route_entry entry; | ||
377 | |||
378 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ | ||
379 | entry = ioapic_read_entry(apic, pin); | ||
380 | if (entry.delivery_mode == dest_SMI) | ||
381 | return; | ||
382 | /* | ||
383 | * Disable it in the IO-APIC irq-routing table: | ||
384 | */ | ||
385 | ioapic_mask_entry(apic, pin); | ||
386 | } | ||
387 | |||
388 | static void clear_IO_APIC (void) | ||
389 | { | ||
390 | int apic, pin; | ||
391 | |||
392 | for (apic = 0; apic < nr_ioapics; apic++) | ||
393 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | ||
394 | clear_IO_APIC_pin(apic, pin); | ||
395 | } | ||
396 | |||
397 | int skip_ioapic_setup; | ||
398 | int ioapic_force; | ||
399 | |||
400 | static int __init parse_noapic(char *str) | ||
401 | { | ||
402 | disable_ioapic_setup(); | ||
403 | return 0; | ||
404 | } | ||
405 | early_param("noapic", parse_noapic); | ||
406 | |||
407 | /* Actually the next is obsolete, but keep it for paranoid reasons -AK */ | ||
408 | static int __init disable_timer_pin_setup(char *arg) | ||
409 | { | ||
410 | disable_timer_pin_1 = 1; | ||
411 | return 1; | ||
412 | } | ||
413 | __setup("disable_timer_pin_1", disable_timer_pin_setup); | ||
414 | |||
415 | static int __init setup_disable_8254_timer(char *s) | ||
416 | { | ||
417 | timer_over_8254 = -1; | ||
418 | return 1; | ||
419 | } | ||
420 | static int __init setup_enable_8254_timer(char *s) | ||
421 | { | ||
422 | timer_over_8254 = 2; | ||
423 | return 1; | ||
424 | } | ||
425 | |||
426 | __setup("disable_8254_timer", setup_disable_8254_timer); | ||
427 | __setup("enable_8254_timer", setup_enable_8254_timer); | ||
428 | |||
429 | |||
430 | /* | ||
431 | * Find the IRQ entry number of a certain pin. | ||
432 | */ | ||
433 | static int find_irq_entry(int apic, int pin, int type) | ||
434 | { | ||
435 | int i; | ||
436 | |||
437 | for (i = 0; i < mp_irq_entries; i++) | ||
438 | if (mp_irqs[i].mpc_irqtype == type && | ||
439 | (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | ||
440 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | ||
441 | mp_irqs[i].mpc_dstirq == pin) | ||
442 | return i; | ||
443 | |||
444 | return -1; | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * Find the pin to which IRQ[irq] (ISA) is connected | ||
449 | */ | ||
450 | static int __init find_isa_irq_pin(int irq, int type) | ||
451 | { | ||
452 | int i; | ||
453 | |||
454 | for (i = 0; i < mp_irq_entries; i++) { | ||
455 | int lbus = mp_irqs[i].mpc_srcbus; | ||
456 | |||
457 | if (test_bit(lbus, mp_bus_not_pci) && | ||
458 | (mp_irqs[i].mpc_irqtype == type) && | ||
459 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
460 | |||
461 | return mp_irqs[i].mpc_dstirq; | ||
462 | } | ||
463 | return -1; | ||
464 | } | ||
465 | |||
466 | static int __init find_isa_irq_apic(int irq, int type) | ||
467 | { | ||
468 | int i; | ||
469 | |||
470 | for (i = 0; i < mp_irq_entries; i++) { | ||
471 | int lbus = mp_irqs[i].mpc_srcbus; | ||
472 | |||
473 | if (test_bit(lbus, mp_bus_not_pci) && | ||
474 | (mp_irqs[i].mpc_irqtype == type) && | ||
475 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
476 | break; | ||
477 | } | ||
478 | if (i < mp_irq_entries) { | ||
479 | int apic; | ||
480 | for(apic = 0; apic < nr_ioapics; apic++) { | ||
481 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) | ||
482 | return apic; | ||
483 | } | ||
484 | } | ||
485 | |||
486 | return -1; | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Find a specific PCI IRQ entry. | ||
491 | * Not an __init, possibly needed by modules | ||
492 | */ | ||
493 | static int pin_2_irq(int idx, int apic, int pin); | ||
494 | |||
495 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | ||
496 | { | ||
497 | int apic, i, best_guess = -1; | ||
498 | |||
499 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | ||
500 | bus, slot, pin); | ||
501 | if (mp_bus_id_to_pci_bus[bus] == -1) { | ||
502 | apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
503 | return -1; | ||
504 | } | ||
505 | for (i = 0; i < mp_irq_entries; i++) { | ||
506 | int lbus = mp_irqs[i].mpc_srcbus; | ||
507 | |||
508 | for (apic = 0; apic < nr_ioapics; apic++) | ||
509 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | ||
510 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | ||
511 | break; | ||
512 | |||
513 | if (!test_bit(lbus, mp_bus_not_pci) && | ||
514 | !mp_irqs[i].mpc_irqtype && | ||
515 | (bus == lbus) && | ||
516 | (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | ||
517 | int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | ||
518 | |||
519 | if (!(apic || IO_APIC_IRQ(irq))) | ||
520 | continue; | ||
521 | |||
522 | if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | ||
523 | return irq; | ||
524 | /* | ||
525 | * Use the first all-but-pin matching entry as a | ||
526 | * best-guess fuzzy result for broken mptables. | ||
527 | */ | ||
528 | if (best_guess < 0) | ||
529 | best_guess = irq; | ||
530 | } | ||
531 | } | ||
532 | BUG_ON(best_guess >= NR_IRQS); | ||
533 | return best_guess; | ||
534 | } | ||
535 | |||
536 | /* ISA interrupts are always polarity zero edge triggered, | ||
537 | * when listed as conforming in the MP table. */ | ||
538 | |||
539 | #define default_ISA_trigger(idx) (0) | ||
540 | #define default_ISA_polarity(idx) (0) | ||
541 | |||
542 | /* PCI interrupts are always polarity one level triggered, | ||
543 | * when listed as conforming in the MP table. */ | ||
544 | |||
545 | #define default_PCI_trigger(idx) (1) | ||
546 | #define default_PCI_polarity(idx) (1) | ||
547 | |||
548 | static int __init MPBIOS_polarity(int idx) | ||
549 | { | ||
550 | int bus = mp_irqs[idx].mpc_srcbus; | ||
551 | int polarity; | ||
552 | |||
553 | /* | ||
554 | * Determine IRQ line polarity (high active or low active): | ||
555 | */ | ||
556 | switch (mp_irqs[idx].mpc_irqflag & 3) | ||
557 | { | ||
558 | case 0: /* conforms, ie. bus-type dependent polarity */ | ||
559 | if (test_bit(bus, mp_bus_not_pci)) | ||
560 | polarity = default_ISA_polarity(idx); | ||
561 | else | ||
562 | polarity = default_PCI_polarity(idx); | ||
563 | break; | ||
564 | case 1: /* high active */ | ||
565 | { | ||
566 | polarity = 0; | ||
567 | break; | ||
568 | } | ||
569 | case 2: /* reserved */ | ||
570 | { | ||
571 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
572 | polarity = 1; | ||
573 | break; | ||
574 | } | ||
575 | case 3: /* low active */ | ||
576 | { | ||
577 | polarity = 1; | ||
578 | break; | ||
579 | } | ||
580 | default: /* invalid */ | ||
581 | { | ||
582 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
583 | polarity = 1; | ||
584 | break; | ||
585 | } | ||
586 | } | ||
587 | return polarity; | ||
588 | } | ||
589 | |||
590 | static int MPBIOS_trigger(int idx) | ||
591 | { | ||
592 | int bus = mp_irqs[idx].mpc_srcbus; | ||
593 | int trigger; | ||
594 | |||
595 | /* | ||
596 | * Determine IRQ trigger mode (edge or level sensitive): | ||
597 | */ | ||
598 | switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | ||
599 | { | ||
600 | case 0: /* conforms, ie. bus-type dependent */ | ||
601 | if (test_bit(bus, mp_bus_not_pci)) | ||
602 | trigger = default_ISA_trigger(idx); | ||
603 | else | ||
604 | trigger = default_PCI_trigger(idx); | ||
605 | break; | ||
606 | case 1: /* edge */ | ||
607 | { | ||
608 | trigger = 0; | ||
609 | break; | ||
610 | } | ||
611 | case 2: /* reserved */ | ||
612 | { | ||
613 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
614 | trigger = 1; | ||
615 | break; | ||
616 | } | ||
617 | case 3: /* level */ | ||
618 | { | ||
619 | trigger = 1; | ||
620 | break; | ||
621 | } | ||
622 | default: /* invalid */ | ||
623 | { | ||
624 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
625 | trigger = 0; | ||
626 | break; | ||
627 | } | ||
628 | } | ||
629 | return trigger; | ||
630 | } | ||
631 | |||
632 | static inline int irq_polarity(int idx) | ||
633 | { | ||
634 | return MPBIOS_polarity(idx); | ||
635 | } | ||
636 | |||
637 | static inline int irq_trigger(int idx) | ||
638 | { | ||
639 | return MPBIOS_trigger(idx); | ||
640 | } | ||
641 | |||
642 | static int pin_2_irq(int idx, int apic, int pin) | ||
643 | { | ||
644 | int irq, i; | ||
645 | int bus = mp_irqs[idx].mpc_srcbus; | ||
646 | |||
647 | /* | ||
648 | * Debugging check, we are in big trouble if this message pops up! | ||
649 | */ | ||
650 | if (mp_irqs[idx].mpc_dstirq != pin) | ||
651 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | ||
652 | |||
653 | if (test_bit(bus, mp_bus_not_pci)) { | ||
654 | irq = mp_irqs[idx].mpc_srcbusirq; | ||
655 | } else { | ||
656 | /* | ||
657 | * PCI IRQs are mapped in order | ||
658 | */ | ||
659 | i = irq = 0; | ||
660 | while (i < apic) | ||
661 | irq += nr_ioapic_registers[i++]; | ||
662 | irq += pin; | ||
663 | } | ||
664 | BUG_ON(irq >= NR_IRQS); | ||
665 | return irq; | ||
666 | } | ||
667 | |||
668 | static int __assign_irq_vector(int irq, cpumask_t mask) | ||
669 | { | ||
670 | /* | ||
671 | * NOTE! The local APIC isn't very good at handling | ||
672 | * multiple interrupts at the same interrupt level. | ||
673 | * As the interrupt level is determined by taking the | ||
674 | * vector number and shifting that right by 4, we | ||
675 | * want to spread these out a bit so that they don't | ||
676 | * all fall in the same interrupt level. | ||
677 | * | ||
678 | * Also, we've got to be careful not to trash gate | ||
679 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
680 | */ | ||
681 | static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; | ||
682 | unsigned int old_vector; | ||
683 | int cpu; | ||
684 | struct irq_cfg *cfg; | ||
685 | |||
686 | BUG_ON((unsigned)irq >= NR_IRQS); | ||
687 | cfg = &irq_cfg[irq]; | ||
688 | |||
689 | /* Only try and allocate irqs on cpus that are present */ | ||
690 | cpus_and(mask, mask, cpu_online_map); | ||
691 | |||
692 | if ((cfg->move_in_progress) || cfg->move_cleanup_count) | ||
693 | return -EBUSY; | ||
694 | |||
695 | old_vector = cfg->vector; | ||
696 | if (old_vector) { | ||
697 | cpumask_t tmp; | ||
698 | cpus_and(tmp, cfg->domain, mask); | ||
699 | if (!cpus_empty(tmp)) | ||
700 | return 0; | ||
701 | } | ||
702 | |||
703 | for_each_cpu_mask(cpu, mask) { | ||
704 | cpumask_t domain, new_mask; | ||
705 | int new_cpu; | ||
706 | int vector, offset; | ||
707 | |||
708 | domain = vector_allocation_domain(cpu); | ||
709 | cpus_and(new_mask, domain, cpu_online_map); | ||
710 | |||
711 | vector = current_vector; | ||
712 | offset = current_offset; | ||
713 | next: | ||
714 | vector += 8; | ||
715 | if (vector >= FIRST_SYSTEM_VECTOR) { | ||
716 | /* If we run out of vectors on large boxen, must share them. */ | ||
717 | offset = (offset + 1) % 8; | ||
718 | vector = FIRST_DEVICE_VECTOR + offset; | ||
719 | } | ||
720 | if (unlikely(current_vector == vector)) | ||
721 | continue; | ||
722 | if (vector == IA32_SYSCALL_VECTOR) | ||
723 | goto next; | ||
724 | for_each_cpu_mask(new_cpu, new_mask) | ||
725 | if (per_cpu(vector_irq, new_cpu)[vector] != -1) | ||
726 | goto next; | ||
727 | /* Found one! */ | ||
728 | current_vector = vector; | ||
729 | current_offset = offset; | ||
730 | if (old_vector) { | ||
731 | cfg->move_in_progress = 1; | ||
732 | cfg->old_domain = cfg->domain; | ||
733 | } | ||
734 | for_each_cpu_mask(new_cpu, new_mask) | ||
735 | per_cpu(vector_irq, new_cpu)[vector] = irq; | ||
736 | cfg->vector = vector; | ||
737 | cfg->domain = domain; | ||
738 | return 0; | ||
739 | } | ||
740 | return -ENOSPC; | ||
741 | } | ||
742 | |||
743 | static int assign_irq_vector(int irq, cpumask_t mask) | ||
744 | { | ||
745 | int err; | ||
746 | unsigned long flags; | ||
747 | |||
748 | spin_lock_irqsave(&vector_lock, flags); | ||
749 | err = __assign_irq_vector(irq, mask); | ||
750 | spin_unlock_irqrestore(&vector_lock, flags); | ||
751 | return err; | ||
752 | } | ||
753 | |||
754 | static void __clear_irq_vector(int irq) | ||
755 | { | ||
756 | struct irq_cfg *cfg; | ||
757 | cpumask_t mask; | ||
758 | int cpu, vector; | ||
759 | |||
760 | BUG_ON((unsigned)irq >= NR_IRQS); | ||
761 | cfg = &irq_cfg[irq]; | ||
762 | BUG_ON(!cfg->vector); | ||
763 | |||
764 | vector = cfg->vector; | ||
765 | cpus_and(mask, cfg->domain, cpu_online_map); | ||
766 | for_each_cpu_mask(cpu, mask) | ||
767 | per_cpu(vector_irq, cpu)[vector] = -1; | ||
768 | |||
769 | cfg->vector = 0; | ||
770 | cfg->domain = CPU_MASK_NONE; | ||
771 | } | ||
772 | |||
773 | void __setup_vector_irq(int cpu) | ||
774 | { | ||
775 | /* Initialize vector_irq on a new cpu */ | ||
776 | /* This function must be called with vector_lock held */ | ||
777 | int irq, vector; | ||
778 | |||
779 | /* Mark the inuse vectors */ | ||
780 | for (irq = 0; irq < NR_IRQS; ++irq) { | ||
781 | if (!cpu_isset(cpu, irq_cfg[irq].domain)) | ||
782 | continue; | ||
783 | vector = irq_cfg[irq].vector; | ||
784 | per_cpu(vector_irq, cpu)[vector] = irq; | ||
785 | } | ||
786 | /* Mark the free vectors */ | ||
787 | for (vector = 0; vector < NR_VECTORS; ++vector) { | ||
788 | irq = per_cpu(vector_irq, cpu)[vector]; | ||
789 | if (irq < 0) | ||
790 | continue; | ||
791 | if (!cpu_isset(cpu, irq_cfg[irq].domain)) | ||
792 | per_cpu(vector_irq, cpu)[vector] = -1; | ||
793 | } | ||
794 | } | ||
795 | |||
796 | |||
797 | static struct irq_chip ioapic_chip; | ||
798 | |||
799 | static void ioapic_register_intr(int irq, unsigned long trigger) | ||
800 | { | ||
801 | if (trigger) { | ||
802 | irq_desc[irq].status |= IRQ_LEVEL; | ||
803 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
804 | handle_fasteoi_irq, "fasteoi"); | ||
805 | } else { | ||
806 | irq_desc[irq].status &= ~IRQ_LEVEL; | ||
807 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
808 | handle_edge_irq, "edge"); | ||
809 | } | ||
810 | } | ||
811 | |||
812 | static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, | ||
813 | int trigger, int polarity) | ||
814 | { | ||
815 | struct irq_cfg *cfg = irq_cfg + irq; | ||
816 | struct IO_APIC_route_entry entry; | ||
817 | cpumask_t mask; | ||
818 | |||
819 | if (!IO_APIC_IRQ(irq)) | ||
820 | return; | ||
821 | |||
822 | mask = TARGET_CPUS; | ||
823 | if (assign_irq_vector(irq, mask)) | ||
824 | return; | ||
825 | |||
826 | cpus_and(mask, cfg->domain, mask); | ||
827 | |||
828 | apic_printk(APIC_VERBOSE,KERN_DEBUG | ||
829 | "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " | ||
830 | "IRQ %d Mode:%i Active:%i)\n", | ||
831 | apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, | ||
832 | irq, trigger, polarity); | ||
833 | |||
834 | /* | ||
835 | * add it to the IO-APIC irq-routing table: | ||
836 | */ | ||
837 | memset(&entry,0,sizeof(entry)); | ||
838 | |||
839 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
840 | entry.dest_mode = INT_DEST_MODE; | ||
841 | entry.dest = cpu_mask_to_apicid(mask); | ||
842 | entry.mask = 0; /* enable IRQ */ | ||
843 | entry.trigger = trigger; | ||
844 | entry.polarity = polarity; | ||
845 | entry.vector = cfg->vector; | ||
846 | |||
847 | /* Mask level triggered irqs. | ||
848 | * Use IRQ_DELAYED_DISABLE for edge triggered irqs. | ||
849 | */ | ||
850 | if (trigger) | ||
851 | entry.mask = 1; | ||
852 | |||
853 | ioapic_register_intr(irq, trigger); | ||
854 | if (irq < 16) | ||
855 | disable_8259A_irq(irq); | ||
856 | |||
857 | ioapic_write_entry(apic, pin, entry); | ||
858 | } | ||
859 | |||
860 | static void __init setup_IO_APIC_irqs(void) | ||
861 | { | ||
862 | int apic, pin, idx, irq, first_notcon = 1; | ||
863 | |||
864 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | ||
865 | |||
866 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
867 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
868 | |||
869 | idx = find_irq_entry(apic,pin,mp_INT); | ||
870 | if (idx == -1) { | ||
871 | if (first_notcon) { | ||
872 | apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
873 | first_notcon = 0; | ||
874 | } else | ||
875 | apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
876 | continue; | ||
877 | } | ||
878 | |||
879 | irq = pin_2_irq(idx, apic, pin); | ||
880 | add_pin_to_irq(irq, apic, pin); | ||
881 | |||
882 | setup_IO_APIC_irq(apic, pin, irq, | ||
883 | irq_trigger(idx), irq_polarity(idx)); | ||
884 | } | ||
885 | } | ||
886 | |||
887 | if (!first_notcon) | ||
888 | apic_printk(APIC_VERBOSE," not connected.\n"); | ||
889 | } | ||
890 | |||
891 | /* | ||
892 | * Set up the 8259A-master output pin as broadcast to all | ||
893 | * CPUs. | ||
894 | */ | ||
895 | static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) | ||
896 | { | ||
897 | struct IO_APIC_route_entry entry; | ||
898 | unsigned long flags; | ||
899 | |||
900 | memset(&entry,0,sizeof(entry)); | ||
901 | |||
902 | disable_8259A_irq(0); | ||
903 | |||
904 | /* mask LVT0 */ | ||
905 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
906 | |||
907 | /* | ||
908 | * We use logical delivery to get the timer IRQ | ||
909 | * to the first CPU. | ||
910 | */ | ||
911 | entry.dest_mode = INT_DEST_MODE; | ||
912 | entry.mask = 0; /* unmask IRQ now */ | ||
913 | entry.dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
914 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
915 | entry.polarity = 0; | ||
916 | entry.trigger = 0; | ||
917 | entry.vector = vector; | ||
918 | |||
919 | /* | ||
920 | * The timer IRQ doesn't have to know that behind the | ||
921 | * scene we have a 8259A-master in AEOI mode ... | ||
922 | */ | ||
923 | set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); | ||
924 | |||
925 | /* | ||
926 | * Add it to the IO-APIC irq-routing table: | ||
927 | */ | ||
928 | spin_lock_irqsave(&ioapic_lock, flags); | ||
929 | io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | ||
930 | io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | ||
931 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
932 | |||
933 | enable_8259A_irq(0); | ||
934 | } | ||
935 | |||
/*
 * Dump the full state of every IO-APIC to the kernel log: identification
 * and version registers, the complete IRQ redirection table, and the
 * kernel's IRQ -> (apic, pin) mapping chains.  Debug aid only; it is a
 * no-op when apic_verbosity == APIC_QUIET.
 */
void __apicdebuginit print_IO_APIC(void)
{
	int apic, i;
	union IO_APIC_reg_00 reg_00;
	union IO_APIC_reg_01 reg_01;
	union IO_APIC_reg_02 reg_02;
	unsigned long flags;

	if (apic_verbosity == APIC_QUIET)
		return;

	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
	for (i = 0; i < nr_ioapics; i++)
		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);

	/*
	 * We are a bit conservative about what we expect.  We have to
	 * know about every hardware change ASAP.
	 */
	printk(KERN_INFO "testing the IO APIC.......................\n");

	for (apic = 0; apic < nr_ioapics; apic++) {

		/* Snapshot the indirect registers atomically w.r.t. other users. */
		spin_lock_irqsave(&ioapic_lock, flags);
		reg_00.raw = io_apic_read(apic, 0);
		reg_01.raw = io_apic_read(apic, 1);
		/* Register #02 (arbitration) only exists from version 0x10 on. */
		if (reg_01.bits.version >= 0x10)
			reg_02.raw = io_apic_read(apic, 2);
		spin_unlock_irqrestore(&ioapic_lock, flags);

		printk("\n");
		printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
		printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
		printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);

		printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
		printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);

		printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
		printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);

		if (reg_01.bits.version >= 0x10) {
			printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
			printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
		}

		printk(KERN_DEBUG ".... IRQ redirection table:\n");

		printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
			" Stat Dmod Deli Vect:   \n");

		/* One line per redirection entry (entries field is max index). */
		for (i = 0; i <= reg_01.bits.entries; i++) {
			struct IO_APIC_route_entry entry;

			entry = ioapic_read_entry(apic, i);

			printk(KERN_DEBUG " %02x %03X ",
				i,
				entry.dest
			);

			printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
				entry.mask,
				entry.trigger,
				entry.irr,
				entry.polarity,
				entry.delivery_status,
				entry.dest_mode,
				entry.delivery_mode,
				entry.vector
			);
		}
	}
	printk(KERN_DEBUG "IRQ to pin mappings:\n");
	for (i = 0; i < NR_IRQS; i++) {
		struct irq_pin_list *entry = irq_2_pin + i;
		if (entry->pin < 0)
			continue;
		printk(KERN_DEBUG "IRQ%d ", i);
		/* Walk this IRQ's chained (apic, pin) list to its end. */
		for (;;) {
			printk("-> %d:%d", entry->apic, entry->pin);
			if (!entry->next)
				break;
			entry = irq_2_pin + entry->next;
		}
		printk("\n");
	}

	printk(KERN_INFO ".................................... done.\n");

	return;
}
1029 | |||
1030 | #if 0 | ||
1031 | |||
/*
 * Print a 256-bit local-APIC bitfield (ISR/TMR/IRR, stored as eight
 * 32-bit registers spaced 0x10 apart starting at 'base') as a grid of
 * '0'/'1' characters, one register per line, LSB first.
 * NOTE: currently compiled out (#if 0 around this region).
 */
static __apicdebuginit void print_APIC_bitfield (int base)
{
	unsigned int v;
	int i, j;

	if (apic_verbosity == APIC_QUIET)
		return;

	printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
	for (i = 0; i < 8; i++) {
		v = apic_read(base + i*0x10);
		for (j = 0; j < 32; j++) {
			if (v & (1<<j))
				printk("1");
			else
				printk("0");
		}
		printk("\n");
	}
}
1052 | |||
/*
 * Dump the local APIC register file of the calling CPU to the kernel
 * log.  The 'dummy' argument exists only so this can be used as an
 * on_each_cpu() callback.  Debug aid; no-op when APIC_QUIET.
 * NOTE: currently compiled out (#if 0 around this region).
 */
void __apicdebuginit print_local_APIC(void * dummy)
{
	unsigned int v, ver, maxlvt;

	if (apic_verbosity == APIC_QUIET)
		return;

	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
		smp_processor_id(), hard_smp_processor_id());
	v = apic_read(APIC_ID);
	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
	v = apic_read(APIC_LVR);
	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
	ver = GET_APIC_VERSION(v);
	/* Number of local vector table entries varies by APIC version. */
	maxlvt = get_maxlvt();

	v = apic_read(APIC_TASKPRI);
	printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);

	v = apic_read(APIC_ARBPRI);
	printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
		v & APIC_ARBPRI_MASK);
	v = apic_read(APIC_PROCPRI);
	printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);

	v = apic_read(APIC_EOI);
	printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
	v = apic_read(APIC_RRR);
	printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
	v = apic_read(APIC_LDR);
	printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
	v = apic_read(APIC_DFR);
	printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
	v = apic_read(APIC_SPIV);
	printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);

	/* The three 256-bit in-service/trigger/request bitfields. */
	printk(KERN_DEBUG "... APIC ISR field:\n");
	print_APIC_bitfield(APIC_ISR);
	printk(KERN_DEBUG "... APIC TMR field:\n");
	print_APIC_bitfield(APIC_TMR);
	printk(KERN_DEBUG "... APIC IRR field:\n");
	print_APIC_bitfield(APIC_IRR);

	v = apic_read(APIC_ESR);
	printk(KERN_DEBUG "... APIC ESR: %08x\n", v);

	v = apic_read(APIC_ICR);
	printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
	v = apic_read(APIC_ICR2);
	printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);

	v = apic_read(APIC_LVTT);
	printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);

	if (maxlvt > 3) {			/* PC is LVT#4. */
		v = apic_read(APIC_LVTPC);
		printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
	}
	v = apic_read(APIC_LVT0);
	printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
	v = apic_read(APIC_LVT1);
	printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);

	if (maxlvt > 2) {			/* ERR is LVT#3. */
		v = apic_read(APIC_LVTERR);
		printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
	}

	v = apic_read(APIC_TMICT);
	printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
	v = apic_read(APIC_TMCCT);
	printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
	v = apic_read(APIC_TDCR);
	printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
	printk("\n");
}
1129 | |||
/*
 * Run print_local_APIC() on every online CPU (including this one),
 * waiting for completion.  Debug aid; compiled out via the surrounding
 * #if 0.
 */
void print_all_local_APICs (void)
{
	on_each_cpu(print_local_APIC, NULL, 1, 1);
}
1134 | |||
/*
 * Dump the 8259A PIC pair's IMR, IRR, ISR and ELCR registers to the
 * kernel log.  Master occupies the low byte, slave the high byte.
 * Debug aid; compiled out via the surrounding #if 0.
 */
void __apicdebuginit print_PIC(void)
{
	unsigned int v;
	unsigned long flags;

	if (apic_verbosity == APIC_QUIET)
		return;

	printk(KERN_DEBUG "\nprinting PIC contents\n");

	spin_lock_irqsave(&i8259A_lock, flags);

	v = inb(0xa1) << 8 | inb(0x21);
	printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);

	v = inb(0xa0) << 8 | inb(0x20);
	printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);

	/*
	 * OCW3 0x0b selects the ISR for the next data read; restore the
	 * default IRR selection (0x0a) afterwards so other readers see
	 * the usual register.
	 */
	outb(0x0b,0xa0);
	outb(0x0b,0x20);
	v = inb(0xa0) << 8 | inb(0x20);
	outb(0x0a,0xa0);
	outb(0x0a,0x20);

	spin_unlock_irqrestore(&i8259A_lock, flags);

	printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);

	/* ELCR (edge/level control) lives at ports 0x4d0/0x4d1. */
	v = inb(0x4d1) << 8 | inb(0x4d0);
	printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
1166 | |||
1167 | #endif /* 0 */ | ||
1168 | |||
/*
 * Boot-time IO-APIC discovery: reset the irq -> (apic, pin) map, size
 * each IO-APIC's redirection table, locate the pin (if any) carrying
 * the 8259's ExtINT output, cross-check that against the MP table, and
 * finally mask out every redirection entry so we start from a clean
 * state.
 */
static void __init enable_IO_APIC(void)
{
	union IO_APIC_reg_01 reg_01;
	int i8259_apic, i8259_pin;
	int i, apic;
	unsigned long flags;

	/* Empty mapping: -1 means "no pin", 0 terminates the chain. */
	for (i = 0; i < PIN_MAP_SIZE; i++) {
		irq_2_pin[i].pin = -1;
		irq_2_pin[i].next = 0;
	}

	/*
	 * The number of IO-APIC IRQ registers (== #pins):
	 */
	for (apic = 0; apic < nr_ioapics; apic++) {
		spin_lock_irqsave(&ioapic_lock, flags);
		reg_01.raw = io_apic_read(apic, 1);
		spin_unlock_irqrestore(&ioapic_lock, flags);
		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
	}
	for(apic = 0; apic < nr_ioapics; apic++) {
		int pin;
		/* See if any of the pins is in ExtINT mode */
		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
			struct IO_APIC_route_entry entry;
			entry = ioapic_read_entry(apic, pin);

			/* If the interrupt line is enabled and in ExtInt mode
			 * I have found the pin where the i8259 is connected.
			 */
			if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
				ioapic_i8259.apic = apic;
				ioapic_i8259.pin  = pin;
				goto found_i8259;
			}
		}
	}
 found_i8259:
	/* Look to see what if the MP table has reported the ExtINT */
	i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
	i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
	/* Trust the MP table if nothing is setup in the hardware */
	if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
		printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
		ioapic_i8259.pin  = i8259_pin;
		ioapic_i8259.apic = i8259_apic;
	}
	/* Complain if the MP table and the hardware disagree */
	if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
		(i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
	{
		printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
	}

	/*
	 * Do not trust the IO-APIC being empty at bootup
	 */
	clear_IO_APIC();
}
1229 | |||
/*
 * Tear the IO-APIC down for reboot/kexec: clear all redirection
 * entries, and if the 8259 was routed through an IO-APIC pin, put that
 * pin into virtual-wire (ExtINT) mode so legacy interrupts keep
 * working for whatever runs next.
 *
 * Not an __init, needed by the reboot code.
 */
void disable_IO_APIC(void)
{
	/*
	 * Clear the IO-APIC before rebooting:
	 */
	clear_IO_APIC();

	/*
	 * If the i8259 is routed through an IOAPIC
	 * Put that IOAPIC in virtual wire mode
	 * so legacy interrupts can be delivered.
	 */
	if (ioapic_i8259.pin != -1) {
		struct IO_APIC_route_entry entry;

		memset(&entry, 0, sizeof(entry));
		entry.mask            = 0; /* Enabled */
		entry.trigger         = 0; /* Edge */
		entry.irr             = 0;
		entry.polarity        = 0; /* High */
		entry.delivery_status = 0;
		entry.dest_mode       = 0; /* Physical */
		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
		entry.vector          = 0;
		/* Route the wire to the boot CPU's local APIC. */
		entry.dest          = GET_APIC_ID(apic_read(APIC_ID));

		/*
		 * Add it to the IO-APIC irq-routing table:
		 */
		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
	}

	/* Restore the BSP's LVT0 wiring to match (ExtINT vs virtual wire). */
	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
1267 | |||
1268 | /* | ||
1269 | * There is a nasty bug in some older SMP boards, their mptable lies | ||
1270 | * about the timer IRQ. We do the following to work around the situation: | ||
1271 | * | ||
1272 | * - timer IRQ defaults to IO-APIC IRQ | ||
1273 | * - if this function detects that timer IRQs are defunct, then we fall | ||
1274 | * back to ISA timer IRQs | ||
1275 | */ | ||
1276 | static int __init timer_irq_works(void) | ||
1277 | { | ||
1278 | unsigned long t1 = jiffies; | ||
1279 | |||
1280 | local_irq_enable(); | ||
1281 | /* Let ten ticks pass... */ | ||
1282 | mdelay((10 * 1000) / HZ); | ||
1283 | |||
1284 | /* | ||
1285 | * Expect a few ticks at least, to be sure some possible | ||
1286 | * glue logic does not lock up after one or two first | ||
1287 | * ticks in a non-ExtINT mode. Also the local APIC | ||
1288 | * might have cached one ExtINT interrupt. Finally, at | ||
1289 | * least one tick may be lost due to delays. | ||
1290 | */ | ||
1291 | |||
1292 | /* jiffies wrap? */ | ||
1293 | if (jiffies - t1 > 4) | ||
1294 | return 1; | ||
1295 | return 0; | ||
1296 | } | ||
1297 | |||
1298 | /* | ||
1299 | * In the SMP+IOAPIC case it might happen that there are an unspecified | ||
1300 | * number of pending IRQ events unhandled. These cases are very rare, | ||
1301 | * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | ||
1302 | * better to do it this way as thus we do not have to be aware of | ||
1303 | * 'pending' interrupts in the IRQ path, except at this point. | ||
1304 | */ | ||
1305 | /* | ||
1306 | * Edge triggered needs to resend any interrupt | ||
1307 | * that was delayed but this is now handled in the device | ||
1308 | * independent code. | ||
1309 | */ | ||
1310 | |||
1311 | /* | ||
1312 | * Starting up a edge-triggered IO-APIC interrupt is | ||
1313 | * nasty - we need to make sure that we get the edge. | ||
1314 | * If it is already asserted for some reason, we need | ||
1315 | * return 1 to indicate that is was pending. | ||
1316 | * | ||
1317 | * This is not complete - we should be able to fake | ||
1318 | * an edge even if it isn't on the 8259A... | ||
1319 | */ | ||
1320 | |||
1321 | static unsigned int startup_ioapic_irq(unsigned int irq) | ||
1322 | { | ||
1323 | int was_pending = 0; | ||
1324 | unsigned long flags; | ||
1325 | |||
1326 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1327 | if (irq < 16) { | ||
1328 | disable_8259A_irq(irq); | ||
1329 | if (i8259A_irq_pending(irq)) | ||
1330 | was_pending = 1; | ||
1331 | } | ||
1332 | __unmask_IO_APIC_irq(irq); | ||
1333 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1334 | |||
1335 | return was_pending; | ||
1336 | } | ||
1337 | |||
1338 | static int ioapic_retrigger_irq(unsigned int irq) | ||
1339 | { | ||
1340 | struct irq_cfg *cfg = &irq_cfg[irq]; | ||
1341 | cpumask_t mask; | ||
1342 | unsigned long flags; | ||
1343 | |||
1344 | spin_lock_irqsave(&vector_lock, flags); | ||
1345 | cpus_clear(mask); | ||
1346 | cpu_set(first_cpu(cfg->domain), mask); | ||
1347 | |||
1348 | send_IPI_mask(mask, cfg->vector); | ||
1349 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1350 | |||
1351 | return 1; | ||
1352 | } | ||
1353 | |||
1354 | /* | ||
1355 | * Level and edge triggered IO-APIC interrupts need different handling, | ||
1356 | * so we use two separate IRQ descriptors. Edge triggered IRQs can be | ||
1357 | * handled with the level-triggered descriptor, but that one has slightly | ||
1358 | * more overhead. Level-triggered interrupts cannot be handled with the | ||
1359 | * edge-triggered handler, without risking IRQ storms and other ugly | ||
1360 | * races. | ||
1361 | */ | ||
1362 | |||
#ifdef CONFIG_SMP
/*
 * IPI handler for IRQ-migration cleanup: after an IRQ has moved to a
 * new CPU/vector, the old vector on each CPU in the old domain must be
 * released.  Each CPU scans its own vector_irq[] table and frees the
 * stale entries counted in cfg->move_cleanup_count.
 */
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
	unsigned vector, me;
	ack_APIC_irq();
	exit_idle();
	irq_enter();

	me = smp_processor_id();
	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
		unsigned int irq;
		struct irq_desc *desc;
		struct irq_cfg *cfg;
		irq = __get_cpu_var(vector_irq)[vector];
		if (irq >= NR_IRQS)
			continue;

		desc = irq_desc + irq;
		cfg = irq_cfg + irq;
		spin_lock(&desc->lock);
		/* Nothing pending for this IRQ. */
		if (!cfg->move_cleanup_count)
			goto unlock;

		/* This vector is the IRQ's *current* home here - keep it. */
		if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
			goto unlock;

		__get_cpu_var(vector_irq)[vector] = -1;
		cfg->move_cleanup_count--;
unlock:
		spin_unlock(&desc->lock);
	}

	irq_exit();
}

/*
 * Called from the IRQ ack path: once the first interrupt arrives on
 * the IRQ's *new* vector on a CPU of the new domain, the migration is
 * known complete, so kick the old-domain CPUs (via
 * IRQ_MOVE_CLEANUP_VECTOR) to release the old vector.
 */
static void irq_complete_move(unsigned int irq)
{
	struct irq_cfg *cfg = irq_cfg + irq;
	unsigned vector, me;

	if (likely(!cfg->move_in_progress))
		return;

	/* orig_rax holds the negated vector of the interrupt being serviced. */
	vector = ~get_irq_regs()->orig_rax;
	me = smp_processor_id();
	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
		cpumask_t cleanup_mask;

		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
		cfg->move_in_progress = 0;
	}
}
#else
/* UP build: no vectors ever migrate, so there is nothing to complete. */
static inline void irq_complete_move(unsigned int irq) {}
#endif
1420 | |||
/*
 * Ack an edge-triggered IO-APIC interrupt: finish any pending vector
 * migration, perform a deferred affinity move (safe for edge IRQs at
 * this point), then EOI the local APIC.
 */
static void ack_apic_edge(unsigned int irq)
{
	irq_complete_move(irq);
	move_native_irq(irq);
	ack_APIC_irq();
}
1427 | |||
/*
 * Ack (EOI) a level-triggered IO-APIC interrupt, taking care of
 * pending affinity migration.  The ordering here is delicate: the
 * local-APIC EOI must happen before the redirection entry is
 * reprogrammed, and migration must wait until the broadcast EOI has
 * actually reached the IO-APIC (Remote IRR cleared).
 */
static void ack_apic_level(unsigned int irq)
{
	int do_unmask_irq = 0;

	irq_complete_move(irq);
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
	/* If we are moving the irq we need to mask it */
	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
		do_unmask_irq = 1;
		mask_IO_APIC_irq(irq);
	}
#endif

	/*
	 * We must acknowledge the irq before we move it or the acknowledge will
	 * not propagate properly.
	 */
	ack_APIC_irq();

	/* Now we can move and renable the irq */
	if (unlikely(do_unmask_irq)) {
		/* Only migrate the irq if the ack has been received.
		 *
		 * On rare occasions the broadcast level triggered ack gets
		 * delayed going to ioapics, and if we reprogram the
		 * vector while Remote IRR is still set the irq will never
		 * fire again.
		 *
		 * To prevent this scenario we read the Remote IRR bit
		 * of the ioapic.  This has two effects.
		 * - On any sane system the read of the ioapic will
		 *   flush writes (and acks) going to the ioapic from
		 *   this cpu.
		 * - We get to see if the ACK has actually been delivered.
		 *
		 * Based on failed experiments of reprogramming the
		 * ioapic entry from outside of irq context starting
		 * with masking the ioapic entry and then polling until
		 * Remote IRR was clear before reprogramming the
		 * ioapic I don't trust the Remote IRR bit to be
		 * completey accurate.
		 *
		 * However there appears to be no other way to plug
		 * this race, so if the Remote IRR bit is not
		 * accurate and is causing problems then it is a hardware bug
		 * and you can go talk to the chipset vendor about it.
		 */
		if (!io_apic_level_ack_pending(irq))
			move_masked_irq(irq);
		unmask_IO_APIC_irq(irq);
	}
}
1480 | |||
/*
 * irq_chip for IRQs routed through an IO-APIC.  Edge-triggered IRQs
 * use the .ack callback (handle_edge_irq); level-triggered IRQs use
 * .eoi (handle_fasteoi_irq) - the two need different acknowledge
 * strategies, see ack_apic_edge()/ack_apic_level().
 */
static struct irq_chip ioapic_chip __read_mostly = {
	.name 		= "IO-APIC",
	.startup 	= startup_ioapic_irq,
	.mask	 	= mask_IO_APIC_irq,
	.unmask	 	= unmask_IO_APIC_irq,
	.ack 		= ack_apic_edge,
	.eoi 		= ack_apic_level,
#ifdef CONFIG_SMP
	.set_affinity 	= set_ioapic_affinity_irq,
#endif
	.retrigger	= ioapic_retrigger_irq,
};
1493 | |||
1494 | static inline void init_IO_APIC_traps(void) | ||
1495 | { | ||
1496 | int irq; | ||
1497 | |||
1498 | /* | ||
1499 | * NOTE! The local APIC isn't very good at handling | ||
1500 | * multiple interrupts at the same interrupt level. | ||
1501 | * As the interrupt level is determined by taking the | ||
1502 | * vector number and shifting that right by 4, we | ||
1503 | * want to spread these out a bit so that they don't | ||
1504 | * all fall in the same interrupt level. | ||
1505 | * | ||
1506 | * Also, we've got to be careful not to trash gate | ||
1507 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
1508 | */ | ||
1509 | for (irq = 0; irq < NR_IRQS ; irq++) { | ||
1510 | int tmp = irq; | ||
1511 | if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) { | ||
1512 | /* | ||
1513 | * Hmm.. We don't have an entry for this, | ||
1514 | * so default to an old-fashioned 8259 | ||
1515 | * interrupt if we can.. | ||
1516 | */ | ||
1517 | if (irq < 16) | ||
1518 | make_8259A_irq(irq); | ||
1519 | else | ||
1520 | /* Strange. Oh, well.. */ | ||
1521 | irq_desc[irq].chip = &no_irq_chip; | ||
1522 | } | ||
1523 | } | ||
1524 | } | ||
1525 | |||
1526 | static void enable_lapic_irq (unsigned int irq) | ||
1527 | { | ||
1528 | unsigned long v; | ||
1529 | |||
1530 | v = apic_read(APIC_LVT0); | ||
1531 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); | ||
1532 | } | ||
1533 | |||
1534 | static void disable_lapic_irq (unsigned int irq) | ||
1535 | { | ||
1536 | unsigned long v; | ||
1537 | |||
1538 | v = apic_read(APIC_LVT0); | ||
1539 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | ||
1540 | } | ||
1541 | |||
/* Acknowledge a local-APIC-delivered IRQ: plain EOI, nothing more. */
static void ack_lapic_irq (unsigned int irq)
{
	ack_APIC_irq();
}

/* No end-of-interrupt work is needed for the local-APIC timer IRQ. */
static void end_lapic_irq (unsigned int i) { /* nothing */ }
1548 | |||
/*
 * Interrupt type used when the timer (IRQ0) is delivered directly via
 * the local APIC's LVT0 instead of through an IO-APIC pin - the
 * "Virtual Wire" fallback in check_timer().
 */
static struct hw_interrupt_type lapic_irq_type __read_mostly = {
	.name = "local-APIC",
	.typename = "local-APIC-edge",
	.startup = NULL, /* startup_irq() not used for IRQ0 */
	.shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
	.enable = enable_lapic_irq,
	.disable = disable_lapic_irq,
	.ack = ack_lapic_irq,
	.end = end_lapic_irq,
};
1559 | |||
/*
 * Route the 8259A's output into every local APIC's LVT0 as an NMI, so
 * the periodic timer doubles as the NMI watchdog source.
 */
static void setup_nmi (void)
{
	/*
 	 * Dirty trick to enable the NMI watchdog ...
	 * We put the 8259A master into AEOI mode and
	 * unmask on all local APICs LVT0 as NMI.
 	 *
	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
	 * is from Maciej W. Rozycki - so we do not have to EOI from
	 * the NMI handler or the timer interrupt.
	 */ 
	printk(KERN_INFO "activating NMI Watchdog ...");

	enable_NMI_through_LVT0(NULL);

	printk(" done.\n");
}
1577 | |||
/*
 * This looks a bit hackish but it's about the only one way of sending
 * a few INTA cycles to 8259As and any associated glue logic.  ICR does
 * not support the ExtINT mode, unfortunately.  We need to send these
 * cycles as some i82489DX-based boards have glue logic that keeps the
 * 8259A interrupt line asserted until INTA.  --macro
 *
 * Mechanism: temporarily rewire the RTC pin (ISA IRQ8) as ExtINT to
 * this CPU, make the RTC fire periodic interrupts for a while (which
 * generates the INTA cycles), then restore both the RTC and the
 * original redirection entry.
 */
static inline void unlock_ExtINT_logic(void)
{
	int apic, pin, i;
	struct IO_APIC_route_entry entry0, entry1;
	unsigned char save_control, save_freq_select;
	unsigned long flags;

	pin  = find_isa_irq_pin(8, mp_INT);
	apic = find_isa_irq_apic(8, mp_INT);
	if (pin == -1)
		return;

	/* Save the current routing of the RTC pin so we can restore it. */
	spin_lock_irqsave(&ioapic_lock, flags);
	*(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
	*(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
	spin_unlock_irqrestore(&ioapic_lock, flags);
	clear_IO_APIC_pin(apic, pin);

	memset(&entry1, 0, sizeof(entry1));

	entry1.dest_mode = 0;			/* physical delivery */
	entry1.mask = 0;			/* unmask IRQ now */
	entry1.dest = hard_smp_processor_id();
	entry1.delivery_mode = dest_ExtINT;
	entry1.polarity = entry0.polarity;
	entry1.trigger = 0;
	entry1.vector = 0;

	spin_lock_irqsave(&ioapic_lock, flags);
	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
	spin_unlock_irqrestore(&ioapic_lock, flags);

	/* Program the RTC for periodic interrupts (rate select 0x6). */
	save_control = CMOS_READ(RTC_CONTROL);
	save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
	CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
		   RTC_FREQ_SELECT);
	CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);

	/* Wait up to ~1s; each observed periodic flag shortens the wait. */
	i = 100;
	while (i-- > 0) {
		mdelay(10);
		if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
			i -= 10;
	}

	/* Restore the RTC and the original redirection entry. */
	CMOS_WRITE(save_control, RTC_CONTROL);
	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
	clear_IO_APIC_pin(apic, pin);

	spin_lock_irqsave(&ioapic_lock, flags);
	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
	spin_unlock_irqrestore(&ioapic_lock, flags);
}
1640 | |||
/*
 * This code may look a bit paranoid, but it's supposed to cooperate with
 * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
 * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
 * fanatically on his truly buggy board.
 *
 * Fallback ladder: (1) timer through its IO-APIC pin, (2) timer
 * through the 8259 ExtINT pin, (3) local APIC "Virtual Wire" fixed
 * mode, (4) local APIC ExtINT mode.  If all fail, panic.
 *
 * FIXME: really need to revamp this for modern platforms only.
 */
static inline void check_timer(void)
{
	struct irq_cfg *cfg = irq_cfg + 0;
	int apic1, pin1, apic2, pin2;

	/*
	 * get/set the timer IRQ vector:
	 */
	disable_8259A_irq(0);
	assign_irq_vector(0, TARGET_CPUS);

	/*
	 * Subtle, code in do_timer_interrupt() expects an AEOI
	 * mode for the 8259A whenever interrupts are routed
	 * through I/O APICs.  Also IRQ0 has to be enabled in
	 * the 8259A which implies the virtual wire has to be
	 * disabled in the local APIC.
	 */
	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
	init_8259A(1);
	if (timer_over_8254 > 0)
		enable_8259A_irq(0);

	pin1  = find_isa_irq_pin(0, mp_INT);
	apic1 = find_isa_irq_apic(0, mp_INT);
	pin2  = ioapic_i8259.pin;
	apic2 = ioapic_i8259.apic;

	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
		cfg->vector, apic1, pin1, apic2, pin2);

	if (pin1 != -1) {
		/*
		 * Ok, does IRQ0 through the IOAPIC work?
		 */
		unmask_IO_APIC_irq(0);
		if (!no_timer_check && timer_irq_works()) {
			nmi_watchdog_default();
			if (nmi_watchdog == NMI_IO_APIC) {
				disable_8259A_irq(0);
				setup_nmi();
				enable_8259A_irq(0);
			}
			if (disable_timer_pin_1 > 0)
				clear_IO_APIC_pin(0, pin1);
			return;
		}
		clear_IO_APIC_pin(apic1, pin1);
		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
				"connected to IO-APIC\n");
	}

	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
				"through the 8259A ... ");
	if (pin2 != -1) {
		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
			apic2, pin2);
		/*
		 * legacy devices should be connected to IO APIC #0
		 */
		setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
		if (timer_irq_works()) {
			apic_printk(APIC_VERBOSE," works.\n");
			nmi_watchdog_default();
			if (nmi_watchdog == NMI_IO_APIC) {
				setup_nmi();
			}
			return;
		}
		/*
		 * Cleanup, just in case ...
		 */
		clear_IO_APIC_pin(apic2, pin2);
	}
	apic_printk(APIC_VERBOSE," failed.\n");

	/* The IO-APIC watchdog source is unusable without a working timer. */
	if (nmi_watchdog == NMI_IO_APIC) {
		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
		nmi_watchdog = 0;
	}

	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");

	disable_8259A_irq(0);
	irq_desc[0].chip = &lapic_irq_type;
	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
	enable_8259A_irq(0);

	if (timer_irq_works()) {
		apic_printk(APIC_VERBOSE," works.\n");
		return;
	}
	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
	apic_printk(APIC_VERBOSE," failed.\n");

	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");

	init_8259A(0);
	make_8259A_irq(0);
	apic_write(APIC_LVT0, APIC_DM_EXTINT);

	unlock_ExtINT_logic();

	if (timer_irq_works()) {
		apic_printk(APIC_VERBOSE," works.\n");
		return;
	}
	apic_printk(APIC_VERBOSE," failed :(.\n");
	panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
}
1759 | |||
/*
 * Kernel parameter "no_timer_check": skip the timer_irq_works() probe
 * in check_timer() and trust the IO-APIC timer routing as configured.
 * Returning 1 tells the __setup machinery the option was consumed.
 */
static int __init notimercheck(char *s)
{
	no_timer_check = 1;
	return 1;
}
__setup("no_timer_check", notimercheck);
1766 | |||
/*
 * IRQ's that are handled by the PIC in the MPS IOAPIC case.
 * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
 *   Linux doesn't really care, as it's not actually used
 *   for any interrupt handling anyway.
 *
 * Bitmask of IRQs to leave with the 8259 (bit 2 == cascade only).
 */
#define PIC_IRQS	(1<<2)
1775 | |||
/*
 * Top-level IO-APIC bring-up: probe the hardware, decide which IRQs go
 * through the IO-APIC (all of them under ACPI, all but the 8259
 * cascade otherwise), program the redirection entries, install
 * fallback handlers and validate the timer IRQ.
 */
void __init setup_IO_APIC(void)
{
	enable_IO_APIC();

	if (acpi_ioapic)
		io_apic_irqs = ~0;	/* all IRQs go through IOAPIC */
	else
		io_apic_irqs = ~PIC_IRQS;

	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");

	/* Synchronize APIC arbitration IDs before programming entries. */
	sync_Arb_IDs();
	setup_IO_APIC_irqs();
	init_IO_APIC_traps();
	check_timer();
	if (!acpi_ioapic)
		print_IO_APIC();
}
1794 | |||
/*
 * Per-IO-APIC suspend/resume state: the sysdev plus a trailing
 * variable-length snapshot of the redirection table (one entry per
 * pin, sized at registration time).
 */
struct sysfs_ioapic_data {
	struct sys_device dev;
	struct IO_APIC_route_entry entry[0];	/* flexible array (pre-C99 style) */
};
static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
1800 | |||
1801 | static int ioapic_suspend(struct sys_device *dev, pm_message_t state) | ||
1802 | { | ||
1803 | struct IO_APIC_route_entry *entry; | ||
1804 | struct sysfs_ioapic_data *data; | ||
1805 | int i; | ||
1806 | |||
1807 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
1808 | entry = data->entry; | ||
1809 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) | ||
1810 | *entry = ioapic_read_entry(dev->id, i); | ||
1811 | |||
1812 | return 0; | ||
1813 | } | ||
1814 | |||
/*
 * Resume callback: re-establish this IO-APIC's physical APIC ID (the
 * BIOS may have reset it) and rewrite every redirection entry from
 * the snapshot taken by ioapic_suspend().  Always succeeds.
 */
static int ioapic_resume(struct sys_device *dev)
{
	struct IO_APIC_route_entry *entry;
	struct sysfs_ioapic_data *data;
	unsigned long flags;
	union IO_APIC_reg_00 reg_00;
	int i;

	data = container_of(dev, struct sysfs_ioapic_data, dev);
	entry = data->entry;

	spin_lock_irqsave(&ioapic_lock, flags);
	reg_00.raw = io_apic_read(dev->id, 0);
	/* Restore the APIC ID only if it no longer matches the MP table. */
	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
		io_apic_write(dev->id, 0, reg_00.raw);
	}
	spin_unlock_irqrestore(&ioapic_lock, flags);
	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
		ioapic_write_entry(dev->id, i, entry[i]);

	return 0;
}
1838 | |||
1839 | static struct sysdev_class ioapic_sysdev_class = { | ||
1840 | set_kset_name("ioapic"), | ||
1841 | .suspend = ioapic_suspend, | ||
1842 | .resume = ioapic_resume, | ||
1843 | }; | ||
1844 | |||
1845 | static int __init ioapic_init_sysfs(void) | ||
1846 | { | ||
1847 | struct sys_device * dev; | ||
1848 | int i, size, error = 0; | ||
1849 | |||
1850 | error = sysdev_class_register(&ioapic_sysdev_class); | ||
1851 | if (error) | ||
1852 | return error; | ||
1853 | |||
1854 | for (i = 0; i < nr_ioapics; i++ ) { | ||
1855 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] | ||
1856 | * sizeof(struct IO_APIC_route_entry); | ||
1857 | mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | ||
1858 | if (!mp_ioapic_data[i]) { | ||
1859 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1860 | continue; | ||
1861 | } | ||
1862 | memset(mp_ioapic_data[i], 0, size); | ||
1863 | dev = &mp_ioapic_data[i]->dev; | ||
1864 | dev->id = i; | ||
1865 | dev->cls = &ioapic_sysdev_class; | ||
1866 | error = sysdev_register(dev); | ||
1867 | if (error) { | ||
1868 | kfree(mp_ioapic_data[i]); | ||
1869 | mp_ioapic_data[i] = NULL; | ||
1870 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1871 | continue; | ||
1872 | } | ||
1873 | } | ||
1874 | |||
1875 | return 0; | ||
1876 | } | ||
1877 | |||
1878 | device_initcall(ioapic_init_sysfs); | ||
1879 | |||
1880 | /* | ||
1881 | * Dynamic irq allocate and deallocation | ||
1882 | */ | ||
1883 | int create_irq(void) | ||
1884 | { | ||
1885 | /* Allocate an unused irq */ | ||
1886 | int irq; | ||
1887 | int new; | ||
1888 | unsigned long flags; | ||
1889 | |||
1890 | irq = -ENOSPC; | ||
1891 | spin_lock_irqsave(&vector_lock, flags); | ||
1892 | for (new = (NR_IRQS - 1); new >= 0; new--) { | ||
1893 | if (platform_legacy_irq(new)) | ||
1894 | continue; | ||
1895 | if (irq_cfg[new].vector != 0) | ||
1896 | continue; | ||
1897 | if (__assign_irq_vector(new, TARGET_CPUS) == 0) | ||
1898 | irq = new; | ||
1899 | break; | ||
1900 | } | ||
1901 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1902 | |||
1903 | if (irq >= 0) { | ||
1904 | dynamic_irq_init(irq); | ||
1905 | } | ||
1906 | return irq; | ||
1907 | } | ||
1908 | |||
1909 | void destroy_irq(unsigned int irq) | ||
1910 | { | ||
1911 | unsigned long flags; | ||
1912 | |||
1913 | dynamic_irq_cleanup(irq); | ||
1914 | |||
1915 | spin_lock_irqsave(&vector_lock, flags); | ||
1916 | __clear_irq_vector(irq); | ||
1917 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1918 | } | ||
1919 | |||
1920 | /* | ||
1921 | * MSI mesage composition | ||
1922 | */ | ||
1923 | #ifdef CONFIG_PCI_MSI | ||
1924 | static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) | ||
1925 | { | ||
1926 | struct irq_cfg *cfg = irq_cfg + irq; | ||
1927 | int err; | ||
1928 | unsigned dest; | ||
1929 | cpumask_t tmp; | ||
1930 | |||
1931 | tmp = TARGET_CPUS; | ||
1932 | err = assign_irq_vector(irq, tmp); | ||
1933 | if (!err) { | ||
1934 | cpus_and(tmp, cfg->domain, tmp); | ||
1935 | dest = cpu_mask_to_apicid(tmp); | ||
1936 | |||
1937 | msg->address_hi = MSI_ADDR_BASE_HI; | ||
1938 | msg->address_lo = | ||
1939 | MSI_ADDR_BASE_LO | | ||
1940 | ((INT_DEST_MODE == 0) ? | ||
1941 | MSI_ADDR_DEST_MODE_PHYSICAL: | ||
1942 | MSI_ADDR_DEST_MODE_LOGICAL) | | ||
1943 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
1944 | MSI_ADDR_REDIRECTION_CPU: | ||
1945 | MSI_ADDR_REDIRECTION_LOWPRI) | | ||
1946 | MSI_ADDR_DEST_ID(dest); | ||
1947 | |||
1948 | msg->data = | ||
1949 | MSI_DATA_TRIGGER_EDGE | | ||
1950 | MSI_DATA_LEVEL_ASSERT | | ||
1951 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
1952 | MSI_DATA_DELIVERY_FIXED: | ||
1953 | MSI_DATA_DELIVERY_LOWPRI) | | ||
1954 | MSI_DATA_VECTOR(cfg->vector); | ||
1955 | } | ||
1956 | return err; | ||
1957 | } | ||
1958 | |||
1959 | #ifdef CONFIG_SMP | ||
1960 | static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | ||
1961 | { | ||
1962 | struct irq_cfg *cfg = irq_cfg + irq; | ||
1963 | struct msi_msg msg; | ||
1964 | unsigned int dest; | ||
1965 | cpumask_t tmp; | ||
1966 | |||
1967 | cpus_and(tmp, mask, cpu_online_map); | ||
1968 | if (cpus_empty(tmp)) | ||
1969 | return; | ||
1970 | |||
1971 | if (assign_irq_vector(irq, mask)) | ||
1972 | return; | ||
1973 | |||
1974 | cpus_and(tmp, cfg->domain, mask); | ||
1975 | dest = cpu_mask_to_apicid(tmp); | ||
1976 | |||
1977 | read_msi_msg(irq, &msg); | ||
1978 | |||
1979 | msg.data &= ~MSI_DATA_VECTOR_MASK; | ||
1980 | msg.data |= MSI_DATA_VECTOR(cfg->vector); | ||
1981 | msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; | ||
1982 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | ||
1983 | |||
1984 | write_msi_msg(irq, &msg); | ||
1985 | irq_desc[irq].affinity = mask; | ||
1986 | } | ||
1987 | #endif /* CONFIG_SMP */ | ||
1988 | |||
1989 | /* | ||
1990 | * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, | ||
1991 | * which implement the MSI or MSI-X Capability Structure. | ||
1992 | */ | ||
1993 | static struct irq_chip msi_chip = { | ||
1994 | .name = "PCI-MSI", | ||
1995 | .unmask = unmask_msi_irq, | ||
1996 | .mask = mask_msi_irq, | ||
1997 | .ack = ack_apic_edge, | ||
1998 | #ifdef CONFIG_SMP | ||
1999 | .set_affinity = set_msi_irq_affinity, | ||
2000 | #endif | ||
2001 | .retrigger = ioapic_retrigger_irq, | ||
2002 | }; | ||
2003 | |||
2004 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) | ||
2005 | { | ||
2006 | struct msi_msg msg; | ||
2007 | int irq, ret; | ||
2008 | irq = create_irq(); | ||
2009 | if (irq < 0) | ||
2010 | return irq; | ||
2011 | |||
2012 | ret = msi_compose_msg(dev, irq, &msg); | ||
2013 | if (ret < 0) { | ||
2014 | destroy_irq(irq); | ||
2015 | return ret; | ||
2016 | } | ||
2017 | |||
2018 | set_irq_msi(irq, desc); | ||
2019 | write_msi_msg(irq, &msg); | ||
2020 | |||
2021 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); | ||
2022 | |||
2023 | return 0; | ||
2024 | } | ||
2025 | |||
2026 | void arch_teardown_msi_irq(unsigned int irq) | ||
2027 | { | ||
2028 | destroy_irq(irq); | ||
2029 | } | ||
2030 | |||
2031 | #endif /* CONFIG_PCI_MSI */ | ||
2032 | |||
2033 | /* | ||
2034 | * Hypertransport interrupt support | ||
2035 | */ | ||
2036 | #ifdef CONFIG_HT_IRQ | ||
2037 | |||
2038 | #ifdef CONFIG_SMP | ||
2039 | |||
2040 | static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) | ||
2041 | { | ||
2042 | struct ht_irq_msg msg; | ||
2043 | fetch_ht_irq_msg(irq, &msg); | ||
2044 | |||
2045 | msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); | ||
2046 | msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); | ||
2047 | |||
2048 | msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); | ||
2049 | msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); | ||
2050 | |||
2051 | write_ht_irq_msg(irq, &msg); | ||
2052 | } | ||
2053 | |||
2054 | static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) | ||
2055 | { | ||
2056 | struct irq_cfg *cfg = irq_cfg + irq; | ||
2057 | unsigned int dest; | ||
2058 | cpumask_t tmp; | ||
2059 | |||
2060 | cpus_and(tmp, mask, cpu_online_map); | ||
2061 | if (cpus_empty(tmp)) | ||
2062 | return; | ||
2063 | |||
2064 | if (assign_irq_vector(irq, mask)) | ||
2065 | return; | ||
2066 | |||
2067 | cpus_and(tmp, cfg->domain, mask); | ||
2068 | dest = cpu_mask_to_apicid(tmp); | ||
2069 | |||
2070 | target_ht_irq(irq, dest, cfg->vector); | ||
2071 | irq_desc[irq].affinity = mask; | ||
2072 | } | ||
2073 | #endif | ||
2074 | |||
2075 | static struct irq_chip ht_irq_chip = { | ||
2076 | .name = "PCI-HT", | ||
2077 | .mask = mask_ht_irq, | ||
2078 | .unmask = unmask_ht_irq, | ||
2079 | .ack = ack_apic_edge, | ||
2080 | #ifdef CONFIG_SMP | ||
2081 | .set_affinity = set_ht_irq_affinity, | ||
2082 | #endif | ||
2083 | .retrigger = ioapic_retrigger_irq, | ||
2084 | }; | ||
2085 | |||
2086 | int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | ||
2087 | { | ||
2088 | struct irq_cfg *cfg = irq_cfg + irq; | ||
2089 | int err; | ||
2090 | cpumask_t tmp; | ||
2091 | |||
2092 | tmp = TARGET_CPUS; | ||
2093 | err = assign_irq_vector(irq, tmp); | ||
2094 | if (!err) { | ||
2095 | struct ht_irq_msg msg; | ||
2096 | unsigned dest; | ||
2097 | |||
2098 | cpus_and(tmp, cfg->domain, tmp); | ||
2099 | dest = cpu_mask_to_apicid(tmp); | ||
2100 | |||
2101 | msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); | ||
2102 | |||
2103 | msg.address_lo = | ||
2104 | HT_IRQ_LOW_BASE | | ||
2105 | HT_IRQ_LOW_DEST_ID(dest) | | ||
2106 | HT_IRQ_LOW_VECTOR(cfg->vector) | | ||
2107 | ((INT_DEST_MODE == 0) ? | ||
2108 | HT_IRQ_LOW_DM_PHYSICAL : | ||
2109 | HT_IRQ_LOW_DM_LOGICAL) | | ||
2110 | HT_IRQ_LOW_RQEOI_EDGE | | ||
2111 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2112 | HT_IRQ_LOW_MT_FIXED : | ||
2113 | HT_IRQ_LOW_MT_ARBITRATED) | | ||
2114 | HT_IRQ_LOW_IRQ_MASKED; | ||
2115 | |||
2116 | write_ht_irq_msg(irq, &msg); | ||
2117 | |||
2118 | set_irq_chip_and_handler_name(irq, &ht_irq_chip, | ||
2119 | handle_edge_irq, "edge"); | ||
2120 | } | ||
2121 | return err; | ||
2122 | } | ||
2123 | #endif /* CONFIG_HT_IRQ */ | ||
2124 | |||
2125 | /* -------------------------------------------------------------------------- | ||
2126 | ACPI-based IOAPIC Configuration | ||
2127 | -------------------------------------------------------------------------- */ | ||
2128 | |||
2129 | #ifdef CONFIG_ACPI | ||
2130 | |||
2131 | #define IO_APIC_MAX_ID 0xFE | ||
2132 | |||
2133 | int __init io_apic_get_redir_entries (int ioapic) | ||
2134 | { | ||
2135 | union IO_APIC_reg_01 reg_01; | ||
2136 | unsigned long flags; | ||
2137 | |||
2138 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2139 | reg_01.raw = io_apic_read(ioapic, 1); | ||
2140 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2141 | |||
2142 | return reg_01.bits.entries; | ||
2143 | } | ||
2144 | |||
2145 | |||
2146 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) | ||
2147 | { | ||
2148 | if (!IO_APIC_IRQ(irq)) { | ||
2149 | apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
2150 | ioapic); | ||
2151 | return -EINVAL; | ||
2152 | } | ||
2153 | |||
2154 | /* | ||
2155 | * IRQs < 16 are already in the irq_2_pin[] map | ||
2156 | */ | ||
2157 | if (irq >= 16) | ||
2158 | add_pin_to_irq(irq, ioapic, pin); | ||
2159 | |||
2160 | setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); | ||
2161 | |||
2162 | return 0; | ||
2163 | } | ||
2164 | |||
2165 | #endif /* CONFIG_ACPI */ | ||
2166 | |||
2167 | |||
2168 | /* | ||
2169 | * This function currently is only a helper for the i386 smp boot process where | ||
2170 | * we need to reprogram the ioredtbls to cater for the cpus which have come online | ||
2171 | * so mask in all cases should simply be TARGET_CPUS | ||
2172 | */ | ||
2173 | #ifdef CONFIG_SMP | ||
2174 | void __init setup_ioapic_dest(void) | ||
2175 | { | ||
2176 | int pin, ioapic, irq, irq_entry; | ||
2177 | |||
2178 | if (skip_ioapic_setup == 1) | ||
2179 | return; | ||
2180 | |||
2181 | for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | ||
2182 | for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | ||
2183 | irq_entry = find_irq_entry(ioapic, pin, mp_INT); | ||
2184 | if (irq_entry == -1) | ||
2185 | continue; | ||
2186 | irq = pin_2_irq(irq_entry, ioapic, pin); | ||
2187 | |||
2188 | /* setup_IO_APIC_irqs could fail to get vector for some device | ||
2189 | * when you have too many devices, because at that time only boot | ||
2190 | * cpu is online. | ||
2191 | */ | ||
2192 | if (!irq_cfg[irq].vector) | ||
2193 | setup_IO_APIC_irq(ioapic, pin, irq, | ||
2194 | irq_trigger(irq_entry), | ||
2195 | irq_polarity(irq_entry)); | ||
2196 | else | ||
2197 | set_ioapic_affinity_irq(irq, TARGET_CPUS); | ||
2198 | } | ||
2199 | |||
2200 | } | ||
2201 | } | ||
2202 | #endif | ||
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c deleted file mode 100644 index 653efa30b0f4..000000000000 --- a/arch/x86_64/kernel/ioport.c +++ /dev/null | |||
@@ -1,119 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ioport.c | ||
3 | * | ||
4 | * This contains the io-permission bitmap code - written by obz, with changes | ||
5 | * by Linus. | ||
6 | */ | ||
7 | |||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/capability.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/ioport.h> | ||
14 | #include <linux/smp.h> | ||
15 | #include <linux/stddef.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/thread_info.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | |||
20 | /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | ||
21 | static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) | ||
22 | { | ||
23 | int i; | ||
24 | if (new_value) | ||
25 | for (i = base; i < base + extent; i++) | ||
26 | __set_bit(i, bitmap); | ||
27 | else | ||
28 | for (i = base; i < base + extent; i++) | ||
29 | clear_bit(i, bitmap); | ||
30 | } | ||
31 | |||
32 | /* | ||
33 | * this changes the io permissions bitmap in the current task. | ||
34 | */ | ||
35 | asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | ||
36 | { | ||
37 | unsigned int i, max_long, bytes, bytes_updated; | ||
38 | struct thread_struct * t = ¤t->thread; | ||
39 | struct tss_struct * tss; | ||
40 | unsigned long *bitmap; | ||
41 | |||
42 | if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | ||
43 | return -EINVAL; | ||
44 | if (turn_on && !capable(CAP_SYS_RAWIO)) | ||
45 | return -EPERM; | ||
46 | |||
47 | /* | ||
48 | * If it's the first ioperm() call in this thread's lifetime, set the | ||
49 | * IO bitmap up. ioperm() is much less timing critical than clone(), | ||
50 | * this is why we delay this operation until now: | ||
51 | */ | ||
52 | if (!t->io_bitmap_ptr) { | ||
53 | bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
54 | if (!bitmap) | ||
55 | return -ENOMEM; | ||
56 | |||
57 | memset(bitmap, 0xff, IO_BITMAP_BYTES); | ||
58 | t->io_bitmap_ptr = bitmap; | ||
59 | set_thread_flag(TIF_IO_BITMAP); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * do it in the per-thread copy and in the TSS ... | ||
64 | * | ||
65 | * Disable preemption via get_cpu() - we must not switch away | ||
66 | * because the ->io_bitmap_max value must match the bitmap | ||
67 | * contents: | ||
68 | */ | ||
69 | tss = &per_cpu(init_tss, get_cpu()); | ||
70 | |||
71 | set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | ||
72 | |||
73 | /* | ||
74 | * Search for a (possibly new) maximum. This is simple and stupid, | ||
75 | * to keep it obviously correct: | ||
76 | */ | ||
77 | max_long = 0; | ||
78 | for (i = 0; i < IO_BITMAP_LONGS; i++) | ||
79 | if (t->io_bitmap_ptr[i] != ~0UL) | ||
80 | max_long = i; | ||
81 | |||
82 | bytes = (max_long + 1) * sizeof(long); | ||
83 | bytes_updated = max(bytes, t->io_bitmap_max); | ||
84 | |||
85 | t->io_bitmap_max = bytes; | ||
86 | |||
87 | /* Update the TSS: */ | ||
88 | memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); | ||
89 | |||
90 | put_cpu(); | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * sys_iopl has to be used when you want to access the IO ports | ||
97 | * beyond the 0x3ff range: to get the full 65536 ports bitmapped | ||
98 | * you'd need 8kB of bitmaps/process, which is a bit excessive. | ||
99 | * | ||
100 | * Here we just change the eflags value on the stack: we allow | ||
101 | * only the super-user to do it. This depends on the stack-layout | ||
102 | * on system-call entry - see also fork() and the signal handling | ||
103 | * code. | ||
104 | */ | ||
105 | |||
106 | asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) | ||
107 | { | ||
108 | unsigned int old = (regs->eflags >> 12) & 3; | ||
109 | |||
110 | if (level > 3) | ||
111 | return -EINVAL; | ||
112 | /* Trying to gain more privileges? */ | ||
113 | if (level > old) { | ||
114 | if (!capable(CAP_SYS_RAWIO)) | ||
115 | return -EPERM; | ||
116 | } | ||
117 | regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); | ||
118 | return 0; | ||
119 | } | ||
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c deleted file mode 100644 index 39cb3fa83ebb..000000000000 --- a/arch/x86_64/kernel/irq.c +++ /dev/null | |||
@@ -1,213 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/irq.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the lowest level x86_64-specific interrupt | ||
7 | * entry and irq statistics code. All the remaining irq logic is | ||
8 | * done by the generic kernel/irq/ code and in the | ||
9 | * x86_64-specific irq controller code. (e.g. i8259.c and | ||
10 | * io_apic.c.) | ||
11 | */ | ||
12 | |||
13 | #include <linux/kernel_stat.h> | ||
14 | #include <linux/interrupt.h> | ||
15 | #include <linux/seq_file.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <asm/uaccess.h> | ||
19 | #include <asm/io_apic.h> | ||
20 | #include <asm/idle.h> | ||
21 | #include <asm/smp.h> | ||
22 | |||
23 | atomic_t irq_err_count; | ||
24 | |||
25 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
26 | /* | ||
27 | * Probabilistic stack overflow check: | ||
28 | * | ||
29 | * Only check the stack in process context, because everything else | ||
30 | * runs on the big interrupt stacks. Checking reliably is too expensive, | ||
31 | * so we just check from interrupts. | ||
32 | */ | ||
33 | static inline void stack_overflow_check(struct pt_regs *regs) | ||
34 | { | ||
35 | u64 curbase = (u64)task_stack_page(current); | ||
36 | static unsigned long warned = -60*HZ; | ||
37 | |||
38 | if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && | ||
39 | regs->rsp < curbase + sizeof(struct thread_info) + 128 && | ||
40 | time_after(jiffies, warned + 60*HZ)) { | ||
41 | printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", | ||
42 | current->comm, curbase, regs->rsp); | ||
43 | show_stack(NULL,NULL); | ||
44 | warned = jiffies; | ||
45 | } | ||
46 | } | ||
47 | #endif | ||
48 | |||
49 | /* | ||
50 | * Generic, controller-independent functions: | ||
51 | */ | ||
52 | |||
53 | int show_interrupts(struct seq_file *p, void *v) | ||
54 | { | ||
55 | int i = *(loff_t *) v, j; | ||
56 | struct irqaction * action; | ||
57 | unsigned long flags; | ||
58 | |||
59 | if (i == 0) { | ||
60 | seq_printf(p, " "); | ||
61 | for_each_online_cpu(j) | ||
62 | seq_printf(p, "CPU%-8d",j); | ||
63 | seq_putc(p, '\n'); | ||
64 | } | ||
65 | |||
66 | if (i < NR_IRQS) { | ||
67 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
68 | action = irq_desc[i].action; | ||
69 | if (!action) | ||
70 | goto skip; | ||
71 | seq_printf(p, "%3d: ",i); | ||
72 | #ifndef CONFIG_SMP | ||
73 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
74 | #else | ||
75 | for_each_online_cpu(j) | ||
76 | seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | ||
77 | #endif | ||
78 | seq_printf(p, " %8s", irq_desc[i].chip->name); | ||
79 | seq_printf(p, "-%-8s", irq_desc[i].name); | ||
80 | |||
81 | seq_printf(p, " %s", action->name); | ||
82 | for (action=action->next; action; action = action->next) | ||
83 | seq_printf(p, ", %s", action->name); | ||
84 | seq_putc(p, '\n'); | ||
85 | skip: | ||
86 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
87 | } else if (i == NR_IRQS) { | ||
88 | seq_printf(p, "NMI: "); | ||
89 | for_each_online_cpu(j) | ||
90 | seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); | ||
91 | seq_putc(p, '\n'); | ||
92 | seq_printf(p, "LOC: "); | ||
93 | for_each_online_cpu(j) | ||
94 | seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); | ||
95 | seq_putc(p, '\n'); | ||
96 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
97 | } | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * do_IRQ handles all normal device IRQ's (the special | ||
103 | * SMP cross-CPU interrupts have their own specific | ||
104 | * handlers). | ||
105 | */ | ||
106 | asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | ||
107 | { | ||
108 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
109 | |||
110 | /* high bit used in ret_from_ code */ | ||
111 | unsigned vector = ~regs->orig_rax; | ||
112 | unsigned irq; | ||
113 | |||
114 | exit_idle(); | ||
115 | irq_enter(); | ||
116 | irq = __get_cpu_var(vector_irq)[vector]; | ||
117 | |||
118 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
119 | stack_overflow_check(regs); | ||
120 | #endif | ||
121 | |||
122 | if (likely(irq < NR_IRQS)) | ||
123 | generic_handle_irq(irq); | ||
124 | else { | ||
125 | if (!disable_apic) | ||
126 | ack_APIC_irq(); | ||
127 | |||
128 | if (printk_ratelimit()) | ||
129 | printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", | ||
130 | __func__, smp_processor_id(), vector); | ||
131 | } | ||
132 | |||
133 | irq_exit(); | ||
134 | |||
135 | set_irq_regs(old_regs); | ||
136 | return 1; | ||
137 | } | ||
138 | |||
139 | #ifdef CONFIG_HOTPLUG_CPU | ||
140 | void fixup_irqs(cpumask_t map) | ||
141 | { | ||
142 | unsigned int irq; | ||
143 | static int warned; | ||
144 | |||
145 | for (irq = 0; irq < NR_IRQS; irq++) { | ||
146 | cpumask_t mask; | ||
147 | int break_affinity = 0; | ||
148 | int set_affinity = 1; | ||
149 | |||
150 | if (irq == 2) | ||
151 | continue; | ||
152 | |||
153 | /* interrupt's are disabled at this point */ | ||
154 | spin_lock(&irq_desc[irq].lock); | ||
155 | |||
156 | if (!irq_has_action(irq) || | ||
157 | cpus_equal(irq_desc[irq].affinity, map)) { | ||
158 | spin_unlock(&irq_desc[irq].lock); | ||
159 | continue; | ||
160 | } | ||
161 | |||
162 | cpus_and(mask, irq_desc[irq].affinity, map); | ||
163 | if (cpus_empty(mask)) { | ||
164 | break_affinity = 1; | ||
165 | mask = map; | ||
166 | } | ||
167 | |||
168 | if (irq_desc[irq].chip->mask) | ||
169 | irq_desc[irq].chip->mask(irq); | ||
170 | |||
171 | if (irq_desc[irq].chip->set_affinity) | ||
172 | irq_desc[irq].chip->set_affinity(irq, mask); | ||
173 | else if (!(warned++)) | ||
174 | set_affinity = 0; | ||
175 | |||
176 | if (irq_desc[irq].chip->unmask) | ||
177 | irq_desc[irq].chip->unmask(irq); | ||
178 | |||
179 | spin_unlock(&irq_desc[irq].lock); | ||
180 | |||
181 | if (break_affinity && set_affinity) | ||
182 | printk("Broke affinity for irq %i\n", irq); | ||
183 | else if (!set_affinity) | ||
184 | printk("Cannot set affinity for irq %i\n", irq); | ||
185 | } | ||
186 | |||
187 | /* That doesn't seem sufficient. Give it 1ms. */ | ||
188 | local_irq_enable(); | ||
189 | mdelay(1); | ||
190 | local_irq_disable(); | ||
191 | } | ||
192 | #endif | ||
193 | |||
194 | extern void call_softirq(void); | ||
195 | |||
196 | asmlinkage void do_softirq(void) | ||
197 | { | ||
198 | __u32 pending; | ||
199 | unsigned long flags; | ||
200 | |||
201 | if (in_interrupt()) | ||
202 | return; | ||
203 | |||
204 | local_irq_save(flags); | ||
205 | pending = local_softirq_pending(); | ||
206 | /* Switch to interrupt stack */ | ||
207 | if (pending) { | ||
208 | call_softirq(); | ||
209 | WARN_ON_ONCE(softirq_count()); | ||
210 | } | ||
211 | local_irq_restore(flags); | ||
212 | } | ||
213 | EXPORT_SYMBOL(do_softirq); | ||
diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c deleted file mode 100644 index 7377ccb21335..000000000000 --- a/arch/x86_64/kernel/k8.c +++ /dev/null | |||
@@ -1,123 +0,0 @@ | |||
1 | /* | ||
2 | * Shared support code for AMD K8 northbridges and derivates. | ||
3 | * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. | ||
4 | */ | ||
5 | #include <linux/gfp.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/spinlock.h> | ||
11 | #include <asm/k8.h> | ||
12 | |||
13 | int num_k8_northbridges; | ||
14 | EXPORT_SYMBOL(num_k8_northbridges); | ||
15 | |||
16 | static u32 *flush_words; | ||
17 | |||
18 | struct pci_device_id k8_nb_ids[] = { | ||
19 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, | ||
20 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, | ||
21 | {} | ||
22 | }; | ||
23 | EXPORT_SYMBOL(k8_nb_ids); | ||
24 | |||
25 | struct pci_dev **k8_northbridges; | ||
26 | EXPORT_SYMBOL(k8_northbridges); | ||
27 | |||
28 | static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) | ||
29 | { | ||
30 | do { | ||
31 | dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); | ||
32 | if (!dev) | ||
33 | break; | ||
34 | } while (!pci_match_id(&k8_nb_ids[0], dev)); | ||
35 | return dev; | ||
36 | } | ||
37 | |||
38 | int cache_k8_northbridges(void) | ||
39 | { | ||
40 | int i; | ||
41 | struct pci_dev *dev; | ||
42 | |||
43 | if (num_k8_northbridges) | ||
44 | return 0; | ||
45 | |||
46 | dev = NULL; | ||
47 | while ((dev = next_k8_northbridge(dev)) != NULL) | ||
48 | num_k8_northbridges++; | ||
49 | |||
50 | k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), | ||
51 | GFP_KERNEL); | ||
52 | if (!k8_northbridges) | ||
53 | return -ENOMEM; | ||
54 | |||
55 | if (!num_k8_northbridges) { | ||
56 | k8_northbridges[0] = NULL; | ||
57 | return 0; | ||
58 | } | ||
59 | |||
60 | flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); | ||
61 | if (!flush_words) { | ||
62 | kfree(k8_northbridges); | ||
63 | return -ENOMEM; | ||
64 | } | ||
65 | |||
66 | dev = NULL; | ||
67 | i = 0; | ||
68 | while ((dev = next_k8_northbridge(dev)) != NULL) { | ||
69 | k8_northbridges[i] = dev; | ||
70 | pci_read_config_dword(dev, 0x9c, &flush_words[i++]); | ||
71 | } | ||
72 | k8_northbridges[i] = NULL; | ||
73 | return 0; | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(cache_k8_northbridges); | ||
76 | |||
77 | /* Ignores subdevice/subvendor but as far as I can figure out | ||
78 | they're useless anyways */ | ||
79 | int __init early_is_k8_nb(u32 device) | ||
80 | { | ||
81 | struct pci_device_id *id; | ||
82 | u32 vendor = device & 0xffff; | ||
83 | device >>= 16; | ||
84 | for (id = k8_nb_ids; id->vendor; id++) | ||
85 | if (vendor == id->vendor && device == id->device) | ||
86 | return 1; | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | void k8_flush_garts(void) | ||
91 | { | ||
92 | int flushed, i; | ||
93 | unsigned long flags; | ||
94 | static DEFINE_SPINLOCK(gart_lock); | ||
95 | |||
96 | /* Avoid races between AGP and IOMMU. In theory it's not needed | ||
97 | but I'm not sure if the hardware won't lose flush requests | ||
98 | when another is pending. This whole thing is so expensive anyways | ||
99 | that it doesn't matter to serialize more. -AK */ | ||
100 | spin_lock_irqsave(&gart_lock, flags); | ||
101 | flushed = 0; | ||
102 | for (i = 0; i < num_k8_northbridges; i++) { | ||
103 | pci_write_config_dword(k8_northbridges[i], 0x9c, | ||
104 | flush_words[i]|1); | ||
105 | flushed++; | ||
106 | } | ||
107 | for (i = 0; i < num_k8_northbridges; i++) { | ||
108 | u32 w; | ||
109 | /* Make sure the hardware actually executed the flush*/ | ||
110 | for (;;) { | ||
111 | pci_read_config_dword(k8_northbridges[i], | ||
112 | 0x9c, &w); | ||
113 | if (!(w & 1)) | ||
114 | break; | ||
115 | cpu_relax(); | ||
116 | } | ||
117 | } | ||
118 | spin_unlock_irqrestore(&gart_lock, flags); | ||
119 | if (!flushed) | ||
120 | printk("nothing to flush?\n"); | ||
121 | } | ||
122 | EXPORT_SYMBOL_GPL(k8_flush_garts); | ||
123 | |||
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c deleted file mode 100644 index a30e004682e2..000000000000 --- a/arch/x86_64/kernel/kprobes.c +++ /dev/null | |||
@@ -1,749 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel Probes (KProbes) | ||
3 | * arch/x86_64/kernel/kprobes.c | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
20 | * | ||
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | ||
22 | * Probes initial implementation ( includes contributions from | ||
23 | * Rusty Russell). | ||
24 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | ||
25 | * interface to access function arguments. | ||
26 | * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi | ||
27 | * <prasanna@in.ibm.com> adapted for x86_64 | ||
28 | * 2005-Mar Roland McGrath <roland@redhat.com> | ||
29 | * Fixed to handle %rip-relative addressing mode correctly. | ||
30 | * 2005-May Rusty Lynch <rusty.lynch@intel.com> | ||
31 | * Added function return probes functionality | ||
32 | */ | ||
33 | |||
34 | #include <linux/kprobes.h> | ||
35 | #include <linux/ptrace.h> | ||
36 | #include <linux/string.h> | ||
37 | #include <linux/slab.h> | ||
38 | #include <linux/preempt.h> | ||
39 | #include <linux/module.h> | ||
40 | #include <linux/kdebug.h> | ||
41 | |||
42 | #include <asm/pgtable.h> | ||
43 | #include <asm/uaccess.h> | ||
44 | #include <asm/alternative.h> | ||
45 | |||
46 | void jprobe_return_end(void); | ||
47 | static void __kprobes arch_copy_kprobe(struct kprobe *p); | ||
48 | |||
49 | DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; | ||
50 | DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); | ||
51 | |||
52 | /* | ||
53 | * returns non-zero if opcode modifies the interrupt flag. | ||
54 | */ | ||
55 | static __always_inline int is_IF_modifier(kprobe_opcode_t *insn) | ||
56 | { | ||
57 | switch (*insn) { | ||
58 | case 0xfa: /* cli */ | ||
59 | case 0xfb: /* sti */ | ||
60 | case 0xcf: /* iret/iretd */ | ||
61 | case 0x9d: /* popf/popfd */ | ||
62 | return 1; | ||
63 | } | ||
64 | |||
65 | if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) | ||
66 | return 1; | ||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | int __kprobes arch_prepare_kprobe(struct kprobe *p) | ||
71 | { | ||
72 | /* insn: must be on special executable page on x86_64. */ | ||
73 | p->ainsn.insn = get_insn_slot(); | ||
74 | if (!p->ainsn.insn) { | ||
75 | return -ENOMEM; | ||
76 | } | ||
77 | arch_copy_kprobe(p); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * Determine if the instruction uses the %rip-relative addressing mode. | ||
83 | * If it does, return the address of the 32-bit displacement word. | ||
84 | * If not, return null. | ||
85 | */ | ||
86 | static s32 __kprobes *is_riprel(u8 *insn) | ||
87 | { | ||
88 | #define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ | ||
89 | (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ | ||
90 | (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ | ||
91 | (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ | ||
92 | (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ | ||
93 | << (row % 64)) | ||
94 | static const u64 onebyte_has_modrm[256 / 64] = { | ||
95 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
96 | /* ------------------------------- */ | ||
97 | W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ | ||
98 | W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ | ||
99 | W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ | ||
100 | W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ | ||
101 | W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ | ||
102 | W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ | ||
103 | W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ | ||
104 | W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ | ||
105 | W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ | ||
106 | W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ | ||
107 | W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ | ||
108 | W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ | ||
109 | W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ | ||
110 | W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ | ||
111 | W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ | ||
112 | W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ | ||
113 | /* ------------------------------- */ | ||
114 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
115 | }; | ||
116 | static const u64 twobyte_has_modrm[256 / 64] = { | ||
117 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
118 | /* ------------------------------- */ | ||
119 | W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ | ||
120 | W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ | ||
121 | W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ | ||
122 | W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ | ||
123 | W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ | ||
124 | W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ | ||
125 | W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ | ||
126 | W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ | ||
127 | W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ | ||
128 | W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ | ||
129 | W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ | ||
130 | W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ | ||
131 | W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ | ||
132 | W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ | ||
133 | W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ | ||
134 | W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ | ||
135 | /* ------------------------------- */ | ||
136 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
137 | }; | ||
138 | #undef W | ||
139 | int need_modrm; | ||
140 | |||
141 | /* Skip legacy instruction prefixes. */ | ||
142 | while (1) { | ||
143 | switch (*insn) { | ||
144 | case 0x66: | ||
145 | case 0x67: | ||
146 | case 0x2e: | ||
147 | case 0x3e: | ||
148 | case 0x26: | ||
149 | case 0x64: | ||
150 | case 0x65: | ||
151 | case 0x36: | ||
152 | case 0xf0: | ||
153 | case 0xf3: | ||
154 | case 0xf2: | ||
155 | ++insn; | ||
156 | continue; | ||
157 | } | ||
158 | break; | ||
159 | } | ||
160 | |||
161 | /* Skip REX instruction prefix. */ | ||
162 | if ((*insn & 0xf0) == 0x40) | ||
163 | ++insn; | ||
164 | |||
165 | if (*insn == 0x0f) { /* Two-byte opcode. */ | ||
166 | ++insn; | ||
167 | need_modrm = test_bit(*insn, twobyte_has_modrm); | ||
168 | } else { /* One-byte opcode. */ | ||
169 | need_modrm = test_bit(*insn, onebyte_has_modrm); | ||
170 | } | ||
171 | |||
172 | if (need_modrm) { | ||
173 | u8 modrm = *++insn; | ||
174 | if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ | ||
175 | /* Displacement follows ModRM byte. */ | ||
176 | return (s32 *) ++insn; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | /* No %rip-relative addressing mode here. */ | ||
181 | return NULL; | ||
182 | } | ||
183 | |||
184 | static void __kprobes arch_copy_kprobe(struct kprobe *p) | ||
185 | { | ||
186 | s32 *ripdisp; | ||
187 | memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); | ||
188 | ripdisp = is_riprel(p->ainsn.insn); | ||
189 | if (ripdisp) { | ||
190 | /* | ||
191 | * The copied instruction uses the %rip-relative | ||
192 | * addressing mode. Adjust the displacement for the | ||
193 | * difference between the original location of this | ||
194 | * instruction and the location of the copy that will | ||
195 | * actually be run. The tricky bit here is making sure | ||
196 | * that the sign extension happens correctly in this | ||
197 | * calculation, since we need a signed 32-bit result to | ||
198 | * be sign-extended to 64 bits when it's added to the | ||
199 | * %rip value and yield the same 64-bit result that the | ||
200 | * sign-extension of the original signed 32-bit | ||
201 | * displacement would have given. | ||
202 | */ | ||
203 | s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; | ||
204 | BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ | ||
205 | *ripdisp = disp; | ||
206 | } | ||
207 | p->opcode = *p->addr; | ||
208 | } | ||
209 | |||
210 | void __kprobes arch_arm_kprobe(struct kprobe *p) | ||
211 | { | ||
212 | text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); | ||
213 | } | ||
214 | |||
215 | void __kprobes arch_disarm_kprobe(struct kprobe *p) | ||
216 | { | ||
217 | text_poke(p->addr, &p->opcode, 1); | ||
218 | } | ||
219 | |||
220 | void __kprobes arch_remove_kprobe(struct kprobe *p) | ||
221 | { | ||
222 | mutex_lock(&kprobe_mutex); | ||
223 | free_insn_slot(p->ainsn.insn, 0); | ||
224 | mutex_unlock(&kprobe_mutex); | ||
225 | } | ||
226 | |||
227 | static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) | ||
228 | { | ||
229 | kcb->prev_kprobe.kp = kprobe_running(); | ||
230 | kcb->prev_kprobe.status = kcb->kprobe_status; | ||
231 | kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; | ||
232 | kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; | ||
233 | } | ||
234 | |||
235 | static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) | ||
236 | { | ||
237 | __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; | ||
238 | kcb->kprobe_status = kcb->prev_kprobe.status; | ||
239 | kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; | ||
240 | kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; | ||
241 | } | ||
242 | |||
243 | static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, | ||
244 | struct kprobe_ctlblk *kcb) | ||
245 | { | ||
246 | __get_cpu_var(current_kprobe) = p; | ||
247 | kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags | ||
248 | = (regs->eflags & (TF_MASK | IF_MASK)); | ||
249 | if (is_IF_modifier(p->ainsn.insn)) | ||
250 | kcb->kprobe_saved_rflags &= ~IF_MASK; | ||
251 | } | ||
252 | |||
253 | static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | ||
254 | { | ||
255 | regs->eflags |= TF_MASK; | ||
256 | regs->eflags &= ~IF_MASK; | ||
257 | /*single step inline if the instruction is an int3*/ | ||
258 | if (p->opcode == BREAKPOINT_INSTRUCTION) | ||
259 | regs->rip = (unsigned long)p->addr; | ||
260 | else | ||
261 | regs->rip = (unsigned long)p->ainsn.insn; | ||
262 | } | ||
263 | |||
264 | /* Called with kretprobe_lock held */ | ||
265 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | ||
266 | struct pt_regs *regs) | ||
267 | { | ||
268 | unsigned long *sara = (unsigned long *)regs->rsp; | ||
269 | |||
270 | ri->ret_addr = (kprobe_opcode_t *) *sara; | ||
271 | /* Replace the return addr with trampoline addr */ | ||
272 | *sara = (unsigned long) &kretprobe_trampoline; | ||
273 | } | ||
274 | |||
275 | int __kprobes kprobe_handler(struct pt_regs *regs) | ||
276 | { | ||
277 | struct kprobe *p; | ||
278 | int ret = 0; | ||
279 | kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); | ||
280 | struct kprobe_ctlblk *kcb; | ||
281 | |||
282 | /* | ||
283 | * We don't want to be preempted for the entire | ||
284 | * duration of kprobe processing | ||
285 | */ | ||
286 | preempt_disable(); | ||
287 | kcb = get_kprobe_ctlblk(); | ||
288 | |||
289 | /* Check we're not actually recursing */ | ||
290 | if (kprobe_running()) { | ||
291 | p = get_kprobe(addr); | ||
292 | if (p) { | ||
293 | if (kcb->kprobe_status == KPROBE_HIT_SS && | ||
294 | *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { | ||
295 | regs->eflags &= ~TF_MASK; | ||
296 | regs->eflags |= kcb->kprobe_saved_rflags; | ||
297 | goto no_kprobe; | ||
298 | } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { | ||
299 | /* TODO: Provide re-entrancy from | ||
300 | * post_kprobes_handler() and avoid exception | ||
301 | * stack corruption while single-stepping on | ||
302 | * the instruction of the new probe. | ||
303 | */ | ||
304 | arch_disarm_kprobe(p); | ||
305 | regs->rip = (unsigned long)p->addr; | ||
306 | reset_current_kprobe(); | ||
307 | ret = 1; | ||
308 | } else { | ||
309 | /* We have reentered the kprobe_handler(), since | ||
310 | * another probe was hit while within the | ||
311 | * handler. We here save the original kprobe | ||
312 | * variables and just single step on instruction | ||
313 | * of the new probe without calling any user | ||
314 | * handlers. | ||
315 | */ | ||
316 | save_previous_kprobe(kcb); | ||
317 | set_current_kprobe(p, regs, kcb); | ||
318 | kprobes_inc_nmissed_count(p); | ||
319 | prepare_singlestep(p, regs); | ||
320 | kcb->kprobe_status = KPROBE_REENTER; | ||
321 | return 1; | ||
322 | } | ||
323 | } else { | ||
324 | if (*addr != BREAKPOINT_INSTRUCTION) { | ||
325 | /* The breakpoint instruction was removed by | ||
326 | * another cpu right after we hit, no further | ||
327 | * handling of this interrupt is appropriate | ||
328 | */ | ||
329 | regs->rip = (unsigned long)addr; | ||
330 | ret = 1; | ||
331 | goto no_kprobe; | ||
332 | } | ||
333 | p = __get_cpu_var(current_kprobe); | ||
334 | if (p->break_handler && p->break_handler(p, regs)) { | ||
335 | goto ss_probe; | ||
336 | } | ||
337 | } | ||
338 | goto no_kprobe; | ||
339 | } | ||
340 | |||
341 | p = get_kprobe(addr); | ||
342 | if (!p) { | ||
343 | if (*addr != BREAKPOINT_INSTRUCTION) { | ||
344 | /* | ||
345 | * The breakpoint instruction was removed right | ||
346 | * after we hit it. Another cpu has removed | ||
347 | * either a probepoint or a debugger breakpoint | ||
348 | * at this address. In either case, no further | ||
349 | * handling of this interrupt is appropriate. | ||
350 | * Back up over the (now missing) int3 and run | ||
351 | * the original instruction. | ||
352 | */ | ||
353 | regs->rip = (unsigned long)addr; | ||
354 | ret = 1; | ||
355 | } | ||
356 | /* Not one of ours: let kernel handle it */ | ||
357 | goto no_kprobe; | ||
358 | } | ||
359 | |||
360 | set_current_kprobe(p, regs, kcb); | ||
361 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; | ||
362 | |||
363 | if (p->pre_handler && p->pre_handler(p, regs)) | ||
364 | /* handler has already set things up, so skip ss setup */ | ||
365 | return 1; | ||
366 | |||
367 | ss_probe: | ||
368 | prepare_singlestep(p, regs); | ||
369 | kcb->kprobe_status = KPROBE_HIT_SS; | ||
370 | return 1; | ||
371 | |||
372 | no_kprobe: | ||
373 | preempt_enable_no_resched(); | ||
374 | return ret; | ||
375 | } | ||
376 | |||
377 | /* | ||
378 | * For function-return probes, init_kprobes() establishes a probepoint | ||
379 | * here. When a retprobed function returns, this probe is hit and | ||
380 | * trampoline_probe_handler() runs, calling the kretprobe's handler. | ||
381 | */ | ||
382 | void kretprobe_trampoline_holder(void) | ||
383 | { | ||
384 | asm volatile ( ".global kretprobe_trampoline\n" | ||
385 | "kretprobe_trampoline: \n" | ||
386 | "nop\n"); | ||
387 | } | ||
388 | |||
389 | /* | ||
390 | * Called when we hit the probe point at kretprobe_trampoline | ||
391 | */ | ||
392 | int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) | ||
393 | { | ||
394 | struct kretprobe_instance *ri = NULL; | ||
395 | struct hlist_head *head, empty_rp; | ||
396 | struct hlist_node *node, *tmp; | ||
397 | unsigned long flags, orig_ret_address = 0; | ||
398 | unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; | ||
399 | |||
400 | INIT_HLIST_HEAD(&empty_rp); | ||
401 | spin_lock_irqsave(&kretprobe_lock, flags); | ||
402 | head = kretprobe_inst_table_head(current); | ||
403 | |||
404 | /* | ||
405 | * It is possible to have multiple instances associated with a given | ||
406 | * task either because an multiple functions in the call path | ||
407 | * have a return probe installed on them, and/or more then one return | ||
408 | * return probe was registered for a target function. | ||
409 | * | ||
410 | * We can handle this because: | ||
411 | * - instances are always inserted at the head of the list | ||
412 | * - when multiple return probes are registered for the same | ||
413 | * function, the first instance's ret_addr will point to the | ||
414 | * real return address, and all the rest will point to | ||
415 | * kretprobe_trampoline | ||
416 | */ | ||
417 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | ||
418 | if (ri->task != current) | ||
419 | /* another task is sharing our hash bucket */ | ||
420 | continue; | ||
421 | |||
422 | if (ri->rp && ri->rp->handler) | ||
423 | ri->rp->handler(ri, regs); | ||
424 | |||
425 | orig_ret_address = (unsigned long)ri->ret_addr; | ||
426 | recycle_rp_inst(ri, &empty_rp); | ||
427 | |||
428 | if (orig_ret_address != trampoline_address) | ||
429 | /* | ||
430 | * This is the real return address. Any other | ||
431 | * instances associated with this task are for | ||
432 | * other calls deeper on the call stack | ||
433 | */ | ||
434 | break; | ||
435 | } | ||
436 | |||
437 | kretprobe_assert(ri, orig_ret_address, trampoline_address); | ||
438 | regs->rip = orig_ret_address; | ||
439 | |||
440 | reset_current_kprobe(); | ||
441 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
442 | preempt_enable_no_resched(); | ||
443 | |||
444 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { | ||
445 | hlist_del(&ri->hlist); | ||
446 | kfree(ri); | ||
447 | } | ||
448 | /* | ||
449 | * By returning a non-zero value, we are telling | ||
450 | * kprobe_handler() that we don't want the post_handler | ||
451 | * to run (and have re-enabled preemption) | ||
452 | */ | ||
453 | return 1; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Called after single-stepping. p->addr is the address of the | ||
458 | * instruction whose first byte has been replaced by the "int 3" | ||
459 | * instruction. To avoid the SMP problems that can occur when we | ||
460 | * temporarily put back the original opcode to single-step, we | ||
461 | * single-stepped a copy of the instruction. The address of this | ||
462 | * copy is p->ainsn.insn. | ||
463 | * | ||
464 | * This function prepares to return from the post-single-step | ||
465 | * interrupt. We have to fix up the stack as follows: | ||
466 | * | ||
467 | * 0) Except in the case of absolute or indirect jump or call instructions, | ||
468 | * the new rip is relative to the copied instruction. We need to make | ||
469 | * it relative to the original instruction. | ||
470 | * | ||
471 | * 1) If the single-stepped instruction was pushfl, then the TF and IF | ||
472 | * flags are set in the just-pushed eflags, and may need to be cleared. | ||
473 | * | ||
474 | * 2) If the single-stepped instruction was a call, the return address | ||
475 | * that is atop the stack is the address following the copied instruction. | ||
476 | * We need to make it the address following the original instruction. | ||
477 | */ | ||
478 | static void __kprobes resume_execution(struct kprobe *p, | ||
479 | struct pt_regs *regs, struct kprobe_ctlblk *kcb) | ||
480 | { | ||
481 | unsigned long *tos = (unsigned long *)regs->rsp; | ||
482 | unsigned long next_rip = 0; | ||
483 | unsigned long copy_rip = (unsigned long)p->ainsn.insn; | ||
484 | unsigned long orig_rip = (unsigned long)p->addr; | ||
485 | kprobe_opcode_t *insn = p->ainsn.insn; | ||
486 | |||
487 | /*skip the REX prefix*/ | ||
488 | if (*insn >= 0x40 && *insn <= 0x4f) | ||
489 | insn++; | ||
490 | |||
491 | switch (*insn) { | ||
492 | case 0x9c: /* pushfl */ | ||
493 | *tos &= ~(TF_MASK | IF_MASK); | ||
494 | *tos |= kcb->kprobe_old_rflags; | ||
495 | break; | ||
496 | case 0xc3: /* ret/lret */ | ||
497 | case 0xcb: | ||
498 | case 0xc2: | ||
499 | case 0xca: | ||
500 | regs->eflags &= ~TF_MASK; | ||
501 | /* rip is already adjusted, no more changes required*/ | ||
502 | return; | ||
503 | case 0xe8: /* call relative - Fix return addr */ | ||
504 | *tos = orig_rip + (*tos - copy_rip); | ||
505 | break; | ||
506 | case 0xff: | ||
507 | if ((insn[1] & 0x30) == 0x10) { | ||
508 | /* call absolute, indirect */ | ||
509 | /* Fix return addr; rip is correct. */ | ||
510 | next_rip = regs->rip; | ||
511 | *tos = orig_rip + (*tos - copy_rip); | ||
512 | } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ | ||
513 | ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ | ||
514 | /* rip is correct. */ | ||
515 | next_rip = regs->rip; | ||
516 | } | ||
517 | break; | ||
518 | case 0xea: /* jmp absolute -- rip is correct */ | ||
519 | next_rip = regs->rip; | ||
520 | break; | ||
521 | default: | ||
522 | break; | ||
523 | } | ||
524 | |||
525 | regs->eflags &= ~TF_MASK; | ||
526 | if (next_rip) { | ||
527 | regs->rip = next_rip; | ||
528 | } else { | ||
529 | regs->rip = orig_rip + (regs->rip - copy_rip); | ||
530 | } | ||
531 | } | ||
532 | |||
533 | int __kprobes post_kprobe_handler(struct pt_regs *regs) | ||
534 | { | ||
535 | struct kprobe *cur = kprobe_running(); | ||
536 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
537 | |||
538 | if (!cur) | ||
539 | return 0; | ||
540 | |||
541 | if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { | ||
542 | kcb->kprobe_status = KPROBE_HIT_SSDONE; | ||
543 | cur->post_handler(cur, regs, 0); | ||
544 | } | ||
545 | |||
546 | resume_execution(cur, regs, kcb); | ||
547 | regs->eflags |= kcb->kprobe_saved_rflags; | ||
548 | |||
549 | /* Restore the original saved kprobes variables and continue. */ | ||
550 | if (kcb->kprobe_status == KPROBE_REENTER) { | ||
551 | restore_previous_kprobe(kcb); | ||
552 | goto out; | ||
553 | } | ||
554 | reset_current_kprobe(); | ||
555 | out: | ||
556 | preempt_enable_no_resched(); | ||
557 | |||
558 | /* | ||
559 | * if somebody else is singlestepping across a probe point, eflags | ||
560 | * will have TF set, in which case, continue the remaining processing | ||
561 | * of do_debug, as if this is not a probe hit. | ||
562 | */ | ||
563 | if (regs->eflags & TF_MASK) | ||
564 | return 0; | ||
565 | |||
566 | return 1; | ||
567 | } | ||
568 | |||
569 | int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) | ||
570 | { | ||
571 | struct kprobe *cur = kprobe_running(); | ||
572 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
573 | const struct exception_table_entry *fixup; | ||
574 | |||
575 | switch(kcb->kprobe_status) { | ||
576 | case KPROBE_HIT_SS: | ||
577 | case KPROBE_REENTER: | ||
578 | /* | ||
579 | * We are here because the instruction being single | ||
580 | * stepped caused a page fault. We reset the current | ||
581 | * kprobe and the rip points back to the probe address | ||
582 | * and allow the page fault handler to continue as a | ||
583 | * normal page fault. | ||
584 | */ | ||
585 | regs->rip = (unsigned long)cur->addr; | ||
586 | regs->eflags |= kcb->kprobe_old_rflags; | ||
587 | if (kcb->kprobe_status == KPROBE_REENTER) | ||
588 | restore_previous_kprobe(kcb); | ||
589 | else | ||
590 | reset_current_kprobe(); | ||
591 | preempt_enable_no_resched(); | ||
592 | break; | ||
593 | case KPROBE_HIT_ACTIVE: | ||
594 | case KPROBE_HIT_SSDONE: | ||
595 | /* | ||
596 | * We increment the nmissed count for accounting, | ||
597 | * we can also use npre/npostfault count for accouting | ||
598 | * these specific fault cases. | ||
599 | */ | ||
600 | kprobes_inc_nmissed_count(cur); | ||
601 | |||
602 | /* | ||
603 | * We come here because instructions in the pre/post | ||
604 | * handler caused the page_fault, this could happen | ||
605 | * if handler tries to access user space by | ||
606 | * copy_from_user(), get_user() etc. Let the | ||
607 | * user-specified handler try to fix it first. | ||
608 | */ | ||
609 | if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) | ||
610 | return 1; | ||
611 | |||
612 | /* | ||
613 | * In case the user-specified fault handler returned | ||
614 | * zero, try to fix up. | ||
615 | */ | ||
616 | fixup = search_exception_tables(regs->rip); | ||
617 | if (fixup) { | ||
618 | regs->rip = fixup->fixup; | ||
619 | return 1; | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * fixup() could not handle it, | ||
624 | * Let do_page_fault() fix it. | ||
625 | */ | ||
626 | break; | ||
627 | default: | ||
628 | break; | ||
629 | } | ||
630 | return 0; | ||
631 | } | ||
632 | |||
633 | /* | ||
634 | * Wrapper routine for handling exceptions. | ||
635 | */ | ||
636 | int __kprobes kprobe_exceptions_notify(struct notifier_block *self, | ||
637 | unsigned long val, void *data) | ||
638 | { | ||
639 | struct die_args *args = (struct die_args *)data; | ||
640 | int ret = NOTIFY_DONE; | ||
641 | |||
642 | if (args->regs && user_mode(args->regs)) | ||
643 | return ret; | ||
644 | |||
645 | switch (val) { | ||
646 | case DIE_INT3: | ||
647 | if (kprobe_handler(args->regs)) | ||
648 | ret = NOTIFY_STOP; | ||
649 | break; | ||
650 | case DIE_DEBUG: | ||
651 | if (post_kprobe_handler(args->regs)) | ||
652 | ret = NOTIFY_STOP; | ||
653 | break; | ||
654 | case DIE_GPF: | ||
655 | case DIE_PAGE_FAULT: | ||
656 | /* kprobe_running() needs smp_processor_id() */ | ||
657 | preempt_disable(); | ||
658 | if (kprobe_running() && | ||
659 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
660 | ret = NOTIFY_STOP; | ||
661 | preempt_enable(); | ||
662 | break; | ||
663 | default: | ||
664 | break; | ||
665 | } | ||
666 | return ret; | ||
667 | } | ||
668 | |||
669 | int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
670 | { | ||
671 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
672 | unsigned long addr; | ||
673 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
674 | |||
675 | kcb->jprobe_saved_regs = *regs; | ||
676 | kcb->jprobe_saved_rsp = (long *) regs->rsp; | ||
677 | addr = (unsigned long)(kcb->jprobe_saved_rsp); | ||
678 | /* | ||
679 | * As Linus pointed out, gcc assumes that the callee | ||
680 | * owns the argument space and could overwrite it, e.g. | ||
681 | * tailcall optimization. So, to be absolutely safe | ||
682 | * we also save and restore enough stack bytes to cover | ||
683 | * the argument area. | ||
684 | */ | ||
685 | memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, | ||
686 | MIN_STACK_SIZE(addr)); | ||
687 | regs->eflags &= ~IF_MASK; | ||
688 | regs->rip = (unsigned long)(jp->entry); | ||
689 | return 1; | ||
690 | } | ||
691 | |||
692 | void __kprobes jprobe_return(void) | ||
693 | { | ||
694 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
695 | |||
696 | asm volatile (" xchg %%rbx,%%rsp \n" | ||
697 | " int3 \n" | ||
698 | " .globl jprobe_return_end \n" | ||
699 | " jprobe_return_end: \n" | ||
700 | " nop \n"::"b" | ||
701 | (kcb->jprobe_saved_rsp):"memory"); | ||
702 | } | ||
703 | |||
704 | int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
705 | { | ||
706 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
707 | u8 *addr = (u8 *) (regs->rip - 1); | ||
708 | unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); | ||
709 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
710 | |||
711 | if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { | ||
712 | if ((long *)regs->rsp != kcb->jprobe_saved_rsp) { | ||
713 | struct pt_regs *saved_regs = | ||
714 | container_of(kcb->jprobe_saved_rsp, | ||
715 | struct pt_regs, rsp); | ||
716 | printk("current rsp %p does not match saved rsp %p\n", | ||
717 | (long *)regs->rsp, kcb->jprobe_saved_rsp); | ||
718 | printk("Saved registers for jprobe %p\n", jp); | ||
719 | show_registers(saved_regs); | ||
720 | printk("Current registers\n"); | ||
721 | show_registers(regs); | ||
722 | BUG(); | ||
723 | } | ||
724 | *regs = kcb->jprobe_saved_regs; | ||
725 | memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, | ||
726 | MIN_STACK_SIZE(stack_addr)); | ||
727 | preempt_enable_no_resched(); | ||
728 | return 1; | ||
729 | } | ||
730 | return 0; | ||
731 | } | ||
732 | |||
733 | static struct kprobe trampoline_p = { | ||
734 | .addr = (kprobe_opcode_t *) &kretprobe_trampoline, | ||
735 | .pre_handler = trampoline_probe_handler | ||
736 | }; | ||
737 | |||
738 | int __init arch_init_kprobes(void) | ||
739 | { | ||
740 | return register_kprobe(&trampoline_p); | ||
741 | } | ||
742 | |||
743 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) | ||
744 | { | ||
745 | if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) | ||
746 | return 1; | ||
747 | |||
748 | return 0; | ||
749 | } | ||
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c deleted file mode 100644 index bc9ffd5c19cc..000000000000 --- a/arch/x86_64/kernel/ldt.c +++ /dev/null | |||
@@ -1,252 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ldt.c | ||
3 | * | ||
4 | * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | ||
5 | * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright (C) 2002 Andi Kleen | ||
7 | * | ||
8 | * This handles calls from both 32bit and 64bit mode. | ||
9 | */ | ||
10 | |||
11 | #include <linux/errno.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/smp.h> | ||
16 | #include <linux/vmalloc.h> | ||
17 | #include <linux/slab.h> | ||
18 | |||
19 | #include <asm/uaccess.h> | ||
20 | #include <asm/system.h> | ||
21 | #include <asm/ldt.h> | ||
22 | #include <asm/desc.h> | ||
23 | #include <asm/proto.h> | ||
24 | |||
25 | #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ | ||
26 | static void flush_ldt(void *null) | ||
27 | { | ||
28 | if (current->active_mm) | ||
29 | load_LDT(¤t->active_mm->context); | ||
30 | } | ||
31 | #endif | ||
32 | |||
33 | static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) | ||
34 | { | ||
35 | void *oldldt; | ||
36 | void *newldt; | ||
37 | unsigned oldsize; | ||
38 | |||
39 | if (mincount <= (unsigned)pc->size) | ||
40 | return 0; | ||
41 | oldsize = pc->size; | ||
42 | mincount = (mincount+511)&(~511); | ||
43 | if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
44 | newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | ||
45 | else | ||
46 | newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | ||
47 | |||
48 | if (!newldt) | ||
49 | return -ENOMEM; | ||
50 | |||
51 | if (oldsize) | ||
52 | memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | ||
53 | oldldt = pc->ldt; | ||
54 | memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | ||
55 | wmb(); | ||
56 | pc->ldt = newldt; | ||
57 | wmb(); | ||
58 | pc->size = mincount; | ||
59 | wmb(); | ||
60 | if (reload) { | ||
61 | #ifdef CONFIG_SMP | ||
62 | cpumask_t mask; | ||
63 | |||
64 | preempt_disable(); | ||
65 | mask = cpumask_of_cpu(smp_processor_id()); | ||
66 | load_LDT(pc); | ||
67 | if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | ||
68 | smp_call_function(flush_ldt, NULL, 1, 1); | ||
69 | preempt_enable(); | ||
70 | #else | ||
71 | load_LDT(pc); | ||
72 | #endif | ||
73 | } | ||
74 | if (oldsize) { | ||
75 | if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
76 | vfree(oldldt); | ||
77 | else | ||
78 | kfree(oldldt); | ||
79 | } | ||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | ||
84 | { | ||
85 | int err = alloc_ldt(new, old->size, 0); | ||
86 | if (err < 0) | ||
87 | return err; | ||
88 | memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * we do not have to muck with descriptors here, that is | ||
94 | * done in switch_mm() as needed. | ||
95 | */ | ||
96 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | ||
97 | { | ||
98 | struct mm_struct * old_mm; | ||
99 | int retval = 0; | ||
100 | |||
101 | init_MUTEX(&mm->context.sem); | ||
102 | mm->context.size = 0; | ||
103 | old_mm = current->mm; | ||
104 | if (old_mm && old_mm->context.size > 0) { | ||
105 | down(&old_mm->context.sem); | ||
106 | retval = copy_ldt(&mm->context, &old_mm->context); | ||
107 | up(&old_mm->context.sem); | ||
108 | } | ||
109 | return retval; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * | ||
114 | * Don't touch the LDT register - we're already in the next thread. | ||
115 | */ | ||
116 | void destroy_context(struct mm_struct *mm) | ||
117 | { | ||
118 | if (mm->context.size) { | ||
119 | if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
120 | vfree(mm->context.ldt); | ||
121 | else | ||
122 | kfree(mm->context.ldt); | ||
123 | mm->context.size = 0; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | static int read_ldt(void __user * ptr, unsigned long bytecount) | ||
128 | { | ||
129 | int err; | ||
130 | unsigned long size; | ||
131 | struct mm_struct * mm = current->mm; | ||
132 | |||
133 | if (!mm->context.size) | ||
134 | return 0; | ||
135 | if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | ||
136 | bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | ||
137 | |||
138 | down(&mm->context.sem); | ||
139 | size = mm->context.size*LDT_ENTRY_SIZE; | ||
140 | if (size > bytecount) | ||
141 | size = bytecount; | ||
142 | |||
143 | err = 0; | ||
144 | if (copy_to_user(ptr, mm->context.ldt, size)) | ||
145 | err = -EFAULT; | ||
146 | up(&mm->context.sem); | ||
147 | if (err < 0) | ||
148 | goto error_return; | ||
149 | if (size != bytecount) { | ||
150 | /* zero-fill the rest */ | ||
151 | if (clear_user(ptr+size, bytecount-size) != 0) { | ||
152 | err = -EFAULT; | ||
153 | goto error_return; | ||
154 | } | ||
155 | } | ||
156 | return bytecount; | ||
157 | error_return: | ||
158 | return err; | ||
159 | } | ||
160 | |||
161 | static int read_default_ldt(void __user * ptr, unsigned long bytecount) | ||
162 | { | ||
163 | /* Arbitrary number */ | ||
164 | /* x86-64 default LDT is all zeros */ | ||
165 | if (bytecount > 128) | ||
166 | bytecount = 128; | ||
167 | if (clear_user(ptr, bytecount)) | ||
168 | return -EFAULT; | ||
169 | return bytecount; | ||
170 | } | ||
171 | |||
172 | static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | ||
173 | { | ||
174 | struct task_struct *me = current; | ||
175 | struct mm_struct * mm = me->mm; | ||
176 | __u32 entry_1, entry_2, *lp; | ||
177 | int error; | ||
178 | struct user_desc ldt_info; | ||
179 | |||
180 | error = -EINVAL; | ||
181 | |||
182 | if (bytecount != sizeof(ldt_info)) | ||
183 | goto out; | ||
184 | error = -EFAULT; | ||
185 | if (copy_from_user(&ldt_info, ptr, bytecount)) | ||
186 | goto out; | ||
187 | |||
188 | error = -EINVAL; | ||
189 | if (ldt_info.entry_number >= LDT_ENTRIES) | ||
190 | goto out; | ||
191 | if (ldt_info.contents == 3) { | ||
192 | if (oldmode) | ||
193 | goto out; | ||
194 | if (ldt_info.seg_not_present == 0) | ||
195 | goto out; | ||
196 | } | ||
197 | |||
198 | down(&mm->context.sem); | ||
199 | if (ldt_info.entry_number >= (unsigned)mm->context.size) { | ||
200 | error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | ||
201 | if (error < 0) | ||
202 | goto out_unlock; | ||
203 | } | ||
204 | |||
205 | lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); | ||
206 | |||
207 | /* Allow LDTs to be cleared by the user. */ | ||
208 | if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | ||
209 | if (oldmode || LDT_empty(&ldt_info)) { | ||
210 | entry_1 = 0; | ||
211 | entry_2 = 0; | ||
212 | goto install; | ||
213 | } | ||
214 | } | ||
215 | |||
216 | entry_1 = LDT_entry_a(&ldt_info); | ||
217 | entry_2 = LDT_entry_b(&ldt_info); | ||
218 | if (oldmode) | ||
219 | entry_2 &= ~(1 << 20); | ||
220 | |||
221 | /* Install the new entry ... */ | ||
222 | install: | ||
223 | *lp = entry_1; | ||
224 | *(lp+1) = entry_2; | ||
225 | error = 0; | ||
226 | |||
227 | out_unlock: | ||
228 | up(&mm->context.sem); | ||
229 | out: | ||
230 | return error; | ||
231 | } | ||
232 | |||
233 | asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | ||
234 | { | ||
235 | int ret = -ENOSYS; | ||
236 | |||
237 | switch (func) { | ||
238 | case 0: | ||
239 | ret = read_ldt(ptr, bytecount); | ||
240 | break; | ||
241 | case 1: | ||
242 | ret = write_ldt(ptr, bytecount, 1); | ||
243 | break; | ||
244 | case 2: | ||
245 | ret = read_default_ldt(ptr, bytecount); | ||
246 | break; | ||
247 | case 0x11: | ||
248 | ret = write_ldt(ptr, bytecount, 0); | ||
249 | break; | ||
250 | } | ||
251 | return ret; | ||
252 | } | ||
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c deleted file mode 100644 index c3a554703672..000000000000 --- a/arch/x86_64/kernel/machine_kexec.c +++ /dev/null | |||
@@ -1,259 +0,0 @@ | |||
1 | /* | ||
2 | * machine_kexec.c - handle transition of Linux booting another kernel | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/kexec.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/reboot.h> | ||
13 | #include <asm/pgtable.h> | ||
14 | #include <asm/tlbflush.h> | ||
15 | #include <asm/mmu_context.h> | ||
16 | #include <asm/io.h> | ||
17 | |||
18 | #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) | ||
19 | static u64 kexec_pgd[512] PAGE_ALIGNED; | ||
20 | static u64 kexec_pud0[512] PAGE_ALIGNED; | ||
21 | static u64 kexec_pmd0[512] PAGE_ALIGNED; | ||
22 | static u64 kexec_pte0[512] PAGE_ALIGNED; | ||
23 | static u64 kexec_pud1[512] PAGE_ALIGNED; | ||
24 | static u64 kexec_pmd1[512] PAGE_ALIGNED; | ||
25 | static u64 kexec_pte1[512] PAGE_ALIGNED; | ||
26 | |||
27 | static void init_level2_page(pmd_t *level2p, unsigned long addr) | ||
28 | { | ||
29 | unsigned long end_addr; | ||
30 | |||
31 | addr &= PAGE_MASK; | ||
32 | end_addr = addr + PUD_SIZE; | ||
33 | while (addr < end_addr) { | ||
34 | set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); | ||
35 | addr += PMD_SIZE; | ||
36 | } | ||
37 | } | ||
38 | |||
39 | static int init_level3_page(struct kimage *image, pud_t *level3p, | ||
40 | unsigned long addr, unsigned long last_addr) | ||
41 | { | ||
42 | unsigned long end_addr; | ||
43 | int result; | ||
44 | |||
45 | result = 0; | ||
46 | addr &= PAGE_MASK; | ||
47 | end_addr = addr + PGDIR_SIZE; | ||
48 | while ((addr < last_addr) && (addr < end_addr)) { | ||
49 | struct page *page; | ||
50 | pmd_t *level2p; | ||
51 | |||
52 | page = kimage_alloc_control_pages(image, 0); | ||
53 | if (!page) { | ||
54 | result = -ENOMEM; | ||
55 | goto out; | ||
56 | } | ||
57 | level2p = (pmd_t *)page_address(page); | ||
58 | init_level2_page(level2p, addr); | ||
59 | set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); | ||
60 | addr += PUD_SIZE; | ||
61 | } | ||
62 | /* clear the unused entries */ | ||
63 | while (addr < end_addr) { | ||
64 | pud_clear(level3p++); | ||
65 | addr += PUD_SIZE; | ||
66 | } | ||
67 | out: | ||
68 | return result; | ||
69 | } | ||
70 | |||
71 | |||
72 | static int init_level4_page(struct kimage *image, pgd_t *level4p, | ||
73 | unsigned long addr, unsigned long last_addr) | ||
74 | { | ||
75 | unsigned long end_addr; | ||
76 | int result; | ||
77 | |||
78 | result = 0; | ||
79 | addr &= PAGE_MASK; | ||
80 | end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); | ||
81 | while ((addr < last_addr) && (addr < end_addr)) { | ||
82 | struct page *page; | ||
83 | pud_t *level3p; | ||
84 | |||
85 | page = kimage_alloc_control_pages(image, 0); | ||
86 | if (!page) { | ||
87 | result = -ENOMEM; | ||
88 | goto out; | ||
89 | } | ||
90 | level3p = (pud_t *)page_address(page); | ||
91 | result = init_level3_page(image, level3p, addr, last_addr); | ||
92 | if (result) { | ||
93 | goto out; | ||
94 | } | ||
95 | set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); | ||
96 | addr += PGDIR_SIZE; | ||
97 | } | ||
98 | /* clear the unused entries */ | ||
99 | while (addr < end_addr) { | ||
100 | pgd_clear(level4p++); | ||
101 | addr += PGDIR_SIZE; | ||
102 | } | ||
103 | out: | ||
104 | return result; | ||
105 | } | ||
106 | |||
107 | |||
108 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) | ||
109 | { | ||
110 | pgd_t *level4p; | ||
111 | level4p = (pgd_t *)__va(start_pgtable); | ||
112 | return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); | ||
113 | } | ||
114 | |||
115 | static void set_idt(void *newidt, u16 limit) | ||
116 | { | ||
117 | struct desc_ptr curidt; | ||
118 | |||
119 | /* x86-64 supports unaliged loads & stores */ | ||
120 | curidt.size = limit; | ||
121 | curidt.address = (unsigned long)newidt; | ||
122 | |||
123 | __asm__ __volatile__ ( | ||
124 | "lidtq %0\n" | ||
125 | : : "m" (curidt) | ||
126 | ); | ||
127 | }; | ||
128 | |||
129 | |||
130 | static void set_gdt(void *newgdt, u16 limit) | ||
131 | { | ||
132 | struct desc_ptr curgdt; | ||
133 | |||
134 | /* x86-64 supports unaligned loads & stores */ | ||
135 | curgdt.size = limit; | ||
136 | curgdt.address = (unsigned long)newgdt; | ||
137 | |||
138 | __asm__ __volatile__ ( | ||
139 | "lgdtq %0\n" | ||
140 | : : "m" (curgdt) | ||
141 | ); | ||
142 | }; | ||
143 | |||
144 | static void load_segments(void) | ||
145 | { | ||
146 | __asm__ __volatile__ ( | ||
147 | "\tmovl %0,%%ds\n" | ||
148 | "\tmovl %0,%%es\n" | ||
149 | "\tmovl %0,%%ss\n" | ||
150 | "\tmovl %0,%%fs\n" | ||
151 | "\tmovl %0,%%gs\n" | ||
152 | : : "a" (__KERNEL_DS) : "memory" | ||
153 | ); | ||
154 | } | ||
155 | |||
156 | int machine_kexec_prepare(struct kimage *image) | ||
157 | { | ||
158 | unsigned long start_pgtable; | ||
159 | int result; | ||
160 | |||
161 | /* Calculate the offsets */ | ||
162 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; | ||
163 | |||
164 | /* Setup the identity mapped 64bit page table */ | ||
165 | result = init_pgtable(image, start_pgtable); | ||
166 | if (result) | ||
167 | return result; | ||
168 | |||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | void machine_kexec_cleanup(struct kimage *image) | ||
173 | { | ||
174 | return; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Do not allocate memory (or fail in any way) in machine_kexec(). | ||
179 | * We are past the point of no return, committed to rebooting now. | ||
180 | */ | ||
181 | NORET_TYPE void machine_kexec(struct kimage *image) | ||
182 | { | ||
183 | unsigned long page_list[PAGES_NR]; | ||
184 | void *control_page; | ||
185 | |||
186 | /* Interrupts aren't acceptable while we reboot */ | ||
187 | local_irq_disable(); | ||
188 | |||
189 | control_page = page_address(image->control_code_page) + PAGE_SIZE; | ||
190 | memcpy(control_page, relocate_kernel, PAGE_SIZE); | ||
191 | |||
192 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); | ||
193 | page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; | ||
194 | page_list[PA_PGD] = virt_to_phys(&kexec_pgd); | ||
195 | page_list[VA_PGD] = (unsigned long)kexec_pgd; | ||
196 | page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0); | ||
197 | page_list[VA_PUD_0] = (unsigned long)kexec_pud0; | ||
198 | page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0); | ||
199 | page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; | ||
200 | page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0); | ||
201 | page_list[VA_PTE_0] = (unsigned long)kexec_pte0; | ||
202 | page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1); | ||
203 | page_list[VA_PUD_1] = (unsigned long)kexec_pud1; | ||
204 | page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1); | ||
205 | page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; | ||
206 | page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1); | ||
207 | page_list[VA_PTE_1] = (unsigned long)kexec_pte1; | ||
208 | |||
209 | page_list[PA_TABLE_PAGE] = | ||
210 | (unsigned long)__pa(page_address(image->control_code_page)); | ||
211 | |||
212 | /* The segment registers are funny things, they have both a | ||
213 | * visible and an invisible part. Whenever the visible part is | ||
214 | * set to a specific selector, the invisible part is loaded | ||
215 | * with from a table in memory. At no other time is the | ||
216 | * descriptor table in memory accessed. | ||
217 | * | ||
218 | * I take advantage of this here by force loading the | ||
219 | * segments, before I zap the gdt with an invalid value. | ||
220 | */ | ||
221 | load_segments(); | ||
222 | /* The gdt & idt are now invalid. | ||
223 | * If you want to load them you must set up your own idt & gdt. | ||
224 | */ | ||
225 | set_gdt(phys_to_virt(0),0); | ||
226 | set_idt(phys_to_virt(0),0); | ||
227 | |||
228 | /* now call it */ | ||
229 | relocate_kernel((unsigned long)image->head, (unsigned long)page_list, | ||
230 | image->start); | ||
231 | } | ||
232 | |||
233 | /* crashkernel=size@addr specifies the location to reserve for | ||
234 | * a crash kernel. By reserving this memory we guarantee | ||
235 | * that linux never set's it up as a DMA target. | ||
236 | * Useful for holding code to do something appropriate | ||
237 | * after a kernel panic. | ||
238 | */ | ||
239 | static int __init setup_crashkernel(char *arg) | ||
240 | { | ||
241 | unsigned long size, base; | ||
242 | char *p; | ||
243 | if (!arg) | ||
244 | return -EINVAL; | ||
245 | size = memparse(arg, &p); | ||
246 | if (arg == p) | ||
247 | return -EINVAL; | ||
248 | if (*p == '@') { | ||
249 | base = memparse(p+1, &p); | ||
250 | /* FIXME: Do I want a sanity check to validate the | ||
251 | * memory range? Yes you do, but it's too early for | ||
252 | * e820 -AK */ | ||
253 | crashk_res.start = base; | ||
254 | crashk_res.end = base + size - 1; | ||
255 | } | ||
256 | return 0; | ||
257 | } | ||
258 | early_param("crashkernel", setup_crashkernel); | ||
259 | |||
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c deleted file mode 100644 index a66d607f5b92..000000000000 --- a/arch/x86_64/kernel/mce.c +++ /dev/null | |||
@@ -1,875 +0,0 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | * Rest from unknown author(s). | ||
5 | * 2004 Andi Kleen. Rewrote most of it. | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/rcupdate.h> | ||
14 | #include <linux/kallsyms.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/miscdevice.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <linux/capability.h> | ||
19 | #include <linux/cpu.h> | ||
20 | #include <linux/percpu.h> | ||
21 | #include <linux/poll.h> | ||
22 | #include <linux/thread_info.h> | ||
23 | #include <linux/ctype.h> | ||
24 | #include <linux/kmod.h> | ||
25 | #include <linux/kdebug.h> | ||
26 | #include <asm/processor.h> | ||
27 | #include <asm/msr.h> | ||
28 | #include <asm/mce.h> | ||
29 | #include <asm/uaccess.h> | ||
30 | #include <asm/smp.h> | ||
31 | #include <asm/idle.h> | ||
32 | |||
33 | #define MISC_MCELOG_MINOR 227 | ||
34 | #define NR_BANKS 6 | ||
35 | |||
36 | atomic_t mce_entry; | ||
37 | |||
38 | static int mce_dont_init; | ||
39 | |||
40 | /* | ||
41 | * Tolerant levels: | ||
42 | * 0: always panic on uncorrected errors, log corrected errors | ||
43 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | ||
44 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | ||
45 | * 3: never panic or SIGBUS, log all errors (for testing only) | ||
46 | */ | ||
47 | static int tolerant = 1; | ||
48 | static int banks; | ||
49 | static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; | ||
50 | static unsigned long notify_user; | ||
51 | static int rip_msr; | ||
52 | static int mce_bootlog = 1; | ||
53 | static atomic_t mce_events; | ||
54 | |||
55 | static char trigger[128]; | ||
56 | static char *trigger_argv[2] = { trigger, NULL }; | ||
57 | |||
58 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | ||
59 | |||
60 | /* | ||
61 | * Lockless MCE logging infrastructure. | ||
62 | * This avoids deadlocks on printk locks without having to break locks. Also | ||
63 | * separate MCEs from kernel messages to avoid bogus bug reports. | ||
64 | */ | ||
65 | |||
66 | struct mce_log mcelog = { | ||
67 | MCE_LOG_SIGNATURE, | ||
68 | MCE_LOG_LEN, | ||
69 | }; | ||
70 | |||
71 | void mce_log(struct mce *mce) | ||
72 | { | ||
73 | unsigned next, entry; | ||
74 | atomic_inc(&mce_events); | ||
75 | mce->finished = 0; | ||
76 | wmb(); | ||
77 | for (;;) { | ||
78 | entry = rcu_dereference(mcelog.next); | ||
79 | /* The rmb forces the compiler to reload next in each | ||
80 | iteration */ | ||
81 | rmb(); | ||
82 | for (;;) { | ||
83 | /* When the buffer fills up discard new entries. Assume | ||
84 | that the earlier errors are the more interesting. */ | ||
85 | if (entry >= MCE_LOG_LEN) { | ||
86 | set_bit(MCE_OVERFLOW, &mcelog.flags); | ||
87 | return; | ||
88 | } | ||
89 | /* Old left over entry. Skip. */ | ||
90 | if (mcelog.entry[entry].finished) { | ||
91 | entry++; | ||
92 | continue; | ||
93 | } | ||
94 | break; | ||
95 | } | ||
96 | smp_rmb(); | ||
97 | next = entry + 1; | ||
98 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
99 | break; | ||
100 | } | ||
101 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
102 | wmb(); | ||
103 | mcelog.entry[entry].finished = 1; | ||
104 | wmb(); | ||
105 | |||
106 | set_bit(0, ¬ify_user); | ||
107 | } | ||
108 | |||
109 | static void print_mce(struct mce *m) | ||
110 | { | ||
111 | printk(KERN_EMERG "\n" | ||
112 | KERN_EMERG "HARDWARE ERROR\n" | ||
113 | KERN_EMERG | ||
114 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
115 | m->cpu, m->mcgstatus, m->bank, m->status); | ||
116 | if (m->rip) { | ||
117 | printk(KERN_EMERG | ||
118 | "RIP%s %02x:<%016Lx> ", | ||
119 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
120 | m->cs, m->rip); | ||
121 | if (m->cs == __KERNEL_CS) | ||
122 | print_symbol("{%s}", m->rip); | ||
123 | printk("\n"); | ||
124 | } | ||
125 | printk(KERN_EMERG "TSC %Lx ", m->tsc); | ||
126 | if (m->addr) | ||
127 | printk("ADDR %Lx ", m->addr); | ||
128 | if (m->misc) | ||
129 | printk("MISC %Lx ", m->misc); | ||
130 | printk("\n"); | ||
131 | printk(KERN_EMERG "This is not a software problem!\n"); | ||
132 | printk(KERN_EMERG | ||
133 | "Run through mcelog --ascii to decode and contact your hardware vendor\n"); | ||
134 | } | ||
135 | |||
136 | static void mce_panic(char *msg, struct mce *backup, unsigned long start) | ||
137 | { | ||
138 | int i; | ||
139 | |||
140 | oops_begin(); | ||
141 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
142 | unsigned long tsc = mcelog.entry[i].tsc; | ||
143 | if (time_before(tsc, start)) | ||
144 | continue; | ||
145 | print_mce(&mcelog.entry[i]); | ||
146 | if (backup && mcelog.entry[i].tsc == backup->tsc) | ||
147 | backup = NULL; | ||
148 | } | ||
149 | if (backup) | ||
150 | print_mce(backup); | ||
151 | panic(msg); | ||
152 | } | ||
153 | |||
154 | static int mce_available(struct cpuinfo_x86 *c) | ||
155 | { | ||
156 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | ||
157 | } | ||
158 | |||
159 | static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | ||
160 | { | ||
161 | if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { | ||
162 | m->rip = regs->rip; | ||
163 | m->cs = regs->cs; | ||
164 | } else { | ||
165 | m->rip = 0; | ||
166 | m->cs = 0; | ||
167 | } | ||
168 | if (rip_msr) { | ||
169 | /* Assume the RIP in the MSR is exact. Is this true? */ | ||
170 | m->mcgstatus |= MCG_STATUS_EIPV; | ||
171 | rdmsrl(rip_msr, m->rip); | ||
172 | m->cs = 0; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * The actual machine check handler | ||
178 | */ | ||
179 | |||
180 | void do_machine_check(struct pt_regs * regs, long error_code) | ||
181 | { | ||
182 | struct mce m, panicm; | ||
183 | u64 mcestart = 0; | ||
184 | int i; | ||
185 | int panicm_found = 0; | ||
186 | /* | ||
187 | * If no_way_out gets set, there is no safe way to recover from this | ||
188 | * MCE. If tolerant is cranked up, we'll try anyway. | ||
189 | */ | ||
190 | int no_way_out = 0; | ||
191 | /* | ||
192 | * If kill_it gets set, there might be a way to recover from this | ||
193 | * error. | ||
194 | */ | ||
195 | int kill_it = 0; | ||
196 | |||
197 | atomic_inc(&mce_entry); | ||
198 | |||
199 | if (regs) | ||
200 | notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); | ||
201 | if (!banks) | ||
202 | goto out2; | ||
203 | |||
204 | memset(&m, 0, sizeof(struct mce)); | ||
205 | m.cpu = smp_processor_id(); | ||
206 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
207 | /* if the restart IP is not valid, we're done for */ | ||
208 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
209 | no_way_out = 1; | ||
210 | |||
211 | rdtscll(mcestart); | ||
212 | barrier(); | ||
213 | |||
214 | for (i = 0; i < banks; i++) { | ||
215 | if (!bank[i]) | ||
216 | continue; | ||
217 | |||
218 | m.misc = 0; | ||
219 | m.addr = 0; | ||
220 | m.bank = i; | ||
221 | m.tsc = 0; | ||
222 | |||
223 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
224 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
225 | continue; | ||
226 | |||
227 | if (m.status & MCI_STATUS_EN) { | ||
228 | /* if PCC was set, there's no way out */ | ||
229 | no_way_out |= !!(m.status & MCI_STATUS_PCC); | ||
230 | /* | ||
231 | * If this error was uncorrectable and there was | ||
232 | * an overflow, we're in trouble. If no overflow, | ||
233 | * we might get away with just killing a task. | ||
234 | */ | ||
235 | if (m.status & MCI_STATUS_UC) { | ||
236 | if (tolerant < 1 || m.status & MCI_STATUS_OVER) | ||
237 | no_way_out = 1; | ||
238 | kill_it = 1; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | if (m.status & MCI_STATUS_MISCV) | ||
243 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
244 | if (m.status & MCI_STATUS_ADDRV) | ||
245 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
246 | |||
247 | mce_get_rip(&m, regs); | ||
248 | if (error_code >= 0) | ||
249 | rdtscll(m.tsc); | ||
250 | if (error_code != -2) | ||
251 | mce_log(&m); | ||
252 | |||
253 | /* Did this bank cause the exception? */ | ||
254 | /* Assume that the bank with uncorrectable errors did it, | ||
255 | and that there is only a single one. */ | ||
256 | if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { | ||
257 | panicm = m; | ||
258 | panicm_found = 1; | ||
259 | } | ||
260 | |||
261 | add_taint(TAINT_MACHINE_CHECK); | ||
262 | } | ||
263 | |||
264 | /* Never do anything final in the polling timer */ | ||
265 | if (!regs) | ||
266 | goto out; | ||
267 | |||
268 | /* If we didn't find an uncorrectable error, pick | ||
269 | the last one (shouldn't happen, just being safe). */ | ||
270 | if (!panicm_found) | ||
271 | panicm = m; | ||
272 | |||
273 | /* | ||
274 | * If we have decided that we just CAN'T continue, and the user | ||
275 | * has not set tolerant to an insane level, give up and die. | ||
276 | */ | ||
277 | if (no_way_out && tolerant < 3) | ||
278 | mce_panic("Machine check", &panicm, mcestart); | ||
279 | |||
280 | /* | ||
281 | * If the error seems to be unrecoverable, something should be | ||
282 | * done. Try to kill as little as possible. If we can kill just | ||
283 | * one task, do that. If the user has set the tolerance very | ||
284 | * high, don't try to do anything at all. | ||
285 | */ | ||
286 | if (kill_it && tolerant < 3) { | ||
287 | int user_space = 0; | ||
288 | |||
289 | /* | ||
290 | * If the EIPV bit is set, it means the saved IP is the | ||
291 | * instruction which caused the MCE. | ||
292 | */ | ||
293 | if (m.mcgstatus & MCG_STATUS_EIPV) | ||
294 | user_space = panicm.rip && (panicm.cs & 3); | ||
295 | |||
296 | /* | ||
297 | * If we know that the error was in user space, send a | ||
298 | * SIGBUS. Otherwise, panic if tolerance is low. | ||
299 | * | ||
300 | * do_exit() takes an awful lot of locks and has a slight | ||
301 | * risk of deadlocking. | ||
302 | */ | ||
303 | if (user_space) { | ||
304 | do_exit(SIGBUS); | ||
305 | } else if (panic_on_oops || tolerant < 2) { | ||
306 | mce_panic("Uncorrected machine check", | ||
307 | &panicm, mcestart); | ||
308 | } | ||
309 | } | ||
310 | |||
311 | /* notify userspace ASAP */ | ||
312 | set_thread_flag(TIF_MCE_NOTIFY); | ||
313 | |||
314 | out: | ||
315 | /* the last thing we do is clear state */ | ||
316 | for (i = 0; i < banks; i++) | ||
317 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
318 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
319 | out2: | ||
320 | atomic_dec(&mce_entry); | ||
321 | } | ||
322 | |||
323 | #ifdef CONFIG_X86_MCE_INTEL | ||
324 | /*** | ||
325 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog | ||
326 | * @cpu: The CPU on which the event occured. | ||
327 | * @status: Event status information | ||
328 | * | ||
329 | * This function should be called by the thermal interrupt after the | ||
330 | * event has been processed and the decision was made to log the event | ||
331 | * further. | ||
332 | * | ||
333 | * The status parameter will be saved to the 'status' field of 'struct mce' | ||
334 | * and historically has been the register value of the | ||
335 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | ||
336 | */ | ||
337 | void mce_log_therm_throt_event(unsigned int cpu, __u64 status) | ||
338 | { | ||
339 | struct mce m; | ||
340 | |||
341 | memset(&m, 0, sizeof(m)); | ||
342 | m.cpu = cpu; | ||
343 | m.bank = MCE_THERMAL_BANK; | ||
344 | m.status = status; | ||
345 | rdtscll(m.tsc); | ||
346 | mce_log(&m); | ||
347 | } | ||
348 | #endif /* CONFIG_X86_MCE_INTEL */ | ||
349 | |||
350 | /* | ||
351 | * Periodic polling timer for "silent" machine check errors. If the | ||
352 | * poller finds an MCE, poll 2x faster. When the poller finds no more | ||
353 | * errors, poll 2x slower (up to check_interval seconds). | ||
354 | */ | ||
355 | |||
356 | static int check_interval = 5 * 60; /* 5 minutes */ | ||
357 | static int next_interval; /* in jiffies */ | ||
358 | static void mcheck_timer(struct work_struct *work); | ||
359 | static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); | ||
360 | |||
361 | static void mcheck_check_cpu(void *info) | ||
362 | { | ||
363 | if (mce_available(¤t_cpu_data)) | ||
364 | do_machine_check(NULL, 0); | ||
365 | } | ||
366 | |||
367 | static void mcheck_timer(struct work_struct *work) | ||
368 | { | ||
369 | on_each_cpu(mcheck_check_cpu, NULL, 1, 1); | ||
370 | |||
371 | /* | ||
372 | * Alert userspace if needed. If we logged an MCE, reduce the | ||
373 | * polling interval, otherwise increase the polling interval. | ||
374 | */ | ||
375 | if (mce_notify_user()) { | ||
376 | next_interval = max(next_interval/2, HZ/100); | ||
377 | } else { | ||
378 | next_interval = min(next_interval*2, | ||
379 | (int)round_jiffies_relative(check_interval*HZ)); | ||
380 | } | ||
381 | |||
382 | schedule_delayed_work(&mcheck_work, next_interval); | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | * This is only called from process context. This is where we do | ||
387 | * anything we need to alert userspace about new MCEs. This is called | ||
388 | * directly from the poller and also from entry.S and idle, thanks to | ||
389 | * TIF_MCE_NOTIFY. | ||
390 | */ | ||
391 | int mce_notify_user(void) | ||
392 | { | ||
393 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
394 | if (test_and_clear_bit(0, ¬ify_user)) { | ||
395 | static unsigned long last_print; | ||
396 | unsigned long now = jiffies; | ||
397 | |||
398 | wake_up_interruptible(&mce_wait); | ||
399 | if (trigger[0]) | ||
400 | call_usermodehelper(trigger, trigger_argv, NULL, | ||
401 | UMH_NO_WAIT); | ||
402 | |||
403 | if (time_after_eq(now, last_print + (check_interval*HZ))) { | ||
404 | last_print = now; | ||
405 | printk(KERN_INFO "Machine check events logged\n"); | ||
406 | } | ||
407 | |||
408 | return 1; | ||
409 | } | ||
410 | return 0; | ||
411 | } | ||
412 | |||
413 | /* see if the idle task needs to notify userspace */ | ||
414 | static int | ||
415 | mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) | ||
416 | { | ||
417 | /* IDLE_END should be safe - interrupts are back on */ | ||
418 | if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) | ||
419 | mce_notify_user(); | ||
420 | |||
421 | return NOTIFY_OK; | ||
422 | } | ||
423 | |||
424 | static struct notifier_block mce_idle_notifier = { | ||
425 | .notifier_call = mce_idle_callback, | ||
426 | }; | ||
427 | |||
428 | static __init int periodic_mcheck_init(void) | ||
429 | { | ||
430 | next_interval = check_interval * HZ; | ||
431 | if (next_interval) | ||
432 | schedule_delayed_work(&mcheck_work, | ||
433 | round_jiffies_relative(next_interval)); | ||
434 | idle_notifier_register(&mce_idle_notifier); | ||
435 | return 0; | ||
436 | } | ||
437 | __initcall(periodic_mcheck_init); | ||
438 | |||
439 | |||
440 | /* | ||
441 | * Initialize Machine Checks for a CPU. | ||
442 | */ | ||
443 | static void mce_init(void *dummy) | ||
444 | { | ||
445 | u64 cap; | ||
446 | int i; | ||
447 | |||
448 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
449 | banks = cap & 0xff; | ||
450 | if (banks > NR_BANKS) { | ||
451 | printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); | ||
452 | banks = NR_BANKS; | ||
453 | } | ||
454 | /* Use accurate RIP reporting if available. */ | ||
455 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | ||
456 | rip_msr = MSR_IA32_MCG_EIP; | ||
457 | |||
458 | /* Log the machine checks left over from the previous reset. | ||
459 | This also clears all registers */ | ||
460 | do_machine_check(NULL, mce_bootlog ? -1 : -2); | ||
461 | |||
462 | set_in_cr4(X86_CR4_MCE); | ||
463 | |||
464 | if (cap & MCG_CTL_P) | ||
465 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
466 | |||
467 | for (i = 0; i < banks; i++) { | ||
468 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
469 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
470 | } | ||
471 | } | ||
472 | |||
473 | /* Add per CPU specific workarounds here */ | ||
474 | static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
475 | { | ||
476 | /* This should be disabled by the BIOS, but isn't always */ | ||
477 | if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { | ||
478 | /* disable GART TBL walk error reporting, which trips off | ||
479 | incorrectly with the IOMMU & 3ware & Cerberus. */ | ||
480 | clear_bit(10, &bank[4]); | ||
481 | /* Lots of broken BIOS around that don't clear them | ||
482 | by default and leave crap in there. Don't log. */ | ||
483 | mce_bootlog = 0; | ||
484 | } | ||
485 | |||
486 | } | ||
487 | |||
488 | static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) | ||
489 | { | ||
490 | switch (c->x86_vendor) { | ||
491 | case X86_VENDOR_INTEL: | ||
492 | mce_intel_feature_init(c); | ||
493 | break; | ||
494 | case X86_VENDOR_AMD: | ||
495 | mce_amd_feature_init(c); | ||
496 | break; | ||
497 | default: | ||
498 | break; | ||
499 | } | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Called for each booted CPU to set up machine checks. | ||
504 | * Must be called with preempt off. | ||
505 | */ | ||
506 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | ||
507 | { | ||
508 | static cpumask_t mce_cpus = CPU_MASK_NONE; | ||
509 | |||
510 | mce_cpu_quirks(c); | ||
511 | |||
512 | if (mce_dont_init || | ||
513 | cpu_test_and_set(smp_processor_id(), mce_cpus) || | ||
514 | !mce_available(c)) | ||
515 | return; | ||
516 | |||
517 | mce_init(NULL); | ||
518 | mce_cpu_features(c); | ||
519 | } | ||
520 | |||
521 | /* | ||
522 | * Character device to read and clear the MCE log. | ||
523 | */ | ||
524 | |||
525 | static DEFINE_SPINLOCK(mce_state_lock); | ||
526 | static int open_count; /* #times opened */ | ||
527 | static int open_exclu; /* already open exclusive? */ | ||
528 | |||
529 | static int mce_open(struct inode *inode, struct file *file) | ||
530 | { | ||
531 | spin_lock(&mce_state_lock); | ||
532 | |||
533 | if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { | ||
534 | spin_unlock(&mce_state_lock); | ||
535 | return -EBUSY; | ||
536 | } | ||
537 | |||
538 | if (file->f_flags & O_EXCL) | ||
539 | open_exclu = 1; | ||
540 | open_count++; | ||
541 | |||
542 | spin_unlock(&mce_state_lock); | ||
543 | |||
544 | return nonseekable_open(inode, file); | ||
545 | } | ||
546 | |||
547 | static int mce_release(struct inode *inode, struct file *file) | ||
548 | { | ||
549 | spin_lock(&mce_state_lock); | ||
550 | |||
551 | open_count--; | ||
552 | open_exclu = 0; | ||
553 | |||
554 | spin_unlock(&mce_state_lock); | ||
555 | |||
556 | return 0; | ||
557 | } | ||
558 | |||
559 | static void collect_tscs(void *data) | ||
560 | { | ||
561 | unsigned long *cpu_tsc = (unsigned long *)data; | ||
562 | rdtscll(cpu_tsc[smp_processor_id()]); | ||
563 | } | ||
564 | |||
565 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) | ||
566 | { | ||
567 | unsigned long *cpu_tsc; | ||
568 | static DECLARE_MUTEX(mce_read_sem); | ||
569 | unsigned next; | ||
570 | char __user *buf = ubuf; | ||
571 | int i, err; | ||
572 | |||
573 | cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); | ||
574 | if (!cpu_tsc) | ||
575 | return -ENOMEM; | ||
576 | |||
577 | down(&mce_read_sem); | ||
578 | next = rcu_dereference(mcelog.next); | ||
579 | |||
580 | /* Only supports full reads right now */ | ||
581 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
582 | up(&mce_read_sem); | ||
583 | kfree(cpu_tsc); | ||
584 | return -EINVAL; | ||
585 | } | ||
586 | |||
587 | err = 0; | ||
588 | for (i = 0; i < next; i++) { | ||
589 | unsigned long start = jiffies; | ||
590 | while (!mcelog.entry[i].finished) { | ||
591 | if (time_after_eq(jiffies, start + 2)) { | ||
592 | memset(mcelog.entry + i,0, sizeof(struct mce)); | ||
593 | goto timeout; | ||
594 | } | ||
595 | cpu_relax(); | ||
596 | } | ||
597 | smp_rmb(); | ||
598 | err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); | ||
599 | buf += sizeof(struct mce); | ||
600 | timeout: | ||
601 | ; | ||
602 | } | ||
603 | |||
604 | memset(mcelog.entry, 0, next * sizeof(struct mce)); | ||
605 | mcelog.next = 0; | ||
606 | |||
607 | synchronize_sched(); | ||
608 | |||
609 | /* Collect entries that were still getting written before the synchronize. */ | ||
610 | |||
611 | on_each_cpu(collect_tscs, cpu_tsc, 1, 1); | ||
612 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
613 | if (mcelog.entry[i].finished && | ||
614 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
615 | err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); | ||
616 | smp_rmb(); | ||
617 | buf += sizeof(struct mce); | ||
618 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
619 | } | ||
620 | } | ||
621 | up(&mce_read_sem); | ||
622 | kfree(cpu_tsc); | ||
623 | return err ? -EFAULT : buf - ubuf; | ||
624 | } | ||
625 | |||
626 | static unsigned int mce_poll(struct file *file, poll_table *wait) | ||
627 | { | ||
628 | poll_wait(file, &mce_wait, wait); | ||
629 | if (rcu_dereference(mcelog.next)) | ||
630 | return POLLIN | POLLRDNORM; | ||
631 | return 0; | ||
632 | } | ||
633 | |||
634 | static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) | ||
635 | { | ||
636 | int __user *p = (int __user *)arg; | ||
637 | if (!capable(CAP_SYS_ADMIN)) | ||
638 | return -EPERM; | ||
639 | switch (cmd) { | ||
640 | case MCE_GET_RECORD_LEN: | ||
641 | return put_user(sizeof(struct mce), p); | ||
642 | case MCE_GET_LOG_LEN: | ||
643 | return put_user(MCE_LOG_LEN, p); | ||
644 | case MCE_GETCLEAR_FLAGS: { | ||
645 | unsigned flags; | ||
646 | do { | ||
647 | flags = mcelog.flags; | ||
648 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
649 | return put_user(flags, p); | ||
650 | } | ||
651 | default: | ||
652 | return -ENOTTY; | ||
653 | } | ||
654 | } | ||
655 | |||
656 | static const struct file_operations mce_chrdev_ops = { | ||
657 | .open = mce_open, | ||
658 | .release = mce_release, | ||
659 | .read = mce_read, | ||
660 | .poll = mce_poll, | ||
661 | .ioctl = mce_ioctl, | ||
662 | }; | ||
663 | |||
664 | static struct miscdevice mce_log_device = { | ||
665 | MISC_MCELOG_MINOR, | ||
666 | "mcelog", | ||
667 | &mce_chrdev_ops, | ||
668 | }; | ||
669 | |||
670 | static unsigned long old_cr4 __initdata; | ||
671 | |||
672 | void __init stop_mce(void) | ||
673 | { | ||
674 | old_cr4 = read_cr4(); | ||
675 | clear_in_cr4(X86_CR4_MCE); | ||
676 | } | ||
677 | |||
678 | void __init restart_mce(void) | ||
679 | { | ||
680 | if (old_cr4 & X86_CR4_MCE) | ||
681 | set_in_cr4(X86_CR4_MCE); | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * Old style boot options parsing. Only for compatibility. | ||
686 | */ | ||
687 | |||
688 | static int __init mcheck_disable(char *str) | ||
689 | { | ||
690 | mce_dont_init = 1; | ||
691 | return 1; | ||
692 | } | ||
693 | |||
694 | /* mce=off disables machine check. Note you can reenable it later | ||
695 | using sysfs. | ||
696 | mce=TOLERANCELEVEL (number, see above) | ||
697 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | ||
698 | mce=nobootlog Don't log MCEs from before booting. */ | ||
699 | static int __init mcheck_enable(char *str) | ||
700 | { | ||
701 | if (*str == '=') | ||
702 | str++; | ||
703 | if (!strcmp(str, "off")) | ||
704 | mce_dont_init = 1; | ||
705 | else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) | ||
706 | mce_bootlog = str[0] == 'b'; | ||
707 | else if (isdigit(str[0])) | ||
708 | get_option(&str, &tolerant); | ||
709 | else | ||
710 | printk("mce= argument %s ignored. Please use /sys", str); | ||
711 | return 1; | ||
712 | } | ||
713 | |||
714 | __setup("nomce", mcheck_disable); | ||
715 | __setup("mce", mcheck_enable); | ||
716 | |||
717 | /* | ||
718 | * Sysfs support | ||
719 | */ | ||
720 | |||
721 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. | ||
722 | Only one CPU is active at this time, the others get readded later using | ||
723 | CPU hotplug. */ | ||
724 | static int mce_resume(struct sys_device *dev) | ||
725 | { | ||
726 | mce_init(NULL); | ||
727 | return 0; | ||
728 | } | ||
729 | |||
730 | /* Reinit MCEs after user configuration changes */ | ||
731 | static void mce_restart(void) | ||
732 | { | ||
733 | if (next_interval) | ||
734 | cancel_delayed_work(&mcheck_work); | ||
735 | /* Timer race is harmless here */ | ||
736 | on_each_cpu(mce_init, NULL, 1, 1); | ||
737 | next_interval = check_interval * HZ; | ||
738 | if (next_interval) | ||
739 | schedule_delayed_work(&mcheck_work, | ||
740 | round_jiffies_relative(next_interval)); | ||
741 | } | ||
742 | |||
743 | static struct sysdev_class mce_sysclass = { | ||
744 | .resume = mce_resume, | ||
745 | set_kset_name("machinecheck"), | ||
746 | }; | ||
747 | |||
748 | DEFINE_PER_CPU(struct sys_device, device_mce); | ||
749 | |||
750 | /* Why are there no generic functions for this? */ | ||
751 | #define ACCESSOR(name, var, start) \ | ||
752 | static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ | ||
753 | return sprintf(buf, "%lx\n", (unsigned long)var); \ | ||
754 | } \ | ||
755 | static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ | ||
756 | char *end; \ | ||
757 | unsigned long new = simple_strtoul(buf, &end, 0); \ | ||
758 | if (end == buf) return -EINVAL; \ | ||
759 | var = new; \ | ||
760 | start; \ | ||
761 | return end-buf; \ | ||
762 | } \ | ||
763 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | ||
764 | |||
765 | /* TBD should generate these dynamically based on number of available banks */ | ||
766 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | ||
767 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | ||
768 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | ||
769 | ACCESSOR(bank3ctl,bank[3],mce_restart()) | ||
770 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | ||
771 | ACCESSOR(bank5ctl,bank[5],mce_restart()) | ||
772 | |||
773 | static ssize_t show_trigger(struct sys_device *s, char *buf) | ||
774 | { | ||
775 | strcpy(buf, trigger); | ||
776 | strcat(buf, "\n"); | ||
777 | return strlen(trigger) + 1; | ||
778 | } | ||
779 | |||
780 | static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) | ||
781 | { | ||
782 | char *p; | ||
783 | int len; | ||
784 | strncpy(trigger, buf, sizeof(trigger)); | ||
785 | trigger[sizeof(trigger)-1] = 0; | ||
786 | len = strlen(trigger); | ||
787 | p = strchr(trigger, '\n'); | ||
788 | if (*p) *p = 0; | ||
789 | return len; | ||
790 | } | ||
791 | |||
792 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | ||
793 | ACCESSOR(tolerant,tolerant,) | ||
794 | ACCESSOR(check_interval,check_interval,mce_restart()) | ||
795 | static struct sysdev_attribute *mce_attributes[] = { | ||
796 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | ||
797 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, | ||
798 | &attr_tolerant, &attr_check_interval, &attr_trigger, | ||
799 | NULL | ||
800 | }; | ||
801 | |||
802 | /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ | ||
803 | static __cpuinit int mce_create_device(unsigned int cpu) | ||
804 | { | ||
805 | int err; | ||
806 | int i; | ||
807 | if (!mce_available(&cpu_data[cpu])) | ||
808 | return -EIO; | ||
809 | |||
810 | per_cpu(device_mce,cpu).id = cpu; | ||
811 | per_cpu(device_mce,cpu).cls = &mce_sysclass; | ||
812 | |||
813 | err = sysdev_register(&per_cpu(device_mce,cpu)); | ||
814 | |||
815 | if (!err) { | ||
816 | for (i = 0; mce_attributes[i]; i++) | ||
817 | sysdev_create_file(&per_cpu(device_mce,cpu), | ||
818 | mce_attributes[i]); | ||
819 | } | ||
820 | return err; | ||
821 | } | ||
822 | |||
823 | static void mce_remove_device(unsigned int cpu) | ||
824 | { | ||
825 | int i; | ||
826 | |||
827 | for (i = 0; mce_attributes[i]; i++) | ||
828 | sysdev_remove_file(&per_cpu(device_mce,cpu), | ||
829 | mce_attributes[i]); | ||
830 | sysdev_unregister(&per_cpu(device_mce,cpu)); | ||
831 | memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); | ||
832 | } | ||
833 | |||
834 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | ||
835 | static int | ||
836 | mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
837 | { | ||
838 | unsigned int cpu = (unsigned long)hcpu; | ||
839 | |||
840 | switch (action) { | ||
841 | case CPU_ONLINE: | ||
842 | case CPU_ONLINE_FROZEN: | ||
843 | mce_create_device(cpu); | ||
844 | break; | ||
845 | case CPU_DEAD: | ||
846 | case CPU_DEAD_FROZEN: | ||
847 | mce_remove_device(cpu); | ||
848 | break; | ||
849 | } | ||
850 | return NOTIFY_OK; | ||
851 | } | ||
852 | |||
853 | static struct notifier_block mce_cpu_notifier = { | ||
854 | .notifier_call = mce_cpu_callback, | ||
855 | }; | ||
856 | |||
857 | static __init int mce_init_device(void) | ||
858 | { | ||
859 | int err; | ||
860 | int i = 0; | ||
861 | |||
862 | if (!mce_available(&boot_cpu_data)) | ||
863 | return -EIO; | ||
864 | err = sysdev_class_register(&mce_sysclass); | ||
865 | |||
866 | for_each_online_cpu(i) { | ||
867 | mce_create_device(i); | ||
868 | } | ||
869 | |||
870 | register_hotcpu_notifier(&mce_cpu_notifier); | ||
871 | misc_register(&mce_log_device); | ||
872 | return err; | ||
873 | } | ||
874 | |||
875 | device_initcall(mce_init_device); | ||
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c deleted file mode 100644 index 2f8a7f18b0fe..000000000000 --- a/arch/x86_64/kernel/mce_amd.c +++ /dev/null | |||
@@ -1,689 +0,0 @@ | |||
1 | /* | ||
2 | * (c) 2005, 2006 Advanced Micro Devices, Inc. | ||
3 | * Your use of this code is subject to the terms and conditions of the | ||
4 | * GNU general public license version 2. See "COPYING" or | ||
5 | * http://www.gnu.org/licenses/gpl.html | ||
6 | * | ||
7 | * Written by Jacob Shin - AMD, Inc. | ||
8 | * | ||
9 | * Support : jacob.shin@amd.com | ||
10 | * | ||
11 | * April 2006 | ||
12 | * - added support for AMD Family 0x10 processors | ||
13 | * | ||
14 | * All MC4_MISCi registers are shared between multi-cores | ||
15 | */ | ||
16 | |||
17 | #include <linux/cpu.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/interrupt.h> | ||
21 | #include <linux/kobject.h> | ||
22 | #include <linux/notifier.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/smp.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/sysfs.h> | ||
27 | #include <asm/apic.h> | ||
28 | #include <asm/mce.h> | ||
29 | #include <asm/msr.h> | ||
30 | #include <asm/percpu.h> | ||
31 | #include <asm/idle.h> | ||
32 | |||
33 | #define PFX "mce_threshold: " | ||
34 | #define VERSION "version 1.1.1" | ||
35 | #define NR_BANKS 6 | ||
36 | #define NR_BLOCKS 9 | ||
37 | #define THRESHOLD_MAX 0xFFF | ||
38 | #define INT_TYPE_APIC 0x00020000 | ||
39 | #define MASK_VALID_HI 0x80000000 | ||
40 | #define MASK_CNTP_HI 0x40000000 | ||
41 | #define MASK_LOCKED_HI 0x20000000 | ||
42 | #define MASK_LVTOFF_HI 0x00F00000 | ||
43 | #define MASK_COUNT_EN_HI 0x00080000 | ||
44 | #define MASK_INT_TYPE_HI 0x00060000 | ||
45 | #define MASK_OVERFLOW_HI 0x00010000 | ||
46 | #define MASK_ERR_COUNT_HI 0x00000FFF | ||
47 | #define MASK_BLKPTR_LO 0xFF000000 | ||
48 | #define MCG_XBLK_ADDR 0xC0000400 | ||
49 | |||
50 | struct threshold_block { | ||
51 | unsigned int block; | ||
52 | unsigned int bank; | ||
53 | unsigned int cpu; | ||
54 | u32 address; | ||
55 | u16 interrupt_enable; | ||
56 | u16 threshold_limit; | ||
57 | struct kobject kobj; | ||
58 | struct list_head miscj; | ||
59 | }; | ||
60 | |||
61 | /* defaults used early on boot */ | ||
62 | static struct threshold_block threshold_defaults = { | ||
63 | .interrupt_enable = 0, | ||
64 | .threshold_limit = THRESHOLD_MAX, | ||
65 | }; | ||
66 | |||
67 | struct threshold_bank { | ||
68 | struct kobject kobj; | ||
69 | struct threshold_block *blocks; | ||
70 | cpumask_t cpus; | ||
71 | }; | ||
72 | static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); | ||
73 | |||
74 | #ifdef CONFIG_SMP | ||
75 | static unsigned char shared_bank[NR_BANKS] = { | ||
76 | 0, 0, 0, 0, 1 | ||
77 | }; | ||
78 | #endif | ||
79 | |||
80 | static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ | ||
81 | |||
82 | /* | ||
83 | * CPU Initialization | ||
84 | */ | ||
85 | |||
86 | /* must be called with correct cpu affinity */ | ||
87 | static void threshold_restart_bank(struct threshold_block *b, | ||
88 | int reset, u16 old_limit) | ||
89 | { | ||
90 | u32 mci_misc_hi, mci_misc_lo; | ||
91 | |||
92 | rdmsr(b->address, mci_misc_lo, mci_misc_hi); | ||
93 | |||
94 | if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) | ||
95 | reset = 1; /* limit cannot be lower than err count */ | ||
96 | |||
97 | if (reset) { /* reset err count and overflow bit */ | ||
98 | mci_misc_hi = | ||
99 | (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | | ||
100 | (THRESHOLD_MAX - b->threshold_limit); | ||
101 | } else if (old_limit) { /* change limit w/o reset */ | ||
102 | int new_count = (mci_misc_hi & THRESHOLD_MAX) + | ||
103 | (old_limit - b->threshold_limit); | ||
104 | mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | | ||
105 | (new_count & THRESHOLD_MAX); | ||
106 | } | ||
107 | |||
108 | b->interrupt_enable ? | ||
109 | (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : | ||
110 | (mci_misc_hi &= ~MASK_INT_TYPE_HI); | ||
111 | |||
112 | mci_misc_hi |= MASK_COUNT_EN_HI; | ||
113 | wrmsr(b->address, mci_misc_lo, mci_misc_hi); | ||
114 | } | ||
115 | |||
116 | /* cpu init entry point, called from mce.c with preempt off */ | ||
117 | void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) | ||
118 | { | ||
119 | unsigned int bank, block; | ||
120 | unsigned int cpu = smp_processor_id(); | ||
121 | u32 low = 0, high = 0, address = 0; | ||
122 | |||
123 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
124 | for (block = 0; block < NR_BLOCKS; ++block) { | ||
125 | if (block == 0) | ||
126 | address = MSR_IA32_MC0_MISC + bank * 4; | ||
127 | else if (block == 1) { | ||
128 | address = (low & MASK_BLKPTR_LO) >> 21; | ||
129 | if (!address) | ||
130 | break; | ||
131 | address += MCG_XBLK_ADDR; | ||
132 | } | ||
133 | else | ||
134 | ++address; | ||
135 | |||
136 | if (rdmsr_safe(address, &low, &high)) | ||
137 | break; | ||
138 | |||
139 | if (!(high & MASK_VALID_HI)) { | ||
140 | if (block) | ||
141 | continue; | ||
142 | else | ||
143 | break; | ||
144 | } | ||
145 | |||
146 | if (!(high & MASK_CNTP_HI) || | ||
147 | (high & MASK_LOCKED_HI)) | ||
148 | continue; | ||
149 | |||
150 | if (!block) | ||
151 | per_cpu(bank_map, cpu) |= (1 << bank); | ||
152 | #ifdef CONFIG_SMP | ||
153 | if (shared_bank[bank] && c->cpu_core_id) | ||
154 | break; | ||
155 | #endif | ||
156 | high &= ~MASK_LVTOFF_HI; | ||
157 | high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; | ||
158 | wrmsr(address, low, high); | ||
159 | |||
160 | setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, | ||
161 | THRESHOLD_APIC_VECTOR, | ||
162 | K8_APIC_EXT_INT_MSG_FIX, 0); | ||
163 | |||
164 | threshold_defaults.address = address; | ||
165 | threshold_restart_bank(&threshold_defaults, 0, 0); | ||
166 | } | ||
167 | } | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * APIC Interrupt Handler | ||
172 | */ | ||
173 | |||
174 | /* | ||
175 | * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. | ||
176 | * the interrupt goes off when error_count reaches threshold_limit. | ||
177 | * the handler will simply log mcelog w/ software defined bank number. | ||
178 | */ | ||
179 | asmlinkage void mce_threshold_interrupt(void) | ||
180 | { | ||
181 | unsigned int bank, block; | ||
182 | struct mce m; | ||
183 | u32 low = 0, high = 0, address = 0; | ||
184 | |||
185 | ack_APIC_irq(); | ||
186 | exit_idle(); | ||
187 | irq_enter(); | ||
188 | |||
189 | memset(&m, 0, sizeof(m)); | ||
190 | rdtscll(m.tsc); | ||
191 | m.cpu = smp_processor_id(); | ||
192 | |||
193 | /* assume first bank caused it */ | ||
194 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
195 | if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) | ||
196 | continue; | ||
197 | for (block = 0; block < NR_BLOCKS; ++block) { | ||
198 | if (block == 0) | ||
199 | address = MSR_IA32_MC0_MISC + bank * 4; | ||
200 | else if (block == 1) { | ||
201 | address = (low & MASK_BLKPTR_LO) >> 21; | ||
202 | if (!address) | ||
203 | break; | ||
204 | address += MCG_XBLK_ADDR; | ||
205 | } | ||
206 | else | ||
207 | ++address; | ||
208 | |||
209 | if (rdmsr_safe(address, &low, &high)) | ||
210 | break; | ||
211 | |||
212 | if (!(high & MASK_VALID_HI)) { | ||
213 | if (block) | ||
214 | continue; | ||
215 | else | ||
216 | break; | ||
217 | } | ||
218 | |||
219 | if (!(high & MASK_CNTP_HI) || | ||
220 | (high & MASK_LOCKED_HI)) | ||
221 | continue; | ||
222 | |||
223 | /* Log the machine check that caused the threshold | ||
224 | event. */ | ||
225 | do_machine_check(NULL, 0); | ||
226 | |||
227 | if (high & MASK_OVERFLOW_HI) { | ||
228 | rdmsrl(address, m.misc); | ||
229 | rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, | ||
230 | m.status); | ||
231 | m.bank = K8_MCE_THRESHOLD_BASE | ||
232 | + bank * NR_BLOCKS | ||
233 | + block; | ||
234 | mce_log(&m); | ||
235 | goto out; | ||
236 | } | ||
237 | } | ||
238 | } | ||
239 | out: | ||
240 | irq_exit(); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Sysfs Interface | ||
245 | */ | ||
246 | |||
247 | struct threshold_attr { | ||
248 | struct attribute attr; | ||
249 | ssize_t(*show) (struct threshold_block *, char *); | ||
250 | ssize_t(*store) (struct threshold_block *, const char *, size_t count); | ||
251 | }; | ||
252 | |||
253 | static cpumask_t affinity_set(unsigned int cpu) | ||
254 | { | ||
255 | cpumask_t oldmask = current->cpus_allowed; | ||
256 | cpumask_t newmask = CPU_MASK_NONE; | ||
257 | cpu_set(cpu, newmask); | ||
258 | set_cpus_allowed(current, newmask); | ||
259 | return oldmask; | ||
260 | } | ||
261 | |||
262 | static void affinity_restore(cpumask_t oldmask) | ||
263 | { | ||
264 | set_cpus_allowed(current, oldmask); | ||
265 | } | ||
266 | |||
267 | #define SHOW_FIELDS(name) \ | ||
268 | static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ | ||
269 | { \ | ||
270 | return sprintf(buf, "%lx\n", (unsigned long) b->name); \ | ||
271 | } | ||
272 | SHOW_FIELDS(interrupt_enable) | ||
273 | SHOW_FIELDS(threshold_limit) | ||
274 | |||
275 | static ssize_t store_interrupt_enable(struct threshold_block *b, | ||
276 | const char *buf, size_t count) | ||
277 | { | ||
278 | char *end; | ||
279 | cpumask_t oldmask; | ||
280 | unsigned long new = simple_strtoul(buf, &end, 0); | ||
281 | if (end == buf) | ||
282 | return -EINVAL; | ||
283 | b->interrupt_enable = !!new; | ||
284 | |||
285 | oldmask = affinity_set(b->cpu); | ||
286 | threshold_restart_bank(b, 0, 0); | ||
287 | affinity_restore(oldmask); | ||
288 | |||
289 | return end - buf; | ||
290 | } | ||
291 | |||
292 | static ssize_t store_threshold_limit(struct threshold_block *b, | ||
293 | const char *buf, size_t count) | ||
294 | { | ||
295 | char *end; | ||
296 | cpumask_t oldmask; | ||
297 | u16 old; | ||
298 | unsigned long new = simple_strtoul(buf, &end, 0); | ||
299 | if (end == buf) | ||
300 | return -EINVAL; | ||
301 | if (new > THRESHOLD_MAX) | ||
302 | new = THRESHOLD_MAX; | ||
303 | if (new < 1) | ||
304 | new = 1; | ||
305 | old = b->threshold_limit; | ||
306 | b->threshold_limit = new; | ||
307 | |||
308 | oldmask = affinity_set(b->cpu); | ||
309 | threshold_restart_bank(b, 0, old); | ||
310 | affinity_restore(oldmask); | ||
311 | |||
312 | return end - buf; | ||
313 | } | ||
314 | |||
315 | static ssize_t show_error_count(struct threshold_block *b, char *buf) | ||
316 | { | ||
317 | u32 high, low; | ||
318 | cpumask_t oldmask; | ||
319 | oldmask = affinity_set(b->cpu); | ||
320 | rdmsr(b->address, low, high); | ||
321 | affinity_restore(oldmask); | ||
322 | return sprintf(buf, "%x\n", | ||
323 | (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); | ||
324 | } | ||
325 | |||
326 | static ssize_t store_error_count(struct threshold_block *b, | ||
327 | const char *buf, size_t count) | ||
328 | { | ||
329 | cpumask_t oldmask; | ||
330 | oldmask = affinity_set(b->cpu); | ||
331 | threshold_restart_bank(b, 1, 0); | ||
332 | affinity_restore(oldmask); | ||
333 | return 1; | ||
334 | } | ||
335 | |||
336 | #define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ | ||
337 | .attr = {.name = __stringify(_name), .mode = _mode }, \ | ||
338 | .show = _show, \ | ||
339 | .store = _store, \ | ||
340 | }; | ||
341 | |||
342 | #define RW_ATTR(name) \ | ||
343 | static struct threshold_attr name = \ | ||
344 | THRESHOLD_ATTR(name, 0644, show_## name, store_## name) | ||
345 | |||
346 | RW_ATTR(interrupt_enable); | ||
347 | RW_ATTR(threshold_limit); | ||
348 | RW_ATTR(error_count); | ||
349 | |||
350 | static struct attribute *default_attrs[] = { | ||
351 | &interrupt_enable.attr, | ||
352 | &threshold_limit.attr, | ||
353 | &error_count.attr, | ||
354 | NULL | ||
355 | }; | ||
356 | |||
357 | #define to_block(k) container_of(k, struct threshold_block, kobj) | ||
358 | #define to_attr(a) container_of(a, struct threshold_attr, attr) | ||
359 | |||
360 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) | ||
361 | { | ||
362 | struct threshold_block *b = to_block(kobj); | ||
363 | struct threshold_attr *a = to_attr(attr); | ||
364 | ssize_t ret; | ||
365 | ret = a->show ? a->show(b, buf) : -EIO; | ||
366 | return ret; | ||
367 | } | ||
368 | |||
369 | static ssize_t store(struct kobject *kobj, struct attribute *attr, | ||
370 | const char *buf, size_t count) | ||
371 | { | ||
372 | struct threshold_block *b = to_block(kobj); | ||
373 | struct threshold_attr *a = to_attr(attr); | ||
374 | ssize_t ret; | ||
375 | ret = a->store ? a->store(b, buf, count) : -EIO; | ||
376 | return ret; | ||
377 | } | ||
378 | |||
379 | static struct sysfs_ops threshold_ops = { | ||
380 | .show = show, | ||
381 | .store = store, | ||
382 | }; | ||
383 | |||
384 | static struct kobj_type threshold_ktype = { | ||
385 | .sysfs_ops = &threshold_ops, | ||
386 | .default_attrs = default_attrs, | ||
387 | }; | ||
388 | |||
389 | static __cpuinit int allocate_threshold_blocks(unsigned int cpu, | ||
390 | unsigned int bank, | ||
391 | unsigned int block, | ||
392 | u32 address) | ||
393 | { | ||
394 | int err; | ||
395 | u32 low, high; | ||
396 | struct threshold_block *b = NULL; | ||
397 | |||
398 | if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) | ||
399 | return 0; | ||
400 | |||
401 | if (rdmsr_safe(address, &low, &high)) | ||
402 | return 0; | ||
403 | |||
404 | if (!(high & MASK_VALID_HI)) { | ||
405 | if (block) | ||
406 | goto recurse; | ||
407 | else | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | if (!(high & MASK_CNTP_HI) || | ||
412 | (high & MASK_LOCKED_HI)) | ||
413 | goto recurse; | ||
414 | |||
415 | b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); | ||
416 | if (!b) | ||
417 | return -ENOMEM; | ||
418 | |||
419 | b->block = block; | ||
420 | b->bank = bank; | ||
421 | b->cpu = cpu; | ||
422 | b->address = address; | ||
423 | b->interrupt_enable = 0; | ||
424 | b->threshold_limit = THRESHOLD_MAX; | ||
425 | |||
426 | INIT_LIST_HEAD(&b->miscj); | ||
427 | |||
428 | if (per_cpu(threshold_banks, cpu)[bank]->blocks) | ||
429 | list_add(&b->miscj, | ||
430 | &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); | ||
431 | else | ||
432 | per_cpu(threshold_banks, cpu)[bank]->blocks = b; | ||
433 | |||
434 | kobject_set_name(&b->kobj, "misc%i", block); | ||
435 | b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; | ||
436 | b->kobj.ktype = &threshold_ktype; | ||
437 | err = kobject_register(&b->kobj); | ||
438 | if (err) | ||
439 | goto out_free; | ||
440 | recurse: | ||
441 | if (!block) { | ||
442 | address = (low & MASK_BLKPTR_LO) >> 21; | ||
443 | if (!address) | ||
444 | return 0; | ||
445 | address += MCG_XBLK_ADDR; | ||
446 | } else | ||
447 | ++address; | ||
448 | |||
449 | err = allocate_threshold_blocks(cpu, bank, ++block, address); | ||
450 | if (err) | ||
451 | goto out_free; | ||
452 | |||
453 | return err; | ||
454 | |||
455 | out_free: | ||
456 | if (b) { | ||
457 | kobject_unregister(&b->kobj); | ||
458 | kfree(b); | ||
459 | } | ||
460 | return err; | ||
461 | } | ||
462 | |||
463 | /* symlinks sibling shared banks to first core. first core owns dir/files. */ | ||
464 | static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | ||
465 | { | ||
466 | int i, err = 0; | ||
467 | struct threshold_bank *b = NULL; | ||
468 | cpumask_t oldmask = CPU_MASK_NONE; | ||
469 | char name[32]; | ||
470 | |||
471 | sprintf(name, "threshold_bank%i", bank); | ||
472 | |||
473 | #ifdef CONFIG_SMP | ||
474 | if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ | ||
475 | i = first_cpu(cpu_core_map[cpu]); | ||
476 | |||
477 | /* first core not up yet */ | ||
478 | if (cpu_data[i].cpu_core_id) | ||
479 | goto out; | ||
480 | |||
481 | /* already linked */ | ||
482 | if (per_cpu(threshold_banks, cpu)[bank]) | ||
483 | goto out; | ||
484 | |||
485 | b = per_cpu(threshold_banks, i)[bank]; | ||
486 | |||
487 | if (!b) | ||
488 | goto out; | ||
489 | |||
490 | err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, | ||
491 | &b->kobj, name); | ||
492 | if (err) | ||
493 | goto out; | ||
494 | |||
495 | b->cpus = cpu_core_map[cpu]; | ||
496 | per_cpu(threshold_banks, cpu)[bank] = b; | ||
497 | goto out; | ||
498 | } | ||
499 | #endif | ||
500 | |||
501 | b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); | ||
502 | if (!b) { | ||
503 | err = -ENOMEM; | ||
504 | goto out; | ||
505 | } | ||
506 | |||
507 | kobject_set_name(&b->kobj, "threshold_bank%i", bank); | ||
508 | b->kobj.parent = &per_cpu(device_mce, cpu).kobj; | ||
509 | #ifndef CONFIG_SMP | ||
510 | b->cpus = CPU_MASK_ALL; | ||
511 | #else | ||
512 | b->cpus = cpu_core_map[cpu]; | ||
513 | #endif | ||
514 | err = kobject_register(&b->kobj); | ||
515 | if (err) | ||
516 | goto out_free; | ||
517 | |||
518 | per_cpu(threshold_banks, cpu)[bank] = b; | ||
519 | |||
520 | oldmask = affinity_set(cpu); | ||
521 | err = allocate_threshold_blocks(cpu, bank, 0, | ||
522 | MSR_IA32_MC0_MISC + bank * 4); | ||
523 | affinity_restore(oldmask); | ||
524 | |||
525 | if (err) | ||
526 | goto out_free; | ||
527 | |||
528 | for_each_cpu_mask(i, b->cpus) { | ||
529 | if (i == cpu) | ||
530 | continue; | ||
531 | |||
532 | err = sysfs_create_link(&per_cpu(device_mce, i).kobj, | ||
533 | &b->kobj, name); | ||
534 | if (err) | ||
535 | goto out; | ||
536 | |||
537 | per_cpu(threshold_banks, i)[bank] = b; | ||
538 | } | ||
539 | |||
540 | goto out; | ||
541 | |||
542 | out_free: | ||
543 | per_cpu(threshold_banks, cpu)[bank] = NULL; | ||
544 | kfree(b); | ||
545 | out: | ||
546 | return err; | ||
547 | } | ||
548 | |||
549 | /* create dir/files for all valid threshold banks */ | ||
550 | static __cpuinit int threshold_create_device(unsigned int cpu) | ||
551 | { | ||
552 | unsigned int bank; | ||
553 | int err = 0; | ||
554 | |||
555 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
556 | if (!(per_cpu(bank_map, cpu) & 1 << bank)) | ||
557 | continue; | ||
558 | err = threshold_create_bank(cpu, bank); | ||
559 | if (err) | ||
560 | goto out; | ||
561 | } | ||
562 | out: | ||
563 | return err; | ||
564 | } | ||
565 | |||
566 | /* | ||
567 | * let's be hotplug friendly. | ||
568 | * in case of multiple core processors, the first core always takes ownership | ||
569 | * of shared sysfs dir/files, and rest of the cores will be symlinked to it. | ||
570 | */ | ||
571 | |||
572 | static void deallocate_threshold_block(unsigned int cpu, | ||
573 | unsigned int bank) | ||
574 | { | ||
575 | struct threshold_block *pos = NULL; | ||
576 | struct threshold_block *tmp = NULL; | ||
577 | struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; | ||
578 | |||
579 | if (!head) | ||
580 | return; | ||
581 | |||
582 | list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { | ||
583 | kobject_unregister(&pos->kobj); | ||
584 | list_del(&pos->miscj); | ||
585 | kfree(pos); | ||
586 | } | ||
587 | |||
588 | kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); | ||
589 | per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; | ||
590 | } | ||
591 | |||
592 | static void threshold_remove_bank(unsigned int cpu, int bank) | ||
593 | { | ||
594 | int i = 0; | ||
595 | struct threshold_bank *b; | ||
596 | char name[32]; | ||
597 | |||
598 | b = per_cpu(threshold_banks, cpu)[bank]; | ||
599 | |||
600 | if (!b) | ||
601 | return; | ||
602 | |||
603 | if (!b->blocks) | ||
604 | goto free_out; | ||
605 | |||
606 | sprintf(name, "threshold_bank%i", bank); | ||
607 | |||
608 | #ifdef CONFIG_SMP | ||
609 | /* sibling symlink */ | ||
610 | if (shared_bank[bank] && b->blocks->cpu != cpu) { | ||
611 | sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); | ||
612 | per_cpu(threshold_banks, cpu)[bank] = NULL; | ||
613 | return; | ||
614 | } | ||
615 | #endif | ||
616 | |||
617 | /* remove all sibling symlinks before unregistering */ | ||
618 | for_each_cpu_mask(i, b->cpus) { | ||
619 | if (i == cpu) | ||
620 | continue; | ||
621 | |||
622 | sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); | ||
623 | per_cpu(threshold_banks, i)[bank] = NULL; | ||
624 | } | ||
625 | |||
626 | deallocate_threshold_block(cpu, bank); | ||
627 | |||
628 | free_out: | ||
629 | kobject_unregister(&b->kobj); | ||
630 | kfree(b); | ||
631 | per_cpu(threshold_banks, cpu)[bank] = NULL; | ||
632 | } | ||
633 | |||
634 | static void threshold_remove_device(unsigned int cpu) | ||
635 | { | ||
636 | unsigned int bank; | ||
637 | |||
638 | for (bank = 0; bank < NR_BANKS; ++bank) { | ||
639 | if (!(per_cpu(bank_map, cpu) & 1 << bank)) | ||
640 | continue; | ||
641 | threshold_remove_bank(cpu, bank); | ||
642 | } | ||
643 | } | ||
644 | |||
645 | /* get notified when a cpu comes on/off */ | ||
646 | static int threshold_cpu_callback(struct notifier_block *nfb, | ||
647 | unsigned long action, void *hcpu) | ||
648 | { | ||
649 | /* cpu was unsigned int to begin with */ | ||
650 | unsigned int cpu = (unsigned long)hcpu; | ||
651 | |||
652 | if (cpu >= NR_CPUS) | ||
653 | goto out; | ||
654 | |||
655 | switch (action) { | ||
656 | case CPU_ONLINE: | ||
657 | case CPU_ONLINE_FROZEN: | ||
658 | threshold_create_device(cpu); | ||
659 | break; | ||
660 | case CPU_DEAD: | ||
661 | case CPU_DEAD_FROZEN: | ||
662 | threshold_remove_device(cpu); | ||
663 | break; | ||
664 | default: | ||
665 | break; | ||
666 | } | ||
667 | out: | ||
668 | return NOTIFY_OK; | ||
669 | } | ||
670 | |||
671 | static struct notifier_block threshold_cpu_notifier = { | ||
672 | .notifier_call = threshold_cpu_callback, | ||
673 | }; | ||
674 | |||
675 | static __init int threshold_init_device(void) | ||
676 | { | ||
677 | unsigned lcpu = 0; | ||
678 | |||
679 | /* to hit CPUs online before the notifier is up */ | ||
680 | for_each_online_cpu(lcpu) { | ||
681 | int err = threshold_create_device(lcpu); | ||
682 | if (err) | ||
683 | return err; | ||
684 | } | ||
685 | register_hotcpu_notifier(&threshold_cpu_notifier); | ||
686 | return 0; | ||
687 | } | ||
688 | |||
689 | device_initcall(threshold_init_device); | ||
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c deleted file mode 100644 index 6551505d8a2c..000000000000 --- a/arch/x86_64/kernel/mce_intel.c +++ /dev/null | |||
@@ -1,89 +0,0 @@ | |||
1 | /* | ||
2 | * Intel specific MCE features. | ||
3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | ||
4 | */ | ||
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/percpu.h> | ||
9 | #include <asm/processor.h> | ||
10 | #include <asm/msr.h> | ||
11 | #include <asm/mce.h> | ||
12 | #include <asm/hw_irq.h> | ||
13 | #include <asm/idle.h> | ||
14 | #include <asm/therm_throt.h> | ||
15 | |||
16 | asmlinkage void smp_thermal_interrupt(void) | ||
17 | { | ||
18 | __u64 msr_val; | ||
19 | |||
20 | ack_APIC_irq(); | ||
21 | |||
22 | exit_idle(); | ||
23 | irq_enter(); | ||
24 | |||
25 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | ||
26 | if (therm_throt_process(msr_val & 1)) | ||
27 | mce_log_therm_throt_event(smp_processor_id(), msr_val); | ||
28 | |||
29 | irq_exit(); | ||
30 | } | ||
31 | |||
32 | static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) | ||
33 | { | ||
34 | u32 l, h; | ||
35 | int tm2 = 0; | ||
36 | unsigned int cpu = smp_processor_id(); | ||
37 | |||
38 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
39 | return; | ||
40 | |||
41 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
42 | return; | ||
43 | |||
44 | /* first check if TM1 is already enabled by the BIOS, in which | ||
45 | * case there might be some SMM goo which handles it, so we can't even | ||
46 | * put a handler since it might be delivered via SMI already. | ||
47 | */ | ||
48 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
49 | h = apic_read(APIC_LVTTHMR); | ||
50 | if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { | ||
51 | printk(KERN_DEBUG | ||
52 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
53 | return; | ||
54 | } | ||
55 | |||
56 | if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) | ||
57 | tm2 = 1; | ||
58 | |||
59 | if (h & APIC_VECTOR_MASK) { | ||
60 | printk(KERN_DEBUG | ||
61 | "CPU%d: Thermal LVT vector (%#x) already " | ||
62 | "installed\n", cpu, (h & APIC_VECTOR_MASK)); | ||
63 | return; | ||
64 | } | ||
65 | |||
66 | h = THERMAL_APIC_VECTOR; | ||
67 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); | ||
68 | apic_write(APIC_LVTTHMR, h); | ||
69 | |||
70 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
71 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); | ||
72 | |||
73 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
74 | wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); | ||
75 | |||
76 | l = apic_read(APIC_LVTTHMR); | ||
77 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
78 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
79 | cpu, tm2 ? "TM2" : "TM1"); | ||
80 | |||
81 | /* enable thermal throttle processing */ | ||
82 | atomic_set(&therm_throt_en, 1); | ||
83 | return; | ||
84 | } | ||
85 | |||
86 | void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) | ||
87 | { | ||
88 | intel_init_thermal(c); | ||
89 | } | ||
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c deleted file mode 100644 index a888e67f5874..000000000000 --- a/arch/x86_64/kernel/module.c +++ /dev/null | |||
@@ -1,185 +0,0 @@ | |||
1 | /* Kernel module help for x86-64 | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 2 of the License, or | ||
8 | (at your option) any later version. | ||
9 | |||
10 | This program is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | GNU General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with this program; if not, write to the Free Software | ||
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #include <linux/moduleloader.h> | ||
20 | #include <linux/elf.h> | ||
21 | #include <linux/vmalloc.h> | ||
22 | #include <linux/fs.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/bug.h> | ||
27 | |||
28 | #include <asm/system.h> | ||
29 | #include <asm/page.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | |||
32 | #define DEBUGP(fmt...) | ||
33 | |||
34 | #ifndef CONFIG_UML | ||
35 | void module_free(struct module *mod, void *module_region) | ||
36 | { | ||
37 | vfree(module_region); | ||
38 | /* FIXME: If module_region == mod->init_region, trim exception | ||
39 | table entries. */ | ||
40 | } | ||
41 | |||
42 | void *module_alloc(unsigned long size) | ||
43 | { | ||
44 | struct vm_struct *area; | ||
45 | |||
46 | if (!size) | ||
47 | return NULL; | ||
48 | size = PAGE_ALIGN(size); | ||
49 | if (size > MODULES_LEN) | ||
50 | return NULL; | ||
51 | |||
52 | area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); | ||
53 | if (!area) | ||
54 | return NULL; | ||
55 | |||
56 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); | ||
57 | } | ||
58 | #endif | ||
59 | |||
60 | /* We don't need anything special. */ | ||
61 | int module_frob_arch_sections(Elf_Ehdr *hdr, | ||
62 | Elf_Shdr *sechdrs, | ||
63 | char *secstrings, | ||
64 | struct module *mod) | ||
65 | { | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | int apply_relocate_add(Elf64_Shdr *sechdrs, | ||
70 | const char *strtab, | ||
71 | unsigned int symindex, | ||
72 | unsigned int relsec, | ||
73 | struct module *me) | ||
74 | { | ||
75 | unsigned int i; | ||
76 | Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; | ||
77 | Elf64_Sym *sym; | ||
78 | void *loc; | ||
79 | u64 val; | ||
80 | |||
81 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
82 | sechdrs[relsec].sh_info); | ||
83 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
84 | /* This is where to make the change */ | ||
85 | loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
86 | + rel[i].r_offset; | ||
87 | |||
88 | /* This is the symbol it is referring to. Note that all | ||
89 | undefined symbols have been resolved. */ | ||
90 | sym = (Elf64_Sym *)sechdrs[symindex].sh_addr | ||
91 | + ELF64_R_SYM(rel[i].r_info); | ||
92 | |||
93 | DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", | ||
94 | (int)ELF64_R_TYPE(rel[i].r_info), | ||
95 | sym->st_value, rel[i].r_addend, (u64)loc); | ||
96 | |||
97 | val = sym->st_value + rel[i].r_addend; | ||
98 | |||
99 | switch (ELF64_R_TYPE(rel[i].r_info)) { | ||
100 | case R_X86_64_NONE: | ||
101 | break; | ||
102 | case R_X86_64_64: | ||
103 | *(u64 *)loc = val; | ||
104 | break; | ||
105 | case R_X86_64_32: | ||
106 | *(u32 *)loc = val; | ||
107 | if (val != *(u32 *)loc) | ||
108 | goto overflow; | ||
109 | break; | ||
110 | case R_X86_64_32S: | ||
111 | *(s32 *)loc = val; | ||
112 | if ((s64)val != *(s32 *)loc) | ||
113 | goto overflow; | ||
114 | break; | ||
115 | case R_X86_64_PC32: | ||
116 | val -= (u64)loc; | ||
117 | *(u32 *)loc = val; | ||
118 | #if 0 | ||
119 | if ((s64)val != *(s32 *)loc) | ||
120 | goto overflow; | ||
121 | #endif | ||
122 | break; | ||
123 | default: | ||
124 | printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", | ||
125 | me->name, ELF64_R_TYPE(rel[i].r_info)); | ||
126 | return -ENOEXEC; | ||
127 | } | ||
128 | } | ||
129 | return 0; | ||
130 | |||
131 | overflow: | ||
132 | printk(KERN_ERR "overflow in relocation type %d val %Lx\n", | ||
133 | (int)ELF64_R_TYPE(rel[i].r_info), val); | ||
134 | printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", | ||
135 | me->name); | ||
136 | return -ENOEXEC; | ||
137 | } | ||
138 | |||
139 | int apply_relocate(Elf_Shdr *sechdrs, | ||
140 | const char *strtab, | ||
141 | unsigned int symindex, | ||
142 | unsigned int relsec, | ||
143 | struct module *me) | ||
144 | { | ||
145 | printk("non add relocation not supported\n"); | ||
146 | return -ENOSYS; | ||
147 | } | ||
148 | |||
149 | int module_finalize(const Elf_Ehdr *hdr, | ||
150 | const Elf_Shdr *sechdrs, | ||
151 | struct module *me) | ||
152 | { | ||
153 | const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; | ||
154 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
155 | |||
156 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | ||
157 | if (!strcmp(".text", secstrings + s->sh_name)) | ||
158 | text = s; | ||
159 | if (!strcmp(".altinstructions", secstrings + s->sh_name)) | ||
160 | alt = s; | ||
161 | if (!strcmp(".smp_locks", secstrings + s->sh_name)) | ||
162 | locks= s; | ||
163 | } | ||
164 | |||
165 | if (alt) { | ||
166 | /* patch .altinstructions */ | ||
167 | void *aseg = (void *)alt->sh_addr; | ||
168 | apply_alternatives(aseg, aseg + alt->sh_size); | ||
169 | } | ||
170 | if (locks && text) { | ||
171 | void *lseg = (void *)locks->sh_addr; | ||
172 | void *tseg = (void *)text->sh_addr; | ||
173 | alternatives_smp_module_add(me, me->name, | ||
174 | lseg, lseg + locks->sh_size, | ||
175 | tseg, tseg + text->sh_size); | ||
176 | } | ||
177 | |||
178 | return module_bug_finalize(hdr, sechdrs, me); | ||
179 | } | ||
180 | |||
181 | void module_arch_cleanup(struct module *mod) | ||
182 | { | ||
183 | alternatives_smp_module_del(mod); | ||
184 | module_bug_cleanup(mod); | ||
185 | } | ||
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c deleted file mode 100644 index 8bf0ca03ac8e..000000000000 --- a/arch/x86_64/kernel/mpparse.c +++ /dev/null | |||
@@ -1,852 +0,0 @@ | |||
1 | /* | ||
2 | * Intel Multiprocessor Specification 1.1 and 1.4 | ||
3 | * compliant MP-table parsing routines. | ||
4 | * | ||
5 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
6 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes | ||
9 | * Erich Boleyn : MP v1.4 and additional changes. | ||
10 | * Alan Cox : Added EBDA scanning | ||
11 | * Ingo Molnar : various cleanups and rewrites | ||
12 | * Maciej W. Rozycki: Bits for default MP configurations | ||
13 | * Paul Diefenbaugh: Added full ACPI support | ||
14 | */ | ||
15 | |||
16 | #include <linux/mm.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/delay.h> | ||
19 | #include <linux/bootmem.h> | ||
20 | #include <linux/kernel_stat.h> | ||
21 | #include <linux/mc146818rtc.h> | ||
22 | #include <linux/acpi.h> | ||
23 | #include <linux/module.h> | ||
24 | |||
25 | #include <asm/smp.h> | ||
26 | #include <asm/mtrr.h> | ||
27 | #include <asm/mpspec.h> | ||
28 | #include <asm/pgalloc.h> | ||
29 | #include <asm/io_apic.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/acpi.h> | ||
32 | |||
33 | /* Have we found an MP table */ | ||
34 | int smp_found_config; | ||
35 | |||
36 | /* | ||
37 | * Various Linux-internal data structures created from the | ||
38 | * MP-table. | ||
39 | */ | ||
40 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | ||
41 | int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | ||
42 | |||
43 | static int mp_current_pci_id = 0; | ||
44 | /* I/O APIC entries */ | ||
45 | struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | ||
46 | |||
47 | /* # of MP IRQ source entries */ | ||
48 | struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | ||
49 | |||
50 | /* MP IRQ source entries */ | ||
51 | int mp_irq_entries; | ||
52 | |||
53 | int nr_ioapics; | ||
54 | unsigned long mp_lapic_addr = 0; | ||
55 | |||
56 | |||
57 | |||
58 | /* Processor that is doing the boot up */ | ||
59 | unsigned int boot_cpu_id = -1U; | ||
60 | /* Internal processor count */ | ||
61 | unsigned int num_processors __cpuinitdata = 0; | ||
62 | |||
63 | unsigned disabled_cpus __cpuinitdata; | ||
64 | |||
65 | /* Bitmask of physically existing CPUs */ | ||
66 | physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; | ||
67 | |||
68 | u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * Intel MP BIOS table parsing routines: | ||
73 | */ | ||
74 | |||
75 | /* | ||
76 | * Checksum an MP configuration block. | ||
77 | */ | ||
78 | |||
79 | static int __init mpf_checksum(unsigned char *mp, int len) | ||
80 | { | ||
81 | int sum = 0; | ||
82 | |||
83 | while (len--) | ||
84 | sum += *mp++; | ||
85 | |||
86 | return sum & 0xFF; | ||
87 | } | ||
88 | |||
89 | static void __cpuinit MP_processor_info (struct mpc_config_processor *m) | ||
90 | { | ||
91 | int cpu; | ||
92 | cpumask_t tmp_map; | ||
93 | char *bootup_cpu = ""; | ||
94 | |||
95 | if (!(m->mpc_cpuflag & CPU_ENABLED)) { | ||
96 | disabled_cpus++; | ||
97 | return; | ||
98 | } | ||
99 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | ||
100 | bootup_cpu = " (Bootup-CPU)"; | ||
101 | boot_cpu_id = m->mpc_apicid; | ||
102 | } | ||
103 | |||
104 | printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); | ||
105 | |||
106 | if (num_processors >= NR_CPUS) { | ||
107 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | ||
108 | " Processor ignored.\n", NR_CPUS); | ||
109 | return; | ||
110 | } | ||
111 | |||
112 | num_processors++; | ||
113 | cpus_complement(tmp_map, cpu_present_map); | ||
114 | cpu = first_cpu(tmp_map); | ||
115 | |||
116 | physid_set(m->mpc_apicid, phys_cpu_present_map); | ||
117 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | ||
118 | /* | ||
119 | * bios_cpu_apicid is required to have processors listed | ||
120 | * in same order as logical cpu numbers. Hence the first | ||
121 | * entry is BSP, and so on. | ||
122 | */ | ||
123 | cpu = 0; | ||
124 | } | ||
125 | bios_cpu_apicid[cpu] = m->mpc_apicid; | ||
126 | x86_cpu_to_apicid[cpu] = m->mpc_apicid; | ||
127 | |||
128 | cpu_set(cpu, cpu_possible_map); | ||
129 | cpu_set(cpu, cpu_present_map); | ||
130 | } | ||
131 | |||
132 | static void __init MP_bus_info (struct mpc_config_bus *m) | ||
133 | { | ||
134 | char str[7]; | ||
135 | |||
136 | memcpy(str, m->mpc_bustype, 6); | ||
137 | str[6] = 0; | ||
138 | Dprintk("Bus #%d is %s\n", m->mpc_busid, str); | ||
139 | |||
140 | if (strncmp(str, "ISA", 3) == 0) { | ||
141 | set_bit(m->mpc_busid, mp_bus_not_pci); | ||
142 | } else if (strncmp(str, "PCI", 3) == 0) { | ||
143 | clear_bit(m->mpc_busid, mp_bus_not_pci); | ||
144 | mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | ||
145 | mp_current_pci_id++; | ||
146 | } else { | ||
147 | printk(KERN_ERR "Unknown bustype %s\n", str); | ||
148 | } | ||
149 | } | ||
150 | |||
151 | static int bad_ioapic(unsigned long address) | ||
152 | { | ||
153 | if (nr_ioapics >= MAX_IO_APICS) { | ||
154 | printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | ||
155 | "(found %d)\n", MAX_IO_APICS, nr_ioapics); | ||
156 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
157 | } | ||
158 | if (!address) { | ||
159 | printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | ||
160 | " found in table, skipping!\n"); | ||
161 | return 1; | ||
162 | } | ||
163 | return 0; | ||
164 | } | ||
165 | |||
166 | static void __init MP_ioapic_info (struct mpc_config_ioapic *m) | ||
167 | { | ||
168 | if (!(m->mpc_flags & MPC_APIC_USABLE)) | ||
169 | return; | ||
170 | |||
171 | printk("I/O APIC #%d at 0x%X.\n", | ||
172 | m->mpc_apicid, m->mpc_apicaddr); | ||
173 | |||
174 | if (bad_ioapic(m->mpc_apicaddr)) | ||
175 | return; | ||
176 | |||
177 | mp_ioapics[nr_ioapics] = *m; | ||
178 | nr_ioapics++; | ||
179 | } | ||
180 | |||
181 | static void __init MP_intsrc_info (struct mpc_config_intsrc *m) | ||
182 | { | ||
183 | mp_irqs [mp_irq_entries] = *m; | ||
184 | Dprintk("Int: type %d, pol %d, trig %d, bus %d," | ||
185 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | ||
186 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
187 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | ||
188 | m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | ||
189 | if (++mp_irq_entries >= MAX_IRQ_SOURCES) | ||
190 | panic("Max # of irq sources exceeded!!\n"); | ||
191 | } | ||
192 | |||
193 | static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) | ||
194 | { | ||
195 | Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | ||
196 | " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | ||
197 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
198 | (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, | ||
199 | m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * Read/parse the MPC | ||
204 | */ | ||
205 | |||
206 | static int __init smp_read_mpc(struct mp_config_table *mpc) | ||
207 | { | ||
208 | char str[16]; | ||
209 | int count=sizeof(*mpc); | ||
210 | unsigned char *mpt=((unsigned char *)mpc)+count; | ||
211 | |||
212 | if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { | ||
213 | printk("MPTABLE: bad signature [%c%c%c%c]!\n", | ||
214 | mpc->mpc_signature[0], | ||
215 | mpc->mpc_signature[1], | ||
216 | mpc->mpc_signature[2], | ||
217 | mpc->mpc_signature[3]); | ||
218 | return 0; | ||
219 | } | ||
220 | if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { | ||
221 | printk("MPTABLE: checksum error!\n"); | ||
222 | return 0; | ||
223 | } | ||
224 | if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { | ||
225 | printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", | ||
226 | mpc->mpc_spec); | ||
227 | return 0; | ||
228 | } | ||
229 | if (!mpc->mpc_lapic) { | ||
230 | printk(KERN_ERR "MPTABLE: null local APIC address!\n"); | ||
231 | return 0; | ||
232 | } | ||
233 | memcpy(str,mpc->mpc_oem,8); | ||
234 | str[8] = 0; | ||
235 | printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); | ||
236 | |||
237 | memcpy(str,mpc->mpc_productid,12); | ||
238 | str[12] = 0; | ||
239 | printk("MPTABLE: Product ID: %s ",str); | ||
240 | |||
241 | printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); | ||
242 | |||
243 | /* save the local APIC address, it might be non-default */ | ||
244 | if (!acpi_lapic) | ||
245 | mp_lapic_addr = mpc->mpc_lapic; | ||
246 | |||
247 | /* | ||
248 | * Now process the configuration blocks. | ||
249 | */ | ||
250 | while (count < mpc->mpc_length) { | ||
251 | switch(*mpt) { | ||
252 | case MP_PROCESSOR: | ||
253 | { | ||
254 | struct mpc_config_processor *m= | ||
255 | (struct mpc_config_processor *)mpt; | ||
256 | if (!acpi_lapic) | ||
257 | MP_processor_info(m); | ||
258 | mpt += sizeof(*m); | ||
259 | count += sizeof(*m); | ||
260 | break; | ||
261 | } | ||
262 | case MP_BUS: | ||
263 | { | ||
264 | struct mpc_config_bus *m= | ||
265 | (struct mpc_config_bus *)mpt; | ||
266 | MP_bus_info(m); | ||
267 | mpt += sizeof(*m); | ||
268 | count += sizeof(*m); | ||
269 | break; | ||
270 | } | ||
271 | case MP_IOAPIC: | ||
272 | { | ||
273 | struct mpc_config_ioapic *m= | ||
274 | (struct mpc_config_ioapic *)mpt; | ||
275 | MP_ioapic_info(m); | ||
276 | mpt += sizeof(*m); | ||
277 | count += sizeof(*m); | ||
278 | break; | ||
279 | } | ||
280 | case MP_INTSRC: | ||
281 | { | ||
282 | struct mpc_config_intsrc *m= | ||
283 | (struct mpc_config_intsrc *)mpt; | ||
284 | |||
285 | MP_intsrc_info(m); | ||
286 | mpt += sizeof(*m); | ||
287 | count += sizeof(*m); | ||
288 | break; | ||
289 | } | ||
290 | case MP_LINTSRC: | ||
291 | { | ||
292 | struct mpc_config_lintsrc *m= | ||
293 | (struct mpc_config_lintsrc *)mpt; | ||
294 | MP_lintsrc_info(m); | ||
295 | mpt += sizeof(*m); | ||
296 | count += sizeof(*m); | ||
297 | break; | ||
298 | } | ||
299 | } | ||
300 | } | ||
301 | setup_apic_routing(); | ||
302 | if (!num_processors) | ||
303 | printk(KERN_ERR "MPTABLE: no processors registered!\n"); | ||
304 | return num_processors; | ||
305 | } | ||
306 | |||
307 | static int __init ELCR_trigger(unsigned int irq) | ||
308 | { | ||
309 | unsigned int port; | ||
310 | |||
311 | port = 0x4d0 + (irq >> 3); | ||
312 | return (inb(port) >> (irq & 7)) & 1; | ||
313 | } | ||
314 | |||
315 | static void __init construct_default_ioirq_mptable(int mpc_default_type) | ||
316 | { | ||
317 | struct mpc_config_intsrc intsrc; | ||
318 | int i; | ||
319 | int ELCR_fallback = 0; | ||
320 | |||
321 | intsrc.mpc_type = MP_INTSRC; | ||
322 | intsrc.mpc_irqflag = 0; /* conforming */ | ||
323 | intsrc.mpc_srcbus = 0; | ||
324 | intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | ||
325 | |||
326 | intsrc.mpc_irqtype = mp_INT; | ||
327 | |||
328 | /* | ||
329 | * If true, we have an ISA/PCI system with no IRQ entries | ||
330 | * in the MP table. To prevent the PCI interrupts from being set up | ||
331 | * incorrectly, we try to use the ELCR. The sanity check to see if | ||
332 | * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can | ||
333 | * never be level sensitive, so we simply see if the ELCR agrees. | ||
334 | * If it does, we assume it's valid. | ||
335 | */ | ||
336 | if (mpc_default_type == 5) { | ||
337 | printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); | ||
338 | |||
339 | if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) | ||
340 | printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); | ||
341 | else { | ||
342 | printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); | ||
343 | ELCR_fallback = 1; | ||
344 | } | ||
345 | } | ||
346 | |||
347 | for (i = 0; i < 16; i++) { | ||
348 | switch (mpc_default_type) { | ||
349 | case 2: | ||
350 | if (i == 0 || i == 13) | ||
351 | continue; /* IRQ0 & IRQ13 not connected */ | ||
352 | /* fall through */ | ||
353 | default: | ||
354 | if (i == 2) | ||
355 | continue; /* IRQ2 is never connected */ | ||
356 | } | ||
357 | |||
358 | if (ELCR_fallback) { | ||
359 | /* | ||
360 | * If the ELCR indicates a level-sensitive interrupt, we | ||
361 | * copy that information over to the MP table in the | ||
362 | * irqflag field (level sensitive, active high polarity). | ||
363 | */ | ||
364 | if (ELCR_trigger(i)) | ||
365 | intsrc.mpc_irqflag = 13; | ||
366 | else | ||
367 | intsrc.mpc_irqflag = 0; | ||
368 | } | ||
369 | |||
370 | intsrc.mpc_srcbusirq = i; | ||
371 | intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | ||
372 | MP_intsrc_info(&intsrc); | ||
373 | } | ||
374 | |||
375 | intsrc.mpc_irqtype = mp_ExtINT; | ||
376 | intsrc.mpc_srcbusirq = 0; | ||
377 | intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ | ||
378 | MP_intsrc_info(&intsrc); | ||
379 | } | ||
380 | |||
381 | static inline void __init construct_default_ISA_mptable(int mpc_default_type) | ||
382 | { | ||
383 | struct mpc_config_processor processor; | ||
384 | struct mpc_config_bus bus; | ||
385 | struct mpc_config_ioapic ioapic; | ||
386 | struct mpc_config_lintsrc lintsrc; | ||
387 | int linttypes[2] = { mp_ExtINT, mp_NMI }; | ||
388 | int i; | ||
389 | |||
390 | /* | ||
391 | * local APIC has default address | ||
392 | */ | ||
393 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
394 | |||
395 | /* | ||
396 | * 2 CPUs, numbered 0 & 1. | ||
397 | */ | ||
398 | processor.mpc_type = MP_PROCESSOR; | ||
399 | processor.mpc_apicver = 0; | ||
400 | processor.mpc_cpuflag = CPU_ENABLED; | ||
401 | processor.mpc_cpufeature = 0; | ||
402 | processor.mpc_featureflag = 0; | ||
403 | processor.mpc_reserved[0] = 0; | ||
404 | processor.mpc_reserved[1] = 0; | ||
405 | for (i = 0; i < 2; i++) { | ||
406 | processor.mpc_apicid = i; | ||
407 | MP_processor_info(&processor); | ||
408 | } | ||
409 | |||
410 | bus.mpc_type = MP_BUS; | ||
411 | bus.mpc_busid = 0; | ||
412 | switch (mpc_default_type) { | ||
413 | default: | ||
414 | printk(KERN_ERR "???\nUnknown standard configuration %d\n", | ||
415 | mpc_default_type); | ||
416 | /* fall through */ | ||
417 | case 1: | ||
418 | case 5: | ||
419 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
420 | break; | ||
421 | } | ||
422 | MP_bus_info(&bus); | ||
423 | if (mpc_default_type > 4) { | ||
424 | bus.mpc_busid = 1; | ||
425 | memcpy(bus.mpc_bustype, "PCI ", 6); | ||
426 | MP_bus_info(&bus); | ||
427 | } | ||
428 | |||
429 | ioapic.mpc_type = MP_IOAPIC; | ||
430 | ioapic.mpc_apicid = 2; | ||
431 | ioapic.mpc_apicver = 0; | ||
432 | ioapic.mpc_flags = MPC_APIC_USABLE; | ||
433 | ioapic.mpc_apicaddr = 0xFEC00000; | ||
434 | MP_ioapic_info(&ioapic); | ||
435 | |||
436 | /* | ||
437 | * We set up most of the low 16 IO-APIC pins according to MPS rules. | ||
438 | */ | ||
439 | construct_default_ioirq_mptable(mpc_default_type); | ||
440 | |||
441 | lintsrc.mpc_type = MP_LINTSRC; | ||
442 | lintsrc.mpc_irqflag = 0; /* conforming */ | ||
443 | lintsrc.mpc_srcbusid = 0; | ||
444 | lintsrc.mpc_srcbusirq = 0; | ||
445 | lintsrc.mpc_destapic = MP_APIC_ALL; | ||
446 | for (i = 0; i < 2; i++) { | ||
447 | lintsrc.mpc_irqtype = linttypes[i]; | ||
448 | lintsrc.mpc_destapiclint = i; | ||
449 | MP_lintsrc_info(&lintsrc); | ||
450 | } | ||
451 | } | ||
452 | |||
453 | static struct intel_mp_floating *mpf_found; | ||
454 | |||
455 | /* | ||
456 | * Scan the memory blocks for an SMP configuration block. | ||
457 | */ | ||
458 | void __init get_smp_config (void) | ||
459 | { | ||
460 | struct intel_mp_floating *mpf = mpf_found; | ||
461 | |||
462 | /* | ||
463 | * ACPI supports both logical (e.g. Hyper-Threading) and physical | ||
464 | * processors, where MPS only supports physical. | ||
465 | */ | ||
466 | if (acpi_lapic && acpi_ioapic) { | ||
467 | printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); | ||
468 | return; | ||
469 | } | ||
470 | else if (acpi_lapic) | ||
471 | printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); | ||
472 | |||
473 | printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); | ||
474 | |||
475 | /* | ||
476 | * Now see if we need to read further. | ||
477 | */ | ||
478 | if (mpf->mpf_feature1 != 0) { | ||
479 | |||
480 | printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); | ||
481 | construct_default_ISA_mptable(mpf->mpf_feature1); | ||
482 | |||
483 | } else if (mpf->mpf_physptr) { | ||
484 | |||
485 | /* | ||
486 | * Read the physical hardware table. Anything here will | ||
487 | * override the defaults. | ||
488 | */ | ||
489 | if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { | ||
490 | smp_found_config = 0; | ||
491 | printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); | ||
492 | printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); | ||
493 | return; | ||
494 | } | ||
495 | /* | ||
496 | * If there are no explicit MP IRQ entries, then we are | ||
497 | * broken. We set up most of the low 16 IO-APIC pins to | ||
498 | * ISA defaults and hope it will work. | ||
499 | */ | ||
500 | if (!mp_irq_entries) { | ||
501 | struct mpc_config_bus bus; | ||
502 | |||
503 | printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); | ||
504 | |||
505 | bus.mpc_type = MP_BUS; | ||
506 | bus.mpc_busid = 0; | ||
507 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
508 | MP_bus_info(&bus); | ||
509 | |||
510 | construct_default_ioirq_mptable(0); | ||
511 | } | ||
512 | |||
513 | } else | ||
514 | BUG(); | ||
515 | |||
516 | printk(KERN_INFO "Processors: %d\n", num_processors); | ||
517 | /* | ||
518 | * Only use the first configuration found. | ||
519 | */ | ||
520 | } | ||
521 | |||
522 | static int __init smp_scan_config (unsigned long base, unsigned long length) | ||
523 | { | ||
524 | extern void __bad_mpf_size(void); | ||
525 | unsigned int *bp = phys_to_virt(base); | ||
526 | struct intel_mp_floating *mpf; | ||
527 | |||
528 | Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | ||
529 | if (sizeof(*mpf) != 16) | ||
530 | __bad_mpf_size(); | ||
531 | |||
532 | while (length > 0) { | ||
533 | mpf = (struct intel_mp_floating *)bp; | ||
534 | if ((*bp == SMP_MAGIC_IDENT) && | ||
535 | (mpf->mpf_length == 1) && | ||
536 | !mpf_checksum((unsigned char *)bp, 16) && | ||
537 | ((mpf->mpf_specification == 1) | ||
538 | || (mpf->mpf_specification == 4)) ) { | ||
539 | |||
540 | smp_found_config = 1; | ||
541 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); | ||
542 | if (mpf->mpf_physptr) | ||
543 | reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); | ||
544 | mpf_found = mpf; | ||
545 | return 1; | ||
546 | } | ||
547 | bp += 4; | ||
548 | length -= 16; | ||
549 | } | ||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | void __init find_smp_config(void) | ||
554 | { | ||
555 | unsigned int address; | ||
556 | |||
557 | /* | ||
558 | * FIXME: Linux assumes you have 640K of base ram.. | ||
559 | * this continues the error... | ||
560 | * | ||
561 | * 1) Scan the bottom 1K for a signature | ||
562 | * 2) Scan the top 1K of base RAM | ||
563 | * 3) Scan the 64K of bios | ||
564 | */ | ||
565 | if (smp_scan_config(0x0,0x400) || | ||
566 | smp_scan_config(639*0x400,0x400) || | ||
567 | smp_scan_config(0xF0000,0x10000)) | ||
568 | return; | ||
569 | /* | ||
570 | * If it is an SMP machine we should know now. | ||
571 | * | ||
572 | * there is a real-mode segmented pointer pointing to the | ||
573 | * 4K EBDA area at 0x40E, calculate and scan it here. | ||
574 | * | ||
575 | * NOTE! There are Linux loaders that will corrupt the EBDA | ||
576 | * area, and as such this kind of SMP config may be less | ||
577 | * trustworthy, simply because the SMP table may have been | ||
578 | * stomped on during early boot. These loaders are buggy and | ||
579 | * should be fixed. | ||
580 | */ | ||
581 | |||
582 | address = *(unsigned short *)phys_to_virt(0x40E); | ||
583 | address <<= 4; | ||
584 | if (smp_scan_config(address, 0x1000)) | ||
585 | return; | ||
586 | |||
587 | /* If we have come this far, we did not find an MP table */ | ||
588 | printk(KERN_INFO "No mptable found.\n"); | ||
589 | } | ||
590 | |||
591 | /* -------------------------------------------------------------------------- | ||
592 | ACPI-based MP Configuration | ||
593 | -------------------------------------------------------------------------- */ | ||
594 | |||
595 | #ifdef CONFIG_ACPI | ||
596 | |||
597 | void __init mp_register_lapic_address(u64 address) | ||
598 | { | ||
599 | mp_lapic_addr = (unsigned long) address; | ||
600 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
601 | if (boot_cpu_id == -1U) | ||
602 | boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
603 | } | ||
604 | |||
605 | void __cpuinit mp_register_lapic (u8 id, u8 enabled) | ||
606 | { | ||
607 | struct mpc_config_processor processor; | ||
608 | int boot_cpu = 0; | ||
609 | |||
610 | if (id == boot_cpu_id) | ||
611 | boot_cpu = 1; | ||
612 | |||
613 | processor.mpc_type = MP_PROCESSOR; | ||
614 | processor.mpc_apicid = id; | ||
615 | processor.mpc_apicver = 0; | ||
616 | processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); | ||
617 | processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); | ||
618 | processor.mpc_cpufeature = 0; | ||
619 | processor.mpc_featureflag = 0; | ||
620 | processor.mpc_reserved[0] = 0; | ||
621 | processor.mpc_reserved[1] = 0; | ||
622 | |||
623 | MP_processor_info(&processor); | ||
624 | } | ||
625 | |||
626 | #define MP_ISA_BUS 0 | ||
627 | #define MP_MAX_IOAPIC_PIN 127 | ||
628 | |||
629 | static struct mp_ioapic_routing { | ||
630 | int apic_id; | ||
631 | int gsi_start; | ||
632 | int gsi_end; | ||
633 | u32 pin_programmed[4]; | ||
634 | } mp_ioapic_routing[MAX_IO_APICS]; | ||
635 | |||
636 | static int mp_find_ioapic(int gsi) | ||
637 | { | ||
638 | int i = 0; | ||
639 | |||
640 | /* Find the IOAPIC that manages this GSI. */ | ||
641 | for (i = 0; i < nr_ioapics; i++) { | ||
642 | if ((gsi >= mp_ioapic_routing[i].gsi_start) | ||
643 | && (gsi <= mp_ioapic_routing[i].gsi_end)) | ||
644 | return i; | ||
645 | } | ||
646 | |||
647 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | ||
648 | return -1; | ||
649 | } | ||
650 | |||
651 | static u8 uniq_ioapic_id(u8 id) | ||
652 | { | ||
653 | int i; | ||
654 | DECLARE_BITMAP(used, 256); | ||
655 | bitmap_zero(used, 256); | ||
656 | for (i = 0; i < nr_ioapics; i++) { | ||
657 | struct mpc_config_ioapic *ia = &mp_ioapics[i]; | ||
658 | __set_bit(ia->mpc_apicid, used); | ||
659 | } | ||
660 | if (!test_bit(id, used)) | ||
661 | return id; | ||
662 | return find_first_zero_bit(used, 256); | ||
663 | } | ||
664 | |||
665 | void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) | ||
666 | { | ||
667 | int idx = 0; | ||
668 | |||
669 | if (bad_ioapic(address)) | ||
670 | return; | ||
671 | |||
672 | idx = nr_ioapics; | ||
673 | |||
674 | mp_ioapics[idx].mpc_type = MP_IOAPIC; | ||
675 | mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | ||
676 | mp_ioapics[idx].mpc_apicaddr = address; | ||
677 | |||
678 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | ||
679 | mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); | ||
680 | mp_ioapics[idx].mpc_apicver = 0; | ||
681 | |||
682 | /* | ||
683 | * Build basic IRQ lookup table to facilitate gsi->io_apic lookups | ||
684 | * and to prevent reprogramming of IOAPIC pins (PCI IRQs). | ||
685 | */ | ||
686 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | ||
687 | mp_ioapic_routing[idx].gsi_start = gsi_base; | ||
688 | mp_ioapic_routing[idx].gsi_end = gsi_base + | ||
689 | io_apic_get_redir_entries(idx); | ||
690 | |||
691 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " | ||
692 | "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | ||
693 | mp_ioapics[idx].mpc_apicaddr, | ||
694 | mp_ioapic_routing[idx].gsi_start, | ||
695 | mp_ioapic_routing[idx].gsi_end); | ||
696 | |||
697 | nr_ioapics++; | ||
698 | } | ||
699 | |||
700 | void __init | ||
701 | mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) | ||
702 | { | ||
703 | struct mpc_config_intsrc intsrc; | ||
704 | int ioapic = -1; | ||
705 | int pin = -1; | ||
706 | |||
707 | /* | ||
708 | * Convert 'gsi' to 'ioapic.pin'. | ||
709 | */ | ||
710 | ioapic = mp_find_ioapic(gsi); | ||
711 | if (ioapic < 0) | ||
712 | return; | ||
713 | pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
714 | |||
715 | /* | ||
716 | * TBD: This check is for faulty timer entries, where the override | ||
717 | * erroneously sets the trigger to level, resulting in a HUGE | ||
718 | * increase of timer interrupts! | ||
719 | */ | ||
720 | if ((bus_irq == 0) && (trigger == 3)) | ||
721 | trigger = 1; | ||
722 | |||
723 | intsrc.mpc_type = MP_INTSRC; | ||
724 | intsrc.mpc_irqtype = mp_INT; | ||
725 | intsrc.mpc_irqflag = (trigger << 2) | polarity; | ||
726 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
727 | intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | ||
728 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | ||
729 | intsrc.mpc_dstirq = pin; /* INTIN# */ | ||
730 | |||
731 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", | ||
732 | intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
733 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
734 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); | ||
735 | |||
736 | mp_irqs[mp_irq_entries] = intsrc; | ||
737 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
738 | panic("Max # of irq sources exceeded!\n"); | ||
739 | } | ||
740 | |||
741 | void __init mp_config_acpi_legacy_irqs(void) | ||
742 | { | ||
743 | struct mpc_config_intsrc intsrc; | ||
744 | int i = 0; | ||
745 | int ioapic = -1; | ||
746 | |||
747 | /* | ||
748 | * Fabricate the legacy ISA bus (bus #31). | ||
749 | */ | ||
750 | set_bit(MP_ISA_BUS, mp_bus_not_pci); | ||
751 | |||
752 | /* | ||
753 | * Locate the IOAPIC that manages the ISA IRQs (0-15). | ||
754 | */ | ||
755 | ioapic = mp_find_ioapic(0); | ||
756 | if (ioapic < 0) | ||
757 | return; | ||
758 | |||
759 | intsrc.mpc_type = MP_INTSRC; | ||
760 | intsrc.mpc_irqflag = 0; /* Conforming */ | ||
761 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
762 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | ||
763 | |||
764 | /* | ||
765 | * Use the default configuration for the IRQs 0-15. Unless | ||
766 | * overridden by (MADT) interrupt source override entries. | ||
767 | */ | ||
768 | for (i = 0; i < 16; i++) { | ||
769 | int idx; | ||
770 | |||
771 | for (idx = 0; idx < mp_irq_entries; idx++) { | ||
772 | struct mpc_config_intsrc *irq = mp_irqs + idx; | ||
773 | |||
774 | /* Do we already have a mapping for this ISA IRQ? */ | ||
775 | if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) | ||
776 | break; | ||
777 | |||
778 | /* Do we already have a mapping for this IOAPIC pin */ | ||
779 | if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | ||
780 | (irq->mpc_dstirq == i)) | ||
781 | break; | ||
782 | } | ||
783 | |||
784 | if (idx != mp_irq_entries) { | ||
785 | printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | ||
786 | continue; /* IRQ already used */ | ||
787 | } | ||
788 | |||
789 | intsrc.mpc_irqtype = mp_INT; | ||
790 | intsrc.mpc_srcbusirq = i; /* Identity mapped */ | ||
791 | intsrc.mpc_dstirq = i; | ||
792 | |||
793 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " | ||
794 | "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
795 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
796 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, | ||
797 | intsrc.mpc_dstirq); | ||
798 | |||
799 | mp_irqs[mp_irq_entries] = intsrc; | ||
800 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
801 | panic("Max # of irq sources exceeded!\n"); | ||
802 | } | ||
803 | } | ||
804 | |||
805 | int mp_register_gsi(u32 gsi, int triggering, int polarity) | ||
806 | { | ||
807 | int ioapic = -1; | ||
808 | int ioapic_pin = 0; | ||
809 | int idx, bit = 0; | ||
810 | |||
811 | if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) | ||
812 | return gsi; | ||
813 | |||
814 | /* Don't set up the ACPI SCI because it's already set up */ | ||
815 | if (acpi_gbl_FADT.sci_interrupt == gsi) | ||
816 | return gsi; | ||
817 | |||
818 | ioapic = mp_find_ioapic(gsi); | ||
819 | if (ioapic < 0) { | ||
820 | printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | ||
821 | return gsi; | ||
822 | } | ||
823 | |||
824 | ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
825 | |||
826 | /* | ||
827 | * Avoid pin reprogramming. PRTs typically include entries | ||
828 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
829 | * we only program the IOAPIC on the first. | ||
830 | */ | ||
831 | bit = ioapic_pin % 32; | ||
832 | idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); | ||
833 | if (idx > 3) { | ||
834 | printk(KERN_ERR "Invalid reference to IOAPIC pin " | ||
835 | "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | ||
836 | ioapic_pin); | ||
837 | return gsi; | ||
838 | } | ||
839 | if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | ||
840 | Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | ||
841 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | ||
842 | return gsi; | ||
843 | } | ||
844 | |||
845 | mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | ||
846 | |||
847 | io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | ||
848 | triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, | ||
849 | polarity == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
850 | return gsi; | ||
851 | } | ||
852 | #endif /*CONFIG_ACPI*/ | ||
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c deleted file mode 100644 index 0ec6d2ddb931..000000000000 --- a/arch/x86_64/kernel/nmi.c +++ /dev/null | |||
@@ -1,483 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/nmi.c | ||
3 | * | ||
4 | * NMI watchdog support on APIC systems | ||
5 | * | ||
6 | * Started by Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes: | ||
9 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | ||
10 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | ||
11 | * Pavel Machek and | ||
12 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | ||
13 | */ | ||
14 | |||
15 | #include <linux/nmi.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/sysdev.h> | ||
21 | #include <linux/sysctl.h> | ||
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/cpumask.h> | ||
24 | #include <linux/kdebug.h> | ||
25 | |||
26 | #include <asm/smp.h> | ||
27 | #include <asm/nmi.h> | ||
28 | #include <asm/proto.h> | ||
29 | #include <asm/mce.h> | ||
30 | |||
31 | int unknown_nmi_panic; | ||
32 | int nmi_watchdog_enabled; | ||
33 | int panic_on_unrecovered_nmi; | ||
34 | |||
35 | static cpumask_t backtrace_mask = CPU_MASK_NONE; | ||
36 | |||
37 | /* nmi_active: | ||
38 | * >0: the lapic NMI watchdog is active, but can be disabled | ||
39 | * <0: the lapic NMI watchdog has not been set up, and cannot | ||
40 | * be enabled | ||
41 | * 0: the lapic NMI watchdog is disabled, but can be enabled | ||
42 | */ | ||
43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
44 | int panic_on_timeout; | ||
45 | |||
46 | unsigned int nmi_watchdog = NMI_DEFAULT; | ||
47 | static unsigned int nmi_hz = HZ; | ||
48 | |||
49 | static DEFINE_PER_CPU(short, wd_enabled); | ||
50 | |||
51 | /* local prototypes */ | ||
52 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); | ||
53 | |||
54 | /* Run after command line and cpu_init init, but before all other checks */ | ||
55 | void nmi_watchdog_default(void) | ||
56 | { | ||
57 | if (nmi_watchdog != NMI_DEFAULT) | ||
58 | return; | ||
59 | nmi_watchdog = NMI_NONE; | ||
60 | } | ||
61 | |||
62 | static int endflag __initdata = 0; | ||
63 | |||
64 | #ifdef CONFIG_SMP | ||
65 | /* The performance counters used by NMI_LOCAL_APIC don't trigger when | ||
66 | * the CPU is idle. To make sure the NMI watchdog really ticks on all | ||
67 | * CPUs during the test make them busy. | ||
68 | */ | ||
69 | static __init void nmi_cpu_busy(void *data) | ||
70 | { | ||
71 | local_irq_enable_in_hardirq(); | ||
72 | /* Intentionally don't use cpu_relax here. This is | ||
73 | to make sure that the performance counter really ticks, | ||
74 | even if there is a simulator or similar that catches the | ||
75 | pause instruction. On a real HT machine this is fine because | ||
76 | all other CPUs are busy with "useless" delay loops and don't | ||
77 | care if they get somewhat less cycles. */ | ||
78 | while (endflag == 0) | ||
79 | mb(); | ||
80 | } | ||
81 | #endif | ||
82 | |||
83 | int __init check_nmi_watchdog (void) | ||
84 | { | ||
85 | int *counts; | ||
86 | int cpu; | ||
87 | |||
88 | if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) | ||
89 | return 0; | ||
90 | |||
91 | if (!atomic_read(&nmi_active)) | ||
92 | return 0; | ||
93 | |||
94 | counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); | ||
95 | if (!counts) | ||
96 | return -1; | ||
97 | |||
98 | printk(KERN_INFO "testing NMI watchdog ... "); | ||
99 | |||
100 | #ifdef CONFIG_SMP | ||
101 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
102 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); | ||
103 | #endif | ||
104 | |||
105 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
106 | counts[cpu] = cpu_pda(cpu)->__nmi_count; | ||
107 | local_irq_enable(); | ||
108 | mdelay((20*1000)/nmi_hz); // wait 20 ticks | ||
109 | |||
110 | for_each_online_cpu(cpu) { | ||
111 | if (!per_cpu(wd_enabled, cpu)) | ||
112 | continue; | ||
113 | if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { | ||
114 | printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", | ||
115 | cpu, | ||
116 | counts[cpu], | ||
117 | cpu_pda(cpu)->__nmi_count); | ||
118 | per_cpu(wd_enabled, cpu) = 0; | ||
119 | atomic_dec(&nmi_active); | ||
120 | } | ||
121 | } | ||
122 | if (!atomic_read(&nmi_active)) { | ||
123 | kfree(counts); | ||
124 | atomic_set(&nmi_active, -1); | ||
125 | endflag = 1; | ||
126 | return -1; | ||
127 | } | ||
128 | endflag = 1; | ||
129 | printk("OK.\n"); | ||
130 | |||
131 | /* now that we know it works we can reduce NMI frequency to | ||
132 | something more reasonable; makes a difference in some configs */ | ||
133 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
134 | nmi_hz = lapic_adjust_nmi_hz(1); | ||
135 | |||
136 | kfree(counts); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | int __init setup_nmi_watchdog(char *str) | ||
141 | { | ||
142 | int nmi; | ||
143 | |||
144 | if (!strncmp(str,"panic",5)) { | ||
145 | panic_on_timeout = 1; | ||
146 | str = strchr(str, ','); | ||
147 | if (!str) | ||
148 | return 1; | ||
149 | ++str; | ||
150 | } | ||
151 | |||
152 | get_option(&str, &nmi); | ||
153 | |||
154 | if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) | ||
155 | return 0; | ||
156 | |||
157 | nmi_watchdog = nmi; | ||
158 | return 1; | ||
159 | } | ||
160 | |||
161 | __setup("nmi_watchdog=", setup_nmi_watchdog); | ||
162 | |||
163 | |||
164 | static void __acpi_nmi_disable(void *__unused) | ||
165 | { | ||
166 | apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Disable timer based NMIs on all CPUs: | ||
171 | */ | ||
172 | void acpi_nmi_disable(void) | ||
173 | { | ||
174 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
175 | on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); | ||
176 | } | ||
177 | |||
178 | static void __acpi_nmi_enable(void *__unused) | ||
179 | { | ||
180 | apic_write(APIC_LVT0, APIC_DM_NMI); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * Enable timer based NMIs on all CPUs: | ||
185 | */ | ||
186 | void acpi_nmi_enable(void) | ||
187 | { | ||
188 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
189 | on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); | ||
190 | } | ||
191 | #ifdef CONFIG_PM | ||
192 | |||
193 | static int nmi_pm_active; /* nmi_active before suspend */ | ||
194 | |||
195 | static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) | ||
196 | { | ||
197 | /* only CPU0 goes here, other CPUs should be offline */ | ||
198 | nmi_pm_active = atomic_read(&nmi_active); | ||
199 | stop_apic_nmi_watchdog(NULL); | ||
200 | BUG_ON(atomic_read(&nmi_active) != 0); | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | static int lapic_nmi_resume(struct sys_device *dev) | ||
205 | { | ||
206 | /* only CPU0 goes here, other CPUs should be offline */ | ||
207 | if (nmi_pm_active > 0) { | ||
208 | setup_apic_nmi_watchdog(NULL); | ||
209 | touch_nmi_watchdog(); | ||
210 | } | ||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | static struct sysdev_class nmi_sysclass = { | ||
215 | set_kset_name("lapic_nmi"), | ||
216 | .resume = lapic_nmi_resume, | ||
217 | .suspend = lapic_nmi_suspend, | ||
218 | }; | ||
219 | |||
220 | static struct sys_device device_lapic_nmi = { | ||
221 | .id = 0, | ||
222 | .cls = &nmi_sysclass, | ||
223 | }; | ||
224 | |||
225 | static int __init init_lapic_nmi_sysfs(void) | ||
226 | { | ||
227 | int error; | ||
228 | |||
229 | /* should really be a BUG_ON but b/c this is an | ||
230 | * init call, it just doesn't work. -dcz | ||
231 | */ | ||
232 | if (nmi_watchdog != NMI_LOCAL_APIC) | ||
233 | return 0; | ||
234 | |||
235 | if ( atomic_read(&nmi_active) < 0 ) | ||
236 | return 0; | ||
237 | |||
238 | error = sysdev_class_register(&nmi_sysclass); | ||
239 | if (!error) | ||
240 | error = sysdev_register(&device_lapic_nmi); | ||
241 | return error; | ||
242 | } | ||
243 | /* must come after the local APIC's device_initcall() */ | ||
244 | late_initcall(init_lapic_nmi_sysfs); | ||
245 | |||
246 | #endif /* CONFIG_PM */ | ||
247 | |||
248 | void setup_apic_nmi_watchdog(void *unused) | ||
249 | { | ||
250 | if (__get_cpu_var(wd_enabled) == 1) | ||
251 | return; | ||
252 | |||
253 | /* cheap hack to support suspend/resume */ | ||
254 | /* if cpu0 is not active neither should the other cpus */ | ||
255 | if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) | ||
256 | return; | ||
257 | |||
258 | switch (nmi_watchdog) { | ||
259 | case NMI_LOCAL_APIC: | ||
260 | __get_cpu_var(wd_enabled) = 1; | ||
261 | if (lapic_watchdog_init(nmi_hz) < 0) { | ||
262 | __get_cpu_var(wd_enabled) = 0; | ||
263 | return; | ||
264 | } | ||
265 | /* FALL THROUGH */ | ||
266 | case NMI_IO_APIC: | ||
267 | __get_cpu_var(wd_enabled) = 1; | ||
268 | atomic_inc(&nmi_active); | ||
269 | } | ||
270 | } | ||
271 | |||
272 | void stop_apic_nmi_watchdog(void *unused) | ||
273 | { | ||
274 | /* only support LOCAL and IO APICs for now */ | ||
275 | if ((nmi_watchdog != NMI_LOCAL_APIC) && | ||
276 | (nmi_watchdog != NMI_IO_APIC)) | ||
277 | return; | ||
278 | if (__get_cpu_var(wd_enabled) == 0) | ||
279 | return; | ||
280 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
281 | lapic_watchdog_stop(); | ||
282 | __get_cpu_var(wd_enabled) = 0; | ||
283 | atomic_dec(&nmi_active); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * the best way to detect whether a CPU has a 'hard lockup' problem | ||
288 | * is to check it's local APIC timer IRQ counts. If they are not | ||
289 | * changing then that CPU has some problem. | ||
290 | * | ||
291 | * as these watchdog NMI IRQs are generated on every CPU, we only | ||
292 | * have to check the current processor. | ||
293 | */ | ||
294 | |||
295 | static DEFINE_PER_CPU(unsigned, last_irq_sum); | ||
296 | static DEFINE_PER_CPU(local_t, alert_counter); | ||
297 | static DEFINE_PER_CPU(int, nmi_touch); | ||
298 | |||
299 | void touch_nmi_watchdog(void) | ||
300 | { | ||
301 | if (nmi_watchdog > 0) { | ||
302 | unsigned cpu; | ||
303 | |||
304 | /* | ||
305 | * Tell other CPUs to reset their alert counters. We cannot | ||
306 | * do it ourselves because the alert count increase is not | ||
307 | * atomic. | ||
308 | */ | ||
309 | for_each_present_cpu(cpu) { | ||
310 | if (per_cpu(nmi_touch, cpu) != 1) | ||
311 | per_cpu(nmi_touch, cpu) = 1; | ||
312 | } | ||
313 | } | ||
314 | |||
315 | touch_softlockup_watchdog(); | ||
316 | } | ||
317 | |||
318 | int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) | ||
319 | { | ||
320 | int sum; | ||
321 | int touched = 0; | ||
322 | int cpu = smp_processor_id(); | ||
323 | int rc = 0; | ||
324 | |||
325 | /* check for other users first */ | ||
326 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
327 | == NOTIFY_STOP) { | ||
328 | rc = 1; | ||
329 | touched = 1; | ||
330 | } | ||
331 | |||
332 | sum = read_pda(apic_timer_irqs); | ||
333 | if (__get_cpu_var(nmi_touch)) { | ||
334 | __get_cpu_var(nmi_touch) = 0; | ||
335 | touched = 1; | ||
336 | } | ||
337 | |||
338 | if (cpu_isset(cpu, backtrace_mask)) { | ||
339 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ | ||
340 | |||
341 | spin_lock(&lock); | ||
342 | printk("NMI backtrace for cpu %d\n", cpu); | ||
343 | dump_stack(); | ||
344 | spin_unlock(&lock); | ||
345 | cpu_clear(cpu, backtrace_mask); | ||
346 | } | ||
347 | |||
348 | #ifdef CONFIG_X86_MCE | ||
349 | /* Could check oops_in_progress here too, but it's safer | ||
350 | not too */ | ||
351 | if (atomic_read(&mce_entry) > 0) | ||
352 | touched = 1; | ||
353 | #endif | ||
354 | /* if the apic timer isn't firing, this cpu isn't doing much */ | ||
355 | if (!touched && __get_cpu_var(last_irq_sum) == sum) { | ||
356 | /* | ||
357 | * Ayiee, looks like this CPU is stuck ... | ||
358 | * wait a few IRQs (5 seconds) before doing the oops ... | ||
359 | */ | ||
360 | local_inc(&__get_cpu_var(alert_counter)); | ||
361 | if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) | ||
362 | die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, | ||
363 | panic_on_timeout); | ||
364 | } else { | ||
365 | __get_cpu_var(last_irq_sum) = sum; | ||
366 | local_set(&__get_cpu_var(alert_counter), 0); | ||
367 | } | ||
368 | |||
369 | /* see if the nmi watchdog went off */ | ||
370 | if (!__get_cpu_var(wd_enabled)) | ||
371 | return rc; | ||
372 | switch (nmi_watchdog) { | ||
373 | case NMI_LOCAL_APIC: | ||
374 | rc |= lapic_wd_event(nmi_hz); | ||
375 | break; | ||
376 | case NMI_IO_APIC: | ||
377 | /* don't know how to accurately check for this. | ||
378 | * just assume it was a watchdog timer interrupt | ||
379 | * This matches the old behaviour. | ||
380 | */ | ||
381 | rc = 1; | ||
382 | break; | ||
383 | } | ||
384 | return rc; | ||
385 | } | ||
386 | |||
387 | static unsigned ignore_nmis; | ||
388 | |||
389 | asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) | ||
390 | { | ||
391 | nmi_enter(); | ||
392 | add_pda(__nmi_count,1); | ||
393 | if (!ignore_nmis) | ||
394 | default_do_nmi(regs); | ||
395 | nmi_exit(); | ||
396 | } | ||
397 | |||
398 | int do_nmi_callback(struct pt_regs * regs, int cpu) | ||
399 | { | ||
400 | #ifdef CONFIG_SYSCTL | ||
401 | if (unknown_nmi_panic) | ||
402 | return unknown_nmi_panic_callback(regs, cpu); | ||
403 | #endif | ||
404 | return 0; | ||
405 | } | ||
406 | |||
407 | void stop_nmi(void) | ||
408 | { | ||
409 | acpi_nmi_disable(); | ||
410 | ignore_nmis++; | ||
411 | } | ||
412 | |||
413 | void restart_nmi(void) | ||
414 | { | ||
415 | ignore_nmis--; | ||
416 | acpi_nmi_enable(); | ||
417 | } | ||
418 | |||
419 | #ifdef CONFIG_SYSCTL | ||
420 | |||
421 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | ||
422 | { | ||
423 | unsigned char reason = get_nmi_reason(); | ||
424 | char buf[64]; | ||
425 | |||
426 | sprintf(buf, "NMI received for unknown reason %02x\n", reason); | ||
427 | die_nmi(buf, regs, 1); /* Always panic here */ | ||
428 | return 0; | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * proc handler for /proc/sys/kernel/nmi | ||
433 | */ | ||
434 | int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, | ||
435 | void __user *buffer, size_t *length, loff_t *ppos) | ||
436 | { | ||
437 | int old_state; | ||
438 | |||
439 | nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; | ||
440 | old_state = nmi_watchdog_enabled; | ||
441 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
442 | if (!!old_state == !!nmi_watchdog_enabled) | ||
443 | return 0; | ||
444 | |||
445 | if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { | ||
446 | printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); | ||
447 | return -EIO; | ||
448 | } | ||
449 | |||
450 | /* if nmi_watchdog is not set yet, then set it */ | ||
451 | nmi_watchdog_default(); | ||
452 | |||
453 | if (nmi_watchdog == NMI_LOCAL_APIC) { | ||
454 | if (nmi_watchdog_enabled) | ||
455 | enable_lapic_nmi_watchdog(); | ||
456 | else | ||
457 | disable_lapic_nmi_watchdog(); | ||
458 | } else { | ||
459 | printk( KERN_WARNING | ||
460 | "NMI watchdog doesn't know what hardware to touch\n"); | ||
461 | return -EIO; | ||
462 | } | ||
463 | return 0; | ||
464 | } | ||
465 | |||
466 | #endif | ||
467 | |||
468 | void __trigger_all_cpu_backtrace(void) | ||
469 | { | ||
470 | int i; | ||
471 | |||
472 | backtrace_mask = cpu_online_map; | ||
473 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | ||
474 | for (i = 0; i < 10 * 1000; i++) { | ||
475 | if (cpus_empty(backtrace_mask)) | ||
476 | break; | ||
477 | mdelay(1); | ||
478 | } | ||
479 | } | ||
480 | |||
481 | EXPORT_SYMBOL(nmi_active); | ||
482 | EXPORT_SYMBOL(nmi_watchdog); | ||
483 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c deleted file mode 100644 index 71da01e73f03..000000000000 --- a/arch/x86_64/kernel/pci-calgary.c +++ /dev/null | |||
@@ -1,1578 +0,0 @@ | |||
1 | /* | ||
2 | * Derived from arch/powerpc/kernel/iommu.c | ||
3 | * | ||
4 | * Copyright IBM Corporation, 2006-2007 | ||
5 | * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> | ||
6 | * | ||
7 | * Author: Jon Mason <jdmason@kudzu.us> | ||
8 | * Author: Muli Ben-Yehuda <muli@il.ibm.com> | ||
9 | |||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or | ||
13 | * (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 | */ | ||
24 | |||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/mm.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/string.h> | ||
32 | #include <linux/dma-mapping.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/bitops.h> | ||
35 | #include <linux/pci_ids.h> | ||
36 | #include <linux/pci.h> | ||
37 | #include <linux/delay.h> | ||
38 | #include <asm/iommu.h> | ||
39 | #include <asm/calgary.h> | ||
40 | #include <asm/tce.h> | ||
41 | #include <asm/pci-direct.h> | ||
42 | #include <asm/system.h> | ||
43 | #include <asm/dma.h> | ||
44 | #include <asm/rio.h> | ||
45 | |||
46 | #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT | ||
47 | int use_calgary __read_mostly = 1; | ||
48 | #else | ||
49 | int use_calgary __read_mostly = 0; | ||
50 | #endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ | ||
51 | |||
52 | #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 | ||
53 | #define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 | ||
54 | |||
55 | /* register offsets inside the host bridge space */ | ||
56 | #define CALGARY_CONFIG_REG 0x0108 | ||
57 | #define PHB_CSR_OFFSET 0x0110 /* Channel Status */ | ||
58 | #define PHB_PLSSR_OFFSET 0x0120 | ||
59 | #define PHB_CONFIG_RW_OFFSET 0x0160 | ||
60 | #define PHB_IOBASE_BAR_LOW 0x0170 | ||
61 | #define PHB_IOBASE_BAR_HIGH 0x0180 | ||
62 | #define PHB_MEM_1_LOW 0x0190 | ||
63 | #define PHB_MEM_1_HIGH 0x01A0 | ||
64 | #define PHB_IO_ADDR_SIZE 0x01B0 | ||
65 | #define PHB_MEM_1_SIZE 0x01C0 | ||
66 | #define PHB_MEM_ST_OFFSET 0x01D0 | ||
67 | #define PHB_AER_OFFSET 0x0200 | ||
68 | #define PHB_CONFIG_0_HIGH 0x0220 | ||
69 | #define PHB_CONFIG_0_LOW 0x0230 | ||
70 | #define PHB_CONFIG_0_END 0x0240 | ||
71 | #define PHB_MEM_2_LOW 0x02B0 | ||
72 | #define PHB_MEM_2_HIGH 0x02C0 | ||
73 | #define PHB_MEM_2_SIZE_HIGH 0x02D0 | ||
74 | #define PHB_MEM_2_SIZE_LOW 0x02E0 | ||
75 | #define PHB_DOSHOLE_OFFSET 0x08E0 | ||
76 | |||
77 | /* CalIOC2 specific */ | ||
78 | #define PHB_SAVIOR_L2 0x0DB0 | ||
79 | #define PHB_PAGE_MIG_CTRL 0x0DA8 | ||
80 | #define PHB_PAGE_MIG_DEBUG 0x0DA0 | ||
81 | #define PHB_ROOT_COMPLEX_STATUS 0x0CB0 | ||
82 | |||
83 | /* PHB_CONFIG_RW */ | ||
84 | #define PHB_TCE_ENABLE 0x20000000 | ||
85 | #define PHB_SLOT_DISABLE 0x1C000000 | ||
86 | #define PHB_DAC_DISABLE 0x01000000 | ||
87 | #define PHB_MEM2_ENABLE 0x00400000 | ||
88 | #define PHB_MCSR_ENABLE 0x00100000 | ||
89 | /* TAR (Table Address Register) */ | ||
90 | #define TAR_SW_BITS 0x0000ffffffff800fUL | ||
91 | #define TAR_VALID 0x0000000000000008UL | ||
92 | /* CSR (Channel/DMA Status Register) */ | ||
93 | #define CSR_AGENT_MASK 0xffe0ffff | ||
94 | /* CCR (Calgary Configuration Register) */ | ||
95 | #define CCR_2SEC_TIMEOUT 0x000000000000000EUL | ||
96 | /* PMCR/PMDR (Page Migration Control/Debug Registers */ | ||
97 | #define PMR_SOFTSTOP 0x80000000 | ||
98 | #define PMR_SOFTSTOPFAULT 0x40000000 | ||
99 | #define PMR_HARDSTOP 0x20000000 | ||
100 | |||
101 | #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ | ||
102 | #define MAX_NUM_CHASSIS 8 /* max number of chassis */ | ||
103 | /* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ | ||
104 | #define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) | ||
105 | #define PHBS_PER_CALGARY 4 | ||
106 | |||
107 | /* register offsets in Calgary's internal register space */ | ||
108 | static const unsigned long tar_offsets[] = { | ||
109 | 0x0580 /* TAR0 */, | ||
110 | 0x0588 /* TAR1 */, | ||
111 | 0x0590 /* TAR2 */, | ||
112 | 0x0598 /* TAR3 */ | ||
113 | }; | ||
114 | |||
115 | static const unsigned long split_queue_offsets[] = { | ||
116 | 0x4870 /* SPLIT QUEUE 0 */, | ||
117 | 0x5870 /* SPLIT QUEUE 1 */, | ||
118 | 0x6870 /* SPLIT QUEUE 2 */, | ||
119 | 0x7870 /* SPLIT QUEUE 3 */ | ||
120 | }; | ||
121 | |||
122 | static const unsigned long phb_offsets[] = { | ||
123 | 0x8000 /* PHB0 */, | ||
124 | 0x9000 /* PHB1 */, | ||
125 | 0xA000 /* PHB2 */, | ||
126 | 0xB000 /* PHB3 */ | ||
127 | }; | ||
128 | |||
129 | /* PHB debug registers */ | ||
130 | |||
131 | static const unsigned long phb_debug_offsets[] = { | ||
132 | 0x4000 /* PHB 0 DEBUG */, | ||
133 | 0x5000 /* PHB 1 DEBUG */, | ||
134 | 0x6000 /* PHB 2 DEBUG */, | ||
135 | 0x7000 /* PHB 3 DEBUG */ | ||
136 | }; | ||
137 | |||
138 | /* | ||
139 | * STUFF register for each debug PHB, | ||
140 | * byte 1 = start bus number, byte 2 = end bus number | ||
141 | */ | ||
142 | |||
143 | #define PHB_DEBUG_STUFF_OFFSET 0x0020 | ||
144 | |||
145 | #define EMERGENCY_PAGES 32 /* = 128KB */ | ||
146 | |||
147 | unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; | ||
148 | static int translate_empty_slots __read_mostly = 0; | ||
149 | static int calgary_detected __read_mostly = 0; | ||
150 | |||
151 | static struct rio_table_hdr *rio_table_hdr __initdata; | ||
152 | static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; | ||
153 | static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; | ||
154 | |||
155 | struct calgary_bus_info { | ||
156 | void *tce_space; | ||
157 | unsigned char translation_disabled; | ||
158 | signed char phbid; | ||
159 | void __iomem *bbar; | ||
160 | }; | ||
161 | |||
162 | static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); | ||
163 | static void calgary_tce_cache_blast(struct iommu_table *tbl); | ||
164 | static void calgary_dump_error_regs(struct iommu_table *tbl); | ||
165 | static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); | ||
166 | static void calioc2_tce_cache_blast(struct iommu_table *tbl); | ||
167 | static void calioc2_dump_error_regs(struct iommu_table *tbl); | ||
168 | |||
169 | static struct cal_chipset_ops calgary_chip_ops = { | ||
170 | .handle_quirks = calgary_handle_quirks, | ||
171 | .tce_cache_blast = calgary_tce_cache_blast, | ||
172 | .dump_error_regs = calgary_dump_error_regs | ||
173 | }; | ||
174 | |||
175 | static struct cal_chipset_ops calioc2_chip_ops = { | ||
176 | .handle_quirks = calioc2_handle_quirks, | ||
177 | .tce_cache_blast = calioc2_tce_cache_blast, | ||
178 | .dump_error_regs = calioc2_dump_error_regs | ||
179 | }; | ||
180 | |||
181 | static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; | ||
182 | |||
183 | /* enable this to stress test the chip's TCE cache */ | ||
184 | #ifdef CONFIG_IOMMU_DEBUG | ||
185 | int debugging __read_mostly = 1; | ||
186 | |||
187 | static inline unsigned long verify_bit_range(unsigned long* bitmap, | ||
188 | int expected, unsigned long start, unsigned long end) | ||
189 | { | ||
190 | unsigned long idx = start; | ||
191 | |||
192 | BUG_ON(start >= end); | ||
193 | |||
194 | while (idx < end) { | ||
195 | if (!!test_bit(idx, bitmap) != expected) | ||
196 | return idx; | ||
197 | ++idx; | ||
198 | } | ||
199 | |||
200 | /* all bits have the expected value */ | ||
201 | return ~0UL; | ||
202 | } | ||
203 | #else /* debugging is disabled */ | ||
204 | int debugging __read_mostly = 0; | ||
205 | |||
206 | static inline unsigned long verify_bit_range(unsigned long* bitmap, | ||
207 | int expected, unsigned long start, unsigned long end) | ||
208 | { | ||
209 | return ~0UL; | ||
210 | } | ||
211 | |||
212 | #endif /* CONFIG_IOMMU_DEBUG */ | ||
213 | |||
214 | static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) | ||
215 | { | ||
216 | unsigned int npages; | ||
217 | |||
218 | npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); | ||
219 | npages >>= PAGE_SHIFT; | ||
220 | |||
221 | return npages; | ||
222 | } | ||
223 | |||
224 | static inline int translate_phb(struct pci_dev* dev) | ||
225 | { | ||
226 | int disabled = bus_info[dev->bus->number].translation_disabled; | ||
227 | return !disabled; | ||
228 | } | ||
229 | |||
230 | static void iommu_range_reserve(struct iommu_table *tbl, | ||
231 | unsigned long start_addr, unsigned int npages) | ||
232 | { | ||
233 | unsigned long index; | ||
234 | unsigned long end; | ||
235 | unsigned long badbit; | ||
236 | unsigned long flags; | ||
237 | |||
238 | index = start_addr >> PAGE_SHIFT; | ||
239 | |||
240 | /* bail out if we're asked to reserve a region we don't cover */ | ||
241 | if (index >= tbl->it_size) | ||
242 | return; | ||
243 | |||
244 | end = index + npages; | ||
245 | if (end > tbl->it_size) /* don't go off the table */ | ||
246 | end = tbl->it_size; | ||
247 | |||
248 | spin_lock_irqsave(&tbl->it_lock, flags); | ||
249 | |||
250 | badbit = verify_bit_range(tbl->it_map, 0, index, end); | ||
251 | if (badbit != ~0UL) { | ||
252 | if (printk_ratelimit()) | ||
253 | printk(KERN_ERR "Calgary: entry already allocated at " | ||
254 | "0x%lx tbl %p dma 0x%lx npages %u\n", | ||
255 | badbit, tbl, start_addr, npages); | ||
256 | } | ||
257 | |||
258 | set_bit_string(tbl->it_map, index, npages); | ||
259 | |||
260 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
261 | } | ||
262 | |||
263 | static unsigned long iommu_range_alloc(struct iommu_table *tbl, | ||
264 | unsigned int npages) | ||
265 | { | ||
266 | unsigned long flags; | ||
267 | unsigned long offset; | ||
268 | |||
269 | BUG_ON(npages == 0); | ||
270 | |||
271 | spin_lock_irqsave(&tbl->it_lock, flags); | ||
272 | |||
273 | offset = find_next_zero_string(tbl->it_map, tbl->it_hint, | ||
274 | tbl->it_size, npages); | ||
275 | if (offset == ~0UL) { | ||
276 | tbl->chip_ops->tce_cache_blast(tbl); | ||
277 | offset = find_next_zero_string(tbl->it_map, 0, | ||
278 | tbl->it_size, npages); | ||
279 | if (offset == ~0UL) { | ||
280 | printk(KERN_WARNING "Calgary: IOMMU full.\n"); | ||
281 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
282 | if (panic_on_overflow) | ||
283 | panic("Calgary: fix the allocator.\n"); | ||
284 | else | ||
285 | return bad_dma_address; | ||
286 | } | ||
287 | } | ||
288 | |||
289 | set_bit_string(tbl->it_map, offset, npages); | ||
290 | tbl->it_hint = offset + npages; | ||
291 | BUG_ON(tbl->it_hint > tbl->it_size); | ||
292 | |||
293 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
294 | |||
295 | return offset; | ||
296 | } | ||
297 | |||
298 | static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, | ||
299 | unsigned int npages, int direction) | ||
300 | { | ||
301 | unsigned long entry; | ||
302 | dma_addr_t ret = bad_dma_address; | ||
303 | |||
304 | entry = iommu_range_alloc(tbl, npages); | ||
305 | |||
306 | if (unlikely(entry == bad_dma_address)) | ||
307 | goto error; | ||
308 | |||
309 | /* set the return dma address */ | ||
310 | ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); | ||
311 | |||
312 | /* put the TCEs in the HW table */ | ||
313 | tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, | ||
314 | direction); | ||
315 | |||
316 | return ret; | ||
317 | |||
318 | error: | ||
319 | printk(KERN_WARNING "Calgary: failed to allocate %u pages in " | ||
320 | "iommu %p\n", npages, tbl); | ||
321 | return bad_dma_address; | ||
322 | } | ||
323 | |||
324 | static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, | ||
325 | unsigned int npages) | ||
326 | { | ||
327 | unsigned long entry; | ||
328 | unsigned long badbit; | ||
329 | unsigned long badend; | ||
330 | unsigned long flags; | ||
331 | |||
332 | /* were we called with bad_dma_address? */ | ||
333 | badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); | ||
334 | if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { | ||
335 | printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " | ||
336 | "address 0x%Lx\n", dma_addr); | ||
337 | WARN_ON(1); | ||
338 | return; | ||
339 | } | ||
340 | |||
341 | entry = dma_addr >> PAGE_SHIFT; | ||
342 | |||
343 | BUG_ON(entry + npages > tbl->it_size); | ||
344 | |||
345 | tce_free(tbl, entry, npages); | ||
346 | |||
347 | spin_lock_irqsave(&tbl->it_lock, flags); | ||
348 | |||
349 | badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); | ||
350 | if (badbit != ~0UL) { | ||
351 | if (printk_ratelimit()) | ||
352 | printk(KERN_ERR "Calgary: bit is off at 0x%lx " | ||
353 | "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", | ||
354 | badbit, tbl, dma_addr, entry, npages); | ||
355 | } | ||
356 | |||
357 | __clear_bit_string(tbl->it_map, entry, npages); | ||
358 | |||
359 | spin_unlock_irqrestore(&tbl->it_lock, flags); | ||
360 | } | ||
361 | |||
362 | static inline struct iommu_table *find_iommu_table(struct device *dev) | ||
363 | { | ||
364 | struct pci_dev *pdev; | ||
365 | struct pci_bus *pbus; | ||
366 | struct iommu_table *tbl; | ||
367 | |||
368 | pdev = to_pci_dev(dev); | ||
369 | |||
370 | pbus = pdev->bus; | ||
371 | |||
372 | /* is the device behind a bridge? Look for the root bus */ | ||
373 | while (pbus->parent) | ||
374 | pbus = pbus->parent; | ||
375 | |||
376 | tbl = pci_iommu(pbus); | ||
377 | |||
378 | BUG_ON(tbl && (tbl->it_busno != pbus->number)); | ||
379 | |||
380 | return tbl; | ||
381 | } | ||
382 | |||
383 | static void calgary_unmap_sg(struct device *dev, | ||
384 | struct scatterlist *sglist, int nelems, int direction) | ||
385 | { | ||
386 | struct iommu_table *tbl = find_iommu_table(dev); | ||
387 | |||
388 | if (!translate_phb(to_pci_dev(dev))) | ||
389 | return; | ||
390 | |||
391 | while (nelems--) { | ||
392 | unsigned int npages; | ||
393 | dma_addr_t dma = sglist->dma_address; | ||
394 | unsigned int dmalen = sglist->dma_length; | ||
395 | |||
396 | if (dmalen == 0) | ||
397 | break; | ||
398 | |||
399 | npages = num_dma_pages(dma, dmalen); | ||
400 | iommu_free(tbl, dma, npages); | ||
401 | sglist++; | ||
402 | } | ||
403 | } | ||
404 | |||
405 | static int calgary_nontranslate_map_sg(struct device* dev, | ||
406 | struct scatterlist *sg, int nelems, int direction) | ||
407 | { | ||
408 | int i; | ||
409 | |||
410 | for (i = 0; i < nelems; i++ ) { | ||
411 | struct scatterlist *s = &sg[i]; | ||
412 | BUG_ON(!s->page); | ||
413 | s->dma_address = virt_to_bus(page_address(s->page) +s->offset); | ||
414 | s->dma_length = s->length; | ||
415 | } | ||
416 | return nelems; | ||
417 | } | ||
418 | |||
419 | static int calgary_map_sg(struct device *dev, struct scatterlist *sg, | ||
420 | int nelems, int direction) | ||
421 | { | ||
422 | struct iommu_table *tbl = find_iommu_table(dev); | ||
423 | unsigned long vaddr; | ||
424 | unsigned int npages; | ||
425 | unsigned long entry; | ||
426 | int i; | ||
427 | |||
428 | if (!translate_phb(to_pci_dev(dev))) | ||
429 | return calgary_nontranslate_map_sg(dev, sg, nelems, direction); | ||
430 | |||
431 | for (i = 0; i < nelems; i++ ) { | ||
432 | struct scatterlist *s = &sg[i]; | ||
433 | BUG_ON(!s->page); | ||
434 | |||
435 | vaddr = (unsigned long)page_address(s->page) + s->offset; | ||
436 | npages = num_dma_pages(vaddr, s->length); | ||
437 | |||
438 | entry = iommu_range_alloc(tbl, npages); | ||
439 | if (entry == bad_dma_address) { | ||
440 | /* makes sure unmap knows to stop */ | ||
441 | s->dma_length = 0; | ||
442 | goto error; | ||
443 | } | ||
444 | |||
445 | s->dma_address = (entry << PAGE_SHIFT) | s->offset; | ||
446 | |||
447 | /* insert into HW table */ | ||
448 | tce_build(tbl, entry, npages, vaddr & PAGE_MASK, | ||
449 | direction); | ||
450 | |||
451 | s->dma_length = s->length; | ||
452 | } | ||
453 | |||
454 | return nelems; | ||
455 | error: | ||
456 | calgary_unmap_sg(dev, sg, nelems, direction); | ||
457 | for (i = 0; i < nelems; i++) { | ||
458 | sg[i].dma_address = bad_dma_address; | ||
459 | sg[i].dma_length = 0; | ||
460 | } | ||
461 | return 0; | ||
462 | } | ||
463 | |||
464 | static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, | ||
465 | size_t size, int direction) | ||
466 | { | ||
467 | dma_addr_t dma_handle = bad_dma_address; | ||
468 | unsigned long uaddr; | ||
469 | unsigned int npages; | ||
470 | struct iommu_table *tbl = find_iommu_table(dev); | ||
471 | |||
472 | uaddr = (unsigned long)vaddr; | ||
473 | npages = num_dma_pages(uaddr, size); | ||
474 | |||
475 | if (translate_phb(to_pci_dev(dev))) | ||
476 | dma_handle = iommu_alloc(tbl, vaddr, npages, direction); | ||
477 | else | ||
478 | dma_handle = virt_to_bus(vaddr); | ||
479 | |||
480 | return dma_handle; | ||
481 | } | ||
482 | |||
483 | static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, | ||
484 | size_t size, int direction) | ||
485 | { | ||
486 | struct iommu_table *tbl = find_iommu_table(dev); | ||
487 | unsigned int npages; | ||
488 | |||
489 | if (!translate_phb(to_pci_dev(dev))) | ||
490 | return; | ||
491 | |||
492 | npages = num_dma_pages(dma_handle, size); | ||
493 | iommu_free(tbl, dma_handle, npages); | ||
494 | } | ||
495 | |||
496 | static void* calgary_alloc_coherent(struct device *dev, size_t size, | ||
497 | dma_addr_t *dma_handle, gfp_t flag) | ||
498 | { | ||
499 | void *ret = NULL; | ||
500 | dma_addr_t mapping; | ||
501 | unsigned int npages, order; | ||
502 | struct iommu_table *tbl = find_iommu_table(dev); | ||
503 | |||
504 | size = PAGE_ALIGN(size); /* size rounded up to full pages */ | ||
505 | npages = size >> PAGE_SHIFT; | ||
506 | order = get_order(size); | ||
507 | |||
508 | /* alloc enough pages (and possibly more) */ | ||
509 | ret = (void *)__get_free_pages(flag, order); | ||
510 | if (!ret) | ||
511 | goto error; | ||
512 | memset(ret, 0, size); | ||
513 | |||
514 | if (translate_phb(to_pci_dev(dev))) { | ||
515 | /* set up tces to cover the allocated range */ | ||
516 | mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); | ||
517 | if (mapping == bad_dma_address) | ||
518 | goto free; | ||
519 | |||
520 | *dma_handle = mapping; | ||
521 | } else /* non translated slot */ | ||
522 | *dma_handle = virt_to_bus(ret); | ||
523 | |||
524 | return ret; | ||
525 | |||
526 | free: | ||
527 | free_pages((unsigned long)ret, get_order(size)); | ||
528 | ret = NULL; | ||
529 | error: | ||
530 | return ret; | ||
531 | } | ||
532 | |||
533 | static const struct dma_mapping_ops calgary_dma_ops = { | ||
534 | .alloc_coherent = calgary_alloc_coherent, | ||
535 | .map_single = calgary_map_single, | ||
536 | .unmap_single = calgary_unmap_single, | ||
537 | .map_sg = calgary_map_sg, | ||
538 | .unmap_sg = calgary_unmap_sg, | ||
539 | }; | ||
540 | |||
541 | static inline void __iomem * busno_to_bbar(unsigned char num) | ||
542 | { | ||
543 | return bus_info[num].bbar; | ||
544 | } | ||
545 | |||
546 | static inline int busno_to_phbid(unsigned char num) | ||
547 | { | ||
548 | return bus_info[num].phbid; | ||
549 | } | ||
550 | |||
551 | static inline unsigned long split_queue_offset(unsigned char num) | ||
552 | { | ||
553 | size_t idx = busno_to_phbid(num); | ||
554 | |||
555 | return split_queue_offsets[idx]; | ||
556 | } | ||
557 | |||
558 | static inline unsigned long tar_offset(unsigned char num) | ||
559 | { | ||
560 | size_t idx = busno_to_phbid(num); | ||
561 | |||
562 | return tar_offsets[idx]; | ||
563 | } | ||
564 | |||
565 | static inline unsigned long phb_offset(unsigned char num) | ||
566 | { | ||
567 | size_t idx = busno_to_phbid(num); | ||
568 | |||
569 | return phb_offsets[idx]; | ||
570 | } | ||
571 | |||
572 | static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) | ||
573 | { | ||
574 | unsigned long target = ((unsigned long)bar) | offset; | ||
575 | return (void __iomem*)target; | ||
576 | } | ||
577 | |||
578 | static inline int is_calioc2(unsigned short device) | ||
579 | { | ||
580 | return (device == PCI_DEVICE_ID_IBM_CALIOC2); | ||
581 | } | ||
582 | |||
583 | static inline int is_calgary(unsigned short device) | ||
584 | { | ||
585 | return (device == PCI_DEVICE_ID_IBM_CALGARY); | ||
586 | } | ||
587 | |||
588 | static inline int is_cal_pci_dev(unsigned short device) | ||
589 | { | ||
590 | return (is_calgary(device) || is_calioc2(device)); | ||
591 | } | ||
592 | |||
593 | static void calgary_tce_cache_blast(struct iommu_table *tbl) | ||
594 | { | ||
595 | u64 val; | ||
596 | u32 aer; | ||
597 | int i = 0; | ||
598 | void __iomem *bbar = tbl->bbar; | ||
599 | void __iomem *target; | ||
600 | |||
601 | /* disable arbitration on the bus */ | ||
602 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); | ||
603 | aer = readl(target); | ||
604 | writel(0, target); | ||
605 | |||
606 | /* read plssr to ensure it got there */ | ||
607 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); | ||
608 | val = readl(target); | ||
609 | |||
610 | /* poll split queues until all DMA activity is done */ | ||
611 | target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); | ||
612 | do { | ||
613 | val = readq(target); | ||
614 | i++; | ||
615 | } while ((val & 0xff) != 0xff && i < 100); | ||
616 | if (i == 100) | ||
617 | printk(KERN_WARNING "Calgary: PCI bus not quiesced, " | ||
618 | "continuing anyway\n"); | ||
619 | |||
620 | /* invalidate TCE cache */ | ||
621 | target = calgary_reg(bbar, tar_offset(tbl->it_busno)); | ||
622 | writeq(tbl->tar_val, target); | ||
623 | |||
624 | /* enable arbitration */ | ||
625 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); | ||
626 | writel(aer, target); | ||
627 | (void)readl(target); /* flush */ | ||
628 | } | ||
629 | |||
630 | static void calioc2_tce_cache_blast(struct iommu_table *tbl) | ||
631 | { | ||
632 | void __iomem *bbar = tbl->bbar; | ||
633 | void __iomem *target; | ||
634 | u64 val64; | ||
635 | u32 val; | ||
636 | int i = 0; | ||
637 | int count = 1; | ||
638 | unsigned char bus = tbl->it_busno; | ||
639 | |||
640 | begin: | ||
641 | printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " | ||
642 | "sequence - count %d\n", bus, count); | ||
643 | |||
644 | /* 1. using the Page Migration Control reg set SoftStop */ | ||
645 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
646 | val = be32_to_cpu(readl(target)); | ||
647 | printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); | ||
648 | val |= PMR_SOFTSTOP; | ||
649 | printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); | ||
650 | writel(cpu_to_be32(val), target); | ||
651 | |||
652 | /* 2. poll split queues until all DMA activity is done */ | ||
653 | printk(KERN_DEBUG "2a. starting to poll split queues\n"); | ||
654 | target = calgary_reg(bbar, split_queue_offset(bus)); | ||
655 | do { | ||
656 | val64 = readq(target); | ||
657 | i++; | ||
658 | } while ((val64 & 0xff) != 0xff && i < 100); | ||
659 | if (i == 100) | ||
660 | printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " | ||
661 | "continuing anyway\n"); | ||
662 | |||
663 | /* 3. poll Page Migration DEBUG for SoftStopFault */ | ||
664 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); | ||
665 | val = be32_to_cpu(readl(target)); | ||
666 | printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); | ||
667 | |||
668 | /* 4. if SoftStopFault - goto (1) */ | ||
669 | if (val & PMR_SOFTSTOPFAULT) { | ||
670 | if (++count < 100) | ||
671 | goto begin; | ||
672 | else { | ||
673 | printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " | ||
674 | "aborting TCE cache flush sequence!\n"); | ||
675 | return; /* pray for the best */ | ||
676 | } | ||
677 | } | ||
678 | |||
679 | /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ | ||
680 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
681 | printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); | ||
682 | val = be32_to_cpu(readl(target)); | ||
683 | printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); | ||
684 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); | ||
685 | val = be32_to_cpu(readl(target)); | ||
686 | printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); | ||
687 | |||
688 | /* 6. invalidate TCE cache */ | ||
689 | printk(KERN_DEBUG "6. invalidating TCE cache\n"); | ||
690 | target = calgary_reg(bbar, tar_offset(bus)); | ||
691 | writeq(tbl->tar_val, target); | ||
692 | |||
693 | /* 7. Re-read PMCR */ | ||
694 | printk(KERN_DEBUG "7a. Re-reading PMCR\n"); | ||
695 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
696 | val = be32_to_cpu(readl(target)); | ||
697 | printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); | ||
698 | |||
699 | /* 8. Remove HardStop */ | ||
700 | printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); | ||
701 | target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); | ||
702 | val = 0; | ||
703 | printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); | ||
704 | writel(cpu_to_be32(val), target); | ||
705 | val = be32_to_cpu(readl(target)); | ||
706 | printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); | ||
707 | } | ||
708 | |||
709 | static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, | ||
710 | u64 limit) | ||
711 | { | ||
712 | unsigned int numpages; | ||
713 | |||
714 | limit = limit | 0xfffff; | ||
715 | limit++; | ||
716 | |||
717 | numpages = ((limit - start) >> PAGE_SHIFT); | ||
718 | iommu_range_reserve(pci_iommu(dev->bus), start, numpages); | ||
719 | } | ||
720 | |||
721 | static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) | ||
722 | { | ||
723 | void __iomem *target; | ||
724 | u64 low, high, sizelow; | ||
725 | u64 start, limit; | ||
726 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
727 | unsigned char busnum = dev->bus->number; | ||
728 | void __iomem *bbar = tbl->bbar; | ||
729 | |||
730 | /* peripheral MEM_1 region */ | ||
731 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); | ||
732 | low = be32_to_cpu(readl(target)); | ||
733 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); | ||
734 | high = be32_to_cpu(readl(target)); | ||
735 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); | ||
736 | sizelow = be32_to_cpu(readl(target)); | ||
737 | |||
738 | start = (high << 32) | low; | ||
739 | limit = sizelow; | ||
740 | |||
741 | calgary_reserve_mem_region(dev, start, limit); | ||
742 | } | ||
743 | |||
744 | static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) | ||
745 | { | ||
746 | void __iomem *target; | ||
747 | u32 val32; | ||
748 | u64 low, high, sizelow, sizehigh; | ||
749 | u64 start, limit; | ||
750 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
751 | unsigned char busnum = dev->bus->number; | ||
752 | void __iomem *bbar = tbl->bbar; | ||
753 | |||
754 | /* is it enabled? */ | ||
755 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); | ||
756 | val32 = be32_to_cpu(readl(target)); | ||
757 | if (!(val32 & PHB_MEM2_ENABLE)) | ||
758 | return; | ||
759 | |||
760 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); | ||
761 | low = be32_to_cpu(readl(target)); | ||
762 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); | ||
763 | high = be32_to_cpu(readl(target)); | ||
764 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); | ||
765 | sizelow = be32_to_cpu(readl(target)); | ||
766 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); | ||
767 | sizehigh = be32_to_cpu(readl(target)); | ||
768 | |||
769 | start = (high << 32) | low; | ||
770 | limit = (sizehigh << 32) | sizelow; | ||
771 | |||
772 | calgary_reserve_mem_region(dev, start, limit); | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * some regions of the IO address space do not get translated, so we | ||
777 | * must not give devices IO addresses in those regions. The regions | ||
778 | * are the 640KB-1MB region and the two PCI peripheral memory holes. | ||
779 | * Reserve all of them in the IOMMU bitmap to avoid giving them out | ||
780 | * later. | ||
781 | */ | ||
782 | static void __init calgary_reserve_regions(struct pci_dev *dev) | ||
783 | { | ||
784 | unsigned int npages; | ||
785 | u64 start; | ||
786 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
787 | |||
788 | /* reserve EMERGENCY_PAGES from bad_dma_address and up */ | ||
789 | iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); | ||
790 | |||
791 | /* avoid the BIOS/VGA first 640KB-1MB region */ | ||
792 | /* for CalIOC2 - avoid the entire first MB */ | ||
793 | if (is_calgary(dev->device)) { | ||
794 | start = (640 * 1024); | ||
795 | npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; | ||
796 | } else { /* calioc2 */ | ||
797 | start = 0; | ||
798 | npages = (1 * 1024 * 1024) >> PAGE_SHIFT; | ||
799 | } | ||
800 | iommu_range_reserve(tbl, start, npages); | ||
801 | |||
802 | /* reserve the two PCI peripheral memory regions in IO space */ | ||
803 | calgary_reserve_peripheral_mem_1(dev); | ||
804 | calgary_reserve_peripheral_mem_2(dev); | ||
805 | } | ||
806 | |||
807 | static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) | ||
808 | { | ||
809 | u64 val64; | ||
810 | u64 table_phys; | ||
811 | void __iomem *target; | ||
812 | int ret; | ||
813 | struct iommu_table *tbl; | ||
814 | |||
815 | /* build TCE tables for each PHB */ | ||
816 | ret = build_tce_table(dev, bbar); | ||
817 | if (ret) | ||
818 | return ret; | ||
819 | |||
820 | tbl = pci_iommu(dev->bus); | ||
821 | tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; | ||
822 | tce_free(tbl, 0, tbl->it_size); | ||
823 | |||
824 | if (is_calgary(dev->device)) | ||
825 | tbl->chip_ops = &calgary_chip_ops; | ||
826 | else if (is_calioc2(dev->device)) | ||
827 | tbl->chip_ops = &calioc2_chip_ops; | ||
828 | else | ||
829 | BUG(); | ||
830 | |||
831 | calgary_reserve_regions(dev); | ||
832 | |||
833 | /* set TARs for each PHB */ | ||
834 | target = calgary_reg(bbar, tar_offset(dev->bus->number)); | ||
835 | val64 = be64_to_cpu(readq(target)); | ||
836 | |||
837 | /* zero out all TAR bits under sw control */ | ||
838 | val64 &= ~TAR_SW_BITS; | ||
839 | table_phys = (u64)__pa(tbl->it_base); | ||
840 | |||
841 | val64 |= table_phys; | ||
842 | |||
843 | BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); | ||
844 | val64 |= (u64) specified_table_size; | ||
845 | |||
846 | tbl->tar_val = cpu_to_be64(val64); | ||
847 | |||
848 | writeq(tbl->tar_val, target); | ||
849 | readq(target); /* flush */ | ||
850 | |||
851 | return 0; | ||
852 | } | ||
853 | |||
854 | static void __init calgary_free_bus(struct pci_dev *dev) | ||
855 | { | ||
856 | u64 val64; | ||
857 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
858 | void __iomem *target; | ||
859 | unsigned int bitmapsz; | ||
860 | |||
861 | target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); | ||
862 | val64 = be64_to_cpu(readq(target)); | ||
863 | val64 &= ~TAR_SW_BITS; | ||
864 | writeq(cpu_to_be64(val64), target); | ||
865 | readq(target); /* flush */ | ||
866 | |||
867 | bitmapsz = tbl->it_size / BITS_PER_BYTE; | ||
868 | free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); | ||
869 | tbl->it_map = NULL; | ||
870 | |||
871 | kfree(tbl); | ||
872 | |||
873 | set_pci_iommu(dev->bus, NULL); | ||
874 | |||
875 | /* Can't free bootmem allocated memory after system is up :-( */ | ||
876 | bus_info[dev->bus->number].tce_space = NULL; | ||
877 | } | ||
878 | |||
879 | static void calgary_dump_error_regs(struct iommu_table *tbl) | ||
880 | { | ||
881 | void __iomem *bbar = tbl->bbar; | ||
882 | void __iomem *target; | ||
883 | u32 csr, plssr; | ||
884 | |||
885 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); | ||
886 | csr = be32_to_cpu(readl(target)); | ||
887 | |||
888 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); | ||
889 | plssr = be32_to_cpu(readl(target)); | ||
890 | |||
891 | /* If no error, the agent ID in the CSR is not valid */ | ||
892 | printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " | ||
893 | "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); | ||
894 | } | ||
895 | |||
896 | static void calioc2_dump_error_regs(struct iommu_table *tbl) | ||
897 | { | ||
898 | void __iomem *bbar = tbl->bbar; | ||
899 | u32 csr, csmr, plssr, mck, rcstat; | ||
900 | void __iomem *target; | ||
901 | unsigned long phboff = phb_offset(tbl->it_busno); | ||
902 | unsigned long erroff; | ||
903 | u32 errregs[7]; | ||
904 | int i; | ||
905 | |||
906 | /* dump CSR */ | ||
907 | target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); | ||
908 | csr = be32_to_cpu(readl(target)); | ||
909 | /* dump PLSSR */ | ||
910 | target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); | ||
911 | plssr = be32_to_cpu(readl(target)); | ||
912 | /* dump CSMR */ | ||
913 | target = calgary_reg(bbar, phboff | 0x290); | ||
914 | csmr = be32_to_cpu(readl(target)); | ||
915 | /* dump mck */ | ||
916 | target = calgary_reg(bbar, phboff | 0x800); | ||
917 | mck = be32_to_cpu(readl(target)); | ||
918 | |||
919 | printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", | ||
920 | tbl->it_busno); | ||
921 | |||
922 | printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", | ||
923 | csr, plssr, csmr, mck); | ||
924 | |||
925 | /* dump rest of error regs */ | ||
926 | printk(KERN_EMERG "Calgary: "); | ||
927 | for (i = 0; i < ARRAY_SIZE(errregs); i++) { | ||
928 | /* err regs are at 0x810 - 0x870 */ | ||
929 | erroff = (0x810 + (i * 0x10)); | ||
930 | target = calgary_reg(bbar, phboff | erroff); | ||
931 | errregs[i] = be32_to_cpu(readl(target)); | ||
932 | printk("0x%08x@0x%lx ", errregs[i], erroff); | ||
933 | } | ||
934 | printk("\n"); | ||
935 | |||
936 | /* root complex status */ | ||
937 | target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); | ||
938 | rcstat = be32_to_cpu(readl(target)); | ||
939 | printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, | ||
940 | PHB_ROOT_COMPLEX_STATUS); | ||
941 | } | ||
942 | |||
943 | static void calgary_watchdog(unsigned long data) | ||
944 | { | ||
945 | struct pci_dev *dev = (struct pci_dev *)data; | ||
946 | struct iommu_table *tbl = pci_iommu(dev->bus); | ||
947 | void __iomem *bbar = tbl->bbar; | ||
948 | u32 val32; | ||
949 | void __iomem *target; | ||
950 | |||
951 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); | ||
952 | val32 = be32_to_cpu(readl(target)); | ||
953 | |||
954 | /* If no error, the agent ID in the CSR is not valid */ | ||
955 | if (val32 & CSR_AGENT_MASK) { | ||
956 | tbl->chip_ops->dump_error_regs(tbl); | ||
957 | |||
958 | /* reset error */ | ||
959 | writel(0, target); | ||
960 | |||
961 | /* Disable bus that caused the error */ | ||
962 | target = calgary_reg(bbar, phb_offset(tbl->it_busno) | | ||
963 | PHB_CONFIG_RW_OFFSET); | ||
964 | val32 = be32_to_cpu(readl(target)); | ||
965 | val32 |= PHB_SLOT_DISABLE; | ||
966 | writel(cpu_to_be32(val32), target); | ||
967 | readl(target); /* flush */ | ||
968 | } else { | ||
969 | /* Reset the timer */ | ||
970 | mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); | ||
971 | } | ||
972 | } | ||
973 | |||
974 | static void __init calgary_set_split_completion_timeout(void __iomem *bbar, | ||
975 | unsigned char busnum, unsigned long timeout) | ||
976 | { | ||
977 | u64 val64; | ||
978 | void __iomem *target; | ||
979 | unsigned int phb_shift = ~0; /* silence gcc */ | ||
980 | u64 mask; | ||
981 | |||
982 | switch (busno_to_phbid(busnum)) { | ||
983 | case 0: phb_shift = (63 - 19); | ||
984 | break; | ||
985 | case 1: phb_shift = (63 - 23); | ||
986 | break; | ||
987 | case 2: phb_shift = (63 - 27); | ||
988 | break; | ||
989 | case 3: phb_shift = (63 - 35); | ||
990 | break; | ||
991 | default: | ||
992 | BUG_ON(busno_to_phbid(busnum)); | ||
993 | } | ||
994 | |||
995 | target = calgary_reg(bbar, CALGARY_CONFIG_REG); | ||
996 | val64 = be64_to_cpu(readq(target)); | ||
997 | |||
998 | /* zero out this PHB's timer bits */ | ||
999 | mask = ~(0xFUL << phb_shift); | ||
1000 | val64 &= mask; | ||
1001 | val64 |= (timeout << phb_shift); | ||
1002 | writeq(cpu_to_be64(val64), target); | ||
1003 | readq(target); /* flush */ | ||
1004 | } | ||
1005 | |||
1006 | static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) | ||
1007 | { | ||
1008 | unsigned char busnum = dev->bus->number; | ||
1009 | void __iomem *bbar = tbl->bbar; | ||
1010 | void __iomem *target; | ||
1011 | u32 val; | ||
1012 | |||
1013 | /* | ||
1014 | * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 | ||
1015 | */ | ||
1016 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); | ||
1017 | val = cpu_to_be32(readl(target)); | ||
1018 | val |= 0x00800000; | ||
1019 | writel(cpu_to_be32(val), target); | ||
1020 | } | ||
1021 | |||
1022 | static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) | ||
1023 | { | ||
1024 | unsigned char busnum = dev->bus->number; | ||
1025 | |||
1026 | /* | ||
1027 | * Give split completion a longer timeout on bus 1 for aic94xx | ||
1028 | * http://bugzilla.kernel.org/show_bug.cgi?id=7180 | ||
1029 | */ | ||
1030 | if (is_calgary(dev->device) && (busnum == 1)) | ||
1031 | calgary_set_split_completion_timeout(tbl->bbar, busnum, | ||
1032 | CCR_2SEC_TIMEOUT); | ||
1033 | } | ||
1034 | |||
1035 | static void __init calgary_enable_translation(struct pci_dev *dev) | ||
1036 | { | ||
1037 | u32 val32; | ||
1038 | unsigned char busnum; | ||
1039 | void __iomem *target; | ||
1040 | void __iomem *bbar; | ||
1041 | struct iommu_table *tbl; | ||
1042 | |||
1043 | busnum = dev->bus->number; | ||
1044 | tbl = pci_iommu(dev->bus); | ||
1045 | bbar = tbl->bbar; | ||
1046 | |||
1047 | /* enable TCE in PHB Config Register */ | ||
1048 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); | ||
1049 | val32 = be32_to_cpu(readl(target)); | ||
1050 | val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; | ||
1051 | |||
1052 | printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", | ||
1053 | (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? | ||
1054 | "Calgary" : "CalIOC2", busnum); | ||
1055 | printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " | ||
1056 | "bus.\n"); | ||
1057 | |||
1058 | writel(cpu_to_be32(val32), target); | ||
1059 | readl(target); /* flush */ | ||
1060 | |||
1061 | init_timer(&tbl->watchdog_timer); | ||
1062 | tbl->watchdog_timer.function = &calgary_watchdog; | ||
1063 | tbl->watchdog_timer.data = (unsigned long)dev; | ||
1064 | mod_timer(&tbl->watchdog_timer, jiffies); | ||
1065 | } | ||
1066 | |||
1067 | static void __init calgary_disable_translation(struct pci_dev *dev) | ||
1068 | { | ||
1069 | u32 val32; | ||
1070 | unsigned char busnum; | ||
1071 | void __iomem *target; | ||
1072 | void __iomem *bbar; | ||
1073 | struct iommu_table *tbl; | ||
1074 | |||
1075 | busnum = dev->bus->number; | ||
1076 | tbl = pci_iommu(dev->bus); | ||
1077 | bbar = tbl->bbar; | ||
1078 | |||
1079 | /* disable TCE in PHB Config Register */ | ||
1080 | target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); | ||
1081 | val32 = be32_to_cpu(readl(target)); | ||
1082 | val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); | ||
1083 | |||
1084 | printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); | ||
1085 | writel(cpu_to_be32(val32), target); | ||
1086 | readl(target); /* flush */ | ||
1087 | |||
1088 | del_timer_sync(&tbl->watchdog_timer); | ||
1089 | } | ||
1090 | |||
1091 | static void __init calgary_init_one_nontraslated(struct pci_dev *dev) | ||
1092 | { | ||
1093 | pci_dev_get(dev); | ||
1094 | set_pci_iommu(dev->bus, NULL); | ||
1095 | |||
1096 | /* is the device behind a bridge? */ | ||
1097 | if (dev->bus->parent) | ||
1098 | dev->bus->parent->self = dev; | ||
1099 | else | ||
1100 | dev->bus->self = dev; | ||
1101 | } | ||
1102 | |||
1103 | static int __init calgary_init_one(struct pci_dev *dev) | ||
1104 | { | ||
1105 | void __iomem *bbar; | ||
1106 | struct iommu_table *tbl; | ||
1107 | int ret; | ||
1108 | |||
1109 | BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); | ||
1110 | |||
1111 | bbar = busno_to_bbar(dev->bus->number); | ||
1112 | ret = calgary_setup_tar(dev, bbar); | ||
1113 | if (ret) | ||
1114 | goto done; | ||
1115 | |||
1116 | pci_dev_get(dev); | ||
1117 | |||
1118 | if (dev->bus->parent) { | ||
1119 | if (dev->bus->parent->self) | ||
1120 | printk(KERN_WARNING "Calgary: IEEEE, dev %p has " | ||
1121 | "bus->parent->self!\n", dev); | ||
1122 | dev->bus->parent->self = dev; | ||
1123 | } else | ||
1124 | dev->bus->self = dev; | ||
1125 | |||
1126 | tbl = pci_iommu(dev->bus); | ||
1127 | tbl->chip_ops->handle_quirks(tbl, dev); | ||
1128 | |||
1129 | calgary_enable_translation(dev); | ||
1130 | |||
1131 | return 0; | ||
1132 | |||
1133 | done: | ||
1134 | return ret; | ||
1135 | } | ||
1136 | |||
1137 | static int __init calgary_locate_bbars(void) | ||
1138 | { | ||
1139 | int ret; | ||
1140 | int rioidx, phb, bus; | ||
1141 | void __iomem *bbar; | ||
1142 | void __iomem *target; | ||
1143 | unsigned long offset; | ||
1144 | u8 start_bus, end_bus; | ||
1145 | u32 val; | ||
1146 | |||
1147 | ret = -ENODATA; | ||
1148 | for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { | ||
1149 | struct rio_detail *rio = rio_devs[rioidx]; | ||
1150 | |||
1151 | if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) | ||
1152 | continue; | ||
1153 | |||
1154 | /* map entire 1MB of Calgary config space */ | ||
1155 | bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); | ||
1156 | if (!bbar) | ||
1157 | goto error; | ||
1158 | |||
1159 | for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { | ||
1160 | offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; | ||
1161 | target = calgary_reg(bbar, offset); | ||
1162 | |||
1163 | val = be32_to_cpu(readl(target)); | ||
1164 | |||
1165 | start_bus = (u8)((val & 0x00FF0000) >> 16); | ||
1166 | end_bus = (u8)((val & 0x0000FF00) >> 8); | ||
1167 | |||
1168 | if (end_bus) { | ||
1169 | for (bus = start_bus; bus <= end_bus; bus++) { | ||
1170 | bus_info[bus].bbar = bbar; | ||
1171 | bus_info[bus].phbid = phb; | ||
1172 | } | ||
1173 | } else { | ||
1174 | bus_info[start_bus].bbar = bbar; | ||
1175 | bus_info[start_bus].phbid = phb; | ||
1176 | } | ||
1177 | } | ||
1178 | } | ||
1179 | |||
1180 | return 0; | ||
1181 | |||
1182 | error: | ||
1183 | /* scan bus_info and iounmap any bbars we previously ioremap'd */ | ||
1184 | for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) | ||
1185 | if (bus_info[bus].bbar) | ||
1186 | iounmap(bus_info[bus].bbar); | ||
1187 | |||
1188 | return ret; | ||
1189 | } | ||
1190 | |||
1191 | static int __init calgary_init(void) | ||
1192 | { | ||
1193 | int ret; | ||
1194 | struct pci_dev *dev = NULL; | ||
1195 | void *tce_space; | ||
1196 | |||
1197 | ret = calgary_locate_bbars(); | ||
1198 | if (ret) | ||
1199 | return ret; | ||
1200 | |||
1201 | do { | ||
1202 | dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); | ||
1203 | if (!dev) | ||
1204 | break; | ||
1205 | if (!is_cal_pci_dev(dev->device)) | ||
1206 | continue; | ||
1207 | if (!translate_phb(dev)) { | ||
1208 | calgary_init_one_nontraslated(dev); | ||
1209 | continue; | ||
1210 | } | ||
1211 | tce_space = bus_info[dev->bus->number].tce_space; | ||
1212 | if (!tce_space && !translate_empty_slots) | ||
1213 | continue; | ||
1214 | |||
1215 | ret = calgary_init_one(dev); | ||
1216 | if (ret) | ||
1217 | goto error; | ||
1218 | } while (1); | ||
1219 | |||
1220 | return ret; | ||
1221 | |||
1222 | error: | ||
1223 | do { | ||
1224 | dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM, | ||
1225 | PCI_ANY_ID, dev); | ||
1226 | if (!dev) | ||
1227 | break; | ||
1228 | if (!is_cal_pci_dev(dev->device)) | ||
1229 | continue; | ||
1230 | if (!translate_phb(dev)) { | ||
1231 | pci_dev_put(dev); | ||
1232 | continue; | ||
1233 | } | ||
1234 | if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) | ||
1235 | continue; | ||
1236 | |||
1237 | calgary_disable_translation(dev); | ||
1238 | calgary_free_bus(dev); | ||
1239 | pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ | ||
1240 | } while (1); | ||
1241 | |||
1242 | return ret; | ||
1243 | } | ||
1244 | |||
1245 | static inline int __init determine_tce_table_size(u64 ram) | ||
1246 | { | ||
1247 | int ret; | ||
1248 | |||
1249 | if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) | ||
1250 | return specified_table_size; | ||
1251 | |||
1252 | /* | ||
1253 | * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to | ||
1254 | * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each | ||
1255 | * larger table size has twice as many entries, so shift the | ||
1256 | * max ram address by 13 to divide by 8K and then look at the | ||
1257 | * order of the result to choose between 0-7. | ||
1258 | */ | ||
1259 | ret = get_order(ram >> 13); | ||
1260 | if (ret > TCE_TABLE_SIZE_8M) | ||
1261 | ret = TCE_TABLE_SIZE_8M; | ||
1262 | |||
1263 | return ret; | ||
1264 | } | ||
1265 | |||
1266 | static int __init build_detail_arrays(void) | ||
1267 | { | ||
1268 | unsigned long ptr; | ||
1269 | int i, scal_detail_size, rio_detail_size; | ||
1270 | |||
1271 | if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ | ||
1272 | printk(KERN_WARNING | ||
1273 | "Calgary: MAX_NUMNODES too low! Defined as %d, " | ||
1274 | "but system has %d nodes.\n", | ||
1275 | MAX_NUMNODES, rio_table_hdr->num_scal_dev); | ||
1276 | return -ENODEV; | ||
1277 | } | ||
1278 | |||
1279 | switch (rio_table_hdr->version){ | ||
1280 | case 2: | ||
1281 | scal_detail_size = 11; | ||
1282 | rio_detail_size = 13; | ||
1283 | break; | ||
1284 | case 3: | ||
1285 | scal_detail_size = 12; | ||
1286 | rio_detail_size = 15; | ||
1287 | break; | ||
1288 | default: | ||
1289 | printk(KERN_WARNING | ||
1290 | "Calgary: Invalid Rio Grande Table Version: %d\n", | ||
1291 | rio_table_hdr->version); | ||
1292 | return -EPROTO; | ||
1293 | } | ||
1294 | |||
1295 | ptr = ((unsigned long)rio_table_hdr) + 3; | ||
1296 | for (i = 0; i < rio_table_hdr->num_scal_dev; | ||
1297 | i++, ptr += scal_detail_size) | ||
1298 | scal_devs[i] = (struct scal_detail *)ptr; | ||
1299 | |||
1300 | for (i = 0; i < rio_table_hdr->num_rio_dev; | ||
1301 | i++, ptr += rio_detail_size) | ||
1302 | rio_devs[i] = (struct rio_detail *)ptr; | ||
1303 | |||
1304 | return 0; | ||
1305 | } | ||
1306 | |||
1307 | static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) | ||
1308 | { | ||
1309 | int dev; | ||
1310 | u32 val; | ||
1311 | |||
1312 | if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { | ||
1313 | /* | ||
1314 | * FIXME: properly scan for devices accross the | ||
1315 | * PCI-to-PCI bridge on every CalIOC2 port. | ||
1316 | */ | ||
1317 | return 1; | ||
1318 | } | ||
1319 | |||
1320 | for (dev = 1; dev < 8; dev++) { | ||
1321 | val = read_pci_config(bus, dev, 0, 0); | ||
1322 | if (val != 0xffffffff) | ||
1323 | break; | ||
1324 | } | ||
1325 | return (val != 0xffffffff); | ||
1326 | } | ||
1327 | |||
1328 | void __init detect_calgary(void) | ||
1329 | { | ||
1330 | int bus; | ||
1331 | void *tbl; | ||
1332 | int calgary_found = 0; | ||
1333 | unsigned long ptr; | ||
1334 | unsigned int offset, prev_offset; | ||
1335 | int ret; | ||
1336 | |||
1337 | /* | ||
1338 | * if the user specified iommu=off or iommu=soft or we found | ||
1339 | * another HW IOMMU already, bail out. | ||
1340 | */ | ||
1341 | if (swiotlb || no_iommu || iommu_detected) | ||
1342 | return; | ||
1343 | |||
1344 | if (!use_calgary) | ||
1345 | return; | ||
1346 | |||
1347 | if (!early_pci_allowed()) | ||
1348 | return; | ||
1349 | |||
1350 | printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); | ||
1351 | |||
1352 | ptr = (unsigned long)phys_to_virt(get_bios_ebda()); | ||
1353 | |||
1354 | rio_table_hdr = NULL; | ||
1355 | prev_offset = 0; | ||
1356 | offset = 0x180; | ||
1357 | /* | ||
1358 | * The next offset is stored in the 1st word. | ||
1359 | * Only parse up until the offset increases: | ||
1360 | */ | ||
1361 | while (offset > prev_offset) { | ||
1362 | /* The block id is stored in the 2nd word */ | ||
1363 | if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ | ||
1364 | /* set the pointer past the offset & block id */ | ||
1365 | rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); | ||
1366 | break; | ||
1367 | } | ||
1368 | prev_offset = offset; | ||
1369 | offset = *((unsigned short *)(ptr + offset)); | ||
1370 | } | ||
1371 | if (!rio_table_hdr) { | ||
1372 | printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " | ||
1373 | "in EBDA - bailing!\n"); | ||
1374 | return; | ||
1375 | } | ||
1376 | |||
1377 | ret = build_detail_arrays(); | ||
1378 | if (ret) { | ||
1379 | printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); | ||
1380 | return; | ||
1381 | } | ||
1382 | |||
1383 | specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); | ||
1384 | |||
1385 | for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { | ||
1386 | struct calgary_bus_info *info = &bus_info[bus]; | ||
1387 | unsigned short pci_device; | ||
1388 | u32 val; | ||
1389 | |||
1390 | val = read_pci_config(bus, 0, 0, 0); | ||
1391 | pci_device = (val & 0xFFFF0000) >> 16; | ||
1392 | |||
1393 | if (!is_cal_pci_dev(pci_device)) | ||
1394 | continue; | ||
1395 | |||
1396 | if (info->translation_disabled) | ||
1397 | continue; | ||
1398 | |||
1399 | if (calgary_bus_has_devices(bus, pci_device) || | ||
1400 | translate_empty_slots) { | ||
1401 | tbl = alloc_tce_table(); | ||
1402 | if (!tbl) | ||
1403 | goto cleanup; | ||
1404 | info->tce_space = tbl; | ||
1405 | calgary_found = 1; | ||
1406 | } | ||
1407 | } | ||
1408 | |||
1409 | printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", | ||
1410 | calgary_found ? "found" : "not found"); | ||
1411 | |||
1412 | if (calgary_found) { | ||
1413 | iommu_detected = 1; | ||
1414 | calgary_detected = 1; | ||
1415 | printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); | ||
1416 | printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " | ||
1417 | "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, | ||
1418 | debugging ? "enabled" : "disabled"); | ||
1419 | } | ||
1420 | return; | ||
1421 | |||
1422 | cleanup: | ||
1423 | for (--bus; bus >= 0; --bus) { | ||
1424 | struct calgary_bus_info *info = &bus_info[bus]; | ||
1425 | |||
1426 | if (info->tce_space) | ||
1427 | free_tce_table(info->tce_space); | ||
1428 | } | ||
1429 | } | ||
1430 | |||
1431 | int __init calgary_iommu_init(void) | ||
1432 | { | ||
1433 | int ret; | ||
1434 | |||
1435 | if (no_iommu || swiotlb) | ||
1436 | return -ENODEV; | ||
1437 | |||
1438 | if (!calgary_detected) | ||
1439 | return -ENODEV; | ||
1440 | |||
1441 | /* ok, we're trying to use Calgary - let's roll */ | ||
1442 | printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); | ||
1443 | |||
1444 | ret = calgary_init(); | ||
1445 | if (ret) { | ||
1446 | printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " | ||
1447 | "falling back to no_iommu\n", ret); | ||
1448 | if (end_pfn > MAX_DMA32_PFN) | ||
1449 | printk(KERN_ERR "WARNING more than 4GB of memory, " | ||
1450 | "32bit PCI may malfunction.\n"); | ||
1451 | return ret; | ||
1452 | } | ||
1453 | |||
1454 | force_iommu = 1; | ||
1455 | bad_dma_address = 0x0; | ||
1456 | dma_ops = &calgary_dma_ops; | ||
1457 | |||
1458 | return 0; | ||
1459 | } | ||
1460 | |||
1461 | static int __init calgary_parse_options(char *p) | ||
1462 | { | ||
1463 | unsigned int bridge; | ||
1464 | size_t len; | ||
1465 | char* endp; | ||
1466 | |||
1467 | while (*p) { | ||
1468 | if (!strncmp(p, "64k", 3)) | ||
1469 | specified_table_size = TCE_TABLE_SIZE_64K; | ||
1470 | else if (!strncmp(p, "128k", 4)) | ||
1471 | specified_table_size = TCE_TABLE_SIZE_128K; | ||
1472 | else if (!strncmp(p, "256k", 4)) | ||
1473 | specified_table_size = TCE_TABLE_SIZE_256K; | ||
1474 | else if (!strncmp(p, "512k", 4)) | ||
1475 | specified_table_size = TCE_TABLE_SIZE_512K; | ||
1476 | else if (!strncmp(p, "1M", 2)) | ||
1477 | specified_table_size = TCE_TABLE_SIZE_1M; | ||
1478 | else if (!strncmp(p, "2M", 2)) | ||
1479 | specified_table_size = TCE_TABLE_SIZE_2M; | ||
1480 | else if (!strncmp(p, "4M", 2)) | ||
1481 | specified_table_size = TCE_TABLE_SIZE_4M; | ||
1482 | else if (!strncmp(p, "8M", 2)) | ||
1483 | specified_table_size = TCE_TABLE_SIZE_8M; | ||
1484 | |||
1485 | len = strlen("translate_empty_slots"); | ||
1486 | if (!strncmp(p, "translate_empty_slots", len)) | ||
1487 | translate_empty_slots = 1; | ||
1488 | |||
1489 | len = strlen("disable"); | ||
1490 | if (!strncmp(p, "disable", len)) { | ||
1491 | p += len; | ||
1492 | if (*p == '=') | ||
1493 | ++p; | ||
1494 | if (*p == '\0') | ||
1495 | break; | ||
1496 | bridge = simple_strtol(p, &endp, 0); | ||
1497 | if (p == endp) | ||
1498 | break; | ||
1499 | |||
1500 | if (bridge < MAX_PHB_BUS_NUM) { | ||
1501 | printk(KERN_INFO "Calgary: disabling " | ||
1502 | "translation for PHB %#x\n", bridge); | ||
1503 | bus_info[bridge].translation_disabled = 1; | ||
1504 | } | ||
1505 | } | ||
1506 | |||
1507 | p = strpbrk(p, ","); | ||
1508 | if (!p) | ||
1509 | break; | ||
1510 | |||
1511 | p++; /* skip ',' */ | ||
1512 | } | ||
1513 | return 1; | ||
1514 | } | ||
1515 | __setup("calgary=", calgary_parse_options); | ||
1516 | |||
1517 | static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) | ||
1518 | { | ||
1519 | struct iommu_table *tbl; | ||
1520 | unsigned int npages; | ||
1521 | int i; | ||
1522 | |||
1523 | tbl = pci_iommu(dev->bus); | ||
1524 | |||
1525 | for (i = 0; i < 4; i++) { | ||
1526 | struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; | ||
1527 | |||
1528 | /* Don't give out TCEs that map MEM resources */ | ||
1529 | if (!(r->flags & IORESOURCE_MEM)) | ||
1530 | continue; | ||
1531 | |||
1532 | /* 0-based? we reserve the whole 1st MB anyway */ | ||
1533 | if (!r->start) | ||
1534 | continue; | ||
1535 | |||
1536 | /* cover the whole region */ | ||
1537 | npages = (r->end - r->start) >> PAGE_SHIFT; | ||
1538 | npages++; | ||
1539 | |||
1540 | iommu_range_reserve(tbl, r->start, npages); | ||
1541 | } | ||
1542 | } | ||
1543 | |||
1544 | static int __init calgary_fixup_tce_spaces(void) | ||
1545 | { | ||
1546 | struct pci_dev *dev = NULL; | ||
1547 | void *tce_space; | ||
1548 | |||
1549 | if (no_iommu || swiotlb || !calgary_detected) | ||
1550 | return -ENODEV; | ||
1551 | |||
1552 | printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); | ||
1553 | |||
1554 | do { | ||
1555 | dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); | ||
1556 | if (!dev) | ||
1557 | break; | ||
1558 | if (!is_cal_pci_dev(dev->device)) | ||
1559 | continue; | ||
1560 | if (!translate_phb(dev)) | ||
1561 | continue; | ||
1562 | |||
1563 | tce_space = bus_info[dev->bus->number].tce_space; | ||
1564 | if (!tce_space) | ||
1565 | continue; | ||
1566 | |||
1567 | calgary_fixup_one_tce_space(dev); | ||
1568 | |||
1569 | } while (1); | ||
1570 | |||
1571 | return 0; | ||
1572 | } | ||
1573 | |||
1574 | /* | ||
1575 | * We need to be call after pcibios_assign_resources (fs_initcall level) | ||
1576 | * and before device_initcall. | ||
1577 | */ | ||
1578 | rootfs_initcall(calgary_fixup_tce_spaces); | ||
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c deleted file mode 100644 index 29711445c818..000000000000 --- a/arch/x86_64/kernel/pci-dma.c +++ /dev/null | |||
@@ -1,346 +0,0 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support. | ||
3 | */ | ||
4 | |||
5 | #include <linux/types.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/pci.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <asm/io.h> | ||
11 | #include <asm/iommu.h> | ||
12 | #include <asm/calgary.h> | ||
13 | |||
14 | int iommu_merge __read_mostly = 0; | ||
15 | EXPORT_SYMBOL(iommu_merge); | ||
16 | |||
17 | dma_addr_t bad_dma_address __read_mostly; | ||
18 | EXPORT_SYMBOL(bad_dma_address); | ||
19 | |||
20 | /* This tells the BIO block layer to assume merging. Default to off | ||
21 | because we cannot guarantee merging later. */ | ||
22 | int iommu_bio_merge __read_mostly = 0; | ||
23 | EXPORT_SYMBOL(iommu_bio_merge); | ||
24 | |||
25 | static int iommu_sac_force __read_mostly = 0; | ||
26 | |||
27 | int no_iommu __read_mostly; | ||
28 | #ifdef CONFIG_IOMMU_DEBUG | ||
29 | int panic_on_overflow __read_mostly = 1; | ||
30 | int force_iommu __read_mostly = 1; | ||
31 | #else | ||
32 | int panic_on_overflow __read_mostly = 0; | ||
33 | int force_iommu __read_mostly= 0; | ||
34 | #endif | ||
35 | |||
36 | /* Set this to 1 if there is a HW IOMMU in the system */ | ||
37 | int iommu_detected __read_mostly = 0; | ||
38 | |||
39 | /* Dummy device used for NULL arguments (normally ISA). Better would | ||
40 | be probably a smaller DMA mask, but this is bug-to-bug compatible | ||
41 | to i386. */ | ||
42 | struct device fallback_dev = { | ||
43 | .bus_id = "fallback device", | ||
44 | .coherent_dma_mask = DMA_32BIT_MASK, | ||
45 | .dma_mask = &fallback_dev.coherent_dma_mask, | ||
46 | }; | ||
47 | |||
48 | /* Allocate DMA memory on node near device */ | ||
49 | noinline static void * | ||
50 | dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) | ||
51 | { | ||
52 | struct page *page; | ||
53 | int node; | ||
54 | #ifdef CONFIG_PCI | ||
55 | if (dev->bus == &pci_bus_type) | ||
56 | node = pcibus_to_node(to_pci_dev(dev)->bus); | ||
57 | else | ||
58 | #endif | ||
59 | node = numa_node_id(); | ||
60 | |||
61 | if (node < first_node(node_online_map)) | ||
62 | node = first_node(node_online_map); | ||
63 | |||
64 | page = alloc_pages_node(node, gfp, order); | ||
65 | return page ? page_address(page) : NULL; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Allocate memory for a coherent mapping. | ||
70 | */ | ||
71 | void * | ||
72 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | ||
73 | gfp_t gfp) | ||
74 | { | ||
75 | void *memory; | ||
76 | unsigned long dma_mask = 0; | ||
77 | u64 bus; | ||
78 | |||
79 | if (!dev) | ||
80 | dev = &fallback_dev; | ||
81 | dma_mask = dev->coherent_dma_mask; | ||
82 | if (dma_mask == 0) | ||
83 | dma_mask = DMA_32BIT_MASK; | ||
84 | |||
85 | /* Device not DMA able */ | ||
86 | if (dev->dma_mask == NULL) | ||
87 | return NULL; | ||
88 | |||
89 | /* Don't invoke OOM killer */ | ||
90 | gfp |= __GFP_NORETRY; | ||
91 | |||
92 | /* Kludge to make it bug-to-bug compatible with i386. i386 | ||
93 | uses the normal dma_mask for alloc_coherent. */ | ||
94 | dma_mask &= *dev->dma_mask; | ||
95 | |||
96 | /* Why <=? Even when the mask is smaller than 4GB it is often | ||
97 | larger than 16MB and in this case we have a chance of | ||
98 | finding fitting memory in the next higher zone first. If | ||
99 | not retry with true GFP_DMA. -AK */ | ||
100 | if (dma_mask <= DMA_32BIT_MASK) | ||
101 | gfp |= GFP_DMA32; | ||
102 | |||
103 | again: | ||
104 | memory = dma_alloc_pages(dev, gfp, get_order(size)); | ||
105 | if (memory == NULL) | ||
106 | return NULL; | ||
107 | |||
108 | { | ||
109 | int high, mmu; | ||
110 | bus = virt_to_bus(memory); | ||
111 | high = (bus + size) >= dma_mask; | ||
112 | mmu = high; | ||
113 | if (force_iommu && !(gfp & GFP_DMA)) | ||
114 | mmu = 1; | ||
115 | else if (high) { | ||
116 | free_pages((unsigned long)memory, | ||
117 | get_order(size)); | ||
118 | |||
119 | /* Don't use the 16MB ZONE_DMA unless absolutely | ||
120 | needed. It's better to use remapping first. */ | ||
121 | if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { | ||
122 | gfp = (gfp & ~GFP_DMA32) | GFP_DMA; | ||
123 | goto again; | ||
124 | } | ||
125 | |||
126 | /* Let low level make its own zone decisions */ | ||
127 | gfp &= ~(GFP_DMA32|GFP_DMA); | ||
128 | |||
129 | if (dma_ops->alloc_coherent) | ||
130 | return dma_ops->alloc_coherent(dev, size, | ||
131 | dma_handle, gfp); | ||
132 | return NULL; | ||
133 | } | ||
134 | |||
135 | memset(memory, 0, size); | ||
136 | if (!mmu) { | ||
137 | *dma_handle = virt_to_bus(memory); | ||
138 | return memory; | ||
139 | } | ||
140 | } | ||
141 | |||
142 | if (dma_ops->alloc_coherent) { | ||
143 | free_pages((unsigned long)memory, get_order(size)); | ||
144 | gfp &= ~(GFP_DMA|GFP_DMA32); | ||
145 | return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); | ||
146 | } | ||
147 | |||
148 | if (dma_ops->map_simple) { | ||
149 | *dma_handle = dma_ops->map_simple(dev, memory, | ||
150 | size, | ||
151 | PCI_DMA_BIDIRECTIONAL); | ||
152 | if (*dma_handle != bad_dma_address) | ||
153 | return memory; | ||
154 | } | ||
155 | |||
156 | if (panic_on_overflow) | ||
157 | panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size); | ||
158 | free_pages((unsigned long)memory, get_order(size)); | ||
159 | return NULL; | ||
160 | } | ||
161 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
162 | |||
163 | /* | ||
164 | * Unmap coherent memory. | ||
165 | * The caller must ensure that the device has finished accessing the mapping. | ||
166 | */ | ||
167 | void dma_free_coherent(struct device *dev, size_t size, | ||
168 | void *vaddr, dma_addr_t bus) | ||
169 | { | ||
170 | if (dma_ops->unmap_single) | ||
171 | dma_ops->unmap_single(dev, bus, size, 0); | ||
172 | free_pages((unsigned long)vaddr, get_order(size)); | ||
173 | } | ||
174 | EXPORT_SYMBOL(dma_free_coherent); | ||
175 | |||
176 | static int forbid_dac __read_mostly; | ||
177 | |||
178 | int dma_supported(struct device *dev, u64 mask) | ||
179 | { | ||
180 | #ifdef CONFIG_PCI | ||
181 | if (mask > 0xffffffff && forbid_dac > 0) { | ||
182 | |||
183 | |||
184 | |||
185 | printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id); | ||
186 | return 0; | ||
187 | } | ||
188 | #endif | ||
189 | |||
190 | if (dma_ops->dma_supported) | ||
191 | return dma_ops->dma_supported(dev, mask); | ||
192 | |||
193 | /* Copied from i386. Doesn't make much sense, because it will | ||
194 | only work for pci_alloc_coherent. | ||
195 | The caller just has to use GFP_DMA in this case. */ | ||
196 | if (mask < DMA_24BIT_MASK) | ||
197 | return 0; | ||
198 | |||
199 | /* Tell the device to use SAC when IOMMU force is on. This | ||
200 | allows the driver to use cheaper accesses in some cases. | ||
201 | |||
202 | Problem with this is that if we overflow the IOMMU area and | ||
203 | return DAC as fallback address the device may not handle it | ||
204 | correctly. | ||
205 | |||
206 | As a special case some controllers have a 39bit address | ||
207 | mode that is as efficient as 32bit (aic79xx). Don't force | ||
208 | SAC for these. Assume all masks <= 40 bits are of this | ||
209 | type. Normally this doesn't make any difference, but gives | ||
210 | more gentle handling of IOMMU overflow. */ | ||
211 | if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { | ||
212 | printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | return 1; | ||
217 | } | ||
218 | EXPORT_SYMBOL(dma_supported); | ||
219 | |||
220 | int dma_set_mask(struct device *dev, u64 mask) | ||
221 | { | ||
222 | if (!dev->dma_mask || !dma_supported(dev, mask)) | ||
223 | return -EIO; | ||
224 | *dev->dma_mask = mask; | ||
225 | return 0; | ||
226 | } | ||
227 | EXPORT_SYMBOL(dma_set_mask); | ||
228 | |||
229 | /* | ||
230 | * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter | ||
231 | * documentation. | ||
232 | */ | ||
233 | __init int iommu_setup(char *p) | ||
234 | { | ||
235 | iommu_merge = 1; | ||
236 | |||
237 | if (!p) | ||
238 | return -EINVAL; | ||
239 | |||
240 | while (*p) { | ||
241 | if (!strncmp(p,"off",3)) | ||
242 | no_iommu = 1; | ||
243 | /* gart_parse_options has more force support */ | ||
244 | if (!strncmp(p,"force",5)) | ||
245 | force_iommu = 1; | ||
246 | if (!strncmp(p,"noforce",7)) { | ||
247 | iommu_merge = 0; | ||
248 | force_iommu = 0; | ||
249 | } | ||
250 | |||
251 | if (!strncmp(p, "biomerge",8)) { | ||
252 | iommu_bio_merge = 4096; | ||
253 | iommu_merge = 1; | ||
254 | force_iommu = 1; | ||
255 | } | ||
256 | if (!strncmp(p, "panic",5)) | ||
257 | panic_on_overflow = 1; | ||
258 | if (!strncmp(p, "nopanic",7)) | ||
259 | panic_on_overflow = 0; | ||
260 | if (!strncmp(p, "merge",5)) { | ||
261 | iommu_merge = 1; | ||
262 | force_iommu = 1; | ||
263 | } | ||
264 | if (!strncmp(p, "nomerge",7)) | ||
265 | iommu_merge = 0; | ||
266 | if (!strncmp(p, "forcesac",8)) | ||
267 | iommu_sac_force = 1; | ||
268 | if (!strncmp(p, "allowdac", 8)) | ||
269 | forbid_dac = 0; | ||
270 | if (!strncmp(p, "nodac", 5)) | ||
271 | forbid_dac = -1; | ||
272 | |||
273 | #ifdef CONFIG_SWIOTLB | ||
274 | if (!strncmp(p, "soft",4)) | ||
275 | swiotlb = 1; | ||
276 | #endif | ||
277 | |||
278 | #ifdef CONFIG_IOMMU | ||
279 | gart_parse_options(p); | ||
280 | #endif | ||
281 | |||
282 | #ifdef CONFIG_CALGARY_IOMMU | ||
283 | if (!strncmp(p, "calgary", 7)) | ||
284 | use_calgary = 1; | ||
285 | #endif /* CONFIG_CALGARY_IOMMU */ | ||
286 | |||
287 | p += strcspn(p, ","); | ||
288 | if (*p == ',') | ||
289 | ++p; | ||
290 | } | ||
291 | return 0; | ||
292 | } | ||
293 | early_param("iommu", iommu_setup); | ||
294 | |||
295 | void __init pci_iommu_alloc(void) | ||
296 | { | ||
297 | /* | ||
298 | * The order of these functions is important for | ||
299 | * fall-back/fail-over reasons | ||
300 | */ | ||
301 | #ifdef CONFIG_IOMMU | ||
302 | iommu_hole_init(); | ||
303 | #endif | ||
304 | |||
305 | #ifdef CONFIG_CALGARY_IOMMU | ||
306 | detect_calgary(); | ||
307 | #endif | ||
308 | |||
309 | #ifdef CONFIG_SWIOTLB | ||
310 | pci_swiotlb_init(); | ||
311 | #endif | ||
312 | } | ||
313 | |||
314 | static int __init pci_iommu_init(void) | ||
315 | { | ||
316 | #ifdef CONFIG_CALGARY_IOMMU | ||
317 | calgary_iommu_init(); | ||
318 | #endif | ||
319 | |||
320 | #ifdef CONFIG_IOMMU | ||
321 | gart_iommu_init(); | ||
322 | #endif | ||
323 | |||
324 | no_iommu_init(); | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | void pci_iommu_shutdown(void) | ||
329 | { | ||
330 | gart_iommu_shutdown(); | ||
331 | } | ||
332 | |||
333 | #ifdef CONFIG_PCI | ||
334 | /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ | ||
335 | |||
336 | static __devinit void via_no_dac(struct pci_dev *dev) | ||
337 | { | ||
338 | if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { | ||
339 | printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); | ||
340 | forbid_dac = 1; | ||
341 | } | ||
342 | } | ||
343 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); | ||
344 | #endif | ||
345 | /* Must execute after PCI subsystem */ | ||
346 | fs_initcall(pci_iommu_init); | ||
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c deleted file mode 100644 index 4918c575d582..000000000000 --- a/arch/x86_64/kernel/pci-gart.c +++ /dev/null | |||
@@ -1,740 +0,0 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support for AMD Hammer. | ||
3 | * | ||
4 | * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. | ||
5 | * This allows to use PCI devices that only support 32bit addresses on systems | ||
6 | * with more than 4GB. | ||
7 | * | ||
8 | * See Documentation/DMA-mapping.txt for the interface specification. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | */ | ||
12 | |||
13 | #include <linux/types.h> | ||
14 | #include <linux/ctype.h> | ||
15 | #include <linux/agp_backend.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/spinlock.h> | ||
20 | #include <linux/pci.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/topology.h> | ||
23 | #include <linux/interrupt.h> | ||
24 | #include <linux/bitops.h> | ||
25 | #include <linux/kdebug.h> | ||
26 | #include <asm/atomic.h> | ||
27 | #include <asm/io.h> | ||
28 | #include <asm/mtrr.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/iommu.h> | ||
32 | #include <asm/cacheflush.h> | ||
33 | #include <asm/swiotlb.h> | ||
34 | #include <asm/dma.h> | ||
35 | #include <asm/k8.h> | ||
36 | |||
37 | unsigned long iommu_bus_base; /* GART remapping area (physical) */ | ||
38 | static unsigned long iommu_size; /* size of remapping area bytes */ | ||
39 | static unsigned long iommu_pages; /* .. and in pages */ | ||
40 | |||
41 | u32 *iommu_gatt_base; /* Remapping table */ | ||
42 | |||
43 | /* If this is disabled the IOMMU will use an optimized flushing strategy | ||
44 | of only flushing when an mapping is reused. With it true the GART is flushed | ||
45 | for every mapping. Problem is that doing the lazy flush seems to trigger | ||
46 | bugs with some popular PCI cards, in particular 3ware (but has been also | ||
47 | also seen with Qlogic at least). */ | ||
48 | int iommu_fullflush = 1; | ||
49 | |||
50 | /* Allocation bitmap for the remapping area */ | ||
51 | static DEFINE_SPINLOCK(iommu_bitmap_lock); | ||
52 | static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ | ||
53 | |||
54 | static u32 gart_unmapped_entry; | ||
55 | |||
56 | #define GPTE_VALID 1 | ||
57 | #define GPTE_COHERENT 2 | ||
58 | #define GPTE_ENCODE(x) \ | ||
59 | (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) | ||
60 | #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) | ||
61 | |||
62 | #define to_pages(addr,size) \ | ||
63 | (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) | ||
64 | |||
65 | #define EMERGENCY_PAGES 32 /* = 128KB */ | ||
66 | |||
67 | #ifdef CONFIG_AGP | ||
68 | #define AGPEXTERN extern | ||
69 | #else | ||
70 | #define AGPEXTERN | ||
71 | #endif | ||
72 | |||
73 | /* backdoor interface to AGP driver */ | ||
74 | AGPEXTERN int agp_memory_reserved; | ||
75 | AGPEXTERN __u32 *agp_gatt_table; | ||
76 | |||
77 | static unsigned long next_bit; /* protected by iommu_bitmap_lock */ | ||
78 | static int need_flush; /* global flush state. set for each gart wrap */ | ||
79 | |||
80 | static unsigned long alloc_iommu(int size) | ||
81 | { | ||
82 | unsigned long offset, flags; | ||
83 | |||
84 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
85 | offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); | ||
86 | if (offset == -1) { | ||
87 | need_flush = 1; | ||
88 | offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); | ||
89 | } | ||
90 | if (offset != -1) { | ||
91 | set_bit_string(iommu_gart_bitmap, offset, size); | ||
92 | next_bit = offset+size; | ||
93 | if (next_bit >= iommu_pages) { | ||
94 | next_bit = 0; | ||
95 | need_flush = 1; | ||
96 | } | ||
97 | } | ||
98 | if (iommu_fullflush) | ||
99 | need_flush = 1; | ||
100 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
101 | return offset; | ||
102 | } | ||
103 | |||
104 | static void free_iommu(unsigned long offset, int size) | ||
105 | { | ||
106 | unsigned long flags; | ||
107 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
108 | __clear_bit_string(iommu_gart_bitmap, offset, size); | ||
109 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Use global flush state to avoid races with multiple flushers. | ||
114 | */ | ||
115 | static void flush_gart(void) | ||
116 | { | ||
117 | unsigned long flags; | ||
118 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
119 | if (need_flush) { | ||
120 | k8_flush_garts(); | ||
121 | need_flush = 0; | ||
122 | } | ||
123 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
124 | } | ||
125 | |||
126 | #ifdef CONFIG_IOMMU_LEAK | ||
127 | |||
128 | #define SET_LEAK(x) if (iommu_leak_tab) \ | ||
129 | iommu_leak_tab[x] = __builtin_return_address(0); | ||
130 | #define CLEAR_LEAK(x) if (iommu_leak_tab) \ | ||
131 | iommu_leak_tab[x] = NULL; | ||
132 | |||
133 | /* Debugging aid for drivers that don't free their IOMMU tables */ | ||
134 | static void **iommu_leak_tab; | ||
135 | static int leak_trace; | ||
136 | int iommu_leak_pages = 20; | ||
137 | void dump_leak(void) | ||
138 | { | ||
139 | int i; | ||
140 | static int dump; | ||
141 | if (dump || !iommu_leak_tab) return; | ||
142 | dump = 1; | ||
143 | show_stack(NULL,NULL); | ||
144 | /* Very crude. dump some from the end of the table too */ | ||
145 | printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); | ||
146 | for (i = 0; i < iommu_leak_pages; i+=2) { | ||
147 | printk("%lu: ", iommu_pages-i); | ||
148 | printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); | ||
149 | printk("%c", (i+1)%2 == 0 ? '\n' : ' '); | ||
150 | } | ||
151 | printk("\n"); | ||
152 | } | ||
153 | #else | ||
154 | #define SET_LEAK(x) | ||
155 | #define CLEAR_LEAK(x) | ||
156 | #endif | ||
157 | |||
158 | static void iommu_full(struct device *dev, size_t size, int dir) | ||
159 | { | ||
160 | /* | ||
161 | * Ran out of IOMMU space for this operation. This is very bad. | ||
162 | * Unfortunately the drivers cannot handle this operation properly. | ||
163 | * Return some non mapped prereserved space in the aperture and | ||
164 | * let the Northbridge deal with it. This will result in garbage | ||
165 | * in the IO operation. When the size exceeds the prereserved space | ||
166 | * memory corruption will occur or random memory will be DMAed | ||
167 | * out. Hopefully no network devices use single mappings that big. | ||
168 | */ | ||
169 | |||
170 | printk(KERN_ERR | ||
171 | "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", | ||
172 | size, dev->bus_id); | ||
173 | |||
174 | if (size > PAGE_SIZE*EMERGENCY_PAGES) { | ||
175 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
176 | panic("PCI-DMA: Memory would be corrupted\n"); | ||
177 | if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
178 | panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); | ||
179 | } | ||
180 | |||
181 | #ifdef CONFIG_IOMMU_LEAK | ||
182 | dump_leak(); | ||
183 | #endif | ||
184 | } | ||
185 | |||
186 | static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) | ||
187 | { | ||
188 | u64 mask = *dev->dma_mask; | ||
189 | int high = addr + size > mask; | ||
190 | int mmu = high; | ||
191 | if (force_iommu) | ||
192 | mmu = 1; | ||
193 | return mmu; | ||
194 | } | ||
195 | |||
196 | static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | ||
197 | { | ||
198 | u64 mask = *dev->dma_mask; | ||
199 | int high = addr + size > mask; | ||
200 | int mmu = high; | ||
201 | return mmu; | ||
202 | } | ||
203 | |||
204 | /* Map a single continuous physical area into the IOMMU. | ||
205 | * Caller needs to check if the iommu is needed and flush. | ||
206 | */ | ||
207 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | ||
208 | size_t size, int dir) | ||
209 | { | ||
210 | unsigned long npages = to_pages(phys_mem, size); | ||
211 | unsigned long iommu_page = alloc_iommu(npages); | ||
212 | int i; | ||
213 | if (iommu_page == -1) { | ||
214 | if (!nonforced_iommu(dev, phys_mem, size)) | ||
215 | return phys_mem; | ||
216 | if (panic_on_overflow) | ||
217 | panic("dma_map_area overflow %lu bytes\n", size); | ||
218 | iommu_full(dev, size, dir); | ||
219 | return bad_dma_address; | ||
220 | } | ||
221 | |||
222 | for (i = 0; i < npages; i++) { | ||
223 | iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); | ||
224 | SET_LEAK(iommu_page + i); | ||
225 | phys_mem += PAGE_SIZE; | ||
226 | } | ||
227 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); | ||
228 | } | ||
229 | |||
230 | static dma_addr_t gart_map_simple(struct device *dev, char *buf, | ||
231 | size_t size, int dir) | ||
232 | { | ||
233 | dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); | ||
234 | flush_gart(); | ||
235 | return map; | ||
236 | } | ||
237 | |||
238 | /* Map a single area into the IOMMU */ | ||
239 | static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) | ||
240 | { | ||
241 | unsigned long phys_mem, bus; | ||
242 | |||
243 | if (!dev) | ||
244 | dev = &fallback_dev; | ||
245 | |||
246 | phys_mem = virt_to_phys(addr); | ||
247 | if (!need_iommu(dev, phys_mem, size)) | ||
248 | return phys_mem; | ||
249 | |||
250 | bus = gart_map_simple(dev, addr, size, dir); | ||
251 | return bus; | ||
252 | } | ||
253 | |||
254 | /* | ||
255 | * Free a DMA mapping. | ||
256 | */ | ||
257 | static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, | ||
258 | size_t size, int direction) | ||
259 | { | ||
260 | unsigned long iommu_page; | ||
261 | int npages; | ||
262 | int i; | ||
263 | |||
264 | if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || | ||
265 | dma_addr >= iommu_bus_base + iommu_size) | ||
266 | return; | ||
267 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | ||
268 | npages = to_pages(dma_addr, size); | ||
269 | for (i = 0; i < npages; i++) { | ||
270 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | ||
271 | CLEAR_LEAK(iommu_page + i); | ||
272 | } | ||
273 | free_iommu(iommu_page, npages); | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Wrapper for pci_unmap_single working with scatterlists. | ||
278 | */ | ||
279 | static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | ||
280 | { | ||
281 | int i; | ||
282 | |||
283 | for (i = 0; i < nents; i++) { | ||
284 | struct scatterlist *s = &sg[i]; | ||
285 | if (!s->dma_length || !s->length) | ||
286 | break; | ||
287 | gart_unmap_single(dev, s->dma_address, s->dma_length, dir); | ||
288 | } | ||
289 | } | ||
290 | |||
291 | /* Fallback for dma_map_sg in case of overflow */ | ||
292 | static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, | ||
293 | int nents, int dir) | ||
294 | { | ||
295 | int i; | ||
296 | |||
297 | #ifdef CONFIG_IOMMU_DEBUG | ||
298 | printk(KERN_DEBUG "dma_map_sg overflow\n"); | ||
299 | #endif | ||
300 | |||
301 | for (i = 0; i < nents; i++ ) { | ||
302 | struct scatterlist *s = &sg[i]; | ||
303 | unsigned long addr = page_to_phys(s->page) + s->offset; | ||
304 | if (nonforced_iommu(dev, addr, s->length)) { | ||
305 | addr = dma_map_area(dev, addr, s->length, dir); | ||
306 | if (addr == bad_dma_address) { | ||
307 | if (i > 0) | ||
308 | gart_unmap_sg(dev, sg, i, dir); | ||
309 | nents = 0; | ||
310 | sg[0].dma_length = 0; | ||
311 | break; | ||
312 | } | ||
313 | } | ||
314 | s->dma_address = addr; | ||
315 | s->dma_length = s->length; | ||
316 | } | ||
317 | flush_gart(); | ||
318 | return nents; | ||
319 | } | ||
320 | |||
321 | /* Map multiple scatterlist entries continuous into the first. */ | ||
322 | static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
323 | struct scatterlist *sout, unsigned long pages) | ||
324 | { | ||
325 | unsigned long iommu_start = alloc_iommu(pages); | ||
326 | unsigned long iommu_page = iommu_start; | ||
327 | int i; | ||
328 | |||
329 | if (iommu_start == -1) | ||
330 | return -1; | ||
331 | |||
332 | for (i = start; i < stopat; i++) { | ||
333 | struct scatterlist *s = &sg[i]; | ||
334 | unsigned long pages, addr; | ||
335 | unsigned long phys_addr = s->dma_address; | ||
336 | |||
337 | BUG_ON(i > start && s->offset); | ||
338 | if (i == start) { | ||
339 | *sout = *s; | ||
340 | sout->dma_address = iommu_bus_base; | ||
341 | sout->dma_address += iommu_page*PAGE_SIZE + s->offset; | ||
342 | sout->dma_length = s->length; | ||
343 | } else { | ||
344 | sout->dma_length += s->length; | ||
345 | } | ||
346 | |||
347 | addr = phys_addr; | ||
348 | pages = to_pages(s->offset, s->length); | ||
349 | while (pages--) { | ||
350 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | ||
351 | SET_LEAK(iommu_page); | ||
352 | addr += PAGE_SIZE; | ||
353 | iommu_page++; | ||
354 | } | ||
355 | } | ||
356 | BUG_ON(iommu_page - iommu_start != pages); | ||
357 | return 0; | ||
358 | } | ||
359 | |||
360 | static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
361 | struct scatterlist *sout, | ||
362 | unsigned long pages, int need) | ||
363 | { | ||
364 | if (!need) { | ||
365 | BUG_ON(stopat - start != 1); | ||
366 | *sout = sg[start]; | ||
367 | sout->dma_length = sg[start].length; | ||
368 | return 0; | ||
369 | } | ||
370 | return __dma_map_cont(sg, start, stopat, sout, pages); | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * DMA map all entries in a scatterlist. | ||
375 | * Merge chunks that have page aligned sizes into a continuous mapping. | ||
376 | */ | ||
377 | int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | ||
378 | { | ||
379 | int i; | ||
380 | int out; | ||
381 | int start; | ||
382 | unsigned long pages = 0; | ||
383 | int need = 0, nextneed; | ||
384 | |||
385 | if (nents == 0) | ||
386 | return 0; | ||
387 | |||
388 | if (!dev) | ||
389 | dev = &fallback_dev; | ||
390 | |||
391 | out = 0; | ||
392 | start = 0; | ||
393 | for (i = 0; i < nents; i++) { | ||
394 | struct scatterlist *s = &sg[i]; | ||
395 | dma_addr_t addr = page_to_phys(s->page) + s->offset; | ||
396 | s->dma_address = addr; | ||
397 | BUG_ON(s->length == 0); | ||
398 | |||
399 | nextneed = need_iommu(dev, addr, s->length); | ||
400 | |||
401 | /* Handle the previous not yet processed entries */ | ||
402 | if (i > start) { | ||
403 | struct scatterlist *ps = &sg[i-1]; | ||
404 | /* Can only merge when the last chunk ends on a page | ||
405 | boundary and the new one doesn't have an offset. */ | ||
406 | if (!iommu_merge || !nextneed || !need || s->offset || | ||
407 | (ps->offset + ps->length) % PAGE_SIZE) { | ||
408 | if (dma_map_cont(sg, start, i, sg+out, pages, | ||
409 | need) < 0) | ||
410 | goto error; | ||
411 | out++; | ||
412 | pages = 0; | ||
413 | start = i; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | need = nextneed; | ||
418 | pages += to_pages(s->offset, s->length); | ||
419 | } | ||
420 | if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) | ||
421 | goto error; | ||
422 | out++; | ||
423 | flush_gart(); | ||
424 | if (out < nents) | ||
425 | sg[out].dma_length = 0; | ||
426 | return out; | ||
427 | |||
428 | error: | ||
429 | flush_gart(); | ||
430 | gart_unmap_sg(dev, sg, nents, dir); | ||
431 | /* When it was forced or merged try again in a dumb way */ | ||
432 | if (force_iommu || iommu_merge) { | ||
433 | out = dma_map_sg_nonforce(dev, sg, nents, dir); | ||
434 | if (out > 0) | ||
435 | return out; | ||
436 | } | ||
437 | if (panic_on_overflow) | ||
438 | panic("dma_map_sg: overflow on %lu pages\n", pages); | ||
439 | iommu_full(dev, pages << PAGE_SHIFT, dir); | ||
440 | for (i = 0; i < nents; i++) | ||
441 | sg[i].dma_address = bad_dma_address; | ||
442 | return 0; | ||
443 | } | ||
444 | |||
445 | static int no_agp; | ||
446 | |||
447 | static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) | ||
448 | { | ||
449 | unsigned long a; | ||
450 | if (!iommu_size) { | ||
451 | iommu_size = aper_size; | ||
452 | if (!no_agp) | ||
453 | iommu_size /= 2; | ||
454 | } | ||
455 | |||
456 | a = aper + iommu_size; | ||
457 | iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; | ||
458 | |||
459 | if (iommu_size < 64*1024*1024) | ||
460 | printk(KERN_WARNING | ||
461 | "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); | ||
462 | |||
463 | return iommu_size; | ||
464 | } | ||
465 | |||
466 | static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) | ||
467 | { | ||
468 | unsigned aper_size = 0, aper_base_32; | ||
469 | u64 aper_base; | ||
470 | unsigned aper_order; | ||
471 | |||
472 | pci_read_config_dword(dev, 0x94, &aper_base_32); | ||
473 | pci_read_config_dword(dev, 0x90, &aper_order); | ||
474 | aper_order = (aper_order >> 1) & 7; | ||
475 | |||
476 | aper_base = aper_base_32 & 0x7fff; | ||
477 | aper_base <<= 25; | ||
478 | |||
479 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
480 | if (aper_base + aper_size > 0x100000000UL || !aper_size) | ||
481 | aper_base = 0; | ||
482 | |||
483 | *size = aper_size; | ||
484 | return aper_base; | ||
485 | } | ||
486 | |||
487 | /* | ||
488 | * Private Northbridge GATT initialization in case we cannot use the | ||
489 | * AGP driver for some reason. | ||
490 | */ | ||
491 | static __init int init_k8_gatt(struct agp_kern_info *info) | ||
492 | { | ||
493 | struct pci_dev *dev; | ||
494 | void *gatt; | ||
495 | unsigned aper_base, new_aper_base; | ||
496 | unsigned aper_size, gatt_size, new_aper_size; | ||
497 | int i; | ||
498 | |||
499 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); | ||
500 | aper_size = aper_base = info->aper_size = 0; | ||
501 | dev = NULL; | ||
502 | for (i = 0; i < num_k8_northbridges; i++) { | ||
503 | dev = k8_northbridges[i]; | ||
504 | new_aper_base = read_aperture(dev, &new_aper_size); | ||
505 | if (!new_aper_base) | ||
506 | goto nommu; | ||
507 | |||
508 | if (!aper_base) { | ||
509 | aper_size = new_aper_size; | ||
510 | aper_base = new_aper_base; | ||
511 | } | ||
512 | if (aper_size != new_aper_size || aper_base != new_aper_base) | ||
513 | goto nommu; | ||
514 | } | ||
515 | if (!aper_base) | ||
516 | goto nommu; | ||
517 | info->aper_base = aper_base; | ||
518 | info->aper_size = aper_size>>20; | ||
519 | |||
520 | gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); | ||
521 | gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); | ||
522 | if (!gatt) | ||
523 | panic("Cannot allocate GATT table"); | ||
524 | if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) | ||
525 | panic("Could not set GART PTEs to uncacheable pages"); | ||
526 | global_flush_tlb(); | ||
527 | |||
528 | memset(gatt, 0, gatt_size); | ||
529 | agp_gatt_table = gatt; | ||
530 | |||
531 | for (i = 0; i < num_k8_northbridges; i++) { | ||
532 | u32 ctl; | ||
533 | u32 gatt_reg; | ||
534 | |||
535 | dev = k8_northbridges[i]; | ||
536 | gatt_reg = __pa(gatt) >> 12; | ||
537 | gatt_reg <<= 4; | ||
538 | pci_write_config_dword(dev, 0x98, gatt_reg); | ||
539 | pci_read_config_dword(dev, 0x90, &ctl); | ||
540 | |||
541 | ctl |= 1; | ||
542 | ctl &= ~((1<<4) | (1<<5)); | ||
543 | |||
544 | pci_write_config_dword(dev, 0x90, ctl); | ||
545 | } | ||
546 | flush_gart(); | ||
547 | |||
548 | printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); | ||
549 | return 0; | ||
550 | |||
551 | nommu: | ||
552 | /* Should not happen anymore */ | ||
553 | printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" | ||
554 | KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); | ||
555 | return -1; | ||
556 | } | ||
557 | |||
558 | extern int agp_amd64_init(void); | ||
559 | |||
560 | static const struct dma_mapping_ops gart_dma_ops = { | ||
561 | .mapping_error = NULL, | ||
562 | .map_single = gart_map_single, | ||
563 | .map_simple = gart_map_simple, | ||
564 | .unmap_single = gart_unmap_single, | ||
565 | .sync_single_for_cpu = NULL, | ||
566 | .sync_single_for_device = NULL, | ||
567 | .sync_single_range_for_cpu = NULL, | ||
568 | .sync_single_range_for_device = NULL, | ||
569 | .sync_sg_for_cpu = NULL, | ||
570 | .sync_sg_for_device = NULL, | ||
571 | .map_sg = gart_map_sg, | ||
572 | .unmap_sg = gart_unmap_sg, | ||
573 | }; | ||
574 | |||
575 | void gart_iommu_shutdown(void) | ||
576 | { | ||
577 | struct pci_dev *dev; | ||
578 | int i; | ||
579 | |||
580 | if (no_agp && (dma_ops != &gart_dma_ops)) | ||
581 | return; | ||
582 | |||
583 | for (i = 0; i < num_k8_northbridges; i++) { | ||
584 | u32 ctl; | ||
585 | |||
586 | dev = k8_northbridges[i]; | ||
587 | pci_read_config_dword(dev, 0x90, &ctl); | ||
588 | |||
589 | ctl &= ~1; | ||
590 | |||
591 | pci_write_config_dword(dev, 0x90, ctl); | ||
592 | } | ||
593 | } | ||
594 | |||
595 | void __init gart_iommu_init(void) | ||
596 | { | ||
597 | struct agp_kern_info info; | ||
598 | unsigned long aper_size; | ||
599 | unsigned long iommu_start; | ||
600 | unsigned long scratch; | ||
601 | long i; | ||
602 | |||
603 | if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { | ||
604 | printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); | ||
605 | return; | ||
606 | } | ||
607 | |||
608 | #ifndef CONFIG_AGP_AMD64 | ||
609 | no_agp = 1; | ||
610 | #else | ||
611 | /* Makefile puts PCI initialization via subsys_initcall first. */ | ||
612 | /* Add other K8 AGP bridge drivers here */ | ||
613 | no_agp = no_agp || | ||
614 | (agp_amd64_init() < 0) || | ||
615 | (agp_copy_info(agp_bridge, &info) < 0); | ||
616 | #endif | ||
617 | |||
618 | if (swiotlb) | ||
619 | return; | ||
620 | |||
621 | /* Did we detect a different HW IOMMU? */ | ||
622 | if (iommu_detected && !iommu_aperture) | ||
623 | return; | ||
624 | |||
625 | if (no_iommu || | ||
626 | (!force_iommu && end_pfn <= MAX_DMA32_PFN) || | ||
627 | !iommu_aperture || | ||
628 | (no_agp && init_k8_gatt(&info) < 0)) { | ||
629 | if (end_pfn > MAX_DMA32_PFN) { | ||
630 | printk(KERN_ERR "WARNING more than 4GB of memory " | ||
631 | "but GART IOMMU not available.\n" | ||
632 | KERN_ERR "WARNING 32bit PCI may malfunction.\n"); | ||
633 | } | ||
634 | return; | ||
635 | } | ||
636 | |||
637 | printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); | ||
638 | aper_size = info.aper_size * 1024 * 1024; | ||
639 | iommu_size = check_iommu_size(info.aper_base, aper_size); | ||
640 | iommu_pages = iommu_size >> PAGE_SHIFT; | ||
641 | |||
642 | iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, | ||
643 | get_order(iommu_pages/8)); | ||
644 | if (!iommu_gart_bitmap) | ||
645 | panic("Cannot allocate iommu bitmap\n"); | ||
646 | memset(iommu_gart_bitmap, 0, iommu_pages/8); | ||
647 | |||
648 | #ifdef CONFIG_IOMMU_LEAK | ||
649 | if (leak_trace) { | ||
650 | iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, | ||
651 | get_order(iommu_pages*sizeof(void *))); | ||
652 | if (iommu_leak_tab) | ||
653 | memset(iommu_leak_tab, 0, iommu_pages * 8); | ||
654 | else | ||
655 | printk("PCI-DMA: Cannot allocate leak trace area\n"); | ||
656 | } | ||
657 | #endif | ||
658 | |||
659 | /* | ||
660 | * Out of IOMMU space handling. | ||
661 | * Reserve some invalid pages at the beginning of the GART. | ||
662 | */ | ||
663 | set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); | ||
664 | |||
665 | agp_memory_reserved = iommu_size; | ||
666 | printk(KERN_INFO | ||
667 | "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", | ||
668 | iommu_size>>20); | ||
669 | |||
670 | iommu_start = aper_size - iommu_size; | ||
671 | iommu_bus_base = info.aper_base + iommu_start; | ||
672 | bad_dma_address = iommu_bus_base; | ||
673 | iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); | ||
674 | |||
675 | /* | ||
676 | * Unmap the IOMMU part of the GART. The alias of the page is | ||
677 | * always mapped with cache enabled and there is no full cache | ||
678 | * coherency across the GART remapping. The unmapping avoids | ||
679 | * automatic prefetches from the CPU allocating cache lines in | ||
680 | * there. All CPU accesses are done via the direct mapping to | ||
681 | * the backing memory. The GART address is only used by PCI | ||
682 | * devices. | ||
683 | */ | ||
684 | clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); | ||
685 | |||
686 | /* | ||
687 | * Try to workaround a bug (thanks to BenH) | ||
688 | * Set unmapped entries to a scratch page instead of 0. | ||
689 | * Any prefetches that hit unmapped entries won't get an bus abort | ||
690 | * then. | ||
691 | */ | ||
692 | scratch = get_zeroed_page(GFP_KERNEL); | ||
693 | if (!scratch) | ||
694 | panic("Cannot allocate iommu scratch page"); | ||
695 | gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); | ||
696 | for (i = EMERGENCY_PAGES; i < iommu_pages; i++) | ||
697 | iommu_gatt_base[i] = gart_unmapped_entry; | ||
698 | |||
699 | flush_gart(); | ||
700 | dma_ops = &gart_dma_ops; | ||
701 | } | ||
702 | |||
703 | void __init gart_parse_options(char *p) | ||
704 | { | ||
705 | int arg; | ||
706 | |||
707 | #ifdef CONFIG_IOMMU_LEAK | ||
708 | if (!strncmp(p,"leak",4)) { | ||
709 | leak_trace = 1; | ||
710 | p += 4; | ||
711 | if (*p == '=') ++p; | ||
712 | if (isdigit(*p) && get_option(&p, &arg)) | ||
713 | iommu_leak_pages = arg; | ||
714 | } | ||
715 | #endif | ||
716 | if (isdigit(*p) && get_option(&p, &arg)) | ||
717 | iommu_size = arg; | ||
718 | if (!strncmp(p, "fullflush",8)) | ||
719 | iommu_fullflush = 1; | ||
720 | if (!strncmp(p, "nofullflush",11)) | ||
721 | iommu_fullflush = 0; | ||
722 | if (!strncmp(p,"noagp",5)) | ||
723 | no_agp = 1; | ||
724 | if (!strncmp(p, "noaperture",10)) | ||
725 | fix_aperture = 0; | ||
726 | /* duplicated from pci-dma.c */ | ||
727 | if (!strncmp(p,"force",5)) | ||
728 | iommu_aperture_allowed = 1; | ||
729 | if (!strncmp(p,"allowed",7)) | ||
730 | iommu_aperture_allowed = 1; | ||
731 | if (!strncmp(p, "memaper", 7)) { | ||
732 | fallback_aper_force = 1; | ||
733 | p += 7; | ||
734 | if (*p == '=') { | ||
735 | ++p; | ||
736 | if (get_option(&p, &arg)) | ||
737 | fallback_aper_order = arg; | ||
738 | } | ||
739 | } | ||
740 | } | ||
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c deleted file mode 100644 index 2a34c6c025a9..000000000000 --- a/arch/x86_64/kernel/pci-nommu.c +++ /dev/null | |||
@@ -1,97 +0,0 @@ | |||
1 | /* Fallback functions when the main IOMMU code is not compiled in. This | ||
2 | code is roughly equivalent to i386. */ | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/pci.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <linux/dma-mapping.h> | ||
8 | |||
9 | #include <asm/iommu.h> | ||
10 | #include <asm/processor.h> | ||
11 | #include <asm/dma.h> | ||
12 | |||
13 | static int | ||
14 | check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) | ||
15 | { | ||
16 | if (hwdev && bus + size > *hwdev->dma_mask) { | ||
17 | if (*hwdev->dma_mask >= DMA_32BIT_MASK) | ||
18 | printk(KERN_ERR | ||
19 | "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", | ||
20 | name, (long long)bus, size, | ||
21 | (long long)*hwdev->dma_mask); | ||
22 | return 0; | ||
23 | } | ||
24 | return 1; | ||
25 | } | ||
26 | |||
27 | static dma_addr_t | ||
28 | nommu_map_single(struct device *hwdev, void *ptr, size_t size, | ||
29 | int direction) | ||
30 | { | ||
31 | dma_addr_t bus = virt_to_bus(ptr); | ||
32 | if (!check_addr("map_single", hwdev, bus, size)) | ||
33 | return bad_dma_address; | ||
34 | return bus; | ||
35 | } | ||
36 | |||
37 | static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, | ||
38 | int direction) | ||
39 | { | ||
40 | } | ||
41 | |||
42 | /* Map a set of buffers described by scatterlist in streaming | ||
43 | * mode for DMA. This is the scatter-gather version of the | ||
44 | * above pci_map_single interface. Here the scatter gather list | ||
45 | * elements are each tagged with the appropriate dma address | ||
46 | * and length. They are obtained via sg_dma_{address,length}(SG). | ||
47 | * | ||
48 | * NOTE: An implementation may be able to use a smaller number of | ||
49 | * DMA address/length pairs than there are SG table elements. | ||
50 | * (for example via virtual mapping capabilities) | ||
51 | * The routine returns the number of addr/length pairs actually | ||
52 | * used, at most nents. | ||
53 | * | ||
54 | * Device ownership issues as mentioned above for pci_map_single are | ||
55 | * the same here. | ||
56 | */ | ||
57 | static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, | ||
58 | int nents, int direction) | ||
59 | { | ||
60 | int i; | ||
61 | |||
62 | for (i = 0; i < nents; i++ ) { | ||
63 | struct scatterlist *s = &sg[i]; | ||
64 | BUG_ON(!s->page); | ||
65 | s->dma_address = virt_to_bus(page_address(s->page) +s->offset); | ||
66 | if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) | ||
67 | return 0; | ||
68 | s->dma_length = s->length; | ||
69 | } | ||
70 | return nents; | ||
71 | } | ||
72 | |||
73 | /* Unmap a set of streaming mode DMA translations. | ||
74 | * Again, cpu read rules concerning calls here are the same as for | ||
75 | * pci_unmap_single() above. | ||
76 | */ | ||
77 | static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, | ||
78 | int nents, int dir) | ||
79 | { | ||
80 | } | ||
81 | |||
82 | const struct dma_mapping_ops nommu_dma_ops = { | ||
83 | .map_single = nommu_map_single, | ||
84 | .unmap_single = nommu_unmap_single, | ||
85 | .map_sg = nommu_map_sg, | ||
86 | .unmap_sg = nommu_unmap_sg, | ||
87 | .is_phys = 1, | ||
88 | }; | ||
89 | |||
90 | void __init no_iommu_init(void) | ||
91 | { | ||
92 | if (dma_ops) | ||
93 | return; | ||
94 | |||
95 | force_iommu = 0; /* no HW IOMMU */ | ||
96 | dma_ops = &nommu_dma_ops; | ||
97 | } | ||
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c deleted file mode 100644 index b2f405ea7c85..000000000000 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ /dev/null | |||
@@ -1,44 +0,0 @@ | |||
1 | /* Glue code to lib/swiotlb.c */ | ||
2 | |||
3 | #include <linux/pci.h> | ||
4 | #include <linux/cache.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/dma-mapping.h> | ||
7 | |||
8 | #include <asm/iommu.h> | ||
9 | #include <asm/swiotlb.h> | ||
10 | #include <asm/dma.h> | ||
11 | |||
12 | int swiotlb __read_mostly; | ||
13 | EXPORT_SYMBOL(swiotlb); | ||
14 | |||
15 | const struct dma_mapping_ops swiotlb_dma_ops = { | ||
16 | .mapping_error = swiotlb_dma_mapping_error, | ||
17 | .alloc_coherent = swiotlb_alloc_coherent, | ||
18 | .free_coherent = swiotlb_free_coherent, | ||
19 | .map_single = swiotlb_map_single, | ||
20 | .unmap_single = swiotlb_unmap_single, | ||
21 | .sync_single_for_cpu = swiotlb_sync_single_for_cpu, | ||
22 | .sync_single_for_device = swiotlb_sync_single_for_device, | ||
23 | .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, | ||
24 | .sync_single_range_for_device = swiotlb_sync_single_range_for_device, | ||
25 | .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, | ||
26 | .sync_sg_for_device = swiotlb_sync_sg_for_device, | ||
27 | .map_sg = swiotlb_map_sg, | ||
28 | .unmap_sg = swiotlb_unmap_sg, | ||
29 | .dma_supported = NULL, | ||
30 | }; | ||
31 | |||
32 | void __init pci_swiotlb_init(void) | ||
33 | { | ||
34 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ | ||
35 | if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) | ||
36 | swiotlb = 1; | ||
37 | if (swiotlb_force) | ||
38 | swiotlb = 1; | ||
39 | if (swiotlb) { | ||
40 | printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); | ||
41 | swiotlb_init(); | ||
42 | dma_ops = &swiotlb_dma_ops; | ||
43 | } | ||
44 | } | ||
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c deleted file mode 100644 index ae8f91214f15..000000000000 --- a/arch/x86_64/kernel/pmtimer.c +++ /dev/null | |||
@@ -1,69 +0,0 @@ | |||
1 | /* Ported over from i386 by AK, original copyright was: | ||
2 | * | ||
3 | * (C) Dominik Brodowski <linux@brodo.de> 2003 | ||
4 | * | ||
5 | * Driver to use the Power Management Timer (PMTMR) available in some | ||
6 | * southbridges as primary timing source for the Linux kernel. | ||
7 | * | ||
8 | * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, | ||
9 | * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. | ||
10 | * | ||
11 | * This file is licensed under the GPL v2. | ||
12 | * | ||
13 | * Dropped all the hardware bug workarounds for now. Hopefully they | ||
14 | * are not needed on 64bit chipsets. | ||
15 | */ | ||
16 | |||
17 | #include <linux/jiffies.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/cpumask.h> | ||
22 | #include <asm/io.h> | ||
23 | #include <asm/proto.h> | ||
24 | #include <asm/msr.h> | ||
25 | #include <asm/vsyscall.h> | ||
26 | |||
27 | #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ | ||
28 | |||
29 | static inline u32 cyc2us(u32 cycles) | ||
30 | { | ||
31 | /* The Power Management Timer ticks at 3.579545 ticks per microsecond. | ||
32 | * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] | ||
33 | * | ||
34 | * Even with HZ = 100, delta is at maximum 35796 ticks, so it can | ||
35 | * easily be multiplied with 286 (=0x11E) without having to fear | ||
36 | * u32 overflows. | ||
37 | */ | ||
38 | cycles *= 286; | ||
39 | return (cycles >> 10); | ||
40 | } | ||
41 | |||
42 | static unsigned pmtimer_wait_tick(void) | ||
43 | { | ||
44 | u32 a, b; | ||
45 | for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; | ||
46 | a == b; | ||
47 | b = inl(pmtmr_ioport) & ACPI_PM_MASK) | ||
48 | cpu_relax(); | ||
49 | return b; | ||
50 | } | ||
51 | |||
52 | /* note: wait time is rounded up to one tick */ | ||
53 | void pmtimer_wait(unsigned us) | ||
54 | { | ||
55 | u32 a, b; | ||
56 | a = pmtimer_wait_tick(); | ||
57 | do { | ||
58 | b = inl(pmtmr_ioport); | ||
59 | cpu_relax(); | ||
60 | } while (cyc2us(b - a) < us); | ||
61 | } | ||
62 | |||
63 | static int __init nopmtimer_setup(char *s) | ||
64 | { | ||
65 | pmtmr_ioport = 0; | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | __setup("nopmtimer", nopmtimer_setup); | ||
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c deleted file mode 100644 index 98956555450b..000000000000 --- a/arch/x86_64/kernel/process.c +++ /dev/null | |||
@@ -1,903 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/process.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Pentium III FXSR, SSE support | ||
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
8 | * | ||
9 | * X86-64 port | ||
10 | * Andi Kleen. | ||
11 | * | ||
12 | * CPU hotplug support - ashok.raj@intel.com | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * This file handles the architecture-dependent parts of process handling.. | ||
17 | */ | ||
18 | |||
19 | #include <stdarg.h> | ||
20 | |||
21 | #include <linux/cpu.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/fs.h> | ||
27 | #include <linux/elfcore.h> | ||
28 | #include <linux/smp.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/user.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/a.out.h> | ||
33 | #include <linux/interrupt.h> | ||
34 | #include <linux/delay.h> | ||
35 | #include <linux/ptrace.h> | ||
36 | #include <linux/utsname.h> | ||
37 | #include <linux/random.h> | ||
38 | #include <linux/notifier.h> | ||
39 | #include <linux/kprobes.h> | ||
40 | #include <linux/kdebug.h> | ||
41 | |||
42 | #include <asm/uaccess.h> | ||
43 | #include <asm/pgtable.h> | ||
44 | #include <asm/system.h> | ||
45 | #include <asm/io.h> | ||
46 | #include <asm/processor.h> | ||
47 | #include <asm/i387.h> | ||
48 | #include <asm/mmu_context.h> | ||
49 | #include <asm/pda.h> | ||
50 | #include <asm/prctl.h> | ||
51 | #include <asm/desc.h> | ||
52 | #include <asm/proto.h> | ||
53 | #include <asm/ia32.h> | ||
54 | #include <asm/idle.h> | ||
55 | |||
56 | asmlinkage extern void ret_from_fork(void); | ||
57 | |||
58 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; | ||
59 | |||
60 | unsigned long boot_option_idle_override = 0; | ||
61 | EXPORT_SYMBOL(boot_option_idle_override); | ||
62 | |||
63 | /* | ||
64 | * Powermanagement idle function, if any.. | ||
65 | */ | ||
66 | void (*pm_idle)(void); | ||
67 | EXPORT_SYMBOL(pm_idle); | ||
68 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | ||
69 | |||
70 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); | ||
71 | |||
72 | void idle_notifier_register(struct notifier_block *n) | ||
73 | { | ||
74 | atomic_notifier_chain_register(&idle_notifier, n); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(idle_notifier_register); | ||
77 | |||
78 | void idle_notifier_unregister(struct notifier_block *n) | ||
79 | { | ||
80 | atomic_notifier_chain_unregister(&idle_notifier, n); | ||
81 | } | ||
82 | EXPORT_SYMBOL(idle_notifier_unregister); | ||
83 | |||
84 | void enter_idle(void) | ||
85 | { | ||
86 | write_pda(isidle, 1); | ||
87 | atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); | ||
88 | } | ||
89 | |||
90 | static void __exit_idle(void) | ||
91 | { | ||
92 | if (test_and_clear_bit_pda(0, isidle) == 0) | ||
93 | return; | ||
94 | atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); | ||
95 | } | ||
96 | |||
97 | /* Called from interrupts to signify idle end */ | ||
98 | void exit_idle(void) | ||
99 | { | ||
100 | /* idle loop has pid 0 */ | ||
101 | if (current->pid) | ||
102 | return; | ||
103 | __exit_idle(); | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * We use this if we don't have any better | ||
108 | * idle routine.. | ||
109 | */ | ||
110 | static void default_idle(void) | ||
111 | { | ||
112 | current_thread_info()->status &= ~TS_POLLING; | ||
113 | /* | ||
114 | * TS_POLLING-cleared state must be visible before we | ||
115 | * test NEED_RESCHED: | ||
116 | */ | ||
117 | smp_mb(); | ||
118 | local_irq_disable(); | ||
119 | if (!need_resched()) { | ||
120 | /* Enables interrupts one instruction before HLT. | ||
121 | x86 special cases this so there is no race. */ | ||
122 | safe_halt(); | ||
123 | } else | ||
124 | local_irq_enable(); | ||
125 | current_thread_info()->status |= TS_POLLING; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * On SMP it's slightly faster (but much more power-consuming!) | ||
130 | * to poll the ->need_resched flag instead of waiting for the | ||
131 | * cross-CPU IPI to arrive. Use this option with caution. | ||
132 | */ | ||
133 | static void poll_idle (void) | ||
134 | { | ||
135 | local_irq_enable(); | ||
136 | cpu_relax(); | ||
137 | } | ||
138 | |||
139 | void cpu_idle_wait(void) | ||
140 | { | ||
141 | unsigned int cpu, this_cpu = get_cpu(); | ||
142 | cpumask_t map, tmp = current->cpus_allowed; | ||
143 | |||
144 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | ||
145 | put_cpu(); | ||
146 | |||
147 | cpus_clear(map); | ||
148 | for_each_online_cpu(cpu) { | ||
149 | per_cpu(cpu_idle_state, cpu) = 1; | ||
150 | cpu_set(cpu, map); | ||
151 | } | ||
152 | |||
153 | __get_cpu_var(cpu_idle_state) = 0; | ||
154 | |||
155 | wmb(); | ||
156 | do { | ||
157 | ssleep(1); | ||
158 | for_each_online_cpu(cpu) { | ||
159 | if (cpu_isset(cpu, map) && | ||
160 | !per_cpu(cpu_idle_state, cpu)) | ||
161 | cpu_clear(cpu, map); | ||
162 | } | ||
163 | cpus_and(map, map, cpu_online_map); | ||
164 | } while (!cpus_empty(map)); | ||
165 | |||
166 | set_cpus_allowed(current, tmp); | ||
167 | } | ||
168 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | ||
169 | |||
170 | #ifdef CONFIG_HOTPLUG_CPU | ||
171 | DECLARE_PER_CPU(int, cpu_state); | ||
172 | |||
173 | #include <asm/nmi.h> | ||
174 | /* We halt the CPU with physical CPU hotplug */ | ||
175 | static inline void play_dead(void) | ||
176 | { | ||
177 | idle_task_exit(); | ||
178 | wbinvd(); | ||
179 | mb(); | ||
180 | /* Ack it */ | ||
181 | __get_cpu_var(cpu_state) = CPU_DEAD; | ||
182 | |||
183 | local_irq_disable(); | ||
184 | while (1) | ||
185 | halt(); | ||
186 | } | ||
187 | #else | ||
188 | static inline void play_dead(void) | ||
189 | { | ||
190 | BUG(); | ||
191 | } | ||
192 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
193 | |||
194 | /* | ||
195 | * The idle thread. There's no useful work to be | ||
196 | * done, so just try to conserve power and have a | ||
197 | * low exit latency (ie sit in a loop waiting for | ||
198 | * somebody to say that they'd like to reschedule) | ||
199 | */ | ||
200 | void cpu_idle (void) | ||
201 | { | ||
202 | current_thread_info()->status |= TS_POLLING; | ||
203 | /* endless idle loop with no priority at all */ | ||
204 | while (1) { | ||
205 | while (!need_resched()) { | ||
206 | void (*idle)(void); | ||
207 | |||
208 | if (__get_cpu_var(cpu_idle_state)) | ||
209 | __get_cpu_var(cpu_idle_state) = 0; | ||
210 | |||
211 | rmb(); | ||
212 | idle = pm_idle; | ||
213 | if (!idle) | ||
214 | idle = default_idle; | ||
215 | if (cpu_is_offline(smp_processor_id())) | ||
216 | play_dead(); | ||
217 | /* | ||
218 | * Idle routines should keep interrupts disabled | ||
219 | * from here on, until they go to idle. | ||
220 | * Otherwise, idle callbacks can misfire. | ||
221 | */ | ||
222 | local_irq_disable(); | ||
223 | enter_idle(); | ||
224 | idle(); | ||
225 | /* In many cases the interrupt that ended idle | ||
226 | has already called exit_idle. But some idle | ||
227 | loops can be woken up without interrupt. */ | ||
228 | __exit_idle(); | ||
229 | } | ||
230 | |||
231 | preempt_enable_no_resched(); | ||
232 | schedule(); | ||
233 | preempt_disable(); | ||
234 | } | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
239 | * which can obviate IPI to trigger checking of need_resched. | ||
240 | * We execute MONITOR against need_resched and enter optimized wait state | ||
241 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
242 | * up from MWAIT (without an IPI). | ||
243 | * | ||
244 | * New with Core Duo processors, MWAIT can take some hints based on CPU | ||
245 | * capability. | ||
246 | */ | ||
247 | void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) | ||
248 | { | ||
249 | if (!need_resched()) { | ||
250 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
251 | smp_mb(); | ||
252 | if (!need_resched()) | ||
253 | __mwait(eax, ecx); | ||
254 | } | ||
255 | } | ||
256 | |||
257 | /* Default MONITOR/MWAIT with no hints, used for default C1 state */ | ||
258 | static void mwait_idle(void) | ||
259 | { | ||
260 | if (!need_resched()) { | ||
261 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
262 | smp_mb(); | ||
263 | if (!need_resched()) | ||
264 | __sti_mwait(0, 0); | ||
265 | else | ||
266 | local_irq_enable(); | ||
267 | } else { | ||
268 | local_irq_enable(); | ||
269 | } | ||
270 | } | ||
271 | |||
272 | void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | ||
273 | { | ||
274 | static int printed; | ||
275 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | ||
276 | /* | ||
277 | * Skip, if setup has overridden idle. | ||
278 | * One CPU supports mwait => All CPUs supports mwait | ||
279 | */ | ||
280 | if (!pm_idle) { | ||
281 | if (!printed) { | ||
282 | printk(KERN_INFO "using mwait in idle threads.\n"); | ||
283 | printed = 1; | ||
284 | } | ||
285 | pm_idle = mwait_idle; | ||
286 | } | ||
287 | } | ||
288 | } | ||
289 | |||
290 | static int __init idle_setup (char *str) | ||
291 | { | ||
292 | if (!strcmp(str, "poll")) { | ||
293 | printk("using polling idle threads.\n"); | ||
294 | pm_idle = poll_idle; | ||
295 | } else if (!strcmp(str, "mwait")) | ||
296 | force_mwait = 1; | ||
297 | else | ||
298 | return -1; | ||
299 | |||
300 | boot_option_idle_override = 1; | ||
301 | return 0; | ||
302 | } | ||
303 | early_param("idle", idle_setup); | ||
304 | |||
305 | /* Prints also some state that isn't saved in the pt_regs */ | ||
306 | void __show_regs(struct pt_regs * regs) | ||
307 | { | ||
308 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; | ||
309 | unsigned long d0, d1, d2, d3, d6, d7; | ||
310 | unsigned int fsindex,gsindex; | ||
311 | unsigned int ds,cs,es; | ||
312 | |||
313 | printk("\n"); | ||
314 | print_modules(); | ||
315 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
316 | current->pid, current->comm, print_tainted(), | ||
317 | init_utsname()->release, | ||
318 | (int)strcspn(init_utsname()->version, " "), | ||
319 | init_utsname()->version); | ||
320 | printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); | ||
321 | printk_address(regs->rip); | ||
322 | printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, | ||
323 | regs->eflags); | ||
324 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | ||
325 | regs->rax, regs->rbx, regs->rcx); | ||
326 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | ||
327 | regs->rdx, regs->rsi, regs->rdi); | ||
328 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", | ||
329 | regs->rbp, regs->r8, regs->r9); | ||
330 | printk("R10: %016lx R11: %016lx R12: %016lx\n", | ||
331 | regs->r10, regs->r11, regs->r12); | ||
332 | printk("R13: %016lx R14: %016lx R15: %016lx\n", | ||
333 | regs->r13, regs->r14, regs->r15); | ||
334 | |||
335 | asm("movl %%ds,%0" : "=r" (ds)); | ||
336 | asm("movl %%cs,%0" : "=r" (cs)); | ||
337 | asm("movl %%es,%0" : "=r" (es)); | ||
338 | asm("movl %%fs,%0" : "=r" (fsindex)); | ||
339 | asm("movl %%gs,%0" : "=r" (gsindex)); | ||
340 | |||
341 | rdmsrl(MSR_FS_BASE, fs); | ||
342 | rdmsrl(MSR_GS_BASE, gs); | ||
343 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | ||
344 | |||
345 | cr0 = read_cr0(); | ||
346 | cr2 = read_cr2(); | ||
347 | cr3 = read_cr3(); | ||
348 | cr4 = read_cr4(); | ||
349 | |||
350 | printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | ||
351 | fs,fsindex,gs,gsindex,shadowgs); | ||
352 | printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); | ||
353 | printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); | ||
354 | |||
355 | get_debugreg(d0, 0); | ||
356 | get_debugreg(d1, 1); | ||
357 | get_debugreg(d2, 2); | ||
358 | printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); | ||
359 | get_debugreg(d3, 3); | ||
360 | get_debugreg(d6, 6); | ||
361 | get_debugreg(d7, 7); | ||
362 | printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); | ||
363 | } | ||
364 | |||
365 | void show_regs(struct pt_regs *regs) | ||
366 | { | ||
367 | printk("CPU %d:", smp_processor_id()); | ||
368 | __show_regs(regs); | ||
369 | show_trace(NULL, regs, (void *)(regs + 1)); | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Free current thread data structures etc.. | ||
374 | */ | ||
375 | void exit_thread(void) | ||
376 | { | ||
377 | struct task_struct *me = current; | ||
378 | struct thread_struct *t = &me->thread; | ||
379 | |||
380 | if (me->thread.io_bitmap_ptr) { | ||
381 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | ||
382 | |||
383 | kfree(t->io_bitmap_ptr); | ||
384 | t->io_bitmap_ptr = NULL; | ||
385 | clear_thread_flag(TIF_IO_BITMAP); | ||
386 | /* | ||
387 | * Careful, clear this in the TSS too: | ||
388 | */ | ||
389 | memset(tss->io_bitmap, 0xff, t->io_bitmap_max); | ||
390 | t->io_bitmap_max = 0; | ||
391 | put_cpu(); | ||
392 | } | ||
393 | } | ||
394 | |||
395 | void flush_thread(void) | ||
396 | { | ||
397 | struct task_struct *tsk = current; | ||
398 | |||
399 | if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { | ||
400 | clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); | ||
401 | if (test_tsk_thread_flag(tsk, TIF_IA32)) { | ||
402 | clear_tsk_thread_flag(tsk, TIF_IA32); | ||
403 | } else { | ||
404 | set_tsk_thread_flag(tsk, TIF_IA32); | ||
405 | current_thread_info()->status |= TS_COMPAT; | ||
406 | } | ||
407 | } | ||
408 | clear_tsk_thread_flag(tsk, TIF_DEBUG); | ||
409 | |||
410 | tsk->thread.debugreg0 = 0; | ||
411 | tsk->thread.debugreg1 = 0; | ||
412 | tsk->thread.debugreg2 = 0; | ||
413 | tsk->thread.debugreg3 = 0; | ||
414 | tsk->thread.debugreg6 = 0; | ||
415 | tsk->thread.debugreg7 = 0; | ||
416 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | ||
417 | /* | ||
418 | * Forget coprocessor state.. | ||
419 | */ | ||
420 | clear_fpu(tsk); | ||
421 | clear_used_math(); | ||
422 | } | ||
423 | |||
424 | void release_thread(struct task_struct *dead_task) | ||
425 | { | ||
426 | if (dead_task->mm) { | ||
427 | if (dead_task->mm->context.size) { | ||
428 | printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", | ||
429 | dead_task->comm, | ||
430 | dead_task->mm->context.ldt, | ||
431 | dead_task->mm->context.size); | ||
432 | BUG(); | ||
433 | } | ||
434 | } | ||
435 | } | ||
436 | |||
437 | static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) | ||
438 | { | ||
439 | struct user_desc ud = { | ||
440 | .base_addr = addr, | ||
441 | .limit = 0xfffff, | ||
442 | .seg_32bit = 1, | ||
443 | .limit_in_pages = 1, | ||
444 | .useable = 1, | ||
445 | }; | ||
446 | struct n_desc_struct *desc = (void *)t->thread.tls_array; | ||
447 | desc += tls; | ||
448 | desc->a = LDT_entry_a(&ud); | ||
449 | desc->b = LDT_entry_b(&ud); | ||
450 | } | ||
451 | |||
452 | static inline u32 read_32bit_tls(struct task_struct *t, int tls) | ||
453 | { | ||
454 | struct desc_struct *desc = (void *)t->thread.tls_array; | ||
455 | desc += tls; | ||
456 | return desc->base0 | | ||
457 | (((u32)desc->base1) << 16) | | ||
458 | (((u32)desc->base2) << 24); | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * This gets called before we allocate a new thread and copy | ||
463 | * the current task into it. | ||
464 | */ | ||
465 | void prepare_to_copy(struct task_struct *tsk) | ||
466 | { | ||
467 | unlazy_fpu(tsk); | ||
468 | } | ||
469 | |||
470 | int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, | ||
471 | unsigned long unused, | ||
472 | struct task_struct * p, struct pt_regs * regs) | ||
473 | { | ||
474 | int err; | ||
475 | struct pt_regs * childregs; | ||
476 | struct task_struct *me = current; | ||
477 | |||
478 | childregs = ((struct pt_regs *) | ||
479 | (THREAD_SIZE + task_stack_page(p))) - 1; | ||
480 | *childregs = *regs; | ||
481 | |||
482 | childregs->rax = 0; | ||
483 | childregs->rsp = rsp; | ||
484 | if (rsp == ~0UL) | ||
485 | childregs->rsp = (unsigned long)childregs; | ||
486 | |||
487 | p->thread.rsp = (unsigned long) childregs; | ||
488 | p->thread.rsp0 = (unsigned long) (childregs+1); | ||
489 | p->thread.userrsp = me->thread.userrsp; | ||
490 | |||
491 | set_tsk_thread_flag(p, TIF_FORK); | ||
492 | |||
493 | p->thread.fs = me->thread.fs; | ||
494 | p->thread.gs = me->thread.gs; | ||
495 | |||
496 | asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); | ||
497 | asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); | ||
498 | asm("mov %%es,%0" : "=m" (p->thread.es)); | ||
499 | asm("mov %%ds,%0" : "=m" (p->thread.ds)); | ||
500 | |||
501 | if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { | ||
502 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
503 | if (!p->thread.io_bitmap_ptr) { | ||
504 | p->thread.io_bitmap_max = 0; | ||
505 | return -ENOMEM; | ||
506 | } | ||
507 | memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, | ||
508 | IO_BITMAP_BYTES); | ||
509 | set_tsk_thread_flag(p, TIF_IO_BITMAP); | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * Set a new TLS for the child thread? | ||
514 | */ | ||
515 | if (clone_flags & CLONE_SETTLS) { | ||
516 | #ifdef CONFIG_IA32_EMULATION | ||
517 | if (test_thread_flag(TIF_IA32)) | ||
518 | err = ia32_child_tls(p, childregs); | ||
519 | else | ||
520 | #endif | ||
521 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | ||
522 | if (err) | ||
523 | goto out; | ||
524 | } | ||
525 | err = 0; | ||
526 | out: | ||
527 | if (err && p->thread.io_bitmap_ptr) { | ||
528 | kfree(p->thread.io_bitmap_ptr); | ||
529 | p->thread.io_bitmap_max = 0; | ||
530 | } | ||
531 | return err; | ||
532 | } | ||
533 | |||
534 | /* | ||
535 | * This special macro can be used to load a debugging register | ||
536 | */ | ||
537 | #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) | ||
538 | |||
539 | static inline void __switch_to_xtra(struct task_struct *prev_p, | ||
540 | struct task_struct *next_p, | ||
541 | struct tss_struct *tss) | ||
542 | { | ||
543 | struct thread_struct *prev, *next; | ||
544 | |||
545 | prev = &prev_p->thread, | ||
546 | next = &next_p->thread; | ||
547 | |||
548 | if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { | ||
549 | loaddebug(next, 0); | ||
550 | loaddebug(next, 1); | ||
551 | loaddebug(next, 2); | ||
552 | loaddebug(next, 3); | ||
553 | /* no 4 and 5 */ | ||
554 | loaddebug(next, 6); | ||
555 | loaddebug(next, 7); | ||
556 | } | ||
557 | |||
558 | if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { | ||
559 | /* | ||
560 | * Copy the relevant range of the IO bitmap. | ||
561 | * Normally this is 128 bytes or less: | ||
562 | */ | ||
563 | memcpy(tss->io_bitmap, next->io_bitmap_ptr, | ||
564 | max(prev->io_bitmap_max, next->io_bitmap_max)); | ||
565 | } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { | ||
566 | /* | ||
567 | * Clear any possible leftover bits: | ||
568 | */ | ||
569 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); | ||
570 | } | ||
571 | } | ||
572 | |||
573 | /* | ||
574 | * switch_to(x,y) should switch tasks from x to y. | ||
575 | * | ||
576 | * This could still be optimized: | ||
577 | * - fold all the options into a flag word and test it with a single test. | ||
578 | * - could test fs/gs bitsliced | ||
579 | * | ||
580 | * Kprobes not supported here. Set the probe on schedule instead. | ||
581 | */ | ||
582 | __kprobes struct task_struct * | ||
583 | __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
584 | { | ||
585 | struct thread_struct *prev = &prev_p->thread, | ||
586 | *next = &next_p->thread; | ||
587 | int cpu = smp_processor_id(); | ||
588 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
589 | |||
590 | /* we're going to use this soon, after a few expensive things */ | ||
591 | if (next_p->fpu_counter>5) | ||
592 | prefetch(&next->i387.fxsave); | ||
593 | |||
594 | /* | ||
595 | * Reload esp0, LDT and the page table pointer: | ||
596 | */ | ||
597 | tss->rsp0 = next->rsp0; | ||
598 | |||
599 | /* | ||
600 | * Switch DS and ES. | ||
601 | * This won't pick up thread selector changes, but I guess that is ok. | ||
602 | */ | ||
603 | asm volatile("mov %%es,%0" : "=m" (prev->es)); | ||
604 | if (unlikely(next->es | prev->es)) | ||
605 | loadsegment(es, next->es); | ||
606 | |||
607 | asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); | ||
608 | if (unlikely(next->ds | prev->ds)) | ||
609 | loadsegment(ds, next->ds); | ||
610 | |||
611 | load_TLS(next, cpu); | ||
612 | |||
613 | /* | ||
614 | * Switch FS and GS. | ||
615 | */ | ||
616 | { | ||
617 | unsigned fsindex; | ||
618 | asm volatile("movl %%fs,%0" : "=r" (fsindex)); | ||
619 | /* segment register != 0 always requires a reload. | ||
620 | also reload when it has changed. | ||
621 | when prev process used 64bit base always reload | ||
622 | to avoid an information leak. */ | ||
623 | if (unlikely(fsindex | next->fsindex | prev->fs)) { | ||
624 | loadsegment(fs, next->fsindex); | ||
625 | /* check if the user used a selector != 0 | ||
626 | * if yes clear 64bit base, since overloaded base | ||
627 | * is always mapped to the Null selector | ||
628 | */ | ||
629 | if (fsindex) | ||
630 | prev->fs = 0; | ||
631 | } | ||
632 | /* when next process has a 64bit base use it */ | ||
633 | if (next->fs) | ||
634 | wrmsrl(MSR_FS_BASE, next->fs); | ||
635 | prev->fsindex = fsindex; | ||
636 | } | ||
637 | { | ||
638 | unsigned gsindex; | ||
639 | asm volatile("movl %%gs,%0" : "=r" (gsindex)); | ||
640 | if (unlikely(gsindex | next->gsindex | prev->gs)) { | ||
641 | load_gs_index(next->gsindex); | ||
642 | if (gsindex) | ||
643 | prev->gs = 0; | ||
644 | } | ||
645 | if (next->gs) | ||
646 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | ||
647 | prev->gsindex = gsindex; | ||
648 | } | ||
649 | |||
650 | /* Must be after DS reload */ | ||
651 | unlazy_fpu(prev_p); | ||
652 | |||
653 | /* | ||
654 | * Switch the PDA and FPU contexts. | ||
655 | */ | ||
656 | prev->userrsp = read_pda(oldrsp); | ||
657 | write_pda(oldrsp, next->userrsp); | ||
658 | write_pda(pcurrent, next_p); | ||
659 | |||
660 | write_pda(kernelstack, | ||
661 | (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); | ||
662 | #ifdef CONFIG_CC_STACKPROTECTOR | ||
663 | write_pda(stack_canary, next_p->stack_canary); | ||
664 | /* | ||
665 | * Build time only check to make sure the stack_canary is at | ||
666 | * offset 40 in the pda; this is a gcc ABI requirement | ||
667 | */ | ||
668 | BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); | ||
669 | #endif | ||
670 | |||
671 | /* | ||
672 | * Now maybe reload the debug registers and handle I/O bitmaps | ||
673 | */ | ||
674 | if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) | ||
675 | || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) | ||
676 | __switch_to_xtra(prev_p, next_p, tss); | ||
677 | |||
678 | /* If the task has used fpu the last 5 timeslices, just do a full | ||
679 | * restore of the math state immediately to avoid the trap; the | ||
680 | * chances of needing FPU soon are obviously high now | ||
681 | */ | ||
682 | if (next_p->fpu_counter>5) | ||
683 | math_state_restore(); | ||
684 | return prev_p; | ||
685 | } | ||
686 | |||
687 | /* | ||
688 | * sys_execve() executes a new program. | ||
689 | */ | ||
690 | asmlinkage | ||
691 | long sys_execve(char __user *name, char __user * __user *argv, | ||
692 | char __user * __user *envp, struct pt_regs regs) | ||
693 | { | ||
694 | long error; | ||
695 | char * filename; | ||
696 | |||
697 | filename = getname(name); | ||
698 | error = PTR_ERR(filename); | ||
699 | if (IS_ERR(filename)) | ||
700 | return error; | ||
701 | error = do_execve(filename, argv, envp, ®s); | ||
702 | if (error == 0) { | ||
703 | task_lock(current); | ||
704 | current->ptrace &= ~PT_DTRACE; | ||
705 | task_unlock(current); | ||
706 | } | ||
707 | putname(filename); | ||
708 | return error; | ||
709 | } | ||
710 | |||
711 | void set_personality_64bit(void) | ||
712 | { | ||
713 | /* inherit personality from parent */ | ||
714 | |||
715 | /* Make sure to be in 64bit mode */ | ||
716 | clear_thread_flag(TIF_IA32); | ||
717 | |||
718 | /* TBD: overwrites user setup. Should have two bits. | ||
719 | But 64bit processes have always behaved this way, | ||
720 | so it's not too bad. The main problem is just that | ||
721 | 32bit childs are affected again. */ | ||
722 | current->personality &= ~READ_IMPLIES_EXEC; | ||
723 | } | ||
724 | |||
725 | asmlinkage long sys_fork(struct pt_regs *regs) | ||
726 | { | ||
727 | return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); | ||
728 | } | ||
729 | |||
730 | asmlinkage long | ||
731 | sys_clone(unsigned long clone_flags, unsigned long newsp, | ||
732 | void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | ||
733 | { | ||
734 | if (!newsp) | ||
735 | newsp = regs->rsp; | ||
736 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | ||
737 | } | ||
738 | |||
739 | /* | ||
740 | * This is trivial, and on the face of it looks like it | ||
741 | * could equally well be done in user mode. | ||
742 | * | ||
743 | * Not so, for quite unobvious reasons - register pressure. | ||
744 | * In user mode vfork() cannot have a stack frame, and if | ||
745 | * done by calling the "clone()" system call directly, you | ||
746 | * do not have enough call-clobbered registers to hold all | ||
747 | * the information you need. | ||
748 | */ | ||
749 | asmlinkage long sys_vfork(struct pt_regs *regs) | ||
750 | { | ||
751 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, | ||
752 | NULL, NULL); | ||
753 | } | ||
754 | |||
755 | unsigned long get_wchan(struct task_struct *p) | ||
756 | { | ||
757 | unsigned long stack; | ||
758 | u64 fp,rip; | ||
759 | int count = 0; | ||
760 | |||
761 | if (!p || p == current || p->state==TASK_RUNNING) | ||
762 | return 0; | ||
763 | stack = (unsigned long)task_stack_page(p); | ||
764 | if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) | ||
765 | return 0; | ||
766 | fp = *(u64 *)(p->thread.rsp); | ||
767 | do { | ||
768 | if (fp < (unsigned long)stack || | ||
769 | fp > (unsigned long)stack+THREAD_SIZE) | ||
770 | return 0; | ||
771 | rip = *(u64 *)(fp+8); | ||
772 | if (!in_sched_functions(rip)) | ||
773 | return rip; | ||
774 | fp = *(u64 *)fp; | ||
775 | } while (count++ < 16); | ||
776 | return 0; | ||
777 | } | ||
778 | |||
779 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | ||
780 | { | ||
781 | int ret = 0; | ||
782 | int doit = task == current; | ||
783 | int cpu; | ||
784 | |||
785 | switch (code) { | ||
786 | case ARCH_SET_GS: | ||
787 | if (addr >= TASK_SIZE_OF(task)) | ||
788 | return -EPERM; | ||
789 | cpu = get_cpu(); | ||
790 | /* handle small bases via the GDT because that's faster to | ||
791 | switch. */ | ||
792 | if (addr <= 0xffffffff) { | ||
793 | set_32bit_tls(task, GS_TLS, addr); | ||
794 | if (doit) { | ||
795 | load_TLS(&task->thread, cpu); | ||
796 | load_gs_index(GS_TLS_SEL); | ||
797 | } | ||
798 | task->thread.gsindex = GS_TLS_SEL; | ||
799 | task->thread.gs = 0; | ||
800 | } else { | ||
801 | task->thread.gsindex = 0; | ||
802 | task->thread.gs = addr; | ||
803 | if (doit) { | ||
804 | load_gs_index(0); | ||
805 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); | ||
806 | } | ||
807 | } | ||
808 | put_cpu(); | ||
809 | break; | ||
810 | case ARCH_SET_FS: | ||
811 | /* Not strictly needed for fs, but do it for symmetry | ||
812 | with gs */ | ||
813 | if (addr >= TASK_SIZE_OF(task)) | ||
814 | return -EPERM; | ||
815 | cpu = get_cpu(); | ||
816 | /* handle small bases via the GDT because that's faster to | ||
817 | switch. */ | ||
818 | if (addr <= 0xffffffff) { | ||
819 | set_32bit_tls(task, FS_TLS, addr); | ||
820 | if (doit) { | ||
821 | load_TLS(&task->thread, cpu); | ||
822 | asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); | ||
823 | } | ||
824 | task->thread.fsindex = FS_TLS_SEL; | ||
825 | task->thread.fs = 0; | ||
826 | } else { | ||
827 | task->thread.fsindex = 0; | ||
828 | task->thread.fs = addr; | ||
829 | if (doit) { | ||
830 | /* set the selector to 0 to not confuse | ||
831 | __switch_to */ | ||
832 | asm volatile("movl %0,%%fs" :: "r" (0)); | ||
833 | ret = checking_wrmsrl(MSR_FS_BASE, addr); | ||
834 | } | ||
835 | } | ||
836 | put_cpu(); | ||
837 | break; | ||
838 | case ARCH_GET_FS: { | ||
839 | unsigned long base; | ||
840 | if (task->thread.fsindex == FS_TLS_SEL) | ||
841 | base = read_32bit_tls(task, FS_TLS); | ||
842 | else if (doit) | ||
843 | rdmsrl(MSR_FS_BASE, base); | ||
844 | else | ||
845 | base = task->thread.fs; | ||
846 | ret = put_user(base, (unsigned long __user *)addr); | ||
847 | break; | ||
848 | } | ||
849 | case ARCH_GET_GS: { | ||
850 | unsigned long base; | ||
851 | unsigned gsindex; | ||
852 | if (task->thread.gsindex == GS_TLS_SEL) | ||
853 | base = read_32bit_tls(task, GS_TLS); | ||
854 | else if (doit) { | ||
855 | asm("movl %%gs,%0" : "=r" (gsindex)); | ||
856 | if (gsindex) | ||
857 | rdmsrl(MSR_KERNEL_GS_BASE, base); | ||
858 | else | ||
859 | base = task->thread.gs; | ||
860 | } | ||
861 | else | ||
862 | base = task->thread.gs; | ||
863 | ret = put_user(base, (unsigned long __user *)addr); | ||
864 | break; | ||
865 | } | ||
866 | |||
867 | default: | ||
868 | ret = -EINVAL; | ||
869 | break; | ||
870 | } | ||
871 | |||
872 | return ret; | ||
873 | } | ||
874 | |||
875 | long sys_arch_prctl(int code, unsigned long addr) | ||
876 | { | ||
877 | return do_arch_prctl(current, code, addr); | ||
878 | } | ||
879 | |||
880 | /* | ||
881 | * Capture the user space registers if the task is not running (in user space) | ||
882 | */ | ||
883 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | ||
884 | { | ||
885 | struct pt_regs *pp, ptregs; | ||
886 | |||
887 | pp = task_pt_regs(tsk); | ||
888 | |||
889 | ptregs = *pp; | ||
890 | ptregs.cs &= 0xffff; | ||
891 | ptregs.ss &= 0xffff; | ||
892 | |||
893 | elf_core_copy_regs(regs, &ptregs); | ||
894 | |||
895 | return 1; | ||
896 | } | ||
897 | |||
898 | unsigned long arch_align_stack(unsigned long sp) | ||
899 | { | ||
900 | if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) | ||
901 | sp -= get_random_int() % 8192; | ||
902 | return sp & ~0xf; | ||
903 | } | ||
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c deleted file mode 100644 index eea3702427b4..000000000000 --- a/arch/x86_64/kernel/ptrace.c +++ /dev/null | |||
@@ -1,627 +0,0 @@ | |||
1 | /* ptrace.c */ | ||
2 | /* By Ross Biro 1/23/92 */ | ||
3 | /* | ||
4 | * Pentium III FXSR, SSE support | ||
5 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
6 | * | ||
7 | * x86-64 port 2000-2002 Andi Kleen | ||
8 | */ | ||
9 | |||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/errno.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/user.h> | ||
17 | #include <linux/security.h> | ||
18 | #include <linux/audit.h> | ||
19 | #include <linux/seccomp.h> | ||
20 | #include <linux/signal.h> | ||
21 | |||
22 | #include <asm/uaccess.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/system.h> | ||
25 | #include <asm/processor.h> | ||
26 | #include <asm/i387.h> | ||
27 | #include <asm/debugreg.h> | ||
28 | #include <asm/ldt.h> | ||
29 | #include <asm/desc.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/ia32.h> | ||
32 | |||
33 | /* | ||
34 | * does not yet catch signals sent when the child dies. | ||
35 | * in exit.c or in signal.c. | ||
36 | */ | ||
37 | |||
38 | /* | ||
39 | * Determines which flags the user has access to [1 = access, 0 = no access]. | ||
40 | * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). | ||
41 | * Also masks reserved bits (63-22, 15, 5, 3, 1). | ||
42 | */ | ||
43 | #define FLAG_MASK 0x54dd5UL | ||
44 | |||
45 | /* set's the trap flag. */ | ||
46 | #define TRAP_FLAG 0x100UL | ||
47 | |||
48 | /* | ||
49 | * eflags and offset of eflags on child stack.. | ||
50 | */ | ||
51 | #define EFLAGS offsetof(struct pt_regs, eflags) | ||
52 | #define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) | ||
53 | |||
54 | /* | ||
55 | * this routine will get a word off of the processes privileged stack. | ||
56 | * the offset is how far from the base addr as stored in the TSS. | ||
57 | * this routine assumes that all the privileged stacks are in our | ||
58 | * data space. | ||
59 | */ | ||
60 | static inline unsigned long get_stack_long(struct task_struct *task, int offset) | ||
61 | { | ||
62 | unsigned char *stack; | ||
63 | |||
64 | stack = (unsigned char *)task->thread.rsp0; | ||
65 | stack += offset; | ||
66 | return (*((unsigned long *)stack)); | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * this routine will put a word on the processes privileged stack. | ||
71 | * the offset is how far from the base addr as stored in the TSS. | ||
72 | * this routine assumes that all the privileged stacks are in our | ||
73 | * data space. | ||
74 | */ | ||
75 | static inline long put_stack_long(struct task_struct *task, int offset, | ||
76 | unsigned long data) | ||
77 | { | ||
78 | unsigned char * stack; | ||
79 | |||
80 | stack = (unsigned char *) task->thread.rsp0; | ||
81 | stack += offset; | ||
82 | *(unsigned long *) stack = data; | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | #define LDT_SEGMENT 4 | ||
87 | |||
88 | unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) | ||
89 | { | ||
90 | unsigned long addr, seg; | ||
91 | |||
92 | addr = regs->rip; | ||
93 | seg = regs->cs & 0xffff; | ||
94 | |||
95 | /* | ||
96 | * We'll assume that the code segments in the GDT | ||
97 | * are all zero-based. That is largely true: the | ||
98 | * TLS segments are used for data, and the PNPBIOS | ||
99 | * and APM bios ones we just ignore here. | ||
100 | */ | ||
101 | if (seg & LDT_SEGMENT) { | ||
102 | u32 *desc; | ||
103 | unsigned long base; | ||
104 | |||
105 | seg &= ~7UL; | ||
106 | |||
107 | down(&child->mm->context.sem); | ||
108 | if (unlikely((seg >> 3) >= child->mm->context.size)) | ||
109 | addr = -1L; /* bogus selector, access would fault */ | ||
110 | else { | ||
111 | desc = child->mm->context.ldt + seg; | ||
112 | base = ((desc[0] >> 16) | | ||
113 | ((desc[1] & 0xff) << 16) | | ||
114 | (desc[1] & 0xff000000)); | ||
115 | |||
116 | /* 16-bit code segment? */ | ||
117 | if (!((desc[1] >> 22) & 1)) | ||
118 | addr &= 0xffff; | ||
119 | addr += base; | ||
120 | } | ||
121 | up(&child->mm->context.sem); | ||
122 | } | ||
123 | |||
124 | return addr; | ||
125 | } | ||
126 | |||
127 | static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) | ||
128 | { | ||
129 | int i, copied; | ||
130 | unsigned char opcode[15]; | ||
131 | unsigned long addr = convert_rip_to_linear(child, regs); | ||
132 | |||
133 | copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); | ||
134 | for (i = 0; i < copied; i++) { | ||
135 | switch (opcode[i]) { | ||
136 | /* popf and iret */ | ||
137 | case 0x9d: case 0xcf: | ||
138 | return 1; | ||
139 | |||
140 | /* CHECKME: 64 65 */ | ||
141 | |||
142 | /* opcode and address size prefixes */ | ||
143 | case 0x66: case 0x67: | ||
144 | continue; | ||
145 | /* irrelevant prefixes (segment overrides and repeats) */ | ||
146 | case 0x26: case 0x2e: | ||
147 | case 0x36: case 0x3e: | ||
148 | case 0x64: case 0x65: | ||
149 | case 0xf2: case 0xf3: | ||
150 | continue; | ||
151 | |||
152 | case 0x40 ... 0x4f: | ||
153 | if (regs->cs != __USER_CS) | ||
154 | /* 32-bit mode: register increment */ | ||
155 | return 0; | ||
156 | /* 64-bit mode: REX prefix */ | ||
157 | continue; | ||
158 | |||
159 | /* CHECKME: f2, f3 */ | ||
160 | |||
161 | /* | ||
162 | * pushf: NOTE! We should probably not let | ||
163 | * the user see the TF bit being set. But | ||
164 | * it's more pain than it's worth to avoid | ||
165 | * it, and a debugger could emulate this | ||
166 | * all in user space if it _really_ cares. | ||
167 | */ | ||
168 | case 0x9c: | ||
169 | default: | ||
170 | return 0; | ||
171 | } | ||
172 | } | ||
173 | return 0; | ||
174 | } | ||
175 | |||
176 | static void set_singlestep(struct task_struct *child) | ||
177 | { | ||
178 | struct pt_regs *regs = task_pt_regs(child); | ||
179 | |||
180 | /* | ||
181 | * Always set TIF_SINGLESTEP - this guarantees that | ||
182 | * we single-step system calls etc.. This will also | ||
183 | * cause us to set TF when returning to user mode. | ||
184 | */ | ||
185 | set_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
186 | |||
187 | /* | ||
188 | * If TF was already set, don't do anything else | ||
189 | */ | ||
190 | if (regs->eflags & TRAP_FLAG) | ||
191 | return; | ||
192 | |||
193 | /* Set TF on the kernel stack.. */ | ||
194 | regs->eflags |= TRAP_FLAG; | ||
195 | |||
196 | /* | ||
197 | * ..but if TF is changed by the instruction we will trace, | ||
198 | * don't mark it as being "us" that set it, so that we | ||
199 | * won't clear it by hand later. | ||
200 | */ | ||
201 | if (is_setting_trap_flag(child, regs)) | ||
202 | return; | ||
203 | |||
204 | child->ptrace |= PT_DTRACE; | ||
205 | } | ||
206 | |||
207 | static void clear_singlestep(struct task_struct *child) | ||
208 | { | ||
209 | /* Always clear TIF_SINGLESTEP... */ | ||
210 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
211 | |||
212 | /* But touch TF only if it was set by us.. */ | ||
213 | if (child->ptrace & PT_DTRACE) { | ||
214 | struct pt_regs *regs = task_pt_regs(child); | ||
215 | regs->eflags &= ~TRAP_FLAG; | ||
216 | child->ptrace &= ~PT_DTRACE; | ||
217 | } | ||
218 | } | ||
219 | |||
220 | /* | ||
221 | * Called by kernel/ptrace.c when detaching.. | ||
222 | * | ||
223 | * Make sure the single step bit is not set. | ||
224 | */ | ||
225 | void ptrace_disable(struct task_struct *child) | ||
226 | { | ||
227 | clear_singlestep(child); | ||
228 | } | ||
229 | |||
230 | static int putreg(struct task_struct *child, | ||
231 | unsigned long regno, unsigned long value) | ||
232 | { | ||
233 | unsigned long tmp; | ||
234 | |||
235 | switch (regno) { | ||
236 | case offsetof(struct user_regs_struct,fs): | ||
237 | if (value && (value & 3) != 3) | ||
238 | return -EIO; | ||
239 | child->thread.fsindex = value & 0xffff; | ||
240 | return 0; | ||
241 | case offsetof(struct user_regs_struct,gs): | ||
242 | if (value && (value & 3) != 3) | ||
243 | return -EIO; | ||
244 | child->thread.gsindex = value & 0xffff; | ||
245 | return 0; | ||
246 | case offsetof(struct user_regs_struct,ds): | ||
247 | if (value && (value & 3) != 3) | ||
248 | return -EIO; | ||
249 | child->thread.ds = value & 0xffff; | ||
250 | return 0; | ||
251 | case offsetof(struct user_regs_struct,es): | ||
252 | if (value && (value & 3) != 3) | ||
253 | return -EIO; | ||
254 | child->thread.es = value & 0xffff; | ||
255 | return 0; | ||
256 | case offsetof(struct user_regs_struct,ss): | ||
257 | if ((value & 3) != 3) | ||
258 | return -EIO; | ||
259 | value &= 0xffff; | ||
260 | return 0; | ||
261 | case offsetof(struct user_regs_struct,fs_base): | ||
262 | if (value >= TASK_SIZE_OF(child)) | ||
263 | return -EIO; | ||
264 | child->thread.fs = value; | ||
265 | return 0; | ||
266 | case offsetof(struct user_regs_struct,gs_base): | ||
267 | if (value >= TASK_SIZE_OF(child)) | ||
268 | return -EIO; | ||
269 | child->thread.gs = value; | ||
270 | return 0; | ||
271 | case offsetof(struct user_regs_struct, eflags): | ||
272 | value &= FLAG_MASK; | ||
273 | tmp = get_stack_long(child, EFL_OFFSET); | ||
274 | tmp &= ~FLAG_MASK; | ||
275 | value |= tmp; | ||
276 | break; | ||
277 | case offsetof(struct user_regs_struct,cs): | ||
278 | if ((value & 3) != 3) | ||
279 | return -EIO; | ||
280 | value &= 0xffff; | ||
281 | break; | ||
282 | } | ||
283 | put_stack_long(child, regno - sizeof(struct pt_regs), value); | ||
284 | return 0; | ||
285 | } | ||
286 | |||
287 | static unsigned long getreg(struct task_struct *child, unsigned long regno) | ||
288 | { | ||
289 | unsigned long val; | ||
290 | switch (regno) { | ||
291 | case offsetof(struct user_regs_struct, fs): | ||
292 | return child->thread.fsindex; | ||
293 | case offsetof(struct user_regs_struct, gs): | ||
294 | return child->thread.gsindex; | ||
295 | case offsetof(struct user_regs_struct, ds): | ||
296 | return child->thread.ds; | ||
297 | case offsetof(struct user_regs_struct, es): | ||
298 | return child->thread.es; | ||
299 | case offsetof(struct user_regs_struct, fs_base): | ||
300 | return child->thread.fs; | ||
301 | case offsetof(struct user_regs_struct, gs_base): | ||
302 | return child->thread.gs; | ||
303 | default: | ||
304 | regno = regno - sizeof(struct pt_regs); | ||
305 | val = get_stack_long(child, regno); | ||
306 | if (test_tsk_thread_flag(child, TIF_IA32)) | ||
307 | val &= 0xffffffff; | ||
308 | return val; | ||
309 | } | ||
310 | |||
311 | } | ||
312 | |||
313 | long arch_ptrace(struct task_struct *child, long request, long addr, long data) | ||
314 | { | ||
315 | long i, ret; | ||
316 | unsigned ui; | ||
317 | |||
318 | switch (request) { | ||
319 | /* when I and D space are separate, these will need to be fixed. */ | ||
320 | case PTRACE_PEEKTEXT: /* read word at location addr. */ | ||
321 | case PTRACE_PEEKDATA: | ||
322 | ret = generic_ptrace_peekdata(child, addr, data); | ||
323 | break; | ||
324 | |||
325 | /* read the word at location addr in the USER area. */ | ||
326 | case PTRACE_PEEKUSR: { | ||
327 | unsigned long tmp; | ||
328 | |||
329 | ret = -EIO; | ||
330 | if ((addr & 7) || | ||
331 | addr > sizeof(struct user) - 7) | ||
332 | break; | ||
333 | |||
334 | switch (addr) { | ||
335 | case 0 ... sizeof(struct user_regs_struct) - sizeof(long): | ||
336 | tmp = getreg(child, addr); | ||
337 | break; | ||
338 | case offsetof(struct user, u_debugreg[0]): | ||
339 | tmp = child->thread.debugreg0; | ||
340 | break; | ||
341 | case offsetof(struct user, u_debugreg[1]): | ||
342 | tmp = child->thread.debugreg1; | ||
343 | break; | ||
344 | case offsetof(struct user, u_debugreg[2]): | ||
345 | tmp = child->thread.debugreg2; | ||
346 | break; | ||
347 | case offsetof(struct user, u_debugreg[3]): | ||
348 | tmp = child->thread.debugreg3; | ||
349 | break; | ||
350 | case offsetof(struct user, u_debugreg[6]): | ||
351 | tmp = child->thread.debugreg6; | ||
352 | break; | ||
353 | case offsetof(struct user, u_debugreg[7]): | ||
354 | tmp = child->thread.debugreg7; | ||
355 | break; | ||
356 | default: | ||
357 | tmp = 0; | ||
358 | break; | ||
359 | } | ||
360 | ret = put_user(tmp,(unsigned long __user *) data); | ||
361 | break; | ||
362 | } | ||
363 | |||
364 | /* when I and D space are separate, this will have to be fixed. */ | ||
365 | case PTRACE_POKETEXT: /* write the word at location addr. */ | ||
366 | case PTRACE_POKEDATA: | ||
367 | ret = generic_ptrace_pokedata(child, addr, data); | ||
368 | break; | ||
369 | |||
370 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ | ||
371 | { | ||
372 | int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; | ||
373 | ret = -EIO; | ||
374 | if ((addr & 7) || | ||
375 | addr > sizeof(struct user) - 7) | ||
376 | break; | ||
377 | |||
378 | switch (addr) { | ||
379 | case 0 ... sizeof(struct user_regs_struct) - sizeof(long): | ||
380 | ret = putreg(child, addr, data); | ||
381 | break; | ||
382 | /* Disallows to set a breakpoint into the vsyscall */ | ||
383 | case offsetof(struct user, u_debugreg[0]): | ||
384 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
385 | child->thread.debugreg0 = data; | ||
386 | ret = 0; | ||
387 | break; | ||
388 | case offsetof(struct user, u_debugreg[1]): | ||
389 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
390 | child->thread.debugreg1 = data; | ||
391 | ret = 0; | ||
392 | break; | ||
393 | case offsetof(struct user, u_debugreg[2]): | ||
394 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
395 | child->thread.debugreg2 = data; | ||
396 | ret = 0; | ||
397 | break; | ||
398 | case offsetof(struct user, u_debugreg[3]): | ||
399 | if (data >= TASK_SIZE_OF(child) - dsize) break; | ||
400 | child->thread.debugreg3 = data; | ||
401 | ret = 0; | ||
402 | break; | ||
403 | case offsetof(struct user, u_debugreg[6]): | ||
404 | if (data >> 32) | ||
405 | break; | ||
406 | child->thread.debugreg6 = data; | ||
407 | ret = 0; | ||
408 | break; | ||
409 | case offsetof(struct user, u_debugreg[7]): | ||
410 | /* See arch/i386/kernel/ptrace.c for an explanation of | ||
411 | * this awkward check.*/ | ||
412 | data &= ~DR_CONTROL_RESERVED; | ||
413 | for(i=0; i<4; i++) | ||
414 | if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) | ||
415 | break; | ||
416 | if (i == 4) { | ||
417 | child->thread.debugreg7 = data; | ||
418 | if (data) | ||
419 | set_tsk_thread_flag(child, TIF_DEBUG); | ||
420 | else | ||
421 | clear_tsk_thread_flag(child, TIF_DEBUG); | ||
422 | ret = 0; | ||
423 | } | ||
424 | break; | ||
425 | } | ||
426 | break; | ||
427 | } | ||
428 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ | ||
429 | case PTRACE_CONT: /* restart after signal. */ | ||
430 | |||
431 | ret = -EIO; | ||
432 | if (!valid_signal(data)) | ||
433 | break; | ||
434 | if (request == PTRACE_SYSCALL) | ||
435 | set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
436 | else | ||
437 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
438 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
439 | child->exit_code = data; | ||
440 | /* make sure the single step bit is not set. */ | ||
441 | clear_singlestep(child); | ||
442 | wake_up_process(child); | ||
443 | ret = 0; | ||
444 | break; | ||
445 | |||
446 | #ifdef CONFIG_IA32_EMULATION | ||
447 | /* This makes only sense with 32bit programs. Allow a | ||
448 | 64bit debugger to fully examine them too. Better | ||
449 | don't use it against 64bit processes, use | ||
450 | PTRACE_ARCH_PRCTL instead. */ | ||
451 | case PTRACE_SET_THREAD_AREA: { | ||
452 | struct user_desc __user *p; | ||
453 | int old; | ||
454 | p = (struct user_desc __user *)data; | ||
455 | get_user(old, &p->entry_number); | ||
456 | put_user(addr, &p->entry_number); | ||
457 | ret = do_set_thread_area(&child->thread, p); | ||
458 | put_user(old, &p->entry_number); | ||
459 | break; | ||
460 | case PTRACE_GET_THREAD_AREA: | ||
461 | p = (struct user_desc __user *)data; | ||
462 | get_user(old, &p->entry_number); | ||
463 | put_user(addr, &p->entry_number); | ||
464 | ret = do_get_thread_area(&child->thread, p); | ||
465 | put_user(old, &p->entry_number); | ||
466 | break; | ||
467 | } | ||
468 | #endif | ||
469 | /* normal 64bit interface to access TLS data. | ||
470 | Works just like arch_prctl, except that the arguments | ||
471 | are reversed. */ | ||
472 | case PTRACE_ARCH_PRCTL: | ||
473 | ret = do_arch_prctl(child, data, addr); | ||
474 | break; | ||
475 | |||
476 | /* | ||
477 | * make the child exit. Best I can do is send it a sigkill. | ||
478 | * perhaps it should be put in the status that it wants to | ||
479 | * exit. | ||
480 | */ | ||
481 | case PTRACE_KILL: | ||
482 | ret = 0; | ||
483 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ | ||
484 | break; | ||
485 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
486 | child->exit_code = SIGKILL; | ||
487 | /* make sure the single step bit is not set. */ | ||
488 | clear_singlestep(child); | ||
489 | wake_up_process(child); | ||
490 | break; | ||
491 | |||
492 | case PTRACE_SINGLESTEP: /* set the trap flag. */ | ||
493 | ret = -EIO; | ||
494 | if (!valid_signal(data)) | ||
495 | break; | ||
496 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
497 | set_singlestep(child); | ||
498 | child->exit_code = data; | ||
499 | /* give it a chance to run. */ | ||
500 | wake_up_process(child); | ||
501 | ret = 0; | ||
502 | break; | ||
503 | |||
504 | case PTRACE_DETACH: | ||
505 | /* detach a process that was attached. */ | ||
506 | ret = ptrace_detach(child, data); | ||
507 | break; | ||
508 | |||
509 | case PTRACE_GETREGS: { /* Get all gp regs from the child. */ | ||
510 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
511 | sizeof(struct user_regs_struct))) { | ||
512 | ret = -EIO; | ||
513 | break; | ||
514 | } | ||
515 | ret = 0; | ||
516 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
517 | ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); | ||
518 | data += sizeof(long); | ||
519 | } | ||
520 | break; | ||
521 | } | ||
522 | |||
523 | case PTRACE_SETREGS: { /* Set all gp regs in the child. */ | ||
524 | unsigned long tmp; | ||
525 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
526 | sizeof(struct user_regs_struct))) { | ||
527 | ret = -EIO; | ||
528 | break; | ||
529 | } | ||
530 | ret = 0; | ||
531 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
532 | ret = __get_user(tmp, (unsigned long __user *) data); | ||
533 | if (ret) | ||
534 | break; | ||
535 | ret = putreg(child, ui, tmp); | ||
536 | if (ret) | ||
537 | break; | ||
538 | data += sizeof(long); | ||
539 | } | ||
540 | break; | ||
541 | } | ||
542 | |||
543 | case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ | ||
544 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
545 | sizeof(struct user_i387_struct))) { | ||
546 | ret = -EIO; | ||
547 | break; | ||
548 | } | ||
549 | ret = get_fpregs((struct user_i387_struct __user *)data, child); | ||
550 | break; | ||
551 | } | ||
552 | |||
553 | case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ | ||
554 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
555 | sizeof(struct user_i387_struct))) { | ||
556 | ret = -EIO; | ||
557 | break; | ||
558 | } | ||
559 | set_stopped_child_used_math(child); | ||
560 | ret = set_fpregs(child, (struct user_i387_struct __user *)data); | ||
561 | break; | ||
562 | } | ||
563 | |||
564 | default: | ||
565 | ret = ptrace_request(child, request, addr, data); | ||
566 | break; | ||
567 | } | ||
568 | return ret; | ||
569 | } | ||
570 | |||
571 | static void syscall_trace(struct pt_regs *regs) | ||
572 | { | ||
573 | |||
574 | #if 0 | ||
575 | printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", | ||
576 | current->comm, | ||
577 | regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), | ||
578 | current_thread_info()->flags, current->ptrace); | ||
579 | #endif | ||
580 | |||
581 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) | ||
582 | ? 0x80 : 0)); | ||
583 | /* | ||
584 | * this isn't the same as continuing with a signal, but it will do | ||
585 | * for normal use. strace only continues with a signal if the | ||
586 | * stopping signal is not SIGTRAP. -brl | ||
587 | */ | ||
588 | if (current->exit_code) { | ||
589 | send_sig(current->exit_code, current, 1); | ||
590 | current->exit_code = 0; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | asmlinkage void syscall_trace_enter(struct pt_regs *regs) | ||
595 | { | ||
596 | /* do the secure computing check first */ | ||
597 | secure_computing(regs->orig_rax); | ||
598 | |||
599 | if (test_thread_flag(TIF_SYSCALL_TRACE) | ||
600 | && (current->ptrace & PT_PTRACED)) | ||
601 | syscall_trace(regs); | ||
602 | |||
603 | if (unlikely(current->audit_context)) { | ||
604 | if (test_thread_flag(TIF_IA32)) { | ||
605 | audit_syscall_entry(AUDIT_ARCH_I386, | ||
606 | regs->orig_rax, | ||
607 | regs->rbx, regs->rcx, | ||
608 | regs->rdx, regs->rsi); | ||
609 | } else { | ||
610 | audit_syscall_entry(AUDIT_ARCH_X86_64, | ||
611 | regs->orig_rax, | ||
612 | regs->rdi, regs->rsi, | ||
613 | regs->rdx, regs->r10); | ||
614 | } | ||
615 | } | ||
616 | } | ||
617 | |||
618 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) | ||
619 | { | ||
620 | if (unlikely(current->audit_context)) | ||
621 | audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); | ||
622 | |||
623 | if ((test_thread_flag(TIF_SYSCALL_TRACE) | ||
624 | || test_thread_flag(TIF_SINGLESTEP)) | ||
625 | && (current->ptrace & PT_PTRACED)) | ||
626 | syscall_trace(regs); | ||
627 | } | ||
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c deleted file mode 100644 index 368db2b9c5ac..000000000000 --- a/arch/x86_64/kernel/reboot.c +++ /dev/null | |||
@@ -1,171 +0,0 @@ | |||
1 | /* Various gunk just to reboot the machine. */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/reboot.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/ctype.h> | ||
8 | #include <linux/string.h> | ||
9 | #include <linux/pm.h> | ||
10 | #include <linux/kdebug.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <asm/io.h> | ||
13 | #include <asm/delay.h> | ||
14 | #include <asm/hw_irq.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/apic.h> | ||
19 | #include <asm/iommu.h> | ||
20 | |||
21 | /* | ||
22 | * Power off function, if any | ||
23 | */ | ||
24 | void (*pm_power_off)(void); | ||
25 | EXPORT_SYMBOL(pm_power_off); | ||
26 | |||
27 | static long no_idt[3]; | ||
28 | static enum { | ||
29 | BOOT_TRIPLE = 't', | ||
30 | BOOT_KBD = 'k' | ||
31 | } reboot_type = BOOT_KBD; | ||
32 | static int reboot_mode = 0; | ||
33 | int reboot_force; | ||
34 | |||
35 | /* reboot=t[riple] | k[bd] [, [w]arm | [c]old] | ||
36 | warm Don't set the cold reboot flag | ||
37 | cold Set the cold reboot flag | ||
38 | triple Force a triple fault (init) | ||
39 | kbd Use the keyboard controller. cold reset (default) | ||
40 | force Avoid anything that could hang. | ||
41 | */ | ||
42 | static int __init reboot_setup(char *str) | ||
43 | { | ||
44 | for (;;) { | ||
45 | switch (*str) { | ||
46 | case 'w': | ||
47 | reboot_mode = 0x1234; | ||
48 | break; | ||
49 | |||
50 | case 'c': | ||
51 | reboot_mode = 0; | ||
52 | break; | ||
53 | |||
54 | case 't': | ||
55 | case 'b': | ||
56 | case 'k': | ||
57 | reboot_type = *str; | ||
58 | break; | ||
59 | case 'f': | ||
60 | reboot_force = 1; | ||
61 | break; | ||
62 | } | ||
63 | if((str = strchr(str,',')) != NULL) | ||
64 | str++; | ||
65 | else | ||
66 | break; | ||
67 | } | ||
68 | return 1; | ||
69 | } | ||
70 | |||
71 | __setup("reboot=", reboot_setup); | ||
72 | |||
73 | static inline void kb_wait(void) | ||
74 | { | ||
75 | int i; | ||
76 | |||
77 | for (i=0; i<0x10000; i++) | ||
78 | if ((inb_p(0x64) & 0x02) == 0) | ||
79 | break; | ||
80 | } | ||
81 | |||
82 | void machine_shutdown(void) | ||
83 | { | ||
84 | unsigned long flags; | ||
85 | |||
86 | /* Stop the cpus and apics */ | ||
87 | #ifdef CONFIG_SMP | ||
88 | int reboot_cpu_id; | ||
89 | |||
90 | /* The boot cpu is always logical cpu 0 */ | ||
91 | reboot_cpu_id = 0; | ||
92 | |||
93 | /* Make certain the cpu I'm about to reboot on is online */ | ||
94 | if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { | ||
95 | reboot_cpu_id = smp_processor_id(); | ||
96 | } | ||
97 | |||
98 | /* Make certain I only run on the appropriate processor */ | ||
99 | set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); | ||
100 | |||
101 | /* O.K Now that I'm on the appropriate processor, | ||
102 | * stop all of the others. | ||
103 | */ | ||
104 | smp_send_stop(); | ||
105 | #endif | ||
106 | |||
107 | local_irq_save(flags); | ||
108 | |||
109 | #ifndef CONFIG_SMP | ||
110 | disable_local_APIC(); | ||
111 | #endif | ||
112 | |||
113 | disable_IO_APIC(); | ||
114 | |||
115 | local_irq_restore(flags); | ||
116 | |||
117 | pci_iommu_shutdown(); | ||
118 | } | ||
119 | |||
120 | void machine_emergency_restart(void) | ||
121 | { | ||
122 | int i; | ||
123 | |||
124 | /* Tell the BIOS if we want cold or warm reboot */ | ||
125 | *((unsigned short *)__va(0x472)) = reboot_mode; | ||
126 | |||
127 | for (;;) { | ||
128 | /* Could also try the reset bit in the Hammer NB */ | ||
129 | switch (reboot_type) { | ||
130 | case BOOT_KBD: | ||
131 | for (i=0; i<10; i++) { | ||
132 | kb_wait(); | ||
133 | udelay(50); | ||
134 | outb(0xfe,0x64); /* pulse reset low */ | ||
135 | udelay(50); | ||
136 | } | ||
137 | |||
138 | case BOOT_TRIPLE: | ||
139 | __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); | ||
140 | __asm__ __volatile__("int3"); | ||
141 | |||
142 | reboot_type = BOOT_KBD; | ||
143 | break; | ||
144 | } | ||
145 | } | ||
146 | } | ||
147 | |||
148 | void machine_restart(char * __unused) | ||
149 | { | ||
150 | printk("machine restart\n"); | ||
151 | |||
152 | if (!reboot_force) { | ||
153 | machine_shutdown(); | ||
154 | } | ||
155 | machine_emergency_restart(); | ||
156 | } | ||
157 | |||
158 | void machine_halt(void) | ||
159 | { | ||
160 | } | ||
161 | |||
162 | void machine_power_off(void) | ||
163 | { | ||
164 | if (pm_power_off) { | ||
165 | if (!reboot_force) { | ||
166 | machine_shutdown(); | ||
167 | } | ||
168 | pm_power_off(); | ||
169 | } | ||
170 | } | ||
171 | |||
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S deleted file mode 100644 index 14e95872c6a3..000000000000 --- a/arch/x86_64/kernel/relocate_kernel.S +++ /dev/null | |||
@@ -1,276 +0,0 @@ | |||
1 | /* | ||
2 | * relocate_kernel.S - put the kernel image in place to boot | ||
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/linkage.h> | ||
10 | #include <asm/page.h> | ||
11 | #include <asm/kexec.h> | ||
12 | |||
13 | /* | ||
14 | * Must be relocatable PIC code callable as a C function | ||
15 | */ | ||
16 | |||
17 | #define PTR(x) (x << 3) | ||
18 | #define PAGE_ALIGNED (1 << PAGE_SHIFT) | ||
19 | #define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ | ||
20 | |||
21 | .text | ||
22 | .align PAGE_ALIGNED | ||
23 | .code64 | ||
24 | .globl relocate_kernel | ||
25 | relocate_kernel: | ||
26 | /* %rdi indirection_page | ||
27 | * %rsi page_list | ||
28 | * %rdx start address | ||
29 | */ | ||
30 | |||
31 | /* map the control page at its virtual address */ | ||
32 | |||
33 | movq $0x0000ff8000000000, %r10 /* mask */ | ||
34 | mov $(39 - 3), %cl /* bits to shift */ | ||
35 | movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ | ||
36 | |||
37 | movq %r11, %r9 | ||
38 | andq %r10, %r9 | ||
39 | shrq %cl, %r9 | ||
40 | |||
41 | movq PTR(VA_PGD)(%rsi), %r8 | ||
42 | addq %r8, %r9 | ||
43 | movq PTR(PA_PUD_0)(%rsi), %r8 | ||
44 | orq $PAGE_ATTR, %r8 | ||
45 | movq %r8, (%r9) | ||
46 | |||
47 | shrq $9, %r10 | ||
48 | sub $9, %cl | ||
49 | |||
50 | movq %r11, %r9 | ||
51 | andq %r10, %r9 | ||
52 | shrq %cl, %r9 | ||
53 | |||
54 | movq PTR(VA_PUD_0)(%rsi), %r8 | ||
55 | addq %r8, %r9 | ||
56 | movq PTR(PA_PMD_0)(%rsi), %r8 | ||
57 | orq $PAGE_ATTR, %r8 | ||
58 | movq %r8, (%r9) | ||
59 | |||
60 | shrq $9, %r10 | ||
61 | sub $9, %cl | ||
62 | |||
63 | movq %r11, %r9 | ||
64 | andq %r10, %r9 | ||
65 | shrq %cl, %r9 | ||
66 | |||
67 | movq PTR(VA_PMD_0)(%rsi), %r8 | ||
68 | addq %r8, %r9 | ||
69 | movq PTR(PA_PTE_0)(%rsi), %r8 | ||
70 | orq $PAGE_ATTR, %r8 | ||
71 | movq %r8, (%r9) | ||
72 | |||
73 | shrq $9, %r10 | ||
74 | sub $9, %cl | ||
75 | |||
76 | movq %r11, %r9 | ||
77 | andq %r10, %r9 | ||
78 | shrq %cl, %r9 | ||
79 | |||
80 | movq PTR(VA_PTE_0)(%rsi), %r8 | ||
81 | addq %r8, %r9 | ||
82 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | ||
83 | orq $PAGE_ATTR, %r8 | ||
84 | movq %r8, (%r9) | ||
85 | |||
86 | /* identity map the control page at its physical address */ | ||
87 | |||
88 | movq $0x0000ff8000000000, %r10 /* mask */ | ||
89 | mov $(39 - 3), %cl /* bits to shift */ | ||
90 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ | ||
91 | |||
92 | movq %r11, %r9 | ||
93 | andq %r10, %r9 | ||
94 | shrq %cl, %r9 | ||
95 | |||
96 | movq PTR(VA_PGD)(%rsi), %r8 | ||
97 | addq %r8, %r9 | ||
98 | movq PTR(PA_PUD_1)(%rsi), %r8 | ||
99 | orq $PAGE_ATTR, %r8 | ||
100 | movq %r8, (%r9) | ||
101 | |||
102 | shrq $9, %r10 | ||
103 | sub $9, %cl | ||
104 | |||
105 | movq %r11, %r9 | ||
106 | andq %r10, %r9 | ||
107 | shrq %cl, %r9 | ||
108 | |||
109 | movq PTR(VA_PUD_1)(%rsi), %r8 | ||
110 | addq %r8, %r9 | ||
111 | movq PTR(PA_PMD_1)(%rsi), %r8 | ||
112 | orq $PAGE_ATTR, %r8 | ||
113 | movq %r8, (%r9) | ||
114 | |||
115 | shrq $9, %r10 | ||
116 | sub $9, %cl | ||
117 | |||
118 | movq %r11, %r9 | ||
119 | andq %r10, %r9 | ||
120 | shrq %cl, %r9 | ||
121 | |||
122 | movq PTR(VA_PMD_1)(%rsi), %r8 | ||
123 | addq %r8, %r9 | ||
124 | movq PTR(PA_PTE_1)(%rsi), %r8 | ||
125 | orq $PAGE_ATTR, %r8 | ||
126 | movq %r8, (%r9) | ||
127 | |||
128 | shrq $9, %r10 | ||
129 | sub $9, %cl | ||
130 | |||
131 | movq %r11, %r9 | ||
132 | andq %r10, %r9 | ||
133 | shrq %cl, %r9 | ||
134 | |||
135 | movq PTR(VA_PTE_1)(%rsi), %r8 | ||
136 | addq %r8, %r9 | ||
137 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | ||
138 | orq $PAGE_ATTR, %r8 | ||
139 | movq %r8, (%r9) | ||
140 | |||
141 | relocate_new_kernel: | ||
142 | /* %rdi indirection_page | ||
143 | * %rsi page_list | ||
144 | * %rdx start address | ||
145 | */ | ||
146 | |||
147 | /* zero out flags, and disable interrupts */ | ||
148 | pushq $0 | ||
149 | popfq | ||
150 | |||
151 | /* get physical address of control page now */ | ||
152 | /* this is impossible after page table switch */ | ||
153 | movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 | ||
154 | |||
155 | /* get physical address of page table now too */ | ||
156 | movq PTR(PA_TABLE_PAGE)(%rsi), %rcx | ||
157 | |||
158 | /* switch to new set of page tables */ | ||
159 | movq PTR(PA_PGD)(%rsi), %r9 | ||
160 | movq %r9, %cr3 | ||
161 | |||
162 | /* setup a new stack at the end of the physical control page */ | ||
163 | lea 4096(%r8), %rsp | ||
164 | |||
165 | /* jump to identity mapped page */ | ||
166 | addq $(identity_mapped - relocate_kernel), %r8 | ||
167 | pushq %r8 | ||
168 | ret | ||
169 | |||
170 | identity_mapped: | ||
171 | /* store the start address on the stack */ | ||
172 | pushq %rdx | ||
173 | |||
174 | /* Set cr0 to a known state: | ||
175 | * 31 1 == Paging enabled | ||
176 | * 18 0 == Alignment check disabled | ||
177 | * 16 0 == Write protect disabled | ||
178 | * 3 0 == No task switch | ||
179 | * 2 0 == Don't do FP software emulation. | ||
180 | * 0 1 == Proctected mode enabled | ||
181 | */ | ||
182 | movq %cr0, %rax | ||
183 | andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax | ||
184 | orl $((1<<31)|(1<<0)), %eax | ||
185 | movq %rax, %cr0 | ||
186 | |||
187 | /* Set cr4 to a known state: | ||
188 | * 10 0 == xmm exceptions disabled | ||
189 | * 9 0 == xmm registers instructions disabled | ||
190 | * 8 0 == performance monitoring counter disabled | ||
191 | * 7 0 == page global disabled | ||
192 | * 6 0 == machine check exceptions disabled | ||
193 | * 5 1 == physical address extension enabled | ||
194 | * 4 0 == page size extensions disabled | ||
195 | * 3 0 == Debug extensions disabled | ||
196 | * 2 0 == Time stamp disable (disabled) | ||
197 | * 1 0 == Protected mode virtual interrupts disabled | ||
198 | * 0 0 == VME disabled | ||
199 | */ | ||
200 | |||
201 | movq $((1<<5)), %rax | ||
202 | movq %rax, %cr4 | ||
203 | |||
204 | jmp 1f | ||
205 | 1: | ||
206 | |||
207 | /* Switch to the identity mapped page tables, | ||
208 | * and flush the TLB. | ||
209 | */ | ||
210 | movq %rcx, %cr3 | ||
211 | |||
212 | /* Do the copies */ | ||
213 | movq %rdi, %rcx /* Put the page_list in %rcx */ | ||
214 | xorq %rdi, %rdi | ||
215 | xorq %rsi, %rsi | ||
216 | jmp 1f | ||
217 | |||
218 | 0: /* top, read another word for the indirection page */ | ||
219 | |||
220 | movq (%rbx), %rcx | ||
221 | addq $8, %rbx | ||
222 | 1: | ||
223 | testq $0x1, %rcx /* is it a destination page? */ | ||
224 | jz 2f | ||
225 | movq %rcx, %rdi | ||
226 | andq $0xfffffffffffff000, %rdi | ||
227 | jmp 0b | ||
228 | 2: | ||
229 | testq $0x2, %rcx /* is it an indirection page? */ | ||
230 | jz 2f | ||
231 | movq %rcx, %rbx | ||
232 | andq $0xfffffffffffff000, %rbx | ||
233 | jmp 0b | ||
234 | 2: | ||
235 | testq $0x4, %rcx /* is it the done indicator? */ | ||
236 | jz 2f | ||
237 | jmp 3f | ||
238 | 2: | ||
239 | testq $0x8, %rcx /* is it the source indicator? */ | ||
240 | jz 0b /* Ignore it otherwise */ | ||
241 | movq %rcx, %rsi /* For ever source page do a copy */ | ||
242 | andq $0xfffffffffffff000, %rsi | ||
243 | |||
244 | movq $512, %rcx | ||
245 | rep ; movsq | ||
246 | jmp 0b | ||
247 | 3: | ||
248 | |||
249 | /* To be certain of avoiding problems with self-modifying code | ||
250 | * I need to execute a serializing instruction here. | ||
251 | * So I flush the TLB by reloading %cr3 here, it's handy, | ||
252 | * and not processor dependent. | ||
253 | */ | ||
254 | movq %cr3, %rax | ||
255 | movq %rax, %cr3 | ||
256 | |||
257 | /* set all of the registers to known values */ | ||
258 | /* leave %rsp alone */ | ||
259 | |||
260 | xorq %rax, %rax | ||
261 | xorq %rbx, %rbx | ||
262 | xorq %rcx, %rcx | ||
263 | xorq %rdx, %rdx | ||
264 | xorq %rsi, %rsi | ||
265 | xorq %rdi, %rdi | ||
266 | xorq %rbp, %rbp | ||
267 | xorq %r8, %r8 | ||
268 | xorq %r9, %r9 | ||
269 | xorq %r10, %r9 | ||
270 | xorq %r11, %r11 | ||
271 | xorq %r12, %r12 | ||
272 | xorq %r13, %r13 | ||
273 | xorq %r14, %r14 | ||
274 | xorq %r15, %r15 | ||
275 | |||
276 | ret | ||
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c deleted file mode 100644 index af838f6b0b7f..000000000000 --- a/arch/x86_64/kernel/setup.c +++ /dev/null | |||
@@ -1,1117 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/setup.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Nov 2001 Dave Jones <davej@suse.de> | ||
7 | * Forked from i386 setup code. | ||
8 | */ | ||
9 | |||
10 | /* | ||
11 | * This file handles the architecture-dependent parts of initialization | ||
12 | */ | ||
13 | |||
14 | #include <linux/errno.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/stddef.h> | ||
19 | #include <linux/unistd.h> | ||
20 | #include <linux/ptrace.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/user.h> | ||
23 | #include <linux/a.out.h> | ||
24 | #include <linux/screen_info.h> | ||
25 | #include <linux/ioport.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/initrd.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/bootmem.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <asm/processor.h> | ||
33 | #include <linux/console.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/crash_dump.h> | ||
36 | #include <linux/root_dev.h> | ||
37 | #include <linux/pci.h> | ||
38 | #include <linux/acpi.h> | ||
39 | #include <linux/kallsyms.h> | ||
40 | #include <linux/edd.h> | ||
41 | #include <linux/mmzone.h> | ||
42 | #include <linux/kexec.h> | ||
43 | #include <linux/cpufreq.h> | ||
44 | #include <linux/dmi.h> | ||
45 | #include <linux/dma-mapping.h> | ||
46 | #include <linux/ctype.h> | ||
47 | |||
48 | #include <asm/mtrr.h> | ||
49 | #include <asm/uaccess.h> | ||
50 | #include <asm/system.h> | ||
51 | #include <asm/io.h> | ||
52 | #include <asm/smp.h> | ||
53 | #include <asm/msr.h> | ||
54 | #include <asm/desc.h> | ||
55 | #include <video/edid.h> | ||
56 | #include <asm/e820.h> | ||
57 | #include <asm/dma.h> | ||
58 | #include <asm/mpspec.h> | ||
59 | #include <asm/mmu_context.h> | ||
60 | #include <asm/bootsetup.h> | ||
61 | #include <asm/proto.h> | ||
62 | #include <asm/setup.h> | ||
63 | #include <asm/mach_apic.h> | ||
64 | #include <asm/numa.h> | ||
65 | #include <asm/sections.h> | ||
66 | #include <asm/dmi.h> | ||
67 | |||
68 | /* | ||
69 | * Machine setup.. | ||
70 | */ | ||
71 | |||
/* Feature/identification data of the boot CPU; on SMP it accumulates the
 * common feature set of all CPUs (see identify_cpu()). */
struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);

/* CR4 feature bits enabled at boot; presumably mirrored onto secondary
 * CPUs elsewhere — not visible in this file. */
unsigned long mmu_cr4_features;

/* Boot loader ID as an integer, for the benefit of proc_dointvec */
int bootloader_type;

/* Video mode the boot loader left the display in (from boot params). */
unsigned long saved_video_mode;

/* Command-line override: keep MWAIT enabled even where init_amd()
 * would disable it (family 0x10). */
int force_mwait __cpuinitdata;

/*
 * Early DMI memory
 */
int dmi_alloc_index;
char dmi_alloc_data[DMI_MAX_DATA];

/*
 * Setup options
 */
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
struct sys_desc_table_struct {
	unsigned short length;
	unsigned char table[0];	/* variable-length trailing data */
};

struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);

extern int root_mountflags;

/* Writable copy of the boot command line handed to the caller of
 * setup_arch() via *cmdline_p. */
char __initdata command_line[COMMAND_LINE_SIZE];
106 | |||
/* Legacy PC I/O port ranges, claimed in setup_arch() so drivers cannot
 * accidentally grab them. */
struct resource standard_io_resources[] = {
	{ .name = "dma1", .start = 0x00, .end = 0x1f,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "pic1", .start = 0x20, .end = 0x21,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "timer0", .start = 0x40, .end = 0x43,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "timer1", .start = 0x50, .end = 0x53,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "keyboard", .start = 0x60, .end = 0x6f,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
	{ .name = "fpu", .start = 0xf0, .end = 0xff,
		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
};

#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)

/* Kernel text/data regions; start/end are filled in by setup_arch()
 * from _text/_etext/_edata. */
struct resource data_resource = {
	.name = "Kernel data",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_RAM,
};
struct resource code_resource = {
	.name = "Kernel code",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_RAM,
};
142 | |||
143 | #ifdef CONFIG_PROC_VMCORE | ||
144 | /* elfcorehdr= specifies the location of elf core header | ||
145 | * stored by the crashed kernel. This option will be passed | ||
146 | * by kexec loader to the capture kernel. | ||
147 | */ | ||
148 | static int __init setup_elfcorehdr(char *arg) | ||
149 | { | ||
150 | char *end; | ||
151 | if (!arg) | ||
152 | return -EINVAL; | ||
153 | elfcorehdr_addr = memparse(arg, &end); | ||
154 | return end > arg ? 0 : -EINVAL; | ||
155 | } | ||
156 | early_param("elfcorehdr", setup_elfcorehdr); | ||
157 | #endif | ||
158 | |||
159 | #ifndef CONFIG_NUMA | ||
/*
 * Non-NUMA bootmem setup: place the bootmem bitmap in a free e820 area,
 * register all RAM as active, free it into bootmem, then reserve the
 * bitmap itself.  The call order matters: the bitmap must exist before
 * regions are freed into it, and be reserved last so it is not handed out.
 */
static void __init
contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	/* bitmap size in bytes for all pages up to end_pfn */
	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n",bootmap_size);
	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	/* the bitmap lives in RAM we just freed — take it back */
	reserve_bootmem(bootmap, bootmap_size);
}
174 | #endif | ||
175 | |||
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
EXPORT_SYMBOL(edd);
#endif
/**
 * copy_edd() - Copy the BIOS EDD information
 * from boot_params into a safe place.
 *
 * The boot_params area may be reclaimed later, so the disk signature
 * and EDD info blocks are snapshotted here during setup_arch().
 */
static inline void copy_edd(void)
{
	memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
	memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
	edd.mbr_signature_nr = EDD_MBR_SIG_NR;
	edd.edd_info_nr = EDD_NR;
}
#else
/* EDD support compiled out: keep the call site compiling as a no-op. */
static inline void copy_edd(void)
{
}
#endif
198 | |||
/* BIOS data area slot holding the real-mode segment of the EBDA. */
#define EBDA_ADDR_POINTER 0x40E

unsigned __initdata ebda_addr;	/* physical address of the EBDA */
unsigned __initdata ebda_size;	/* size in bytes, page-rounded, <= 64K */

/*
 * Locate the Extended BIOS Data Area so setup_arch() can reserve it
 * from bootmem before anything else allocates over it.
 */
static void discover_ebda(void)
{
	/*
	 * there is a real-mode segmented pointer pointing to the
	 * 4K EBDA area at 0x40E
	 */
	ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
	ebda_addr <<= 4;	/* segment -> physical address */

	/* NOTE(review): reads 16 bits; the EBDA size field is nominally
	 * the first byte, in KiB — harmless here since the value is
	 * clamped to 64K below, but worth confirming. */
	ebda_size = *(unsigned short *)__va(ebda_addr);

	/* Round EBDA up to pages */
	if (ebda_size == 0)
		ebda_size = 1;
	ebda_size <<= 10;	/* KiB -> bytes */
	ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
	if (ebda_size > 64*1024)
		ebda_size = 64*1024;	/* EBDA never exceeds 64K */
}
223 | |||
/*
 * Architecture-specific boot-time setup.  The sequence below is strictly
 * ordered: boot parameters are copied out first, then the e820 memory map
 * is parsed, bootmem is initialized (NUMA-aware if possible), critical
 * regions are reserved, and finally paging, ACPI/SMP configuration and
 * resource registration are performed.
 *
 * @cmdline_p: out-parameter; receives a pointer to the writable copy of
 *             the kernel command line.
 */
void __init setup_arch(char **cmdline_p)
{
	printk(KERN_INFO "Command line: %s\n", boot_command_line);

	/* snapshot values handed over by the boot loader */
	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
	screen_info = SCREEN_INFO;
	edid_info = EDID_INFO;
	saved_video_mode = SAVED_VIDEO_MODE;
	bootloader_type = LOADER_TYPE;

#ifdef CONFIG_BLK_DEV_RAM
	rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
	rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
	rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
#endif
	setup_memory_region();
	copy_edd();

	if (!MOUNT_ROOT_RDONLY)
		root_mountflags &= ~MS_RDONLY;
	init_mm.start_code = (unsigned long) &_text;
	init_mm.end_code = (unsigned long) &_etext;
	init_mm.end_data = (unsigned long) &_edata;
	init_mm.brk = (unsigned long) &_end;

	code_resource.start = virt_to_phys(&_text);
	code_resource.end = virt_to_phys(&_etext)-1;
	data_resource.start = virt_to_phys(&_etext);
	data_resource.end = virt_to_phys(&_edata)-1;

	early_identify_cpu(&boot_cpu_data);

	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
	*cmdline_p = command_line;

	parse_early_param();

	finish_e820_parsing();

	e820_register_active_regions(0, 0, -1UL);
	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	end_pfn = e820_end_of_ram();
	num_physpages = end_pfn;

	check_efer();

	discover_ebda();

	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));

	dmi_scan_machine();

#ifdef CONFIG_ACPI
	/*
	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
	 * Call this early for SRAT node setup.
	 */
	acpi_boot_table_init();
#endif

	/* How many end-of-memory variables you have, grandma! */
	max_low_pfn = end_pfn;
	max_pfn = end_pfn;
	high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;

	/* Remove active ranges so rediscovery with NUMA-awareness happens */
	remove_all_active_ranges();

#ifdef CONFIG_ACPI_NUMA
	/*
	 * Parse SRAT to discover nodes.
	 */
	acpi_numa_init();
#endif

#ifdef CONFIG_NUMA
	numa_initmem_init(0, end_pfn);
#else
	contig_initmem_init(0, end_pfn);
#endif

	/* Reserve direct mapping */
	reserve_bootmem_generic(table_start << PAGE_SHIFT,
				(table_end - table_start) << PAGE_SHIFT);

	/* reserve kernel */
	reserve_bootmem_generic(__pa_symbol(&_text),
				__pa_symbol(&_end) - __pa_symbol(&_text));

	/*
	 * reserve physical page 0 - it's a special BIOS page on many boxes,
	 * enabling clean reboots, SMP operation, laptop functions.
	 */
	reserve_bootmem_generic(0, PAGE_SIZE);

	/* reserve ebda region */
	if (ebda_addr)
		reserve_bootmem_generic(ebda_addr, ebda_size);
#ifdef CONFIG_NUMA
	/* reserve nodemap region */
	if (nodemap_addr)
		reserve_bootmem_generic(nodemap_addr, nodemap_size);
#endif

#ifdef CONFIG_SMP
	/* Reserve SMP trampoline */
	reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
#endif

#ifdef CONFIG_ACPI_SLEEP
	/*
	 * Reserve low memory region for sleep support.
	 */
	acpi_reserve_bootmem();
#endif
	/*
	 * Find and reserve possible boot-time SMP configuration:
	 */
	find_smp_config();
#ifdef CONFIG_BLK_DEV_INITRD
	if (LOADER_TYPE && INITRD_START) {
		/* keep the initrd only if it lies entirely in mapped RAM */
		if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
			reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
			initrd_start = INITRD_START + PAGE_OFFSET;
			initrd_end = initrd_start+INITRD_SIZE;
		}
		else {
			printk(KERN_ERR "initrd extends beyond end of memory "
			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
			       (unsigned long)(INITRD_START + INITRD_SIZE),
			       (unsigned long)(end_pfn << PAGE_SHIFT));
			initrd_start = 0;
		}
	}
#endif
#ifdef CONFIG_KEXEC
	/* keep the crash kernel's memory out of the allocator */
	if (crashk_res.start != crashk_res.end) {
		reserve_bootmem_generic(crashk_res.start,
					crashk_res.end - crashk_res.start + 1);
	}
#endif

	paging_init();

#ifdef CONFIG_PCI
	early_quirks();
#endif

	/*
	 * set this early, so we dont allocate cpu0
	 * if MADT list doesnt list BSP first
	 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
	 */
	cpu_set(0, cpu_present_map);
#ifdef CONFIG_ACPI
	/*
	 * Read APIC and some other early information from ACPI tables.
	 */
	acpi_boot_init();
#endif

	init_cpu_to_node();

	/*
	 * get boot-time SMP configuration:
	 */
	if (smp_found_config)
		get_smp_config();
	init_apic_mappings();

	/*
	 * We trust e820 completely. No explicit ROM probing in memory.
	 */
	e820_reserve_resources();
	e820_mark_nosave_regions();

	{
		unsigned i;
		/* request I/O space for devices used on all i[345]86 PCs */
		for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
			request_resource(&ioport_resource, &standard_io_resources[i]);
	}

	e820_setup_gap();

#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
	conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
	conswitchp = &dummy_con;
#endif
#endif
}
420 | |||
421 | static int __cpuinit get_model_name(struct cpuinfo_x86 *c) | ||
422 | { | ||
423 | unsigned int *v; | ||
424 | |||
425 | if (c->extended_cpuid_level < 0x80000004) | ||
426 | return 0; | ||
427 | |||
428 | v = (unsigned int *) c->x86_model_id; | ||
429 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | ||
430 | cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | ||
431 | cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | ||
432 | c->x86_model_id[48] = 0; | ||
433 | return 1; | ||
434 | } | ||
435 | |||
436 | |||
437 | static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | ||
438 | { | ||
439 | unsigned int n, dummy, eax, ebx, ecx, edx; | ||
440 | |||
441 | n = c->extended_cpuid_level; | ||
442 | |||
443 | if (n >= 0x80000005) { | ||
444 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | ||
445 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | ||
446 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | ||
447 | c->x86_cache_size=(ecx>>24)+(edx>>24); | ||
448 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
449 | c->x86_tlbsize = 0; | ||
450 | } | ||
451 | |||
452 | if (n >= 0x80000006) { | ||
453 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | ||
454 | ecx = cpuid_ecx(0x80000006); | ||
455 | c->x86_cache_size = ecx >> 16; | ||
456 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
457 | |||
458 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | ||
459 | c->x86_cache_size, ecx & 0xFF); | ||
460 | } | ||
461 | |||
462 | if (n >= 0x80000007) | ||
463 | cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); | ||
464 | if (n >= 0x80000008) { | ||
465 | cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | ||
466 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
467 | c->x86_phys_bits = eax & 0xff; | ||
468 | } | ||
469 | } | ||
470 | |||
471 | #ifdef CONFIG_NUMA | ||
472 | static int nearby_node(int apicid) | ||
473 | { | ||
474 | int i; | ||
475 | for (i = apicid - 1; i >= 0; i--) { | ||
476 | int node = apicid_to_node[i]; | ||
477 | if (node != NUMA_NO_NODE && node_online(node)) | ||
478 | return node; | ||
479 | } | ||
480 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | ||
481 | int node = apicid_to_node[i]; | ||
482 | if (node != NUMA_NO_NODE && node_online(node)) | ||
483 | return node; | ||
484 | } | ||
485 | return first_node(node_online_map); /* Shouldn't happen */ | ||
486 | } | ||
487 | #endif | ||
488 | |||
/*
 * On an AMD dual core setup the lower bits of the APIC id distinguish
 * the cores.  Assumes number of cores is a power of two.
 * Derives cpu_core_id / phys_proc_id and, on NUMA, binds this CPU to
 * a node using SRAT data with several fallback heuristics.
 */
static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	unsigned bits;
#ifdef CONFIG_NUMA
	int cpu = smp_processor_id();
	int node = 0;
	unsigned apicid = hard_smp_processor_id();
#endif
	unsigned ecx = cpuid_ecx(0x80000008);

	c->x86_max_cores = (ecx & 0xff) + 1;

	/* CPU telling us the core id bits shift? */
	bits = (ecx >> 12) & 0xF;

	/* Otherwise recompute */
	if (bits == 0) {
		while ((1 << bits) < c->x86_max_cores)
			bits++;
	}

	/* Low order bits define the core id (index of core in socket).
	 * Note: phys_proc_id still holds the raw APIC id here and is
	 * only converted to the socket id on the next line — order matters. */
	c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
	/* Convert the APIC ID into the socket ID */
	c->phys_proc_id = phys_pkg_id(bits);

#ifdef CONFIG_NUMA
	node = c->phys_proc_id;
	if (apicid_to_node[apicid] != NUMA_NO_NODE)
		node = apicid_to_node[apicid];
	if (!node_online(node)) {
		/* Two possibilities here:
		   - The CPU is missing memory and no node was created.
		   In that case try picking one from a nearby CPU
		   - The APIC IDs differ from the HyperTransport node IDs
		   which the K8 northbridge parsing fills in.
		   Assume they are all increased by a constant offset,
		   but in the same order as the HT nodeids.
		   If that doesn't result in a usable node fall back to the
		   path for the previous case. */
		int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
		if (ht_nodeid >= 0 &&
		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
			node = apicid_to_node[ht_nodeid];
		/* Pick a nearby node */
		if (!node_online(node))
			node = nearby_node(apicid);
	}
	numa_set_node(cpu, node);

	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
#endif
#endif
}
548 | |||
/*
 * AMD-specific CPU setup: errata workarounds, feature-bit fixups,
 * multi-core detection and cache-leaf count selection.
 */
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
	unsigned level;

#ifdef CONFIG_SMP
	unsigned long value;

	/*
	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
	 * bit 6 of msr C001_0015
	 *
	 * Errata 63 for SH-B3 steppings
	 * Errata 122 for all steppings (F+ have it disabled by default)
	 */
	if (c->x86 == 15) {
		rdmsrl(MSR_K8_HWCR, value);
		value |= 1 << 6;
		wrmsrl(MSR_K8_HWCR, value);
	}
#endif

	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
	clear_bit(0*32+31, &c->x86_capability);

	/* On C+ stepping K8 rep microcode works well for copy/memset */
	level = cpuid_eax(1);
	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
	/* family 0x10 always has fast rep string operations */
	if (c->x86 == 0x10)
		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);

	/* Enable workaround for FXSAVE leak */
	if (c->x86 >= 6)
		set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);

	level = get_model_name(c);
	if (!level) {
		/* no brand string available — synthesize a name */
		switch (c->x86) {
		case 15:
			/* Should distinguish Models here, but this is only
			   a fallback anyways. */
			strcpy(c->x86_model_id, "Hammer");
			break;
		}
	}
	display_cacheinfo(c);

	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
	if (c->x86_power & (1<<8))
		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);

	/* Multi core CPU? */
	if (c->extended_cpuid_level >= 0x80000008)
		amd_detect_cmp(c);

	/* pick cache-leaf count based on presence of an L3 (leaf
	 * 0x80000006 edx bits 12-15) */
	if (c->extended_cpuid_level >= 0x80000006 &&
		(cpuid_edx(0x80000006) & 0xf000))
		num_cache_leaves = 4;
	else
		num_cache_leaves = 3;

	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
		set_bit(X86_FEATURE_K8, &c->x86_capability);

	/* RDTSC can be speculated around */
	clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);

	/* Family 10 doesn't support C states in MWAIT so don't use it */
	if (c->x86 == 0x10 && !force_mwait)
		clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
}
621 | |||
/*
 * Detect Hyper-Threading / sibling topology from CPUID leaf 1 and
 * derive phys_proc_id (package) and cpu_core_id for this CPU.
 * Relies on c->x86_max_cores having been set by the vendor init code.
 */
static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	u32 eax, ebx, ecx, edx;
	int index_msb, core_bits;

	cpuid(1, &eax, &ebx, &ecx, &edx);


	if (!cpu_has(c, X86_FEATURE_HT))
		return;
	/* CMP_LEGACY: cores, not HT siblings — skip the sibling math */
	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
		goto out;

	/* leaf 1 ebx[23:16] = logical processors per package */
	smp_num_siblings = (ebx & 0xff0000) >> 16;

	if (smp_num_siblings == 1) {
		printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
	} else if (smp_num_siblings > 1 ) {

		if (smp_num_siblings > NR_CPUS) {
			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
			smp_num_siblings = 1;
			return;
		}

		index_msb = get_count_order(smp_num_siblings);
		c->phys_proc_id = phys_pkg_id(index_msb);

		/* siblings per core = total siblings / cores per package */
		smp_num_siblings = smp_num_siblings / c->x86_max_cores;

		index_msb = get_count_order(smp_num_siblings) ;

		core_bits = get_count_order(c->x86_max_cores);

		c->cpu_core_id = phys_pkg_id(index_msb) &
					       ((1 << core_bits) - 1);
	}
out:
	if ((c->x86_max_cores * smp_num_siblings) > 1) {
		printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
		printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
	}

#endif
}
668 | |||
669 | /* | ||
670 | * find out the number of processor cores on the die | ||
671 | */ | ||
672 | static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) | ||
673 | { | ||
674 | unsigned int eax, t; | ||
675 | |||
676 | if (c->cpuid_level < 4) | ||
677 | return 1; | ||
678 | |||
679 | cpuid_count(4, 0, &eax, &t, &t, &t); | ||
680 | |||
681 | if (eax & 0x1f) | ||
682 | return ((eax >> 26) + 1); | ||
683 | else | ||
684 | return 1; | ||
685 | } | ||
686 | |||
/*
 * Bind the current CPU to the NUMA node recorded for its APIC id by
 * SRAT parsing; no-op when NUMA is compiled out.
 */
static void srat_detect_node(void)
{
#ifdef CONFIG_NUMA
	unsigned node;
	int cpu = smp_processor_id();
	int apicid = hard_smp_processor_id();

	/* Don't do the funky fallback heuristics the AMD version employs
	   for now. */
	node = apicid_to_node[apicid];
	if (node == NUMA_NO_NODE)
		node = first_node(node_online_map);
	numa_set_node(cpu, node);

	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
#endif
}
704 | |||
/*
 * Intel-specific CPU setup: cache info, perfmon/BTS/PEBS detection,
 * address-width workarounds and feature-bit fixups.
 */
static void __cpuinit init_intel(struct cpuinfo_x86 *c)
{
	/* Cache sizes */
	unsigned n;

	init_intel_cacheinfo(c);
	if (c->cpuid_level > 9 ) {
		unsigned eax = cpuid_eax(10);
		/* Check for version and the number of counters */
		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
			set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
	}

	if (cpu_has_ds) {
		unsigned int l1, l2;
		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
		/* MISC_ENABLE bits 11/12 *clear* = BTS/PEBS available */
		if (!(l1 & (1<<11)))
			set_bit(X86_FEATURE_BTS, c->x86_capability);
		if (!(l1 & (1<<12)))
			set_bit(X86_FEATURE_PEBS, c->x86_capability);
	}

	n = c->extended_cpuid_level;
	if (n >= 0x80000008) {
		unsigned eax = cpuid_eax(0x80000008);
		c->x86_virt_bits = (eax >> 8) & 0xff;
		c->x86_phys_bits = eax & 0xff;
		/* CPUID workaround for Intel 0F34 CPU */
		if (c->x86_vendor == X86_VENDOR_INTEL &&
		    c->x86 == 0xF && c->x86_model == 0x3 &&
		    c->x86_mask == 0x4)
			c->x86_phys_bits = 36;
	}

	if (c->x86 == 15)
		c->x86_cache_alignment = c->x86_clflush_size * 2;
	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
	if (c->x86 == 6)
		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
	/* only P4 (family 15) needs serializing RDTSC here */
	if (c->x86 == 15)
		set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
	else
		clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
	c->x86_max_cores = intel_num_cpu_cores(c);

	srat_detect_node();
}
754 | |||
755 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | ||
756 | { | ||
757 | char *v = c->x86_vendor_id; | ||
758 | |||
759 | if (!strcmp(v, "AuthenticAMD")) | ||
760 | c->x86_vendor = X86_VENDOR_AMD; | ||
761 | else if (!strcmp(v, "GenuineIntel")) | ||
762 | c->x86_vendor = X86_VENDOR_INTEL; | ||
763 | else | ||
764 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
765 | } | ||
766 | |||
767 | struct cpu_model_info { | ||
768 | int vendor; | ||
769 | int family; | ||
770 | char *model_names[16]; | ||
771 | }; | ||
772 | |||
/* Do some early cpuid on the boot CPU to get some parameter that are
   needed before check_bugs. Everything advanced is in identify_cpu
   below. */
void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
{
	u32 tfms;

	/* start from known defaults; vendor code refines these later */
	c->loops_per_jiffy = loops_per_jiffy;
	c->x86_cache_size = -1;
	c->x86_vendor = X86_VENDOR_UNKNOWN;
	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
	c->x86_vendor_id[0] = '\0'; /* Unset */
	c->x86_model_id[0] = '\0';  /* Unset */
	c->x86_clflush_size = 64;
	c->x86_cache_alignment = c->x86_clflush_size;
	c->x86_max_cores = 1;
	c->extended_cpuid_level = 0;
	memset(&c->x86_capability, 0, sizeof c->x86_capability);

	/* Get vendor name: leaf 0 returns the 12-byte vendor string in
	 * ebx, edx, ecx — hence the 0/8/4 offsets below. */
	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
	      (unsigned int *)&c->x86_vendor_id[0],
	      (unsigned int *)&c->x86_vendor_id[8],
	      (unsigned int *)&c->x86_vendor_id[4]);

	get_cpu_vendor(c);

	/* Initialize the standard set of capabilities */
	/* Note that the vendor-specific code below might override */

	/* Intel-defined flags: level 0x00000001 */
	if (c->cpuid_level >= 0x00000001) {
		__u32 misc;
		cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
		      &c->x86_capability[0]);
		c->x86 = (tfms >> 8) & 0xf;
		c->x86_model = (tfms >> 4) & 0xf;
		c->x86_mask = tfms & 0xf;
		/* fold in the extended family/model fields */
		if (c->x86 == 0xf)
			c->x86 += (tfms >> 20) & 0xff;
		if (c->x86 >= 0x6)
			c->x86_model += ((tfms >> 16) & 0xF) << 4;
		/* CLFLUSH feature bit implies a valid line size in misc */
		if (c->x86_capability[0] & (1<<19))
			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
	} else {
		/* Have CPUID level 0 only - unheard of */
		c->x86 = 4;
	}

#ifdef CONFIG_SMP
	/* initial APIC id from leaf 1 ebx[31:24] */
	c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
#endif
}
826 | |||
/*
 * This does the hard work of actually picking apart the CPU stuff...
 * Runs early identification, reads the extended/vendor CPUID leaves,
 * dispatches to vendor-specific init, and on secondary CPUs ANDs the
 * feature flags into boot_cpu_data so only common features remain.
 */
void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
	int i;
	u32 xlvl;

	early_identify_cpu(c);

	/* AMD-defined flags: level 0x80000001 */
	xlvl = cpuid_eax(0x80000000);
	c->extended_cpuid_level = xlvl;
	/* sanity check: a real max-extended-leaf value is 0x8000xxxx */
	if ((xlvl & 0xffff0000) == 0x80000000) {
		if (xlvl >= 0x80000001) {
			c->x86_capability[1] = cpuid_edx(0x80000001);
			c->x86_capability[6] = cpuid_ecx(0x80000001);
		}
		if (xlvl >= 0x80000004)
			get_model_name(c); /* Default name */
	}

	/* Transmeta-defined flags: level 0x80860001 */
	xlvl = cpuid_eax(0x80860000);
	if ((xlvl & 0xffff0000) == 0x80860000) {
		/* Don't set x86_cpuid_level here for now to not confuse. */
		if (xlvl >= 0x80860001)
			c->x86_capability[2] = cpuid_edx(0x80860001);
	}

	init_scattered_cpuid_features(c);

	c->apicid = phys_pkg_id(0);

	/*
	 * Vendor-specific initialization. In this section we
	 * canonicalize the feature flags, meaning if there are
	 * features a certain CPU supports which CPUID doesn't
	 * tell us, CPUID claiming incorrect flags, or other bugs,
	 * we handle them here.
	 *
	 * At the end of this section, c->x86_capability better
	 * indicate the features this CPU genuinely supports!
	 */
	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		init_amd(c);
		break;

	case X86_VENDOR_INTEL:
		init_intel(c);
		break;

	case X86_VENDOR_UNKNOWN:
	default:
		display_cacheinfo(c);
		break;
	}

	select_idle_routine(c);
	detect_ht(c);

	/*
	 * On SMP, boot_cpu_data holds the common feature set between
	 * all CPUs; so make sure that we indicate which features are
	 * common between the CPUs. The first time this routine gets
	 * executed, c == &boot_cpu_data.
	 */
	if (c != &boot_cpu_data) {
		/* AND the already accumulated flags with these */
		for (i = 0 ; i < NCAPINTS ; i++)
			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
	}

#ifdef CONFIG_X86_MCE
	mcheck_init(c);
#endif
	/* MTRR setup for APs only; the BSP was handled earlier */
	if (c != &boot_cpu_data)
		mtrr_ap_init();
#ifdef CONFIG_NUMA
	numa_add_cpu(smp_processor_id());
#endif
}
910 | |||
911 | |||
/* Print the CPU model name (if known) and stepping to the console. */
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
	if (c->x86_model_id[0])
		printk("%s", c->x86_model_id);

	/* NOTE(review): cpuid_level >= 0 holds for any CPU that reached
	 * this point, so the stepping branch is effectively always taken. */
	if (c->x86_mask || c->cpuid_level >= 0)
		printk(" stepping %02x\n", c->x86_mask);
	else
		printk("\n");
}
922 | |||
923 | /* | ||
924 | * Get CPU information for use by the procfs. | ||
925 | */ | ||
926 | |||
927 | static int show_cpuinfo(struct seq_file *m, void *v) | ||
928 | { | ||
929 | struct cpuinfo_x86 *c = v; | ||
930 | |||
931 | /* | ||
932 | * These flag bits must match the definitions in <asm/cpufeature.h>. | ||
933 | * NULL means this bit is undefined or reserved; either way it doesn't | ||
934 | * have meaning as far as Linux is concerned. Note that it's important | ||
935 | * to realize there is a difference between this table and CPUID -- if | ||
936 | * applications want to get the raw CPUID data, they should access | ||
937 | * /dev/cpu/<cpu_nr>/cpuid instead. | ||
938 | */ | ||
939 | static char *x86_cap_flags[] = { | ||
940 | /* Intel-defined */ | ||
941 | "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", | ||
942 | "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", | ||
943 | "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", | ||
944 | "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", | ||
945 | |||
946 | /* AMD-defined */ | ||
947 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
948 | NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, | ||
949 | NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, | ||
950 | NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", | ||
951 | "3dnowext", "3dnow", | ||
952 | |||
953 | /* Transmeta-defined */ | ||
954 | "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, | ||
955 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
956 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
957 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
958 | |||
959 | /* Other (Linux-defined) */ | ||
960 | "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", | ||
961 | NULL, NULL, NULL, NULL, | ||
962 | "constant_tsc", "up", NULL, "arch_perfmon", | ||
963 | "pebs", "bts", NULL, "sync_rdtsc", | ||
964 | "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
965 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
966 | |||
967 | /* Intel-defined (#2) */ | ||
968 | "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", | ||
969 | "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, | ||
970 | NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", | ||
971 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
972 | |||
973 | /* VIA/Cyrix/Centaur-defined */ | ||
974 | NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", | ||
975 | "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, | ||
976 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
977 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
978 | |||
979 | /* AMD-defined (#2) */ | ||
980 | "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", | ||
981 | "altmovcr8", "abm", "sse4a", | ||
982 | "misalignsse", "3dnowprefetch", | ||
983 | "osvw", "ibs", NULL, NULL, NULL, NULL, | ||
984 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
985 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
986 | |||
987 | /* Auxiliary (Linux-defined) */ | ||
988 | "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
989 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
990 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
991 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
992 | }; | ||
993 | static char *x86_power_flags[] = { | ||
994 | "ts", /* temperature sensor */ | ||
995 | "fid", /* frequency id control */ | ||
996 | "vid", /* voltage id control */ | ||
997 | "ttp", /* thermal trip */ | ||
998 | "tm", | ||
999 | "stc", | ||
1000 | "100mhzsteps", | ||
1001 | "hwpstate", | ||
1002 | "", /* tsc invariant mapped to constant_tsc */ | ||
1003 | /* nothing */ | ||
1004 | }; | ||
1005 | |||
1006 | |||
1007 | #ifdef CONFIG_SMP | ||
1008 | if (!cpu_online(c-cpu_data)) | ||
1009 | return 0; | ||
1010 | #endif | ||
1011 | |||
1012 | seq_printf(m,"processor\t: %u\n" | ||
1013 | "vendor_id\t: %s\n" | ||
1014 | "cpu family\t: %d\n" | ||
1015 | "model\t\t: %d\n" | ||
1016 | "model name\t: %s\n", | ||
1017 | (unsigned)(c-cpu_data), | ||
1018 | c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", | ||
1019 | c->x86, | ||
1020 | (int)c->x86_model, | ||
1021 | c->x86_model_id[0] ? c->x86_model_id : "unknown"); | ||
1022 | |||
1023 | if (c->x86_mask || c->cpuid_level >= 0) | ||
1024 | seq_printf(m, "stepping\t: %d\n", c->x86_mask); | ||
1025 | else | ||
1026 | seq_printf(m, "stepping\t: unknown\n"); | ||
1027 | |||
1028 | if (cpu_has(c,X86_FEATURE_TSC)) { | ||
1029 | unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data)); | ||
1030 | if (!freq) | ||
1031 | freq = cpu_khz; | ||
1032 | seq_printf(m, "cpu MHz\t\t: %u.%03u\n", | ||
1033 | freq / 1000, (freq % 1000)); | ||
1034 | } | ||
1035 | |||
1036 | /* Cache size */ | ||
1037 | if (c->x86_cache_size >= 0) | ||
1038 | seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); | ||
1039 | |||
1040 | #ifdef CONFIG_SMP | ||
1041 | if (smp_num_siblings * c->x86_max_cores > 1) { | ||
1042 | int cpu = c - cpu_data; | ||
1043 | seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); | ||
1044 | seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); | ||
1045 | seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); | ||
1046 | seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); | ||
1047 | } | ||
1048 | #endif | ||
1049 | |||
1050 | seq_printf(m, | ||
1051 | "fpu\t\t: yes\n" | ||
1052 | "fpu_exception\t: yes\n" | ||
1053 | "cpuid level\t: %d\n" | ||
1054 | "wp\t\t: yes\n" | ||
1055 | "flags\t\t:", | ||
1056 | c->cpuid_level); | ||
1057 | |||
1058 | { | ||
1059 | int i; | ||
1060 | for ( i = 0 ; i < 32*NCAPINTS ; i++ ) | ||
1061 | if (cpu_has(c, i) && x86_cap_flags[i] != NULL) | ||
1062 | seq_printf(m, " %s", x86_cap_flags[i]); | ||
1063 | } | ||
1064 | |||
1065 | seq_printf(m, "\nbogomips\t: %lu.%02lu\n", | ||
1066 | c->loops_per_jiffy/(500000/HZ), | ||
1067 | (c->loops_per_jiffy/(5000/HZ)) % 100); | ||
1068 | |||
1069 | if (c->x86_tlbsize > 0) | ||
1070 | seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); | ||
1071 | seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); | ||
1072 | seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); | ||
1073 | |||
1074 | seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | ||
1075 | c->x86_phys_bits, c->x86_virt_bits); | ||
1076 | |||
1077 | seq_printf(m, "power management:"); | ||
1078 | { | ||
1079 | unsigned i; | ||
1080 | for (i = 0; i < 32; i++) | ||
1081 | if (c->x86_power & (1 << i)) { | ||
1082 | if (i < ARRAY_SIZE(x86_power_flags) && | ||
1083 | x86_power_flags[i]) | ||
1084 | seq_printf(m, "%s%s", | ||
1085 | x86_power_flags[i][0]?" ":"", | ||
1086 | x86_power_flags[i]); | ||
1087 | else | ||
1088 | seq_printf(m, " [%d]", i); | ||
1089 | } | ||
1090 | } | ||
1091 | |||
1092 | seq_printf(m, "\n\n"); | ||
1093 | |||
1094 | return 0; | ||
1095 | } | ||
1096 | |||
1097 | static void *c_start(struct seq_file *m, loff_t *pos) | ||
1098 | { | ||
1099 | return *pos < NR_CPUS ? cpu_data + *pos : NULL; | ||
1100 | } | ||
1101 | |||
1102 | static void *c_next(struct seq_file *m, void *v, loff_t *pos) | ||
1103 | { | ||
1104 | ++*pos; | ||
1105 | return c_start(m, pos); | ||
1106 | } | ||
1107 | |||
1108 | static void c_stop(struct seq_file *m, void *v) | ||
1109 | { | ||
1110 | } | ||
1111 | |||
1112 | struct seq_operations cpuinfo_op = { | ||
1113 | .start =c_start, | ||
1114 | .next = c_next, | ||
1115 | .stop = c_stop, | ||
1116 | .show = show_cpuinfo, | ||
1117 | }; | ||
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c deleted file mode 100644 index 1200aaac403e..000000000000 --- a/arch/x86_64/kernel/setup64.c +++ /dev/null | |||
@@ -1,289 +0,0 @@ | |||
1 | /* | ||
2 | * X86-64 specific CPU setup. | ||
3 | * Copyright (C) 1995 Linus Torvalds | ||
4 | * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. | ||
5 | * See setup.c for older changelog. | ||
6 | */ | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/bootmem.h> | ||
12 | #include <linux/bitops.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <asm/bootsetup.h> | ||
15 | #include <asm/pda.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/processor.h> | ||
18 | #include <asm/desc.h> | ||
19 | #include <asm/atomic.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/smp.h> | ||
22 | #include <asm/i387.h> | ||
23 | #include <asm/percpu.h> | ||
24 | #include <asm/proto.h> | ||
25 | #include <asm/sections.h> | ||
26 | |||
27 | char x86_boot_params[BOOT_PARAM_SIZE] __initdata; | ||
28 | |||
29 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | ||
30 | |||
31 | struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; | ||
32 | EXPORT_SYMBOL(_cpu_pda); | ||
33 | struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; | ||
34 | |||
35 | struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | ||
36 | |||
37 | char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); | ||
38 | |||
39 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | ||
40 | static int do_not_nx __cpuinitdata = 0; | ||
41 | |||
42 | /* noexec=on|off | ||
43 | Control non executable mappings for 64bit processes. | ||
44 | |||
45 | on Enable(default) | ||
46 | off Disable | ||
47 | */ | ||
48 | static int __init nonx_setup(char *str) | ||
49 | { | ||
50 | if (!str) | ||
51 | return -EINVAL; | ||
52 | if (!strncmp(str, "on", 2)) { | ||
53 | __supported_pte_mask |= _PAGE_NX; | ||
54 | do_not_nx = 0; | ||
55 | } else if (!strncmp(str, "off", 3)) { | ||
56 | do_not_nx = 1; | ||
57 | __supported_pte_mask &= ~_PAGE_NX; | ||
58 | } | ||
59 | return 0; | ||
60 | } | ||
61 | early_param("noexec", nonx_setup); | ||
62 | |||
63 | int force_personality32 = 0; | ||
64 | |||
65 | /* noexec32=on|off | ||
66 | Control non executable heap for 32bit processes. | ||
67 | To control the stack too use noexec=off | ||
68 | |||
69 | on PROT_READ does not imply PROT_EXEC for 32bit processes | ||
70 | off PROT_READ implies PROT_EXEC (default) | ||
71 | */ | ||
72 | static int __init nonx32_setup(char *str) | ||
73 | { | ||
74 | if (!strcmp(str, "on")) | ||
75 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
76 | else if (!strcmp(str, "off")) | ||
77 | force_personality32 |= READ_IMPLIES_EXEC; | ||
78 | return 1; | ||
79 | } | ||
80 | __setup("noexec32=", nonx32_setup); | ||
81 | |||
82 | /* | ||
83 | * Great future plan: | ||
84 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | ||
85 | * Always point %gs to its beginning | ||
86 | */ | ||
87 | void __init setup_per_cpu_areas(void) | ||
88 | { | ||
89 | int i; | ||
90 | unsigned long size; | ||
91 | |||
92 | #ifdef CONFIG_HOTPLUG_CPU | ||
93 | prefill_possible_map(); | ||
94 | #endif | ||
95 | |||
96 | /* Copy section for each CPU (we discard the original) */ | ||
97 | size = PERCPU_ENOUGH_ROOM; | ||
98 | |||
99 | printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); | ||
100 | for_each_cpu_mask (i, cpu_possible_map) { | ||
101 | char *ptr; | ||
102 | |||
103 | if (!NODE_DATA(cpu_to_node(i))) { | ||
104 | printk("cpu with no node %d, num_online_nodes %d\n", | ||
105 | i, num_online_nodes()); | ||
106 | ptr = alloc_bootmem_pages(size); | ||
107 | } else { | ||
108 | ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); | ||
109 | } | ||
110 | if (!ptr) | ||
111 | panic("Cannot allocate cpu data for CPU %d\n", i); | ||
112 | cpu_pda(i)->data_offset = ptr - __per_cpu_start; | ||
113 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
114 | } | ||
115 | } | ||
116 | |||
117 | void pda_init(int cpu) | ||
118 | { | ||
119 | struct x8664_pda *pda = cpu_pda(cpu); | ||
120 | |||
121 | /* Setup up data that may be needed in __get_free_pages early */ | ||
122 | asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); | ||
123 | /* Memory clobbers used to order PDA accessed */ | ||
124 | mb(); | ||
125 | wrmsrl(MSR_GS_BASE, pda); | ||
126 | mb(); | ||
127 | |||
128 | pda->cpunumber = cpu; | ||
129 | pda->irqcount = -1; | ||
130 | pda->kernelstack = | ||
131 | (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; | ||
132 | pda->active_mm = &init_mm; | ||
133 | pda->mmu_state = 0; | ||
134 | |||
135 | if (cpu == 0) { | ||
136 | /* others are initialized in smpboot.c */ | ||
137 | pda->pcurrent = &init_task; | ||
138 | pda->irqstackptr = boot_cpu_stack; | ||
139 | } else { | ||
140 | pda->irqstackptr = (char *) | ||
141 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
142 | if (!pda->irqstackptr) | ||
143 | panic("cannot allocate irqstack for cpu %d", cpu); | ||
144 | } | ||
145 | |||
146 | |||
147 | pda->irqstackptr += IRQSTACKSIZE-64; | ||
148 | } | ||
149 | |||
150 | char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] | ||
151 | __attribute__((section(".bss.page_aligned"))); | ||
152 | |||
153 | extern asmlinkage void ignore_sysret(void); | ||
154 | |||
155 | /* May not be marked __init: used by software suspend */ | ||
156 | void syscall_init(void) | ||
157 | { | ||
158 | /* | ||
159 | * LSTAR and STAR live in a bit strange symbiosis. | ||
160 | * They both write to the same internal register. STAR allows to set CS/DS | ||
161 | * but only a 32bit target. LSTAR sets the 64bit rip. | ||
162 | */ | ||
163 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
164 | wrmsrl(MSR_LSTAR, system_call); | ||
165 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
166 | |||
167 | #ifdef CONFIG_IA32_EMULATION | ||
168 | syscall32_cpu_init (); | ||
169 | #endif | ||
170 | |||
171 | /* Flags to clear on syscall */ | ||
172 | wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); | ||
173 | } | ||
174 | |||
175 | void __cpuinit check_efer(void) | ||
176 | { | ||
177 | unsigned long efer; | ||
178 | |||
179 | rdmsrl(MSR_EFER, efer); | ||
180 | if (!(efer & EFER_NX) || do_not_nx) { | ||
181 | __supported_pte_mask &= ~_PAGE_NX; | ||
182 | } | ||
183 | } | ||
184 | |||
185 | unsigned long kernel_eflags; | ||
186 | |||
187 | /* | ||
188 | * cpu_init() initializes state that is per-CPU. Some data is already | ||
189 | * initialized (naturally) in the bootstrap process, such as the GDT | ||
190 | * and IDT. We reload them nevertheless, this function acts as a | ||
191 | * 'CPU state barrier', nothing should get across. | ||
192 | * A lot of state is already set up in PDA init. | ||
193 | */ | ||
194 | void __cpuinit cpu_init (void) | ||
195 | { | ||
196 | int cpu = stack_smp_processor_id(); | ||
197 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
198 | struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); | ||
199 | unsigned long v; | ||
200 | char *estacks = NULL; | ||
201 | struct task_struct *me; | ||
202 | int i; | ||
203 | |||
204 | /* CPU 0 is initialised in head64.c */ | ||
205 | if (cpu != 0) { | ||
206 | pda_init(cpu); | ||
207 | } else | ||
208 | estacks = boot_exception_stacks; | ||
209 | |||
210 | me = current; | ||
211 | |||
212 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
213 | panic("CPU#%d already initialized!\n", cpu); | ||
214 | |||
215 | printk("Initializing CPU#%d\n", cpu); | ||
216 | |||
217 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
218 | |||
219 | /* | ||
220 | * Initialize the per-CPU GDT with the boot GDT, | ||
221 | * and set up the GDT descriptor: | ||
222 | */ | ||
223 | if (cpu) | ||
224 | memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); | ||
225 | |||
226 | cpu_gdt_descr[cpu].size = GDT_SIZE; | ||
227 | asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); | ||
228 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
229 | |||
230 | memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); | ||
231 | syscall_init(); | ||
232 | |||
233 | wrmsrl(MSR_FS_BASE, 0); | ||
234 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
235 | barrier(); | ||
236 | |||
237 | check_efer(); | ||
238 | |||
239 | /* | ||
240 | * set up and load the per-CPU TSS | ||
241 | */ | ||
242 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
243 | static const unsigned int order[N_EXCEPTION_STACKS] = { | ||
244 | [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, | ||
245 | [DEBUG_STACK - 1] = DEBUG_STACK_ORDER | ||
246 | }; | ||
247 | if (cpu) { | ||
248 | estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); | ||
249 | if (!estacks) | ||
250 | panic("Cannot allocate exception stack %ld %d\n", | ||
251 | v, cpu); | ||
252 | } | ||
253 | estacks += PAGE_SIZE << order[v]; | ||
254 | orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; | ||
255 | } | ||
256 | |||
257 | t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
258 | /* | ||
259 | * <= is required because the CPU will access up to | ||
260 | * 8 bits beyond the end of the IO permission bitmap. | ||
261 | */ | ||
262 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
263 | t->io_bitmap[i] = ~0UL; | ||
264 | |||
265 | atomic_inc(&init_mm.mm_count); | ||
266 | me->active_mm = &init_mm; | ||
267 | if (me->mm) | ||
268 | BUG(); | ||
269 | enter_lazy_tlb(&init_mm, me); | ||
270 | |||
271 | set_tss_desc(cpu, t); | ||
272 | load_TR_desc(); | ||
273 | load_LDT(&init_mm.context); | ||
274 | |||
275 | /* | ||
276 | * Clear all 6 debug registers: | ||
277 | */ | ||
278 | |||
279 | set_debugreg(0UL, 0); | ||
280 | set_debugreg(0UL, 1); | ||
281 | set_debugreg(0UL, 2); | ||
282 | set_debugreg(0UL, 3); | ||
283 | set_debugreg(0UL, 6); | ||
284 | set_debugreg(0UL, 7); | ||
285 | |||
286 | fpu_init(); | ||
287 | |||
288 | raw_local_save_flags(kernel_eflags); | ||
289 | } | ||
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c deleted file mode 100644 index 739175b01e06..000000000000 --- a/arch/x86_64/kernel/signal.c +++ /dev/null | |||
@@ -1,495 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * | ||
7 | * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson | ||
8 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | ||
9 | * 2000-2002 x86-64 support by Andi Kleen | ||
10 | */ | ||
11 | |||
12 | #include <linux/sched.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/smp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/signal.h> | ||
17 | #include <linux/errno.h> | ||
18 | #include <linux/wait.h> | ||
19 | #include <linux/ptrace.h> | ||
20 | #include <linux/unistd.h> | ||
21 | #include <linux/stddef.h> | ||
22 | #include <linux/personality.h> | ||
23 | #include <linux/compiler.h> | ||
24 | #include <asm/ucontext.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/i387.h> | ||
27 | #include <asm/proto.h> | ||
28 | #include <asm/ia32_unistd.h> | ||
29 | #include <asm/mce.h> | ||
30 | |||
31 | /* #define DEBUG_SIG 1 */ | ||
32 | |||
33 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | ||
34 | |||
35 | int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
36 | sigset_t *set, struct pt_regs * regs); | ||
37 | int ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
38 | sigset_t *set, struct pt_regs * regs); | ||
39 | |||
40 | asmlinkage long | ||
41 | sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | ||
42 | struct pt_regs *regs) | ||
43 | { | ||
44 | return do_sigaltstack(uss, uoss, regs->rsp); | ||
45 | } | ||
46 | |||
47 | |||
48 | /* | ||
49 | * Do a signal return; undo the signal stack. | ||
50 | */ | ||
51 | |||
52 | struct rt_sigframe | ||
53 | { | ||
54 | char __user *pretcode; | ||
55 | struct ucontext uc; | ||
56 | struct siginfo info; | ||
57 | }; | ||
58 | |||
59 | static int | ||
60 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) | ||
61 | { | ||
62 | unsigned int err = 0; | ||
63 | |||
64 | /* Always make any pending restarted system calls return -EINTR */ | ||
65 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
66 | |||
67 | #define COPY(x) err |= __get_user(regs->x, &sc->x) | ||
68 | |||
69 | COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); | ||
70 | COPY(rdx); COPY(rcx); COPY(rip); | ||
71 | COPY(r8); | ||
72 | COPY(r9); | ||
73 | COPY(r10); | ||
74 | COPY(r11); | ||
75 | COPY(r12); | ||
76 | COPY(r13); | ||
77 | COPY(r14); | ||
78 | COPY(r15); | ||
79 | |||
80 | /* Kernel saves and restores only the CS segment register on signals, | ||
81 | * which is the bare minimum needed to allow mixed 32/64-bit code. | ||
82 | * App's signal handler can save/restore other segments if needed. */ | ||
83 | { | ||
84 | unsigned cs; | ||
85 | err |= __get_user(cs, &sc->cs); | ||
86 | regs->cs = cs | 3; /* Force into user mode */ | ||
87 | } | ||
88 | |||
89 | { | ||
90 | unsigned int tmpflags; | ||
91 | err |= __get_user(tmpflags, &sc->eflags); | ||
92 | regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); | ||
93 | regs->orig_rax = -1; /* disable syscall checks */ | ||
94 | } | ||
95 | |||
96 | { | ||
97 | struct _fpstate __user * buf; | ||
98 | err |= __get_user(buf, &sc->fpstate); | ||
99 | |||
100 | if (buf) { | ||
101 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
102 | goto badframe; | ||
103 | err |= restore_i387(buf); | ||
104 | } else { | ||
105 | struct task_struct *me = current; | ||
106 | if (used_math()) { | ||
107 | clear_fpu(me); | ||
108 | clear_used_math(); | ||
109 | } | ||
110 | } | ||
111 | } | ||
112 | |||
113 | err |= __get_user(*prax, &sc->rax); | ||
114 | return err; | ||
115 | |||
116 | badframe: | ||
117 | return 1; | ||
118 | } | ||
119 | |||
120 | asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | ||
121 | { | ||
122 | struct rt_sigframe __user *frame; | ||
123 | sigset_t set; | ||
124 | unsigned long eax; | ||
125 | |||
126 | frame = (struct rt_sigframe __user *)(regs->rsp - 8); | ||
127 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { | ||
128 | goto badframe; | ||
129 | } | ||
130 | if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { | ||
131 | goto badframe; | ||
132 | } | ||
133 | |||
134 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
135 | spin_lock_irq(¤t->sighand->siglock); | ||
136 | current->blocked = set; | ||
137 | recalc_sigpending(); | ||
138 | spin_unlock_irq(¤t->sighand->siglock); | ||
139 | |||
140 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) | ||
141 | goto badframe; | ||
142 | |||
143 | #ifdef DEBUG_SIG | ||
144 | printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); | ||
145 | #endif | ||
146 | |||
147 | if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) | ||
148 | goto badframe; | ||
149 | |||
150 | return eax; | ||
151 | |||
152 | badframe: | ||
153 | signal_fault(regs,frame,"sigreturn"); | ||
154 | return 0; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Set up a signal frame. | ||
159 | */ | ||
160 | |||
161 | static inline int | ||
162 | setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) | ||
163 | { | ||
164 | int err = 0; | ||
165 | |||
166 | err |= __put_user(regs->cs, &sc->cs); | ||
167 | err |= __put_user(0, &sc->gs); | ||
168 | err |= __put_user(0, &sc->fs); | ||
169 | |||
170 | err |= __put_user(regs->rdi, &sc->rdi); | ||
171 | err |= __put_user(regs->rsi, &sc->rsi); | ||
172 | err |= __put_user(regs->rbp, &sc->rbp); | ||
173 | err |= __put_user(regs->rsp, &sc->rsp); | ||
174 | err |= __put_user(regs->rbx, &sc->rbx); | ||
175 | err |= __put_user(regs->rdx, &sc->rdx); | ||
176 | err |= __put_user(regs->rcx, &sc->rcx); | ||
177 | err |= __put_user(regs->rax, &sc->rax); | ||
178 | err |= __put_user(regs->r8, &sc->r8); | ||
179 | err |= __put_user(regs->r9, &sc->r9); | ||
180 | err |= __put_user(regs->r10, &sc->r10); | ||
181 | err |= __put_user(regs->r11, &sc->r11); | ||
182 | err |= __put_user(regs->r12, &sc->r12); | ||
183 | err |= __put_user(regs->r13, &sc->r13); | ||
184 | err |= __put_user(regs->r14, &sc->r14); | ||
185 | err |= __put_user(regs->r15, &sc->r15); | ||
186 | err |= __put_user(me->thread.trap_no, &sc->trapno); | ||
187 | err |= __put_user(me->thread.error_code, &sc->err); | ||
188 | err |= __put_user(regs->rip, &sc->rip); | ||
189 | err |= __put_user(regs->eflags, &sc->eflags); | ||
190 | err |= __put_user(mask, &sc->oldmask); | ||
191 | err |= __put_user(me->thread.cr2, &sc->cr2); | ||
192 | |||
193 | return err; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Determine which stack to use.. | ||
198 | */ | ||
199 | |||
200 | static void __user * | ||
201 | get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) | ||
202 | { | ||
203 | unsigned long rsp; | ||
204 | |||
205 | /* Default to using normal stack - redzone*/ | ||
206 | rsp = regs->rsp - 128; | ||
207 | |||
208 | /* This is the X/Open sanctioned signal stack switching. */ | ||
209 | if (ka->sa.sa_flags & SA_ONSTACK) { | ||
210 | if (sas_ss_flags(rsp) == 0) | ||
211 | rsp = current->sas_ss_sp + current->sas_ss_size; | ||
212 | } | ||
213 | |||
214 | return (void __user *)round_down(rsp - size, 16); | ||
215 | } | ||
216 | |||
217 | static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
218 | sigset_t *set, struct pt_regs * regs) | ||
219 | { | ||
220 | struct rt_sigframe __user *frame; | ||
221 | struct _fpstate __user *fp = NULL; | ||
222 | int err = 0; | ||
223 | struct task_struct *me = current; | ||
224 | |||
225 | if (used_math()) { | ||
226 | fp = get_stack(ka, regs, sizeof(struct _fpstate)); | ||
227 | frame = (void __user *)round_down( | ||
228 | (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; | ||
229 | |||
230 | if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) | ||
231 | goto give_sigsegv; | ||
232 | |||
233 | if (save_i387(fp) < 0) | ||
234 | err |= -1; | ||
235 | } else | ||
236 | frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; | ||
237 | |||
238 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
239 | goto give_sigsegv; | ||
240 | |||
241 | if (ka->sa.sa_flags & SA_SIGINFO) { | ||
242 | err |= copy_siginfo_to_user(&frame->info, info); | ||
243 | if (err) | ||
244 | goto give_sigsegv; | ||
245 | } | ||
246 | |||
247 | /* Create the ucontext. */ | ||
248 | err |= __put_user(0, &frame->uc.uc_flags); | ||
249 | err |= __put_user(0, &frame->uc.uc_link); | ||
250 | err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | ||
251 | err |= __put_user(sas_ss_flags(regs->rsp), | ||
252 | &frame->uc.uc_stack.ss_flags); | ||
253 | err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
254 | err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); | ||
255 | err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); | ||
256 | if (sizeof(*set) == 16) { | ||
257 | __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); | ||
258 | __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); | ||
259 | } else | ||
260 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | ||
261 | |||
262 | /* Set up to return from userspace. If provided, use a stub | ||
263 | already in userspace. */ | ||
264 | /* x86-64 should always use SA_RESTORER. */ | ||
265 | if (ka->sa.sa_flags & SA_RESTORER) { | ||
266 | err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); | ||
267 | } else { | ||
268 | /* could use a vstub here */ | ||
269 | goto give_sigsegv; | ||
270 | } | ||
271 | |||
272 | if (err) | ||
273 | goto give_sigsegv; | ||
274 | |||
275 | #ifdef DEBUG_SIG | ||
276 | printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); | ||
277 | #endif | ||
278 | |||
279 | /* Set up registers for signal handler */ | ||
280 | regs->rdi = sig; | ||
281 | /* In case the signal handler was declared without prototypes */ | ||
282 | regs->rax = 0; | ||
283 | |||
284 | /* This also works for non SA_SIGINFO handlers because they expect the | ||
285 | next argument after the signal number on the stack. */ | ||
286 | regs->rsi = (unsigned long)&frame->info; | ||
287 | regs->rdx = (unsigned long)&frame->uc; | ||
288 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
289 | |||
290 | regs->rsp = (unsigned long)frame; | ||
291 | |||
292 | /* Set up the CS register to run signal handlers in 64-bit mode, | ||
293 | even if the handler happens to be interrupting 32-bit code. */ | ||
294 | regs->cs = __USER_CS; | ||
295 | |||
296 | /* This, by contrast, has nothing to do with segment registers - | ||
297 | see include/asm-x86_64/uaccess.h for details. */ | ||
298 | set_fs(USER_DS); | ||
299 | |||
300 | regs->eflags &= ~TF_MASK; | ||
301 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
302 | ptrace_notify(SIGTRAP); | ||
303 | #ifdef DEBUG_SIG | ||
304 | printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", | ||
305 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
306 | #endif | ||
307 | |||
308 | return 0; | ||
309 | |||
310 | give_sigsegv: | ||
311 | force_sigsegv(sig, current); | ||
312 | return -EFAULT; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * OK, we're invoking a handler | ||
317 | */ | ||
318 | |||
319 | static int | ||
320 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | ||
321 | sigset_t *oldset, struct pt_regs *regs) | ||
322 | { | ||
323 | int ret; | ||
324 | |||
325 | #ifdef DEBUG_SIG | ||
326 | printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", | ||
327 | current->pid, sig, | ||
328 | regs->rip, regs->rsp, regs); | ||
329 | #endif | ||
330 | |||
331 | /* Are we from a system call? */ | ||
332 | if ((long)regs->orig_rax >= 0) { | ||
333 | /* If so, check system call restarting.. */ | ||
334 | switch (regs->rax) { | ||
335 | case -ERESTART_RESTARTBLOCK: | ||
336 | case -ERESTARTNOHAND: | ||
337 | regs->rax = -EINTR; | ||
338 | break; | ||
339 | |||
340 | case -ERESTARTSYS: | ||
341 | if (!(ka->sa.sa_flags & SA_RESTART)) { | ||
342 | regs->rax = -EINTR; | ||
343 | break; | ||
344 | } | ||
345 | /* fallthrough */ | ||
346 | case -ERESTARTNOINTR: | ||
347 | regs->rax = regs->orig_rax; | ||
348 | regs->rip -= 2; | ||
349 | break; | ||
350 | } | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * If TF is set due to a debugger (PT_DTRACE), clear the TF | ||
355 | * flag so that register information in the sigcontext is | ||
356 | * correct. | ||
357 | */ | ||
358 | if (unlikely(regs->eflags & TF_MASK)) { | ||
359 | if (likely(current->ptrace & PT_DTRACE)) { | ||
360 | current->ptrace &= ~PT_DTRACE; | ||
361 | regs->eflags &= ~TF_MASK; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | #ifdef CONFIG_IA32_EMULATION | ||
366 | if (test_thread_flag(TIF_IA32)) { | ||
367 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
368 | ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); | ||
369 | else | ||
370 | ret = ia32_setup_frame(sig, ka, oldset, regs); | ||
371 | } else | ||
372 | #endif | ||
373 | ret = setup_rt_frame(sig, ka, info, oldset, regs); | ||
374 | |||
375 | if (ret == 0) { | ||
376 | spin_lock_irq(¤t->sighand->siglock); | ||
377 | sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); | ||
378 | if (!(ka->sa.sa_flags & SA_NODEFER)) | ||
379 | sigaddset(¤t->blocked,sig); | ||
380 | recalc_sigpending(); | ||
381 | spin_unlock_irq(¤t->sighand->siglock); | ||
382 | } | ||
383 | |||
384 | return ret; | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | * Note that 'init' is a special process: it doesn't get signals it doesn't | ||
389 | * want to handle. Thus you cannot kill init even with a SIGKILL even by | ||
390 | * mistake. | ||
391 | */ | ||
392 | static void do_signal(struct pt_regs *regs) | ||
393 | { | ||
394 | struct k_sigaction ka; | ||
395 | siginfo_t info; | ||
396 | int signr; | ||
397 | sigset_t *oldset; | ||
398 | |||
399 | /* | ||
400 | * We want the common case to go fast, which | ||
401 | * is why we may in certain cases get here from | ||
402 | * kernel mode. Just return without doing anything | ||
403 | * if so. | ||
404 | */ | ||
405 | if (!user_mode(regs)) | ||
406 | return; | ||
407 | |||
408 | if (test_thread_flag(TIF_RESTORE_SIGMASK)) | ||
409 | oldset = ¤t->saved_sigmask; | ||
410 | else | ||
411 | oldset = ¤t->blocked; | ||
412 | |||
413 | signr = get_signal_to_deliver(&info, &ka, regs, NULL); | ||
414 | if (signr > 0) { | ||
415 | /* Reenable any watchpoints before delivering the | ||
416 | * signal to user space. The processor register will | ||
417 | * have been cleared if the watchpoint triggered | ||
418 | * inside the kernel. | ||
419 | */ | ||
420 | if (current->thread.debugreg7) | ||
421 | set_debugreg(current->thread.debugreg7, 7); | ||
422 | |||
423 | /* Whee! Actually deliver the signal. */ | ||
424 | if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { | ||
425 | /* a signal was successfully delivered; the saved | ||
426 | * sigmask will have been stored in the signal frame, | ||
427 | * and will be restored by sigreturn, so we can simply | ||
428 | * clear the TIF_RESTORE_SIGMASK flag */ | ||
429 | clear_thread_flag(TIF_RESTORE_SIGMASK); | ||
430 | } | ||
431 | return; | ||
432 | } | ||
433 | |||
434 | /* Did we come from a system call? */ | ||
435 | if ((long)regs->orig_rax >= 0) { | ||
436 | /* Restart the system call - no handlers present */ | ||
437 | long res = regs->rax; | ||
438 | switch (res) { | ||
439 | case -ERESTARTNOHAND: | ||
440 | case -ERESTARTSYS: | ||
441 | case -ERESTARTNOINTR: | ||
442 | regs->rax = regs->orig_rax; | ||
443 | regs->rip -= 2; | ||
444 | break; | ||
445 | case -ERESTART_RESTARTBLOCK: | ||
446 | regs->rax = test_thread_flag(TIF_IA32) ? | ||
447 | __NR_ia32_restart_syscall : | ||
448 | __NR_restart_syscall; | ||
449 | regs->rip -= 2; | ||
450 | break; | ||
451 | } | ||
452 | } | ||
453 | |||
454 | /* if there's no signal to deliver, we just put the saved sigmask | ||
455 | back. */ | ||
456 | if (test_thread_flag(TIF_RESTORE_SIGMASK)) { | ||
457 | clear_thread_flag(TIF_RESTORE_SIGMASK); | ||
458 | sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); | ||
459 | } | ||
460 | } | ||
461 | |||
462 | void | ||
463 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | ||
464 | { | ||
465 | #ifdef DEBUG_SIG | ||
466 | printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", | ||
467 | thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); | ||
468 | #endif | ||
469 | |||
470 | /* Pending single-step? */ | ||
471 | if (thread_info_flags & _TIF_SINGLESTEP) { | ||
472 | regs->eflags |= TF_MASK; | ||
473 | clear_thread_flag(TIF_SINGLESTEP); | ||
474 | } | ||
475 | |||
476 | #ifdef CONFIG_X86_MCE | ||
477 | /* notify userspace of pending MCEs */ | ||
478 | if (thread_info_flags & _TIF_MCE_NOTIFY) | ||
479 | mce_notify_user(); | ||
480 | #endif /* CONFIG_X86_MCE */ | ||
481 | |||
482 | /* deal with pending signal delivery */ | ||
483 | if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) | ||
484 | do_signal(regs); | ||
485 | } | ||
486 | |||
487 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | ||
488 | { | ||
489 | struct task_struct *me = current; | ||
490 | if (show_unhandled_signals && printk_ratelimit()) | ||
491 | printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", | ||
492 | me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); | ||
493 | |||
494 | force_sig(SIGSEGV, me); | ||
495 | } | ||
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c deleted file mode 100644 index df4a82812adb..000000000000 --- a/arch/x86_64/kernel/smp.c +++ /dev/null | |||
@@ -1,523 +0,0 @@ | |||
1 | /* | ||
2 | * Intel SMP support routines. | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * (c) 2002,2003 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * This code is released under the GNU General Public License version 2 or | ||
9 | * later. | ||
10 | */ | ||
11 | |||
12 | #include <linux/init.h> | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/delay.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/smp.h> | ||
18 | #include <linux/kernel_stat.h> | ||
19 | #include <linux/mc146818rtc.h> | ||
20 | #include <linux/interrupt.h> | ||
21 | |||
22 | #include <asm/mtrr.h> | ||
23 | #include <asm/pgalloc.h> | ||
24 | #include <asm/tlbflush.h> | ||
25 | #include <asm/mach_apic.h> | ||
26 | #include <asm/mmu_context.h> | ||
27 | #include <asm/proto.h> | ||
28 | #include <asm/apicdef.h> | ||
29 | #include <asm/idle.h> | ||
30 | |||
31 | /* | ||
32 | * Smarter SMP flushing macros. | ||
33 | * c/o Linus Torvalds. | ||
34 | * | ||
35 | * These mean you can really definitely utterly forget about | ||
36 | * writing to user space from interrupts. (Its not allowed anyway). | ||
37 | * | ||
38 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | ||
39 | * | ||
40 | * More scalable flush, from Andi Kleen | ||
41 | * | ||
42 | * To avoid global state use 8 different call vectors. | ||
43 | * Each CPU uses a specific vector to trigger flushes on other | ||
44 | * CPUs. Depending on the received vector the target CPUs look into | ||
45 | * the right per cpu variable for the flush data. | ||
46 | * | ||
47 | * With more than 8 CPUs they are hashed to the 8 available | ||
48 | * vectors. The limited global vector space forces us to this right now. | ||
49 | * In future when interrupts are split into per CPU domains this could be | ||
50 | * fixed, at the cost of triggering multiple IPIs in some cases. | ||
51 | */ | ||
52 | |||
53 | union smp_flush_state { | ||
54 | struct { | ||
55 | cpumask_t flush_cpumask; | ||
56 | struct mm_struct *flush_mm; | ||
57 | unsigned long flush_va; | ||
58 | #define FLUSH_ALL -1ULL | ||
59 | spinlock_t tlbstate_lock; | ||
60 | }; | ||
61 | char pad[SMP_CACHE_BYTES]; | ||
62 | } ____cacheline_aligned; | ||
63 | |||
64 | /* State is put into the per CPU data section, but padded | ||
65 | to a full cache line because other CPUs can access it and we don't | ||
66 | want false sharing in the per cpu data segment. */ | ||
67 | static DEFINE_PER_CPU(union smp_flush_state, flush_state); | ||
68 | |||
69 | /* | ||
70 | * We cannot call mmdrop() because we are in interrupt context, | ||
71 | * instead update mm->cpu_vm_mask. | ||
72 | */ | ||
73 | static inline void leave_mm(int cpu) | ||
74 | { | ||
75 | if (read_pda(mmu_state) == TLBSTATE_OK) | ||
76 | BUG(); | ||
77 | cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); | ||
78 | load_cr3(swapper_pg_dir); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * | ||
83 | * The flush IPI assumes that a thread switch happens in this order: | ||
84 | * [cpu0: the cpu that switches] | ||
85 | * 1) switch_mm() either 1a) or 1b) | ||
86 | * 1a) thread switch to a different mm | ||
87 | * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | ||
88 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
89 | * the other cpus, but smp_invalidate_interrupt ignore flush ipis | ||
90 | * for the wrong mm, and in the worst case we perform a superfluous | ||
91 | * tlb flush. | ||
92 | * 1a2) set cpu mmu_state to TLBSTATE_OK | ||
93 | * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | ||
94 | * was in lazy tlb mode. | ||
95 | * 1a3) update cpu active_mm | ||
96 | * Now cpu0 accepts tlb flushes for the new mm. | ||
97 | * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | ||
98 | * Now the other cpus will send tlb flush ipis. | ||
99 | * 1a4) change cr3. | ||
100 | * 1b) thread switch without mm change | ||
101 | * cpu active_mm is correct, cpu0 already handles | ||
102 | * flush ipis. | ||
103 | * 1b1) set cpu mmu_state to TLBSTATE_OK | ||
104 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
105 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
106 | * and test the bit. | ||
107 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
108 | * 2) switch %%esp, ie current | ||
109 | * | ||
110 | * The interrupt must handle 2 special cases: | ||
111 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
112 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
113 | * runs in kernel space, the cpu could load tlb entries for user space | ||
114 | * pages. | ||
115 | * | ||
116 | * The good news is that cpu mmu_state is local to each cpu, no | ||
117 | * write/read ordering problems. | ||
118 | */ | ||
119 | |||
120 | /* | ||
121 | * TLB flush IPI: | ||
122 | * | ||
123 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
124 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
125 | * | ||
126 | * Interrupts are disabled. | ||
127 | */ | ||
128 | |||
129 | asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) | ||
130 | { | ||
131 | int cpu; | ||
132 | int sender; | ||
133 | union smp_flush_state *f; | ||
134 | |||
135 | cpu = smp_processor_id(); | ||
136 | /* | ||
137 | * orig_rax contains the negated interrupt vector. | ||
138 | * Use that to determine where the sender put the data. | ||
139 | */ | ||
140 | sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; | ||
141 | f = &per_cpu(flush_state, sender); | ||
142 | |||
143 | if (!cpu_isset(cpu, f->flush_cpumask)) | ||
144 | goto out; | ||
145 | /* | ||
146 | * This was a BUG() but until someone can quote me the | ||
147 | * line from the intel manual that guarantees an IPI to | ||
148 | * multiple CPUs is retried _only_ on the erroring CPUs | ||
149 | * its staying as a return | ||
150 | * | ||
151 | * BUG(); | ||
152 | */ | ||
153 | |||
154 | if (f->flush_mm == read_pda(active_mm)) { | ||
155 | if (read_pda(mmu_state) == TLBSTATE_OK) { | ||
156 | if (f->flush_va == FLUSH_ALL) | ||
157 | local_flush_tlb(); | ||
158 | else | ||
159 | __flush_tlb_one(f->flush_va); | ||
160 | } else | ||
161 | leave_mm(cpu); | ||
162 | } | ||
163 | out: | ||
164 | ack_APIC_irq(); | ||
165 | cpu_clear(cpu, f->flush_cpumask); | ||
166 | } | ||
167 | |||
168 | static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | ||
169 | unsigned long va) | ||
170 | { | ||
171 | int sender; | ||
172 | union smp_flush_state *f; | ||
173 | |||
174 | /* Caller has disabled preemption */ | ||
175 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | ||
176 | f = &per_cpu(flush_state, sender); | ||
177 | |||
178 | /* Could avoid this lock when | ||
179 | num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | ||
180 | probably not worth checking this for a cache-hot lock. */ | ||
181 | spin_lock(&f->tlbstate_lock); | ||
182 | |||
183 | f->flush_mm = mm; | ||
184 | f->flush_va = va; | ||
185 | cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); | ||
186 | |||
187 | /* | ||
188 | * We have to send the IPI only to | ||
189 | * CPUs affected. | ||
190 | */ | ||
191 | send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); | ||
192 | |||
193 | while (!cpus_empty(f->flush_cpumask)) | ||
194 | cpu_relax(); | ||
195 | |||
196 | f->flush_mm = NULL; | ||
197 | f->flush_va = 0; | ||
198 | spin_unlock(&f->tlbstate_lock); | ||
199 | } | ||
200 | |||
201 | int __cpuinit init_smp_flush(void) | ||
202 | { | ||
203 | int i; | ||
204 | for_each_cpu_mask(i, cpu_possible_map) { | ||
205 | spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); | ||
206 | } | ||
207 | return 0; | ||
208 | } | ||
209 | |||
210 | core_initcall(init_smp_flush); | ||
211 | |||
212 | void flush_tlb_current_task(void) | ||
213 | { | ||
214 | struct mm_struct *mm = current->mm; | ||
215 | cpumask_t cpu_mask; | ||
216 | |||
217 | preempt_disable(); | ||
218 | cpu_mask = mm->cpu_vm_mask; | ||
219 | cpu_clear(smp_processor_id(), cpu_mask); | ||
220 | |||
221 | local_flush_tlb(); | ||
222 | if (!cpus_empty(cpu_mask)) | ||
223 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
224 | preempt_enable(); | ||
225 | } | ||
226 | EXPORT_SYMBOL(flush_tlb_current_task); | ||
227 | |||
228 | void flush_tlb_mm (struct mm_struct * mm) | ||
229 | { | ||
230 | cpumask_t cpu_mask; | ||
231 | |||
232 | preempt_disable(); | ||
233 | cpu_mask = mm->cpu_vm_mask; | ||
234 | cpu_clear(smp_processor_id(), cpu_mask); | ||
235 | |||
236 | if (current->active_mm == mm) { | ||
237 | if (current->mm) | ||
238 | local_flush_tlb(); | ||
239 | else | ||
240 | leave_mm(smp_processor_id()); | ||
241 | } | ||
242 | if (!cpus_empty(cpu_mask)) | ||
243 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
244 | |||
245 | preempt_enable(); | ||
246 | } | ||
247 | EXPORT_SYMBOL(flush_tlb_mm); | ||
248 | |||
249 | void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | ||
250 | { | ||
251 | struct mm_struct *mm = vma->vm_mm; | ||
252 | cpumask_t cpu_mask; | ||
253 | |||
254 | preempt_disable(); | ||
255 | cpu_mask = mm->cpu_vm_mask; | ||
256 | cpu_clear(smp_processor_id(), cpu_mask); | ||
257 | |||
258 | if (current->active_mm == mm) { | ||
259 | if(current->mm) | ||
260 | __flush_tlb_one(va); | ||
261 | else | ||
262 | leave_mm(smp_processor_id()); | ||
263 | } | ||
264 | |||
265 | if (!cpus_empty(cpu_mask)) | ||
266 | flush_tlb_others(cpu_mask, mm, va); | ||
267 | |||
268 | preempt_enable(); | ||
269 | } | ||
270 | EXPORT_SYMBOL(flush_tlb_page); | ||
271 | |||
272 | static void do_flush_tlb_all(void* info) | ||
273 | { | ||
274 | unsigned long cpu = smp_processor_id(); | ||
275 | |||
276 | __flush_tlb_all(); | ||
277 | if (read_pda(mmu_state) == TLBSTATE_LAZY) | ||
278 | leave_mm(cpu); | ||
279 | } | ||
280 | |||
281 | void flush_tlb_all(void) | ||
282 | { | ||
283 | on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * this function sends a 'reschedule' IPI to another CPU. | ||
288 | * it goes straight through and wastes no time serializing | ||
289 | * anything. Worst case is that we lose a reschedule ... | ||
290 | */ | ||
291 | |||
292 | void smp_send_reschedule(int cpu) | ||
293 | { | ||
294 | send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Structure and data for smp_call_function(). This is designed to minimise | ||
299 | * static memory requirements. It also looks cleaner. | ||
300 | */ | ||
301 | static DEFINE_SPINLOCK(call_lock); | ||
302 | |||
303 | struct call_data_struct { | ||
304 | void (*func) (void *info); | ||
305 | void *info; | ||
306 | atomic_t started; | ||
307 | atomic_t finished; | ||
308 | int wait; | ||
309 | }; | ||
310 | |||
311 | static struct call_data_struct * call_data; | ||
312 | |||
313 | void lock_ipi_call_lock(void) | ||
314 | { | ||
315 | spin_lock_irq(&call_lock); | ||
316 | } | ||
317 | |||
318 | void unlock_ipi_call_lock(void) | ||
319 | { | ||
320 | spin_unlock_irq(&call_lock); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * this function sends a 'generic call function' IPI to one other CPU | ||
325 | * in the system. | ||
326 | * | ||
327 | * cpu is a standard Linux logical CPU number. | ||
328 | */ | ||
329 | static void | ||
330 | __smp_call_function_single(int cpu, void (*func) (void *info), void *info, | ||
331 | int nonatomic, int wait) | ||
332 | { | ||
333 | struct call_data_struct data; | ||
334 | int cpus = 1; | ||
335 | |||
336 | data.func = func; | ||
337 | data.info = info; | ||
338 | atomic_set(&data.started, 0); | ||
339 | data.wait = wait; | ||
340 | if (wait) | ||
341 | atomic_set(&data.finished, 0); | ||
342 | |||
343 | call_data = &data; | ||
344 | wmb(); | ||
345 | /* Send a message to all other CPUs and wait for them to respond */ | ||
346 | send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); | ||
347 | |||
348 | /* Wait for response */ | ||
349 | while (atomic_read(&data.started) != cpus) | ||
350 | cpu_relax(); | ||
351 | |||
352 | if (!wait) | ||
353 | return; | ||
354 | |||
355 | while (atomic_read(&data.finished) != cpus) | ||
356 | cpu_relax(); | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | * smp_call_function_single - Run a function on a specific CPU | ||
361 | * @func: The function to run. This must be fast and non-blocking. | ||
362 | * @info: An arbitrary pointer to pass to the function. | ||
363 | * @nonatomic: Currently unused. | ||
364 | * @wait: If true, wait until function has completed on other CPUs. | ||
365 | * | ||
366 | * Retrurns 0 on success, else a negative status code. | ||
367 | * | ||
368 | * Does not return until the remote CPU is nearly ready to execute <func> | ||
369 | * or is or has executed. | ||
370 | */ | ||
371 | |||
372 | int smp_call_function_single (int cpu, void (*func) (void *info), void *info, | ||
373 | int nonatomic, int wait) | ||
374 | { | ||
375 | /* prevent preemption and reschedule on another processor */ | ||
376 | int me = get_cpu(); | ||
377 | |||
378 | /* Can deadlock when called with interrupts disabled */ | ||
379 | WARN_ON(irqs_disabled()); | ||
380 | |||
381 | if (cpu == me) { | ||
382 | local_irq_disable(); | ||
383 | func(info); | ||
384 | local_irq_enable(); | ||
385 | put_cpu(); | ||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | spin_lock(&call_lock); | ||
390 | __smp_call_function_single(cpu, func, info, nonatomic, wait); | ||
391 | spin_unlock(&call_lock); | ||
392 | put_cpu(); | ||
393 | return 0; | ||
394 | } | ||
395 | EXPORT_SYMBOL(smp_call_function_single); | ||
396 | |||
397 | /* | ||
398 | * this function sends a 'generic call function' IPI to all other CPUs | ||
399 | * in the system. | ||
400 | */ | ||
401 | static void __smp_call_function (void (*func) (void *info), void *info, | ||
402 | int nonatomic, int wait) | ||
403 | { | ||
404 | struct call_data_struct data; | ||
405 | int cpus = num_online_cpus()-1; | ||
406 | |||
407 | if (!cpus) | ||
408 | return; | ||
409 | |||
410 | data.func = func; | ||
411 | data.info = info; | ||
412 | atomic_set(&data.started, 0); | ||
413 | data.wait = wait; | ||
414 | if (wait) | ||
415 | atomic_set(&data.finished, 0); | ||
416 | |||
417 | call_data = &data; | ||
418 | wmb(); | ||
419 | /* Send a message to all other CPUs and wait for them to respond */ | ||
420 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | ||
421 | |||
422 | /* Wait for response */ | ||
423 | while (atomic_read(&data.started) != cpus) | ||
424 | cpu_relax(); | ||
425 | |||
426 | if (!wait) | ||
427 | return; | ||
428 | |||
429 | while (atomic_read(&data.finished) != cpus) | ||
430 | cpu_relax(); | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * smp_call_function - run a function on all other CPUs. | ||
435 | * @func: The function to run. This must be fast and non-blocking. | ||
436 | * @info: An arbitrary pointer to pass to the function. | ||
437 | * @nonatomic: currently unused. | ||
438 | * @wait: If true, wait (atomically) until function has completed on other | ||
439 | * CPUs. | ||
440 | * | ||
441 | * Returns 0 on success, else a negative status code. Does not return until | ||
442 | * remote CPUs are nearly ready to execute func or are or have executed. | ||
443 | * | ||
444 | * You must not call this function with disabled interrupts or from a | ||
445 | * hardware interrupt handler or from a bottom half handler. | ||
446 | * Actually there are a few legal cases, like panic. | ||
447 | */ | ||
448 | int smp_call_function (void (*func) (void *info), void *info, int nonatomic, | ||
449 | int wait) | ||
450 | { | ||
451 | spin_lock(&call_lock); | ||
452 | __smp_call_function(func,info,nonatomic,wait); | ||
453 | spin_unlock(&call_lock); | ||
454 | return 0; | ||
455 | } | ||
456 | EXPORT_SYMBOL(smp_call_function); | ||
457 | |||
458 | static void stop_this_cpu(void *dummy) | ||
459 | { | ||
460 | local_irq_disable(); | ||
461 | /* | ||
462 | * Remove this CPU: | ||
463 | */ | ||
464 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
465 | disable_local_APIC(); | ||
466 | for (;;) | ||
467 | halt(); | ||
468 | } | ||
469 | |||
470 | void smp_send_stop(void) | ||
471 | { | ||
472 | int nolock; | ||
473 | unsigned long flags; | ||
474 | |||
475 | if (reboot_force) | ||
476 | return; | ||
477 | |||
478 | /* Don't deadlock on the call lock in panic */ | ||
479 | nolock = !spin_trylock(&call_lock); | ||
480 | local_irq_save(flags); | ||
481 | __smp_call_function(stop_this_cpu, NULL, 0, 0); | ||
482 | if (!nolock) | ||
483 | spin_unlock(&call_lock); | ||
484 | disable_local_APIC(); | ||
485 | local_irq_restore(flags); | ||
486 | } | ||
487 | |||
488 | /* | ||
489 | * Reschedule call back. Nothing to do, | ||
490 | * all the work is done automatically when | ||
491 | * we return from the interrupt. | ||
492 | */ | ||
493 | asmlinkage void smp_reschedule_interrupt(void) | ||
494 | { | ||
495 | ack_APIC_irq(); | ||
496 | } | ||
497 | |||
498 | asmlinkage void smp_call_function_interrupt(void) | ||
499 | { | ||
500 | void (*func) (void *info) = call_data->func; | ||
501 | void *info = call_data->info; | ||
502 | int wait = call_data->wait; | ||
503 | |||
504 | ack_APIC_irq(); | ||
505 | /* | ||
506 | * Notify initiating CPU that I've grabbed the data and am | ||
507 | * about to execute the function | ||
508 | */ | ||
509 | mb(); | ||
510 | atomic_inc(&call_data->started); | ||
511 | /* | ||
512 | * At this point the info structure may be out of scope unless wait==1 | ||
513 | */ | ||
514 | exit_idle(); | ||
515 | irq_enter(); | ||
516 | (*func)(info); | ||
517 | irq_exit(); | ||
518 | if (wait) { | ||
519 | mb(); | ||
520 | atomic_inc(&call_data->finished); | ||
521 | } | ||
522 | } | ||
523 | |||
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c deleted file mode 100644 index 32f50783edc8..000000000000 --- a/arch/x86_64/kernel/smpboot.c +++ /dev/null | |||
@@ -1,1085 +0,0 @@ | |||
1 | /* | ||
2 | * x86 SMP booting functions | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright 2001 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * Much of the core SMP work is based on previous work by Thomas Radke, to | ||
9 | * whom a great many thanks are extended. | ||
10 | * | ||
11 | * Thanks to Intel for making available several different Pentium, | ||
12 | * Pentium Pro and Pentium-II/Xeon MP machines. | ||
13 | * Original development of Linux SMP code supported by Caldera. | ||
14 | * | ||
15 | * This code is released under the GNU General Public License version 2 | ||
16 | * | ||
17 | * Fixes | ||
18 | * Felix Koop : NR_CPUS used properly | ||
19 | * Jose Renau : Handle single CPU case. | ||
20 | * Alan Cox : By repeated request 8) - Total BogoMIP report. | ||
21 | * Greg Wright : Fix for kernel stacks panic. | ||
22 | * Erich Boleyn : MP v1.4 and additional changes. | ||
23 | * Matthias Sattler : Changes for 2.1 kernel map. | ||
24 | * Michel Lespinasse : Changes for 2.1 kernel map. | ||
25 | * Michael Chastain : Change trampoline.S to gnu as. | ||
26 | * Alan Cox : Dumb bug: 'B' step PPro's are fine | ||
27 | * Ingo Molnar : Added APIC timers, based on code | ||
28 | * from Jose Renau | ||
29 | * Ingo Molnar : various cleanups and rewrites | ||
30 | * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. | ||
31 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs | ||
32 | * Andi Kleen : Changed for SMP boot into long mode. | ||
33 | * Rusty Russell : Hacked into shape for new "hotplug" boot process. | ||
34 | * Andi Kleen : Converted to new state machine. | ||
35 | * Various cleanups. | ||
36 | * Probably mostly hotplug CPU ready now. | ||
37 | * Ashok Raj : CPU hotplug support | ||
38 | */ | ||
39 | |||
40 | |||
41 | #include <linux/init.h> | ||
42 | |||
43 | #include <linux/mm.h> | ||
44 | #include <linux/kernel_stat.h> | ||
45 | #include <linux/bootmem.h> | ||
46 | #include <linux/thread_info.h> | ||
47 | #include <linux/module.h> | ||
48 | #include <linux/delay.h> | ||
49 | #include <linux/mc146818rtc.h> | ||
50 | #include <linux/smp.h> | ||
51 | #include <linux/kdebug.h> | ||
52 | |||
53 | #include <asm/mtrr.h> | ||
54 | #include <asm/pgalloc.h> | ||
55 | #include <asm/desc.h> | ||
56 | #include <asm/tlbflush.h> | ||
57 | #include <asm/proto.h> | ||
58 | #include <asm/nmi.h> | ||
59 | #include <asm/irq.h> | ||
60 | #include <asm/hw_irq.h> | ||
61 | #include <asm/numa.h> | ||
62 | |||
63 | /* Number of siblings per CPU package */ | ||
64 | int smp_num_siblings = 1; | ||
65 | EXPORT_SYMBOL(smp_num_siblings); | ||
66 | |||
67 | /* Last level cache ID of each logical CPU */ | ||
68 | u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; | ||
69 | |||
70 | /* Bitmask of currently online CPUs */ | ||
71 | cpumask_t cpu_online_map __read_mostly; | ||
72 | |||
73 | EXPORT_SYMBOL(cpu_online_map); | ||
74 | |||
75 | /* | ||
76 | * Private maps to synchronize booting between AP and BP. | ||
77 | * Probably not needed anymore, but it makes for easier debugging. -AK | ||
78 | */ | ||
79 | cpumask_t cpu_callin_map; | ||
80 | cpumask_t cpu_callout_map; | ||
81 | EXPORT_SYMBOL(cpu_callout_map); | ||
82 | |||
83 | cpumask_t cpu_possible_map; | ||
84 | EXPORT_SYMBOL(cpu_possible_map); | ||
85 | |||
86 | /* Per CPU bogomips and other parameters */ | ||
87 | struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; | ||
88 | EXPORT_SYMBOL(cpu_data); | ||
89 | |||
90 | /* Set when the idlers are all forked */ | ||
91 | int smp_threads_ready; | ||
92 | |||
93 | /* representing HT siblings of each logical CPU */ | ||
94 | cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; | ||
95 | EXPORT_SYMBOL(cpu_sibling_map); | ||
96 | |||
97 | /* representing HT and core siblings of each logical CPU */ | ||
98 | cpumask_t cpu_core_map[NR_CPUS] __read_mostly; | ||
99 | EXPORT_SYMBOL(cpu_core_map); | ||
100 | |||
101 | /* | ||
102 | * Trampoline 80x86 program as an array. | ||
103 | */ | ||
104 | |||
105 | extern unsigned char trampoline_data[]; | ||
106 | extern unsigned char trampoline_end[]; | ||
107 | |||
108 | /* State of each CPU */ | ||
109 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | ||
110 | |||
111 | /* | ||
112 | * Store all idle threads, this can be reused instead of creating | ||
113 | * a new thread. Also avoids complicated thread destroy functionality | ||
114 | * for idle threads. | ||
115 | */ | ||
116 | struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; | ||
117 | |||
118 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) | ||
119 | #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) | ||
120 | |||
121 | /* | ||
122 | * Currently trivial. Write the real->protected mode | ||
123 | * bootstrap into the page concerned. The caller | ||
124 | * has made sure it's suitably aligned. | ||
125 | */ | ||
126 | |||
127 | static unsigned long __cpuinit setup_trampoline(void) | ||
128 | { | ||
129 | void *tramp = __va(SMP_TRAMPOLINE_BASE); | ||
130 | memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); | ||
131 | return virt_to_phys(tramp); | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * The bootstrap kernel entry code has set these up. Save them for | ||
136 | * a given CPU | ||
137 | */ | ||
138 | |||
139 | static void __cpuinit smp_store_cpu_info(int id) | ||
140 | { | ||
141 | struct cpuinfo_x86 *c = cpu_data + id; | ||
142 | |||
143 | *c = boot_cpu_data; | ||
144 | identify_cpu(c); | ||
145 | print_cpu_info(c); | ||
146 | } | ||
147 | |||
148 | static atomic_t init_deasserted __cpuinitdata; | ||
149 | |||
150 | /* | ||
151 | * Report back to the Boot Processor. | ||
152 | * Running on AP. | ||
153 | */ | ||
154 | void __cpuinit smp_callin(void) | ||
155 | { | ||
156 | int cpuid, phys_id; | ||
157 | unsigned long timeout; | ||
158 | |||
159 | /* | ||
160 | * If waken up by an INIT in an 82489DX configuration | ||
161 | * we may get here before an INIT-deassert IPI reaches | ||
162 | * our local APIC. We have to wait for the IPI or we'll | ||
163 | * lock up on an APIC access. | ||
164 | */ | ||
165 | while (!atomic_read(&init_deasserted)) | ||
166 | cpu_relax(); | ||
167 | |||
168 | /* | ||
169 | * (This works even if the APIC is not enabled.) | ||
170 | */ | ||
171 | phys_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
172 | cpuid = smp_processor_id(); | ||
173 | if (cpu_isset(cpuid, cpu_callin_map)) { | ||
174 | panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", | ||
175 | phys_id, cpuid); | ||
176 | } | ||
177 | Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); | ||
178 | |||
179 | /* | ||
180 | * STARTUP IPIs are fragile beasts as they might sometimes | ||
181 | * trigger some glue motherboard logic. Complete APIC bus | ||
182 | * silence for 1 second, this overestimates the time the | ||
183 | * boot CPU is spending to send the up to 2 STARTUP IPIs | ||
184 | * by a factor of two. This should be enough. | ||
185 | */ | ||
186 | |||
187 | /* | ||
188 | * Waiting 2s total for startup (udelay is not yet working) | ||
189 | */ | ||
190 | timeout = jiffies + 2*HZ; | ||
191 | while (time_before(jiffies, timeout)) { | ||
192 | /* | ||
193 | * Has the boot CPU finished it's STARTUP sequence? | ||
194 | */ | ||
195 | if (cpu_isset(cpuid, cpu_callout_map)) | ||
196 | break; | ||
197 | cpu_relax(); | ||
198 | } | ||
199 | |||
200 | if (!time_before(jiffies, timeout)) { | ||
201 | panic("smp_callin: CPU%d started up but did not get a callout!\n", | ||
202 | cpuid); | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * the boot CPU has finished the init stage and is spinning | ||
207 | * on callin_map until we finish. We are free to set up this | ||
208 | * CPU, first the APIC. (this is probably redundant on most | ||
209 | * boards) | ||
210 | */ | ||
211 | |||
212 | Dprintk("CALLIN, before setup_local_APIC().\n"); | ||
213 | setup_local_APIC(); | ||
214 | |||
215 | /* | ||
216 | * Get our bogomips. | ||
217 | * | ||
218 | * Need to enable IRQs because it can take longer and then | ||
219 | * the NMI watchdog might kill us. | ||
220 | */ | ||
221 | local_irq_enable(); | ||
222 | calibrate_delay(); | ||
223 | local_irq_disable(); | ||
224 | Dprintk("Stack at about %p\n",&cpuid); | ||
225 | |||
226 | disable_APIC_timer(); | ||
227 | |||
228 | /* | ||
229 | * Save our processor parameters | ||
230 | */ | ||
231 | smp_store_cpu_info(cpuid); | ||
232 | |||
233 | /* | ||
234 | * Allow the master to continue. | ||
235 | */ | ||
236 | cpu_set(cpuid, cpu_callin_map); | ||
237 | } | ||
238 | |||
239 | /* maps the cpu to the sched domain representing multi-core */ | ||
240 | cpumask_t cpu_coregroup_map(int cpu) | ||
241 | { | ||
242 | struct cpuinfo_x86 *c = cpu_data + cpu; | ||
243 | /* | ||
244 | * For perf, we return last level cache shared map. | ||
245 | * And for power savings, we return cpu_core_map | ||
246 | */ | ||
247 | if (sched_mc_power_savings || sched_smt_power_savings) | ||
248 | return cpu_core_map[cpu]; | ||
249 | else | ||
250 | return c->llc_shared_map; | ||
251 | } | ||
252 | |||
253 | /* representing cpus for which sibling maps can be computed */ | ||
254 | static cpumask_t cpu_sibling_setup_map; | ||
255 | |||
256 | static inline void set_cpu_sibling_map(int cpu) | ||
257 | { | ||
258 | int i; | ||
259 | struct cpuinfo_x86 *c = cpu_data; | ||
260 | |||
261 | cpu_set(cpu, cpu_sibling_setup_map); | ||
262 | |||
263 | if (smp_num_siblings > 1) { | ||
264 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | ||
265 | if (c[cpu].phys_proc_id == c[i].phys_proc_id && | ||
266 | c[cpu].cpu_core_id == c[i].cpu_core_id) { | ||
267 | cpu_set(i, cpu_sibling_map[cpu]); | ||
268 | cpu_set(cpu, cpu_sibling_map[i]); | ||
269 | cpu_set(i, cpu_core_map[cpu]); | ||
270 | cpu_set(cpu, cpu_core_map[i]); | ||
271 | cpu_set(i, c[cpu].llc_shared_map); | ||
272 | cpu_set(cpu, c[i].llc_shared_map); | ||
273 | } | ||
274 | } | ||
275 | } else { | ||
276 | cpu_set(cpu, cpu_sibling_map[cpu]); | ||
277 | } | ||
278 | |||
279 | cpu_set(cpu, c[cpu].llc_shared_map); | ||
280 | |||
281 | if (current_cpu_data.x86_max_cores == 1) { | ||
282 | cpu_core_map[cpu] = cpu_sibling_map[cpu]; | ||
283 | c[cpu].booted_cores = 1; | ||
284 | return; | ||
285 | } | ||
286 | |||
287 | for_each_cpu_mask(i, cpu_sibling_setup_map) { | ||
288 | if (cpu_llc_id[cpu] != BAD_APICID && | ||
289 | cpu_llc_id[cpu] == cpu_llc_id[i]) { | ||
290 | cpu_set(i, c[cpu].llc_shared_map); | ||
291 | cpu_set(cpu, c[i].llc_shared_map); | ||
292 | } | ||
293 | if (c[cpu].phys_proc_id == c[i].phys_proc_id) { | ||
294 | cpu_set(i, cpu_core_map[cpu]); | ||
295 | cpu_set(cpu, cpu_core_map[i]); | ||
296 | /* | ||
297 | * Does this new cpu bringup a new core? | ||
298 | */ | ||
299 | if (cpus_weight(cpu_sibling_map[cpu]) == 1) { | ||
300 | /* | ||
301 | * for each core in package, increment | ||
302 | * the booted_cores for this new cpu | ||
303 | */ | ||
304 | if (first_cpu(cpu_sibling_map[i]) == i) | ||
305 | c[cpu].booted_cores++; | ||
306 | /* | ||
307 | * increment the core count for all | ||
308 | * the other cpus in this package | ||
309 | */ | ||
310 | if (i != cpu) | ||
311 | c[i].booted_cores++; | ||
312 | } else if (i != cpu && !c[cpu].booted_cores) | ||
313 | c[cpu].booted_cores = c[i].booted_cores; | ||
314 | } | ||
315 | } | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * Setup code on secondary processor (after comming out of the trampoline) | ||
320 | */ | ||
321 | void __cpuinit start_secondary(void) | ||
322 | { | ||
323 | /* | ||
324 | * Dont put anything before smp_callin(), SMP | ||
325 | * booting is too fragile that we want to limit the | ||
326 | * things done here to the most necessary things. | ||
327 | */ | ||
328 | cpu_init(); | ||
329 | preempt_disable(); | ||
330 | smp_callin(); | ||
331 | |||
332 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ | ||
333 | barrier(); | ||
334 | |||
335 | /* | ||
336 | * Check TSC sync first: | ||
337 | */ | ||
338 | check_tsc_sync_target(); | ||
339 | |||
340 | Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); | ||
341 | setup_secondary_APIC_clock(); | ||
342 | |||
343 | Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); | ||
344 | |||
345 | if (nmi_watchdog == NMI_IO_APIC) { | ||
346 | disable_8259A_irq(0); | ||
347 | enable_NMI_through_LVT0(NULL); | ||
348 | enable_8259A_irq(0); | ||
349 | } | ||
350 | |||
351 | enable_APIC_timer(); | ||
352 | |||
353 | /* | ||
354 | * The sibling maps must be set before turing the online map on for | ||
355 | * this cpu | ||
356 | */ | ||
357 | set_cpu_sibling_map(smp_processor_id()); | ||
358 | |||
359 | /* | ||
360 | * We need to hold call_lock, so there is no inconsistency | ||
361 | * between the time smp_call_function() determines number of | ||
362 | * IPI receipients, and the time when the determination is made | ||
363 | * for which cpus receive the IPI in genapic_flat.c. Holding this | ||
364 | * lock helps us to not include this cpu in a currently in progress | ||
365 | * smp_call_function(). | ||
366 | */ | ||
367 | lock_ipi_call_lock(); | ||
368 | spin_lock(&vector_lock); | ||
369 | |||
370 | /* Setup the per cpu irq handling data structures */ | ||
371 | __setup_vector_irq(smp_processor_id()); | ||
372 | /* | ||
373 | * Allow the master to continue. | ||
374 | */ | ||
375 | cpu_set(smp_processor_id(), cpu_online_map); | ||
376 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | ||
377 | spin_unlock(&vector_lock); | ||
378 | |||
379 | unlock_ipi_call_lock(); | ||
380 | |||
381 | cpu_idle(); | ||
382 | } | ||
383 | |||
384 | extern volatile unsigned long init_rsp; | ||
385 | extern void (*initial_code)(void); | ||
386 | |||
387 | #ifdef APIC_DEBUG | ||
388 | static void inquire_remote_apic(int apicid) | ||
389 | { | ||
390 | unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; | ||
391 | char *names[] = { "ID", "VERSION", "SPIV" }; | ||
392 | int timeout; | ||
393 | unsigned int status; | ||
394 | |||
395 | printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); | ||
396 | |||
397 | for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { | ||
398 | printk("... APIC #%d %s: ", apicid, names[i]); | ||
399 | |||
400 | /* | ||
401 | * Wait for idle. | ||
402 | */ | ||
403 | status = safe_apic_wait_icr_idle(); | ||
404 | if (status) | ||
405 | printk("a previous APIC delivery may have failed\n"); | ||
406 | |||
407 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); | ||
408 | apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); | ||
409 | |||
410 | timeout = 0; | ||
411 | do { | ||
412 | udelay(100); | ||
413 | status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; | ||
414 | } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); | ||
415 | |||
416 | switch (status) { | ||
417 | case APIC_ICR_RR_VALID: | ||
418 | status = apic_read(APIC_RRR); | ||
419 | printk("%08x\n", status); | ||
420 | break; | ||
421 | default: | ||
422 | printk("failed\n"); | ||
423 | } | ||
424 | } | ||
425 | } | ||
426 | #endif | ||
427 | |||
428 | /* | ||
429 | * Kick the secondary to wake up. | ||
430 | */ | ||
431 | static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) | ||
432 | { | ||
433 | unsigned long send_status, accept_status = 0; | ||
434 | int maxlvt, num_starts, j; | ||
435 | |||
436 | Dprintk("Asserting INIT.\n"); | ||
437 | |||
438 | /* | ||
439 | * Turn INIT on target chip | ||
440 | */ | ||
441 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
442 | |||
443 | /* | ||
444 | * Send IPI | ||
445 | */ | ||
446 | apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | ||
447 | | APIC_DM_INIT); | ||
448 | |||
449 | Dprintk("Waiting for send to finish...\n"); | ||
450 | send_status = safe_apic_wait_icr_idle(); | ||
451 | |||
452 | mdelay(10); | ||
453 | |||
454 | Dprintk("Deasserting INIT.\n"); | ||
455 | |||
456 | /* Target chip */ | ||
457 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
458 | |||
459 | /* Send IPI */ | ||
460 | apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); | ||
461 | |||
462 | Dprintk("Waiting for send to finish...\n"); | ||
463 | send_status = safe_apic_wait_icr_idle(); | ||
464 | |||
465 | mb(); | ||
466 | atomic_set(&init_deasserted, 1); | ||
467 | |||
468 | num_starts = 2; | ||
469 | |||
470 | /* | ||
471 | * Run STARTUP IPI loop. | ||
472 | */ | ||
473 | Dprintk("#startup loops: %d.\n", num_starts); | ||
474 | |||
475 | maxlvt = get_maxlvt(); | ||
476 | |||
477 | for (j = 1; j <= num_starts; j++) { | ||
478 | Dprintk("Sending STARTUP #%d.\n",j); | ||
479 | apic_write(APIC_ESR, 0); | ||
480 | apic_read(APIC_ESR); | ||
481 | Dprintk("After apic_write.\n"); | ||
482 | |||
483 | /* | ||
484 | * STARTUP IPI | ||
485 | */ | ||
486 | |||
487 | /* Target chip */ | ||
488 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
489 | |||
490 | /* Boot on the stack */ | ||
491 | /* Kick the second */ | ||
492 | apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); | ||
493 | |||
494 | /* | ||
495 | * Give the other CPU some time to accept the IPI. | ||
496 | */ | ||
497 | udelay(300); | ||
498 | |||
499 | Dprintk("Startup point 1.\n"); | ||
500 | |||
501 | Dprintk("Waiting for send to finish...\n"); | ||
502 | send_status = safe_apic_wait_icr_idle(); | ||
503 | |||
504 | /* | ||
505 | * Give the other CPU some time to accept the IPI. | ||
506 | */ | ||
507 | udelay(200); | ||
508 | /* | ||
509 | * Due to the Pentium erratum 3AP. | ||
510 | */ | ||
511 | if (maxlvt > 3) { | ||
512 | apic_write(APIC_ESR, 0); | ||
513 | } | ||
514 | accept_status = (apic_read(APIC_ESR) & 0xEF); | ||
515 | if (send_status || accept_status) | ||
516 | break; | ||
517 | } | ||
518 | Dprintk("After Startup.\n"); | ||
519 | |||
520 | if (send_status) | ||
521 | printk(KERN_ERR "APIC never delivered???\n"); | ||
522 | if (accept_status) | ||
523 | printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); | ||
524 | |||
525 | return (send_status | accept_status); | ||
526 | } | ||
527 | |||
528 | struct create_idle { | ||
529 | struct work_struct work; | ||
530 | struct task_struct *idle; | ||
531 | struct completion done; | ||
532 | int cpu; | ||
533 | }; | ||
534 | |||
535 | void do_fork_idle(struct work_struct *work) | ||
536 | { | ||
537 | struct create_idle *c_idle = | ||
538 | container_of(work, struct create_idle, work); | ||
539 | |||
540 | c_idle->idle = fork_idle(c_idle->cpu); | ||
541 | complete(&c_idle->done); | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * Boot one CPU. | ||
546 | */ | ||
547 | static int __cpuinit do_boot_cpu(int cpu, int apicid) | ||
548 | { | ||
549 | unsigned long boot_error; | ||
550 | int timeout; | ||
551 | unsigned long start_rip; | ||
552 | struct create_idle c_idle = { | ||
553 | .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), | ||
554 | .cpu = cpu, | ||
555 | .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), | ||
556 | }; | ||
557 | |||
558 | /* allocate memory for gdts of secondary cpus. Hotplug is considered */ | ||
559 | if (!cpu_gdt_descr[cpu].address && | ||
560 | !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { | ||
561 | printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); | ||
562 | return -1; | ||
563 | } | ||
564 | |||
565 | /* Allocate node local memory for AP pdas */ | ||
566 | if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { | ||
567 | struct x8664_pda *newpda, *pda; | ||
568 | int node = cpu_to_node(cpu); | ||
569 | pda = cpu_pda(cpu); | ||
570 | newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC, | ||
571 | node); | ||
572 | if (newpda) { | ||
573 | memcpy(newpda, pda, sizeof (struct x8664_pda)); | ||
574 | cpu_pda(cpu) = newpda; | ||
575 | } else | ||
576 | printk(KERN_ERR | ||
577 | "Could not allocate node local PDA for CPU %d on node %d\n", | ||
578 | cpu, node); | ||
579 | } | ||
580 | |||
581 | alternatives_smp_switch(1); | ||
582 | |||
583 | c_idle.idle = get_idle_for_cpu(cpu); | ||
584 | |||
585 | if (c_idle.idle) { | ||
586 | c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) | ||
587 | (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); | ||
588 | init_idle(c_idle.idle, cpu); | ||
589 | goto do_rest; | ||
590 | } | ||
591 | |||
592 | /* | ||
593 | * During cold boot process, keventd thread is not spun up yet. | ||
594 | * When we do cpu hot-add, we create idle threads on the fly, we should | ||
595 | * not acquire any attributes from the calling context. Hence the clean | ||
596 | * way to create kernel_threads() is to do that from keventd(). | ||
597 | * We do the current_is_keventd() due to the fact that ACPI notifier | ||
598 | * was also queuing to keventd() and when the caller is already running | ||
599 | * in context of keventd(), we would end up with locking up the keventd | ||
600 | * thread. | ||
601 | */ | ||
602 | if (!keventd_up() || current_is_keventd()) | ||
603 | c_idle.work.func(&c_idle.work); | ||
604 | else { | ||
605 | schedule_work(&c_idle.work); | ||
606 | wait_for_completion(&c_idle.done); | ||
607 | } | ||
608 | |||
609 | if (IS_ERR(c_idle.idle)) { | ||
610 | printk("failed fork for CPU %d\n", cpu); | ||
611 | return PTR_ERR(c_idle.idle); | ||
612 | } | ||
613 | |||
614 | set_idle_for_cpu(cpu, c_idle.idle); | ||
615 | |||
616 | do_rest: | ||
617 | |||
618 | cpu_pda(cpu)->pcurrent = c_idle.idle; | ||
619 | |||
620 | start_rip = setup_trampoline(); | ||
621 | |||
622 | init_rsp = c_idle.idle->thread.rsp; | ||
623 | per_cpu(init_tss,cpu).rsp0 = init_rsp; | ||
624 | initial_code = start_secondary; | ||
625 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); | ||
626 | |||
627 | printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, | ||
628 | cpus_weight(cpu_present_map), | ||
629 | apicid); | ||
630 | |||
631 | /* | ||
632 | * This grunge runs the startup process for | ||
633 | * the targeted processor. | ||
634 | */ | ||
635 | |||
636 | atomic_set(&init_deasserted, 0); | ||
637 | |||
638 | Dprintk("Setting warm reset code and vector.\n"); | ||
639 | |||
640 | CMOS_WRITE(0xa, 0xf); | ||
641 | local_flush_tlb(); | ||
642 | Dprintk("1.\n"); | ||
643 | *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; | ||
644 | Dprintk("2.\n"); | ||
645 | *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; | ||
646 | Dprintk("3.\n"); | ||
647 | |||
648 | /* | ||
649 | * Be paranoid about clearing APIC errors. | ||
650 | */ | ||
651 | apic_write(APIC_ESR, 0); | ||
652 | apic_read(APIC_ESR); | ||
653 | |||
654 | /* | ||
655 | * Status is now clean | ||
656 | */ | ||
657 | boot_error = 0; | ||
658 | |||
659 | /* | ||
660 | * Starting actual IPI sequence... | ||
661 | */ | ||
662 | boot_error = wakeup_secondary_via_INIT(apicid, start_rip); | ||
663 | |||
664 | if (!boot_error) { | ||
665 | /* | ||
666 | * allow APs to start initializing. | ||
667 | */ | ||
668 | Dprintk("Before Callout %d.\n", cpu); | ||
669 | cpu_set(cpu, cpu_callout_map); | ||
670 | Dprintk("After Callout %d.\n", cpu); | ||
671 | |||
672 | /* | ||
673 | * Wait 5s total for a response | ||
674 | */ | ||
675 | for (timeout = 0; timeout < 50000; timeout++) { | ||
676 | if (cpu_isset(cpu, cpu_callin_map)) | ||
677 | break; /* It has booted */ | ||
678 | udelay(100); | ||
679 | } | ||
680 | |||
681 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
682 | /* number CPUs logically, starting from 1 (BSP is 0) */ | ||
683 | Dprintk("CPU has booted.\n"); | ||
684 | } else { | ||
685 | boot_error = 1; | ||
686 | if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) | ||
687 | == 0xA5) | ||
688 | /* trampoline started but...? */ | ||
689 | printk("Stuck ??\n"); | ||
690 | else | ||
691 | /* trampoline code not run */ | ||
692 | printk("Not responding.\n"); | ||
693 | #ifdef APIC_DEBUG | ||
694 | inquire_remote_apic(apicid); | ||
695 | #endif | ||
696 | } | ||
697 | } | ||
698 | if (boot_error) { | ||
699 | cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ | ||
700 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
701 | clear_node_cpumask(cpu); /* was set by numa_add_cpu */ | ||
702 | cpu_clear(cpu, cpu_present_map); | ||
703 | cpu_clear(cpu, cpu_possible_map); | ||
704 | x86_cpu_to_apicid[cpu] = BAD_APICID; | ||
705 | x86_cpu_to_log_apicid[cpu] = BAD_APICID; | ||
706 | return -EIO; | ||
707 | } | ||
708 | |||
709 | return 0; | ||
710 | } | ||
711 | |||
712 | cycles_t cacheflush_time; | ||
713 | unsigned long cache_decay_ticks; | ||
714 | |||
715 | /* | ||
716 | * Cleanup possible dangling ends... | ||
717 | */ | ||
718 | static __cpuinit void smp_cleanup_boot(void) | ||
719 | { | ||
720 | /* | ||
721 | * Paranoid: Set warm reset code and vector here back | ||
722 | * to default values. | ||
723 | */ | ||
724 | CMOS_WRITE(0, 0xf); | ||
725 | |||
726 | /* | ||
727 | * Reset trampoline flag | ||
728 | */ | ||
729 | *((volatile int *) phys_to_virt(0x467)) = 0; | ||
730 | } | ||
731 | |||
732 | /* | ||
733 | * Fall back to non SMP mode after errors. | ||
734 | * | ||
735 | * RED-PEN audit/test this more. I bet there is more state messed up here. | ||
736 | */ | ||
737 | static __init void disable_smp(void) | ||
738 | { | ||
739 | cpu_present_map = cpumask_of_cpu(0); | ||
740 | cpu_possible_map = cpumask_of_cpu(0); | ||
741 | if (smp_found_config) | ||
742 | phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); | ||
743 | else | ||
744 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
745 | cpu_set(0, cpu_sibling_map[0]); | ||
746 | cpu_set(0, cpu_core_map[0]); | ||
747 | } | ||
748 | |||
749 | #ifdef CONFIG_HOTPLUG_CPU | ||
750 | |||
751 | int additional_cpus __initdata = -1; | ||
752 | |||
753 | /* | ||
754 | * cpu_possible_map should be static, it cannot change as cpu's | ||
755 | * are onlined, or offlined. The reason is per-cpu data-structures | ||
756 | * are allocated by some modules at init time, and dont expect to | ||
757 | * do this dynamically on cpu arrival/departure. | ||
758 | * cpu_present_map on the other hand can change dynamically. | ||
759 | * In case when cpu_hotplug is not compiled, then we resort to current | ||
760 | * behaviour, which is cpu_possible == cpu_present. | ||
761 | * - Ashok Raj | ||
762 | * | ||
763 | * Three ways to find out the number of additional hotplug CPUs: | ||
764 | * - If the BIOS specified disabled CPUs in ACPI/mptables use that. | ||
765 | * - The user can overwrite it with additional_cpus=NUM | ||
766 | * - Otherwise don't reserve additional CPUs. | ||
767 | * We do this because additional CPUs waste a lot of memory. | ||
768 | * -AK | ||
769 | */ | ||
770 | __init void prefill_possible_map(void) | ||
771 | { | ||
772 | int i; | ||
773 | int possible; | ||
774 | |||
775 | if (additional_cpus == -1) { | ||
776 | if (disabled_cpus > 0) | ||
777 | additional_cpus = disabled_cpus; | ||
778 | else | ||
779 | additional_cpus = 0; | ||
780 | } | ||
781 | possible = num_processors + additional_cpus; | ||
782 | if (possible > NR_CPUS) | ||
783 | possible = NR_CPUS; | ||
784 | |||
785 | printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", | ||
786 | possible, | ||
787 | max_t(int, possible - num_processors, 0)); | ||
788 | |||
789 | for (i = 0; i < possible; i++) | ||
790 | cpu_set(i, cpu_possible_map); | ||
791 | } | ||
792 | #endif | ||
793 | |||
794 | /* | ||
795 | * Various sanity checks. | ||
796 | */ | ||
797 | static int __init smp_sanity_check(unsigned max_cpus) | ||
798 | { | ||
799 | if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { | ||
800 | printk("weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
801 | hard_smp_processor_id()); | ||
802 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * If we couldn't find an SMP configuration at boot time, | ||
807 | * get out of here now! | ||
808 | */ | ||
809 | if (!smp_found_config) { | ||
810 | printk(KERN_NOTICE "SMP motherboard not detected.\n"); | ||
811 | disable_smp(); | ||
812 | if (APIC_init_uniprocessor()) | ||
813 | printk(KERN_NOTICE "Local APIC not detected." | ||
814 | " Using dummy APIC emulation.\n"); | ||
815 | return -1; | ||
816 | } | ||
817 | |||
818 | /* | ||
819 | * Should not be necessary because the MP table should list the boot | ||
820 | * CPU too, but we do it for the sake of robustness anyway. | ||
821 | */ | ||
822 | if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { | ||
823 | printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
824 | boot_cpu_id); | ||
825 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
826 | } | ||
827 | |||
828 | /* | ||
829 | * If we couldn't find a local APIC, then get out of here now! | ||
830 | */ | ||
831 | if (!cpu_has_apic) { | ||
832 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | ||
833 | boot_cpu_id); | ||
834 | printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); | ||
835 | nr_ioapics = 0; | ||
836 | return -1; | ||
837 | } | ||
838 | |||
839 | /* | ||
840 | * If SMP should be disabled, then really disable it! | ||
841 | */ | ||
842 | if (!max_cpus) { | ||
843 | printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); | ||
844 | nr_ioapics = 0; | ||
845 | return -1; | ||
846 | } | ||
847 | |||
848 | return 0; | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * Prepare for SMP bootup. The MP table or ACPI has been read | ||
853 | * earlier. Just do some sanity checking here and enable APIC mode. | ||
854 | */ | ||
855 | void __init smp_prepare_cpus(unsigned int max_cpus) | ||
856 | { | ||
857 | nmi_watchdog_default(); | ||
858 | current_cpu_data = boot_cpu_data; | ||
859 | current_thread_info()->cpu = 0; /* needed? */ | ||
860 | set_cpu_sibling_map(0); | ||
861 | |||
862 | if (smp_sanity_check(max_cpus) < 0) { | ||
863 | printk(KERN_INFO "SMP disabled\n"); | ||
864 | disable_smp(); | ||
865 | return; | ||
866 | } | ||
867 | |||
868 | |||
869 | /* | ||
870 | * Switch from PIC to APIC mode. | ||
871 | */ | ||
872 | setup_local_APIC(); | ||
873 | |||
874 | if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { | ||
875 | panic("Boot APIC ID in local APIC unexpected (%d vs %d)", | ||
876 | GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); | ||
877 | /* Or can we switch back to PIC here? */ | ||
878 | } | ||
879 | |||
880 | /* | ||
881 | * Now start the IO-APICs | ||
882 | */ | ||
883 | if (!skip_ioapic_setup && nr_ioapics) | ||
884 | setup_IO_APIC(); | ||
885 | else | ||
886 | nr_ioapics = 0; | ||
887 | |||
888 | /* | ||
889 | * Set up local APIC timer on boot CPU. | ||
890 | */ | ||
891 | |||
892 | setup_boot_APIC_clock(); | ||
893 | } | ||
894 | |||
895 | /* | ||
896 | * Early setup to make printk work. | ||
897 | */ | ||
898 | void __init smp_prepare_boot_cpu(void) | ||
899 | { | ||
900 | int me = smp_processor_id(); | ||
901 | cpu_set(me, cpu_online_map); | ||
902 | cpu_set(me, cpu_callout_map); | ||
903 | per_cpu(cpu_state, me) = CPU_ONLINE; | ||
904 | } | ||
905 | |||
906 | /* | ||
907 | * Entry point to boot a CPU. | ||
908 | */ | ||
909 | int __cpuinit __cpu_up(unsigned int cpu) | ||
910 | { | ||
911 | int apicid = cpu_present_to_apicid(cpu); | ||
912 | unsigned long flags; | ||
913 | int err; | ||
914 | |||
915 | WARN_ON(irqs_disabled()); | ||
916 | |||
917 | Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); | ||
918 | |||
919 | if (apicid == BAD_APICID || apicid == boot_cpu_id || | ||
920 | !physid_isset(apicid, phys_cpu_present_map)) { | ||
921 | printk("__cpu_up: bad cpu %d\n", cpu); | ||
922 | return -EINVAL; | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Already booted CPU? | ||
927 | */ | ||
928 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
929 | Dprintk("do_boot_cpu %d Already started\n", cpu); | ||
930 | return -ENOSYS; | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * Save current MTRR state in case it was changed since early boot | ||
935 | * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: | ||
936 | */ | ||
937 | mtrr_save_state(); | ||
938 | |||
939 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | ||
940 | /* Boot it! */ | ||
941 | err = do_boot_cpu(cpu, apicid); | ||
942 | if (err < 0) { | ||
943 | Dprintk("do_boot_cpu failed %d\n", err); | ||
944 | return err; | ||
945 | } | ||
946 | |||
947 | /* Unleash the CPU! */ | ||
948 | Dprintk("waiting for cpu %d\n", cpu); | ||
949 | |||
950 | /* | ||
951 | * Make sure and check TSC sync: | ||
952 | */ | ||
953 | local_irq_save(flags); | ||
954 | check_tsc_sync_source(cpu); | ||
955 | local_irq_restore(flags); | ||
956 | |||
957 | while (!cpu_isset(cpu, cpu_online_map)) | ||
958 | cpu_relax(); | ||
959 | err = 0; | ||
960 | |||
961 | return err; | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Finish the SMP boot. | ||
966 | */ | ||
967 | void __init smp_cpus_done(unsigned int max_cpus) | ||
968 | { | ||
969 | smp_cleanup_boot(); | ||
970 | setup_ioapic_dest(); | ||
971 | check_nmi_watchdog(); | ||
972 | } | ||
973 | |||
974 | #ifdef CONFIG_HOTPLUG_CPU | ||
975 | |||
976 | static void remove_siblinginfo(int cpu) | ||
977 | { | ||
978 | int sibling; | ||
979 | struct cpuinfo_x86 *c = cpu_data; | ||
980 | |||
981 | for_each_cpu_mask(sibling, cpu_core_map[cpu]) { | ||
982 | cpu_clear(cpu, cpu_core_map[sibling]); | ||
983 | /* | ||
984 | * last thread sibling in this cpu core going down | ||
985 | */ | ||
986 | if (cpus_weight(cpu_sibling_map[cpu]) == 1) | ||
987 | c[sibling].booted_cores--; | ||
988 | } | ||
989 | |||
990 | for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) | ||
991 | cpu_clear(cpu, cpu_sibling_map[sibling]); | ||
992 | cpus_clear(cpu_sibling_map[cpu]); | ||
993 | cpus_clear(cpu_core_map[cpu]); | ||
994 | c[cpu].phys_proc_id = 0; | ||
995 | c[cpu].cpu_core_id = 0; | ||
996 | cpu_clear(cpu, cpu_sibling_setup_map); | ||
997 | } | ||
998 | |||
999 | void remove_cpu_from_maps(void) | ||
1000 | { | ||
1001 | int cpu = smp_processor_id(); | ||
1002 | |||
1003 | cpu_clear(cpu, cpu_callout_map); | ||
1004 | cpu_clear(cpu, cpu_callin_map); | ||
1005 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
1006 | clear_node_cpumask(cpu); | ||
1007 | } | ||
1008 | |||
1009 | int __cpu_disable(void) | ||
1010 | { | ||
1011 | int cpu = smp_processor_id(); | ||
1012 | |||
1013 | /* | ||
1014 | * Perhaps use cpufreq to drop frequency, but that could go | ||
1015 | * into generic code. | ||
1016 | * | ||
1017 | * We won't take down the boot processor on i386 due to some | ||
1018 | * interrupts only being able to be serviced by the BSP. | ||
1019 | * Especially so if we're not using an IOAPIC -zwane | ||
1020 | */ | ||
1021 | if (cpu == 0) | ||
1022 | return -EBUSY; | ||
1023 | |||
1024 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1025 | stop_apic_nmi_watchdog(NULL); | ||
1026 | clear_local_APIC(); | ||
1027 | |||
1028 | /* | ||
1029 | * HACK: | ||
1030 | * Allow any queued timer interrupts to get serviced | ||
1031 | * This is only a temporary solution until we cleanup | ||
1032 | * fixup_irqs as we do for IA64. | ||
1033 | */ | ||
1034 | local_irq_enable(); | ||
1035 | mdelay(1); | ||
1036 | |||
1037 | local_irq_disable(); | ||
1038 | remove_siblinginfo(cpu); | ||
1039 | |||
1040 | spin_lock(&vector_lock); | ||
1041 | /* It's now safe to remove this processor from the online map */ | ||
1042 | cpu_clear(cpu, cpu_online_map); | ||
1043 | spin_unlock(&vector_lock); | ||
1044 | remove_cpu_from_maps(); | ||
1045 | fixup_irqs(cpu_online_map); | ||
1046 | return 0; | ||
1047 | } | ||
1048 | |||
1049 | void __cpu_die(unsigned int cpu) | ||
1050 | { | ||
1051 | /* We don't do anything here: idle task is faking death itself. */ | ||
1052 | unsigned int i; | ||
1053 | |||
1054 | for (i = 0; i < 10; i++) { | ||
1055 | /* They ack this in play_dead by setting CPU_DEAD */ | ||
1056 | if (per_cpu(cpu_state, cpu) == CPU_DEAD) { | ||
1057 | printk ("CPU %d is now offline\n", cpu); | ||
1058 | if (1 == num_online_cpus()) | ||
1059 | alternatives_smp_switch(0); | ||
1060 | return; | ||
1061 | } | ||
1062 | msleep(100); | ||
1063 | } | ||
1064 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); | ||
1065 | } | ||
1066 | |||
1067 | static __init int setup_additional_cpus(char *s) | ||
1068 | { | ||
1069 | return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; | ||
1070 | } | ||
1071 | early_param("additional_cpus", setup_additional_cpus); | ||
1072 | |||
1073 | #else /* ... !CONFIG_HOTPLUG_CPU */ | ||
1074 | |||
1075 | int __cpu_disable(void) | ||
1076 | { | ||
1077 | return -ENOSYS; | ||
1078 | } | ||
1079 | |||
1080 | void __cpu_die(unsigned int cpu) | ||
1081 | { | ||
1082 | /* We said "no" in __cpu_disable */ | ||
1083 | BUG(); | ||
1084 | } | ||
1085 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c deleted file mode 100644 index cb9109113584..000000000000 --- a/arch/x86_64/kernel/stacktrace.c +++ /dev/null | |||
@@ -1,54 +0,0 @@ | |||
1 | /* | ||
2 | * arch/x86_64/kernel/stacktrace.c | ||
3 | * | ||
4 | * Stack trace management functions | ||
5 | * | ||
6 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | */ | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/stacktrace.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <asm/stacktrace.h> | ||
12 | |||
13 | static void save_stack_warning(void *data, char *msg) | ||
14 | { | ||
15 | } | ||
16 | |||
17 | static void | ||
18 | save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
19 | { | ||
20 | } | ||
21 | |||
22 | static int save_stack_stack(void *data, char *name) | ||
23 | { | ||
24 | return -1; | ||
25 | } | ||
26 | |||
27 | static void save_stack_address(void *data, unsigned long addr) | ||
28 | { | ||
29 | struct stack_trace *trace = (struct stack_trace *)data; | ||
30 | if (trace->skip > 0) { | ||
31 | trace->skip--; | ||
32 | return; | ||
33 | } | ||
34 | if (trace->nr_entries < trace->max_entries) | ||
35 | trace->entries[trace->nr_entries++] = addr; | ||
36 | } | ||
37 | |||
38 | static struct stacktrace_ops save_stack_ops = { | ||
39 | .warning = save_stack_warning, | ||
40 | .warning_symbol = save_stack_warning_symbol, | ||
41 | .stack = save_stack_stack, | ||
42 | .address = save_stack_address, | ||
43 | }; | ||
44 | |||
45 | /* | ||
46 | * Save stack-backtrace addresses into a stack_trace buffer. | ||
47 | */ | ||
48 | void save_stack_trace(struct stack_trace *trace) | ||
49 | { | ||
50 | dump_trace(current, NULL, NULL, &save_stack_ops, trace); | ||
51 | if (trace->nr_entries < trace->max_entries) | ||
52 | trace->entries[trace->nr_entries++] = ULONG_MAX; | ||
53 | } | ||
54 | EXPORT_SYMBOL(save_stack_trace); | ||
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c deleted file mode 100644 index 573c0a6e0ac6..000000000000 --- a/arch/x86_64/kernel/suspend.c +++ /dev/null | |||
@@ -1,239 +0,0 @@ | |||
1 | /* | ||
2 | * Suspend support specific for i386. | ||
3 | * | ||
4 | * Distribute under GPLv2 | ||
5 | * | ||
6 | * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> | ||
7 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | ||
8 | */ | ||
9 | |||
10 | #include <linux/smp.h> | ||
11 | #include <linux/suspend.h> | ||
12 | #include <asm/proto.h> | ||
13 | #include <asm/page.h> | ||
14 | #include <asm/pgtable.h> | ||
15 | #include <asm/mtrr.h> | ||
16 | |||
17 | /* References to section boundaries */ | ||
18 | extern const void __nosave_begin, __nosave_end; | ||
19 | |||
20 | struct saved_context saved_context; | ||
21 | |||
22 | unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; | ||
23 | unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi; | ||
24 | unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11; | ||
25 | unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; | ||
26 | unsigned long saved_context_eflags; | ||
27 | |||
28 | void __save_processor_state(struct saved_context *ctxt) | ||
29 | { | ||
30 | kernel_fpu_begin(); | ||
31 | |||
32 | /* | ||
33 | * descriptor tables | ||
34 | */ | ||
35 | asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); | ||
36 | asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); | ||
37 | asm volatile ("str %0" : "=m" (ctxt->tr)); | ||
38 | |||
39 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ | ||
40 | /* | ||
41 | * segment registers | ||
42 | */ | ||
43 | asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); | ||
44 | asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); | ||
45 | asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); | ||
46 | asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); | ||
47 | asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); | ||
48 | |||
49 | rdmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
50 | rdmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
51 | rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
52 | mtrr_save_fixed_ranges(NULL); | ||
53 | |||
54 | /* | ||
55 | * control registers | ||
56 | */ | ||
57 | rdmsrl(MSR_EFER, ctxt->efer); | ||
58 | ctxt->cr0 = read_cr0(); | ||
59 | ctxt->cr2 = read_cr2(); | ||
60 | ctxt->cr3 = read_cr3(); | ||
61 | ctxt->cr4 = read_cr4(); | ||
62 | ctxt->cr8 = read_cr8(); | ||
63 | } | ||
64 | |||
65 | void save_processor_state(void) | ||
66 | { | ||
67 | __save_processor_state(&saved_context); | ||
68 | } | ||
69 | |||
70 | static void do_fpu_end(void) | ||
71 | { | ||
72 | /* | ||
73 | * Restore FPU regs if necessary | ||
74 | */ | ||
75 | kernel_fpu_end(); | ||
76 | } | ||
77 | |||
78 | void __restore_processor_state(struct saved_context *ctxt) | ||
79 | { | ||
80 | /* | ||
81 | * control registers | ||
82 | */ | ||
83 | wrmsrl(MSR_EFER, ctxt->efer); | ||
84 | write_cr8(ctxt->cr8); | ||
85 | write_cr4(ctxt->cr4); | ||
86 | write_cr3(ctxt->cr3); | ||
87 | write_cr2(ctxt->cr2); | ||
88 | write_cr0(ctxt->cr0); | ||
89 | |||
90 | /* | ||
91 | * now restore the descriptor tables to their proper values | ||
92 | * ltr is done i fix_processor_context(). | ||
93 | */ | ||
94 | asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); | ||
95 | asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); | ||
96 | |||
97 | /* | ||
98 | * segment registers | ||
99 | */ | ||
100 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); | ||
101 | asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); | ||
102 | asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); | ||
103 | load_gs_index(ctxt->gs); | ||
104 | asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); | ||
105 | |||
106 | wrmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
107 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
108 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
109 | |||
110 | fix_processor_context(); | ||
111 | |||
112 | do_fpu_end(); | ||
113 | mtrr_ap_init(); | ||
114 | } | ||
115 | |||
116 | void restore_processor_state(void) | ||
117 | { | ||
118 | __restore_processor_state(&saved_context); | ||
119 | } | ||
120 | |||
121 | void fix_processor_context(void) | ||
122 | { | ||
123 | int cpu = smp_processor_id(); | ||
124 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
125 | |||
126 | set_tss_desc(cpu,t); /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ | ||
127 | |||
128 | cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; | ||
129 | |||
130 | syscall_init(); /* This sets MSR_*STAR and related */ | ||
131 | load_TR_desc(); /* This does ltr */ | ||
132 | load_LDT(¤t->active_mm->context); /* This does lldt */ | ||
133 | |||
134 | /* | ||
135 | * Now maybe reload the debug registers | ||
136 | */ | ||
137 | if (current->thread.debugreg7){ | ||
138 | loaddebug(¤t->thread, 0); | ||
139 | loaddebug(¤t->thread, 1); | ||
140 | loaddebug(¤t->thread, 2); | ||
141 | loaddebug(¤t->thread, 3); | ||
142 | /* no 4 and 5 */ | ||
143 | loaddebug(¤t->thread, 6); | ||
144 | loaddebug(¤t->thread, 7); | ||
145 | } | ||
146 | |||
147 | } | ||
148 | |||
149 | #ifdef CONFIG_HIBERNATION | ||
150 | /* Defined in arch/x86_64/kernel/suspend_asm.S */ | ||
151 | extern int restore_image(void); | ||
152 | |||
153 | pgd_t *temp_level4_pgt; | ||
154 | |||
155 | static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) | ||
156 | { | ||
157 | long i, j; | ||
158 | |||
159 | i = pud_index(address); | ||
160 | pud = pud + i; | ||
161 | for (; i < PTRS_PER_PUD; pud++, i++) { | ||
162 | unsigned long paddr; | ||
163 | pmd_t *pmd; | ||
164 | |||
165 | paddr = address + i*PUD_SIZE; | ||
166 | if (paddr >= end) | ||
167 | break; | ||
168 | |||
169 | pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); | ||
170 | if (!pmd) | ||
171 | return -ENOMEM; | ||
172 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | ||
173 | for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { | ||
174 | unsigned long pe; | ||
175 | |||
176 | if (paddr >= end) | ||
177 | break; | ||
178 | pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr; | ||
179 | pe &= __supported_pte_mask; | ||
180 | set_pmd(pmd, __pmd(pe)); | ||
181 | } | ||
182 | } | ||
183 | return 0; | ||
184 | } | ||
185 | |||
186 | static int set_up_temporary_mappings(void) | ||
187 | { | ||
188 | unsigned long start, end, next; | ||
189 | int error; | ||
190 | |||
191 | temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); | ||
192 | if (!temp_level4_pgt) | ||
193 | return -ENOMEM; | ||
194 | |||
195 | /* It is safe to reuse the original kernel mapping */ | ||
196 | set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), | ||
197 | init_level4_pgt[pgd_index(__START_KERNEL_map)]); | ||
198 | |||
199 | /* Set up the direct mapping from scratch */ | ||
200 | start = (unsigned long)pfn_to_kaddr(0); | ||
201 | end = (unsigned long)pfn_to_kaddr(end_pfn); | ||
202 | |||
203 | for (; start < end; start = next) { | ||
204 | pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); | ||
205 | if (!pud) | ||
206 | return -ENOMEM; | ||
207 | next = start + PGDIR_SIZE; | ||
208 | if (next > end) | ||
209 | next = end; | ||
210 | if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) | ||
211 | return error; | ||
212 | set_pgd(temp_level4_pgt + pgd_index(start), | ||
213 | mk_kernel_pgd(__pa(pud))); | ||
214 | } | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | int swsusp_arch_resume(void) | ||
219 | { | ||
220 | int error; | ||
221 | |||
222 | /* We have got enough memory and from now on we cannot recover */ | ||
223 | if ((error = set_up_temporary_mappings())) | ||
224 | return error; | ||
225 | restore_image(); | ||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * pfn_is_nosave - check if given pfn is in the 'nosave' section | ||
231 | */ | ||
232 | |||
233 | int pfn_is_nosave(unsigned long pfn) | ||
234 | { | ||
235 | unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; | ||
236 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; | ||
237 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
238 | } | ||
239 | #endif /* CONFIG_HIBERNATION */ | ||
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S deleted file mode 100644 index 16d183f67bc1..000000000000 --- a/arch/x86_64/kernel/suspend_asm.S +++ /dev/null | |||
@@ -1,110 +0,0 @@ | |||
1 | /* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl> | ||
2 | * | ||
3 | * Distribute under GPLv2. | ||
4 | * | ||
5 | * swsusp_arch_resume may not use any stack, nor any variable that is | ||
6 | * not "NoSave" during copying pages: | ||
7 | * | ||
8 | * Its rewriting one kernel image with another. What is stack in "old" | ||
9 | * image could very well be data page in "new" image, and overwriting | ||
10 | * your own stack under you is bad idea. | ||
11 | */ | ||
12 | |||
13 | .text | ||
14 | #include <linux/linkage.h> | ||
15 | #include <asm/segment.h> | ||
16 | #include <asm/page.h> | ||
17 | #include <asm/asm-offsets.h> | ||
18 | |||
19 | ENTRY(swsusp_arch_suspend) | ||
20 | |||
21 | movq %rsp, saved_context_esp(%rip) | ||
22 | movq %rax, saved_context_eax(%rip) | ||
23 | movq %rbx, saved_context_ebx(%rip) | ||
24 | movq %rcx, saved_context_ecx(%rip) | ||
25 | movq %rdx, saved_context_edx(%rip) | ||
26 | movq %rbp, saved_context_ebp(%rip) | ||
27 | movq %rsi, saved_context_esi(%rip) | ||
28 | movq %rdi, saved_context_edi(%rip) | ||
29 | movq %r8, saved_context_r08(%rip) | ||
30 | movq %r9, saved_context_r09(%rip) | ||
31 | movq %r10, saved_context_r10(%rip) | ||
32 | movq %r11, saved_context_r11(%rip) | ||
33 | movq %r12, saved_context_r12(%rip) | ||
34 | movq %r13, saved_context_r13(%rip) | ||
35 | movq %r14, saved_context_r14(%rip) | ||
36 | movq %r15, saved_context_r15(%rip) | ||
37 | pushfq ; popq saved_context_eflags(%rip) | ||
38 | |||
39 | call swsusp_save | ||
40 | ret | ||
41 | |||
42 | ENTRY(restore_image) | ||
43 | /* switch to temporary page tables */ | ||
44 | movq $__PAGE_OFFSET, %rdx | ||
45 | movq temp_level4_pgt(%rip), %rax | ||
46 | subq %rdx, %rax | ||
47 | movq %rax, %cr3 | ||
48 | /* Flush TLB */ | ||
49 | movq mmu_cr4_features(%rip), %rax | ||
50 | movq %rax, %rdx | ||
51 | andq $~(1<<7), %rdx # PGE | ||
52 | movq %rdx, %cr4; # turn off PGE | ||
53 | movq %cr3, %rcx; # flush TLB | ||
54 | movq %rcx, %cr3; | ||
55 | movq %rax, %cr4; # turn PGE back on | ||
56 | |||
57 | movq restore_pblist(%rip), %rdx | ||
58 | loop: | ||
59 | testq %rdx, %rdx | ||
60 | jz done | ||
61 | |||
62 | /* get addresses from the pbe and copy the page */ | ||
63 | movq pbe_address(%rdx), %rsi | ||
64 | movq pbe_orig_address(%rdx), %rdi | ||
65 | movq $512, %rcx | ||
66 | rep | ||
67 | movsq | ||
68 | |||
69 | /* progress to the next pbe */ | ||
70 | movq pbe_next(%rdx), %rdx | ||
71 | jmp loop | ||
72 | done: | ||
73 | /* go back to the original page tables */ | ||
74 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | ||
75 | addq phys_base(%rip), %rax | ||
76 | movq %rax, %cr3 | ||
77 | |||
78 | /* Flush TLB, including "global" things (vmalloc) */ | ||
79 | movq mmu_cr4_features(%rip), %rax | ||
80 | movq %rax, %rdx | ||
81 | andq $~(1<<7), %rdx; # PGE | ||
82 | movq %rdx, %cr4; # turn off PGE | ||
83 | movq %cr3, %rcx; # flush TLB | ||
84 | movq %rcx, %cr3 | ||
85 | movq %rax, %cr4; # turn PGE back on | ||
86 | |||
87 | movl $24, %eax | ||
88 | movl %eax, %ds | ||
89 | |||
90 | movq saved_context_esp(%rip), %rsp | ||
91 | movq saved_context_ebp(%rip), %rbp | ||
92 | /* Don't restore %rax, it must be 0 anyway */ | ||
93 | movq saved_context_ebx(%rip), %rbx | ||
94 | movq saved_context_ecx(%rip), %rcx | ||
95 | movq saved_context_edx(%rip), %rdx | ||
96 | movq saved_context_esi(%rip), %rsi | ||
97 | movq saved_context_edi(%rip), %rdi | ||
98 | movq saved_context_r08(%rip), %r8 | ||
99 | movq saved_context_r09(%rip), %r9 | ||
100 | movq saved_context_r10(%rip), %r10 | ||
101 | movq saved_context_r11(%rip), %r11 | ||
102 | movq saved_context_r12(%rip), %r12 | ||
103 | movq saved_context_r13(%rip), %r13 | ||
104 | movq saved_context_r14(%rip), %r14 | ||
105 | movq saved_context_r15(%rip), %r15 | ||
106 | pushq saved_context_eflags(%rip) ; popfq | ||
107 | |||
108 | xorq %rax, %rax | ||
109 | |||
110 | ret | ||
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c deleted file mode 100644 index 4770b7a2052c..000000000000 --- a/arch/x86_64/kernel/sys_x86_64.c +++ /dev/null | |||
@@ -1,159 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/sys_x86_64.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/errno.h> | ||
6 | #include <linux/sched.h> | ||
7 | #include <linux/syscalls.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/fs.h> | ||
10 | #include <linux/smp.h> | ||
11 | #include <linux/sem.h> | ||
12 | #include <linux/msg.h> | ||
13 | #include <linux/shm.h> | ||
14 | #include <linux/stat.h> | ||
15 | #include <linux/mman.h> | ||
16 | #include <linux/file.h> | ||
17 | #include <linux/utsname.h> | ||
18 | #include <linux/personality.h> | ||
19 | |||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/ia32.h> | ||
22 | |||
23 | /* | ||
24 | * sys_pipe() is the normal C calling standard for creating | ||
25 | * a pipe. It's not the way Unix traditionally does this, though. | ||
26 | */ | ||
27 | asmlinkage long sys_pipe(int __user *fildes) | ||
28 | { | ||
29 | int fd[2]; | ||
30 | int error; | ||
31 | |||
32 | error = do_pipe(fd); | ||
33 | if (!error) { | ||
34 | if (copy_to_user(fildes, fd, 2*sizeof(int))) | ||
35 | error = -EFAULT; | ||
36 | } | ||
37 | return error; | ||
38 | } | ||
39 | |||
40 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, | ||
41 | unsigned long fd, unsigned long off) | ||
42 | { | ||
43 | long error; | ||
44 | struct file * file; | ||
45 | |||
46 | error = -EINVAL; | ||
47 | if (off & ~PAGE_MASK) | ||
48 | goto out; | ||
49 | |||
50 | error = -EBADF; | ||
51 | file = NULL; | ||
52 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
53 | if (!(flags & MAP_ANONYMOUS)) { | ||
54 | file = fget(fd); | ||
55 | if (!file) | ||
56 | goto out; | ||
57 | } | ||
58 | down_write(¤t->mm->mmap_sem); | ||
59 | error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); | ||
60 | up_write(¤t->mm->mmap_sem); | ||
61 | |||
62 | if (file) | ||
63 | fput(file); | ||
64 | out: | ||
65 | return error; | ||
66 | } | ||
67 | |||
68 | static void find_start_end(unsigned long flags, unsigned long *begin, | ||
69 | unsigned long *end) | ||
70 | { | ||
71 | if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { | ||
72 | /* This is usually used needed to map code in small | ||
73 | model, so it needs to be in the first 31bit. Limit | ||
74 | it to that. This means we need to move the | ||
75 | unmapped base down for this case. This can give | ||
76 | conflicts with the heap, but we assume that glibc | ||
77 | malloc knows how to fall back to mmap. Give it 1GB | ||
78 | of playground for now. -AK */ | ||
79 | *begin = 0x40000000; | ||
80 | *end = 0x80000000; | ||
81 | } else { | ||
82 | *begin = TASK_UNMAPPED_BASE; | ||
83 | *end = TASK_SIZE; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | unsigned long | ||
88 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | ||
89 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
90 | { | ||
91 | struct mm_struct *mm = current->mm; | ||
92 | struct vm_area_struct *vma; | ||
93 | unsigned long start_addr; | ||
94 | unsigned long begin, end; | ||
95 | |||
96 | if (flags & MAP_FIXED) | ||
97 | return addr; | ||
98 | |||
99 | find_start_end(flags, &begin, &end); | ||
100 | |||
101 | if (len > end) | ||
102 | return -ENOMEM; | ||
103 | |||
104 | if (addr) { | ||
105 | addr = PAGE_ALIGN(addr); | ||
106 | vma = find_vma(mm, addr); | ||
107 | if (end - len >= addr && | ||
108 | (!vma || addr + len <= vma->vm_start)) | ||
109 | return addr; | ||
110 | } | ||
111 | if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) | ||
112 | && len <= mm->cached_hole_size) { | ||
113 | mm->cached_hole_size = 0; | ||
114 | mm->free_area_cache = begin; | ||
115 | } | ||
116 | addr = mm->free_area_cache; | ||
117 | if (addr < begin) | ||
118 | addr = begin; | ||
119 | start_addr = addr; | ||
120 | |||
121 | full_search: | ||
122 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
123 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
124 | if (end - len < addr) { | ||
125 | /* | ||
126 | * Start a new search - just in case we missed | ||
127 | * some holes. | ||
128 | */ | ||
129 | if (start_addr != begin) { | ||
130 | start_addr = addr = begin; | ||
131 | mm->cached_hole_size = 0; | ||
132 | goto full_search; | ||
133 | } | ||
134 | return -ENOMEM; | ||
135 | } | ||
136 | if (!vma || addr + len <= vma->vm_start) { | ||
137 | /* | ||
138 | * Remember the place where we stopped the search: | ||
139 | */ | ||
140 | mm->free_area_cache = addr + len; | ||
141 | return addr; | ||
142 | } | ||
143 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
144 | mm->cached_hole_size = vma->vm_start - addr; | ||
145 | |||
146 | addr = vma->vm_end; | ||
147 | } | ||
148 | } | ||
149 | |||
150 | asmlinkage long sys_uname(struct new_utsname __user * name) | ||
151 | { | ||
152 | int err; | ||
153 | down_read(&uts_sem); | ||
154 | err = copy_to_user(name, utsname(), sizeof (*name)); | ||
155 | up_read(&uts_sem); | ||
156 | if (personality(current->personality) == PER_LINUX32) | ||
157 | err |= copy_to_user(&name->machine, "i686", 5); | ||
158 | return err ? -EFAULT : 0; | ||
159 | } | ||
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c deleted file mode 100644 index 63d592c276cc..000000000000 --- a/arch/x86_64/kernel/syscall.c +++ /dev/null | |||
@@ -1,26 +0,0 @@ | |||
1 | /* System call table for x86-64. */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <linux/sys.h> | ||
5 | #include <linux/cache.h> | ||
6 | #include <asm/asm-offsets.h> | ||
7 | |||
8 | #define __NO_STUBS | ||
9 | |||
10 | #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; | ||
11 | #undef _ASM_X86_64_UNISTD_H_ | ||
12 | #include <asm-x86_64/unistd.h> | ||
13 | |||
14 | #undef __SYSCALL | ||
15 | #define __SYSCALL(nr, sym) [ nr ] = sym, | ||
16 | #undef _ASM_X86_64_UNISTD_H_ | ||
17 | |||
18 | typedef void (*sys_call_ptr_t)(void); | ||
19 | |||
20 | extern void sys_ni_syscall(void); | ||
21 | |||
22 | const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { | ||
23 | /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ | ||
24 | [0 ... __NR_syscall_max] = &sys_ni_syscall, | ||
25 | #include <asm-x86_64/unistd.h> | ||
26 | }; | ||
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c deleted file mode 100644 index e3f2569b2c44..000000000000 --- a/arch/x86_64/kernel/tce.c +++ /dev/null | |||
@@ -1,189 +0,0 @@ | |||
1 | /* | ||
2 | * This file manages the translation entries for the IBM Calgary IOMMU. | ||
3 | * | ||
4 | * Derived from arch/powerpc/platforms/pseries/iommu.c | ||
5 | * | ||
6 | * Copyright (C) IBM Corporation, 2006 | ||
7 | * | ||
8 | * Author: Jon Mason <jdmason@us.ibm.com> | ||
9 | * Author: Muli Ben-Yehuda <muli@il.ibm.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to the Free Software | ||
23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/pci.h> | ||
32 | #include <linux/dma-mapping.h> | ||
33 | #include <linux/bootmem.h> | ||
34 | #include <asm/tce.h> | ||
35 | #include <asm/calgary.h> | ||
36 | #include <asm/proto.h> | ||
37 | |||
38 | /* flush a tce at 'tceaddr' to main memory */ | ||
39 | static inline void flush_tce(void* tceaddr) | ||
40 | { | ||
41 | /* a single tce can't cross a cache line */ | ||
42 | if (cpu_has_clflush) | ||
43 | asm volatile("clflush (%0)" :: "r" (tceaddr)); | ||
44 | else | ||
45 | asm volatile("wbinvd":::"memory"); | ||
46 | } | ||
47 | |||
48 | void tce_build(struct iommu_table *tbl, unsigned long index, | ||
49 | unsigned int npages, unsigned long uaddr, int direction) | ||
50 | { | ||
51 | u64* tp; | ||
52 | u64 t; | ||
53 | u64 rpn; | ||
54 | |||
55 | t = (1 << TCE_READ_SHIFT); | ||
56 | if (direction != DMA_TO_DEVICE) | ||
57 | t |= (1 << TCE_WRITE_SHIFT); | ||
58 | |||
59 | tp = ((u64*)tbl->it_base) + index; | ||
60 | |||
61 | while (npages--) { | ||
62 | rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; | ||
63 | t &= ~TCE_RPN_MASK; | ||
64 | t |= (rpn << TCE_RPN_SHIFT); | ||
65 | |||
66 | *tp = cpu_to_be64(t); | ||
67 | flush_tce(tp); | ||
68 | |||
69 | uaddr += PAGE_SIZE; | ||
70 | tp++; | ||
71 | } | ||
72 | } | ||
73 | |||
74 | void tce_free(struct iommu_table *tbl, long index, unsigned int npages) | ||
75 | { | ||
76 | u64* tp; | ||
77 | |||
78 | tp = ((u64*)tbl->it_base) + index; | ||
79 | |||
80 | while (npages--) { | ||
81 | *tp = cpu_to_be64(0); | ||
82 | flush_tce(tp); | ||
83 | tp++; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | static inline unsigned int table_size_to_number_of_entries(unsigned char size) | ||
88 | { | ||
89 | /* | ||
90 | * size is the order of the table, 0-7 | ||
91 | * smallest table is 8K entries, so shift result by 13 to | ||
92 | * multiply by 8K | ||
93 | */ | ||
94 | return (1 << size) << 13; | ||
95 | } | ||
96 | |||
97 | static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) | ||
98 | { | ||
99 | unsigned int bitmapsz; | ||
100 | unsigned long bmppages; | ||
101 | int ret; | ||
102 | |||
103 | tbl->it_busno = dev->bus->number; | ||
104 | |||
105 | /* set the tce table size - measured in entries */ | ||
106 | tbl->it_size = table_size_to_number_of_entries(specified_table_size); | ||
107 | |||
108 | /* | ||
109 | * number of bytes needed for the bitmap size in number of | ||
110 | * entries; we need one bit per entry | ||
111 | */ | ||
112 | bitmapsz = tbl->it_size / BITS_PER_BYTE; | ||
113 | bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); | ||
114 | if (!bmppages) { | ||
115 | printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); | ||
116 | ret = -ENOMEM; | ||
117 | goto done; | ||
118 | } | ||
119 | |||
120 | tbl->it_map = (unsigned long*)bmppages; | ||
121 | |||
122 | memset(tbl->it_map, 0, bitmapsz); | ||
123 | |||
124 | tbl->it_hint = 0; | ||
125 | |||
126 | spin_lock_init(&tbl->it_lock); | ||
127 | |||
128 | return 0; | ||
129 | |||
130 | done: | ||
131 | return ret; | ||
132 | } | ||
133 | |||
134 | int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) | ||
135 | { | ||
136 | struct iommu_table *tbl; | ||
137 | int ret; | ||
138 | |||
139 | if (pci_iommu(dev->bus)) { | ||
140 | printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", | ||
141 | dev, pci_iommu(dev->bus)); | ||
142 | BUG(); | ||
143 | } | ||
144 | |||
145 | tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); | ||
146 | if (!tbl) { | ||
147 | printk(KERN_ERR "Calgary: error allocating iommu_table\n"); | ||
148 | ret = -ENOMEM; | ||
149 | goto done; | ||
150 | } | ||
151 | |||
152 | ret = tce_table_setparms(dev, tbl); | ||
153 | if (ret) | ||
154 | goto free_tbl; | ||
155 | |||
156 | tbl->bbar = bbar; | ||
157 | |||
158 | set_pci_iommu(dev->bus, tbl); | ||
159 | |||
160 | return 0; | ||
161 | |||
162 | free_tbl: | ||
163 | kfree(tbl); | ||
164 | done: | ||
165 | return ret; | ||
166 | } | ||
167 | |||
168 | void * __init alloc_tce_table(void) | ||
169 | { | ||
170 | unsigned int size; | ||
171 | |||
172 | size = table_size_to_number_of_entries(specified_table_size); | ||
173 | size *= TCE_ENTRY_SIZE; | ||
174 | |||
175 | return __alloc_bootmem_low(size, size, 0); | ||
176 | } | ||
177 | |||
178 | void __init free_tce_table(void *tbl) | ||
179 | { | ||
180 | unsigned int size; | ||
181 | |||
182 | if (!tbl) | ||
183 | return; | ||
184 | |||
185 | size = table_size_to_number_of_entries(specified_table_size); | ||
186 | size *= TCE_ENTRY_SIZE; | ||
187 | |||
188 | free_bootmem(__pa(tbl), size); | ||
189 | } | ||
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c deleted file mode 100644 index 6d48a4e826d9..000000000000 --- a/arch/x86_64/kernel/time.c +++ /dev/null | |||
@@ -1,447 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/time.c | ||
3 | * | ||
4 | * "High Precision Event Timer" based timekeeping. | ||
5 | * | ||
6 | * Copyright (c) 1991,1992,1995 Linus Torvalds | ||
7 | * Copyright (c) 1994 Alan Modra | ||
8 | * Copyright (c) 1995 Markus Kuhn | ||
9 | * Copyright (c) 1996 Ingo Molnar | ||
10 | * Copyright (c) 1998 Andrea Arcangeli | ||
11 | * Copyright (c) 2002,2006 Vojtech Pavlik | ||
12 | * Copyright (c) 2003 Andi Kleen | ||
13 | * RTC support code taken from arch/i386/kernel/timers/time_hpet.c | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/mc146818rtc.h> | ||
21 | #include <linux/time.h> | ||
22 | #include <linux/ioport.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/device.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/bcd.h> | ||
27 | #include <linux/notifier.h> | ||
28 | #include <linux/cpu.h> | ||
29 | #include <linux/kallsyms.h> | ||
30 | #include <linux/acpi.h> | ||
31 | #ifdef CONFIG_ACPI | ||
32 | #include <acpi/achware.h> /* for PM timer frequency */ | ||
33 | #include <acpi/acpi_bus.h> | ||
34 | #endif | ||
35 | #include <asm/8253pit.h> | ||
36 | #include <asm/i8253.h> | ||
37 | #include <asm/pgtable.h> | ||
38 | #include <asm/vsyscall.h> | ||
39 | #include <asm/timex.h> | ||
40 | #include <asm/proto.h> | ||
41 | #include <asm/hpet.h> | ||
42 | #include <asm/sections.h> | ||
43 | #include <linux/hpet.h> | ||
44 | #include <asm/apic.h> | ||
45 | #include <asm/hpet.h> | ||
46 | #include <asm/mpspec.h> | ||
47 | #include <asm/nmi.h> | ||
48 | #include <asm/vgtod.h> | ||
49 | |||
50 | static char *timename = NULL; | ||
51 | |||
52 | DEFINE_SPINLOCK(rtc_lock); | ||
53 | EXPORT_SYMBOL(rtc_lock); | ||
54 | DEFINE_SPINLOCK(i8253_lock); | ||
55 | EXPORT_SYMBOL(i8253_lock); | ||
56 | |||
57 | volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; | ||
58 | |||
59 | unsigned long profile_pc(struct pt_regs *regs) | ||
60 | { | ||
61 | unsigned long pc = instruction_pointer(regs); | ||
62 | |||
63 | /* Assume the lock function has either no stack frame or a copy | ||
64 | of eflags from PUSHF | ||
65 | Eflags always has bits 22 and up cleared unlike kernel addresses. */ | ||
66 | if (!user_mode(regs) && in_lock_functions(pc)) { | ||
67 | unsigned long *sp = (unsigned long *)regs->rsp; | ||
68 | if (sp[0] >> 22) | ||
69 | return sp[0]; | ||
70 | if (sp[1] >> 22) | ||
71 | return sp[1]; | ||
72 | } | ||
73 | return pc; | ||
74 | } | ||
75 | EXPORT_SYMBOL(profile_pc); | ||
76 | |||
77 | /* | ||
78 | * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 | ||
79 | * ms after the second nowtime has started, because when nowtime is written | ||
80 | * into the registers of the CMOS clock, it will jump to the next second | ||
81 | * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data | ||
82 | * sheet for details. | ||
83 | */ | ||
84 | |||
85 | static int set_rtc_mmss(unsigned long nowtime) | ||
86 | { | ||
87 | int retval = 0; | ||
88 | int real_seconds, real_minutes, cmos_minutes; | ||
89 | unsigned char control, freq_select; | ||
90 | |||
91 | /* | ||
92 | * IRQs are disabled when we're called from the timer interrupt, | ||
93 | * no need for spin_lock_irqsave() | ||
94 | */ | ||
95 | |||
96 | spin_lock(&rtc_lock); | ||
97 | |||
98 | /* | ||
99 | * Tell the clock it's being set and stop it. | ||
100 | */ | ||
101 | |||
102 | control = CMOS_READ(RTC_CONTROL); | ||
103 | CMOS_WRITE(control | RTC_SET, RTC_CONTROL); | ||
104 | |||
105 | freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
106 | CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); | ||
107 | |||
108 | cmos_minutes = CMOS_READ(RTC_MINUTES); | ||
109 | BCD_TO_BIN(cmos_minutes); | ||
110 | |||
111 | /* | ||
112 | * since we're only adjusting minutes and seconds, don't interfere with hour | ||
113 | * overflow. This avoids messing with unknown time zones but requires your RTC | ||
114 | * not to be off by more than 15 minutes. Since we're calling it only when | ||
115 | * our clock is externally synchronized using NTP, this shouldn't be a problem. | ||
116 | */ | ||
117 | |||
118 | real_seconds = nowtime % 60; | ||
119 | real_minutes = nowtime / 60; | ||
120 | if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) | ||
121 | real_minutes += 30; /* correct for half hour time zone */ | ||
122 | real_minutes %= 60; | ||
123 | |||
124 | if (abs(real_minutes - cmos_minutes) >= 30) { | ||
125 | printk(KERN_WARNING "time.c: can't update CMOS clock " | ||
126 | "from %d to %d\n", cmos_minutes, real_minutes); | ||
127 | retval = -1; | ||
128 | } else { | ||
129 | BIN_TO_BCD(real_seconds); | ||
130 | BIN_TO_BCD(real_minutes); | ||
131 | CMOS_WRITE(real_seconds, RTC_SECONDS); | ||
132 | CMOS_WRITE(real_minutes, RTC_MINUTES); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * The following flags have to be released exactly in this order, otherwise the | ||
137 | * DS12887 (popular MC146818A clone with integrated battery and quartz) will | ||
138 | * not reset the oscillator and will not update precisely 500 ms later. You | ||
139 | * won't find this mentioned in the Dallas Semiconductor data sheets, but who | ||
140 | * believes data sheets anyway ... -- Markus Kuhn | ||
141 | */ | ||
142 | |||
143 | CMOS_WRITE(control, RTC_CONTROL); | ||
144 | CMOS_WRITE(freq_select, RTC_FREQ_SELECT); | ||
145 | |||
146 | spin_unlock(&rtc_lock); | ||
147 | |||
148 | return retval; | ||
149 | } | ||
150 | |||
151 | int update_persistent_clock(struct timespec now) | ||
152 | { | ||
153 | return set_rtc_mmss(now.tv_sec); | ||
154 | } | ||
155 | |||
156 | void main_timer_handler(void) | ||
157 | { | ||
158 | /* | ||
159 | * Here we are in the timer irq handler. We have irqs locally disabled (so we | ||
160 | * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running | ||
161 | * on the other CPU, so we need a lock. We also need to lock the vsyscall | ||
162 | * variables, because both do_timer() and us change them -arca+vojtech | ||
163 | */ | ||
164 | |||
165 | write_seqlock(&xtime_lock); | ||
166 | |||
167 | /* | ||
168 | * Do the timer stuff. | ||
169 | */ | ||
170 | |||
171 | do_timer(1); | ||
172 | #ifndef CONFIG_SMP | ||
173 | update_process_times(user_mode(get_irq_regs())); | ||
174 | #endif | ||
175 | |||
176 | /* | ||
177 | * In the SMP case we use the local APIC timer interrupt to do the profiling, | ||
178 | * except when we simulate SMP mode on a uniprocessor system, in that case we | ||
179 | * have to call the local interrupt handler. | ||
180 | */ | ||
181 | |||
182 | if (!using_apic_timer) | ||
183 | smp_local_timer_interrupt(); | ||
184 | |||
185 | write_sequnlock(&xtime_lock); | ||
186 | } | ||
187 | |||
188 | static irqreturn_t timer_interrupt(int irq, void *dev_id) | ||
189 | { | ||
190 | if (apic_runs_main_timer > 1) | ||
191 | return IRQ_HANDLED; | ||
192 | main_timer_handler(); | ||
193 | if (using_apic_timer) | ||
194 | smp_send_timer_broadcast_ipi(); | ||
195 | return IRQ_HANDLED; | ||
196 | } | ||
197 | |||
198 | unsigned long read_persistent_clock(void) | ||
199 | { | ||
200 | unsigned int year, mon, day, hour, min, sec; | ||
201 | unsigned long flags; | ||
202 | unsigned century = 0; | ||
203 | |||
204 | spin_lock_irqsave(&rtc_lock, flags); | ||
205 | |||
206 | do { | ||
207 | sec = CMOS_READ(RTC_SECONDS); | ||
208 | min = CMOS_READ(RTC_MINUTES); | ||
209 | hour = CMOS_READ(RTC_HOURS); | ||
210 | day = CMOS_READ(RTC_DAY_OF_MONTH); | ||
211 | mon = CMOS_READ(RTC_MONTH); | ||
212 | year = CMOS_READ(RTC_YEAR); | ||
213 | #ifdef CONFIG_ACPI | ||
214 | if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && | ||
215 | acpi_gbl_FADT.century) | ||
216 | century = CMOS_READ(acpi_gbl_FADT.century); | ||
217 | #endif | ||
218 | } while (sec != CMOS_READ(RTC_SECONDS)); | ||
219 | |||
220 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
221 | |||
222 | /* | ||
223 | * We know that x86-64 always uses BCD format, no need to check the | ||
224 | * config register. | ||
225 | */ | ||
226 | |||
227 | BCD_TO_BIN(sec); | ||
228 | BCD_TO_BIN(min); | ||
229 | BCD_TO_BIN(hour); | ||
230 | BCD_TO_BIN(day); | ||
231 | BCD_TO_BIN(mon); | ||
232 | BCD_TO_BIN(year); | ||
233 | |||
234 | if (century) { | ||
235 | BCD_TO_BIN(century); | ||
236 | year += century * 100; | ||
237 | printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); | ||
238 | } else { | ||
239 | /* | ||
240 | * x86-64 systems only exists since 2002. | ||
241 | * This will work up to Dec 31, 2100 | ||
242 | */ | ||
243 | year += 2000; | ||
244 | } | ||
245 | |||
246 | return mktime(year, mon, day, hour, min, sec); | ||
247 | } | ||
248 | |||
249 | /* calibrate_cpu is used on systems with fixed rate TSCs to determine | ||
250 | * processor frequency */ | ||
251 | #define TICK_COUNT 100000000 | ||
252 | static unsigned int __init tsc_calibrate_cpu_khz(void) | ||
253 | { | ||
254 | int tsc_start, tsc_now; | ||
255 | int i, no_ctr_free; | ||
256 | unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; | ||
257 | unsigned long flags; | ||
258 | |||
259 | for (i = 0; i < 4; i++) | ||
260 | if (avail_to_resrv_perfctr_nmi_bit(i)) | ||
261 | break; | ||
262 | no_ctr_free = (i == 4); | ||
263 | if (no_ctr_free) { | ||
264 | i = 3; | ||
265 | rdmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
266 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
267 | rdmsrl(MSR_K7_PERFCTR3, pmc3); | ||
268 | } else { | ||
269 | reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
270 | reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
271 | } | ||
272 | local_irq_save(flags); | ||
273 | /* start meauring cycles, incrementing from 0 */ | ||
274 | wrmsrl(MSR_K7_PERFCTR0 + i, 0); | ||
275 | wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); | ||
276 | rdtscl(tsc_start); | ||
277 | do { | ||
278 | rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); | ||
279 | tsc_now = get_cycles_sync(); | ||
280 | } while ((tsc_now - tsc_start) < TICK_COUNT); | ||
281 | |||
282 | local_irq_restore(flags); | ||
283 | if (no_ctr_free) { | ||
284 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
285 | wrmsrl(MSR_K7_PERFCTR3, pmc3); | ||
286 | wrmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
287 | } else { | ||
288 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
289 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
290 | } | ||
291 | |||
292 | return pmc_now * tsc_khz / (tsc_now - tsc_start); | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * pit_calibrate_tsc() uses the speaker output (channel 2) of | ||
297 | * the PIT. This is better than using the timer interrupt output, | ||
298 | * because we can read the value of the speaker with just one inb(), | ||
299 | * where we need three i/o operations for the interrupt channel. | ||
300 | * We count how many ticks the TSC does in 50 ms. | ||
301 | */ | ||
302 | |||
303 | static unsigned int __init pit_calibrate_tsc(void) | ||
304 | { | ||
305 | unsigned long start, end; | ||
306 | unsigned long flags; | ||
307 | |||
308 | spin_lock_irqsave(&i8253_lock, flags); | ||
309 | |||
310 | outb((inb(0x61) & ~0x02) | 0x01, 0x61); | ||
311 | |||
312 | outb(0xb0, 0x43); | ||
313 | outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); | ||
314 | outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); | ||
315 | start = get_cycles_sync(); | ||
316 | while ((inb(0x61) & 0x20) == 0); | ||
317 | end = get_cycles_sync(); | ||
318 | |||
319 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
320 | |||
321 | return (end - start) / 50; | ||
322 | } | ||
323 | |||
324 | #define PIT_MODE 0x43 | ||
325 | #define PIT_CH0 0x40 | ||
326 | |||
327 | static void __pit_init(int val, u8 mode) | ||
328 | { | ||
329 | unsigned long flags; | ||
330 | |||
331 | spin_lock_irqsave(&i8253_lock, flags); | ||
332 | outb_p(mode, PIT_MODE); | ||
333 | outb_p(val & 0xff, PIT_CH0); /* LSB */ | ||
334 | outb_p(val >> 8, PIT_CH0); /* MSB */ | ||
335 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
336 | } | ||
337 | |||
338 | void __init pit_init(void) | ||
339 | { | ||
340 | __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ | ||
341 | } | ||
342 | |||
343 | void pit_stop_interrupt(void) | ||
344 | { | ||
345 | __pit_init(0, 0x30); /* mode 0 */ | ||
346 | } | ||
347 | |||
348 | void stop_timer_interrupt(void) | ||
349 | { | ||
350 | char *name; | ||
351 | if (hpet_address) { | ||
352 | name = "HPET"; | ||
353 | hpet_timer_stop_set_go(0); | ||
354 | } else { | ||
355 | name = "PIT"; | ||
356 | pit_stop_interrupt(); | ||
357 | } | ||
358 | printk(KERN_INFO "timer: %s interrupt stopped.\n", name); | ||
359 | } | ||
360 | |||
361 | static struct irqaction irq0 = { | ||
362 | .handler = timer_interrupt, | ||
363 | .flags = IRQF_DISABLED | IRQF_IRQPOLL, | ||
364 | .mask = CPU_MASK_NONE, | ||
365 | .name = "timer" | ||
366 | }; | ||
367 | |||
368 | void __init time_init(void) | ||
369 | { | ||
370 | if (nohpet) | ||
371 | hpet_address = 0; | ||
372 | |||
373 | if (hpet_arch_init()) | ||
374 | hpet_address = 0; | ||
375 | |||
376 | if (hpet_use_timer) { | ||
377 | /* set tick_nsec to use the proper rate for HPET */ | ||
378 | tick_nsec = TICK_NSEC_HPET; | ||
379 | tsc_khz = hpet_calibrate_tsc(); | ||
380 | timename = "HPET"; | ||
381 | } else { | ||
382 | pit_init(); | ||
383 | tsc_khz = pit_calibrate_tsc(); | ||
384 | timename = "PIT"; | ||
385 | } | ||
386 | |||
387 | cpu_khz = tsc_khz; | ||
388 | if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && | ||
389 | boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | ||
390 | boot_cpu_data.x86 == 16) | ||
391 | cpu_khz = tsc_calibrate_cpu_khz(); | ||
392 | |||
393 | if (unsynchronized_tsc()) | ||
394 | mark_tsc_unstable("TSCs unsynchronized"); | ||
395 | |||
396 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | ||
397 | vgetcpu_mode = VGETCPU_RDTSCP; | ||
398 | else | ||
399 | vgetcpu_mode = VGETCPU_LSL; | ||
400 | |||
401 | set_cyc2ns_scale(tsc_khz); | ||
402 | printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", | ||
403 | cpu_khz / 1000, cpu_khz % 1000); | ||
404 | init_tsc_clocksource(); | ||
405 | |||
406 | setup_irq(0, &irq0); | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * sysfs support for the timer. | ||
411 | */ | ||
412 | |||
413 | static int timer_suspend(struct sys_device *dev, pm_message_t state) | ||
414 | { | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | static int timer_resume(struct sys_device *dev) | ||
419 | { | ||
420 | if (hpet_address) | ||
421 | hpet_reenable(); | ||
422 | else | ||
423 | i8254_timer_resume(); | ||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | static struct sysdev_class timer_sysclass = { | ||
428 | .resume = timer_resume, | ||
429 | .suspend = timer_suspend, | ||
430 | set_kset_name("timer"), | ||
431 | }; | ||
432 | |||
433 | /* XXX this sysfs stuff should probably go elsewhere later -john */ | ||
434 | static struct sys_device device_timer = { | ||
435 | .id = 0, | ||
436 | .cls = &timer_sysclass, | ||
437 | }; | ||
438 | |||
439 | static int time_init_device(void) | ||
440 | { | ||
441 | int error = sysdev_class_register(&timer_sysclass); | ||
442 | if (!error) | ||
443 | error = sysdev_register(&device_timer); | ||
444 | return error; | ||
445 | } | ||
446 | |||
447 | device_initcall(time_init_device); | ||
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S deleted file mode 100644 index e7e2764c461b..000000000000 --- a/arch/x86_64/kernel/trampoline.S +++ /dev/null | |||
@@ -1,166 +0,0 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Trampoline.S Derived from Setup.S by Linus Torvalds | ||
4 | * | ||
5 | * 4 Jan 1997 Michael Chastain: changed to gnu as. | ||
6 | * 15 Sept 2005 Eric Biederman: 64bit PIC support | ||
7 | * | ||
8 | * Entry: CS:IP point to the start of our code, we are | ||
9 | * in real mode with no stack, but the rest of the | ||
10 | * trampoline page to make our stack and everything else | ||
11 | * is a mystery. | ||
12 | * | ||
13 | * In fact we don't actually need a stack so we don't | ||
14 | * set one up. | ||
15 | * | ||
16 | * On entry to trampoline_data, the processor is in real mode | ||
17 | * with 16-bit addressing and 16-bit data. CS has some value | ||
18 | * and IP is zero. Thus, data addresses need to be absolute | ||
19 | * (no relocation) and are taken with regard to r_base. | ||
20 | * | ||
21 | * With the addition of trampoline_level4_pgt this code can | ||
22 | * now enter a 64bit kernel that lives at arbitrary 64bit | ||
23 | * physical addresses. | ||
24 | * | ||
25 | * If you work on this file, check the object module with objdump | ||
26 | * --full-contents --reloc to make sure there are no relocation | ||
27 | * entries. | ||
28 | */ | ||
29 | |||
30 | #include <linux/linkage.h> | ||
31 | #include <asm/pgtable.h> | ||
32 | #include <asm/page.h> | ||
33 | #include <asm/msr.h> | ||
34 | #include <asm/segment.h> | ||
35 | |||
36 | .data | ||
37 | |||
38 | .code16 | ||
39 | |||
40 | ENTRY(trampoline_data) | ||
41 | r_base = . | ||
42 | cli # We should be safe anyway | ||
43 | wbinvd | ||
44 | mov %cs, %ax # Code and data in the same place | ||
45 | mov %ax, %ds | ||
46 | mov %ax, %es | ||
47 | mov %ax, %ss | ||
48 | |||
49 | |||
50 | movl $0xA5A5A5A5, trampoline_data - r_base | ||
51 | # write marker for master knows we're running | ||
52 | |||
53 | # Setup stack | ||
54 | movw $(trampoline_stack_end - r_base), %sp | ||
55 | |||
56 | call verify_cpu # Verify the cpu supports long mode | ||
57 | testl %eax, %eax # Check for return code | ||
58 | jnz no_longmode | ||
59 | |||
60 | mov %cs, %ax | ||
61 | movzx %ax, %esi # Find the 32bit trampoline location | ||
62 | shll $4, %esi | ||
63 | |||
64 | # Fixup the vectors | ||
65 | addl %esi, startup_32_vector - r_base | ||
66 | addl %esi, startup_64_vector - r_base | ||
67 | addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer | ||
68 | |||
69 | /* | ||
70 | * GDT tables in non default location kernel can be beyond 16MB and | ||
71 | * lgdt will not be able to load the address as in real mode default | ||
72 | * operand size is 16bit. Use lgdtl instead to force operand size | ||
73 | * to 32 bit. | ||
74 | */ | ||
75 | |||
76 | lidtl tidt - r_base # load idt with 0, 0 | ||
77 | lgdtl tgdt - r_base # load gdt with whatever is appropriate | ||
78 | |||
79 | xor %ax, %ax | ||
80 | inc %ax # protected mode (PE) bit | ||
81 | lmsw %ax # into protected mode | ||
82 | |||
83 | # flush prefetch and jump to startup_32 | ||
84 | ljmpl *(startup_32_vector - r_base) | ||
85 | |||
86 | .code32 | ||
87 | .balign 4 | ||
88 | startup_32: | ||
89 | movl $__KERNEL_DS, %eax # Initialize the %ds segment register | ||
90 | movl %eax, %ds | ||
91 | |||
92 | xorl %eax, %eax | ||
93 | btsl $5, %eax # Enable PAE mode | ||
94 | movl %eax, %cr4 | ||
95 | |||
96 | # Setup trampoline 4 level pagetables | ||
97 | leal (trampoline_level4_pgt - r_base)(%esi), %eax | ||
98 | movl %eax, %cr3 | ||
99 | |||
100 | movl $MSR_EFER, %ecx | ||
101 | movl $(1 << _EFER_LME), %eax # Enable Long Mode | ||
102 | xorl %edx, %edx | ||
103 | wrmsr | ||
104 | |||
105 | xorl %eax, %eax | ||
106 | btsl $31, %eax # Enable paging and in turn activate Long Mode | ||
107 | btsl $0, %eax # Enable protected mode | ||
108 | movl %eax, %cr0 | ||
109 | |||
110 | /* | ||
111 | * At this point we're in long mode but in 32bit compatibility mode | ||
112 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn | ||
113 | * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use | ||
114 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
115 | */ | ||
116 | ljmp *(startup_64_vector - r_base)(%esi) | ||
117 | |||
118 | .code64 | ||
119 | .balign 4 | ||
120 | startup_64: | ||
121 | # Now jump into the kernel using virtual addresses | ||
122 | movq $secondary_startup_64, %rax | ||
123 | jmp *%rax | ||
124 | |||
125 | .code16 | ||
126 | no_longmode: | ||
127 | hlt | ||
128 | jmp no_longmode | ||
129 | #include "verify_cpu.S" | ||
130 | |||
131 | # Careful these need to be in the same 64K segment as the above; | ||
132 | tidt: | ||
133 | .word 0 # idt limit = 0 | ||
134 | .word 0, 0 # idt base = 0L | ||
135 | |||
136 | # Duplicate the global descriptor table | ||
137 | # so the kernel can live anywhere | ||
138 | .balign 4 | ||
139 | tgdt: | ||
140 | .short tgdt_end - tgdt # gdt limit | ||
141 | .long tgdt - r_base | ||
142 | .short 0 | ||
143 | .quad 0x00cf9b000000ffff # __KERNEL32_CS | ||
144 | .quad 0x00af9b000000ffff # __KERNEL_CS | ||
145 | .quad 0x00cf93000000ffff # __KERNEL_DS | ||
146 | tgdt_end: | ||
147 | |||
148 | .balign 4 | ||
149 | startup_32_vector: | ||
150 | .long startup_32 - r_base | ||
151 | .word __KERNEL32_CS, 0 | ||
152 | |||
153 | .balign 4 | ||
154 | startup_64_vector: | ||
155 | .long startup_64 - r_base | ||
156 | .word __KERNEL_CS, 0 | ||
157 | |||
158 | trampoline_stack: | ||
159 | .org 0x1000 | ||
160 | trampoline_stack_end: | ||
161 | ENTRY(trampoline_level4_pgt) | ||
162 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
163 | .fill 510,8,0 | ||
164 | .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE | ||
165 | |||
166 | ENTRY(trampoline_end) | ||
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c deleted file mode 100644 index 03888420775d..000000000000 --- a/arch/x86_64/kernel/traps.c +++ /dev/null | |||
@@ -1,1138 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/traps.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * 'Traps.c' handles hardware traps and faults after we have saved some | ||
13 | * state in 'entry.S'. | ||
14 | */ | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/ptrace.h> | ||
20 | #include <linux/timer.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/delay.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/kallsyms.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/moduleparam.h> | ||
29 | #include <linux/nmi.h> | ||
30 | #include <linux/kprobes.h> | ||
31 | #include <linux/kexec.h> | ||
32 | #include <linux/unwind.h> | ||
33 | #include <linux/uaccess.h> | ||
34 | #include <linux/bug.h> | ||
35 | #include <linux/kdebug.h> | ||
36 | |||
37 | #if defined(CONFIG_EDAC) | ||
38 | #include <linux/edac.h> | ||
39 | #endif | ||
40 | |||
41 | #include <asm/system.h> | ||
42 | #include <asm/io.h> | ||
43 | #include <asm/atomic.h> | ||
44 | #include <asm/debugreg.h> | ||
45 | #include <asm/desc.h> | ||
46 | #include <asm/i387.h> | ||
47 | #include <asm/processor.h> | ||
48 | #include <asm/unwind.h> | ||
49 | #include <asm/smp.h> | ||
50 | #include <asm/pgalloc.h> | ||
51 | #include <asm/pda.h> | ||
52 | #include <asm/proto.h> | ||
53 | #include <asm/nmi.h> | ||
54 | #include <asm/stacktrace.h> | ||
55 | |||
56 | asmlinkage void divide_error(void); | ||
57 | asmlinkage void debug(void); | ||
58 | asmlinkage void nmi(void); | ||
59 | asmlinkage void int3(void); | ||
60 | asmlinkage void overflow(void); | ||
61 | asmlinkage void bounds(void); | ||
62 | asmlinkage void invalid_op(void); | ||
63 | asmlinkage void device_not_available(void); | ||
64 | asmlinkage void double_fault(void); | ||
65 | asmlinkage void coprocessor_segment_overrun(void); | ||
66 | asmlinkage void invalid_TSS(void); | ||
67 | asmlinkage void segment_not_present(void); | ||
68 | asmlinkage void stack_segment(void); | ||
69 | asmlinkage void general_protection(void); | ||
70 | asmlinkage void page_fault(void); | ||
71 | asmlinkage void coprocessor_error(void); | ||
72 | asmlinkage void simd_coprocessor_error(void); | ||
73 | asmlinkage void reserved(void); | ||
74 | asmlinkage void alignment_check(void); | ||
75 | asmlinkage void machine_check(void); | ||
76 | asmlinkage void spurious_interrupt_bug(void); | ||
77 | |||
78 | static inline void conditional_sti(struct pt_regs *regs) | ||
79 | { | ||
80 | if (regs->eflags & X86_EFLAGS_IF) | ||
81 | local_irq_enable(); | ||
82 | } | ||
83 | |||
84 | static inline void preempt_conditional_sti(struct pt_regs *regs) | ||
85 | { | ||
86 | preempt_disable(); | ||
87 | if (regs->eflags & X86_EFLAGS_IF) | ||
88 | local_irq_enable(); | ||
89 | } | ||
90 | |||
91 | static inline void preempt_conditional_cli(struct pt_regs *regs) | ||
92 | { | ||
93 | if (regs->eflags & X86_EFLAGS_IF) | ||
94 | local_irq_disable(); | ||
95 | /* Make sure to not schedule here because we could be running | ||
96 | on an exception stack. */ | ||
97 | preempt_enable_no_resched(); | ||
98 | } | ||
99 | |||
100 | int kstack_depth_to_print = 12; | ||
101 | |||
102 | #ifdef CONFIG_KALLSYMS | ||
103 | void printk_address(unsigned long address) | ||
104 | { | ||
105 | unsigned long offset = 0, symsize; | ||
106 | const char *symname; | ||
107 | char *modname; | ||
108 | char *delim = ":"; | ||
109 | char namebuf[128]; | ||
110 | |||
111 | symname = kallsyms_lookup(address, &symsize, &offset, | ||
112 | &modname, namebuf); | ||
113 | if (!symname) { | ||
114 | printk(" [<%016lx>]\n", address); | ||
115 | return; | ||
116 | } | ||
117 | if (!modname) | ||
118 | modname = delim = ""; | ||
119 | printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", | ||
120 | address, delim, modname, delim, symname, offset, symsize); | ||
121 | } | ||
122 | #else | ||
123 | void printk_address(unsigned long address) | ||
124 | { | ||
125 | printk(" [<%016lx>]\n", address); | ||
126 | } | ||
127 | #endif | ||
128 | |||
129 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | ||
130 | unsigned *usedp, char **idp) | ||
131 | { | ||
132 | static char ids[][8] = { | ||
133 | [DEBUG_STACK - 1] = "#DB", | ||
134 | [NMI_STACK - 1] = "NMI", | ||
135 | [DOUBLEFAULT_STACK - 1] = "#DF", | ||
136 | [STACKFAULT_STACK - 1] = "#SS", | ||
137 | [MCE_STACK - 1] = "#MC", | ||
138 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
139 | [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | ||
140 | #endif | ||
141 | }; | ||
142 | unsigned k; | ||
143 | |||
144 | /* | ||
145 | * Iterate over all exception stacks, and figure out whether | ||
146 | * 'stack' is in one of them: | ||
147 | */ | ||
148 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
149 | unsigned long end = per_cpu(orig_ist, cpu).ist[k]; | ||
150 | /* | ||
151 | * Is 'stack' above this exception frame's end? | ||
152 | * If yes then skip to the next frame. | ||
153 | */ | ||
154 | if (stack >= end) | ||
155 | continue; | ||
156 | /* | ||
157 | * Is 'stack' above this exception frame's start address? | ||
158 | * If yes then we found the right frame. | ||
159 | */ | ||
160 | if (stack >= end - EXCEPTION_STKSZ) { | ||
161 | /* | ||
162 | * Make sure we only iterate through an exception | ||
163 | * stack once. If it comes up for the second time | ||
164 | * then there's something wrong going on - just | ||
165 | * break out and return NULL: | ||
166 | */ | ||
167 | if (*usedp & (1U << k)) | ||
168 | break; | ||
169 | *usedp |= 1U << k; | ||
170 | *idp = ids[k]; | ||
171 | return (unsigned long *)end; | ||
172 | } | ||
173 | /* | ||
174 | * If this is a debug stack, and if it has a larger size than | ||
175 | * the usual exception stacks, then 'stack' might still | ||
176 | * be within the lower portion of the debug stack: | ||
177 | */ | ||
178 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
179 | if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { | ||
180 | unsigned j = N_EXCEPTION_STACKS - 1; | ||
181 | |||
182 | /* | ||
183 | * Black magic. A large debug stack is composed of | ||
184 | * multiple exception stack entries, which we | ||
185 | * iterate through now. Dont look: | ||
186 | */ | ||
187 | do { | ||
188 | ++j; | ||
189 | end -= EXCEPTION_STKSZ; | ||
190 | ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | ||
191 | } while (stack < end - EXCEPTION_STKSZ); | ||
192 | if (*usedp & (1U << j)) | ||
193 | break; | ||
194 | *usedp |= 1U << j; | ||
195 | *idp = ids[j]; | ||
196 | return (unsigned long *)end; | ||
197 | } | ||
198 | #endif | ||
199 | } | ||
200 | return NULL; | ||
201 | } | ||
202 | |||
203 | #define MSG(txt) ops->warning(data, txt) | ||
204 | |||
205 | /* | ||
206 | * x86-64 can have upto three kernel stacks: | ||
207 | * process stack | ||
208 | * interrupt stack | ||
209 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | ||
210 | */ | ||
211 | |||
212 | static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) | ||
213 | { | ||
214 | void *t = (void *)tinfo; | ||
215 | return p > t && p < t + THREAD_SIZE - 3; | ||
216 | } | ||
217 | |||
218 | void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | ||
219 | unsigned long *stack, | ||
220 | struct stacktrace_ops *ops, void *data) | ||
221 | { | ||
222 | const unsigned cpu = get_cpu(); | ||
223 | unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; | ||
224 | unsigned used = 0; | ||
225 | struct thread_info *tinfo; | ||
226 | |||
227 | if (!tsk) | ||
228 | tsk = current; | ||
229 | |||
230 | if (!stack) { | ||
231 | unsigned long dummy; | ||
232 | stack = &dummy; | ||
233 | if (tsk && tsk != current) | ||
234 | stack = (unsigned long *)tsk->thread.rsp; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Print function call entries within a stack. 'cond' is the | ||
239 | * "end of stackframe" condition, that the 'stack++' | ||
240 | * iteration will eventually trigger. | ||
241 | */ | ||
242 | #define HANDLE_STACK(cond) \ | ||
243 | do while (cond) { \ | ||
244 | unsigned long addr = *stack++; \ | ||
245 | /* Use unlocked access here because except for NMIs \ | ||
246 | we should be already protected against module unloads */ \ | ||
247 | if (__kernel_text_address(addr)) { \ | ||
248 | /* \ | ||
249 | * If the address is either in the text segment of the \ | ||
250 | * kernel, or in the region which contains vmalloc'ed \ | ||
251 | * memory, it *may* be the address of a calling \ | ||
252 | * routine; if so, print it so that someone tracing \ | ||
253 | * down the cause of the crash will be able to figure \ | ||
254 | * out the call path that was taken. \ | ||
255 | */ \ | ||
256 | ops->address(data, addr); \ | ||
257 | } \ | ||
258 | } while (0) | ||
259 | |||
260 | /* | ||
261 | * Print function call entries in all stacks, starting at the | ||
262 | * current stack address. If the stacks consist of nested | ||
263 | * exceptions | ||
264 | */ | ||
265 | for (;;) { | ||
266 | char *id; | ||
267 | unsigned long *estack_end; | ||
268 | estack_end = in_exception_stack(cpu, (unsigned long)stack, | ||
269 | &used, &id); | ||
270 | |||
271 | if (estack_end) { | ||
272 | if (ops->stack(data, id) < 0) | ||
273 | break; | ||
274 | HANDLE_STACK (stack < estack_end); | ||
275 | ops->stack(data, "<EOE>"); | ||
276 | /* | ||
277 | * We link to the next stack via the | ||
278 | * second-to-last pointer (index -2 to end) in the | ||
279 | * exception stack: | ||
280 | */ | ||
281 | stack = (unsigned long *) estack_end[-2]; | ||
282 | continue; | ||
283 | } | ||
284 | if (irqstack_end) { | ||
285 | unsigned long *irqstack; | ||
286 | irqstack = irqstack_end - | ||
287 | (IRQSTACKSIZE - 64) / sizeof(*irqstack); | ||
288 | |||
289 | if (stack >= irqstack && stack < irqstack_end) { | ||
290 | if (ops->stack(data, "IRQ") < 0) | ||
291 | break; | ||
292 | HANDLE_STACK (stack < irqstack_end); | ||
293 | /* | ||
294 | * We link to the next stack (which would be | ||
295 | * the process stack normally) the last | ||
296 | * pointer (index -1 to end) in the IRQ stack: | ||
297 | */ | ||
298 | stack = (unsigned long *) (irqstack_end[-1]); | ||
299 | irqstack_end = NULL; | ||
300 | ops->stack(data, "EOI"); | ||
301 | continue; | ||
302 | } | ||
303 | } | ||
304 | break; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * This handles the process stack: | ||
309 | */ | ||
310 | tinfo = task_thread_info(tsk); | ||
311 | HANDLE_STACK (valid_stack_ptr(tinfo, stack)); | ||
312 | #undef HANDLE_STACK | ||
313 | put_cpu(); | ||
314 | } | ||
315 | EXPORT_SYMBOL(dump_trace); | ||
316 | |||
317 | static void | ||
318 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
319 | { | ||
320 | print_symbol(msg, symbol); | ||
321 | printk("\n"); | ||
322 | } | ||
323 | |||
324 | static void print_trace_warning(void *data, char *msg) | ||
325 | { | ||
326 | printk("%s\n", msg); | ||
327 | } | ||
328 | |||
329 | static int print_trace_stack(void *data, char *name) | ||
330 | { | ||
331 | printk(" <%s> ", name); | ||
332 | return 0; | ||
333 | } | ||
334 | |||
335 | static void print_trace_address(void *data, unsigned long addr) | ||
336 | { | ||
337 | touch_nmi_watchdog(); | ||
338 | printk_address(addr); | ||
339 | } | ||
340 | |||
341 | static struct stacktrace_ops print_trace_ops = { | ||
342 | .warning = print_trace_warning, | ||
343 | .warning_symbol = print_trace_warning_symbol, | ||
344 | .stack = print_trace_stack, | ||
345 | .address = print_trace_address, | ||
346 | }; | ||
347 | |||
348 | void | ||
349 | show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) | ||
350 | { | ||
351 | printk("\nCall Trace:\n"); | ||
352 | dump_trace(tsk, regs, stack, &print_trace_ops, NULL); | ||
353 | printk("\n"); | ||
354 | } | ||
355 | |||
356 | static void | ||
357 | _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) | ||
358 | { | ||
359 | unsigned long *stack; | ||
360 | int i; | ||
361 | const int cpu = smp_processor_id(); | ||
362 | unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); | ||
363 | unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); | ||
364 | |||
365 | // debugging aid: "show_stack(NULL, NULL);" prints the | ||
366 | // back trace for this cpu. | ||
367 | |||
368 | if (rsp == NULL) { | ||
369 | if (tsk) | ||
370 | rsp = (unsigned long *)tsk->thread.rsp; | ||
371 | else | ||
372 | rsp = (unsigned long *)&rsp; | ||
373 | } | ||
374 | |||
375 | stack = rsp; | ||
376 | for(i=0; i < kstack_depth_to_print; i++) { | ||
377 | if (stack >= irqstack && stack <= irqstack_end) { | ||
378 | if (stack == irqstack_end) { | ||
379 | stack = (unsigned long *) (irqstack_end[-1]); | ||
380 | printk(" <EOI> "); | ||
381 | } | ||
382 | } else { | ||
383 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
384 | break; | ||
385 | } | ||
386 | if (i && ((i % 4) == 0)) | ||
387 | printk("\n"); | ||
388 | printk(" %016lx", *stack++); | ||
389 | touch_nmi_watchdog(); | ||
390 | } | ||
391 | show_trace(tsk, regs, rsp); | ||
392 | } | ||
393 | |||
394 | void show_stack(struct task_struct *tsk, unsigned long * rsp) | ||
395 | { | ||
396 | _show_stack(tsk, NULL, rsp); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * The architecture-independent dump_stack generator | ||
401 | */ | ||
402 | void dump_stack(void) | ||
403 | { | ||
404 | unsigned long dummy; | ||
405 | show_trace(NULL, NULL, &dummy); | ||
406 | } | ||
407 | |||
408 | EXPORT_SYMBOL(dump_stack); | ||
409 | |||
410 | void show_registers(struct pt_regs *regs) | ||
411 | { | ||
412 | int i; | ||
413 | int in_kernel = !user_mode(regs); | ||
414 | unsigned long rsp; | ||
415 | const int cpu = smp_processor_id(); | ||
416 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | ||
417 | |||
418 | rsp = regs->rsp; | ||
419 | printk("CPU %d ", cpu); | ||
420 | __show_regs(regs); | ||
421 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
422 | cur->comm, cur->pid, task_thread_info(cur), cur); | ||
423 | |||
424 | /* | ||
425 | * When in-kernel, we also print out the stack and code at the | ||
426 | * time of the fault.. | ||
427 | */ | ||
428 | if (in_kernel) { | ||
429 | printk("Stack: "); | ||
430 | _show_stack(NULL, regs, (unsigned long*)rsp); | ||
431 | |||
432 | printk("\nCode: "); | ||
433 | if (regs->rip < PAGE_OFFSET) | ||
434 | goto bad; | ||
435 | |||
436 | for (i=0; i<20; i++) { | ||
437 | unsigned char c; | ||
438 | if (__get_user(c, &((unsigned char*)regs->rip)[i])) { | ||
439 | bad: | ||
440 | printk(" Bad RIP value."); | ||
441 | break; | ||
442 | } | ||
443 | printk("%02x ", c); | ||
444 | } | ||
445 | } | ||
446 | printk("\n"); | ||
447 | } | ||
448 | |||
449 | int is_valid_bugaddr(unsigned long rip) | ||
450 | { | ||
451 | unsigned short ud2; | ||
452 | |||
453 | if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) | ||
454 | return 0; | ||
455 | |||
456 | return ud2 == 0x0b0f; | ||
457 | } | ||
458 | |||
459 | #ifdef CONFIG_BUG | ||
460 | void out_of_line_bug(void) | ||
461 | { | ||
462 | BUG(); | ||
463 | } | ||
464 | EXPORT_SYMBOL(out_of_line_bug); | ||
465 | #endif | ||
466 | |||
467 | static DEFINE_SPINLOCK(die_lock); | ||
468 | static int die_owner = -1; | ||
469 | static unsigned int die_nest_count; | ||
470 | |||
471 | unsigned __kprobes long oops_begin(void) | ||
472 | { | ||
473 | int cpu; | ||
474 | unsigned long flags; | ||
475 | |||
476 | oops_enter(); | ||
477 | |||
478 | /* racy, but better than risking deadlock. */ | ||
479 | local_irq_save(flags); | ||
480 | cpu = smp_processor_id(); | ||
481 | if (!spin_trylock(&die_lock)) { | ||
482 | if (cpu == die_owner) | ||
483 | /* nested oops. should stop eventually */; | ||
484 | else | ||
485 | spin_lock(&die_lock); | ||
486 | } | ||
487 | die_nest_count++; | ||
488 | die_owner = cpu; | ||
489 | console_verbose(); | ||
490 | bust_spinlocks(1); | ||
491 | return flags; | ||
492 | } | ||
493 | |||
494 | void __kprobes oops_end(unsigned long flags) | ||
495 | { | ||
496 | die_owner = -1; | ||
497 | bust_spinlocks(0); | ||
498 | die_nest_count--; | ||
499 | if (die_nest_count) | ||
500 | /* We still own the lock */ | ||
501 | local_irq_restore(flags); | ||
502 | else | ||
503 | /* Nest count reaches zero, release the lock. */ | ||
504 | spin_unlock_irqrestore(&die_lock, flags); | ||
505 | if (panic_on_oops) | ||
506 | panic("Fatal exception"); | ||
507 | oops_exit(); | ||
508 | } | ||
509 | |||
510 | void __kprobes __die(const char * str, struct pt_regs * regs, long err) | ||
511 | { | ||
512 | static int die_counter; | ||
513 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); | ||
514 | #ifdef CONFIG_PREEMPT | ||
515 | printk("PREEMPT "); | ||
516 | #endif | ||
517 | #ifdef CONFIG_SMP | ||
518 | printk("SMP "); | ||
519 | #endif | ||
520 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
521 | printk("DEBUG_PAGEALLOC"); | ||
522 | #endif | ||
523 | printk("\n"); | ||
524 | notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); | ||
525 | show_registers(regs); | ||
526 | add_taint(TAINT_DIE); | ||
527 | /* Executive summary in case the oops scrolled away */ | ||
528 | printk(KERN_ALERT "RIP "); | ||
529 | printk_address(regs->rip); | ||
530 | printk(" RSP <%016lx>\n", regs->rsp); | ||
531 | if (kexec_should_crash(current)) | ||
532 | crash_kexec(regs); | ||
533 | } | ||
534 | |||
535 | void die(const char * str, struct pt_regs * regs, long err) | ||
536 | { | ||
537 | unsigned long flags = oops_begin(); | ||
538 | |||
539 | if (!user_mode(regs)) | ||
540 | report_bug(regs->rip, regs); | ||
541 | |||
542 | __die(str, regs, err); | ||
543 | oops_end(flags); | ||
544 | do_exit(SIGSEGV); | ||
545 | } | ||
546 | |||
547 | void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
548 | { | ||
549 | unsigned long flags = oops_begin(); | ||
550 | |||
551 | /* | ||
552 | * We are in trouble anyway, lets at least try | ||
553 | * to get a message out. | ||
554 | */ | ||
555 | printk(str, smp_processor_id()); | ||
556 | show_registers(regs); | ||
557 | if (kexec_should_crash(current)) | ||
558 | crash_kexec(regs); | ||
559 | if (do_panic || panic_on_oops) | ||
560 | panic("Non maskable interrupt"); | ||
561 | oops_end(flags); | ||
562 | nmi_exit(); | ||
563 | local_irq_enable(); | ||
564 | do_exit(SIGSEGV); | ||
565 | } | ||
566 | |||
567 | static void __kprobes do_trap(int trapnr, int signr, char *str, | ||
568 | struct pt_regs * regs, long error_code, | ||
569 | siginfo_t *info) | ||
570 | { | ||
571 | struct task_struct *tsk = current; | ||
572 | |||
573 | if (user_mode(regs)) { | ||
574 | /* | ||
575 | * We want error_code and trap_no set for userspace | ||
576 | * faults and kernelspace faults which result in | ||
577 | * die(), but not kernelspace faults which are fixed | ||
578 | * up. die() gives the process no chance to handle | ||
579 | * the signal and notice the kernel fault information, | ||
580 | * so that won't result in polluting the information | ||
581 | * about previously queued, but not yet delivered, | ||
582 | * faults. See also do_general_protection below. | ||
583 | */ | ||
584 | tsk->thread.error_code = error_code; | ||
585 | tsk->thread.trap_no = trapnr; | ||
586 | |||
587 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
588 | printk_ratelimit()) | ||
589 | printk(KERN_INFO | ||
590 | "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", | ||
591 | tsk->comm, tsk->pid, str, | ||
592 | regs->rip, regs->rsp, error_code); | ||
593 | |||
594 | if (info) | ||
595 | force_sig_info(signr, info, tsk); | ||
596 | else | ||
597 | force_sig(signr, tsk); | ||
598 | return; | ||
599 | } | ||
600 | |||
601 | |||
602 | /* kernel trap */ | ||
603 | { | ||
604 | const struct exception_table_entry *fixup; | ||
605 | fixup = search_exception_tables(regs->rip); | ||
606 | if (fixup) | ||
607 | regs->rip = fixup->fixup; | ||
608 | else { | ||
609 | tsk->thread.error_code = error_code; | ||
610 | tsk->thread.trap_no = trapnr; | ||
611 | die(str, regs, error_code); | ||
612 | } | ||
613 | return; | ||
614 | } | ||
615 | } | ||
616 | |||
617 | #define DO_ERROR(trapnr, signr, str, name) \ | ||
618 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
619 | { \ | ||
620 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
621 | == NOTIFY_STOP) \ | ||
622 | return; \ | ||
623 | conditional_sti(regs); \ | ||
624 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
625 | } | ||
626 | |||
627 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
628 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
629 | { \ | ||
630 | siginfo_t info; \ | ||
631 | info.si_signo = signr; \ | ||
632 | info.si_errno = 0; \ | ||
633 | info.si_code = sicode; \ | ||
634 | info.si_addr = (void __user *)siaddr; \ | ||
635 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
636 | == NOTIFY_STOP) \ | ||
637 | return; \ | ||
638 | conditional_sti(regs); \ | ||
639 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
640 | } | ||
641 | |||
642 | DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) | ||
643 | DO_ERROR( 4, SIGSEGV, "overflow", overflow) | ||
644 | DO_ERROR( 5, SIGSEGV, "bounds", bounds) | ||
645 | DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) | ||
646 | DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) | ||
647 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
648 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | ||
649 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | ||
650 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | ||
651 | DO_ERROR(18, SIGSEGV, "reserved", reserved) | ||
652 | |||
653 | /* Runs on IST stack */ | ||
654 | asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) | ||
655 | { | ||
656 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | ||
657 | 12, SIGBUS) == NOTIFY_STOP) | ||
658 | return; | ||
659 | preempt_conditional_sti(regs); | ||
660 | do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); | ||
661 | preempt_conditional_cli(regs); | ||
662 | } | ||
663 | |||
664 | asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) | ||
665 | { | ||
666 | static const char str[] = "double fault"; | ||
667 | struct task_struct *tsk = current; | ||
668 | |||
669 | /* Return not checked because double check cannot be ignored */ | ||
670 | notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); | ||
671 | |||
672 | tsk->thread.error_code = error_code; | ||
673 | tsk->thread.trap_no = 8; | ||
674 | |||
675 | /* This is always a kernel trap and never fixable (and thus must | ||
676 | never return). */ | ||
677 | for (;;) | ||
678 | die(str, regs, error_code); | ||
679 | } | ||
680 | |||
681 | asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, | ||
682 | long error_code) | ||
683 | { | ||
684 | struct task_struct *tsk = current; | ||
685 | |||
686 | conditional_sti(regs); | ||
687 | |||
688 | if (user_mode(regs)) { | ||
689 | tsk->thread.error_code = error_code; | ||
690 | tsk->thread.trap_no = 13; | ||
691 | |||
692 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
693 | printk_ratelimit()) | ||
694 | printk(KERN_INFO | ||
695 | "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", | ||
696 | tsk->comm, tsk->pid, | ||
697 | regs->rip, regs->rsp, error_code); | ||
698 | |||
699 | force_sig(SIGSEGV, tsk); | ||
700 | return; | ||
701 | } | ||
702 | |||
703 | /* kernel gp */ | ||
704 | { | ||
705 | const struct exception_table_entry *fixup; | ||
706 | fixup = search_exception_tables(regs->rip); | ||
707 | if (fixup) { | ||
708 | regs->rip = fixup->fixup; | ||
709 | return; | ||
710 | } | ||
711 | |||
712 | tsk->thread.error_code = error_code; | ||
713 | tsk->thread.trap_no = 13; | ||
714 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
715 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | ||
716 | return; | ||
717 | die("general protection fault", regs, error_code); | ||
718 | } | ||
719 | } | ||
720 | |||
721 | static __kprobes void | ||
722 | mem_parity_error(unsigned char reason, struct pt_regs * regs) | ||
723 | { | ||
724 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
725 | reason); | ||
726 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | ||
727 | |||
728 | #if defined(CONFIG_EDAC) | ||
729 | if(edac_handler_set()) { | ||
730 | edac_atomic_assert_error(); | ||
731 | return; | ||
732 | } | ||
733 | #endif | ||
734 | |||
735 | if (panic_on_unrecovered_nmi) | ||
736 | panic("NMI: Not continuing"); | ||
737 | |||
738 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
739 | |||
740 | /* Clear and disable the memory parity error line. */ | ||
741 | reason = (reason & 0xf) | 4; | ||
742 | outb(reason, 0x61); | ||
743 | } | ||
744 | |||
745 | static __kprobes void | ||
746 | io_check_error(unsigned char reason, struct pt_regs * regs) | ||
747 | { | ||
748 | printk("NMI: IOCK error (debug interrupt?)\n"); | ||
749 | show_registers(regs); | ||
750 | |||
751 | /* Re-enable the IOCK line, wait for a few seconds */ | ||
752 | reason = (reason & 0xf) | 8; | ||
753 | outb(reason, 0x61); | ||
754 | mdelay(2000); | ||
755 | reason &= ~8; | ||
756 | outb(reason, 0x61); | ||
757 | } | ||
758 | |||
759 | static __kprobes void | ||
760 | unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | ||
761 | { | ||
762 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
763 | reason); | ||
764 | printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); | ||
765 | |||
766 | if (panic_on_unrecovered_nmi) | ||
767 | panic("NMI: Not continuing"); | ||
768 | |||
769 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
770 | } | ||
771 | |||
772 | /* Runs on IST stack. This code must keep interrupts off all the time. | ||
773 | Nested NMIs are prevented by the CPU. */ | ||
774 | asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) | ||
775 | { | ||
776 | unsigned char reason = 0; | ||
777 | int cpu; | ||
778 | |||
779 | cpu = smp_processor_id(); | ||
780 | |||
781 | /* Only the BSP gets external NMIs from the system. */ | ||
782 | if (!cpu) | ||
783 | reason = get_nmi_reason(); | ||
784 | |||
785 | if (!(reason & 0xc0)) { | ||
786 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | ||
787 | == NOTIFY_STOP) | ||
788 | return; | ||
789 | /* | ||
790 | * Ok, so this is none of the documented NMI sources, | ||
791 | * so it must be the NMI watchdog. | ||
792 | */ | ||
793 | if (nmi_watchdog_tick(regs,reason)) | ||
794 | return; | ||
795 | if (!do_nmi_callback(regs,cpu)) | ||
796 | unknown_nmi_error(reason, regs); | ||
797 | |||
798 | return; | ||
799 | } | ||
800 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | ||
801 | return; | ||
802 | |||
803 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | ||
804 | |||
805 | if (reason & 0x80) | ||
806 | mem_parity_error(reason, regs); | ||
807 | if (reason & 0x40) | ||
808 | io_check_error(reason, regs); | ||
809 | } | ||
810 | |||
811 | /* runs on IST stack. */ | ||
812 | asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) | ||
813 | { | ||
814 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { | ||
815 | return; | ||
816 | } | ||
817 | preempt_conditional_sti(regs); | ||
818 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | ||
819 | preempt_conditional_cli(regs); | ||
820 | } | ||
821 | |||
822 | /* Help handler running on IST stack to switch back to user stack | ||
823 | for scheduling or signal handling. The actual stack switch is done in | ||
824 | entry.S */ | ||
825 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | ||
826 | { | ||
827 | struct pt_regs *regs = eregs; | ||
828 | /* Did already sync */ | ||
829 | if (eregs == (struct pt_regs *)eregs->rsp) | ||
830 | ; | ||
831 | /* Exception from user space */ | ||
832 | else if (user_mode(eregs)) | ||
833 | regs = task_pt_regs(current); | ||
834 | /* Exception from kernel and interrupts are enabled. Move to | ||
835 | kernel process stack. */ | ||
836 | else if (eregs->eflags & X86_EFLAGS_IF) | ||
837 | regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); | ||
838 | if (eregs != regs) | ||
839 | *regs = *eregs; | ||
840 | return regs; | ||
841 | } | ||
842 | |||
843 | /* runs on IST stack. */ | ||
844 | asmlinkage void __kprobes do_debug(struct pt_regs * regs, | ||
845 | unsigned long error_code) | ||
846 | { | ||
847 | unsigned long condition; | ||
848 | struct task_struct *tsk = current; | ||
849 | siginfo_t info; | ||
850 | |||
851 | get_debugreg(condition, 6); | ||
852 | |||
853 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | ||
854 | SIGTRAP) == NOTIFY_STOP) | ||
855 | return; | ||
856 | |||
857 | preempt_conditional_sti(regs); | ||
858 | |||
859 | /* Mask out spurious debug traps due to lazy DR7 setting */ | ||
860 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | ||
861 | if (!tsk->thread.debugreg7) { | ||
862 | goto clear_dr7; | ||
863 | } | ||
864 | } | ||
865 | |||
866 | tsk->thread.debugreg6 = condition; | ||
867 | |||
868 | /* Mask out spurious TF errors due to lazy TF clearing */ | ||
869 | if (condition & DR_STEP) { | ||
870 | /* | ||
871 | * The TF error should be masked out only if the current | ||
872 | * process is not traced and if the TRAP flag has been set | ||
873 | * previously by a tracing process (condition detected by | ||
874 | * the PT_DTRACE flag); remember that the i386 TRAP flag | ||
875 | * can be modified by the process itself in user mode, | ||
876 | * allowing programs to debug themselves without the ptrace() | ||
877 | * interface. | ||
878 | */ | ||
879 | if (!user_mode(regs)) | ||
880 | goto clear_TF_reenable; | ||
881 | /* | ||
882 | * Was the TF flag set by a debugger? If so, clear it now, | ||
883 | * so that register information is correct. | ||
884 | */ | ||
885 | if (tsk->ptrace & PT_DTRACE) { | ||
886 | regs->eflags &= ~TF_MASK; | ||
887 | tsk->ptrace &= ~PT_DTRACE; | ||
888 | } | ||
889 | } | ||
890 | |||
891 | /* Ok, finally something we can handle */ | ||
892 | tsk->thread.trap_no = 1; | ||
893 | tsk->thread.error_code = error_code; | ||
894 | info.si_signo = SIGTRAP; | ||
895 | info.si_errno = 0; | ||
896 | info.si_code = TRAP_BRKPT; | ||
897 | info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; | ||
898 | force_sig_info(SIGTRAP, &info, tsk); | ||
899 | |||
900 | clear_dr7: | ||
901 | set_debugreg(0UL, 7); | ||
902 | preempt_conditional_cli(regs); | ||
903 | return; | ||
904 | |||
905 | clear_TF_reenable: | ||
906 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | ||
907 | regs->eflags &= ~TF_MASK; | ||
908 | preempt_conditional_cli(regs); | ||
909 | } | ||
910 | |||
911 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | ||
912 | { | ||
913 | const struct exception_table_entry *fixup; | ||
914 | fixup = search_exception_tables(regs->rip); | ||
915 | if (fixup) { | ||
916 | regs->rip = fixup->fixup; | ||
917 | return 1; | ||
918 | } | ||
919 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | ||
920 | /* Illegal floating point operation in the kernel */ | ||
921 | current->thread.trap_no = trapnr; | ||
922 | die(str, regs, 0); | ||
923 | return 0; | ||
924 | } | ||
925 | |||
926 | /* | ||
927 | * Note that we play around with the 'TS' bit in an attempt to get | ||
928 | * the correct behaviour even in the presence of the asynchronous | ||
929 | * IRQ13 behaviour | ||
930 | */ | ||
931 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | ||
932 | { | ||
933 | void __user *rip = (void __user *)(regs->rip); | ||
934 | struct task_struct * task; | ||
935 | siginfo_t info; | ||
936 | unsigned short cwd, swd; | ||
937 | |||
938 | conditional_sti(regs); | ||
939 | if (!user_mode(regs) && | ||
940 | kernel_math_error(regs, "kernel x87 math error", 16)) | ||
941 | return; | ||
942 | |||
943 | /* | ||
944 | * Save the info for the exception handler and clear the error. | ||
945 | */ | ||
946 | task = current; | ||
947 | save_init_fpu(task); | ||
948 | task->thread.trap_no = 16; | ||
949 | task->thread.error_code = 0; | ||
950 | info.si_signo = SIGFPE; | ||
951 | info.si_errno = 0; | ||
952 | info.si_code = __SI_FAULT; | ||
953 | info.si_addr = rip; | ||
954 | /* | ||
955 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | ||
956 | * status. 0x3f is the exception bits in these regs, 0x200 is the | ||
957 | * C1 reg you need in case of a stack fault, 0x040 is the stack | ||
958 | * fault bit. We should only be taking one exception at a time, | ||
959 | * so if this combination doesn't produce any single exception, | ||
960 | * then we have a bad program that isn't synchronizing its FPU usage | ||
961 | * and it will suffer the consequences since we won't be able to | ||
962 | * fully reproduce the context of the exception | ||
963 | */ | ||
964 | cwd = get_fpu_cwd(task); | ||
965 | swd = get_fpu_swd(task); | ||
966 | switch (swd & ~cwd & 0x3f) { | ||
967 | case 0x000: | ||
968 | default: | ||
969 | break; | ||
970 | case 0x001: /* Invalid Op */ | ||
971 | /* | ||
972 | * swd & 0x240 == 0x040: Stack Underflow | ||
973 | * swd & 0x240 == 0x240: Stack Overflow | ||
974 | * User must clear the SF bit (0x40) if set | ||
975 | */ | ||
976 | info.si_code = FPE_FLTINV; | ||
977 | break; | ||
978 | case 0x002: /* Denormalize */ | ||
979 | case 0x010: /* Underflow */ | ||
980 | info.si_code = FPE_FLTUND; | ||
981 | break; | ||
982 | case 0x004: /* Zero Divide */ | ||
983 | info.si_code = FPE_FLTDIV; | ||
984 | break; | ||
985 | case 0x008: /* Overflow */ | ||
986 | info.si_code = FPE_FLTOVF; | ||
987 | break; | ||
988 | case 0x020: /* Precision */ | ||
989 | info.si_code = FPE_FLTRES; | ||
990 | break; | ||
991 | } | ||
992 | force_sig_info(SIGFPE, &info, task); | ||
993 | } | ||
994 | |||
995 | asmlinkage void bad_intr(void) | ||
996 | { | ||
997 | printk("bad interrupt"); | ||
998 | } | ||
999 | |||
1000 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | ||
1001 | { | ||
1002 | void __user *rip = (void __user *)(regs->rip); | ||
1003 | struct task_struct * task; | ||
1004 | siginfo_t info; | ||
1005 | unsigned short mxcsr; | ||
1006 | |||
1007 | conditional_sti(regs); | ||
1008 | if (!user_mode(regs) && | ||
1009 | kernel_math_error(regs, "kernel simd math error", 19)) | ||
1010 | return; | ||
1011 | |||
1012 | /* | ||
1013 | * Save the info for the exception handler and clear the error. | ||
1014 | */ | ||
1015 | task = current; | ||
1016 | save_init_fpu(task); | ||
1017 | task->thread.trap_no = 19; | ||
1018 | task->thread.error_code = 0; | ||
1019 | info.si_signo = SIGFPE; | ||
1020 | info.si_errno = 0; | ||
1021 | info.si_code = __SI_FAULT; | ||
1022 | info.si_addr = rip; | ||
1023 | /* | ||
1024 | * The SIMD FPU exceptions are handled a little differently, as there | ||
1025 | * is only a single status/control register. Thus, to determine which | ||
1026 | * unmasked exception was caught we must mask the exception mask bits | ||
1027 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
1028 | */ | ||
1029 | mxcsr = get_fpu_mxcsr(task); | ||
1030 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | ||
1031 | case 0x000: | ||
1032 | default: | ||
1033 | break; | ||
1034 | case 0x001: /* Invalid Op */ | ||
1035 | info.si_code = FPE_FLTINV; | ||
1036 | break; | ||
1037 | case 0x002: /* Denormalize */ | ||
1038 | case 0x010: /* Underflow */ | ||
1039 | info.si_code = FPE_FLTUND; | ||
1040 | break; | ||
1041 | case 0x004: /* Zero Divide */ | ||
1042 | info.si_code = FPE_FLTDIV; | ||
1043 | break; | ||
1044 | case 0x008: /* Overflow */ | ||
1045 | info.si_code = FPE_FLTOVF; | ||
1046 | break; | ||
1047 | case 0x020: /* Precision */ | ||
1048 | info.si_code = FPE_FLTRES; | ||
1049 | break; | ||
1050 | } | ||
1051 | force_sig_info(SIGFPE, &info, task); | ||
1052 | } | ||
1053 | |||
1054 | asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) | ||
1055 | { | ||
1056 | } | ||
1057 | |||
1058 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
1059 | { | ||
1060 | } | ||
1061 | |||
1062 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | ||
1063 | { | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
1067 | * 'math_state_restore()' saves the current math information in the | ||
1068 | * old math state array, and gets the new ones from the current task | ||
1069 | * | ||
1070 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | ||
1071 | * Don't touch unless you *really* know how it works. | ||
1072 | */ | ||
1073 | asmlinkage void math_state_restore(void) | ||
1074 | { | ||
1075 | struct task_struct *me = current; | ||
1076 | clts(); /* Allow maths ops (or we recurse) */ | ||
1077 | |||
1078 | if (!used_math()) | ||
1079 | init_fpu(me); | ||
1080 | restore_fpu_checking(&me->thread.i387.fxsave); | ||
1081 | task_thread_info(me)->status |= TS_USEDFPU; | ||
1082 | me->fpu_counter++; | ||
1083 | } | ||
1084 | |||
1085 | void __init trap_init(void) | ||
1086 | { | ||
1087 | set_intr_gate(0,÷_error); | ||
1088 | set_intr_gate_ist(1,&debug,DEBUG_STACK); | ||
1089 | set_intr_gate_ist(2,&nmi,NMI_STACK); | ||
1090 | set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ | ||
1091 | set_system_gate(4,&overflow); /* int4 can be called from all */ | ||
1092 | set_intr_gate(5,&bounds); | ||
1093 | set_intr_gate(6,&invalid_op); | ||
1094 | set_intr_gate(7,&device_not_available); | ||
1095 | set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); | ||
1096 | set_intr_gate(9,&coprocessor_segment_overrun); | ||
1097 | set_intr_gate(10,&invalid_TSS); | ||
1098 | set_intr_gate(11,&segment_not_present); | ||
1099 | set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); | ||
1100 | set_intr_gate(13,&general_protection); | ||
1101 | set_intr_gate(14,&page_fault); | ||
1102 | set_intr_gate(15,&spurious_interrupt_bug); | ||
1103 | set_intr_gate(16,&coprocessor_error); | ||
1104 | set_intr_gate(17,&alignment_check); | ||
1105 | #ifdef CONFIG_X86_MCE | ||
1106 | set_intr_gate_ist(18,&machine_check, MCE_STACK); | ||
1107 | #endif | ||
1108 | set_intr_gate(19,&simd_coprocessor_error); | ||
1109 | |||
1110 | #ifdef CONFIG_IA32_EMULATION | ||
1111 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
1112 | #endif | ||
1113 | |||
1114 | /* | ||
1115 | * Should be a barrier for any external CPU state. | ||
1116 | */ | ||
1117 | cpu_init(); | ||
1118 | } | ||
1119 | |||
1120 | |||
1121 | static int __init oops_setup(char *s) | ||
1122 | { | ||
1123 | if (!s) | ||
1124 | return -EINVAL; | ||
1125 | if (!strcmp(s, "panic")) | ||
1126 | panic_on_oops = 1; | ||
1127 | return 0; | ||
1128 | } | ||
1129 | early_param("oops", oops_setup); | ||
1130 | |||
1131 | static int __init kstack_setup(char *s) | ||
1132 | { | ||
1133 | if (!s) | ||
1134 | return -EINVAL; | ||
1135 | kstack_depth_to_print = simple_strtoul(s,NULL,0); | ||
1136 | return 0; | ||
1137 | } | ||
1138 | early_param("kstack", kstack_setup); | ||
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c deleted file mode 100644 index 2a59bde663f2..000000000000 --- a/arch/x86_64/kernel/tsc.c +++ /dev/null | |||
@@ -1,207 +0,0 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/interrupt.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/clocksource.h> | ||
6 | #include <linux/time.h> | ||
7 | #include <linux/acpi.h> | ||
8 | #include <linux/cpufreq.h> | ||
9 | |||
10 | #include <asm/timex.h> | ||
11 | |||
12 | static int notsc __initdata = 0; | ||
13 | |||
14 | unsigned int cpu_khz; /* TSC clocks / usec, not used here */ | ||
15 | EXPORT_SYMBOL(cpu_khz); | ||
16 | unsigned int tsc_khz; | ||
17 | EXPORT_SYMBOL(tsc_khz); | ||
18 | |||
19 | static unsigned int cyc2ns_scale __read_mostly; | ||
20 | |||
21 | void set_cyc2ns_scale(unsigned long khz) | ||
22 | { | ||
23 | cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; | ||
24 | } | ||
25 | |||
26 | static unsigned long long cycles_2_ns(unsigned long long cyc) | ||
27 | { | ||
28 | return (cyc * cyc2ns_scale) >> NS_SCALE; | ||
29 | } | ||
30 | |||
31 | unsigned long long sched_clock(void) | ||
32 | { | ||
33 | unsigned long a = 0; | ||
34 | |||
35 | /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, | ||
36 | * which means it is not completely exact and may not be monotonous | ||
37 | * between CPUs. But the errors should be too small to matter for | ||
38 | * scheduling purposes. | ||
39 | */ | ||
40 | |||
41 | rdtscll(a); | ||
42 | return cycles_2_ns(a); | ||
43 | } | ||
44 | |||
45 | static int tsc_unstable; | ||
46 | |||
47 | inline int check_tsc_unstable(void) | ||
48 | { | ||
49 | return tsc_unstable; | ||
50 | } | ||
51 | #ifdef CONFIG_CPU_FREQ | ||
52 | |||
53 | /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency | ||
54 | * changes. | ||
55 | * | ||
56 | * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's | ||
57 | * not that important because current Opteron setups do not support | ||
58 | * scaling on SMP anyroads. | ||
59 | * | ||
60 | * Should fix up last_tsc too. Currently gettimeofday in the | ||
61 | * first tick after the change will be slightly wrong. | ||
62 | */ | ||
63 | |||
64 | static unsigned int ref_freq; | ||
65 | static unsigned long loops_per_jiffy_ref; | ||
66 | static unsigned long tsc_khz_ref; | ||
67 | |||
68 | static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | ||
69 | void *data) | ||
70 | { | ||
71 | struct cpufreq_freqs *freq = data; | ||
72 | unsigned long *lpj, dummy; | ||
73 | |||
74 | if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) | ||
75 | return 0; | ||
76 | |||
77 | lpj = &dummy; | ||
78 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
79 | #ifdef CONFIG_SMP | ||
80 | lpj = &cpu_data[freq->cpu].loops_per_jiffy; | ||
81 | #else | ||
82 | lpj = &boot_cpu_data.loops_per_jiffy; | ||
83 | #endif | ||
84 | |||
85 | if (!ref_freq) { | ||
86 | ref_freq = freq->old; | ||
87 | loops_per_jiffy_ref = *lpj; | ||
88 | tsc_khz_ref = tsc_khz; | ||
89 | } | ||
90 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
91 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
92 | (val == CPUFREQ_RESUMECHANGE)) { | ||
93 | *lpj = | ||
94 | cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); | ||
95 | |||
96 | tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); | ||
97 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
98 | mark_tsc_unstable("cpufreq changes"); | ||
99 | } | ||
100 | |||
101 | set_cyc2ns_scale(tsc_khz_ref); | ||
102 | |||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static struct notifier_block time_cpufreq_notifier_block = { | ||
107 | .notifier_call = time_cpufreq_notifier | ||
108 | }; | ||
109 | |||
110 | static int __init cpufreq_tsc(void) | ||
111 | { | ||
112 | cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
113 | CPUFREQ_TRANSITION_NOTIFIER); | ||
114 | return 0; | ||
115 | } | ||
116 | |||
117 | core_initcall(cpufreq_tsc); | ||
118 | |||
119 | #endif | ||
120 | |||
121 | /* | ||
122 | * Make an educated guess if the TSC is trustworthy and synchronized | ||
123 | * over all CPUs. | ||
124 | */ | ||
125 | __cpuinit int unsynchronized_tsc(void) | ||
126 | { | ||
127 | if (tsc_unstable) | ||
128 | return 1; | ||
129 | |||
130 | #ifdef CONFIG_SMP | ||
131 | if (apic_is_clustered_box()) | ||
132 | return 1; | ||
133 | #endif | ||
134 | /* Most intel systems have synchronized TSCs except for | ||
135 | multi node systems */ | ||
136 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { | ||
137 | #ifdef CONFIG_ACPI | ||
138 | /* But TSC doesn't tick in C3 so don't use it there */ | ||
139 | if (acpi_gbl_FADT.header.length > 0 && | ||
140 | acpi_gbl_FADT.C3latency < 1000) | ||
141 | return 1; | ||
142 | #endif | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | /* Assume multi socket systems are not synchronized */ | ||
147 | return num_present_cpus() > 1; | ||
148 | } | ||
149 | |||
150 | int __init notsc_setup(char *s) | ||
151 | { | ||
152 | notsc = 1; | ||
153 | return 1; | ||
154 | } | ||
155 | |||
156 | __setup("notsc", notsc_setup); | ||
157 | |||
158 | |||
159 | /* clock source code: */ | ||
160 | static cycle_t read_tsc(void) | ||
161 | { | ||
162 | cycle_t ret = (cycle_t)get_cycles_sync(); | ||
163 | return ret; | ||
164 | } | ||
165 | |||
166 | static cycle_t __vsyscall_fn vread_tsc(void) | ||
167 | { | ||
168 | cycle_t ret = (cycle_t)get_cycles_sync(); | ||
169 | return ret; | ||
170 | } | ||
171 | |||
172 | static struct clocksource clocksource_tsc = { | ||
173 | .name = "tsc", | ||
174 | .rating = 300, | ||
175 | .read = read_tsc, | ||
176 | .mask = CLOCKSOURCE_MASK(64), | ||
177 | .shift = 22, | ||
178 | .flags = CLOCK_SOURCE_IS_CONTINUOUS | | ||
179 | CLOCK_SOURCE_MUST_VERIFY, | ||
180 | .vread = vread_tsc, | ||
181 | }; | ||
182 | |||
183 | void mark_tsc_unstable(char *reason) | ||
184 | { | ||
185 | if (!tsc_unstable) { | ||
186 | tsc_unstable = 1; | ||
187 | printk("Marking TSC unstable due to %s\n", reason); | ||
188 | /* Change only the rating, when not registered */ | ||
189 | if (clocksource_tsc.mult) | ||
190 | clocksource_change_rating(&clocksource_tsc, 0); | ||
191 | else | ||
192 | clocksource_tsc.rating = 0; | ||
193 | } | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(mark_tsc_unstable); | ||
196 | |||
197 | void __init init_tsc_clocksource(void) | ||
198 | { | ||
199 | if (!notsc) { | ||
200 | clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, | ||
201 | clocksource_tsc.shift); | ||
202 | if (check_tsc_unstable()) | ||
203 | clocksource_tsc.rating = 0; | ||
204 | |||
205 | clocksource_register(&clocksource_tsc); | ||
206 | } | ||
207 | } | ||
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c deleted file mode 100644 index 355f5f506c81..000000000000 --- a/arch/x86_64/kernel/tsc_sync.c +++ /dev/null | |||
@@ -1,187 +0,0 @@ | |||
1 | /* | ||
2 | * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization. | ||
3 | * | ||
4 | * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar | ||
5 | * | ||
6 | * We check whether all boot CPUs have their TSC's synchronized, | ||
7 | * print a warning if not and turn off the TSC clock-source. | ||
8 | * | ||
9 | * The warp-check is point-to-point between two CPUs, the CPU | ||
10 | * initiating the bootup is the 'source CPU', the freshly booting | ||
11 | * CPU is the 'target CPU'. | ||
12 | * | ||
13 | * Only two CPUs may participate - they can enter in any order. | ||
14 | * ( The serial nature of the boot logic and the CPU hotplug lock | ||
15 | * protects against more than 2 CPUs entering this code. ) | ||
16 | */ | ||
17 | #include <linux/spinlock.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/smp.h> | ||
21 | #include <linux/nmi.h> | ||
22 | #include <asm/tsc.h> | ||
23 | |||
24 | /* | ||
25 | * Entry/exit counters that make sure that both CPUs | ||
26 | * run the measurement code at once: | ||
27 | */ | ||
28 | static __cpuinitdata atomic_t start_count; | ||
29 | static __cpuinitdata atomic_t stop_count; | ||
30 | |||
31 | /* | ||
32 | * We use a raw spinlock in this exceptional case, because | ||
33 | * we want to have the fastest, inlined, non-debug version | ||
34 | * of a critical section, to be able to prove TSC time-warps: | ||
35 | */ | ||
36 | static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
37 | static __cpuinitdata cycles_t last_tsc; | ||
38 | static __cpuinitdata cycles_t max_warp; | ||
39 | static __cpuinitdata int nr_warps; | ||
40 | |||
41 | /* | ||
42 | * TSC-warp measurement loop running on both CPUs: | ||
43 | */ | ||
44 | static __cpuinit void check_tsc_warp(void) | ||
45 | { | ||
46 | cycles_t start, now, prev, end; | ||
47 | int i; | ||
48 | |||
49 | start = get_cycles_sync(); | ||
50 | /* | ||
51 | * The measurement runs for 20 msecs: | ||
52 | */ | ||
53 | end = start + tsc_khz * 20ULL; | ||
54 | now = start; | ||
55 | |||
56 | for (i = 0; ; i++) { | ||
57 | /* | ||
58 | * We take the global lock, measure TSC, save the | ||
59 | * previous TSC that was measured (possibly on | ||
60 | * another CPU) and update the previous TSC timestamp. | ||
61 | */ | ||
62 | __raw_spin_lock(&sync_lock); | ||
63 | prev = last_tsc; | ||
64 | now = get_cycles_sync(); | ||
65 | last_tsc = now; | ||
66 | __raw_spin_unlock(&sync_lock); | ||
67 | |||
68 | /* | ||
69 | * Be nice every now and then (and also check whether | ||
70 | * measurement is done [we also insert a 100 million | ||
71 | * loops safety exit, so we dont lock up in case the | ||
72 | * TSC readout is totally broken]): | ||
73 | */ | ||
74 | if (unlikely(!(i & 7))) { | ||
75 | if (now > end || i > 100000000) | ||
76 | break; | ||
77 | cpu_relax(); | ||
78 | touch_nmi_watchdog(); | ||
79 | } | ||
80 | /* | ||
81 | * Outside the critical section we can now see whether | ||
82 | * we saw a time-warp of the TSC going backwards: | ||
83 | */ | ||
84 | if (unlikely(prev > now)) { | ||
85 | __raw_spin_lock(&sync_lock); | ||
86 | max_warp = max(max_warp, prev - now); | ||
87 | nr_warps++; | ||
88 | __raw_spin_unlock(&sync_lock); | ||
89 | } | ||
90 | |||
91 | } | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Source CPU calls into this - it waits for the freshly booted | ||
96 | * target CPU to arrive and then starts the measurement: | ||
97 | */ | ||
98 | void __cpuinit check_tsc_sync_source(int cpu) | ||
99 | { | ||
100 | int cpus = 2; | ||
101 | |||
102 | /* | ||
103 | * No need to check if we already know that the TSC is not | ||
104 | * synchronized: | ||
105 | */ | ||
106 | if (unsynchronized_tsc()) | ||
107 | return; | ||
108 | |||
109 | printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", | ||
110 | smp_processor_id(), cpu); | ||
111 | |||
112 | /* | ||
113 | * Reset it - in case this is a second bootup: | ||
114 | */ | ||
115 | atomic_set(&stop_count, 0); | ||
116 | |||
117 | /* | ||
118 | * Wait for the target to arrive: | ||
119 | */ | ||
120 | while (atomic_read(&start_count) != cpus-1) | ||
121 | cpu_relax(); | ||
122 | /* | ||
123 | * Trigger the target to continue into the measurement too: | ||
124 | */ | ||
125 | atomic_inc(&start_count); | ||
126 | |||
127 | check_tsc_warp(); | ||
128 | |||
129 | while (atomic_read(&stop_count) != cpus-1) | ||
130 | cpu_relax(); | ||
131 | |||
132 | /* | ||
133 | * Reset it - just in case we boot another CPU later: | ||
134 | */ | ||
135 | atomic_set(&start_count, 0); | ||
136 | |||
137 | if (nr_warps) { | ||
138 | printk("\n"); | ||
139 | printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," | ||
140 | " turning off TSC clock.\n", max_warp); | ||
141 | mark_tsc_unstable("check_tsc_sync_source failed"); | ||
142 | nr_warps = 0; | ||
143 | max_warp = 0; | ||
144 | last_tsc = 0; | ||
145 | } else { | ||
146 | printk(" passed.\n"); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Let the target continue with the bootup: | ||
151 | */ | ||
152 | atomic_inc(&stop_count); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Freshly booted CPUs call into this: | ||
157 | */ | ||
158 | void __cpuinit check_tsc_sync_target(void) | ||
159 | { | ||
160 | int cpus = 2; | ||
161 | |||
162 | if (unsynchronized_tsc()) | ||
163 | return; | ||
164 | |||
165 | /* | ||
166 | * Register this CPU's participation and wait for the | ||
167 | * source CPU to start the measurement: | ||
168 | */ | ||
169 | atomic_inc(&start_count); | ||
170 | while (atomic_read(&start_count) != cpus) | ||
171 | cpu_relax(); | ||
172 | |||
173 | check_tsc_warp(); | ||
174 | |||
175 | /* | ||
176 | * Ok, we are done: | ||
177 | */ | ||
178 | atomic_inc(&stop_count); | ||
179 | |||
180 | /* | ||
181 | * Wait for the source CPU to print stuff: | ||
182 | */ | ||
183 | while (atomic_read(&stop_count) != cpus) | ||
184 | cpu_relax(); | ||
185 | } | ||
186 | #undef NR_LOOPS | ||
187 | |||
diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S deleted file mode 100644 index 45b6f8a975a1..000000000000 --- a/arch/x86_64/kernel/verify_cpu.S +++ /dev/null | |||
@@ -1,105 +0,0 @@ | |||
1 | /* | ||
2 | * | ||
3 | * verify_cpu.S - Code for cpu long mode and SSE verification. This | ||
4 | * code has been borrowed from boot/setup.S and was introduced by | ||
5 | * Andi Kleen. | ||
6 | * | ||
7 | * Copyright (c) 2007 Andi Kleen (ak@suse.de) | ||
8 | * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) | ||
9 | * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) | ||
10 | * | ||
11 | * This source code is licensed under the GNU General Public License, | ||
12 | * Version 2. See the file COPYING for more details. | ||
13 | * | ||
14 | * This is a common code for verification whether CPU supports | ||
15 | * long mode and SSE or not. It is not called directly instead this | ||
16 | * file is included at various places and compiled in that context. | ||
17 | * Following are the current usage. | ||
18 | * | ||
19 | * This file is included by both 16bit and 32bit code. | ||
20 | * | ||
21 | * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) | ||
22 | * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) | ||
23 | * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) | ||
24 | * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) | ||
25 | * | ||
26 | * verify_cpu, returns the status of cpu check in register %eax. | ||
27 | * 0: Success 1: Failure | ||
28 | * | ||
29 | * The caller needs to check for the error code and take the action | ||
30 | * appropriately. Either display a message or halt. | ||
31 | */ | ||
32 | |||
33 | #include <asm/cpufeature.h> | ||
34 | |||
35 | verify_cpu: | ||
36 | pushfl # Save caller passed flags | ||
37 | pushl $0 # Kill any dangerous flags | ||
38 | popfl | ||
39 | |||
40 | pushfl # standard way to check for cpuid | ||
41 | popl %eax | ||
42 | movl %eax,%ebx | ||
43 | xorl $0x200000,%eax | ||
44 | pushl %eax | ||
45 | popfl | ||
46 | pushfl | ||
47 | popl %eax | ||
48 | cmpl %eax,%ebx | ||
49 | jz verify_cpu_no_longmode # cpu has no cpuid | ||
50 | |||
51 | movl $0x0,%eax # See if cpuid 1 is implemented | ||
52 | cpuid | ||
53 | cmpl $0x1,%eax | ||
54 | jb verify_cpu_no_longmode # no cpuid 1 | ||
55 | |||
56 | xor %di,%di | ||
57 | cmpl $0x68747541,%ebx # AuthenticAMD | ||
58 | jnz verify_cpu_noamd | ||
59 | cmpl $0x69746e65,%edx | ||
60 | jnz verify_cpu_noamd | ||
61 | cmpl $0x444d4163,%ecx | ||
62 | jnz verify_cpu_noamd | ||
63 | mov $1,%di # cpu is from AMD | ||
64 | |||
65 | verify_cpu_noamd: | ||
66 | movl $0x1,%eax # Does the cpu have what it takes | ||
67 | cpuid | ||
68 | andl $REQUIRED_MASK0,%edx | ||
69 | xorl $REQUIRED_MASK0,%edx | ||
70 | jnz verify_cpu_no_longmode | ||
71 | |||
72 | movl $0x80000000,%eax # See if extended cpuid is implemented | ||
73 | cpuid | ||
74 | cmpl $0x80000001,%eax | ||
75 | jb verify_cpu_no_longmode # no extended cpuid | ||
76 | |||
77 | movl $0x80000001,%eax # Does the cpu have what it takes | ||
78 | cpuid | ||
79 | andl $REQUIRED_MASK1,%edx | ||
80 | xorl $REQUIRED_MASK1,%edx | ||
81 | jnz verify_cpu_no_longmode | ||
82 | |||
83 | verify_cpu_sse_test: | ||
84 | movl $1,%eax | ||
85 | cpuid | ||
86 | andl $SSE_MASK,%edx | ||
87 | cmpl $SSE_MASK,%edx | ||
88 | je verify_cpu_sse_ok | ||
89 | test %di,%di | ||
90 | jz verify_cpu_no_longmode # only try to force SSE on AMD | ||
91 | movl $0xc0010015,%ecx # HWCR | ||
92 | rdmsr | ||
93 | btr $15,%eax # enable SSE | ||
94 | wrmsr | ||
95 | xor %di,%di # don't loop | ||
96 | jmp verify_cpu_sse_test # try again | ||
97 | |||
98 | verify_cpu_no_longmode: | ||
99 | popfl # Restore caller passed flags | ||
100 | movl $1,%eax | ||
101 | ret | ||
102 | verify_cpu_sse_ok: | ||
103 | popfl # Restore caller passed flags | ||
104 | xorl %eax, %eax | ||
105 | ret | ||
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S deleted file mode 100644 index ba8ea97abd21..000000000000 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ /dev/null | |||
@@ -1,235 +0,0 @@ | |||
1 | /* ld script to make x86-64 Linux kernel | ||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | ||
3 | */ | ||
4 | |||
5 | #define LOAD_OFFSET __START_KERNEL_map | ||
6 | |||
7 | #include <asm-generic/vmlinux.lds.h> | ||
8 | #include <asm/page.h> | ||
9 | |||
10 | #undef i386 /* in case the preprocessor is a 32bit one */ | ||
11 | |||
12 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | ||
13 | OUTPUT_ARCH(i386:x86-64) | ||
14 | ENTRY(phys_startup_64) | ||
15 | jiffies_64 = jiffies; | ||
16 | _proxy_pda = 1; | ||
17 | PHDRS { | ||
18 | text PT_LOAD FLAGS(5); /* R_E */ | ||
19 | data PT_LOAD FLAGS(7); /* RWE */ | ||
20 | user PT_LOAD FLAGS(7); /* RWE */ | ||
21 | data.init PT_LOAD FLAGS(7); /* RWE */ | ||
22 | note PT_NOTE FLAGS(4); /* R__ */ | ||
23 | } | ||
24 | SECTIONS | ||
25 | { | ||
26 | . = __START_KERNEL; | ||
27 | phys_startup_64 = startup_64 - LOAD_OFFSET; | ||
28 | _text = .; /* Text and read-only data */ | ||
29 | .text : AT(ADDR(.text) - LOAD_OFFSET) { | ||
30 | /* First the code that has to be first for bootstrapping */ | ||
31 | *(.text.head) | ||
32 | _stext = .; | ||
33 | /* Then the rest */ | ||
34 | TEXT_TEXT | ||
35 | SCHED_TEXT | ||
36 | LOCK_TEXT | ||
37 | KPROBES_TEXT | ||
38 | *(.fixup) | ||
39 | *(.gnu.warning) | ||
40 | } :text = 0x9090 | ||
41 | /* out-of-line lock text */ | ||
42 | .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } | ||
43 | |||
44 | _etext = .; /* End of text section */ | ||
45 | |||
46 | . = ALIGN(16); /* Exception table */ | ||
47 | __start___ex_table = .; | ||
48 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } | ||
49 | __stop___ex_table = .; | ||
50 | |||
51 | NOTES :text :note | ||
52 | |||
53 | BUG_TABLE :text | ||
54 | |||
55 | RODATA | ||
56 | |||
57 | . = ALIGN(4); | ||
58 | .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { | ||
59 | __tracedata_start = .; | ||
60 | *(.tracedata) | ||
61 | __tracedata_end = .; | ||
62 | } | ||
63 | |||
64 | . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ | ||
65 | /* Data */ | ||
66 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | ||
67 | DATA_DATA | ||
68 | CONSTRUCTORS | ||
69 | } :data | ||
70 | |||
71 | _edata = .; /* End of data section */ | ||
72 | |||
73 | . = ALIGN(PAGE_SIZE); | ||
74 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
75 | .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
76 | *(.data.cacheline_aligned) | ||
77 | } | ||
78 | . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); | ||
79 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
80 | *(.data.read_mostly) | ||
81 | } | ||
82 | |||
83 | #define VSYSCALL_ADDR (-10*1024*1024) | ||
84 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
85 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
86 | |||
87 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | ||
88 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | ||
89 | |||
90 | #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) | ||
91 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) | ||
92 | |||
93 | . = VSYSCALL_ADDR; | ||
94 | .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user | ||
95 | __vsyscall_0 = VSYSCALL_VIRT_ADDR; | ||
96 | |||
97 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
98 | .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } | ||
99 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
100 | .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) | ||
101 | { *(.vsyscall_gtod_data) } | ||
102 | vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); | ||
103 | .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) | ||
104 | { *(.vsyscall_clock) } | ||
105 | vsyscall_clock = VVIRT(.vsyscall_clock); | ||
106 | |||
107 | |||
108 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) | ||
109 | { *(.vsyscall_1) } | ||
110 | .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) | ||
111 | { *(.vsyscall_2) } | ||
112 | |||
113 | .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } | ||
114 | vgetcpu_mode = VVIRT(.vgetcpu_mode); | ||
115 | |||
116 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
117 | .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } | ||
118 | jiffies = VVIRT(.jiffies); | ||
119 | |||
120 | .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) | ||
121 | { *(.vsyscall_3) } | ||
122 | |||
123 | . = VSYSCALL_VIRT_ADDR + 4096; | ||
124 | |||
125 | #undef VSYSCALL_ADDR | ||
126 | #undef VSYSCALL_PHYS_ADDR | ||
127 | #undef VSYSCALL_VIRT_ADDR | ||
128 | #undef VLOAD_OFFSET | ||
129 | #undef VLOAD | ||
130 | #undef VVIRT_OFFSET | ||
131 | #undef VVIRT | ||
132 | |||
133 | . = ALIGN(8192); /* init_task */ | ||
134 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | ||
135 | *(.data.init_task) | ||
136 | }:data.init | ||
137 | |||
138 | . = ALIGN(4096); | ||
139 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
140 | *(.data.page_aligned) | ||
141 | } | ||
142 | |||
143 | /* might get freed after init */ | ||
144 | . = ALIGN(4096); | ||
145 | __smp_alt_begin = .; | ||
146 | __smp_locks = .; | ||
147 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
148 | *(.smp_locks) | ||
149 | } | ||
150 | __smp_locks_end = .; | ||
151 | . = ALIGN(4096); | ||
152 | __smp_alt_end = .; | ||
153 | |||
154 | . = ALIGN(4096); /* Init code and data */ | ||
155 | __init_begin = .; | ||
156 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | ||
157 | _sinittext = .; | ||
158 | *(.init.text) | ||
159 | _einittext = .; | ||
160 | } | ||
161 | __initdata_begin = .; | ||
162 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } | ||
163 | __initdata_end = .; | ||
164 | . = ALIGN(16); | ||
165 | __setup_start = .; | ||
166 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } | ||
167 | __setup_end = .; | ||
168 | __initcall_start = .; | ||
169 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { | ||
170 | INITCALLS | ||
171 | } | ||
172 | __initcall_end = .; | ||
173 | __con_initcall_start = .; | ||
174 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { | ||
175 | *(.con_initcall.init) | ||
176 | } | ||
177 | __con_initcall_end = .; | ||
178 | SECURITY_INIT | ||
179 | . = ALIGN(8); | ||
180 | __alt_instructions = .; | ||
181 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { | ||
182 | *(.altinstructions) | ||
183 | } | ||
184 | __alt_instructions_end = .; | ||
185 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { | ||
186 | *(.altinstr_replacement) | ||
187 | } | ||
188 | /* .exit.text is discard at runtime, not link time, to deal with references | ||
189 | from .altinstructions and .eh_frame */ | ||
190 | .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } | ||
191 | .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } | ||
192 | |||
193 | /* vdso blob that is mapped into user space */ | ||
194 | vdso_start = . ; | ||
195 | .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } | ||
196 | . = ALIGN(4096); | ||
197 | vdso_end = .; | ||
198 | |||
199 | #ifdef CONFIG_BLK_DEV_INITRD | ||
200 | . = ALIGN(4096); | ||
201 | __initramfs_start = .; | ||
202 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } | ||
203 | __initramfs_end = .; | ||
204 | #endif | ||
205 | |||
206 | PERCPU(4096) | ||
207 | |||
208 | . = ALIGN(4096); | ||
209 | __init_end = .; | ||
210 | |||
211 | . = ALIGN(4096); | ||
212 | __nosave_begin = .; | ||
213 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } | ||
214 | . = ALIGN(4096); | ||
215 | __nosave_end = .; | ||
216 | |||
217 | __bss_start = .; /* BSS */ | ||
218 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) { | ||
219 | *(.bss.page_aligned) | ||
220 | *(.bss) | ||
221 | } | ||
222 | __bss_stop = .; | ||
223 | |||
224 | _end = . ; | ||
225 | |||
226 | /* Sections to be discarded */ | ||
227 | /DISCARD/ : { | ||
228 | *(.exitcall.exit) | ||
229 | *(.eh_frame) | ||
230 | } | ||
231 | |||
232 | STABS_DEBUG | ||
233 | |||
234 | DWARF_DEBUG | ||
235 | } | ||
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c deleted file mode 100644 index 414caf0c5f9a..000000000000 --- a/arch/x86_64/kernel/vsmp.c +++ /dev/null | |||
@@ -1,49 +0,0 @@ | |||
1 | /* | ||
2 | * vSMPowered(tm) systems specific initialization | ||
3 | * Copyright (C) 2005 ScaleMP Inc. | ||
4 | * | ||
5 | * Use of this code is subject to the terms and conditions of the | ||
6 | * GNU general public license version 2. See "COPYING" or | ||
7 | * http://www.gnu.org/licenses/gpl.html | ||
8 | * | ||
9 | * Ravikiran Thirumalai <kiran@scalemp.com>, | ||
10 | * Shai Fultheim <shai@scalemp.com> | ||
11 | */ | ||
12 | |||
13 | #include <linux/init.h> | ||
14 | #include <linux/pci_ids.h> | ||
15 | #include <linux/pci_regs.h> | ||
16 | #include <asm/pci-direct.h> | ||
17 | #include <asm/io.h> | ||
18 | |||
19 | static int __init vsmp_init(void) | ||
20 | { | ||
21 | void *address; | ||
22 | unsigned int cap, ctl; | ||
23 | |||
24 | if (!early_pci_allowed()) | ||
25 | return 0; | ||
26 | |||
27 | /* Check if we are running on a ScaleMP vSMP box */ | ||
28 | if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || | ||
29 | (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) | ||
30 | return 0; | ||
31 | |||
32 | /* set vSMP magic bits to indicate vSMP capable kernel */ | ||
33 | address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); | ||
34 | cap = readl(address); | ||
35 | ctl = readl(address + 4); | ||
36 | printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); | ||
37 | if (cap & ctl & (1 << 4)) { | ||
38 | /* Turn on vSMP IRQ fastpath handling (see system.h) */ | ||
39 | ctl &= ~(1 << 4); | ||
40 | writel(ctl, address + 4); | ||
41 | ctl = readl(address + 4); | ||
42 | printk("vSMP CTL: control set to:0x%08x\n", ctl); | ||
43 | } | ||
44 | |||
45 | iounmap(address); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | core_initcall(vsmp_init); | ||
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c deleted file mode 100644 index 06c34949bfdc..000000000000 --- a/arch/x86_64/kernel/vsyscall.c +++ /dev/null | |||
@@ -1,349 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/vsyscall.c | ||
3 | * | ||
4 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright 2003 Andi Kleen, SuSE Labs. | ||
6 | * | ||
7 | * Thanks to hpa@transmeta.com for some useful hint. | ||
8 | * Special thanks to Ingo Molnar for his early experience with | ||
9 | * a different vsyscall implementation for Linux/IA32 and for the name. | ||
10 | * | ||
11 | * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located | ||
12 | * at virtual address -10Mbyte+1024bytes etc... There are at max 4 | ||
13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | ||
14 | * jumping out of line if necessary. We cannot add more with this | ||
15 | * mechanism because older kernels won't return -ENOSYS. | ||
16 | * If we want more than four we need a vDSO. | ||
17 | * | ||
18 | * Note: the concept clashes with user mode linux. If you use UML and | ||
19 | * want per guest time just set the kernel.vsyscall64 sysctl to 0. | ||
20 | */ | ||
21 | |||
22 | #include <linux/time.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/seqlock.h> | ||
27 | #include <linux/jiffies.h> | ||
28 | #include <linux/sysctl.h> | ||
29 | #include <linux/clocksource.h> | ||
30 | #include <linux/getcpu.h> | ||
31 | #include <linux/cpu.h> | ||
32 | #include <linux/smp.h> | ||
33 | #include <linux/notifier.h> | ||
34 | |||
35 | #include <asm/vsyscall.h> | ||
36 | #include <asm/pgtable.h> | ||
37 | #include <asm/page.h> | ||
38 | #include <asm/unistd.h> | ||
39 | #include <asm/fixmap.h> | ||
40 | #include <asm/errno.h> | ||
41 | #include <asm/io.h> | ||
42 | #include <asm/segment.h> | ||
43 | #include <asm/desc.h> | ||
44 | #include <asm/topology.h> | ||
45 | #include <asm/vgtod.h> | ||
46 | |||
47 | #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) | ||
48 | #define __syscall_clobber "r11","rcx","memory" | ||
49 | #define __pa_vsymbol(x) \ | ||
50 | ({unsigned long v; \ | ||
51 | extern char __vsyscall_0; \ | ||
52 | asm("" : "=r" (v) : "0" (x)); \ | ||
53 | ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) | ||
54 | |||
55 | /* | ||
56 | * vsyscall_gtod_data contains data that is : | ||
57 | * - readonly from vsyscalls | ||
58 | * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) | ||
59 | * Try to keep this structure as small as possible to avoid cache line ping pongs | ||
60 | */ | ||
61 | int __vgetcpu_mode __section_vgetcpu_mode; | ||
62 | |||
63 | struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = | ||
64 | { | ||
65 | .lock = SEQLOCK_UNLOCKED, | ||
66 | .sysctl_enabled = 1, | ||
67 | }; | ||
68 | |||
69 | void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) | ||
70 | { | ||
71 | unsigned long flags; | ||
72 | |||
73 | write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); | ||
74 | /* copy vsyscall data */ | ||
75 | vsyscall_gtod_data.clock.vread = clock->vread; | ||
76 | vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; | ||
77 | vsyscall_gtod_data.clock.mask = clock->mask; | ||
78 | vsyscall_gtod_data.clock.mult = clock->mult; | ||
79 | vsyscall_gtod_data.clock.shift = clock->shift; | ||
80 | vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; | ||
81 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | ||
82 | vsyscall_gtod_data.sys_tz = sys_tz; | ||
83 | vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; | ||
84 | vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; | ||
85 | write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); | ||
86 | } | ||
87 | |||
88 | /* RED-PEN may want to readd seq locking, but then the variable should be | ||
89 | * write-once. | ||
90 | */ | ||
91 | static __always_inline void do_get_tz(struct timezone * tz) | ||
92 | { | ||
93 | *tz = __vsyscall_gtod_data.sys_tz; | ||
94 | } | ||
95 | |||
96 | static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) | ||
97 | { | ||
98 | int ret; | ||
99 | asm volatile("vsysc2: syscall" | ||
100 | : "=a" (ret) | ||
101 | : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) | ||
102 | : __syscall_clobber ); | ||
103 | return ret; | ||
104 | } | ||
105 | |||
106 | static __always_inline long time_syscall(long *t) | ||
107 | { | ||
108 | long secs; | ||
109 | asm volatile("vsysc1: syscall" | ||
110 | : "=a" (secs) | ||
111 | : "0" (__NR_time),"D" (t) : __syscall_clobber); | ||
112 | return secs; | ||
113 | } | ||
114 | |||
115 | static __always_inline void do_vgettimeofday(struct timeval * tv) | ||
116 | { | ||
117 | cycle_t now, base, mask, cycle_delta; | ||
118 | unsigned seq; | ||
119 | unsigned long mult, shift, nsec; | ||
120 | cycle_t (*vread)(void); | ||
121 | do { | ||
122 | seq = read_seqbegin(&__vsyscall_gtod_data.lock); | ||
123 | |||
124 | vread = __vsyscall_gtod_data.clock.vread; | ||
125 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { | ||
126 | gettimeofday(tv,NULL); | ||
127 | return; | ||
128 | } | ||
129 | now = vread(); | ||
130 | base = __vsyscall_gtod_data.clock.cycle_last; | ||
131 | mask = __vsyscall_gtod_data.clock.mask; | ||
132 | mult = __vsyscall_gtod_data.clock.mult; | ||
133 | shift = __vsyscall_gtod_data.clock.shift; | ||
134 | |||
135 | tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; | ||
136 | nsec = __vsyscall_gtod_data.wall_time_nsec; | ||
137 | } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); | ||
138 | |||
139 | /* calculate interval: */ | ||
140 | cycle_delta = (now - base) & mask; | ||
141 | /* convert to nsecs: */ | ||
142 | nsec += (cycle_delta * mult) >> shift; | ||
143 | |||
144 | while (nsec >= NSEC_PER_SEC) { | ||
145 | tv->tv_sec += 1; | ||
146 | nsec -= NSEC_PER_SEC; | ||
147 | } | ||
148 | tv->tv_usec = nsec / NSEC_PER_USEC; | ||
149 | } | ||
150 | |||
151 | int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | ||
152 | { | ||
153 | if (tv) | ||
154 | do_vgettimeofday(tv); | ||
155 | if (tz) | ||
156 | do_get_tz(tz); | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | /* This will break when the xtime seconds get inaccurate, but that is | ||
161 | * unlikely */ | ||
162 | time_t __vsyscall(1) vtime(time_t *t) | ||
163 | { | ||
164 | struct timeval tv; | ||
165 | time_t result; | ||
166 | if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) | ||
167 | return time_syscall(t); | ||
168 | |||
169 | vgettimeofday(&tv, 0); | ||
170 | result = tv.tv_sec; | ||
171 | if (t) | ||
172 | *t = result; | ||
173 | return result; | ||
174 | } | ||
175 | |||
176 | /* Fast way to get current CPU and node. | ||
177 | This helps to do per node and per CPU caches in user space. | ||
178 | The result is not guaranteed without CPU affinity, but usually | ||
179 | works out because the scheduler tries to keep a thread on the same | ||
180 | CPU. | ||
181 | |||
182 | tcache must point to a two element sized long array. | ||
183 | All arguments can be NULL. */ | ||
184 | long __vsyscall(2) | ||
185 | vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) | ||
186 | { | ||
187 | unsigned int dummy, p; | ||
188 | unsigned long j = 0; | ||
189 | |||
190 | /* Fast cache - only recompute value once per jiffies and avoid | ||
191 | relatively costly rdtscp/cpuid otherwise. | ||
192 | This works because the scheduler usually keeps the process | ||
193 | on the same CPU and this syscall doesn't guarantee its | ||
194 | results anyways. | ||
195 | We do this here because otherwise user space would do it on | ||
196 | its own in a likely inferior way (no access to jiffies). | ||
197 | If you don't like it pass NULL. */ | ||
198 | if (tcache && tcache->blob[0] == (j = __jiffies)) { | ||
199 | p = tcache->blob[1]; | ||
200 | } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { | ||
201 | /* Load per CPU data from RDTSCP */ | ||
202 | rdtscp(dummy, dummy, p); | ||
203 | } else { | ||
204 | /* Load per CPU data from GDT */ | ||
205 | asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); | ||
206 | } | ||
207 | if (tcache) { | ||
208 | tcache->blob[0] = j; | ||
209 | tcache->blob[1] = p; | ||
210 | } | ||
211 | if (cpu) | ||
212 | *cpu = p & 0xfff; | ||
213 | if (node) | ||
214 | *node = p >> 12; | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | long __vsyscall(3) venosys_1(void) | ||
219 | { | ||
220 | return -ENOSYS; | ||
221 | } | ||
222 | |||
223 | #ifdef CONFIG_SYSCTL | ||
224 | |||
225 | #define SYSCALL 0x050f | ||
226 | #define NOP2 0x9090 | ||
227 | |||
228 | /* | ||
229 | * NOP out syscall in vsyscall page when not needed. | ||
230 | */ | ||
231 | static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | ||
232 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
233 | { | ||
234 | extern u16 vsysc1, vsysc2; | ||
235 | u16 __iomem *map1; | ||
236 | u16 __iomem *map2; | ||
237 | int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | ||
238 | if (!write) | ||
239 | return ret; | ||
240 | /* gcc has some trouble with __va(__pa()), so just do it this | ||
241 | way. */ | ||
242 | map1 = ioremap(__pa_vsymbol(&vsysc1), 2); | ||
243 | if (!map1) | ||
244 | return -ENOMEM; | ||
245 | map2 = ioremap(__pa_vsymbol(&vsysc2), 2); | ||
246 | if (!map2) { | ||
247 | ret = -ENOMEM; | ||
248 | goto out; | ||
249 | } | ||
250 | if (!vsyscall_gtod_data.sysctl_enabled) { | ||
251 | writew(SYSCALL, map1); | ||
252 | writew(SYSCALL, map2); | ||
253 | } else { | ||
254 | writew(NOP2, map1); | ||
255 | writew(NOP2, map2); | ||
256 | } | ||
257 | iounmap(map2); | ||
258 | out: | ||
259 | iounmap(map1); | ||
260 | return ret; | ||
261 | } | ||
262 | |||
263 | static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, | ||
264 | void __user *oldval, size_t __user *oldlenp, | ||
265 | void __user *newval, size_t newlen) | ||
266 | { | ||
267 | return -ENOSYS; | ||
268 | } | ||
269 | |||
270 | static ctl_table kernel_table2[] = { | ||
271 | { .ctl_name = 99, .procname = "vsyscall64", | ||
272 | .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), | ||
273 | .mode = 0644, | ||
274 | .strategy = vsyscall_sysctl_nostrat, | ||
275 | .proc_handler = vsyscall_sysctl_change }, | ||
276 | {} | ||
277 | }; | ||
278 | |||
279 | static ctl_table kernel_root_table2[] = { | ||
280 | { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, | ||
281 | .child = kernel_table2 }, | ||
282 | {} | ||
283 | }; | ||
284 | |||
285 | #endif | ||
286 | |||
287 | /* Assume __initcall executes before all user space. Hopefully kmod | ||
288 | doesn't violate that. We'll find out if it does. */ | ||
289 | static void __cpuinit vsyscall_set_cpu(int cpu) | ||
290 | { | ||
291 | unsigned long *d; | ||
292 | unsigned long node = 0; | ||
293 | #ifdef CONFIG_NUMA | ||
294 | node = cpu_to_node[cpu]; | ||
295 | #endif | ||
296 | if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) | ||
297 | write_rdtscp_aux((node << 12) | cpu); | ||
298 | |||
299 | /* Store cpu number in limit so that it can be loaded quickly | ||
300 | in user space in vgetcpu. | ||
301 | 12 bits for the CPU and 8 bits for the node. */ | ||
302 | d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); | ||
303 | *d = 0x0f40000000000ULL; | ||
304 | *d |= cpu; | ||
305 | *d |= (node & 0xf) << 12; | ||
306 | *d |= (node >> 4) << 48; | ||
307 | } | ||
308 | |||
309 | static void __cpuinit cpu_vsyscall_init(void *arg) | ||
310 | { | ||
311 | /* preemption should be already off */ | ||
312 | vsyscall_set_cpu(raw_smp_processor_id()); | ||
313 | } | ||
314 | |||
315 | static int __cpuinit | ||
316 | cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) | ||
317 | { | ||
318 | long cpu = (long)arg; | ||
319 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) | ||
320 | smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); | ||
321 | return NOTIFY_DONE; | ||
322 | } | ||
323 | |||
324 | static void __init map_vsyscall(void) | ||
325 | { | ||
326 | extern char __vsyscall_0; | ||
327 | unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); | ||
328 | |||
329 | /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ | ||
330 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); | ||
331 | } | ||
332 | |||
333 | static int __init vsyscall_init(void) | ||
334 | { | ||
335 | BUG_ON(((unsigned long) &vgettimeofday != | ||
336 | VSYSCALL_ADDR(__NR_vgettimeofday))); | ||
337 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | ||
338 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); | ||
339 | BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); | ||
340 | map_vsyscall(); | ||
341 | #ifdef CONFIG_SYSCTL | ||
342 | register_sysctl_table(kernel_root_table2); | ||
343 | #endif | ||
344 | on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); | ||
345 | hotcpu_notifier(cpu_vsyscall_notifier, 0); | ||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | __initcall(vsyscall_init); | ||
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c deleted file mode 100644 index 77c25b307635..000000000000 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ /dev/null | |||
@@ -1,62 +0,0 @@ | |||
1 | /* Exports for assembly files. | ||
2 | All C exports should go in the respective C files. */ | ||
3 | |||
4 | #include <linux/module.h> | ||
5 | #include <linux/smp.h> | ||
6 | |||
7 | #include <asm/semaphore.h> | ||
8 | #include <asm/processor.h> | ||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | |||
12 | EXPORT_SYMBOL(kernel_thread); | ||
13 | |||
14 | EXPORT_SYMBOL(__down_failed); | ||
15 | EXPORT_SYMBOL(__down_failed_interruptible); | ||
16 | EXPORT_SYMBOL(__down_failed_trylock); | ||
17 | EXPORT_SYMBOL(__up_wakeup); | ||
18 | |||
19 | EXPORT_SYMBOL(__get_user_1); | ||
20 | EXPORT_SYMBOL(__get_user_2); | ||
21 | EXPORT_SYMBOL(__get_user_4); | ||
22 | EXPORT_SYMBOL(__get_user_8); | ||
23 | EXPORT_SYMBOL(__put_user_1); | ||
24 | EXPORT_SYMBOL(__put_user_2); | ||
25 | EXPORT_SYMBOL(__put_user_4); | ||
26 | EXPORT_SYMBOL(__put_user_8); | ||
27 | |||
28 | EXPORT_SYMBOL(copy_user_generic); | ||
29 | EXPORT_SYMBOL(__copy_user_nocache); | ||
30 | EXPORT_SYMBOL(copy_from_user); | ||
31 | EXPORT_SYMBOL(copy_to_user); | ||
32 | EXPORT_SYMBOL(__copy_from_user_inatomic); | ||
33 | |||
34 | EXPORT_SYMBOL(copy_page); | ||
35 | EXPORT_SYMBOL(clear_page); | ||
36 | |||
37 | #ifdef CONFIG_SMP | ||
38 | extern void __write_lock_failed(rwlock_t *rw); | ||
39 | extern void __read_lock_failed(rwlock_t *rw); | ||
40 | EXPORT_SYMBOL(__write_lock_failed); | ||
41 | EXPORT_SYMBOL(__read_lock_failed); | ||
42 | #endif | ||
43 | |||
44 | /* Export string functions. We normally rely on gcc builtin for most of these, | ||
45 | but gcc sometimes decides not to inline them. */ | ||
46 | #undef memcpy | ||
47 | #undef memset | ||
48 | #undef memmove | ||
49 | |||
50 | extern void * memset(void *,int,__kernel_size_t); | ||
51 | extern void * memcpy(void *,const void *,__kernel_size_t); | ||
52 | extern void * __memcpy(void *,const void *,__kernel_size_t); | ||
53 | |||
54 | EXPORT_SYMBOL(memset); | ||
55 | EXPORT_SYMBOL(memcpy); | ||
56 | EXPORT_SYMBOL(__memcpy); | ||
57 | |||
58 | EXPORT_SYMBOL(empty_zero_page); | ||
59 | EXPORT_SYMBOL(init_level4_pgt); | ||
60 | EXPORT_SYMBOL(load_gs_index); | ||
61 | |||
62 | EXPORT_SYMBOL(_proxy_pda); | ||
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile deleted file mode 100644 index c94327178398..000000000000 --- a/arch/x86_64/lib/Makefile +++ /dev/null | |||
@@ -1,13 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for x86_64-specific library files. | ||
3 | # | ||
4 | |||
5 | CFLAGS_csum-partial.o := -funroll-loops | ||
6 | |||
7 | obj-y := io.o iomap_copy.o | ||
8 | obj-$(CONFIG_SMP) += msr-on-cpu.o | ||
9 | |||
10 | lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ | ||
11 | usercopy.o getuser.o putuser.o \ | ||
12 | thunk.o clear_page.o copy_page.o bitstr.o bitops.o | ||
13 | lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o | ||
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c deleted file mode 100644 index 95b6d9639fba..000000000000 --- a/arch/x86_64/lib/bitops.c +++ /dev/null | |||
@@ -1,175 +0,0 @@ | |||
1 | #include <linux/bitops.h> | ||
2 | |||
3 | #undef find_first_zero_bit | ||
4 | #undef find_next_zero_bit | ||
5 | #undef find_first_bit | ||
6 | #undef find_next_bit | ||
7 | |||
8 | static inline long | ||
9 | __find_first_zero_bit(const unsigned long * addr, unsigned long size) | ||
10 | { | ||
11 | long d0, d1, d2; | ||
12 | long res; | ||
13 | |||
14 | /* | ||
15 | * We must test the size in words, not in bits, because | ||
16 | * otherwise incoming sizes in the range -63..-1 will not run | ||
17 | * any scasq instructions, and then the flags used by the je | ||
18 | * instruction will have whatever random value was in place | ||
19 | * before. Nobody should call us like that, but | ||
20 | * find_next_zero_bit() does when offset and size are at the | ||
21 | * same word and it fails to find a zero itself. | ||
22 | */ | ||
23 | size += 63; | ||
24 | size >>= 6; | ||
25 | if (!size) | ||
26 | return 0; | ||
27 | asm volatile( | ||
28 | " repe; scasq\n" | ||
29 | " je 1f\n" | ||
30 | " xorq -8(%%rdi),%%rax\n" | ||
31 | " subq $8,%%rdi\n" | ||
32 | " bsfq %%rax,%%rdx\n" | ||
33 | "1: subq %[addr],%%rdi\n" | ||
34 | " shlq $3,%%rdi\n" | ||
35 | " addq %%rdi,%%rdx" | ||
36 | :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) | ||
37 | :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), | ||
38 | [addr] "S" (addr) : "memory"); | ||
39 | /* | ||
40 | * Any register would do for [addr] above, but GCC tends to | ||
41 | * prefer rbx over rsi, even though rsi is readily available | ||
42 | * and doesn't have to be saved. | ||
43 | */ | ||
44 | return res; | ||
45 | } | ||
46 | |||
47 | /** | ||
48 | * find_first_zero_bit - find the first zero bit in a memory region | ||
49 | * @addr: The address to start the search at | ||
50 | * @size: The maximum size to search | ||
51 | * | ||
52 | * Returns the bit-number of the first zero bit, not the number of the byte | ||
53 | * containing a bit. | ||
54 | */ | ||
55 | long find_first_zero_bit(const unsigned long * addr, unsigned long size) | ||
56 | { | ||
57 | return __find_first_zero_bit (addr, size); | ||
58 | } | ||
59 | |||
60 | /** | ||
61 | * find_next_zero_bit - find the first zero bit in a memory region | ||
62 | * @addr: The address to base the search on | ||
63 | * @offset: The bitnumber to start searching at | ||
64 | * @size: The maximum size to search | ||
65 | */ | ||
66 | long find_next_zero_bit (const unsigned long * addr, long size, long offset) | ||
67 | { | ||
68 | const unsigned long * p = addr + (offset >> 6); | ||
69 | unsigned long set = 0; | ||
70 | unsigned long res, bit = offset&63; | ||
71 | |||
72 | if (bit) { | ||
73 | /* | ||
74 | * Look for zero in first word | ||
75 | */ | ||
76 | asm("bsfq %1,%0\n\t" | ||
77 | "cmoveq %2,%0" | ||
78 | : "=r" (set) | ||
79 | : "r" (~(*p >> bit)), "r"(64L)); | ||
80 | if (set < (64 - bit)) | ||
81 | return set + offset; | ||
82 | set = 64 - bit; | ||
83 | p++; | ||
84 | } | ||
85 | /* | ||
86 | * No zero yet, search remaining full words for a zero | ||
87 | */ | ||
88 | res = __find_first_zero_bit (p, size - 64 * (p - addr)); | ||
89 | |||
90 | return (offset + set + res); | ||
91 | } | ||
92 | |||
93 | static inline long | ||
94 | __find_first_bit(const unsigned long * addr, unsigned long size) | ||
95 | { | ||
96 | long d0, d1; | ||
97 | long res; | ||
98 | |||
99 | /* | ||
100 | * We must test the size in words, not in bits, because | ||
101 | * otherwise incoming sizes in the range -63..-1 will not run | ||
102 | * any scasq instructions, and then the flags used by the jz | ||
103 | * instruction will have whatever random value was in place | ||
104 | * before. Nobody should call us like that, but | ||
105 | * find_next_bit() does when offset and size are at the same | ||
106 | * word and it fails to find a one itself. | ||
107 | */ | ||
108 | size += 63; | ||
109 | size >>= 6; | ||
110 | if (!size) | ||
111 | return 0; | ||
112 | asm volatile( | ||
113 | " repe; scasq\n" | ||
114 | " jz 1f\n" | ||
115 | " subq $8,%%rdi\n" | ||
116 | " bsfq (%%rdi),%%rax\n" | ||
117 | "1: subq %[addr],%%rdi\n" | ||
118 | " shlq $3,%%rdi\n" | ||
119 | " addq %%rdi,%%rax" | ||
120 | :"=a" (res), "=&c" (d0), "=&D" (d1) | ||
121 | :"0" (0ULL), "1" (size), "2" (addr), | ||
122 | [addr] "r" (addr) : "memory"); | ||
123 | return res; | ||
124 | } | ||
125 | |||
126 | /** | ||
127 | * find_first_bit - find the first set bit in a memory region | ||
128 | * @addr: The address to start the search at | ||
129 | * @size: The maximum size to search | ||
130 | * | ||
131 | * Returns the bit-number of the first set bit, not the number of the byte | ||
132 | * containing a bit. | ||
133 | */ | ||
134 | long find_first_bit(const unsigned long * addr, unsigned long size) | ||
135 | { | ||
136 | return __find_first_bit(addr,size); | ||
137 | } | ||
138 | |||
139 | /** | ||
140 | * find_next_bit - find the first set bit in a memory region | ||
141 | * @addr: The address to base the search on | ||
142 | * @offset: The bitnumber to start searching at | ||
143 | * @size: The maximum size to search | ||
144 | */ | ||
145 | long find_next_bit(const unsigned long * addr, long size, long offset) | ||
146 | { | ||
147 | const unsigned long * p = addr + (offset >> 6); | ||
148 | unsigned long set = 0, bit = offset & 63, res; | ||
149 | |||
150 | if (bit) { | ||
151 | /* | ||
152 | * Look for nonzero in the first 64 bits: | ||
153 | */ | ||
154 | asm("bsfq %1,%0\n\t" | ||
155 | "cmoveq %2,%0\n\t" | ||
156 | : "=r" (set) | ||
157 | : "r" (*p >> bit), "r" (64L)); | ||
158 | if (set < (64 - bit)) | ||
159 | return set + offset; | ||
160 | set = 64 - bit; | ||
161 | p++; | ||
162 | } | ||
163 | /* | ||
164 | * No set bit yet, search remaining full words for a bit | ||
165 | */ | ||
166 | res = __find_first_bit (p, size - 64 * (p - addr)); | ||
167 | return (offset + set + res); | ||
168 | } | ||
169 | |||
170 | #include <linux/module.h> | ||
171 | |||
172 | EXPORT_SYMBOL(find_next_bit); | ||
173 | EXPORT_SYMBOL(find_first_bit); | ||
174 | EXPORT_SYMBOL(find_first_zero_bit); | ||
175 | EXPORT_SYMBOL(find_next_zero_bit); | ||
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c deleted file mode 100644 index 24676609a6ac..000000000000 --- a/arch/x86_64/lib/bitstr.c +++ /dev/null | |||
@@ -1,28 +0,0 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/bitops.h> | ||
3 | |||
4 | /* Find string of zero bits in a bitmap */ | ||
5 | unsigned long | ||
6 | find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) | ||
7 | { | ||
8 | unsigned long n, end, i; | ||
9 | |||
10 | again: | ||
11 | n = find_next_zero_bit(bitmap, nbits, start); | ||
12 | if (n == -1) | ||
13 | return -1; | ||
14 | |||
15 | /* could test bitsliced, but it's hardly worth it */ | ||
16 | end = n+len; | ||
17 | if (end >= nbits) | ||
18 | return -1; | ||
19 | for (i = n+1; i < end; i++) { | ||
20 | if (test_bit(i, bitmap)) { | ||
21 | start = i+1; | ||
22 | goto again; | ||
23 | } | ||
24 | } | ||
25 | return n; | ||
26 | } | ||
27 | |||
28 | EXPORT_SYMBOL(find_next_zero_string); | ||
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S deleted file mode 100644 index 9a10a78bb4a4..000000000000 --- a/arch/x86_64/lib/clear_page.S +++ /dev/null | |||
@@ -1,59 +0,0 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <asm/dwarf2.h> | ||
3 | |||
4 | /* | ||
5 | * Zero a page. | ||
6 | * rdi page | ||
7 | */ | ||
8 | ALIGN | ||
9 | clear_page_c: | ||
10 | CFI_STARTPROC | ||
11 | movl $4096/8,%ecx | ||
12 | xorl %eax,%eax | ||
13 | rep stosq | ||
14 | ret | ||
15 | CFI_ENDPROC | ||
16 | ENDPROC(clear_page) | ||
17 | |||
18 | ENTRY(clear_page) | ||
19 | CFI_STARTPROC | ||
20 | xorl %eax,%eax | ||
21 | movl $4096/64,%ecx | ||
22 | .p2align 4 | ||
23 | .Lloop: | ||
24 | decl %ecx | ||
25 | #define PUT(x) movq %rax,x*8(%rdi) | ||
26 | movq %rax,(%rdi) | ||
27 | PUT(1) | ||
28 | PUT(2) | ||
29 | PUT(3) | ||
30 | PUT(4) | ||
31 | PUT(5) | ||
32 | PUT(6) | ||
33 | PUT(7) | ||
34 | leaq 64(%rdi),%rdi | ||
35 | jnz .Lloop | ||
36 | nop | ||
37 | ret | ||
38 | CFI_ENDPROC | ||
39 | .Lclear_page_end: | ||
40 | ENDPROC(clear_page) | ||
41 | |||
42 | /* Some CPUs run faster using the string instructions. | ||
43 | It is also a lot simpler. Use this when possible */ | ||
44 | |||
45 | #include <asm/cpufeature.h> | ||
46 | |||
47 | .section .altinstr_replacement,"ax" | ||
48 | 1: .byte 0xeb /* jmp <disp8> */ | ||
49 | .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ | ||
50 | 2: | ||
51 | .previous | ||
52 | .section .altinstructions,"a" | ||
53 | .align 8 | ||
54 | .quad clear_page | ||
55 | .quad 1b | ||
56 | .byte X86_FEATURE_REP_GOOD | ||
57 | .byte .Lclear_page_end - clear_page | ||
58 | .byte 2b - 1b | ||
59 | .previous | ||
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S deleted file mode 100644 index 727a5d46d2fc..000000000000 --- a/arch/x86_64/lib/copy_page.S +++ /dev/null | |||
@@ -1,119 +0,0 @@ | |||
1 | /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <asm/dwarf2.h> | ||
5 | |||
6 | ALIGN | ||
7 | copy_page_c: | ||
8 | CFI_STARTPROC | ||
9 | movl $4096/8,%ecx | ||
10 | rep movsq | ||
11 | ret | ||
12 | CFI_ENDPROC | ||
13 | ENDPROC(copy_page_c) | ||
14 | |||
15 | /* Don't use streaming store because it's better when the target | ||
16 | ends up in cache. */ | ||
17 | |||
18 | /* Could vary the prefetch distance based on SMP/UP */ | ||
19 | |||
20 | ENTRY(copy_page) | ||
21 | CFI_STARTPROC | ||
22 | subq $3*8,%rsp | ||
23 | CFI_ADJUST_CFA_OFFSET 3*8 | ||
24 | movq %rbx,(%rsp) | ||
25 | CFI_REL_OFFSET rbx, 0 | ||
26 | movq %r12,1*8(%rsp) | ||
27 | CFI_REL_OFFSET r12, 1*8 | ||
28 | movq %r13,2*8(%rsp) | ||
29 | CFI_REL_OFFSET r13, 2*8 | ||
30 | |||
31 | movl $(4096/64)-5,%ecx | ||
32 | .p2align 4 | ||
33 | .Loop64: | ||
34 | dec %rcx | ||
35 | |||
36 | movq (%rsi), %rax | ||
37 | movq 8 (%rsi), %rbx | ||
38 | movq 16 (%rsi), %rdx | ||
39 | movq 24 (%rsi), %r8 | ||
40 | movq 32 (%rsi), %r9 | ||
41 | movq 40 (%rsi), %r10 | ||
42 | movq 48 (%rsi), %r11 | ||
43 | movq 56 (%rsi), %r12 | ||
44 | |||
45 | prefetcht0 5*64(%rsi) | ||
46 | |||
47 | movq %rax, (%rdi) | ||
48 | movq %rbx, 8 (%rdi) | ||
49 | movq %rdx, 16 (%rdi) | ||
50 | movq %r8, 24 (%rdi) | ||
51 | movq %r9, 32 (%rdi) | ||
52 | movq %r10, 40 (%rdi) | ||
53 | movq %r11, 48 (%rdi) | ||
54 | movq %r12, 56 (%rdi) | ||
55 | |||
56 | leaq 64 (%rsi), %rsi | ||
57 | leaq 64 (%rdi), %rdi | ||
58 | |||
59 | jnz .Loop64 | ||
60 | |||
61 | movl $5,%ecx | ||
62 | .p2align 4 | ||
63 | .Loop2: | ||
64 | decl %ecx | ||
65 | |||
66 | movq (%rsi), %rax | ||
67 | movq 8 (%rsi), %rbx | ||
68 | movq 16 (%rsi), %rdx | ||
69 | movq 24 (%rsi), %r8 | ||
70 | movq 32 (%rsi), %r9 | ||
71 | movq 40 (%rsi), %r10 | ||
72 | movq 48 (%rsi), %r11 | ||
73 | movq 56 (%rsi), %r12 | ||
74 | |||
75 | movq %rax, (%rdi) | ||
76 | movq %rbx, 8 (%rdi) | ||
77 | movq %rdx, 16 (%rdi) | ||
78 | movq %r8, 24 (%rdi) | ||
79 | movq %r9, 32 (%rdi) | ||
80 | movq %r10, 40 (%rdi) | ||
81 | movq %r11, 48 (%rdi) | ||
82 | movq %r12, 56 (%rdi) | ||
83 | |||
84 | leaq 64(%rdi),%rdi | ||
85 | leaq 64(%rsi),%rsi | ||
86 | |||
87 | jnz .Loop2 | ||
88 | |||
89 | movq (%rsp),%rbx | ||
90 | CFI_RESTORE rbx | ||
91 | movq 1*8(%rsp),%r12 | ||
92 | CFI_RESTORE r12 | ||
93 | movq 2*8(%rsp),%r13 | ||
94 | CFI_RESTORE r13 | ||
95 | addq $3*8,%rsp | ||
96 | CFI_ADJUST_CFA_OFFSET -3*8 | ||
97 | ret | ||
98 | .Lcopy_page_end: | ||
99 | CFI_ENDPROC | ||
100 | ENDPROC(copy_page) | ||
101 | |||
102 | /* Some CPUs run faster using the string copy instructions. | ||
103 | It is also a lot simpler. Use this when possible */ | ||
104 | |||
105 | #include <asm/cpufeature.h> | ||
106 | |||
107 | .section .altinstr_replacement,"ax" | ||
108 | 1: .byte 0xeb /* jmp <disp8> */ | ||
109 | .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ | ||
110 | 2: | ||
111 | .previous | ||
112 | .section .altinstructions,"a" | ||
113 | .align 8 | ||
114 | .quad copy_page | ||
115 | .quad 1b | ||
116 | .byte X86_FEATURE_REP_GOOD | ||
117 | .byte .Lcopy_page_end - copy_page | ||
118 | .byte 2b - 1b | ||
119 | .previous | ||
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S deleted file mode 100644 index 70bebd310408..000000000000 --- a/arch/x86_64/lib/copy_user.S +++ /dev/null | |||
@@ -1,354 +0,0 @@ | |||
1 | /* Copyright 2002 Andi Kleen, SuSE Labs. | ||
2 | * Subject to the GNU Public License v2. | ||
3 | * | ||
4 | * Functions to copy from and to user space. | ||
5 | */ | ||
6 | |||
7 | #include <linux/linkage.h> | ||
8 | #include <asm/dwarf2.h> | ||
9 | |||
10 | #define FIX_ALIGNMENT 1 | ||
11 | |||
12 | #include <asm/current.h> | ||
13 | #include <asm/asm-offsets.h> | ||
14 | #include <asm/thread_info.h> | ||
15 | #include <asm/cpufeature.h> | ||
16 | |||
17 | .macro ALTERNATIVE_JUMP feature,orig,alt | ||
18 | 0: | ||
19 | .byte 0xe9 /* 32bit jump */ | ||
20 | .long \orig-1f /* by default jump to orig */ | ||
21 | 1: | ||
22 | .section .altinstr_replacement,"ax" | ||
23 | 2: .byte 0xe9 /* near jump with 32bit immediate */ | ||
24 | .long \alt-1b /* offset */ /* or alternatively to alt */ | ||
25 | .previous | ||
26 | .section .altinstructions,"a" | ||
27 | .align 8 | ||
28 | .quad 0b | ||
29 | .quad 2b | ||
30 | .byte \feature /* when feature is set */ | ||
31 | .byte 5 | ||
32 | .byte 5 | ||
33 | .previous | ||
34 | .endm | ||
35 | |||
36 | /* Standard copy_to_user with segment limit checking */ | ||
37 | ENTRY(copy_to_user) | ||
38 | CFI_STARTPROC | ||
39 | GET_THREAD_INFO(%rax) | ||
40 | movq %rdi,%rcx | ||
41 | addq %rdx,%rcx | ||
42 | jc bad_to_user | ||
43 | cmpq threadinfo_addr_limit(%rax),%rcx | ||
44 | jae bad_to_user | ||
45 | xorl %eax,%eax /* clear zero flag */ | ||
46 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string | ||
47 | CFI_ENDPROC | ||
48 | |||
49 | ENTRY(copy_user_generic) | ||
50 | CFI_STARTPROC | ||
51 | movl $1,%ecx /* set zero flag */ | ||
52 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string | ||
53 | CFI_ENDPROC | ||
54 | |||
55 | ENTRY(__copy_from_user_inatomic) | ||
56 | CFI_STARTPROC | ||
57 | xorl %ecx,%ecx /* clear zero flag */ | ||
58 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string | ||
59 | CFI_ENDPROC | ||
60 | |||
61 | /* Standard copy_from_user with segment limit checking */ | ||
62 | ENTRY(copy_from_user) | ||
63 | CFI_STARTPROC | ||
64 | GET_THREAD_INFO(%rax) | ||
65 | movq %rsi,%rcx | ||
66 | addq %rdx,%rcx | ||
67 | jc bad_from_user | ||
68 | cmpq threadinfo_addr_limit(%rax),%rcx | ||
69 | jae bad_from_user | ||
70 | movl $1,%ecx /* set zero flag */ | ||
71 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string | ||
72 | CFI_ENDPROC | ||
73 | ENDPROC(copy_from_user) | ||
74 | |||
75 | .section .fixup,"ax" | ||
76 | /* must zero dest */ | ||
77 | bad_from_user: | ||
78 | CFI_STARTPROC | ||
79 | movl %edx,%ecx | ||
80 | xorl %eax,%eax | ||
81 | rep | ||
82 | stosb | ||
83 | bad_to_user: | ||
84 | movl %edx,%eax | ||
85 | ret | ||
86 | CFI_ENDPROC | ||
87 | END(bad_from_user) | ||
88 | .previous | ||
89 | |||
90 | |||
91 | /* | ||
92 | * copy_user_generic_unrolled - memory copy with exception handling. | ||
93 | * This version is for CPUs like P4 that don't have efficient micro code for rep movsq | ||
94 | * | ||
95 | * Input: | ||
96 | * rdi destination | ||
97 | * rsi source | ||
98 | * rdx count | ||
99 | * ecx zero flag -- if true zero destination on error | ||
100 | * | ||
101 | * Output: | ||
102 | * eax uncopied bytes or 0 if successful. | ||
103 | */ | ||
104 | ENTRY(copy_user_generic_unrolled) | ||
105 | CFI_STARTPROC | ||
106 | pushq %rbx | ||
107 | CFI_ADJUST_CFA_OFFSET 8 | ||
108 | CFI_REL_OFFSET rbx, 0 | ||
109 | pushq %rcx | ||
110 | CFI_ADJUST_CFA_OFFSET 8 | ||
111 | CFI_REL_OFFSET rcx, 0 | ||
112 | xorl %eax,%eax /*zero for the exception handler */ | ||
113 | |||
114 | #ifdef FIX_ALIGNMENT | ||
115 | /* check for bad alignment of destination */ | ||
116 | movl %edi,%ecx | ||
117 | andl $7,%ecx | ||
118 | jnz .Lbad_alignment | ||
119 | .Lafter_bad_alignment: | ||
120 | #endif | ||
121 | |||
122 | movq %rdx,%rcx | ||
123 | |||
124 | movl $64,%ebx | ||
125 | shrq $6,%rdx | ||
126 | decq %rdx | ||
127 | js .Lhandle_tail | ||
128 | |||
129 | .p2align 4 | ||
130 | .Lloop: | ||
131 | .Ls1: movq (%rsi),%r11 | ||
132 | .Ls2: movq 1*8(%rsi),%r8 | ||
133 | .Ls3: movq 2*8(%rsi),%r9 | ||
134 | .Ls4: movq 3*8(%rsi),%r10 | ||
135 | .Ld1: movq %r11,(%rdi) | ||
136 | .Ld2: movq %r8,1*8(%rdi) | ||
137 | .Ld3: movq %r9,2*8(%rdi) | ||
138 | .Ld4: movq %r10,3*8(%rdi) | ||
139 | |||
140 | .Ls5: movq 4*8(%rsi),%r11 | ||
141 | .Ls6: movq 5*8(%rsi),%r8 | ||
142 | .Ls7: movq 6*8(%rsi),%r9 | ||
143 | .Ls8: movq 7*8(%rsi),%r10 | ||
144 | .Ld5: movq %r11,4*8(%rdi) | ||
145 | .Ld6: movq %r8,5*8(%rdi) | ||
146 | .Ld7: movq %r9,6*8(%rdi) | ||
147 | .Ld8: movq %r10,7*8(%rdi) | ||
148 | |||
149 | decq %rdx | ||
150 | |||
151 | leaq 64(%rsi),%rsi | ||
152 | leaq 64(%rdi),%rdi | ||
153 | |||
154 | jns .Lloop | ||
155 | |||
156 | .p2align 4 | ||
157 | .Lhandle_tail: | ||
158 | movl %ecx,%edx | ||
159 | andl $63,%ecx | ||
160 | shrl $3,%ecx | ||
161 | jz .Lhandle_7 | ||
162 | movl $8,%ebx | ||
163 | .p2align 4 | ||
164 | .Lloop_8: | ||
165 | .Ls9: movq (%rsi),%r8 | ||
166 | .Ld9: movq %r8,(%rdi) | ||
167 | decl %ecx | ||
168 | leaq 8(%rdi),%rdi | ||
169 | leaq 8(%rsi),%rsi | ||
170 | jnz .Lloop_8 | ||
171 | |||
172 | .Lhandle_7: | ||
173 | movl %edx,%ecx | ||
174 | andl $7,%ecx | ||
175 | jz .Lende | ||
176 | .p2align 4 | ||
177 | .Lloop_1: | ||
178 | .Ls10: movb (%rsi),%bl | ||
179 | .Ld10: movb %bl,(%rdi) | ||
180 | incq %rdi | ||
181 | incq %rsi | ||
182 | decl %ecx | ||
183 | jnz .Lloop_1 | ||
184 | |||
185 | CFI_REMEMBER_STATE | ||
186 | .Lende: | ||
187 | popq %rcx | ||
188 | CFI_ADJUST_CFA_OFFSET -8 | ||
189 | CFI_RESTORE rcx | ||
190 | popq %rbx | ||
191 | CFI_ADJUST_CFA_OFFSET -8 | ||
192 | CFI_RESTORE rbx | ||
193 | ret | ||
194 | CFI_RESTORE_STATE | ||
195 | |||
196 | #ifdef FIX_ALIGNMENT | ||
197 | /* align destination */ | ||
198 | .p2align 4 | ||
199 | .Lbad_alignment: | ||
200 | movl $8,%r9d | ||
201 | subl %ecx,%r9d | ||
202 | movl %r9d,%ecx | ||
203 | cmpq %r9,%rdx | ||
204 | jz .Lhandle_7 | ||
205 | js .Lhandle_7 | ||
206 | .Lalign_1: | ||
207 | .Ls11: movb (%rsi),%bl | ||
208 | .Ld11: movb %bl,(%rdi) | ||
209 | incq %rsi | ||
210 | incq %rdi | ||
211 | decl %ecx | ||
212 | jnz .Lalign_1 | ||
213 | subq %r9,%rdx | ||
214 | jmp .Lafter_bad_alignment | ||
215 | #endif | ||
216 | |||
217 | /* table sorted by exception address */ | ||
218 | .section __ex_table,"a" | ||
219 | .align 8 | ||
220 | .quad .Ls1,.Ls1e | ||
221 | .quad .Ls2,.Ls2e | ||
222 | .quad .Ls3,.Ls3e | ||
223 | .quad .Ls4,.Ls4e | ||
224 | .quad .Ld1,.Ls1e | ||
225 | .quad .Ld2,.Ls2e | ||
226 | .quad .Ld3,.Ls3e | ||
227 | .quad .Ld4,.Ls4e | ||
228 | .quad .Ls5,.Ls5e | ||
229 | .quad .Ls6,.Ls6e | ||
230 | .quad .Ls7,.Ls7e | ||
231 | .quad .Ls8,.Ls8e | ||
232 | .quad .Ld5,.Ls5e | ||
233 | .quad .Ld6,.Ls6e | ||
234 | .quad .Ld7,.Ls7e | ||
235 | .quad .Ld8,.Ls8e | ||
236 | .quad .Ls9,.Le_quad | ||
237 | .quad .Ld9,.Le_quad | ||
238 | .quad .Ls10,.Le_byte | ||
239 | .quad .Ld10,.Le_byte | ||
240 | #ifdef FIX_ALIGNMENT | ||
241 | .quad .Ls11,.Lzero_rest | ||
242 | .quad .Ld11,.Lzero_rest | ||
243 | #endif | ||
244 | .quad .Le5,.Le_zero | ||
245 | .previous | ||
246 | |||
247 | /* compute 64-offset for main loop. 8 bytes accuracy with error on the | ||
248 | pessimistic side. this is gross. it would be better to fix the | ||
249 | interface. */ | ||
250 | /* eax: zero, ebx: 64 */ | ||
251 | .Ls1e: addl $8,%eax | ||
252 | .Ls2e: addl $8,%eax | ||
253 | .Ls3e: addl $8,%eax | ||
254 | .Ls4e: addl $8,%eax | ||
255 | .Ls5e: addl $8,%eax | ||
256 | .Ls6e: addl $8,%eax | ||
257 | .Ls7e: addl $8,%eax | ||
258 | .Ls8e: addl $8,%eax | ||
259 | addq %rbx,%rdi /* +64 */ | ||
260 | subq %rax,%rdi /* correct destination with computed offset */ | ||
261 | |||
262 | shlq $6,%rdx /* loop counter * 64 (stride length) */ | ||
263 | addq %rax,%rdx /* add offset to loopcnt */ | ||
264 | andl $63,%ecx /* remaining bytes */ | ||
265 | addq %rcx,%rdx /* add them */ | ||
266 | jmp .Lzero_rest | ||
267 | |||
268 | /* exception on quad word loop in tail handling */ | ||
269 | /* ecx: loopcnt/8, %edx: length, rdi: correct */ | ||
270 | .Le_quad: | ||
271 | shll $3,%ecx | ||
272 | andl $7,%edx | ||
273 | addl %ecx,%edx | ||
274 | /* edx: bytes to zero, rdi: dest, eax:zero */ | ||
275 | .Lzero_rest: | ||
276 | cmpl $0,(%rsp) | ||
277 | jz .Le_zero | ||
278 | movq %rdx,%rcx | ||
279 | .Le_byte: | ||
280 | xorl %eax,%eax | ||
281 | .Le5: rep | ||
282 | stosb | ||
283 | /* when there is another exception while zeroing the rest just return */ | ||
284 | .Le_zero: | ||
285 | movq %rdx,%rax | ||
286 | jmp .Lende | ||
287 | CFI_ENDPROC | ||
288 | ENDPROC(copy_user_generic) | ||
289 | |||
290 | |||
291 | /* Some CPUs run faster using the string copy instructions. | ||
292 | This is also a lot simpler. Use them when possible. | ||
293 | Patch in jmps to this code instead of copying it fully | ||
294 | to avoid unwanted aliasing in the exception tables. */ | ||
295 | |||
296 | /* rdi destination | ||
297 | * rsi source | ||
298 | * rdx count | ||
299 | * ecx zero flag | ||
300 | * | ||
301 | * Output: | ||
302 | * eax uncopied bytes or 0 if successfull. | ||
303 | * | ||
304 | * Only 4GB of copy is supported. This shouldn't be a problem | ||
305 | * because the kernel normally only writes from/to page sized chunks | ||
306 | * even if user space passed a longer buffer. | ||
307 | * And more would be dangerous because both Intel and AMD have | ||
308 | * errata with rep movsq > 4GB. If someone feels the need to fix | ||
309 | * this please consider this. | ||
310 | */ | ||
311 | ENTRY(copy_user_generic_string) | ||
312 | CFI_STARTPROC | ||
313 | movl %ecx,%r8d /* save zero flag */ | ||
314 | movl %edx,%ecx | ||
315 | shrl $3,%ecx | ||
316 | andl $7,%edx | ||
317 | jz 10f | ||
318 | 1: rep | ||
319 | movsq | ||
320 | movl %edx,%ecx | ||
321 | 2: rep | ||
322 | movsb | ||
323 | 9: movl %ecx,%eax | ||
324 | ret | ||
325 | |||
326 | /* multiple of 8 byte */ | ||
327 | 10: rep | ||
328 | movsq | ||
329 | xor %eax,%eax | ||
330 | ret | ||
331 | |||
332 | /* exception handling */ | ||
333 | 3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ | ||
334 | jmp 6f | ||
335 | 5: movl %ecx,%eax /* exception on byte loop */ | ||
336 | /* eax: left over bytes */ | ||
337 | 6: testl %r8d,%r8d /* zero flag set? */ | ||
338 | jz 7f | ||
339 | movl %eax,%ecx /* initialize x86 loop counter */ | ||
340 | push %rax | ||
341 | xorl %eax,%eax | ||
342 | 8: rep | ||
343 | stosb /* zero the rest */ | ||
344 | 11: pop %rax | ||
345 | 7: ret | ||
346 | CFI_ENDPROC | ||
347 | END(copy_user_generic_c) | ||
348 | |||
349 | .section __ex_table,"a" | ||
350 | .quad 1b,3b | ||
351 | .quad 2b,5b | ||
352 | .quad 8b,11b | ||
353 | .quad 10b,3b | ||
354 | .previous | ||
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S deleted file mode 100644 index 4620efb12f13..000000000000 --- a/arch/x86_64/lib/copy_user_nocache.S +++ /dev/null | |||
@@ -1,217 +0,0 @@ | |||
1 | /* Copyright 2002 Andi Kleen, SuSE Labs. | ||
2 | * Subject to the GNU Public License v2. | ||
3 | * | ||
4 | * Functions to copy from and to user space. | ||
5 | */ | ||
6 | |||
7 | #include <linux/linkage.h> | ||
8 | #include <asm/dwarf2.h> | ||
9 | |||
10 | #define FIX_ALIGNMENT 1 | ||
11 | |||
12 | #include <asm/current.h> | ||
13 | #include <asm/asm-offsets.h> | ||
14 | #include <asm/thread_info.h> | ||
15 | #include <asm/cpufeature.h> | ||
16 | |||
17 | /* | ||
18 | * copy_user_nocache - Uncached memory copy with exception handling | ||
19 | * This will force destination/source out of cache for more performance. | ||
20 | * | ||
21 | * Input: | ||
22 | * rdi destination | ||
23 | * rsi source | ||
24 | * rdx count | ||
25 | * rcx zero flag when 1 zero on exception | ||
26 | * | ||
27 | * Output: | ||
28 | * eax uncopied bytes or 0 if successful. | ||
29 | */ | ||
30 | ENTRY(__copy_user_nocache) | ||
31 | CFI_STARTPROC | ||
32 | pushq %rbx | ||
33 | CFI_ADJUST_CFA_OFFSET 8 | ||
34 | CFI_REL_OFFSET rbx, 0 | ||
35 | pushq %rcx /* save zero flag */ | ||
36 | CFI_ADJUST_CFA_OFFSET 8 | ||
37 | CFI_REL_OFFSET rcx, 0 | ||
38 | |||
39 | xorl %eax,%eax /* zero for the exception handler */ | ||
40 | |||
41 | #ifdef FIX_ALIGNMENT | ||
42 | /* check for bad alignment of destination */ | ||
43 | movl %edi,%ecx | ||
44 | andl $7,%ecx | ||
45 | jnz .Lbad_alignment | ||
46 | .Lafter_bad_alignment: | ||
47 | #endif | ||
48 | |||
49 | movq %rdx,%rcx | ||
50 | |||
51 | movl $64,%ebx | ||
52 | shrq $6,%rdx | ||
53 | decq %rdx | ||
54 | js .Lhandle_tail | ||
55 | |||
56 | .p2align 4 | ||
57 | .Lloop: | ||
58 | .Ls1: movq (%rsi),%r11 | ||
59 | .Ls2: movq 1*8(%rsi),%r8 | ||
60 | .Ls3: movq 2*8(%rsi),%r9 | ||
61 | .Ls4: movq 3*8(%rsi),%r10 | ||
62 | .Ld1: movnti %r11,(%rdi) | ||
63 | .Ld2: movnti %r8,1*8(%rdi) | ||
64 | .Ld3: movnti %r9,2*8(%rdi) | ||
65 | .Ld4: movnti %r10,3*8(%rdi) | ||
66 | |||
67 | .Ls5: movq 4*8(%rsi),%r11 | ||
68 | .Ls6: movq 5*8(%rsi),%r8 | ||
69 | .Ls7: movq 6*8(%rsi),%r9 | ||
70 | .Ls8: movq 7*8(%rsi),%r10 | ||
71 | .Ld5: movnti %r11,4*8(%rdi) | ||
72 | .Ld6: movnti %r8,5*8(%rdi) | ||
73 | .Ld7: movnti %r9,6*8(%rdi) | ||
74 | .Ld8: movnti %r10,7*8(%rdi) | ||
75 | |||
76 | dec %rdx | ||
77 | |||
78 | leaq 64(%rsi),%rsi | ||
79 | leaq 64(%rdi),%rdi | ||
80 | |||
81 | jns .Lloop | ||
82 | |||
83 | .p2align 4 | ||
84 | .Lhandle_tail: | ||
85 | movl %ecx,%edx | ||
86 | andl $63,%ecx | ||
87 | shrl $3,%ecx | ||
88 | jz .Lhandle_7 | ||
89 | movl $8,%ebx | ||
90 | .p2align 4 | ||
91 | .Lloop_8: | ||
92 | .Ls9: movq (%rsi),%r8 | ||
93 | .Ld9: movnti %r8,(%rdi) | ||
94 | decl %ecx | ||
95 | leaq 8(%rdi),%rdi | ||
96 | leaq 8(%rsi),%rsi | ||
97 | jnz .Lloop_8 | ||
98 | |||
99 | .Lhandle_7: | ||
100 | movl %edx,%ecx | ||
101 | andl $7,%ecx | ||
102 | jz .Lende | ||
103 | .p2align 4 | ||
104 | .Lloop_1: | ||
105 | .Ls10: movb (%rsi),%bl | ||
106 | .Ld10: movb %bl,(%rdi) | ||
107 | incq %rdi | ||
108 | incq %rsi | ||
109 | decl %ecx | ||
110 | jnz .Lloop_1 | ||
111 | |||
112 | CFI_REMEMBER_STATE | ||
113 | .Lende: | ||
114 | popq %rcx | ||
115 | CFI_ADJUST_CFA_OFFSET -8 | ||
116 | CFI_RESTORE %rcx | ||
117 | popq %rbx | ||
118 | CFI_ADJUST_CFA_OFFSET -8 | ||
119 | CFI_RESTORE rbx | ||
120 | ret | ||
121 | CFI_RESTORE_STATE | ||
122 | |||
123 | #ifdef FIX_ALIGNMENT | ||
124 | /* align destination */ | ||
125 | .p2align 4 | ||
126 | .Lbad_alignment: | ||
127 | movl $8,%r9d | ||
128 | subl %ecx,%r9d | ||
129 | movl %r9d,%ecx | ||
130 | cmpq %r9,%rdx | ||
131 | jz .Lhandle_7 | ||
132 | js .Lhandle_7 | ||
133 | .Lalign_1: | ||
134 | .Ls11: movb (%rsi),%bl | ||
135 | .Ld11: movb %bl,(%rdi) | ||
136 | incq %rsi | ||
137 | incq %rdi | ||
138 | decl %ecx | ||
139 | jnz .Lalign_1 | ||
140 | subq %r9,%rdx | ||
141 | jmp .Lafter_bad_alignment | ||
142 | #endif | ||
143 | |||
144 | /* table sorted by exception address */ | ||
145 | .section __ex_table,"a" | ||
146 | .align 8 | ||
147 | .quad .Ls1,.Ls1e | ||
148 | .quad .Ls2,.Ls2e | ||
149 | .quad .Ls3,.Ls3e | ||
150 | .quad .Ls4,.Ls4e | ||
151 | .quad .Ld1,.Ls1e | ||
152 | .quad .Ld2,.Ls2e | ||
153 | .quad .Ld3,.Ls3e | ||
154 | .quad .Ld4,.Ls4e | ||
155 | .quad .Ls5,.Ls5e | ||
156 | .quad .Ls6,.Ls6e | ||
157 | .quad .Ls7,.Ls7e | ||
158 | .quad .Ls8,.Ls8e | ||
159 | .quad .Ld5,.Ls5e | ||
160 | .quad .Ld6,.Ls6e | ||
161 | .quad .Ld7,.Ls7e | ||
162 | .quad .Ld8,.Ls8e | ||
163 | .quad .Ls9,.Le_quad | ||
164 | .quad .Ld9,.Le_quad | ||
165 | .quad .Ls10,.Le_byte | ||
166 | .quad .Ld10,.Le_byte | ||
167 | #ifdef FIX_ALIGNMENT | ||
168 | .quad .Ls11,.Lzero_rest | ||
169 | .quad .Ld11,.Lzero_rest | ||
170 | #endif | ||
171 | .quad .Le5,.Le_zero | ||
172 | .previous | ||
173 | |||
174 | /* compute 64-offset for main loop. 8 bytes accuracy with error on the | ||
175 | pessimistic side. this is gross. it would be better to fix the | ||
176 | interface. */ | ||
177 | /* eax: zero, ebx: 64 */ | ||
178 | .Ls1e: addl $8,%eax | ||
179 | .Ls2e: addl $8,%eax | ||
180 | .Ls3e: addl $8,%eax | ||
181 | .Ls4e: addl $8,%eax | ||
182 | .Ls5e: addl $8,%eax | ||
183 | .Ls6e: addl $8,%eax | ||
184 | .Ls7e: addl $8,%eax | ||
185 | .Ls8e: addl $8,%eax | ||
186 | addq %rbx,%rdi /* +64 */ | ||
187 | subq %rax,%rdi /* correct destination with computed offset */ | ||
188 | |||
189 | shlq $6,%rdx /* loop counter * 64 (stride length) */ | ||
190 | addq %rax,%rdx /* add offset to loopcnt */ | ||
191 | andl $63,%ecx /* remaining bytes */ | ||
192 | addq %rcx,%rdx /* add them */ | ||
193 | jmp .Lzero_rest | ||
194 | |||
195 | /* exception on quad word loop in tail handling */ | ||
196 | /* ecx: loopcnt/8, %edx: length, rdi: correct */ | ||
197 | .Le_quad: | ||
198 | shll $3,%ecx | ||
199 | andl $7,%edx | ||
200 | addl %ecx,%edx | ||
201 | /* edx: bytes to zero, rdi: dest, eax:zero */ | ||
202 | .Lzero_rest: | ||
203 | cmpl $0,(%rsp) /* zero flag set? */ | ||
204 | jz .Le_zero | ||
205 | movq %rdx,%rcx | ||
206 | .Le_byte: | ||
207 | xorl %eax,%eax | ||
208 | .Le5: rep | ||
209 | stosb | ||
210 | /* when there is another exception while zeroing the rest just return */ | ||
211 | .Le_zero: | ||
212 | movq %rdx,%rax | ||
213 | jmp .Lende | ||
214 | CFI_ENDPROC | ||
215 | ENDPROC(__copy_user_nocache) | ||
216 | |||
217 | |||
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S deleted file mode 100644 index f0dba36578ea..000000000000 --- a/arch/x86_64/lib/csum-copy.S +++ /dev/null | |||
@@ -1,249 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
3 | * | ||
4 | * This file is subject to the terms and conditions of the GNU General Public | ||
5 | * License. See the file COPYING in the main directory of this archive | ||
6 | * for more details. No warranty for anything given at all. | ||
7 | */ | ||
8 | #include <linux/linkage.h> | ||
9 | #include <asm/dwarf2.h> | ||
10 | #include <asm/errno.h> | ||
11 | |||
12 | /* | ||
13 | * Checksum copy with exception handling. | ||
14 | * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the | ||
15 | * destination is zeroed. | ||
16 | * | ||
17 | * Input | ||
18 | * rdi source | ||
19 | * rsi destination | ||
20 | * edx len (32bit) | ||
21 | * ecx sum (32bit) | ||
22 | * r8 src_err_ptr (int) | ||
23 | * r9 dst_err_ptr (int) | ||
24 | * | ||
25 | * Output | ||
26 | * eax 64bit sum. undefined in case of exception. | ||
27 | * | ||
28 | * Wrappers need to take care of valid exception sum and zeroing. | ||
29 | * They also should align source or destination to 8 bytes. | ||
30 | */ | ||
31 | |||
32 | .macro source | ||
33 | 10: | ||
34 | .section __ex_table,"a" | ||
35 | .align 8 | ||
36 | .quad 10b,.Lbad_source | ||
37 | .previous | ||
38 | .endm | ||
39 | |||
40 | .macro dest | ||
41 | 20: | ||
42 | .section __ex_table,"a" | ||
43 | .align 8 | ||
44 | .quad 20b,.Lbad_dest | ||
45 | .previous | ||
46 | .endm | ||
47 | |||
48 | .macro ignore L=.Lignore | ||
49 | 30: | ||
50 | .section __ex_table,"a" | ||
51 | .align 8 | ||
52 | .quad 30b,\L | ||
53 | .previous | ||
54 | .endm | ||
55 | |||
56 | |||
57 | ENTRY(csum_partial_copy_generic) | ||
58 | CFI_STARTPROC | ||
59 | cmpl $3*64,%edx | ||
60 | jle .Lignore | ||
61 | |||
62 | .Lignore: | ||
63 | subq $7*8,%rsp | ||
64 | CFI_ADJUST_CFA_OFFSET 7*8 | ||
65 | movq %rbx,2*8(%rsp) | ||
66 | CFI_REL_OFFSET rbx, 2*8 | ||
67 | movq %r12,3*8(%rsp) | ||
68 | CFI_REL_OFFSET r12, 3*8 | ||
69 | movq %r14,4*8(%rsp) | ||
70 | CFI_REL_OFFSET r14, 4*8 | ||
71 | movq %r13,5*8(%rsp) | ||
72 | CFI_REL_OFFSET r13, 5*8 | ||
73 | movq %rbp,6*8(%rsp) | ||
74 | CFI_REL_OFFSET rbp, 6*8 | ||
75 | |||
76 | movq %r8,(%rsp) | ||
77 | movq %r9,1*8(%rsp) | ||
78 | |||
79 | movl %ecx,%eax | ||
80 | movl %edx,%ecx | ||
81 | |||
82 | xorl %r9d,%r9d | ||
83 | movq %rcx,%r12 | ||
84 | |||
85 | shrq $6,%r12 | ||
86 | jz .Lhandle_tail /* < 64 */ | ||
87 | |||
88 | clc | ||
89 | |||
90 | /* main loop. clear in 64 byte blocks */ | ||
91 | /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ | ||
92 | /* r11: temp3, rdx: temp4, r12 loopcnt */ | ||
93 | /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ | ||
94 | .p2align 4 | ||
95 | .Lloop: | ||
96 | source | ||
97 | movq (%rdi),%rbx | ||
98 | source | ||
99 | movq 8(%rdi),%r8 | ||
100 | source | ||
101 | movq 16(%rdi),%r11 | ||
102 | source | ||
103 | movq 24(%rdi),%rdx | ||
104 | |||
105 | source | ||
106 | movq 32(%rdi),%r10 | ||
107 | source | ||
108 | movq 40(%rdi),%rbp | ||
109 | source | ||
110 | movq 48(%rdi),%r14 | ||
111 | source | ||
112 | movq 56(%rdi),%r13 | ||
113 | |||
114 | ignore 2f | ||
115 | prefetcht0 5*64(%rdi) | ||
116 | 2: | ||
117 | adcq %rbx,%rax | ||
118 | adcq %r8,%rax | ||
119 | adcq %r11,%rax | ||
120 | adcq %rdx,%rax | ||
121 | adcq %r10,%rax | ||
122 | adcq %rbp,%rax | ||
123 | adcq %r14,%rax | ||
124 | adcq %r13,%rax | ||
125 | |||
126 | decl %r12d | ||
127 | |||
128 | dest | ||
129 | movq %rbx,(%rsi) | ||
130 | dest | ||
131 | movq %r8,8(%rsi) | ||
132 | dest | ||
133 | movq %r11,16(%rsi) | ||
134 | dest | ||
135 | movq %rdx,24(%rsi) | ||
136 | |||
137 | dest | ||
138 | movq %r10,32(%rsi) | ||
139 | dest | ||
140 | movq %rbp,40(%rsi) | ||
141 | dest | ||
142 | movq %r14,48(%rsi) | ||
143 | dest | ||
144 | movq %r13,56(%rsi) | ||
145 | |||
146 | 3: | ||
147 | |||
148 | leaq 64(%rdi),%rdi | ||
149 | leaq 64(%rsi),%rsi | ||
150 | |||
151 | jnz .Lloop | ||
152 | |||
153 | adcq %r9,%rax | ||
154 | |||
155 | /* do last upto 56 bytes */ | ||
156 | .Lhandle_tail: | ||
157 | /* ecx: count */ | ||
158 | movl %ecx,%r10d | ||
159 | andl $63,%ecx | ||
160 | shrl $3,%ecx | ||
161 | jz .Lfold | ||
162 | clc | ||
163 | .p2align 4 | ||
164 | .Lloop_8: | ||
165 | source | ||
166 | movq (%rdi),%rbx | ||
167 | adcq %rbx,%rax | ||
168 | decl %ecx | ||
169 | dest | ||
170 | movq %rbx,(%rsi) | ||
171 | leaq 8(%rsi),%rsi /* preserve carry */ | ||
172 | leaq 8(%rdi),%rdi | ||
173 | jnz .Lloop_8 | ||
174 | adcq %r9,%rax /* add in carry */ | ||
175 | |||
176 | .Lfold: | ||
177 | /* reduce checksum to 32bits */ | ||
178 | movl %eax,%ebx | ||
179 | shrq $32,%rax | ||
180 | addl %ebx,%eax | ||
181 | adcl %r9d,%eax | ||
182 | |||
183 | /* do last upto 6 bytes */ | ||
184 | .Lhandle_7: | ||
185 | movl %r10d,%ecx | ||
186 | andl $7,%ecx | ||
187 | shrl $1,%ecx | ||
188 | jz .Lhandle_1 | ||
189 | movl $2,%edx | ||
190 | xorl %ebx,%ebx | ||
191 | clc | ||
192 | .p2align 4 | ||
193 | .Lloop_1: | ||
194 | source | ||
195 | movw (%rdi),%bx | ||
196 | adcl %ebx,%eax | ||
197 | decl %ecx | ||
198 | dest | ||
199 | movw %bx,(%rsi) | ||
200 | leaq 2(%rdi),%rdi | ||
201 | leaq 2(%rsi),%rsi | ||
202 | jnz .Lloop_1 | ||
203 | adcl %r9d,%eax /* add in carry */ | ||
204 | |||
205 | /* handle last odd byte */ | ||
206 | .Lhandle_1: | ||
207 | testl $1,%r10d | ||
208 | jz .Lende | ||
209 | xorl %ebx,%ebx | ||
210 | source | ||
211 | movb (%rdi),%bl | ||
212 | dest | ||
213 | movb %bl,(%rsi) | ||
214 | addl %ebx,%eax | ||
215 | adcl %r9d,%eax /* carry */ | ||
216 | |||
217 | CFI_REMEMBER_STATE | ||
218 | .Lende: | ||
219 | movq 2*8(%rsp),%rbx | ||
220 | CFI_RESTORE rbx | ||
221 | movq 3*8(%rsp),%r12 | ||
222 | CFI_RESTORE r12 | ||
223 | movq 4*8(%rsp),%r14 | ||
224 | CFI_RESTORE r14 | ||
225 | movq 5*8(%rsp),%r13 | ||
226 | CFI_RESTORE r13 | ||
227 | movq 6*8(%rsp),%rbp | ||
228 | CFI_RESTORE rbp | ||
229 | addq $7*8,%rsp | ||
230 | CFI_ADJUST_CFA_OFFSET -7*8 | ||
231 | ret | ||
232 | CFI_RESTORE_STATE | ||
233 | |||
234 | /* Exception handlers. Very simple, zeroing is done in the wrappers */ | ||
235 | .Lbad_source: | ||
236 | movq (%rsp),%rax | ||
237 | testq %rax,%rax | ||
238 | jz .Lende | ||
239 | movl $-EFAULT,(%rax) | ||
240 | jmp .Lende | ||
241 | |||
242 | .Lbad_dest: | ||
243 | movq 8(%rsp),%rax | ||
244 | testq %rax,%rax | ||
245 | jz .Lende | ||
246 | movl $-EFAULT,(%rax) | ||
247 | jmp .Lende | ||
248 | CFI_ENDPROC | ||
249 | ENDPROC(csum_partial_copy_generic) | ||
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c deleted file mode 100644 index bc503f506903..000000000000 --- a/arch/x86_64/lib/csum-partial.c +++ /dev/null | |||
@@ -1,150 +0,0 @@ | |||
1 | /* | ||
2 | * arch/x86_64/lib/csum-partial.c | ||
3 | * | ||
4 | * This file contains network checksum routines that are better done | ||
5 | * in an architecture-specific manner due to speed. | ||
6 | */ | ||
7 | |||
8 | #include <linux/compiler.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <asm/checksum.h> | ||
11 | |||
12 | static inline unsigned short from32to16(unsigned a) | ||
13 | { | ||
14 | unsigned short b = a >> 16; | ||
15 | asm("addw %w2,%w0\n\t" | ||
16 | "adcw $0,%w0\n" | ||
17 | : "=r" (b) | ||
18 | : "0" (b), "r" (a)); | ||
19 | return b; | ||
20 | } | ||
21 | |||
22 | /* | ||
23 | * Do a 64-bit checksum on an arbitrary memory area. | ||
24 | * Returns a 32bit checksum. | ||
25 | * | ||
26 | * This isn't as time critical as it used to be because many NICs | ||
27 | * do hardware checksumming these days. | ||
28 | * | ||
29 | * Things tried and found to not make it faster: | ||
30 | * Manual Prefetching | ||
31 | * Unrolling to an 128 bytes inner loop. | ||
32 | * Using interleaving with more registers to break the carry chains. | ||
33 | */ | ||
34 | static unsigned do_csum(const unsigned char *buff, unsigned len) | ||
35 | { | ||
36 | unsigned odd, count; | ||
37 | unsigned long result = 0; | ||
38 | |||
39 | if (unlikely(len == 0)) | ||
40 | return result; | ||
41 | odd = 1 & (unsigned long) buff; | ||
42 | if (unlikely(odd)) { | ||
43 | result = *buff << 8; | ||
44 | len--; | ||
45 | buff++; | ||
46 | } | ||
47 | count = len >> 1; /* nr of 16-bit words.. */ | ||
48 | if (count) { | ||
49 | if (2 & (unsigned long) buff) { | ||
50 | result += *(unsigned short *)buff; | ||
51 | count--; | ||
52 | len -= 2; | ||
53 | buff += 2; | ||
54 | } | ||
55 | count >>= 1; /* nr of 32-bit words.. */ | ||
56 | if (count) { | ||
57 | unsigned long zero; | ||
58 | unsigned count64; | ||
59 | if (4 & (unsigned long) buff) { | ||
60 | result += *(unsigned int *) buff; | ||
61 | count--; | ||
62 | len -= 4; | ||
63 | buff += 4; | ||
64 | } | ||
65 | count >>= 1; /* nr of 64-bit words.. */ | ||
66 | |||
67 | /* main loop using 64byte blocks */ | ||
68 | zero = 0; | ||
69 | count64 = count >> 3; | ||
70 | while (count64) { | ||
71 | asm("addq 0*8(%[src]),%[res]\n\t" | ||
72 | "adcq 1*8(%[src]),%[res]\n\t" | ||
73 | "adcq 2*8(%[src]),%[res]\n\t" | ||
74 | "adcq 3*8(%[src]),%[res]\n\t" | ||
75 | "adcq 4*8(%[src]),%[res]\n\t" | ||
76 | "adcq 5*8(%[src]),%[res]\n\t" | ||
77 | "adcq 6*8(%[src]),%[res]\n\t" | ||
78 | "adcq 7*8(%[src]),%[res]\n\t" | ||
79 | "adcq %[zero],%[res]" | ||
80 | : [res] "=r" (result) | ||
81 | : [src] "r" (buff), [zero] "r" (zero), | ||
82 | "[res]" (result)); | ||
83 | buff += 64; | ||
84 | count64--; | ||
85 | } | ||
86 | |||
87 | /* last upto 7 8byte blocks */ | ||
88 | count %= 8; | ||
89 | while (count) { | ||
90 | asm("addq %1,%0\n\t" | ||
91 | "adcq %2,%0\n" | ||
92 | : "=r" (result) | ||
93 | : "m" (*(unsigned long *)buff), | ||
94 | "r" (zero), "0" (result)); | ||
95 | --count; | ||
96 | buff += 8; | ||
97 | } | ||
98 | result = add32_with_carry(result>>32, | ||
99 | result&0xffffffff); | ||
100 | |||
101 | if (len & 4) { | ||
102 | result += *(unsigned int *) buff; | ||
103 | buff += 4; | ||
104 | } | ||
105 | } | ||
106 | if (len & 2) { | ||
107 | result += *(unsigned short *) buff; | ||
108 | buff += 2; | ||
109 | } | ||
110 | } | ||
111 | if (len & 1) | ||
112 | result += *buff; | ||
113 | result = add32_with_carry(result>>32, result & 0xffffffff); | ||
114 | if (unlikely(odd)) { | ||
115 | result = from32to16(result); | ||
116 | result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); | ||
117 | } | ||
118 | return result; | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * computes the checksum of a memory block at buff, length len, | ||
123 | * and adds in "sum" (32-bit) | ||
124 | * | ||
125 | * returns a 32-bit number suitable for feeding into itself | ||
126 | * or csum_tcpudp_magic | ||
127 | * | ||
128 | * this function must be called with even lengths, except | ||
129 | * for the last fragment, which may be odd | ||
130 | * | ||
131 | * it's best to have buff aligned on a 64-bit boundary | ||
132 | */ | ||
133 | __wsum csum_partial(const void *buff, int len, __wsum sum) | ||
134 | { | ||
135 | return (__force __wsum)add32_with_carry(do_csum(buff, len), | ||
136 | (__force u32)sum); | ||
137 | } | ||
138 | |||
139 | EXPORT_SYMBOL(csum_partial); | ||
140 | |||
141 | /* | ||
142 | * this routine is used for miscellaneous IP-like checksums, mainly | ||
143 | * in icmp.c | ||
144 | */ | ||
145 | __sum16 ip_compute_csum(const void *buff, int len) | ||
146 | { | ||
147 | return csum_fold(csum_partial(buff,len,0)); | ||
148 | } | ||
149 | EXPORT_SYMBOL(ip_compute_csum); | ||
150 | |||
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c deleted file mode 100644 index fd42a4a095fc..000000000000 --- a/arch/x86_64/lib/csum-wrappers.c +++ /dev/null | |||
@@ -1,135 +0,0 @@ | |||
1 | /* Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
2 | * Subject to the GNU Public License v.2 | ||
3 | * | ||
4 | * Wrappers of assembly checksum functions for x86-64. | ||
5 | */ | ||
6 | |||
7 | #include <asm/checksum.h> | ||
8 | #include <linux/module.h> | ||
9 | |||
10 | /** | ||
11 | * csum_partial_copy_from_user - Copy and checksum from user space. | ||
12 | * @src: source address (user space) | ||
13 | * @dst: destination address | ||
14 | * @len: number of bytes to be copied. | ||
15 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
16 | * @errp: set to -EFAULT for an bad source address. | ||
17 | * | ||
18 | * Returns an 32bit unfolded checksum of the buffer. | ||
19 | * src and dst are best aligned to 64bits. | ||
20 | */ | ||
21 | __wsum | ||
22 | csum_partial_copy_from_user(const void __user *src, void *dst, | ||
23 | int len, __wsum isum, int *errp) | ||
24 | { | ||
25 | might_sleep(); | ||
26 | *errp = 0; | ||
27 | if (likely(access_ok(VERIFY_READ,src, len))) { | ||
28 | /* Why 6, not 7? To handle odd addresses aligned we | ||
29 | would need to do considerable complications to fix the | ||
30 | checksum which is defined as an 16bit accumulator. The | ||
31 | fix alignment code is primarily for performance | ||
32 | compatibility with 32bit and that will handle odd | ||
33 | addresses slowly too. */ | ||
34 | if (unlikely((unsigned long)src & 6)) { | ||
35 | while (((unsigned long)src & 6) && len >= 2) { | ||
36 | __u16 val16; | ||
37 | *errp = __get_user(val16, (const __u16 __user *)src); | ||
38 | if (*errp) | ||
39 | return isum; | ||
40 | *(__u16 *)dst = val16; | ||
41 | isum = (__force __wsum)add32_with_carry( | ||
42 | (__force unsigned)isum, val16); | ||
43 | src += 2; | ||
44 | dst += 2; | ||
45 | len -= 2; | ||
46 | } | ||
47 | } | ||
48 | isum = csum_partial_copy_generic((__force const void *)src, | ||
49 | dst, len, isum, errp, NULL); | ||
50 | if (likely(*errp == 0)) | ||
51 | return isum; | ||
52 | } | ||
53 | *errp = -EFAULT; | ||
54 | memset(dst,0,len); | ||
55 | return isum; | ||
56 | } | ||
57 | |||
58 | EXPORT_SYMBOL(csum_partial_copy_from_user); | ||
59 | |||
60 | /** | ||
61 | * csum_partial_copy_to_user - Copy and checksum to user space. | ||
62 | * @src: source address | ||
63 | * @dst: destination address (user space) | ||
64 | * @len: number of bytes to be copied. | ||
65 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
66 | * @errp: set to -EFAULT for an bad destination address. | ||
67 | * | ||
68 | * Returns an 32bit unfolded checksum of the buffer. | ||
69 | * src and dst are best aligned to 64bits. | ||
70 | */ | ||
71 | __wsum | ||
72 | csum_partial_copy_to_user(const void *src, void __user *dst, | ||
73 | int len, __wsum isum, int *errp) | ||
74 | { | ||
75 | might_sleep(); | ||
76 | if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { | ||
77 | *errp = -EFAULT; | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | if (unlikely((unsigned long)dst & 6)) { | ||
82 | while (((unsigned long)dst & 6) && len >= 2) { | ||
83 | __u16 val16 = *(__u16 *)src; | ||
84 | isum = (__force __wsum)add32_with_carry( | ||
85 | (__force unsigned)isum, val16); | ||
86 | *errp = __put_user(val16, (__u16 __user *)dst); | ||
87 | if (*errp) | ||
88 | return isum; | ||
89 | src += 2; | ||
90 | dst += 2; | ||
91 | len -= 2; | ||
92 | } | ||
93 | } | ||
94 | |||
95 | *errp = 0; | ||
96 | return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); | ||
97 | } | ||
98 | |||
99 | EXPORT_SYMBOL(csum_partial_copy_to_user); | ||
100 | |||
101 | /** | ||
102 | * csum_partial_copy_nocheck - Copy and checksum. | ||
103 | * @src: source address | ||
104 | * @dst: destination address | ||
105 | * @len: number of bytes to be copied. | ||
106 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
107 | * | ||
108 | * Returns an 32bit unfolded checksum of the buffer. | ||
109 | */ | ||
110 | __wsum | ||
111 | csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) | ||
112 | { | ||
113 | return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); | ||
114 | } | ||
115 | EXPORT_SYMBOL(csum_partial_copy_nocheck); | ||
116 | |||
117 | __sum16 csum_ipv6_magic(const struct in6_addr *saddr, | ||
118 | const struct in6_addr *daddr, | ||
119 | __u32 len, unsigned short proto, __wsum sum) | ||
120 | { | ||
121 | __u64 rest, sum64; | ||
122 | |||
123 | rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) + | ||
124 | (__force __u64)sum; | ||
125 | asm(" addq (%[saddr]),%[sum]\n" | ||
126 | " adcq 8(%[saddr]),%[sum]\n" | ||
127 | " adcq (%[daddr]),%[sum]\n" | ||
128 | " adcq 8(%[daddr]),%[sum]\n" | ||
129 | " adcq $0,%[sum]\n" | ||
130 | : [sum] "=r" (sum64) | ||
131 | : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); | ||
132 | return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); | ||
133 | } | ||
134 | |||
135 | EXPORT_SYMBOL(csum_ipv6_magic); | ||
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c deleted file mode 100644 index 2dbebd308347..000000000000 --- a/arch/x86_64/lib/delay.c +++ /dev/null | |||
@@ -1,57 +0,0 @@ | |||
1 | /* | ||
2 | * Precise Delay Loops for x86-64 | ||
3 | * | ||
4 | * Copyright (C) 1993 Linus Torvalds | ||
5 | * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> | ||
6 | * | ||
7 | * The __delay function must _NOT_ be inlined as its execution time | ||
8 | * depends wildly on alignment on many x86 processors. | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/delay.h> | ||
14 | #include <asm/delay.h> | ||
15 | #include <asm/msr.h> | ||
16 | |||
17 | #ifdef CONFIG_SMP | ||
18 | #include <asm/smp.h> | ||
19 | #endif | ||
20 | |||
21 | int read_current_timer(unsigned long *timer_value) | ||
22 | { | ||
23 | rdtscll(*timer_value); | ||
24 | return 0; | ||
25 | } | ||
26 | |||
27 | void __delay(unsigned long loops) | ||
28 | { | ||
29 | unsigned bclock, now; | ||
30 | |||
31 | rdtscl(bclock); | ||
32 | do | ||
33 | { | ||
34 | rep_nop(); | ||
35 | rdtscl(now); | ||
36 | } | ||
37 | while((now-bclock) < loops); | ||
38 | } | ||
39 | EXPORT_SYMBOL(__delay); | ||
40 | |||
41 | inline void __const_udelay(unsigned long xloops) | ||
42 | { | ||
43 | __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); | ||
44 | } | ||
45 | EXPORT_SYMBOL(__const_udelay); | ||
46 | |||
47 | void __udelay(unsigned long usecs) | ||
48 | { | ||
49 | __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ | ||
50 | } | ||
51 | EXPORT_SYMBOL(__udelay); | ||
52 | |||
53 | void __ndelay(unsigned long nsecs) | ||
54 | { | ||
55 | __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ | ||
56 | } | ||
57 | EXPORT_SYMBOL(__ndelay); | ||
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S deleted file mode 100644 index 5448876261f8..000000000000 --- a/arch/x86_64/lib/getuser.S +++ /dev/null | |||
@@ -1,109 +0,0 @@ | |||
1 | /* | ||
2 | * __get_user functions. | ||
3 | * | ||
4 | * (C) Copyright 1998 Linus Torvalds | ||
5 | * (C) Copyright 2005 Andi Kleen | ||
6 | * | ||
7 | * These functions have a non-standard call interface | ||
8 | * to make them more efficient, especially as they | ||
9 | * return an error value in addition to the "real" | ||
10 | * return value. | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * __get_user_X | ||
15 | * | ||
16 | * Inputs: %rcx contains the address. | ||
17 | * The register is modified, but all changes are undone | ||
18 | * before returning because the C code doesn't know about it. | ||
19 | * | ||
20 | * Outputs: %rax is error code (0 or -EFAULT) | ||
21 | * %rdx contains zero-extended value | ||
22 | * | ||
23 | * %r8 is destroyed. | ||
24 | * | ||
25 | * These functions should not modify any other registers, | ||
26 | * as they get called from within inline assembly. | ||
27 | */ | ||
28 | |||
29 | #include <linux/linkage.h> | ||
30 | #include <asm/dwarf2.h> | ||
31 | #include <asm/page.h> | ||
32 | #include <asm/errno.h> | ||
33 | #include <asm/asm-offsets.h> | ||
34 | #include <asm/thread_info.h> | ||
35 | |||
36 | .text | ||
37 | ENTRY(__get_user_1) | ||
38 | CFI_STARTPROC | ||
39 | GET_THREAD_INFO(%r8) | ||
40 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
41 | jae bad_get_user | ||
42 | 1: movzb (%rcx),%edx | ||
43 | xorl %eax,%eax | ||
44 | ret | ||
45 | CFI_ENDPROC | ||
46 | ENDPROC(__get_user_1) | ||
47 | |||
48 | ENTRY(__get_user_2) | ||
49 | CFI_STARTPROC | ||
50 | GET_THREAD_INFO(%r8) | ||
51 | addq $1,%rcx | ||
52 | jc 20f | ||
53 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
54 | jae 20f | ||
55 | decq %rcx | ||
56 | 2: movzwl (%rcx),%edx | ||
57 | xorl %eax,%eax | ||
58 | ret | ||
59 | 20: decq %rcx | ||
60 | jmp bad_get_user | ||
61 | CFI_ENDPROC | ||
62 | ENDPROC(__get_user_2) | ||
63 | |||
64 | ENTRY(__get_user_4) | ||
65 | CFI_STARTPROC | ||
66 | GET_THREAD_INFO(%r8) | ||
67 | addq $3,%rcx | ||
68 | jc 30f | ||
69 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
70 | jae 30f | ||
71 | subq $3,%rcx | ||
72 | 3: movl (%rcx),%edx | ||
73 | xorl %eax,%eax | ||
74 | ret | ||
75 | 30: subq $3,%rcx | ||
76 | jmp bad_get_user | ||
77 | CFI_ENDPROC | ||
78 | ENDPROC(__get_user_4) | ||
79 | |||
80 | ENTRY(__get_user_8) | ||
81 | CFI_STARTPROC | ||
82 | GET_THREAD_INFO(%r8) | ||
83 | addq $7,%rcx | ||
84 | jc 40f | ||
85 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
86 | jae 40f | ||
87 | subq $7,%rcx | ||
88 | 4: movq (%rcx),%rdx | ||
89 | xorl %eax,%eax | ||
90 | ret | ||
91 | 40: subq $7,%rcx | ||
92 | jmp bad_get_user | ||
93 | CFI_ENDPROC | ||
94 | ENDPROC(__get_user_8) | ||
95 | |||
96 | bad_get_user: | ||
97 | CFI_STARTPROC | ||
98 | xorl %edx,%edx | ||
99 | movq $(-EFAULT),%rax | ||
100 | ret | ||
101 | CFI_ENDPROC | ||
102 | END(bad_get_user) | ||
103 | |||
104 | .section __ex_table,"a" | ||
105 | .quad 1b,bad_get_user | ||
106 | .quad 2b,bad_get_user | ||
107 | .quad 3b,bad_get_user | ||
108 | .quad 4b,bad_get_user | ||
109 | .previous | ||
diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c deleted file mode 100644 index 87b4a4e18039..000000000000 --- a/arch/x86_64/lib/io.c +++ /dev/null | |||
@@ -1,23 +0,0 @@ | |||
1 | #include <linux/string.h> | ||
2 | #include <asm/io.h> | ||
3 | #include <linux/module.h> | ||
4 | |||
5 | void __memcpy_toio(unsigned long dst,const void*src,unsigned len) | ||
6 | { | ||
7 | __inline_memcpy((void *) dst,src,len); | ||
8 | } | ||
9 | EXPORT_SYMBOL(__memcpy_toio); | ||
10 | |||
11 | void __memcpy_fromio(void *dst,unsigned long src,unsigned len) | ||
12 | { | ||
13 | __inline_memcpy(dst,(const void *) src,len); | ||
14 | } | ||
15 | EXPORT_SYMBOL(__memcpy_fromio); | ||
16 | |||
17 | void memset_io(volatile void __iomem *a, int b, size_t c) | ||
18 | { | ||
19 | /* XXX: memset can mangle the IO patterns quite a bit. | ||
20 | perhaps it would be better to use a dumb one */ | ||
21 | memset((void *)a,b,c); | ||
22 | } | ||
23 | EXPORT_SYMBOL(memset_io); | ||
diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S deleted file mode 100644 index 05a95e713da8..000000000000 --- a/arch/x86_64/lib/iomap_copy.S +++ /dev/null | |||
@@ -1,30 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2006 PathScale, Inc. All Rights Reserved. | ||
3 | * | ||
4 | * This file is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of version 2 of the GNU General Public License | ||
6 | * as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software Foundation, | ||
15 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | ||
16 | */ | ||
17 | |||
18 | #include <linux/linkage.h> | ||
19 | #include <asm/dwarf2.h> | ||
20 | |||
21 | /* | ||
22 | * override generic version in lib/iomap_copy.c | ||
23 | */ | ||
24 | ENTRY(__iowrite32_copy) | ||
25 | CFI_STARTPROC | ||
26 | movl %edx,%ecx | ||
27 | rep movsd | ||
28 | ret | ||
29 | CFI_ENDPROC | ||
30 | ENDPROC(__iowrite32_copy) | ||
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S deleted file mode 100644 index c22981fa2f3a..000000000000 --- a/arch/x86_64/lib/memcpy.S +++ /dev/null | |||
@@ -1,131 +0,0 @@ | |||
1 | /* Copyright 2002 Andi Kleen */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <asm/dwarf2.h> | ||
5 | #include <asm/cpufeature.h> | ||
6 | |||
7 | /* | ||
8 | * memcpy - Copy a memory block. | ||
9 | * | ||
10 | * Input: | ||
11 | * rdi destination | ||
12 | * rsi source | ||
13 | * rdx count | ||
14 | * | ||
15 | * Output: | ||
16 | * rax original destination | ||
17 | */ | ||
18 | |||
19 | ALIGN | ||
20 | memcpy_c: | ||
21 | CFI_STARTPROC | ||
22 | movq %rdi,%rax | ||
23 | movl %edx,%ecx | ||
24 | shrl $3,%ecx | ||
25 | andl $7,%edx | ||
26 | rep movsq | ||
27 | movl %edx,%ecx | ||
28 | rep movsb | ||
29 | ret | ||
30 | CFI_ENDPROC | ||
31 | ENDPROC(memcpy_c) | ||
32 | |||
33 | ENTRY(__memcpy) | ||
34 | ENTRY(memcpy) | ||
35 | CFI_STARTPROC | ||
36 | pushq %rbx | ||
37 | CFI_ADJUST_CFA_OFFSET 8 | ||
38 | CFI_REL_OFFSET rbx, 0 | ||
39 | movq %rdi,%rax | ||
40 | |||
41 | movl %edx,%ecx | ||
42 | shrl $6,%ecx | ||
43 | jz .Lhandle_tail | ||
44 | |||
45 | .p2align 4 | ||
46 | .Lloop_64: | ||
47 | decl %ecx | ||
48 | |||
49 | movq (%rsi),%r11 | ||
50 | movq 8(%rsi),%r8 | ||
51 | |||
52 | movq %r11,(%rdi) | ||
53 | movq %r8,1*8(%rdi) | ||
54 | |||
55 | movq 2*8(%rsi),%r9 | ||
56 | movq 3*8(%rsi),%r10 | ||
57 | |||
58 | movq %r9,2*8(%rdi) | ||
59 | movq %r10,3*8(%rdi) | ||
60 | |||
61 | movq 4*8(%rsi),%r11 | ||
62 | movq 5*8(%rsi),%r8 | ||
63 | |||
64 | movq %r11,4*8(%rdi) | ||
65 | movq %r8,5*8(%rdi) | ||
66 | |||
67 | movq 6*8(%rsi),%r9 | ||
68 | movq 7*8(%rsi),%r10 | ||
69 | |||
70 | movq %r9,6*8(%rdi) | ||
71 | movq %r10,7*8(%rdi) | ||
72 | |||
73 | leaq 64(%rsi),%rsi | ||
74 | leaq 64(%rdi),%rdi | ||
75 | jnz .Lloop_64 | ||
76 | |||
77 | .Lhandle_tail: | ||
78 | movl %edx,%ecx | ||
79 | andl $63,%ecx | ||
80 | shrl $3,%ecx | ||
81 | jz .Lhandle_7 | ||
82 | .p2align 4 | ||
83 | .Lloop_8: | ||
84 | decl %ecx | ||
85 | movq (%rsi),%r8 | ||
86 | movq %r8,(%rdi) | ||
87 | leaq 8(%rdi),%rdi | ||
88 | leaq 8(%rsi),%rsi | ||
89 | jnz .Lloop_8 | ||
90 | |||
91 | .Lhandle_7: | ||
92 | movl %edx,%ecx | ||
93 | andl $7,%ecx | ||
94 | jz .Lende | ||
95 | .p2align 4 | ||
96 | .Lloop_1: | ||
97 | movb (%rsi),%r8b | ||
98 | movb %r8b,(%rdi) | ||
99 | incq %rdi | ||
100 | incq %rsi | ||
101 | decl %ecx | ||
102 | jnz .Lloop_1 | ||
103 | |||
104 | .Lende: | ||
105 | popq %rbx | ||
106 | CFI_ADJUST_CFA_OFFSET -8 | ||
107 | CFI_RESTORE rbx | ||
108 | ret | ||
109 | .Lfinal: | ||
110 | CFI_ENDPROC | ||
111 | ENDPROC(memcpy) | ||
112 | ENDPROC(__memcpy) | ||
113 | |||
114 | /* Some CPUs run faster using the string copy instructions. | ||
115 | It is also a lot simpler. Use this when possible */ | ||
116 | |||
117 | .section .altinstr_replacement,"ax" | ||
118 | 1: .byte 0xeb /* jmp <disp8> */ | ||
119 | .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ | ||
120 | 2: | ||
121 | .previous | ||
122 | .section .altinstructions,"a" | ||
123 | .align 8 | ||
124 | .quad memcpy | ||
125 | .quad 1b | ||
126 | .byte X86_FEATURE_REP_GOOD | ||
127 | /* Replace only beginning, memcpy is used to apply alternatives, so it | ||
128 | * is silly to overwrite itself with nops - reboot is only outcome... */ | ||
129 | .byte 2b - 1b | ||
130 | .byte 2b - 1b | ||
131 | .previous | ||
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c deleted file mode 100644 index 751ebae8ec42..000000000000 --- a/arch/x86_64/lib/memmove.c +++ /dev/null | |||
@@ -1,21 +0,0 @@ | |||
1 | /* Normally compiler builtins are used, but sometimes the compiler calls out | ||
2 | of line code. Based on asm-i386/string.h. | ||
3 | */ | ||
4 | #define _STRING_C | ||
5 | #include <linux/string.h> | ||
6 | #include <linux/module.h> | ||
7 | |||
8 | #undef memmove | ||
9 | void *memmove(void * dest,const void *src,size_t count) | ||
10 | { | ||
11 | if (dest < src) { | ||
12 | return memcpy(dest,src,count); | ||
13 | } else { | ||
14 | char *p = (char *) dest + count; | ||
15 | char *s = (char *) src + count; | ||
16 | while (count--) | ||
17 | *--p = *--s; | ||
18 | } | ||
19 | return dest; | ||
20 | } | ||
21 | EXPORT_SYMBOL(memmove); | ||
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S deleted file mode 100644 index 2c5948116bd2..000000000000 --- a/arch/x86_64/lib/memset.S +++ /dev/null | |||
@@ -1,133 +0,0 @@ | |||
1 | /* Copyright 2002 Andi Kleen, SuSE Labs */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <asm/dwarf2.h> | ||
5 | |||
6 | /* | ||
7 | * ISO C memset - set a memory block to a byte value. | ||
8 | * | ||
9 | * rdi destination | ||
10 | * rsi value (char) | ||
11 | * rdx count (bytes) | ||
12 | * | ||
13 | * rax original destination | ||
14 | */ | ||
15 | ALIGN | ||
16 | memset_c: | ||
17 | CFI_STARTPROC | ||
18 | movq %rdi,%r9 | ||
19 | movl %edx,%r8d | ||
20 | andl $7,%r8d | ||
21 | movl %edx,%ecx | ||
22 | shrl $3,%ecx | ||
23 | /* expand byte value */ | ||
24 | movzbl %sil,%esi | ||
25 | movabs $0x0101010101010101,%rax | ||
26 | mulq %rsi /* with rax, clobbers rdx */ | ||
27 | rep stosq | ||
28 | movl %r8d,%ecx | ||
29 | rep stosb | ||
30 | movq %r9,%rax | ||
31 | ret | ||
32 | CFI_ENDPROC | ||
33 | ENDPROC(memset_c) | ||
34 | |||
35 | ENTRY(memset) | ||
36 | ENTRY(__memset) | ||
37 | CFI_STARTPROC | ||
38 | movq %rdi,%r10 | ||
39 | movq %rdx,%r11 | ||
40 | |||
41 | /* expand byte value */ | ||
42 | movzbl %sil,%ecx | ||
43 | movabs $0x0101010101010101,%rax | ||
44 | mul %rcx /* with rax, clobbers rdx */ | ||
45 | |||
46 | /* align dst */ | ||
47 | movl %edi,%r9d | ||
48 | andl $7,%r9d | ||
49 | jnz .Lbad_alignment | ||
50 | CFI_REMEMBER_STATE | ||
51 | .Lafter_bad_alignment: | ||
52 | |||
53 | movl %r11d,%ecx | ||
54 | shrl $6,%ecx | ||
55 | jz .Lhandle_tail | ||
56 | |||
57 | .p2align 4 | ||
58 | .Lloop_64: | ||
59 | decl %ecx | ||
60 | movq %rax,(%rdi) | ||
61 | movq %rax,8(%rdi) | ||
62 | movq %rax,16(%rdi) | ||
63 | movq %rax,24(%rdi) | ||
64 | movq %rax,32(%rdi) | ||
65 | movq %rax,40(%rdi) | ||
66 | movq %rax,48(%rdi) | ||
67 | movq %rax,56(%rdi) | ||
68 | leaq 64(%rdi),%rdi | ||
69 | jnz .Lloop_64 | ||
70 | |||
71 | /* Handle tail in loops. The loops should be faster than hard | ||
72 | to predict jump tables. */ | ||
73 | .p2align 4 | ||
74 | .Lhandle_tail: | ||
75 | movl %r11d,%ecx | ||
76 | andl $63&(~7),%ecx | ||
77 | jz .Lhandle_7 | ||
78 | shrl $3,%ecx | ||
79 | .p2align 4 | ||
80 | .Lloop_8: | ||
81 | decl %ecx | ||
82 | movq %rax,(%rdi) | ||
83 | leaq 8(%rdi),%rdi | ||
84 | jnz .Lloop_8 | ||
85 | |||
86 | .Lhandle_7: | ||
87 | movl %r11d,%ecx | ||
88 | andl $7,%ecx | ||
89 | jz .Lende | ||
90 | .p2align 4 | ||
91 | .Lloop_1: | ||
92 | decl %ecx | ||
93 | movb %al,(%rdi) | ||
94 | leaq 1(%rdi),%rdi | ||
95 | jnz .Lloop_1 | ||
96 | |||
97 | .Lende: | ||
98 | movq %r10,%rax | ||
99 | ret | ||
100 | |||
101 | CFI_RESTORE_STATE | ||
102 | .Lbad_alignment: | ||
103 | cmpq $7,%r11 | ||
104 | jbe .Lhandle_7 | ||
105 | movq %rax,(%rdi) /* unaligned store */ | ||
106 | movq $8,%r8 | ||
107 | subq %r9,%r8 | ||
108 | addq %r8,%rdi | ||
109 | subq %r8,%r11 | ||
110 | jmp .Lafter_bad_alignment | ||
111 | .Lfinal: | ||
112 | CFI_ENDPROC | ||
113 | ENDPROC(memset) | ||
114 | ENDPROC(__memset) | ||
115 | |||
116 | /* Some CPUs run faster using the string instructions. | ||
117 | It is also a lot simpler. Use this when possible */ | ||
118 | |||
119 | #include <asm/cpufeature.h> | ||
120 | |||
121 | .section .altinstr_replacement,"ax" | ||
122 | 1: .byte 0xeb /* jmp <disp8> */ | ||
123 | .byte (memset_c - memset) - (2f - 1b) /* offset */ | ||
124 | 2: | ||
125 | .previous | ||
126 | .section .altinstructions,"a" | ||
127 | .align 8 | ||
128 | .quad memset | ||
129 | .quad 1b | ||
130 | .byte X86_FEATURE_REP_GOOD | ||
131 | .byte .Lfinal - memset | ||
132 | .byte 2b - 1b | ||
133 | .previous | ||
diff --git a/arch/x86_64/lib/msr-on-cpu.c b/arch/x86_64/lib/msr-on-cpu.c deleted file mode 100644 index 47e0ec47c376..000000000000 --- a/arch/x86_64/lib/msr-on-cpu.c +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | #include "../../i386/lib/msr-on-cpu.c" | ||
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S deleted file mode 100644 index 4989f5a8fa9b..000000000000 --- a/arch/x86_64/lib/putuser.S +++ /dev/null | |||
@@ -1,106 +0,0 @@ | |||
1 | /* | ||
2 | * __put_user functions. | ||
3 | * | ||
4 | * (C) Copyright 1998 Linus Torvalds | ||
5 | * (C) Copyright 2005 Andi Kleen | ||
6 | * | ||
7 | * These functions have a non-standard call interface | ||
8 | * to make them more efficient, especially as they | ||
9 | * return an error value in addition to the "real" | ||
10 | * return value. | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * __put_user_X | ||
15 | * | ||
16 | * Inputs: %rcx contains the address | ||
17 | * %rdx contains new value | ||
18 | * | ||
19 | * Outputs: %rax is error code (0 or -EFAULT) | ||
20 | * | ||
21 | * %r8 is destroyed. | ||
22 | * | ||
23 | * These functions should not modify any other registers, | ||
24 | * as they get called from within inline assembly. | ||
25 | */ | ||
26 | |||
27 | #include <linux/linkage.h> | ||
28 | #include <asm/dwarf2.h> | ||
29 | #include <asm/page.h> | ||
30 | #include <asm/errno.h> | ||
31 | #include <asm/asm-offsets.h> | ||
32 | #include <asm/thread_info.h> | ||
33 | |||
34 | .text | ||
35 | ENTRY(__put_user_1) | ||
36 | CFI_STARTPROC | ||
37 | GET_THREAD_INFO(%r8) | ||
38 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
39 | jae bad_put_user | ||
40 | 1: movb %dl,(%rcx) | ||
41 | xorl %eax,%eax | ||
42 | ret | ||
43 | CFI_ENDPROC | ||
44 | ENDPROC(__put_user_1) | ||
45 | |||
46 | ENTRY(__put_user_2) | ||
47 | CFI_STARTPROC | ||
48 | GET_THREAD_INFO(%r8) | ||
49 | addq $1,%rcx | ||
50 | jc 20f | ||
51 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
52 | jae 20f | ||
53 | decq %rcx | ||
54 | 2: movw %dx,(%rcx) | ||
55 | xorl %eax,%eax | ||
56 | ret | ||
57 | 20: decq %rcx | ||
58 | jmp bad_put_user | ||
59 | CFI_ENDPROC | ||
60 | ENDPROC(__put_user_2) | ||
61 | |||
62 | ENTRY(__put_user_4) | ||
63 | CFI_STARTPROC | ||
64 | GET_THREAD_INFO(%r8) | ||
65 | addq $3,%rcx | ||
66 | jc 30f | ||
67 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
68 | jae 30f | ||
69 | subq $3,%rcx | ||
70 | 3: movl %edx,(%rcx) | ||
71 | xorl %eax,%eax | ||
72 | ret | ||
73 | 30: subq $3,%rcx | ||
74 | jmp bad_put_user | ||
75 | CFI_ENDPROC | ||
76 | ENDPROC(__put_user_4) | ||
77 | |||
78 | ENTRY(__put_user_8) | ||
79 | CFI_STARTPROC | ||
80 | GET_THREAD_INFO(%r8) | ||
81 | addq $7,%rcx | ||
82 | jc 40f | ||
83 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
84 | jae 40f | ||
85 | subq $7,%rcx | ||
86 | 4: movq %rdx,(%rcx) | ||
87 | xorl %eax,%eax | ||
88 | ret | ||
89 | 40: subq $7,%rcx | ||
90 | jmp bad_put_user | ||
91 | CFI_ENDPROC | ||
92 | ENDPROC(__put_user_8) | ||
93 | |||
94 | bad_put_user: | ||
95 | CFI_STARTPROC | ||
96 | movq $(-EFAULT),%rax | ||
97 | ret | ||
98 | CFI_ENDPROC | ||
99 | END(bad_put_user) | ||
100 | |||
101 | .section __ex_table,"a" | ||
102 | .quad 1b,bad_put_user | ||
103 | .quad 2b,bad_put_user | ||
104 | .quad 3b,bad_put_user | ||
105 | .quad 4b,bad_put_user | ||
106 | .previous | ||
diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S deleted file mode 100644 index 0cde1f807314..000000000000 --- a/arch/x86_64/lib/rwlock.S +++ /dev/null | |||
@@ -1,38 +0,0 @@ | |||
1 | /* Slow paths of read/write spinlocks. */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <asm/rwlock.h> | ||
5 | #include <asm/alternative-asm.i> | ||
6 | #include <asm/dwarf2.h> | ||
7 | |||
8 | /* rdi: pointer to rwlock_t */ | ||
9 | ENTRY(__write_lock_failed) | ||
10 | CFI_STARTPROC | ||
11 | LOCK_PREFIX | ||
12 | addl $RW_LOCK_BIAS,(%rdi) | ||
13 | 1: rep | ||
14 | nop | ||
15 | cmpl $RW_LOCK_BIAS,(%rdi) | ||
16 | jne 1b | ||
17 | LOCK_PREFIX | ||
18 | subl $RW_LOCK_BIAS,(%rdi) | ||
19 | jnz __write_lock_failed | ||
20 | ret | ||
21 | CFI_ENDPROC | ||
22 | END(__write_lock_failed) | ||
23 | |||
24 | /* rdi: pointer to rwlock_t */ | ||
25 | ENTRY(__read_lock_failed) | ||
26 | CFI_STARTPROC | ||
27 | LOCK_PREFIX | ||
28 | incl (%rdi) | ||
29 | 1: rep | ||
30 | nop | ||
31 | cmpl $1,(%rdi) | ||
32 | js 1b | ||
33 | LOCK_PREFIX | ||
34 | decl (%rdi) | ||
35 | js __read_lock_failed | ||
36 | ret | ||
37 | CFI_ENDPROC | ||
38 | END(__read_lock_failed) | ||
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S deleted file mode 100644 index 55e586d352d3..000000000000 --- a/arch/x86_64/lib/thunk.S +++ /dev/null | |||
@@ -1,67 +0,0 @@ | |||
1 | /* | ||
2 | * Save registers before calling assembly functions. This avoids | ||
3 | * disturbance of register allocation in some inline assembly constructs. | ||
4 | * Copyright 2001,2002 by Andi Kleen, SuSE Labs. | ||
5 | * Subject to the GNU public license, v.2. No warranty of any kind. | ||
6 | */ | ||
7 | |||
8 | #include <linux/linkage.h> | ||
9 | #include <asm/dwarf2.h> | ||
10 | #include <asm/calling.h> | ||
11 | #include <asm/rwlock.h> | ||
12 | |||
13 | /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ | ||
14 | .macro thunk name,func | ||
15 | .globl \name | ||
16 | \name: | ||
17 | CFI_STARTPROC | ||
18 | SAVE_ARGS | ||
19 | call \func | ||
20 | jmp restore | ||
21 | CFI_ENDPROC | ||
22 | .endm | ||
23 | |||
24 | /* rdi: arg1 ... normal C conventions. rax is passed from C. */ | ||
25 | .macro thunk_retrax name,func | ||
26 | .globl \name | ||
27 | \name: | ||
28 | CFI_STARTPROC | ||
29 | SAVE_ARGS | ||
30 | call \func | ||
31 | jmp restore_norax | ||
32 | CFI_ENDPROC | ||
33 | .endm | ||
34 | |||
35 | |||
36 | .section .sched.text | ||
37 | #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM | ||
38 | thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed | ||
39 | thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed | ||
40 | thunk rwsem_wake_thunk,rwsem_wake | ||
41 | thunk rwsem_downgrade_thunk,rwsem_downgrade_wake | ||
42 | #endif | ||
43 | |||
44 | thunk __down_failed,__down | ||
45 | thunk_retrax __down_failed_interruptible,__down_interruptible | ||
46 | thunk_retrax __down_failed_trylock,__down_trylock | ||
47 | thunk __up_wakeup,__up | ||
48 | |||
49 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
50 | thunk trace_hardirqs_on_thunk,trace_hardirqs_on | ||
51 | thunk trace_hardirqs_off_thunk,trace_hardirqs_off | ||
52 | #endif | ||
53 | |||
54 | /* SAVE_ARGS below is used only for the .cfi directives it contains. */ | ||
55 | CFI_STARTPROC | ||
56 | SAVE_ARGS | ||
57 | restore: | ||
58 | RESTORE_ARGS | ||
59 | ret | ||
60 | CFI_ENDPROC | ||
61 | |||
62 | CFI_STARTPROC | ||
63 | SAVE_ARGS | ||
64 | restore_norax: | ||
65 | RESTORE_ARGS 1 | ||
66 | ret | ||
67 | CFI_ENDPROC | ||
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c deleted file mode 100644 index 893d43f838cc..000000000000 --- a/arch/x86_64/lib/usercopy.c +++ /dev/null | |||
@@ -1,166 +0,0 @@ | |||
1 | /* | ||
2 | * User address space access functions. | ||
3 | * | ||
4 | * Copyright 1997 Andi Kleen <ak@muc.de> | ||
5 | * Copyright 1997 Linus Torvalds | ||
6 | * Copyright 2002 Andi Kleen <ak@suse.de> | ||
7 | */ | ||
8 | #include <linux/module.h> | ||
9 | #include <asm/uaccess.h> | ||
10 | |||
11 | /* | ||
12 | * Copy a null terminated string from userspace. | ||
13 | */ | ||
14 | |||
15 | #define __do_strncpy_from_user(dst,src,count,res) \ | ||
16 | do { \ | ||
17 | long __d0, __d1, __d2; \ | ||
18 | might_sleep(); \ | ||
19 | __asm__ __volatile__( \ | ||
20 | " testq %1,%1\n" \ | ||
21 | " jz 2f\n" \ | ||
22 | "0: lodsb\n" \ | ||
23 | " stosb\n" \ | ||
24 | " testb %%al,%%al\n" \ | ||
25 | " jz 1f\n" \ | ||
26 | " decq %1\n" \ | ||
27 | " jnz 0b\n" \ | ||
28 | "1: subq %1,%0\n" \ | ||
29 | "2:\n" \ | ||
30 | ".section .fixup,\"ax\"\n" \ | ||
31 | "3: movq %5,%0\n" \ | ||
32 | " jmp 2b\n" \ | ||
33 | ".previous\n" \ | ||
34 | ".section __ex_table,\"a\"\n" \ | ||
35 | " .align 8\n" \ | ||
36 | " .quad 0b,3b\n" \ | ||
37 | ".previous" \ | ||
38 | : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ | ||
39 | "=&D" (__d2) \ | ||
40 | : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ | ||
41 | : "memory"); \ | ||
42 | } while (0) | ||
43 | |||
44 | long | ||
45 | __strncpy_from_user(char *dst, const char __user *src, long count) | ||
46 | { | ||
47 | long res; | ||
48 | __do_strncpy_from_user(dst, src, count, res); | ||
49 | return res; | ||
50 | } | ||
51 | EXPORT_SYMBOL(__strncpy_from_user); | ||
52 | |||
53 | long | ||
54 | strncpy_from_user(char *dst, const char __user *src, long count) | ||
55 | { | ||
56 | long res = -EFAULT; | ||
57 | if (access_ok(VERIFY_READ, src, 1)) | ||
58 | return __strncpy_from_user(dst, src, count); | ||
59 | return res; | ||
60 | } | ||
61 | EXPORT_SYMBOL(strncpy_from_user); | ||
62 | |||
63 | /* | ||
64 | * Zero Userspace | ||
65 | */ | ||
66 | |||
67 | unsigned long __clear_user(void __user *addr, unsigned long size) | ||
68 | { | ||
69 | long __d0; | ||
70 | might_sleep(); | ||
71 | /* no memory constraint because it doesn't change any memory gcc knows | ||
72 | about */ | ||
73 | asm volatile( | ||
74 | " testq %[size8],%[size8]\n" | ||
75 | " jz 4f\n" | ||
76 | "0: movq %[zero],(%[dst])\n" | ||
77 | " addq %[eight],%[dst]\n" | ||
78 | " decl %%ecx ; jnz 0b\n" | ||
79 | "4: movq %[size1],%%rcx\n" | ||
80 | " testl %%ecx,%%ecx\n" | ||
81 | " jz 2f\n" | ||
82 | "1: movb %b[zero],(%[dst])\n" | ||
83 | " incq %[dst]\n" | ||
84 | " decl %%ecx ; jnz 1b\n" | ||
85 | "2:\n" | ||
86 | ".section .fixup,\"ax\"\n" | ||
87 | "3: lea 0(%[size1],%[size8],8),%[size8]\n" | ||
88 | " jmp 2b\n" | ||
89 | ".previous\n" | ||
90 | ".section __ex_table,\"a\"\n" | ||
91 | " .align 8\n" | ||
92 | " .quad 0b,3b\n" | ||
93 | " .quad 1b,2b\n" | ||
94 | ".previous" | ||
95 | : [size8] "=c"(size), [dst] "=&D" (__d0) | ||
96 | : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), | ||
97 | [zero] "r" (0UL), [eight] "r" (8UL)); | ||
98 | return size; | ||
99 | } | ||
100 | EXPORT_SYMBOL(__clear_user); | ||
101 | |||
102 | unsigned long clear_user(void __user *to, unsigned long n) | ||
103 | { | ||
104 | if (access_ok(VERIFY_WRITE, to, n)) | ||
105 | return __clear_user(to, n); | ||
106 | return n; | ||
107 | } | ||
108 | EXPORT_SYMBOL(clear_user); | ||
109 | |||
110 | /* | ||
111 | * Return the size of a string (including the ending 0) | ||
112 | * | ||
113 | * Return 0 on exception, a value greater than N if too long | ||
114 | */ | ||
115 | |||
116 | long __strnlen_user(const char __user *s, long n) | ||
117 | { | ||
118 | long res = 0; | ||
119 | char c; | ||
120 | |||
121 | while (1) { | ||
122 | if (res>n) | ||
123 | return n+1; | ||
124 | if (__get_user(c, s)) | ||
125 | return 0; | ||
126 | if (!c) | ||
127 | return res+1; | ||
128 | res++; | ||
129 | s++; | ||
130 | } | ||
131 | } | ||
132 | EXPORT_SYMBOL(__strnlen_user); | ||
133 | |||
134 | long strnlen_user(const char __user *s, long n) | ||
135 | { | ||
136 | if (!access_ok(VERIFY_READ, s, n)) | ||
137 | return 0; | ||
138 | return __strnlen_user(s, n); | ||
139 | } | ||
140 | EXPORT_SYMBOL(strnlen_user); | ||
141 | |||
142 | long strlen_user(const char __user *s) | ||
143 | { | ||
144 | long res = 0; | ||
145 | char c; | ||
146 | |||
147 | for (;;) { | ||
148 | if (get_user(c, s)) | ||
149 | return 0; | ||
150 | if (!c) | ||
151 | return res+1; | ||
152 | res++; | ||
153 | s++; | ||
154 | } | ||
155 | } | ||
156 | EXPORT_SYMBOL(strlen_user); | ||
157 | |||
158 | unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) | ||
159 | { | ||
160 | if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { | ||
161 | return copy_user_generic((__force void *)to, (__force void *)from, len); | ||
162 | } | ||
163 | return len; | ||
164 | } | ||
165 | EXPORT_SYMBOL(copy_in_user); | ||
166 | |||
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile deleted file mode 100644 index d25ac86fe27a..000000000000 --- a/arch/x86_64/mm/Makefile +++ /dev/null | |||
@@ -1,11 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for the linux x86_64-specific parts of the memory manager. | ||
3 | # | ||
4 | |||
5 | obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o | ||
6 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | ||
7 | obj-$(CONFIG_NUMA) += numa.o | ||
8 | obj-$(CONFIG_K8_NUMA) += k8topology.o | ||
9 | obj-$(CONFIG_ACPI_NUMA) += srat.o | ||
10 | |||
11 | hugetlbpage-y = ../../i386/mm/hugetlbpage.o | ||
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c deleted file mode 100644 index 79ac6e7100af..000000000000 --- a/arch/x86_64/mm/extable.c +++ /dev/null | |||
@@ -1,34 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/extable.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/module.h> | ||
6 | #include <linux/spinlock.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <asm/uaccess.h> | ||
9 | |||
10 | /* Simple binary search */ | ||
11 | const struct exception_table_entry * | ||
12 | search_extable(const struct exception_table_entry *first, | ||
13 | const struct exception_table_entry *last, | ||
14 | unsigned long value) | ||
15 | { | ||
16 | /* Work around a B stepping K8 bug */ | ||
17 | if ((value >> 32) == 0) | ||
18 | value |= 0xffffffffUL << 32; | ||
19 | |||
20 | while (first <= last) { | ||
21 | const struct exception_table_entry *mid; | ||
22 | long diff; | ||
23 | |||
24 | mid = (last - first) / 2 + first; | ||
25 | diff = mid->insn - value; | ||
26 | if (diff == 0) | ||
27 | return mid; | ||
28 | else if (diff < 0) | ||
29 | first = mid+1; | ||
30 | else | ||
31 | last = mid-1; | ||
32 | } | ||
33 | return NULL; | ||
34 | } | ||
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c deleted file mode 100644 index 54816adb8e93..000000000000 --- a/arch/x86_64/mm/fault.c +++ /dev/null | |||
@@ -1,636 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/mm/fault.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. | ||
6 | */ | ||
7 | |||
8 | #include <linux/signal.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/types.h> | ||
14 | #include <linux/ptrace.h> | ||
15 | #include <linux/mman.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/smp.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/tty.h> | ||
21 | #include <linux/vt_kern.h> /* For unblank_screen() */ | ||
22 | #include <linux/compiler.h> | ||
23 | #include <linux/vmalloc.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/kprobes.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <linux/kdebug.h> | ||
28 | |||
29 | #include <asm/system.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/smp.h> | ||
32 | #include <asm/tlbflush.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm-generic/sections.h> | ||
35 | |||
36 | /* Page fault error code bits */ | ||
37 | #define PF_PROT (1<<0) /* or no page found */ | ||
38 | #define PF_WRITE (1<<1) | ||
39 | #define PF_USER (1<<2) | ||
40 | #define PF_RSVD (1<<3) | ||
41 | #define PF_INSTR (1<<4) | ||
42 | |||
43 | static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); | ||
44 | |||
45 | /* Hook to register for page fault notifications */ | ||
46 | int register_page_fault_notifier(struct notifier_block *nb) | ||
47 | { | ||
48 | vmalloc_sync_all(); | ||
49 | return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); | ||
50 | } | ||
51 | EXPORT_SYMBOL_GPL(register_page_fault_notifier); | ||
52 | |||
53 | int unregister_page_fault_notifier(struct notifier_block *nb) | ||
54 | { | ||
55 | return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); | ||
56 | } | ||
57 | EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); | ||
58 | |||
59 | static inline int notify_page_fault(struct pt_regs *regs, long err) | ||
60 | { | ||
61 | struct die_args args = { | ||
62 | .regs = regs, | ||
63 | .str = "page fault", | ||
64 | .err = err, | ||
65 | .trapnr = 14, | ||
66 | .signr = SIGSEGV | ||
67 | }; | ||
68 | return atomic_notifier_call_chain(¬ify_page_fault_chain, | ||
69 | DIE_PAGE_FAULT, &args); | ||
70 | } | ||
71 | |||
72 | /* Sometimes the CPU reports invalid exceptions on prefetch. | ||
73 | Check that here and ignore. | ||
74 | Opcode checker based on code by Richard Brunner */ | ||
75 | static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, | ||
76 | unsigned long error_code) | ||
77 | { | ||
78 | unsigned char *instr; | ||
79 | int scan_more = 1; | ||
80 | int prefetch = 0; | ||
81 | unsigned char *max_instr; | ||
82 | |||
83 | /* If it was a exec fault ignore */ | ||
84 | if (error_code & PF_INSTR) | ||
85 | return 0; | ||
86 | |||
87 | instr = (unsigned char __user *)convert_rip_to_linear(current, regs); | ||
88 | max_instr = instr + 15; | ||
89 | |||
90 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | ||
91 | return 0; | ||
92 | |||
93 | while (scan_more && instr < max_instr) { | ||
94 | unsigned char opcode; | ||
95 | unsigned char instr_hi; | ||
96 | unsigned char instr_lo; | ||
97 | |||
98 | if (probe_kernel_address(instr, opcode)) | ||
99 | break; | ||
100 | |||
101 | instr_hi = opcode & 0xf0; | ||
102 | instr_lo = opcode & 0x0f; | ||
103 | instr++; | ||
104 | |||
105 | switch (instr_hi) { | ||
106 | case 0x20: | ||
107 | case 0x30: | ||
108 | /* Values 0x26,0x2E,0x36,0x3E are valid x86 | ||
109 | prefixes. In long mode, the CPU will signal | ||
110 | invalid opcode if some of these prefixes are | ||
111 | present so we will never get here anyway */ | ||
112 | scan_more = ((instr_lo & 7) == 0x6); | ||
113 | break; | ||
114 | |||
115 | case 0x40: | ||
116 | /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes | ||
117 | Need to figure out under what instruction mode the | ||
118 | instruction was issued ... */ | ||
119 | /* Could check the LDT for lm, but for now it's good | ||
120 | enough to assume that long mode only uses well known | ||
121 | segments or kernel. */ | ||
122 | scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); | ||
123 | break; | ||
124 | |||
125 | case 0x60: | ||
126 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ | ||
127 | scan_more = (instr_lo & 0xC) == 0x4; | ||
128 | break; | ||
129 | case 0xF0: | ||
130 | /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ | ||
131 | scan_more = !instr_lo || (instr_lo>>1) == 1; | ||
132 | break; | ||
133 | case 0x00: | ||
134 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ | ||
135 | scan_more = 0; | ||
136 | if (probe_kernel_address(instr, opcode)) | ||
137 | break; | ||
138 | prefetch = (instr_lo == 0xF) && | ||
139 | (opcode == 0x0D || opcode == 0x18); | ||
140 | break; | ||
141 | default: | ||
142 | scan_more = 0; | ||
143 | break; | ||
144 | } | ||
145 | } | ||
146 | return prefetch; | ||
147 | } | ||
148 | |||
149 | static int bad_address(void *p) | ||
150 | { | ||
151 | unsigned long dummy; | ||
152 | return probe_kernel_address((unsigned long *)p, dummy); | ||
153 | } | ||
154 | |||
155 | void dump_pagetable(unsigned long address) | ||
156 | { | ||
157 | pgd_t *pgd; | ||
158 | pud_t *pud; | ||
159 | pmd_t *pmd; | ||
160 | pte_t *pte; | ||
161 | |||
162 | pgd = (pgd_t *)read_cr3(); | ||
163 | |||
164 | pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); | ||
165 | pgd += pgd_index(address); | ||
166 | if (bad_address(pgd)) goto bad; | ||
167 | printk("PGD %lx ", pgd_val(*pgd)); | ||
168 | if (!pgd_present(*pgd)) goto ret; | ||
169 | |||
170 | pud = pud_offset(pgd, address); | ||
171 | if (bad_address(pud)) goto bad; | ||
172 | printk("PUD %lx ", pud_val(*pud)); | ||
173 | if (!pud_present(*pud)) goto ret; | ||
174 | |||
175 | pmd = pmd_offset(pud, address); | ||
176 | if (bad_address(pmd)) goto bad; | ||
177 | printk("PMD %lx ", pmd_val(*pmd)); | ||
178 | if (!pmd_present(*pmd)) goto ret; | ||
179 | |||
180 | pte = pte_offset_kernel(pmd, address); | ||
181 | if (bad_address(pte)) goto bad; | ||
182 | printk("PTE %lx", pte_val(*pte)); | ||
183 | ret: | ||
184 | printk("\n"); | ||
185 | return; | ||
186 | bad: | ||
187 | printk("BAD\n"); | ||
188 | } | ||
189 | |||
190 | static const char errata93_warning[] = | ||
191 | KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | ||
192 | KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | ||
193 | KERN_ERR "******* Please consider a BIOS update.\n" | ||
194 | KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | ||
195 | |||
196 | /* Workaround for K8 erratum #93 & buggy BIOS. | ||
197 | BIOS SMM functions are required to use a specific workaround | ||
198 | to avoid corruption of the 64bit RIP register on C stepping K8. | ||
199 | A lot of BIOS that didn't get tested properly miss this. | ||
200 | The OS sees this as a page fault with the upper 32bits of RIP cleared. | ||
201 | Try to work around it here. | ||
202 | Note we only handle faults in kernel here. */ | ||
203 | |||
204 | static int is_errata93(struct pt_regs *regs, unsigned long address) | ||
205 | { | ||
206 | static int warned; | ||
207 | if (address != regs->rip) | ||
208 | return 0; | ||
209 | if ((address >> 32) != 0) | ||
210 | return 0; | ||
211 | address |= 0xffffffffUL << 32; | ||
212 | if ((address >= (u64)_stext && address <= (u64)_etext) || | ||
213 | (address >= MODULES_VADDR && address <= MODULES_END)) { | ||
214 | if (!warned) { | ||
215 | printk(errata93_warning); | ||
216 | warned = 1; | ||
217 | } | ||
218 | regs->rip = address; | ||
219 | return 1; | ||
220 | } | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | ||
225 | unsigned long error_code) | ||
226 | { | ||
227 | unsigned long flags = oops_begin(); | ||
228 | struct task_struct *tsk; | ||
229 | |||
230 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | ||
231 | current->comm, address); | ||
232 | dump_pagetable(address); | ||
233 | tsk = current; | ||
234 | tsk->thread.cr2 = address; | ||
235 | tsk->thread.trap_no = 14; | ||
236 | tsk->thread.error_code = error_code; | ||
237 | __die("Bad pagetable", regs, error_code); | ||
238 | oops_end(flags); | ||
239 | do_exit(SIGKILL); | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * Handle a fault on the vmalloc area | ||
244 | * | ||
245 | * This assumes no large pages in there. | ||
246 | */ | ||
247 | static int vmalloc_fault(unsigned long address) | ||
248 | { | ||
249 | pgd_t *pgd, *pgd_ref; | ||
250 | pud_t *pud, *pud_ref; | ||
251 | pmd_t *pmd, *pmd_ref; | ||
252 | pte_t *pte, *pte_ref; | ||
253 | |||
254 | /* Copy kernel mappings over when needed. This can also | ||
255 | happen within a race in page table update. In the later | ||
256 | case just flush. */ | ||
257 | |||
258 | pgd = pgd_offset(current->mm ?: &init_mm, address); | ||
259 | pgd_ref = pgd_offset_k(address); | ||
260 | if (pgd_none(*pgd_ref)) | ||
261 | return -1; | ||
262 | if (pgd_none(*pgd)) | ||
263 | set_pgd(pgd, *pgd_ref); | ||
264 | else | ||
265 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
266 | |||
267 | /* Below here mismatches are bugs because these lower tables | ||
268 | are shared */ | ||
269 | |||
270 | pud = pud_offset(pgd, address); | ||
271 | pud_ref = pud_offset(pgd_ref, address); | ||
272 | if (pud_none(*pud_ref)) | ||
273 | return -1; | ||
274 | if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) | ||
275 | BUG(); | ||
276 | pmd = pmd_offset(pud, address); | ||
277 | pmd_ref = pmd_offset(pud_ref, address); | ||
278 | if (pmd_none(*pmd_ref)) | ||
279 | return -1; | ||
280 | if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | ||
281 | BUG(); | ||
282 | pte_ref = pte_offset_kernel(pmd_ref, address); | ||
283 | if (!pte_present(*pte_ref)) | ||
284 | return -1; | ||
285 | pte = pte_offset_kernel(pmd, address); | ||
286 | /* Don't use pte_page here, because the mappings can point | ||
287 | outside mem_map, and the NUMA hash lookup cannot handle | ||
288 | that. */ | ||
289 | if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) | ||
290 | BUG(); | ||
291 | return 0; | ||
292 | } | ||
293 | |||
294 | static int page_fault_trace; | ||
295 | int show_unhandled_signals = 1; | ||
296 | |||
297 | /* | ||
298 | * This routine handles page faults. It determines the address, | ||
299 | * and the problem, and then passes it off to one of the appropriate | ||
300 | * routines. | ||
301 | */ | ||
302 | asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | ||
303 | unsigned long error_code) | ||
304 | { | ||
305 | struct task_struct *tsk; | ||
306 | struct mm_struct *mm; | ||
307 | struct vm_area_struct * vma; | ||
308 | unsigned long address; | ||
309 | const struct exception_table_entry *fixup; | ||
310 | int write, fault; | ||
311 | unsigned long flags; | ||
312 | siginfo_t info; | ||
313 | |||
314 | tsk = current; | ||
315 | mm = tsk->mm; | ||
316 | prefetchw(&mm->mmap_sem); | ||
317 | |||
318 | /* get the address */ | ||
319 | address = read_cr2(); | ||
320 | |||
321 | info.si_code = SEGV_MAPERR; | ||
322 | |||
323 | |||
324 | /* | ||
325 | * We fault-in kernel-space virtual memory on-demand. The | ||
326 | * 'reference' page table is init_mm.pgd. | ||
327 | * | ||
328 | * NOTE! We MUST NOT take any locks for this case. We may | ||
329 | * be in an interrupt or a critical region, and should | ||
330 | * only copy the information from the master page table, | ||
331 | * nothing more. | ||
332 | * | ||
333 | * This verifies that the fault happens in kernel space | ||
334 | * (error_code & 4) == 0, and that the fault was not a | ||
335 | * protection error (error_code & 9) == 0. | ||
336 | */ | ||
337 | if (unlikely(address >= TASK_SIZE64)) { | ||
338 | /* | ||
339 | * Don't check for the module range here: its PML4 | ||
340 | * is always initialized because it's shared with the main | ||
341 | * kernel text. Only vmalloc may need PML4 syncups. | ||
342 | */ | ||
343 | if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && | ||
344 | ((address >= VMALLOC_START && address < VMALLOC_END))) { | ||
345 | if (vmalloc_fault(address) >= 0) | ||
346 | return; | ||
347 | } | ||
348 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) | ||
349 | return; | ||
350 | /* | ||
351 | * Don't take the mm semaphore here. If we fixup a prefetch | ||
352 | * fault we could otherwise deadlock. | ||
353 | */ | ||
354 | goto bad_area_nosemaphore; | ||
355 | } | ||
356 | |||
357 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) | ||
358 | return; | ||
359 | |||
360 | if (likely(regs->eflags & X86_EFLAGS_IF)) | ||
361 | local_irq_enable(); | ||
362 | |||
363 | if (unlikely(page_fault_trace)) | ||
364 | printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", | ||
365 | regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); | ||
366 | |||
367 | if (unlikely(error_code & PF_RSVD)) | ||
368 | pgtable_bad(address, regs, error_code); | ||
369 | |||
370 | /* | ||
371 | * If we're in an interrupt or have no user | ||
372 | * context, we must not take the fault.. | ||
373 | */ | ||
374 | if (unlikely(in_atomic() || !mm)) | ||
375 | goto bad_area_nosemaphore; | ||
376 | |||
377 | /* | ||
378 | * User-mode registers count as a user access even for any | ||
379 | * potential system fault or CPU buglet. | ||
380 | */ | ||
381 | if (user_mode_vm(regs)) | ||
382 | error_code |= PF_USER; | ||
383 | |||
384 | again: | ||
385 | /* When running in the kernel we expect faults to occur only to | ||
386 | * addresses in user space. All other faults represent errors in the | ||
387 | * kernel and should generate an OOPS. Unfortunatly, in the case of an | ||
388 | * erroneous fault occurring in a code path which already holds mmap_sem | ||
389 | * we will deadlock attempting to validate the fault against the | ||
390 | * address space. Luckily the kernel only validly references user | ||
391 | * space from well defined areas of code, which are listed in the | ||
392 | * exceptions table. | ||
393 | * | ||
394 | * As the vast majority of faults will be valid we will only perform | ||
395 | * the source reference check when there is a possibilty of a deadlock. | ||
396 | * Attempt to lock the address space, if we cannot we then validate the | ||
397 | * source. If this is invalid we can skip the address space check, | ||
398 | * thus avoiding the deadlock. | ||
399 | */ | ||
400 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
401 | if ((error_code & PF_USER) == 0 && | ||
402 | !search_exception_tables(regs->rip)) | ||
403 | goto bad_area_nosemaphore; | ||
404 | down_read(&mm->mmap_sem); | ||
405 | } | ||
406 | |||
407 | vma = find_vma(mm, address); | ||
408 | if (!vma) | ||
409 | goto bad_area; | ||
410 | if (likely(vma->vm_start <= address)) | ||
411 | goto good_area; | ||
412 | if (!(vma->vm_flags & VM_GROWSDOWN)) | ||
413 | goto bad_area; | ||
414 | if (error_code & 4) { | ||
415 | /* Allow userspace just enough access below the stack pointer | ||
416 | * to let the 'enter' instruction work. | ||
417 | */ | ||
418 | if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) | ||
419 | goto bad_area; | ||
420 | } | ||
421 | if (expand_stack(vma, address)) | ||
422 | goto bad_area; | ||
423 | /* | ||
424 | * Ok, we have a good vm_area for this memory access, so | ||
425 | * we can handle it.. | ||
426 | */ | ||
427 | good_area: | ||
428 | info.si_code = SEGV_ACCERR; | ||
429 | write = 0; | ||
430 | switch (error_code & (PF_PROT|PF_WRITE)) { | ||
431 | default: /* 3: write, present */ | ||
432 | /* fall through */ | ||
433 | case PF_WRITE: /* write, not present */ | ||
434 | if (!(vma->vm_flags & VM_WRITE)) | ||
435 | goto bad_area; | ||
436 | write++; | ||
437 | break; | ||
438 | case PF_PROT: /* read, present */ | ||
439 | goto bad_area; | ||
440 | case 0: /* read, not present */ | ||
441 | if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | ||
442 | goto bad_area; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * If for any reason at all we couldn't handle the fault, | ||
447 | * make sure we exit gracefully rather than endlessly redo | ||
448 | * the fault. | ||
449 | */ | ||
450 | fault = handle_mm_fault(mm, vma, address, write); | ||
451 | if (unlikely(fault & VM_FAULT_ERROR)) { | ||
452 | if (fault & VM_FAULT_OOM) | ||
453 | goto out_of_memory; | ||
454 | else if (fault & VM_FAULT_SIGBUS) | ||
455 | goto do_sigbus; | ||
456 | BUG(); | ||
457 | } | ||
458 | if (fault & VM_FAULT_MAJOR) | ||
459 | tsk->maj_flt++; | ||
460 | else | ||
461 | tsk->min_flt++; | ||
462 | up_read(&mm->mmap_sem); | ||
463 | return; | ||
464 | |||
465 | /* | ||
466 | * Something tried to access memory that isn't in our memory map.. | ||
467 | * Fix it, but check if it's kernel or user first.. | ||
468 | */ | ||
469 | bad_area: | ||
470 | up_read(&mm->mmap_sem); | ||
471 | |||
472 | bad_area_nosemaphore: | ||
473 | /* User mode accesses just cause a SIGSEGV */ | ||
474 | if (error_code & PF_USER) { | ||
475 | |||
476 | /* | ||
477 | * It's possible to have interrupts off here. | ||
478 | */ | ||
479 | local_irq_enable(); | ||
480 | |||
481 | if (is_prefetch(regs, address, error_code)) | ||
482 | return; | ||
483 | |||
484 | /* Work around K8 erratum #100 K8 in compat mode | ||
485 | occasionally jumps to illegal addresses >4GB. We | ||
486 | catch this here in the page fault handler because | ||
487 | these addresses are not reachable. Just detect this | ||
488 | case and return. Any code segment in LDT is | ||
489 | compatibility mode. */ | ||
490 | if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && | ||
491 | (address >> 32)) | ||
492 | return; | ||
493 | |||
494 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
495 | printk_ratelimit()) { | ||
496 | printk( | ||
497 | "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", | ||
498 | tsk->pid > 1 ? KERN_INFO : KERN_EMERG, | ||
499 | tsk->comm, tsk->pid, address, regs->rip, | ||
500 | regs->rsp, error_code); | ||
501 | } | ||
502 | |||
503 | tsk->thread.cr2 = address; | ||
504 | /* Kernel addresses are always protection faults */ | ||
505 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | ||
506 | tsk->thread.trap_no = 14; | ||
507 | info.si_signo = SIGSEGV; | ||
508 | info.si_errno = 0; | ||
509 | /* info.si_code has been set above */ | ||
510 | info.si_addr = (void __user *)address; | ||
511 | force_sig_info(SIGSEGV, &info, tsk); | ||
512 | return; | ||
513 | } | ||
514 | |||
515 | no_context: | ||
516 | |||
517 | /* Are we prepared to handle this kernel fault? */ | ||
518 | fixup = search_exception_tables(regs->rip); | ||
519 | if (fixup) { | ||
520 | regs->rip = fixup->fixup; | ||
521 | return; | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Hall of shame of CPU/BIOS bugs. | ||
526 | */ | ||
527 | |||
528 | if (is_prefetch(regs, address, error_code)) | ||
529 | return; | ||
530 | |||
531 | if (is_errata93(regs, address)) | ||
532 | return; | ||
533 | |||
534 | /* | ||
535 | * Oops. The kernel tried to access some bad page. We'll have to | ||
536 | * terminate things with extreme prejudice. | ||
537 | */ | ||
538 | |||
539 | flags = oops_begin(); | ||
540 | |||
541 | if (address < PAGE_SIZE) | ||
542 | printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | ||
543 | else | ||
544 | printk(KERN_ALERT "Unable to handle kernel paging request"); | ||
545 | printk(" at %016lx RIP: \n" KERN_ALERT,address); | ||
546 | printk_address(regs->rip); | ||
547 | dump_pagetable(address); | ||
548 | tsk->thread.cr2 = address; | ||
549 | tsk->thread.trap_no = 14; | ||
550 | tsk->thread.error_code = error_code; | ||
551 | __die("Oops", regs, error_code); | ||
552 | /* Executive summary in case the body of the oops scrolled away */ | ||
553 | printk(KERN_EMERG "CR2: %016lx\n", address); | ||
554 | oops_end(flags); | ||
555 | do_exit(SIGKILL); | ||
556 | |||
557 | /* | ||
558 | * We ran out of memory, or some other thing happened to us that made | ||
559 | * us unable to handle the page fault gracefully. | ||
560 | */ | ||
561 | out_of_memory: | ||
562 | up_read(&mm->mmap_sem); | ||
563 | if (is_init(current)) { | ||
564 | yield(); | ||
565 | goto again; | ||
566 | } | ||
567 | printk("VM: killing process %s\n", tsk->comm); | ||
568 | if (error_code & 4) | ||
569 | do_group_exit(SIGKILL); | ||
570 | goto no_context; | ||
571 | |||
572 | do_sigbus: | ||
573 | up_read(&mm->mmap_sem); | ||
574 | |||
575 | /* Kernel mode? Handle exceptions or die */ | ||
576 | if (!(error_code & PF_USER)) | ||
577 | goto no_context; | ||
578 | |||
579 | tsk->thread.cr2 = address; | ||
580 | tsk->thread.error_code = error_code; | ||
581 | tsk->thread.trap_no = 14; | ||
582 | info.si_signo = SIGBUS; | ||
583 | info.si_errno = 0; | ||
584 | info.si_code = BUS_ADRERR; | ||
585 | info.si_addr = (void __user *)address; | ||
586 | force_sig_info(SIGBUS, &info, tsk); | ||
587 | return; | ||
588 | } | ||
589 | |||
590 | DEFINE_SPINLOCK(pgd_lock); | ||
591 | LIST_HEAD(pgd_list); | ||
592 | |||
593 | void vmalloc_sync_all(void) | ||
594 | { | ||
595 | /* Note that races in the updates of insync and start aren't | ||
596 | problematic: | ||
597 | insync can only get set bits added, and updates to start are only | ||
598 | improving performance (without affecting correctness if undone). */ | ||
599 | static DECLARE_BITMAP(insync, PTRS_PER_PGD); | ||
600 | static unsigned long start = VMALLOC_START & PGDIR_MASK; | ||
601 | unsigned long address; | ||
602 | |||
603 | for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { | ||
604 | if (!test_bit(pgd_index(address), insync)) { | ||
605 | const pgd_t *pgd_ref = pgd_offset_k(address); | ||
606 | struct page *page; | ||
607 | |||
608 | if (pgd_none(*pgd_ref)) | ||
609 | continue; | ||
610 | spin_lock(&pgd_lock); | ||
611 | list_for_each_entry(page, &pgd_list, lru) { | ||
612 | pgd_t *pgd; | ||
613 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
614 | if (pgd_none(*pgd)) | ||
615 | set_pgd(pgd, *pgd_ref); | ||
616 | else | ||
617 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
618 | } | ||
619 | spin_unlock(&pgd_lock); | ||
620 | set_bit(pgd_index(address), insync); | ||
621 | } | ||
622 | if (address == start) | ||
623 | start = address + PGDIR_SIZE; | ||
624 | } | ||
625 | /* Check that there is no need to do the same for the modules area. */ | ||
626 | BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); | ||
627 | BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == | ||
628 | (__START_KERNEL & PGDIR_MASK))); | ||
629 | } | ||
630 | |||
631 | static int __init enable_pagefaulttrace(char *str) | ||
632 | { | ||
633 | page_fault_trace = 1; | ||
634 | return 1; | ||
635 | } | ||
636 | __setup("pagefaulttrace", enable_pagefaulttrace); | ||
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c deleted file mode 100644 index 458893b376f8..000000000000 --- a/arch/x86_64/mm/init.c +++ /dev/null | |||
@@ -1,750 +0,0 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/init.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> | ||
7 | */ | ||
8 | |||
9 | #include <linux/signal.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/errno.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/mman.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/swap.h> | ||
19 | #include <linux/smp.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/pagemap.h> | ||
22 | #include <linux/bootmem.h> | ||
23 | #include <linux/proc_fs.h> | ||
24 | #include <linux/pci.h> | ||
25 | #include <linux/pfn.h> | ||
26 | #include <linux/poison.h> | ||
27 | #include <linux/dma-mapping.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/memory_hotplug.h> | ||
30 | #include <linux/nmi.h> | ||
31 | |||
32 | #include <asm/processor.h> | ||
33 | #include <asm/system.h> | ||
34 | #include <asm/uaccess.h> | ||
35 | #include <asm/pgtable.h> | ||
36 | #include <asm/pgalloc.h> | ||
37 | #include <asm/dma.h> | ||
38 | #include <asm/fixmap.h> | ||
39 | #include <asm/e820.h> | ||
40 | #include <asm/apic.h> | ||
41 | #include <asm/tlb.h> | ||
42 | #include <asm/mmu_context.h> | ||
43 | #include <asm/proto.h> | ||
44 | #include <asm/smp.h> | ||
45 | #include <asm/sections.h> | ||
46 | |||
47 | #ifndef Dprintk | ||
48 | #define Dprintk(x...) | ||
49 | #endif | ||
50 | |||
51 | const struct dma_mapping_ops* dma_ops; | ||
52 | EXPORT_SYMBOL(dma_ops); | ||
53 | |||
54 | static unsigned long dma_reserve __initdata; | ||
55 | |||
56 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
57 | |||
58 | /* | ||
59 | * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the | ||
60 | * physical space so we can cache the place of the first one and move | ||
61 | * around without checking the pgd every time. | ||
62 | */ | ||
63 | |||
64 | void show_mem(void) | ||
65 | { | ||
66 | long i, total = 0, reserved = 0; | ||
67 | long shared = 0, cached = 0; | ||
68 | pg_data_t *pgdat; | ||
69 | struct page *page; | ||
70 | |||
71 | printk(KERN_INFO "Mem-info:\n"); | ||
72 | show_free_areas(); | ||
73 | printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | ||
74 | |||
75 | for_each_online_pgdat(pgdat) { | ||
76 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
77 | /* this loop can take a while with 256 GB and 4k pages | ||
78 | so update the NMI watchdog */ | ||
79 | if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { | ||
80 | touch_nmi_watchdog(); | ||
81 | } | ||
82 | if (!pfn_valid(pgdat->node_start_pfn + i)) | ||
83 | continue; | ||
84 | page = pfn_to_page(pgdat->node_start_pfn + i); | ||
85 | total++; | ||
86 | if (PageReserved(page)) | ||
87 | reserved++; | ||
88 | else if (PageSwapCache(page)) | ||
89 | cached++; | ||
90 | else if (page_count(page)) | ||
91 | shared += page_count(page) - 1; | ||
92 | } | ||
93 | } | ||
94 | printk(KERN_INFO "%lu pages of RAM\n", total); | ||
95 | printk(KERN_INFO "%lu reserved pages\n",reserved); | ||
96 | printk(KERN_INFO "%lu pages shared\n",shared); | ||
97 | printk(KERN_INFO "%lu pages swap cached\n",cached); | ||
98 | } | ||
99 | |||
100 | int after_bootmem; | ||
101 | |||
102 | static __init void *spp_getpage(void) | ||
103 | { | ||
104 | void *ptr; | ||
105 | if (after_bootmem) | ||
106 | ptr = (void *) get_zeroed_page(GFP_ATOMIC); | ||
107 | else | ||
108 | ptr = alloc_bootmem_pages(PAGE_SIZE); | ||
109 | if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) | ||
110 | panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); | ||
111 | |||
112 | Dprintk("spp_getpage %p\n", ptr); | ||
113 | return ptr; | ||
114 | } | ||
115 | |||
116 | static __init void set_pte_phys(unsigned long vaddr, | ||
117 | unsigned long phys, pgprot_t prot) | ||
118 | { | ||
119 | pgd_t *pgd; | ||
120 | pud_t *pud; | ||
121 | pmd_t *pmd; | ||
122 | pte_t *pte, new_pte; | ||
123 | |||
124 | Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | ||
125 | |||
126 | pgd = pgd_offset_k(vaddr); | ||
127 | if (pgd_none(*pgd)) { | ||
128 | printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | ||
129 | return; | ||
130 | } | ||
131 | pud = pud_offset(pgd, vaddr); | ||
132 | if (pud_none(*pud)) { | ||
133 | pmd = (pmd_t *) spp_getpage(); | ||
134 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | ||
135 | if (pmd != pmd_offset(pud, 0)) { | ||
136 | printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | ||
137 | return; | ||
138 | } | ||
139 | } | ||
140 | pmd = pmd_offset(pud, vaddr); | ||
141 | if (pmd_none(*pmd)) { | ||
142 | pte = (pte_t *) spp_getpage(); | ||
143 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | ||
144 | if (pte != pte_offset_kernel(pmd, 0)) { | ||
145 | printk("PAGETABLE BUG #02!\n"); | ||
146 | return; | ||
147 | } | ||
148 | } | ||
149 | new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); | ||
150 | |||
151 | pte = pte_offset_kernel(pmd, vaddr); | ||
152 | if (!pte_none(*pte) && | ||
153 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) | ||
154 | pte_ERROR(*pte); | ||
155 | set_pte(pte, new_pte); | ||
156 | |||
157 | /* | ||
158 | * It's enough to flush this one mapping. | ||
159 | * (PGE mappings get flushed as well) | ||
160 | */ | ||
161 | __flush_tlb_one(vaddr); | ||
162 | } | ||
163 | |||
164 | /* NOTE: this is meant to be run only at boot */ | ||
165 | void __init | ||
166 | __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | ||
167 | { | ||
168 | unsigned long address = __fix_to_virt(idx); | ||
169 | |||
170 | if (idx >= __end_of_fixed_addresses) { | ||
171 | printk("Invalid __set_fixmap\n"); | ||
172 | return; | ||
173 | } | ||
174 | set_pte_phys(address, phys, prot); | ||
175 | } | ||
176 | |||
177 | unsigned long __meminitdata table_start, table_end; | ||
178 | |||
179 | static __meminit void *alloc_low_page(unsigned long *phys) | ||
180 | { | ||
181 | unsigned long pfn = table_end++; | ||
182 | void *adr; | ||
183 | |||
184 | if (after_bootmem) { | ||
185 | adr = (void *)get_zeroed_page(GFP_ATOMIC); | ||
186 | *phys = __pa(adr); | ||
187 | return adr; | ||
188 | } | ||
189 | |||
190 | if (pfn >= end_pfn) | ||
191 | panic("alloc_low_page: ran out of memory"); | ||
192 | |||
193 | adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); | ||
194 | memset(adr, 0, PAGE_SIZE); | ||
195 | *phys = pfn * PAGE_SIZE; | ||
196 | return adr; | ||
197 | } | ||
198 | |||
199 | static __meminit void unmap_low_page(void *adr) | ||
200 | { | ||
201 | |||
202 | if (after_bootmem) | ||
203 | return; | ||
204 | |||
205 | early_iounmap(adr, PAGE_SIZE); | ||
206 | } | ||
207 | |||
208 | /* Must run before zap_low_mappings */ | ||
209 | __meminit void *early_ioremap(unsigned long addr, unsigned long size) | ||
210 | { | ||
211 | unsigned long vaddr; | ||
212 | pmd_t *pmd, *last_pmd; | ||
213 | int i, pmds; | ||
214 | |||
215 | pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | ||
216 | vaddr = __START_KERNEL_map; | ||
217 | pmd = level2_kernel_pgt; | ||
218 | last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; | ||
219 | for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { | ||
220 | for (i = 0; i < pmds; i++) { | ||
221 | if (pmd_present(pmd[i])) | ||
222 | goto next; | ||
223 | } | ||
224 | vaddr += addr & ~PMD_MASK; | ||
225 | addr &= PMD_MASK; | ||
226 | for (i = 0; i < pmds; i++, addr += PMD_SIZE) | ||
227 | set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); | ||
228 | __flush_tlb(); | ||
229 | return (void *)vaddr; | ||
230 | next: | ||
231 | ; | ||
232 | } | ||
233 | printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); | ||
234 | return NULL; | ||
235 | } | ||
236 | |||
237 | /* To avoid virtual aliases later */ | ||
238 | __meminit void early_iounmap(void *addr, unsigned long size) | ||
239 | { | ||
240 | unsigned long vaddr; | ||
241 | pmd_t *pmd; | ||
242 | int i, pmds; | ||
243 | |||
244 | vaddr = (unsigned long)addr; | ||
245 | pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | ||
246 | pmd = level2_kernel_pgt + pmd_index(vaddr); | ||
247 | for (i = 0; i < pmds; i++) | ||
248 | pmd_clear(pmd + i); | ||
249 | __flush_tlb(); | ||
250 | } | ||
251 | |||
252 | static void __meminit | ||
253 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) | ||
254 | { | ||
255 | int i = pmd_index(address); | ||
256 | |||
257 | for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { | ||
258 | unsigned long entry; | ||
259 | pmd_t *pmd = pmd_page + pmd_index(address); | ||
260 | |||
261 | if (address >= end) { | ||
262 | if (!after_bootmem) | ||
263 | for (; i < PTRS_PER_PMD; i++, pmd++) | ||
264 | set_pmd(pmd, __pmd(0)); | ||
265 | break; | ||
266 | } | ||
267 | |||
268 | if (pmd_val(*pmd)) | ||
269 | continue; | ||
270 | |||
271 | entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; | ||
272 | entry &= __supported_pte_mask; | ||
273 | set_pmd(pmd, __pmd(entry)); | ||
274 | } | ||
275 | } | ||
276 | |||
277 | static void __meminit | ||
278 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) | ||
279 | { | ||
280 | pmd_t *pmd = pmd_offset(pud,0); | ||
281 | spin_lock(&init_mm.page_table_lock); | ||
282 | phys_pmd_init(pmd, address, end); | ||
283 | spin_unlock(&init_mm.page_table_lock); | ||
284 | __flush_tlb_all(); | ||
285 | } | ||
286 | |||
287 | static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) | ||
288 | { | ||
289 | int i = pud_index(addr); | ||
290 | |||
291 | |||
292 | for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { | ||
293 | unsigned long pmd_phys; | ||
294 | pud_t *pud = pud_page + pud_index(addr); | ||
295 | pmd_t *pmd; | ||
296 | |||
297 | if (addr >= end) | ||
298 | break; | ||
299 | |||
300 | if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { | ||
301 | set_pud(pud, __pud(0)); | ||
302 | continue; | ||
303 | } | ||
304 | |||
305 | if (pud_val(*pud)) { | ||
306 | phys_pmd_update(pud, addr, end); | ||
307 | continue; | ||
308 | } | ||
309 | |||
310 | pmd = alloc_low_page(&pmd_phys); | ||
311 | spin_lock(&init_mm.page_table_lock); | ||
312 | set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); | ||
313 | phys_pmd_init(pmd, addr, end); | ||
314 | spin_unlock(&init_mm.page_table_lock); | ||
315 | unmap_low_page(pmd); | ||
316 | } | ||
317 | __flush_tlb(); | ||
318 | } | ||
319 | |||
320 | static void __init find_early_table_space(unsigned long end) | ||
321 | { | ||
322 | unsigned long puds, pmds, tables, start; | ||
323 | |||
324 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
325 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
326 | tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + | ||
327 | round_up(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
328 | |||
329 | /* RED-PEN putting page tables only on node 0 could | ||
330 | cause a hotspot and fill up ZONE_DMA. The page tables | ||
331 | need roughly 0.5KB per GB. */ | ||
332 | start = 0x8000; | ||
333 | table_start = find_e820_area(start, end, tables); | ||
334 | if (table_start == -1UL) | ||
335 | panic("Cannot find space for the kernel page tables"); | ||
336 | |||
337 | table_start >>= PAGE_SHIFT; | ||
338 | table_end = table_start; | ||
339 | |||
340 | early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
341 | end, table_start << PAGE_SHIFT, | ||
342 | (table_start << PAGE_SHIFT) + tables); | ||
343 | } | ||
344 | |||
345 | /* Setup the direct mapping of the physical memory at PAGE_OFFSET. | ||
346 | This runs before bootmem is initialized and gets pages directly from the | ||
347 | physical memory. To access them they are temporarily mapped. */ | ||
348 | void __meminit init_memory_mapping(unsigned long start, unsigned long end) | ||
349 | { | ||
350 | unsigned long next; | ||
351 | |||
352 | Dprintk("init_memory_mapping\n"); | ||
353 | |||
354 | /* | ||
355 | * Find space for the kernel direct mapping tables. | ||
356 | * Later we should allocate these tables in the local node of the memory | ||
357 | * mapped. Unfortunately this is done currently before the nodes are | ||
358 | * discovered. | ||
359 | */ | ||
360 | if (!after_bootmem) | ||
361 | find_early_table_space(end); | ||
362 | |||
363 | start = (unsigned long)__va(start); | ||
364 | end = (unsigned long)__va(end); | ||
365 | |||
366 | for (; start < end; start = next) { | ||
367 | unsigned long pud_phys; | ||
368 | pgd_t *pgd = pgd_offset_k(start); | ||
369 | pud_t *pud; | ||
370 | |||
371 | if (after_bootmem) | ||
372 | pud = pud_offset(pgd, start & PGDIR_MASK); | ||
373 | else | ||
374 | pud = alloc_low_page(&pud_phys); | ||
375 | |||
376 | next = start + PGDIR_SIZE; | ||
377 | if (next > end) | ||
378 | next = end; | ||
379 | phys_pud_init(pud, __pa(start), __pa(next)); | ||
380 | if (!after_bootmem) | ||
381 | set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); | ||
382 | unmap_low_page(pud); | ||
383 | } | ||
384 | |||
385 | if (!after_bootmem) | ||
386 | mmu_cr4_features = read_cr4(); | ||
387 | __flush_tlb_all(); | ||
388 | } | ||
389 | |||
390 | #ifndef CONFIG_NUMA | ||
391 | void __init paging_init(void) | ||
392 | { | ||
393 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
394 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
395 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | ||
396 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | ||
397 | max_zone_pfns[ZONE_NORMAL] = end_pfn; | ||
398 | |||
399 | memory_present(0, 0, end_pfn); | ||
400 | sparse_init(); | ||
401 | free_area_init_nodes(max_zone_pfns); | ||
402 | } | ||
403 | #endif | ||
404 | |||
405 | /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches | ||
406 | from the CPU leading to inconsistent cache lines. address and size | ||
407 | must be aligned to 2MB boundaries. | ||
408 | Does nothing when the mapping doesn't exist. */ | ||
409 | void __init clear_kernel_mapping(unsigned long address, unsigned long size) | ||
410 | { | ||
411 | unsigned long end = address + size; | ||
412 | |||
413 | BUG_ON(address & ~LARGE_PAGE_MASK); | ||
414 | BUG_ON(size & ~LARGE_PAGE_MASK); | ||
415 | |||
416 | for (; address < end; address += LARGE_PAGE_SIZE) { | ||
417 | pgd_t *pgd = pgd_offset_k(address); | ||
418 | pud_t *pud; | ||
419 | pmd_t *pmd; | ||
420 | if (pgd_none(*pgd)) | ||
421 | continue; | ||
422 | pud = pud_offset(pgd, address); | ||
423 | if (pud_none(*pud)) | ||
424 | continue; | ||
425 | pmd = pmd_offset(pud, address); | ||
426 | if (!pmd || pmd_none(*pmd)) | ||
427 | continue; | ||
428 | if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { | ||
429 | /* Could handle this, but it should not happen currently. */ | ||
430 | printk(KERN_ERR | ||
431 | "clear_kernel_mapping: mapping has been split. will leak memory\n"); | ||
432 | pmd_ERROR(*pmd); | ||
433 | } | ||
434 | set_pmd(pmd, __pmd(0)); | ||
435 | } | ||
436 | __flush_tlb_all(); | ||
437 | } | ||
438 | |||
439 | /* | ||
440 | * Memory hotplug specific functions | ||
441 | */ | ||
442 | void online_page(struct page *page) | ||
443 | { | ||
444 | ClearPageReserved(page); | ||
445 | init_page_count(page); | ||
446 | __free_page(page); | ||
447 | totalram_pages++; | ||
448 | num_physpages++; | ||
449 | } | ||
450 | |||
451 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
452 | /* | ||
453 | * Memory is added always to NORMAL zone. This means you will never get | ||
454 | * additional DMA/DMA32 memory. | ||
455 | */ | ||
456 | int arch_add_memory(int nid, u64 start, u64 size) | ||
457 | { | ||
458 | struct pglist_data *pgdat = NODE_DATA(nid); | ||
459 | struct zone *zone = pgdat->node_zones + ZONE_NORMAL; | ||
460 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
461 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
462 | int ret; | ||
463 | |||
464 | init_memory_mapping(start, (start + size -1)); | ||
465 | |||
466 | ret = __add_pages(zone, start_pfn, nr_pages); | ||
467 | if (ret) | ||
468 | goto error; | ||
469 | |||
470 | return ret; | ||
471 | error: | ||
472 | printk("%s: Problem encountered in __add_pages!\n", __func__); | ||
473 | return ret; | ||
474 | } | ||
475 | EXPORT_SYMBOL_GPL(arch_add_memory); | ||
476 | |||
477 | int remove_memory(u64 start, u64 size) | ||
478 | { | ||
479 | return -EINVAL; | ||
480 | } | ||
481 | EXPORT_SYMBOL_GPL(remove_memory); | ||
482 | |||
483 | #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) | ||
484 | int memory_add_physaddr_to_nid(u64 start) | ||
485 | { | ||
486 | return 0; | ||
487 | } | ||
488 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
489 | #endif | ||
490 | |||
491 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
492 | |||
493 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
494 | /* | ||
495 | * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, | ||
496 | * just online the pages. | ||
497 | */ | ||
498 | int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) | ||
499 | { | ||
500 | int err = -EIO; | ||
501 | unsigned long pfn; | ||
502 | unsigned long total = 0, mem = 0; | ||
503 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { | ||
504 | if (pfn_valid(pfn)) { | ||
505 | online_page(pfn_to_page(pfn)); | ||
506 | err = 0; | ||
507 | mem++; | ||
508 | } | ||
509 | total++; | ||
510 | } | ||
511 | if (!err) { | ||
512 | z->spanned_pages += total; | ||
513 | z->present_pages += mem; | ||
514 | z->zone_pgdat->node_spanned_pages += total; | ||
515 | z->zone_pgdat->node_present_pages += mem; | ||
516 | } | ||
517 | return err; | ||
518 | } | ||
519 | #endif | ||
520 | |||
521 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, | ||
522 | kcore_vsyscall; | ||
523 | |||
524 | void __init mem_init(void) | ||
525 | { | ||
526 | long codesize, reservedpages, datasize, initsize; | ||
527 | |||
528 | pci_iommu_alloc(); | ||
529 | |||
530 | /* clear the zero-page */ | ||
531 | memset(empty_zero_page, 0, PAGE_SIZE); | ||
532 | |||
533 | reservedpages = 0; | ||
534 | |||
535 | /* this will put all low memory onto the freelists */ | ||
536 | #ifdef CONFIG_NUMA | ||
537 | totalram_pages = numa_free_all_bootmem(); | ||
538 | #else | ||
539 | totalram_pages = free_all_bootmem(); | ||
540 | #endif | ||
541 | reservedpages = end_pfn - totalram_pages - | ||
542 | absent_pages_in_range(0, end_pfn); | ||
543 | |||
544 | after_bootmem = 1; | ||
545 | |||
546 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | ||
547 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | ||
548 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | ||
549 | |||
550 | /* Register memory areas for /proc/kcore */ | ||
551 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
552 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
553 | VMALLOC_END-VMALLOC_START); | ||
554 | kclist_add(&kcore_kernel, &_stext, _end - _stext); | ||
555 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); | ||
556 | kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | ||
557 | VSYSCALL_END - VSYSCALL_START); | ||
558 | |||
559 | printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", | ||
560 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | ||
561 | end_pfn << (PAGE_SHIFT-10), | ||
562 | codesize >> 10, | ||
563 | reservedpages << (PAGE_SHIFT-10), | ||
564 | datasize >> 10, | ||
565 | initsize >> 10); | ||
566 | } | ||
567 | |||
568 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | ||
569 | { | ||
570 | unsigned long addr; | ||
571 | |||
572 | if (begin >= end) | ||
573 | return; | ||
574 | |||
575 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); | ||
576 | for (addr = begin; addr < end; addr += PAGE_SIZE) { | ||
577 | ClearPageReserved(virt_to_page(addr)); | ||
578 | init_page_count(virt_to_page(addr)); | ||
579 | memset((void *)(addr & ~(PAGE_SIZE-1)), | ||
580 | POISON_FREE_INITMEM, PAGE_SIZE); | ||
581 | if (addr >= __START_KERNEL_map) | ||
582 | change_page_attr_addr(addr, 1, __pgprot(0)); | ||
583 | free_page(addr); | ||
584 | totalram_pages++; | ||
585 | } | ||
586 | if (addr > __START_KERNEL_map) | ||
587 | global_flush_tlb(); | ||
588 | } | ||
589 | |||
590 | void free_initmem(void) | ||
591 | { | ||
592 | free_init_pages("unused kernel memory", | ||
593 | (unsigned long)(&__init_begin), | ||
594 | (unsigned long)(&__init_end)); | ||
595 | } | ||
596 | |||
597 | #ifdef CONFIG_DEBUG_RODATA | ||
598 | |||
599 | void mark_rodata_ro(void) | ||
600 | { | ||
601 | unsigned long start = (unsigned long)_stext, end; | ||
602 | |||
603 | #ifdef CONFIG_HOTPLUG_CPU | ||
604 | /* It must still be possible to apply SMP alternatives. */ | ||
605 | if (num_possible_cpus() > 1) | ||
606 | start = (unsigned long)_etext; | ||
607 | #endif | ||
608 | |||
609 | #ifdef CONFIG_KPROBES | ||
610 | start = (unsigned long)__start_rodata; | ||
611 | #endif | ||
612 | |||
613 | end = (unsigned long)__end_rodata; | ||
614 | start = (start + PAGE_SIZE - 1) & PAGE_MASK; | ||
615 | end &= PAGE_MASK; | ||
616 | if (end <= start) | ||
617 | return; | ||
618 | |||
619 | change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); | ||
620 | |||
621 | printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", | ||
622 | (end - start) >> 10); | ||
623 | |||
624 | /* | ||
625 | * change_page_attr_addr() requires a global_flush_tlb() call after it. | ||
626 | * We do this after the printk so that if something went wrong in the | ||
627 | * change, the printk gets out at least to give a better debug hint | ||
628 | * of who is the culprit. | ||
629 | */ | ||
630 | global_flush_tlb(); | ||
631 | } | ||
632 | #endif | ||
633 | |||
634 | #ifdef CONFIG_BLK_DEV_INITRD | ||
635 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
636 | { | ||
637 | free_init_pages("initrd memory", start, end); | ||
638 | } | ||
639 | #endif | ||
640 | |||
641 | void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | ||
642 | { | ||
643 | #ifdef CONFIG_NUMA | ||
644 | int nid = phys_to_nid(phys); | ||
645 | #endif | ||
646 | unsigned long pfn = phys >> PAGE_SHIFT; | ||
647 | if (pfn >= end_pfn) { | ||
648 | /* This can happen with kdump kernels when accessing firmware | ||
649 | tables. */ | ||
650 | if (pfn < end_pfn_map) | ||
651 | return; | ||
652 | printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", | ||
653 | phys, len); | ||
654 | return; | ||
655 | } | ||
656 | |||
657 | /* Should check here against the e820 map to avoid double free */ | ||
658 | #ifdef CONFIG_NUMA | ||
659 | reserve_bootmem_node(NODE_DATA(nid), phys, len); | ||
660 | #else | ||
661 | reserve_bootmem(phys, len); | ||
662 | #endif | ||
663 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | ||
664 | dma_reserve += len / PAGE_SIZE; | ||
665 | set_dma_reserve(dma_reserve); | ||
666 | } | ||
667 | } | ||
668 | |||
669 | int kern_addr_valid(unsigned long addr) | ||
670 | { | ||
671 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | ||
672 | pgd_t *pgd; | ||
673 | pud_t *pud; | ||
674 | pmd_t *pmd; | ||
675 | pte_t *pte; | ||
676 | |||
677 | if (above != 0 && above != -1UL) | ||
678 | return 0; | ||
679 | |||
680 | pgd = pgd_offset_k(addr); | ||
681 | if (pgd_none(*pgd)) | ||
682 | return 0; | ||
683 | |||
684 | pud = pud_offset(pgd, addr); | ||
685 | if (pud_none(*pud)) | ||
686 | return 0; | ||
687 | |||
688 | pmd = pmd_offset(pud, addr); | ||
689 | if (pmd_none(*pmd)) | ||
690 | return 0; | ||
691 | if (pmd_large(*pmd)) | ||
692 | return pfn_valid(pmd_pfn(*pmd)); | ||
693 | |||
694 | pte = pte_offset_kernel(pmd, addr); | ||
695 | if (pte_none(*pte)) | ||
696 | return 0; | ||
697 | return pfn_valid(pte_pfn(*pte)); | ||
698 | } | ||
699 | |||
700 | /* A pseudo VMA to allow ptrace access for the vsyscall page. This only | ||
701 | covers the 64bit vsyscall page now. 32bit has a real VMA now and does | ||
702 | not need special handling anymore. */ | ||
703 | |||
704 | static struct vm_area_struct gate_vma = { | ||
705 | .vm_start = VSYSCALL_START, | ||
706 | .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), | ||
707 | .vm_page_prot = PAGE_READONLY_EXEC, | ||
708 | .vm_flags = VM_READ | VM_EXEC | ||
709 | }; | ||
710 | |||
711 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | ||
712 | { | ||
713 | #ifdef CONFIG_IA32_EMULATION | ||
714 | if (test_tsk_thread_flag(tsk, TIF_IA32)) | ||
715 | return NULL; | ||
716 | #endif | ||
717 | return &gate_vma; | ||
718 | } | ||
719 | |||
720 | int in_gate_area(struct task_struct *task, unsigned long addr) | ||
721 | { | ||
722 | struct vm_area_struct *vma = get_gate_vma(task); | ||
723 | if (!vma) | ||
724 | return 0; | ||
725 | return (addr >= vma->vm_start) && (addr < vma->vm_end); | ||
726 | } | ||
727 | |||
728 | /* Use this when you have no reliable task/vma, typically from interrupt | ||
729 | * context. It is less reliable than using the task's vma and may give | ||
730 | * false positives. | ||
731 | */ | ||
732 | int in_gate_area_no_task(unsigned long addr) | ||
733 | { | ||
734 | return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); | ||
735 | } | ||
736 | |||
737 | void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | ||
738 | { | ||
739 | return __alloc_bootmem_core(pgdat->bdata, size, | ||
740 | SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); | ||
741 | } | ||
742 | |||
743 | const char *arch_vma_name(struct vm_area_struct *vma) | ||
744 | { | ||
745 | if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) | ||
746 | return "[vdso]"; | ||
747 | if (vma == &gate_vma) | ||
748 | return "[vsyscall]"; | ||
749 | return NULL; | ||
750 | } | ||
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c deleted file mode 100644 index 6cac90aa5032..000000000000 --- a/arch/x86_64/mm/ioremap.c +++ /dev/null | |||
@@ -1,210 +0,0 @@ | |||
1 | /* | ||
2 | * arch/x86_64/mm/ioremap.c | ||
3 | * | ||
4 | * Re-map IO memory to kernel address space so that we can access it. | ||
5 | * This is needed for high PCI addresses that aren't mapped in the | ||
6 | * 640k-1MB IO memory area on PC's | ||
7 | * | ||
8 | * (C) Copyright 1995 1996 Linus Torvalds | ||
9 | */ | ||
10 | |||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/io.h> | ||
16 | |||
17 | #include <asm/pgalloc.h> | ||
18 | #include <asm/fixmap.h> | ||
19 | #include <asm/tlbflush.h> | ||
20 | #include <asm/cacheflush.h> | ||
21 | #include <asm/proto.h> | ||
22 | |||
23 | unsigned long __phys_addr(unsigned long x) | ||
24 | { | ||
25 | if (x >= __START_KERNEL_map) | ||
26 | return x - __START_KERNEL_map + phys_base; | ||
27 | return x - PAGE_OFFSET; | ||
28 | } | ||
29 | EXPORT_SYMBOL(__phys_addr); | ||
30 | |||
31 | #define ISA_START_ADDRESS 0xa0000 | ||
32 | #define ISA_END_ADDRESS 0x100000 | ||
33 | |||
34 | /* | ||
35 | * Fix up the linear direct mapping of the kernel to avoid cache attribute | ||
36 | * conflicts. | ||
37 | */ | ||
38 | static int | ||
39 | ioremap_change_attr(unsigned long phys_addr, unsigned long size, | ||
40 | unsigned long flags) | ||
41 | { | ||
42 | int err = 0; | ||
43 | if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { | ||
44 | unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
45 | unsigned long vaddr = (unsigned long) __va(phys_addr); | ||
46 | |||
47 | /* | ||
48 | * Must use a address here and not struct page because the phys addr | ||
49 | * can be a in hole between nodes and not have an memmap entry. | ||
50 | */ | ||
51 | err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); | ||
52 | if (!err) | ||
53 | global_flush_tlb(); | ||
54 | } | ||
55 | return err; | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * Generic mapping function | ||
60 | */ | ||
61 | |||
62 | /* | ||
63 | * Remap an arbitrary physical address space into the kernel virtual | ||
64 | * address space. Needed when the kernel wants to access high addresses | ||
65 | * directly. | ||
66 | * | ||
67 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | ||
68 | * have to convert them into an offset in a page-aligned mapping, but the | ||
69 | * caller shouldn't need to know that small detail. | ||
70 | */ | ||
71 | void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | ||
72 | { | ||
73 | void * addr; | ||
74 | struct vm_struct * area; | ||
75 | unsigned long offset, last_addr; | ||
76 | pgprot_t pgprot; | ||
77 | |||
78 | /* Don't allow wraparound or zero size */ | ||
79 | last_addr = phys_addr + size - 1; | ||
80 | if (!size || last_addr < phys_addr) | ||
81 | return NULL; | ||
82 | |||
83 | /* | ||
84 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
85 | */ | ||
86 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
87 | return (__force void __iomem *)phys_to_virt(phys_addr); | ||
88 | |||
89 | #ifdef CONFIG_FLATMEM | ||
90 | /* | ||
91 | * Don't allow anybody to remap normal RAM that we're using.. | ||
92 | */ | ||
93 | if (last_addr < virt_to_phys(high_memory)) { | ||
94 | char *t_addr, *t_end; | ||
95 | struct page *page; | ||
96 | |||
97 | t_addr = __va(phys_addr); | ||
98 | t_end = t_addr + (size - 1); | ||
99 | |||
100 | for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | ||
101 | if(!PageReserved(page)) | ||
102 | return NULL; | ||
103 | } | ||
104 | #endif | ||
105 | |||
106 | pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL | ||
107 | | _PAGE_DIRTY | _PAGE_ACCESSED | flags); | ||
108 | /* | ||
109 | * Mappings have to be page-aligned | ||
110 | */ | ||
111 | offset = phys_addr & ~PAGE_MASK; | ||
112 | phys_addr &= PAGE_MASK; | ||
113 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | ||
114 | |||
115 | /* | ||
116 | * Ok, go for it.. | ||
117 | */ | ||
118 | area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | ||
119 | if (!area) | ||
120 | return NULL; | ||
121 | area->phys_addr = phys_addr; | ||
122 | addr = area->addr; | ||
123 | if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, | ||
124 | phys_addr, pgprot)) { | ||
125 | remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); | ||
126 | return NULL; | ||
127 | } | ||
128 | if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { | ||
129 | area->flags &= 0xffffff; | ||
130 | vunmap(addr); | ||
131 | return NULL; | ||
132 | } | ||
133 | return (__force void __iomem *) (offset + (char *)addr); | ||
134 | } | ||
135 | EXPORT_SYMBOL(__ioremap); | ||
136 | |||
137 | /** | ||
138 | * ioremap_nocache - map bus memory into CPU space | ||
139 | * @offset: bus address of the memory | ||
140 | * @size: size of the resource to map | ||
141 | * | ||
142 | * ioremap_nocache performs a platform specific sequence of operations to | ||
143 | * make bus memory CPU accessible via the readb/readw/readl/writeb/ | ||
144 | * writew/writel functions and the other mmio helpers. The returned | ||
145 | * address is not guaranteed to be usable directly as a virtual | ||
146 | * address. | ||
147 | * | ||
148 | * This version of ioremap ensures that the memory is marked uncachable | ||
149 | * on the CPU as well as honouring existing caching rules from things like | ||
150 | * the PCI bus. Note that there are other caches and buffers on many | ||
151 | * busses. In particular driver authors should read up on PCI writes | ||
152 | * | ||
153 | * It's useful if some control registers are in such an area and | ||
154 | * write combining or read caching is not desirable: | ||
155 | * | ||
156 | * Must be freed with iounmap. | ||
157 | */ | ||
158 | |||
159 | void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) | ||
160 | { | ||
161 | return __ioremap(phys_addr, size, _PAGE_PCD); | ||
162 | } | ||
163 | EXPORT_SYMBOL(ioremap_nocache); | ||
164 | |||
165 | /** | ||
166 | * iounmap - Free a IO remapping | ||
167 | * @addr: virtual address from ioremap_* | ||
168 | * | ||
169 | * Caller must ensure there is only one unmapping for the same pointer. | ||
170 | */ | ||
171 | void iounmap(volatile void __iomem *addr) | ||
172 | { | ||
173 | struct vm_struct *p, *o; | ||
174 | |||
175 | if (addr <= high_memory) | ||
176 | return; | ||
177 | if (addr >= phys_to_virt(ISA_START_ADDRESS) && | ||
178 | addr < phys_to_virt(ISA_END_ADDRESS)) | ||
179 | return; | ||
180 | |||
181 | addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); | ||
182 | /* Use the vm area unlocked, assuming the caller | ||
183 | ensures there isn't another iounmap for the same address | ||
184 | in parallel. Reuse of the virtual address is prevented by | ||
185 | leaving it in the global lists until we're done with it. | ||
186 | cpa takes care of the direct mappings. */ | ||
187 | read_lock(&vmlist_lock); | ||
188 | for (p = vmlist; p; p = p->next) { | ||
189 | if (p->addr == addr) | ||
190 | break; | ||
191 | } | ||
192 | read_unlock(&vmlist_lock); | ||
193 | |||
194 | if (!p) { | ||
195 | printk("iounmap: bad address %p\n", addr); | ||
196 | dump_stack(); | ||
197 | return; | ||
198 | } | ||
199 | |||
200 | /* Reset the direct mapping. Can block */ | ||
201 | if (p->flags >> 20) | ||
202 | ioremap_change_attr(p->phys_addr, p->size, 0); | ||
203 | |||
204 | /* Finally remove it */ | ||
205 | o = remove_vm_area((void *)addr); | ||
206 | BUG_ON(p != o || o == NULL); | ||
207 | kfree(p); | ||
208 | } | ||
209 | EXPORT_SYMBOL(iounmap); | ||
210 | |||
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c deleted file mode 100644 index a96006f7ae0c..000000000000 --- a/arch/x86_64/mm/k8topology.c +++ /dev/null | |||
@@ -1,182 +0,0 @@ | |||
1 | /* | ||
2 | * AMD K8 NUMA support. | ||
3 | * Discover the memory map and associated nodes. | ||
4 | * | ||
5 | * This version reads it directly from the K8 northbridge. | ||
6 | * | ||
7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
8 | */ | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | #include <asm/io.h> | ||
15 | #include <linux/pci_ids.h> | ||
16 | #include <asm/types.h> | ||
17 | #include <asm/mmzone.h> | ||
18 | #include <asm/proto.h> | ||
19 | #include <asm/e820.h> | ||
20 | #include <asm/pci-direct.h> | ||
21 | #include <asm/numa.h> | ||
22 | |||
23 | static __init int find_northbridge(void) | ||
24 | { | ||
25 | int num; | ||
26 | |||
27 | for (num = 0; num < 32; num++) { | ||
28 | u32 header; | ||
29 | |||
30 | header = read_pci_config(0, num, 0, 0x00); | ||
31 | if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) | ||
32 | continue; | ||
33 | |||
34 | header = read_pci_config(0, num, 1, 0x00); | ||
35 | if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) | ||
36 | continue; | ||
37 | return num; | ||
38 | } | ||
39 | |||
40 | return -1; | ||
41 | } | ||
42 | |||
43 | int __init k8_scan_nodes(unsigned long start, unsigned long end) | ||
44 | { | ||
45 | unsigned long prevbase; | ||
46 | struct bootnode nodes[8]; | ||
47 | int nodeid, i, j, nb; | ||
48 | unsigned char nodeids[8]; | ||
49 | int found = 0; | ||
50 | u32 reg; | ||
51 | unsigned numnodes; | ||
52 | unsigned num_cores; | ||
53 | |||
54 | if (!early_pci_allowed()) | ||
55 | return -1; | ||
56 | |||
57 | nb = find_northbridge(); | ||
58 | if (nb < 0) | ||
59 | return nb; | ||
60 | |||
61 | printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); | ||
62 | |||
63 | num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; | ||
64 | printk(KERN_INFO "CPU has %d num_cores\n", num_cores); | ||
65 | |||
66 | reg = read_pci_config(0, nb, 0, 0x60); | ||
67 | numnodes = ((reg >> 4) & 0xF) + 1; | ||
68 | if (numnodes <= 1) | ||
69 | return -1; | ||
70 | |||
71 | printk(KERN_INFO "Number of nodes %d\n", numnodes); | ||
72 | |||
73 | memset(&nodes,0,sizeof(nodes)); | ||
74 | prevbase = 0; | ||
75 | for (i = 0; i < 8; i++) { | ||
76 | unsigned long base,limit; | ||
77 | u32 nodeid; | ||
78 | |||
79 | base = read_pci_config(0, nb, 1, 0x40 + i*8); | ||
80 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); | ||
81 | |||
82 | nodeid = limit & 7; | ||
83 | nodeids[i] = nodeid; | ||
84 | if ((base & 3) == 0) { | ||
85 | if (i < numnodes) | ||
86 | printk("Skipping disabled node %d\n", i); | ||
87 | continue; | ||
88 | } | ||
89 | if (nodeid >= numnodes) { | ||
90 | printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, | ||
91 | base, limit); | ||
92 | continue; | ||
93 | } | ||
94 | |||
95 | if (!limit) { | ||
96 | printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, | ||
97 | base); | ||
98 | continue; | ||
99 | } | ||
100 | if ((base >> 8) & 3 || (limit >> 8) & 3) { | ||
101 | printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", | ||
102 | nodeid, (base>>8)&3, (limit>>8) & 3); | ||
103 | return -1; | ||
104 | } | ||
105 | if (node_isset(nodeid, node_possible_map)) { | ||
106 | printk(KERN_INFO "Node %d already present. Skipping\n", | ||
107 | nodeid); | ||
108 | continue; | ||
109 | } | ||
110 | |||
111 | limit >>= 16; | ||
112 | limit <<= 24; | ||
113 | limit |= (1<<24)-1; | ||
114 | limit++; | ||
115 | |||
116 | if (limit > end_pfn << PAGE_SHIFT) | ||
117 | limit = end_pfn << PAGE_SHIFT; | ||
118 | if (limit <= base) | ||
119 | continue; | ||
120 | |||
121 | base >>= 16; | ||
122 | base <<= 24; | ||
123 | |||
124 | if (base < start) | ||
125 | base = start; | ||
126 | if (limit > end) | ||
127 | limit = end; | ||
128 | if (limit == base) { | ||
129 | printk(KERN_ERR "Empty node %d\n", nodeid); | ||
130 | continue; | ||
131 | } | ||
132 | if (limit < base) { | ||
133 | printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", | ||
134 | nodeid, base, limit); | ||
135 | continue; | ||
136 | } | ||
137 | |||
138 | /* Could sort here, but pun for now. Should not happen anyroads. */ | ||
139 | if (prevbase > base) { | ||
140 | printk(KERN_ERR "Node map not sorted %lx,%lx\n", | ||
141 | prevbase,base); | ||
142 | return -1; | ||
143 | } | ||
144 | |||
145 | printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", | ||
146 | nodeid, base, limit); | ||
147 | |||
148 | found++; | ||
149 | |||
150 | nodes[nodeid].start = base; | ||
151 | nodes[nodeid].end = limit; | ||
152 | e820_register_active_regions(nodeid, | ||
153 | nodes[nodeid].start >> PAGE_SHIFT, | ||
154 | nodes[nodeid].end >> PAGE_SHIFT); | ||
155 | |||
156 | prevbase = base; | ||
157 | |||
158 | node_set(nodeid, node_possible_map); | ||
159 | } | ||
160 | |||
161 | if (!found) | ||
162 | return -1; | ||
163 | |||
164 | memnode_shift = compute_hash_shift(nodes, 8); | ||
165 | if (memnode_shift < 0) { | ||
166 | printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); | ||
167 | return -1; | ||
168 | } | ||
169 | printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); | ||
170 | |||
171 | for (i = 0; i < 8; i++) { | ||
172 | if (nodes[i].start != nodes[i].end) { | ||
173 | nodeid = nodeids[i]; | ||
174 | for (j = 0; j < num_cores; j++) | ||
175 | apicid_to_node[(nodeid * num_cores) + j] = i; | ||
176 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
177 | } | ||
178 | } | ||
179 | |||
180 | numa_init_array(); | ||
181 | return 0; | ||
182 | } | ||
diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c deleted file mode 100644 index 80bba0dc000e..000000000000 --- a/arch/x86_64/mm/mmap.c +++ /dev/null | |||
@@ -1,29 +0,0 @@ | |||
1 | /* Copyright 2005 Andi Kleen, SuSE Labs. | ||
2 | * Licensed under GPL, v.2 | ||
3 | */ | ||
4 | #include <linux/mm.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/random.h> | ||
7 | #include <asm/ia32.h> | ||
8 | |||
9 | /* Notebook: move the mmap code from sys_x86_64.c over here. */ | ||
10 | |||
11 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
12 | { | ||
13 | #ifdef CONFIG_IA32_EMULATION | ||
14 | if (current_thread_info()->flags & _TIF_IA32) | ||
15 | return ia32_pick_mmap_layout(mm); | ||
16 | #endif | ||
17 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
18 | if (current->flags & PF_RANDOMIZE) { | ||
19 | /* Add 28bit randomness which is about 40bits of address space | ||
20 | because mmap base has to be page aligned. | ||
21 | or ~1/128 of the total user VM | ||
22 | (total user address space is 47bits) */ | ||
23 | unsigned rnd = get_random_int() & 0xfffffff; | ||
24 | mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; | ||
25 | } | ||
26 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
27 | mm->unmap_area = arch_unmap_area; | ||
28 | } | ||
29 | |||
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c deleted file mode 100644 index 6da235522269..000000000000 --- a/arch/x86_64/mm/numa.c +++ /dev/null | |||
@@ -1,648 +0,0 @@ | |||
1 | /* | ||
2 | * Generic VM initialization for x86-64 NUMA setups. | ||
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | */ | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/bootmem.h> | ||
10 | #include <linux/mmzone.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | |||
15 | #include <asm/e820.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/dma.h> | ||
18 | #include <asm/numa.h> | ||
19 | #include <asm/acpi.h> | ||
20 | |||
21 | #ifndef Dprintk | ||
22 | #define Dprintk(x...) | ||
23 | #endif | ||
24 | |||
25 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | ||
26 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | ||
27 | |||
28 | struct memnode memnode; | ||
29 | |||
30 | unsigned char cpu_to_node[NR_CPUS] __read_mostly = { | ||
31 | [0 ... NR_CPUS-1] = NUMA_NO_NODE | ||
32 | }; | ||
33 | unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
34 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
35 | }; | ||
36 | cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; | ||
37 | |||
38 | int numa_off __initdata; | ||
39 | unsigned long __initdata nodemap_addr; | ||
40 | unsigned long __initdata nodemap_size; | ||
41 | |||
42 | |||
43 | /* | ||
44 | * Given a shift value, try to populate memnodemap[] | ||
45 | * Returns : | ||
46 | * 1 if OK | ||
47 | * 0 if memnodmap[] too small (of shift too small) | ||
48 | * -1 if node overlap or lost ram (shift too big) | ||
49 | */ | ||
50 | static int __init | ||
51 | populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) | ||
52 | { | ||
53 | int i; | ||
54 | int res = -1; | ||
55 | unsigned long addr, end; | ||
56 | |||
57 | memset(memnodemap, 0xff, memnodemapsize); | ||
58 | for (i = 0; i < numnodes; i++) { | ||
59 | addr = nodes[i].start; | ||
60 | end = nodes[i].end; | ||
61 | if (addr >= end) | ||
62 | continue; | ||
63 | if ((end >> shift) >= memnodemapsize) | ||
64 | return 0; | ||
65 | do { | ||
66 | if (memnodemap[addr >> shift] != 0xff) | ||
67 | return -1; | ||
68 | memnodemap[addr >> shift] = i; | ||
69 | addr += (1UL << shift); | ||
70 | } while (addr < end); | ||
71 | res = 1; | ||
72 | } | ||
73 | return res; | ||
74 | } | ||
75 | |||
76 | static int __init allocate_cachealigned_memnodemap(void) | ||
77 | { | ||
78 | unsigned long pad, pad_addr; | ||
79 | |||
80 | memnodemap = memnode.embedded_map; | ||
81 | if (memnodemapsize <= 48) | ||
82 | return 0; | ||
83 | |||
84 | pad = L1_CACHE_BYTES - 1; | ||
85 | pad_addr = 0x8000; | ||
86 | nodemap_size = pad + memnodemapsize; | ||
87 | nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, | ||
88 | nodemap_size); | ||
89 | if (nodemap_addr == -1UL) { | ||
90 | printk(KERN_ERR | ||
91 | "NUMA: Unable to allocate Memory to Node hash map\n"); | ||
92 | nodemap_addr = nodemap_size = 0; | ||
93 | return -1; | ||
94 | } | ||
95 | pad_addr = (nodemap_addr + pad) & ~pad; | ||
96 | memnodemap = phys_to_virt(pad_addr); | ||
97 | |||
98 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | ||
99 | nodemap_addr, nodemap_addr + nodemap_size); | ||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * The LSB of all start and end addresses in the node map is the value of the | ||
105 | * maximum possible shift. | ||
106 | */ | ||
107 | static int __init | ||
108 | extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) | ||
109 | { | ||
110 | int i, nodes_used = 0; | ||
111 | unsigned long start, end; | ||
112 | unsigned long bitfield = 0, memtop = 0; | ||
113 | |||
114 | for (i = 0; i < numnodes; i++) { | ||
115 | start = nodes[i].start; | ||
116 | end = nodes[i].end; | ||
117 | if (start >= end) | ||
118 | continue; | ||
119 | bitfield |= start; | ||
120 | nodes_used++; | ||
121 | if (end > memtop) | ||
122 | memtop = end; | ||
123 | } | ||
124 | if (nodes_used <= 1) | ||
125 | i = 63; | ||
126 | else | ||
127 | i = find_first_bit(&bitfield, sizeof(unsigned long)*8); | ||
128 | memnodemapsize = (memtop >> i)+1; | ||
129 | return i; | ||
130 | } | ||
131 | |||
132 | int __init compute_hash_shift(struct bootnode *nodes, int numnodes) | ||
133 | { | ||
134 | int shift; | ||
135 | |||
136 | shift = extract_lsb_from_nodes(nodes, numnodes); | ||
137 | if (allocate_cachealigned_memnodemap()) | ||
138 | return -1; | ||
139 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", | ||
140 | shift); | ||
141 | |||
142 | if (populate_memnodemap(nodes, numnodes, shift) != 1) { | ||
143 | printk(KERN_INFO | ||
144 | "Your memory is not aligned you need to rebuild your kernel " | ||
145 | "with a bigger NODEMAPSIZE shift=%d\n", | ||
146 | shift); | ||
147 | return -1; | ||
148 | } | ||
149 | return shift; | ||
150 | } | ||
151 | |||
152 | #ifdef CONFIG_SPARSEMEM | ||
153 | int early_pfn_to_nid(unsigned long pfn) | ||
154 | { | ||
155 | return phys_to_nid(pfn << PAGE_SHIFT); | ||
156 | } | ||
157 | #endif | ||
158 | |||
159 | static void * __init | ||
160 | early_node_mem(int nodeid, unsigned long start, unsigned long end, | ||
161 | unsigned long size) | ||
162 | { | ||
163 | unsigned long mem = find_e820_area(start, end, size); | ||
164 | void *ptr; | ||
165 | if (mem != -1L) | ||
166 | return __va(mem); | ||
167 | ptr = __alloc_bootmem_nopanic(size, | ||
168 | SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); | ||
169 | if (ptr == 0) { | ||
170 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", | ||
171 | size, nodeid); | ||
172 | return NULL; | ||
173 | } | ||
174 | return ptr; | ||
175 | } | ||
176 | |||
177 | /* Initialize bootmem allocator for a node */ | ||
178 | void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | ||
179 | { | ||
180 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; | ||
181 | unsigned long nodedata_phys; | ||
182 | void *bootmap; | ||
183 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | ||
184 | |||
185 | start = round_up(start, ZONE_ALIGN); | ||
186 | |||
187 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); | ||
188 | |||
189 | start_pfn = start >> PAGE_SHIFT; | ||
190 | end_pfn = end >> PAGE_SHIFT; | ||
191 | |||
192 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); | ||
193 | if (node_data[nodeid] == NULL) | ||
194 | return; | ||
195 | nodedata_phys = __pa(node_data[nodeid]); | ||
196 | |||
197 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | ||
198 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; | ||
199 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | ||
200 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | ||
201 | |||
202 | /* Find a place for the bootmem map */ | ||
203 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | ||
204 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | ||
205 | bootmap = early_node_mem(nodeid, bootmap_start, end, | ||
206 | bootmap_pages<<PAGE_SHIFT); | ||
207 | if (bootmap == NULL) { | ||
208 | if (nodedata_phys < start || nodedata_phys >= end) | ||
209 | free_bootmem((unsigned long)node_data[nodeid],pgdat_size); | ||
210 | node_data[nodeid] = NULL; | ||
211 | return; | ||
212 | } | ||
213 | bootmap_start = __pa(bootmap); | ||
214 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); | ||
215 | |||
216 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
217 | bootmap_start >> PAGE_SHIFT, | ||
218 | start_pfn, end_pfn); | ||
219 | |||
220 | free_bootmem_with_active_regions(nodeid, end); | ||
221 | |||
222 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | ||
223 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | ||
224 | #ifdef CONFIG_ACPI_NUMA | ||
225 | srat_reserve_add_area(nodeid); | ||
226 | #endif | ||
227 | node_set_online(nodeid); | ||
228 | } | ||
229 | |||
230 | /* Initialize final allocator for a zone */ | ||
231 | void __init setup_node_zones(int nodeid) | ||
232 | { | ||
233 | unsigned long start_pfn, end_pfn, memmapsize, limit; | ||
234 | |||
235 | start_pfn = node_start_pfn(nodeid); | ||
236 | end_pfn = node_end_pfn(nodeid); | ||
237 | |||
238 | Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", | ||
239 | nodeid, start_pfn, end_pfn); | ||
240 | |||
241 | /* Try to allocate mem_map at end to not fill up precious <4GB | ||
242 | memory. */ | ||
243 | memmapsize = sizeof(struct page) * (end_pfn-start_pfn); | ||
244 | limit = end_pfn << PAGE_SHIFT; | ||
245 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
246 | NODE_DATA(nodeid)->node_mem_map = | ||
247 | __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, | ||
248 | memmapsize, SMP_CACHE_BYTES, | ||
249 | round_down(limit - memmapsize, PAGE_SIZE), | ||
250 | limit); | ||
251 | #endif | ||
252 | } | ||
253 | |||
254 | void __init numa_init_array(void) | ||
255 | { | ||
256 | int rr, i; | ||
257 | /* There are unfortunately some poorly designed mainboards around | ||
258 | that only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
259 | mapping. To avoid this fill in the mapping for all possible | ||
260 | CPUs, as the number of CPUs is not known yet. | ||
261 | We round robin the existing nodes. */ | ||
262 | rr = first_node(node_online_map); | ||
263 | for (i = 0; i < NR_CPUS; i++) { | ||
264 | if (cpu_to_node[i] != NUMA_NO_NODE) | ||
265 | continue; | ||
266 | numa_set_node(i, rr); | ||
267 | rr = next_node(rr, node_online_map); | ||
268 | if (rr == MAX_NUMNODES) | ||
269 | rr = first_node(node_online_map); | ||
270 | } | ||
271 | |||
272 | } | ||
273 | |||
274 | #ifdef CONFIG_NUMA_EMU | ||
275 | /* Numa emulation */ | ||
276 | char *cmdline __initdata; | ||
277 | |||
278 | /* | ||
279 | * Setups up nid to range from addr to addr + size. If the end boundary is | ||
280 | * greater than max_addr, then max_addr is used instead. The return value is 0 | ||
281 | * if there is additional memory left for allocation past addr and -1 otherwise. | ||
282 | * addr is adjusted to be at the end of the node. | ||
283 | */ | ||
284 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, | ||
285 | u64 size, u64 max_addr) | ||
286 | { | ||
287 | int ret = 0; | ||
288 | nodes[nid].start = *addr; | ||
289 | *addr += size; | ||
290 | if (*addr >= max_addr) { | ||
291 | *addr = max_addr; | ||
292 | ret = -1; | ||
293 | } | ||
294 | nodes[nid].end = *addr; | ||
295 | node_set(nid, node_possible_map); | ||
296 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
297 | nodes[nid].start, nodes[nid].end, | ||
298 | (nodes[nid].end - nodes[nid].start) >> 20); | ||
299 | return ret; | ||
300 | } | ||
301 | |||
302 | /* | ||
303 | * Splits num_nodes nodes up equally starting at node_start. The return value | ||
304 | * is the number of nodes split up and addr is adjusted to be at the end of the | ||
305 | * last node allocated. | ||
306 | */ | ||
307 | static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | ||
308 | u64 max_addr, int node_start, | ||
309 | int num_nodes) | ||
310 | { | ||
311 | unsigned int big; | ||
312 | u64 size; | ||
313 | int i; | ||
314 | |||
315 | if (num_nodes <= 0) | ||
316 | return -1; | ||
317 | if (num_nodes > MAX_NUMNODES) | ||
318 | num_nodes = MAX_NUMNODES; | ||
319 | size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / | ||
320 | num_nodes; | ||
321 | /* | ||
322 | * Calculate the number of big nodes that can be allocated as a result | ||
323 | * of consolidating the leftovers. | ||
324 | */ | ||
325 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / | ||
326 | FAKE_NODE_MIN_SIZE; | ||
327 | |||
328 | /* Round down to nearest FAKE_NODE_MIN_SIZE. */ | ||
329 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
330 | if (!size) { | ||
331 | printk(KERN_ERR "Not enough memory for each node. " | ||
332 | "NUMA emulation disabled.\n"); | ||
333 | return -1; | ||
334 | } | ||
335 | |||
336 | for (i = node_start; i < num_nodes + node_start; i++) { | ||
337 | u64 end = *addr + size; | ||
338 | if (i < big) | ||
339 | end += FAKE_NODE_MIN_SIZE; | ||
340 | /* | ||
341 | * The final node can have the remaining system RAM. Other | ||
342 | * nodes receive roughly the same amount of available pages. | ||
343 | */ | ||
344 | if (i == num_nodes + node_start - 1) | ||
345 | end = max_addr; | ||
346 | else | ||
347 | while (end - *addr - e820_hole_size(*addr, end) < | ||
348 | size) { | ||
349 | end += FAKE_NODE_MIN_SIZE; | ||
350 | if (end > max_addr) { | ||
351 | end = max_addr; | ||
352 | break; | ||
353 | } | ||
354 | } | ||
355 | if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) | ||
356 | break; | ||
357 | } | ||
358 | return i - node_start + 1; | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Splits the remaining system RAM into chunks of size. The remaining memory is | ||
363 | * always assigned to a final node and can be asymmetric. Returns the number of | ||
364 | * nodes split. | ||
365 | */ | ||
366 | static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | ||
367 | u64 max_addr, int node_start, u64 size) | ||
368 | { | ||
369 | int i = node_start; | ||
370 | size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; | ||
371 | while (!setup_node_range(i++, nodes, addr, size, max_addr)) | ||
372 | ; | ||
373 | return i - node_start; | ||
374 | } | ||
375 | |||
376 | /* | ||
377 | * Sets up the system RAM area from start_pfn to end_pfn according to the | ||
378 | * numa=fake command-line option. | ||
379 | */ | ||
380 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | ||
381 | { | ||
382 | struct bootnode nodes[MAX_NUMNODES]; | ||
383 | u64 addr = start_pfn << PAGE_SHIFT; | ||
384 | u64 max_addr = end_pfn << PAGE_SHIFT; | ||
385 | int num_nodes = 0; | ||
386 | int coeff_flag; | ||
387 | int coeff = -1; | ||
388 | int num = 0; | ||
389 | u64 size; | ||
390 | int i; | ||
391 | |||
392 | memset(&nodes, 0, sizeof(nodes)); | ||
393 | /* | ||
394 | * If the numa=fake command-line is just a single number N, split the | ||
395 | * system RAM into N fake nodes. | ||
396 | */ | ||
397 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { | ||
398 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, | ||
399 | simple_strtol(cmdline, NULL, 0)); | ||
400 | if (num_nodes < 0) | ||
401 | return num_nodes; | ||
402 | goto out; | ||
403 | } | ||
404 | |||
405 | /* Parse the command line. */ | ||
406 | for (coeff_flag = 0; ; cmdline++) { | ||
407 | if (*cmdline && isdigit(*cmdline)) { | ||
408 | num = num * 10 + *cmdline - '0'; | ||
409 | continue; | ||
410 | } | ||
411 | if (*cmdline == '*') { | ||
412 | if (num > 0) | ||
413 | coeff = num; | ||
414 | coeff_flag = 1; | ||
415 | } | ||
416 | if (!*cmdline || *cmdline == ',') { | ||
417 | if (!coeff_flag) | ||
418 | coeff = 1; | ||
419 | /* | ||
420 | * Round down to the nearest FAKE_NODE_MIN_SIZE. | ||
421 | * Command-line coefficients are in megabytes. | ||
422 | */ | ||
423 | size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; | ||
424 | if (size) | ||
425 | for (i = 0; i < coeff; i++, num_nodes++) | ||
426 | if (setup_node_range(num_nodes, nodes, | ||
427 | &addr, size, max_addr) < 0) | ||
428 | goto done; | ||
429 | if (!*cmdline) | ||
430 | break; | ||
431 | coeff_flag = 0; | ||
432 | coeff = -1; | ||
433 | } | ||
434 | num = 0; | ||
435 | } | ||
436 | done: | ||
437 | if (!num_nodes) | ||
438 | return -1; | ||
439 | /* Fill remainder of system RAM, if appropriate. */ | ||
440 | if (addr < max_addr) { | ||
441 | if (coeff_flag && coeff < 0) { | ||
442 | /* Split remaining nodes into num-sized chunks */ | ||
443 | num_nodes += split_nodes_by_size(nodes, &addr, max_addr, | ||
444 | num_nodes, num); | ||
445 | goto out; | ||
446 | } | ||
447 | switch (*(cmdline - 1)) { | ||
448 | case '*': | ||
449 | /* Split remaining nodes into coeff chunks */ | ||
450 | if (coeff <= 0) | ||
451 | break; | ||
452 | num_nodes += split_nodes_equally(nodes, &addr, max_addr, | ||
453 | num_nodes, coeff); | ||
454 | break; | ||
455 | case ',': | ||
456 | /* Do not allocate remaining system RAM */ | ||
457 | break; | ||
458 | default: | ||
459 | /* Give one final node */ | ||
460 | setup_node_range(num_nodes, nodes, &addr, | ||
461 | max_addr - addr, max_addr); | ||
462 | num_nodes++; | ||
463 | } | ||
464 | } | ||
465 | out: | ||
466 | memnode_shift = compute_hash_shift(nodes, num_nodes); | ||
467 | if (memnode_shift < 0) { | ||
468 | memnode_shift = 0; | ||
469 | printk(KERN_ERR "No NUMA hash function found. NUMA emulation " | ||
470 | "disabled.\n"); | ||
471 | return -1; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * We need to vacate all active ranges that may have been registered by | ||
476 | * SRAT and set acpi_numa to -1 so that srat_disabled() always returns | ||
477 | * true. NUMA emulation has succeeded so we will not scan ACPI nodes. | ||
478 | */ | ||
479 | remove_all_active_ranges(); | ||
480 | #ifdef CONFIG_ACPI_NUMA | ||
481 | acpi_numa = -1; | ||
482 | #endif | ||
483 | for_each_node_mask(i, node_possible_map) { | ||
484 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | ||
485 | nodes[i].end >> PAGE_SHIFT); | ||
486 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
487 | } | ||
488 | acpi_fake_nodes(nodes, num_nodes); | ||
489 | numa_init_array(); | ||
490 | return 0; | ||
491 | } | ||
492 | #endif /* CONFIG_NUMA_EMU */ | ||
493 | |||
494 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
495 | { | ||
496 | int i; | ||
497 | |||
498 | nodes_clear(node_possible_map); | ||
499 | |||
500 | #ifdef CONFIG_NUMA_EMU | ||
501 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) | ||
502 | return; | ||
503 | nodes_clear(node_possible_map); | ||
504 | #endif | ||
505 | |||
506 | #ifdef CONFIG_ACPI_NUMA | ||
507 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | ||
508 | end_pfn << PAGE_SHIFT)) | ||
509 | return; | ||
510 | nodes_clear(node_possible_map); | ||
511 | #endif | ||
512 | |||
513 | #ifdef CONFIG_K8_NUMA | ||
514 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) | ||
515 | return; | ||
516 | nodes_clear(node_possible_map); | ||
517 | #endif | ||
518 | printk(KERN_INFO "%s\n", | ||
519 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
520 | |||
521 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | ||
522 | start_pfn << PAGE_SHIFT, | ||
523 | end_pfn << PAGE_SHIFT); | ||
524 | /* setup dummy node covering all memory */ | ||
525 | memnode_shift = 63; | ||
526 | memnodemap = memnode.embedded_map; | ||
527 | memnodemap[0] = 0; | ||
528 | nodes_clear(node_online_map); | ||
529 | node_set_online(0); | ||
530 | node_set(0, node_possible_map); | ||
531 | for (i = 0; i < NR_CPUS; i++) | ||
532 | numa_set_node(i, 0); | ||
533 | node_to_cpumask[0] = cpumask_of_cpu(0); | ||
534 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
535 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | ||
536 | } | ||
537 | |||
538 | __cpuinit void numa_add_cpu(int cpu) | ||
539 | { | ||
540 | set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); | ||
541 | } | ||
542 | |||
543 | void __cpuinit numa_set_node(int cpu, int node) | ||
544 | { | ||
545 | cpu_pda(cpu)->nodenumber = node; | ||
546 | cpu_to_node[cpu] = node; | ||
547 | } | ||
548 | |||
549 | unsigned long __init numa_free_all_bootmem(void) | ||
550 | { | ||
551 | int i; | ||
552 | unsigned long pages = 0; | ||
553 | for_each_online_node(i) { | ||
554 | pages += free_all_bootmem_node(NODE_DATA(i)); | ||
555 | } | ||
556 | return pages; | ||
557 | } | ||
558 | |||
559 | void __init paging_init(void) | ||
560 | { | ||
561 | int i; | ||
562 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
563 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
564 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | ||
565 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | ||
566 | max_zone_pfns[ZONE_NORMAL] = end_pfn; | ||
567 | |||
568 | sparse_memory_present_with_active_regions(MAX_NUMNODES); | ||
569 | sparse_init(); | ||
570 | |||
571 | for_each_online_node(i) { | ||
572 | setup_node_zones(i); | ||
573 | } | ||
574 | |||
575 | free_area_init_nodes(max_zone_pfns); | ||
576 | } | ||
577 | |||
578 | static __init int numa_setup(char *opt) | ||
579 | { | ||
580 | if (!opt) | ||
581 | return -EINVAL; | ||
582 | if (!strncmp(opt,"off",3)) | ||
583 | numa_off = 1; | ||
584 | #ifdef CONFIG_NUMA_EMU | ||
585 | if (!strncmp(opt, "fake=", 5)) | ||
586 | cmdline = opt + 5; | ||
587 | #endif | ||
588 | #ifdef CONFIG_ACPI_NUMA | ||
589 | if (!strncmp(opt,"noacpi",6)) | ||
590 | acpi_numa = -1; | ||
591 | if (!strncmp(opt,"hotadd=", 7)) | ||
592 | hotadd_percent = simple_strtoul(opt+7, NULL, 10); | ||
593 | #endif | ||
594 | return 0; | ||
595 | } | ||
596 | |||
597 | early_param("numa", numa_setup); | ||
598 | |||
599 | /* | ||
600 | * Setup early cpu_to_node. | ||
601 | * | ||
602 | * Populate cpu_to_node[] only if x86_cpu_to_apicid[], | ||
603 | * and apicid_to_node[] tables have valid entries for a CPU. | ||
604 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
605 | * emulation and faking node case (when running a kernel compiled | ||
606 | * for NUMA on a non NUMA box), which is OK as cpu_to_node[] | ||
607 | * is already initialized in a round robin manner at numa_init_array, | ||
608 | * prior to this call, and this initialization is good enough | ||
609 | * for the fake NUMA cases. | ||
610 | */ | ||
611 | void __init init_cpu_to_node(void) | ||
612 | { | ||
613 | int i; | ||
614 | for (i = 0; i < NR_CPUS; i++) { | ||
615 | u8 apicid = x86_cpu_to_apicid[i]; | ||
616 | if (apicid == BAD_APICID) | ||
617 | continue; | ||
618 | if (apicid_to_node[apicid] == NUMA_NO_NODE) | ||
619 | continue; | ||
620 | numa_set_node(i,apicid_to_node[apicid]); | ||
621 | } | ||
622 | } | ||
623 | |||
624 | EXPORT_SYMBOL(cpu_to_node); | ||
625 | EXPORT_SYMBOL(node_to_cpumask); | ||
626 | EXPORT_SYMBOL(memnode); | ||
627 | EXPORT_SYMBOL(node_data); | ||
628 | |||
629 | #ifdef CONFIG_DISCONTIGMEM | ||
630 | /* | ||
631 | * Functions to convert PFNs from/to per node page addresses. | ||
632 | * These are out of line because they are quite big. | ||
633 | * They could be all tuned by pre caching more state. | ||
634 | * Should do that. | ||
635 | */ | ||
636 | |||
637 | int pfn_valid(unsigned long pfn) | ||
638 | { | ||
639 | unsigned nid; | ||
640 | if (pfn >= num_physpages) | ||
641 | return 0; | ||
642 | nid = pfn_to_nid(pfn); | ||
643 | if (nid == 0xff) | ||
644 | return 0; | ||
645 | return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); | ||
646 | } | ||
647 | EXPORT_SYMBOL(pfn_valid); | ||
648 | #endif | ||
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c deleted file mode 100644 index 10b9809ce821..000000000000 --- a/arch/x86_64/mm/pageattr.c +++ /dev/null | |||
@@ -1,249 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
3 | * Thanks to Ben LaHaise for precious feedback. | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/highmem.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <asm/uaccess.h> | ||
12 | #include <asm/processor.h> | ||
13 | #include <asm/tlbflush.h> | ||
14 | #include <asm/io.h> | ||
15 | |||
16 | pte_t *lookup_address(unsigned long address) | ||
17 | { | ||
18 | pgd_t *pgd = pgd_offset_k(address); | ||
19 | pud_t *pud; | ||
20 | pmd_t *pmd; | ||
21 | pte_t *pte; | ||
22 | if (pgd_none(*pgd)) | ||
23 | return NULL; | ||
24 | pud = pud_offset(pgd, address); | ||
25 | if (!pud_present(*pud)) | ||
26 | return NULL; | ||
27 | pmd = pmd_offset(pud, address); | ||
28 | if (!pmd_present(*pmd)) | ||
29 | return NULL; | ||
30 | if (pmd_large(*pmd)) | ||
31 | return (pte_t *)pmd; | ||
32 | pte = pte_offset_kernel(pmd, address); | ||
33 | if (pte && !pte_present(*pte)) | ||
34 | pte = NULL; | ||
35 | return pte; | ||
36 | } | ||
37 | |||
38 | static struct page *split_large_page(unsigned long address, pgprot_t prot, | ||
39 | pgprot_t ref_prot) | ||
40 | { | ||
41 | int i; | ||
42 | unsigned long addr; | ||
43 | struct page *base = alloc_pages(GFP_KERNEL, 0); | ||
44 | pte_t *pbase; | ||
45 | if (!base) | ||
46 | return NULL; | ||
47 | /* | ||
48 | * page_private is used to track the number of entries in | ||
49 | * the page table page have non standard attributes. | ||
50 | */ | ||
51 | SetPagePrivate(base); | ||
52 | page_private(base) = 0; | ||
53 | |||
54 | address = __pa(address); | ||
55 | addr = address & LARGE_PAGE_MASK; | ||
56 | pbase = (pte_t *)page_address(base); | ||
57 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | ||
58 | pbase[i] = pfn_pte(addr >> PAGE_SHIFT, | ||
59 | addr == address ? prot : ref_prot); | ||
60 | } | ||
61 | return base; | ||
62 | } | ||
63 | |||
64 | static void cache_flush_page(void *adr) | ||
65 | { | ||
66 | int i; | ||
67 | for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) | ||
68 | asm volatile("clflush (%0)" :: "r" (adr + i)); | ||
69 | } | ||
70 | |||
71 | static void flush_kernel_map(void *arg) | ||
72 | { | ||
73 | struct list_head *l = (struct list_head *)arg; | ||
74 | struct page *pg; | ||
75 | |||
76 | /* When clflush is available always use it because it is | ||
77 | much cheaper than WBINVD. */ | ||
78 | /* clflush is still broken. Disable for now. */ | ||
79 | if (1 || !cpu_has_clflush) | ||
80 | asm volatile("wbinvd" ::: "memory"); | ||
81 | else list_for_each_entry(pg, l, lru) { | ||
82 | void *adr = page_address(pg); | ||
83 | cache_flush_page(adr); | ||
84 | } | ||
85 | __flush_tlb_all(); | ||
86 | } | ||
87 | |||
88 | static inline void flush_map(struct list_head *l) | ||
89 | { | ||
90 | on_each_cpu(flush_kernel_map, l, 1, 1); | ||
91 | } | ||
92 | |||
93 | static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ | ||
94 | |||
95 | static inline void save_page(struct page *fpage) | ||
96 | { | ||
97 | if (!test_and_set_bit(PG_arch_1, &fpage->flags)) | ||
98 | list_add(&fpage->lru, &deferred_pages); | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * No more special protections in this 2/4MB area - revert to a | ||
103 | * large page again. | ||
104 | */ | ||
105 | static void revert_page(unsigned long address, pgprot_t ref_prot) | ||
106 | { | ||
107 | pgd_t *pgd; | ||
108 | pud_t *pud; | ||
109 | pmd_t *pmd; | ||
110 | pte_t large_pte; | ||
111 | unsigned long pfn; | ||
112 | |||
113 | pgd = pgd_offset_k(address); | ||
114 | BUG_ON(pgd_none(*pgd)); | ||
115 | pud = pud_offset(pgd,address); | ||
116 | BUG_ON(pud_none(*pud)); | ||
117 | pmd = pmd_offset(pud, address); | ||
118 | BUG_ON(pmd_val(*pmd) & _PAGE_PSE); | ||
119 | pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; | ||
120 | large_pte = pfn_pte(pfn, ref_prot); | ||
121 | large_pte = pte_mkhuge(large_pte); | ||
122 | set_pte((pte_t *)pmd, large_pte); | ||
123 | } | ||
124 | |||
125 | static int | ||
126 | __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, | ||
127 | pgprot_t ref_prot) | ||
128 | { | ||
129 | pte_t *kpte; | ||
130 | struct page *kpte_page; | ||
131 | pgprot_t ref_prot2; | ||
132 | |||
133 | kpte = lookup_address(address); | ||
134 | if (!kpte) return 0; | ||
135 | kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); | ||
136 | BUG_ON(PageLRU(kpte_page)); | ||
137 | BUG_ON(PageCompound(kpte_page)); | ||
138 | if (pgprot_val(prot) != pgprot_val(ref_prot)) { | ||
139 | if (!pte_huge(*kpte)) { | ||
140 | set_pte(kpte, pfn_pte(pfn, prot)); | ||
141 | } else { | ||
142 | /* | ||
143 | * split_large_page will take the reference for this | ||
144 | * change_page_attr on the split page. | ||
145 | */ | ||
146 | struct page *split; | ||
147 | ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); | ||
148 | split = split_large_page(address, prot, ref_prot2); | ||
149 | if (!split) | ||
150 | return -ENOMEM; | ||
151 | set_pte(kpte, mk_pte(split, ref_prot2)); | ||
152 | kpte_page = split; | ||
153 | } | ||
154 | page_private(kpte_page)++; | ||
155 | } else if (!pte_huge(*kpte)) { | ||
156 | set_pte(kpte, pfn_pte(pfn, ref_prot)); | ||
157 | BUG_ON(page_private(kpte_page) == 0); | ||
158 | page_private(kpte_page)--; | ||
159 | } else | ||
160 | BUG(); | ||
161 | |||
162 | /* on x86-64 the direct mapping set at boot is not using 4k pages */ | ||
163 | BUG_ON(PageReserved(kpte_page)); | ||
164 | |||
165 | save_page(kpte_page); | ||
166 | if (page_private(kpte_page) == 0) | ||
167 | revert_page(address, ref_prot); | ||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | /* | ||
172 | * Change the page attributes of an page in the linear mapping. | ||
173 | * | ||
174 | * This should be used when a page is mapped with a different caching policy | ||
175 | * than write-back somewhere - some CPUs do not like it when mappings with | ||
176 | * different caching policies exist. This changes the page attributes of the | ||
177 | * in kernel linear mapping too. | ||
178 | * | ||
179 | * The caller needs to ensure that there are no conflicting mappings elsewhere. | ||
180 | * This function only deals with the kernel linear map. | ||
181 | * | ||
182 | * Caller must call global_flush_tlb() after this. | ||
183 | */ | ||
184 | int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) | ||
185 | { | ||
186 | int err = 0, kernel_map = 0; | ||
187 | int i; | ||
188 | |||
189 | if (address >= __START_KERNEL_map | ||
190 | && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { | ||
191 | address = (unsigned long)__va(__pa(address)); | ||
192 | kernel_map = 1; | ||
193 | } | ||
194 | |||
195 | down_write(&init_mm.mmap_sem); | ||
196 | for (i = 0; i < numpages; i++, address += PAGE_SIZE) { | ||
197 | unsigned long pfn = __pa(address) >> PAGE_SHIFT; | ||
198 | |||
199 | if (!kernel_map || pte_present(pfn_pte(0, prot))) { | ||
200 | err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); | ||
201 | if (err) | ||
202 | break; | ||
203 | } | ||
204 | /* Handle kernel mapping too which aliases part of the | ||
205 | * lowmem */ | ||
206 | if (__pa(address) < KERNEL_TEXT_SIZE) { | ||
207 | unsigned long addr2; | ||
208 | pgprot_t prot2; | ||
209 | addr2 = __START_KERNEL_map + __pa(address); | ||
210 | /* Make sure the kernel mappings stay executable */ | ||
211 | prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); | ||
212 | err = __change_page_attr(addr2, pfn, prot2, | ||
213 | PAGE_KERNEL_EXEC); | ||
214 | } | ||
215 | } | ||
216 | up_write(&init_mm.mmap_sem); | ||
217 | return err; | ||
218 | } | ||
219 | |||
220 | /* Don't call this for MMIO areas that may not have a mem_map entry */ | ||
221 | int change_page_attr(struct page *page, int numpages, pgprot_t prot) | ||
222 | { | ||
223 | unsigned long addr = (unsigned long)page_address(page); | ||
224 | return change_page_attr_addr(addr, numpages, prot); | ||
225 | } | ||
226 | |||
227 | void global_flush_tlb(void) | ||
228 | { | ||
229 | struct page *pg, *next; | ||
230 | struct list_head l; | ||
231 | |||
232 | down_read(&init_mm.mmap_sem); | ||
233 | list_replace_init(&deferred_pages, &l); | ||
234 | up_read(&init_mm.mmap_sem); | ||
235 | |||
236 | flush_map(&l); | ||
237 | |||
238 | list_for_each_entry_safe(pg, next, &l, lru) { | ||
239 | list_del(&pg->lru); | ||
240 | clear_bit(PG_arch_1, &pg->flags); | ||
241 | if (page_private(pg) != 0) | ||
242 | continue; | ||
243 | ClearPagePrivate(pg); | ||
244 | __free_page(pg); | ||
245 | } | ||
246 | } | ||
247 | |||
248 | EXPORT_SYMBOL(change_page_attr); | ||
249 | EXPORT_SYMBOL(global_flush_tlb); | ||
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c deleted file mode 100644 index acdf03e19146..000000000000 --- a/arch/x86_64/mm/srat.c +++ /dev/null | |||
@@ -1,566 +0,0 @@ | |||
1 | /* | ||
2 | * ACPI 3.0 based NUMA setup | ||
3 | * Copyright 2004 Andi Kleen, SuSE Labs. | ||
4 | * | ||
5 | * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. | ||
6 | * | ||
7 | * Called from acpi_numa_init while reading the SRAT and SLIT tables. | ||
8 | * Assumes all memory regions belonging to a single proximity domain | ||
9 | * are in one chunk. Holes between them will be included in the node. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/mmzone.h> | ||
15 | #include <linux/bitmap.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/topology.h> | ||
18 | #include <linux/bootmem.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <asm/proto.h> | ||
21 | #include <asm/numa.h> | ||
22 | #include <asm/e820.h> | ||
23 | |||
24 | int acpi_numa __initdata; | ||
25 | |||
26 | static struct acpi_table_slit *acpi_slit; | ||
27 | |||
28 | static nodemask_t nodes_parsed __initdata; | ||
29 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
30 | static struct bootnode nodes_add[MAX_NUMNODES]; | ||
31 | static int found_add_area __initdata; | ||
32 | int hotadd_percent __initdata = 0; | ||
33 | |||
34 | /* Too small nodes confuse the VM badly. Usually they result | ||
35 | from BIOS bugs. */ | ||
36 | #define NODE_MIN_SIZE (4*1024*1024) | ||
37 | |||
38 | static __init int setup_node(int pxm) | ||
39 | { | ||
40 | return acpi_map_pxm_to_node(pxm); | ||
41 | } | ||
42 | |||
43 | static __init int conflicting_nodes(unsigned long start, unsigned long end) | ||
44 | { | ||
45 | int i; | ||
46 | for_each_node_mask(i, nodes_parsed) { | ||
47 | struct bootnode *nd = &nodes[i]; | ||
48 | if (nd->start == nd->end) | ||
49 | continue; | ||
50 | if (nd->end > start && nd->start < end) | ||
51 | return i; | ||
52 | if (nd->end == end && nd->start == start) | ||
53 | return i; | ||
54 | } | ||
55 | return -1; | ||
56 | } | ||
57 | |||
58 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
59 | { | ||
60 | struct bootnode *nd = &nodes[i]; | ||
61 | |||
62 | if (found_add_area) | ||
63 | return; | ||
64 | |||
65 | if (nd->start < start) { | ||
66 | nd->start = start; | ||
67 | if (nd->end < nd->start) | ||
68 | nd->start = nd->end; | ||
69 | } | ||
70 | if (nd->end > end) { | ||
71 | nd->end = end; | ||
72 | if (nd->start > nd->end) | ||
73 | nd->start = nd->end; | ||
74 | } | ||
75 | } | ||
76 | |||
77 | static __init void bad_srat(void) | ||
78 | { | ||
79 | int i; | ||
80 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
81 | acpi_numa = -1; | ||
82 | found_add_area = 0; | ||
83 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
84 | apicid_to_node[i] = NUMA_NO_NODE; | ||
85 | for (i = 0; i < MAX_NUMNODES; i++) | ||
86 | nodes_add[i].start = nodes[i].end = 0; | ||
87 | remove_all_active_ranges(); | ||
88 | } | ||
89 | |||
90 | static __init inline int srat_disabled(void) | ||
91 | { | ||
92 | return numa_off || acpi_numa < 0; | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * A lot of BIOS fill in 10 (= no distance) everywhere. This messes | ||
97 | * up the NUMA heuristics which wants the local node to have a smaller | ||
98 | * distance than the others. | ||
99 | * Do some quick checks here and only use the SLIT if it passes. | ||
100 | */ | ||
101 | static __init int slit_valid(struct acpi_table_slit *slit) | ||
102 | { | ||
103 | int i, j; | ||
104 | int d = slit->locality_count; | ||
105 | for (i = 0; i < d; i++) { | ||
106 | for (j = 0; j < d; j++) { | ||
107 | u8 val = slit->entry[d*i + j]; | ||
108 | if (i == j) { | ||
109 | if (val != LOCAL_DISTANCE) | ||
110 | return 0; | ||
111 | } else if (val <= LOCAL_DISTANCE) | ||
112 | return 0; | ||
113 | } | ||
114 | } | ||
115 | return 1; | ||
116 | } | ||
117 | |||
118 | /* Callback for SLIT parsing */ | ||
119 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
120 | { | ||
121 | if (!slit_valid(slit)) { | ||
122 | printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); | ||
123 | return; | ||
124 | } | ||
125 | acpi_slit = slit; | ||
126 | } | ||
127 | |||
128 | /* Callback for Proximity Domain -> LAPIC mapping */ | ||
129 | void __init | ||
130 | acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | ||
131 | { | ||
132 | int pxm, node; | ||
133 | if (srat_disabled()) | ||
134 | return; | ||
135 | if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { | ||
136 | bad_srat(); | ||
137 | return; | ||
138 | } | ||
139 | if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
140 | return; | ||
141 | pxm = pa->proximity_domain_lo; | ||
142 | node = setup_node(pxm); | ||
143 | if (node < 0) { | ||
144 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
145 | bad_srat(); | ||
146 | return; | ||
147 | } | ||
148 | apicid_to_node[pa->apic_id] = node; | ||
149 | acpi_numa = 1; | ||
150 | printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", | ||
151 | pxm, pa->apic_id, node); | ||
152 | } | ||
153 | |||
154 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
155 | /* | ||
156 | * Protect against too large hotadd areas that would fill up memory. | ||
157 | */ | ||
158 | static int hotadd_enough_memory(struct bootnode *nd) | ||
159 | { | ||
160 | static unsigned long allocated; | ||
161 | static unsigned long last_area_end; | ||
162 | unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; | ||
163 | long mem = pages * sizeof(struct page); | ||
164 | unsigned long addr; | ||
165 | unsigned long allowed; | ||
166 | unsigned long oldpages = pages; | ||
167 | |||
168 | if (mem < 0) | ||
169 | return 0; | ||
170 | allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; | ||
171 | allowed = (allowed / 100) * hotadd_percent; | ||
172 | if (allocated + mem > allowed) { | ||
173 | unsigned long range; | ||
174 | /* Give them at least part of their hotadd memory upto hotadd_percent | ||
175 | It would be better to spread the limit out | ||
176 | over multiple hotplug areas, but that is too complicated | ||
177 | right now */ | ||
178 | if (allocated >= allowed) | ||
179 | return 0; | ||
180 | range = allowed - allocated; | ||
181 | pages = (range / PAGE_SIZE); | ||
182 | mem = pages * sizeof(struct page); | ||
183 | nd->end = nd->start + range; | ||
184 | } | ||
185 | /* Not completely fool proof, but a good sanity check */ | ||
186 | addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem); | ||
187 | if (addr == -1UL) | ||
188 | return 0; | ||
189 | if (pages != oldpages) | ||
190 | printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n", | ||
191 | pages << PAGE_SHIFT); | ||
192 | last_area_end = addr + mem; | ||
193 | allocated += mem; | ||
194 | return 1; | ||
195 | } | ||
196 | |||
197 | static int update_end_of_memory(unsigned long end) | ||
198 | { | ||
199 | found_add_area = 1; | ||
200 | if ((end >> PAGE_SHIFT) > end_pfn) | ||
201 | end_pfn = end >> PAGE_SHIFT; | ||
202 | return 1; | ||
203 | } | ||
204 | |||
205 | static inline int save_add_info(void) | ||
206 | { | ||
207 | return hotadd_percent > 0; | ||
208 | } | ||
209 | #else | ||
210 | int update_end_of_memory(unsigned long end) {return -1;} | ||
211 | static int hotadd_enough_memory(struct bootnode *nd) {return 1;} | ||
212 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | ||
213 | static inline int save_add_info(void) {return 1;} | ||
214 | #else | ||
215 | static inline int save_add_info(void) {return 0;} | ||
216 | #endif | ||
217 | #endif | ||
218 | /* | ||
219 | * Update nodes_add and decide if to include add are in the zone. | ||
220 | * Both SPARSE and RESERVE need nodes_add infomation. | ||
221 | * This code supports one contigious hot add area per node. | ||
222 | */ | ||
223 | static int reserve_hotadd(int node, unsigned long start, unsigned long end) | ||
224 | { | ||
225 | unsigned long s_pfn = start >> PAGE_SHIFT; | ||
226 | unsigned long e_pfn = end >> PAGE_SHIFT; | ||
227 | int ret = 0, changed = 0; | ||
228 | struct bootnode *nd = &nodes_add[node]; | ||
229 | |||
230 | /* I had some trouble with strange memory hotadd regions breaking | ||
231 | the boot. Be very strict here and reject anything unexpected. | ||
232 | If you want working memory hotadd write correct SRATs. | ||
233 | |||
234 | The node size check is a basic sanity check to guard against | ||
235 | mistakes */ | ||
236 | if ((signed long)(end - start) < NODE_MIN_SIZE) { | ||
237 | printk(KERN_ERR "SRAT: Hotplug area too small\n"); | ||
238 | return -1; | ||
239 | } | ||
240 | |||
241 | /* This check might be a bit too strict, but I'm keeping it for now. */ | ||
242 | if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { | ||
243 | printk(KERN_ERR | ||
244 | "SRAT: Hotplug area %lu -> %lu has existing memory\n", | ||
245 | s_pfn, e_pfn); | ||
246 | return -1; | ||
247 | } | ||
248 | |||
249 | if (!hotadd_enough_memory(&nodes_add[node])) { | ||
250 | printk(KERN_ERR "SRAT: Hotplug area too large\n"); | ||
251 | return -1; | ||
252 | } | ||
253 | |||
254 | /* Looks good */ | ||
255 | |||
256 | if (nd->start == nd->end) { | ||
257 | nd->start = start; | ||
258 | nd->end = end; | ||
259 | changed = 1; | ||
260 | } else { | ||
261 | if (nd->start == end) { | ||
262 | nd->start = start; | ||
263 | changed = 1; | ||
264 | } | ||
265 | if (nd->end == start) { | ||
266 | nd->end = end; | ||
267 | changed = 1; | ||
268 | } | ||
269 | if (!changed) | ||
270 | printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); | ||
271 | } | ||
272 | |||
273 | ret = update_end_of_memory(nd->end); | ||
274 | |||
275 | if (changed) | ||
276 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); | ||
277 | return ret; | ||
278 | } | ||
279 | |||
280 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | ||
281 | void __init | ||
282 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | ||
283 | { | ||
284 | struct bootnode *nd, oldnode; | ||
285 | unsigned long start, end; | ||
286 | int node, pxm; | ||
287 | int i; | ||
288 | |||
289 | if (srat_disabled()) | ||
290 | return; | ||
291 | if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { | ||
292 | bad_srat(); | ||
293 | return; | ||
294 | } | ||
295 | if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
296 | return; | ||
297 | |||
298 | if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) | ||
299 | return; | ||
300 | start = ma->base_address; | ||
301 | end = start + ma->length; | ||
302 | pxm = ma->proximity_domain; | ||
303 | node = setup_node(pxm); | ||
304 | if (node < 0) { | ||
305 | printk(KERN_ERR "SRAT: Too many proximity domains.\n"); | ||
306 | bad_srat(); | ||
307 | return; | ||
308 | } | ||
309 | i = conflicting_nodes(start, end); | ||
310 | if (i == node) { | ||
311 | printk(KERN_WARNING | ||
312 | "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", | ||
313 | pxm, start, end, nodes[i].start, nodes[i].end); | ||
314 | } else if (i >= 0) { | ||
315 | printk(KERN_ERR | ||
316 | "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", | ||
317 | pxm, start, end, node_to_pxm(i), | ||
318 | nodes[i].start, nodes[i].end); | ||
319 | bad_srat(); | ||
320 | return; | ||
321 | } | ||
322 | nd = &nodes[node]; | ||
323 | oldnode = *nd; | ||
324 | if (!node_test_and_set(node, nodes_parsed)) { | ||
325 | nd->start = start; | ||
326 | nd->end = end; | ||
327 | } else { | ||
328 | if (start < nd->start) | ||
329 | nd->start = start; | ||
330 | if (nd->end < end) | ||
331 | nd->end = end; | ||
332 | } | ||
333 | |||
334 | printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, | ||
335 | nd->start, nd->end); | ||
336 | e820_register_active_regions(node, nd->start >> PAGE_SHIFT, | ||
337 | nd->end >> PAGE_SHIFT); | ||
338 | push_node_boundaries(node, nd->start >> PAGE_SHIFT, | ||
339 | nd->end >> PAGE_SHIFT); | ||
340 | |||
341 | if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && | ||
342 | (reserve_hotadd(node, start, end) < 0)) { | ||
343 | /* Ignore hotadd region. Undo damage */ | ||
344 | printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); | ||
345 | *nd = oldnode; | ||
346 | if ((nd->start | nd->end) == 0) | ||
347 | node_clear(node, nodes_parsed); | ||
348 | } | ||
349 | } | ||
350 | |||
351 | /* Sanity check to catch more bad SRATs (they are amazingly common). | ||
352 | Make sure the PXMs cover all memory. */ | ||
353 | static int __init nodes_cover_memory(const struct bootnode *nodes) | ||
354 | { | ||
355 | int i; | ||
356 | unsigned long pxmram, e820ram; | ||
357 | |||
358 | pxmram = 0; | ||
359 | for_each_node_mask(i, nodes_parsed) { | ||
360 | unsigned long s = nodes[i].start >> PAGE_SHIFT; | ||
361 | unsigned long e = nodes[i].end >> PAGE_SHIFT; | ||
362 | pxmram += e - s; | ||
363 | pxmram -= absent_pages_in_range(s, e); | ||
364 | if ((long)pxmram < 0) | ||
365 | pxmram = 0; | ||
366 | } | ||
367 | |||
368 | e820ram = end_pfn - absent_pages_in_range(0, end_pfn); | ||
369 | /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ | ||
370 | if ((long)(e820ram - pxmram) >= 1*1024*1024) { | ||
371 | printk(KERN_ERR | ||
372 | "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
373 | (pxmram << PAGE_SHIFT) >> 20, | ||
374 | (e820ram << PAGE_SHIFT) >> 20); | ||
375 | return 0; | ||
376 | } | ||
377 | return 1; | ||
378 | } | ||
379 | |||
380 | static void unparse_node(int node) | ||
381 | { | ||
382 | int i; | ||
383 | node_clear(node, nodes_parsed); | ||
384 | for (i = 0; i < MAX_LOCAL_APIC; i++) { | ||
385 | if (apicid_to_node[i] == node) | ||
386 | apicid_to_node[i] = NUMA_NO_NODE; | ||
387 | } | ||
388 | } | ||
389 | |||
390 | void __init acpi_numa_arch_fixup(void) {} | ||
391 | |||
392 | /* Use the information discovered above to actually set up the nodes. */ | ||
393 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | ||
394 | { | ||
395 | int i; | ||
396 | |||
397 | if (acpi_numa <= 0) | ||
398 | return -1; | ||
399 | |||
400 | /* First clean up the node list */ | ||
401 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
402 | cutoff_node(i, start, end); | ||
403 | if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { | ||
404 | unparse_node(i); | ||
405 | node_set_offline(i); | ||
406 | } | ||
407 | } | ||
408 | |||
409 | if (!nodes_cover_memory(nodes)) { | ||
410 | bad_srat(); | ||
411 | return -1; | ||
412 | } | ||
413 | |||
414 | memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); | ||
415 | if (memnode_shift < 0) { | ||
416 | printk(KERN_ERR | ||
417 | "SRAT: No NUMA node hash function found. Contact maintainer\n"); | ||
418 | bad_srat(); | ||
419 | return -1; | ||
420 | } | ||
421 | |||
422 | node_possible_map = nodes_parsed; | ||
423 | |||
424 | /* Finally register nodes */ | ||
425 | for_each_node_mask(i, node_possible_map) | ||
426 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
427 | /* Try again in case setup_node_bootmem missed one due | ||
428 | to missing bootmem */ | ||
429 | for_each_node_mask(i, node_possible_map) | ||
430 | if (!node_online(i)) | ||
431 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
432 | |||
433 | for (i = 0; i < NR_CPUS; i++) { | ||
434 | if (cpu_to_node[i] == NUMA_NO_NODE) | ||
435 | continue; | ||
436 | if (!node_isset(cpu_to_node[i], node_possible_map)) | ||
437 | numa_set_node(i, NUMA_NO_NODE); | ||
438 | } | ||
439 | numa_init_array(); | ||
440 | return 0; | ||
441 | } | ||
442 | |||
443 | #ifdef CONFIG_NUMA_EMU | ||
444 | static int __init find_node_by_addr(unsigned long addr) | ||
445 | { | ||
446 | int ret = NUMA_NO_NODE; | ||
447 | int i; | ||
448 | |||
449 | for_each_node_mask(i, nodes_parsed) { | ||
450 | /* | ||
451 | * Find the real node that this emulated node appears on. For | ||
452 | * the sake of simplicity, we only use a real node's starting | ||
453 | * address to determine which emulated node it appears on. | ||
454 | */ | ||
455 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
456 | ret = i; | ||
457 | break; | ||
458 | } | ||
459 | } | ||
460 | return i; | ||
461 | } | ||
462 | |||
463 | /* | ||
464 | * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID | ||
465 | * mappings that respect the real ACPI topology but reflect our emulated | ||
466 | * environment. For each emulated node, we find which real node it appears on | ||
467 | * and create PXM to NID mappings for those fake nodes which mirror that | ||
468 | * locality. SLIT will now represent the correct distances between emulated | ||
469 | * nodes as a result of the real topology. | ||
470 | */ | ||
471 | void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) | ||
472 | { | ||
473 | int i, j; | ||
474 | int fake_node_to_pxm_map[MAX_NUMNODES] = { | ||
475 | [0 ... MAX_NUMNODES-1] = PXM_INVAL | ||
476 | }; | ||
477 | unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { | ||
478 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
479 | }; | ||
480 | |||
481 | printk(KERN_INFO "Faking PXM affinity for fake nodes on real " | ||
482 | "topology.\n"); | ||
483 | for (i = 0; i < num_nodes; i++) { | ||
484 | int nid, pxm; | ||
485 | |||
486 | nid = find_node_by_addr(fake_nodes[i].start); | ||
487 | if (nid == NUMA_NO_NODE) | ||
488 | continue; | ||
489 | pxm = node_to_pxm(nid); | ||
490 | if (pxm == PXM_INVAL) | ||
491 | continue; | ||
492 | fake_node_to_pxm_map[i] = pxm; | ||
493 | /* | ||
494 | * For each apicid_to_node mapping that exists for this real | ||
495 | * node, it must now point to the fake node ID. | ||
496 | */ | ||
497 | for (j = 0; j < MAX_LOCAL_APIC; j++) | ||
498 | if (apicid_to_node[j] == nid) | ||
499 | fake_apicid_to_node[j] = i; | ||
500 | } | ||
501 | for (i = 0; i < num_nodes; i++) | ||
502 | __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); | ||
503 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
504 | |||
505 | nodes_clear(nodes_parsed); | ||
506 | for (i = 0; i < num_nodes; i++) | ||
507 | if (fake_nodes[i].start != fake_nodes[i].end) | ||
508 | node_set(i, nodes_parsed); | ||
509 | WARN_ON(!nodes_cover_memory(fake_nodes)); | ||
510 | } | ||
511 | |||
512 | static int null_slit_node_compare(int a, int b) | ||
513 | { | ||
514 | return node_to_pxm(a) == node_to_pxm(b); | ||
515 | } | ||
516 | #else | ||
517 | static int null_slit_node_compare(int a, int b) | ||
518 | { | ||
519 | return a == b; | ||
520 | } | ||
521 | #endif /* CONFIG_NUMA_EMU */ | ||
522 | |||
523 | void __init srat_reserve_add_area(int nodeid) | ||
524 | { | ||
525 | if (found_add_area && nodes_add[nodeid].end) { | ||
526 | u64 total_mb; | ||
527 | |||
528 | printk(KERN_INFO "SRAT: Reserving hot-add memory space " | ||
529 | "for node %d at %Lx-%Lx\n", | ||
530 | nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); | ||
531 | total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) | ||
532 | >> PAGE_SHIFT; | ||
533 | total_mb *= sizeof(struct page); | ||
534 | total_mb >>= 20; | ||
535 | printk(KERN_INFO "SRAT: This will cost you %Lu MB of " | ||
536 | "pre-allocated memory.\n", (unsigned long long)total_mb); | ||
537 | reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, | ||
538 | nodes_add[nodeid].end - nodes_add[nodeid].start); | ||
539 | } | ||
540 | } | ||
541 | |||
542 | int __node_distance(int a, int b) | ||
543 | { | ||
544 | int index; | ||
545 | |||
546 | if (!acpi_slit) | ||
547 | return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : | ||
548 | REMOTE_DISTANCE; | ||
549 | index = acpi_slit->locality_count * node_to_pxm(a); | ||
550 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
551 | } | ||
552 | |||
553 | EXPORT_SYMBOL(__node_distance); | ||
554 | |||
555 | int memory_add_physaddr_to_nid(u64 start) | ||
556 | { | ||
557 | int i, ret = 0; | ||
558 | |||
559 | for_each_node(i) | ||
560 | if (nodes_add[i].start <= start && nodes_add[i].end > start) | ||
561 | ret = i; | ||
562 | |||
563 | return ret; | ||
564 | } | ||
565 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
566 | |||
diff --git a/arch/x86_64/oprofile/Kconfig b/arch/x86_64/oprofile/Kconfig deleted file mode 100644 index d8a84088471a..000000000000 --- a/arch/x86_64/oprofile/Kconfig +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | config PROFILING | ||
2 | bool "Profiling support (EXPERIMENTAL)" | ||
3 | help | ||
4 | Say Y here to enable the extended profiling support mechanisms used | ||
5 | by profilers such as OProfile. | ||
6 | |||
7 | |||
8 | config OPROFILE | ||
9 | tristate "OProfile system profiling (EXPERIMENTAL)" | ||
10 | depends on PROFILING | ||
11 | help | ||
12 | OProfile is a profiling system capable of profiling the | ||
13 | whole system, include the kernel, kernel modules, libraries, | ||
14 | and applications. | ||
15 | |||
16 | If unsure, say N. | ||
17 | |||
diff --git a/arch/x86_64/oprofile/Makefile b/arch/x86_64/oprofile/Makefile deleted file mode 100644 index 6be32683e1bc..000000000000 --- a/arch/x86_64/oprofile/Makefile +++ /dev/null | |||
@@ -1,19 +0,0 @@ | |||
1 | # | ||
2 | # oprofile for x86-64. | ||
3 | # Just reuse the one from i386. | ||
4 | # | ||
5 | |||
6 | obj-$(CONFIG_OPROFILE) += oprofile.o | ||
7 | |||
8 | DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ | ||
9 | oprof.o cpu_buffer.o buffer_sync.o \ | ||
10 | event_buffer.o oprofile_files.o \ | ||
11 | oprofilefs.o oprofile_stats.o \ | ||
12 | timer_int.o ) | ||
13 | |||
14 | OPROFILE-y := init.o backtrace.o | ||
15 | OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \ | ||
16 | op_model_ppro.o | ||
17 | OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o | ||
18 | |||
19 | oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) | ||
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile deleted file mode 100644 index c9eddc8859c0..000000000000 --- a/arch/x86_64/pci/Makefile +++ /dev/null | |||
@@ -1,27 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for X86_64 specific PCI routines | ||
3 | # | ||
4 | # Reuse the i386 PCI subsystem | ||
5 | # | ||
6 | EXTRA_CFLAGS += -Iarch/i386/pci | ||
7 | |||
8 | obj-y := i386.o | ||
9 | obj-$(CONFIG_PCI_DIRECT)+= direct.o | ||
10 | obj-y += fixup.o init.o | ||
11 | obj-$(CONFIG_ACPI) += acpi.o | ||
12 | obj-y += legacy.o irq.o common.o early.o | ||
13 | # mmconfig has a 64bit special | ||
14 | obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o | ||
15 | |||
16 | obj-$(CONFIG_NUMA) += k8-bus.o | ||
17 | |||
18 | direct-y += ../../i386/pci/direct.o | ||
19 | acpi-y += ../../i386/pci/acpi.o | ||
20 | legacy-y += ../../i386/pci/legacy.o | ||
21 | irq-y += ../../i386/pci/irq.o | ||
22 | common-y += ../../i386/pci/common.o | ||
23 | fixup-y += ../../i386/pci/fixup.o | ||
24 | i386-y += ../../i386/pci/i386.o | ||
25 | init-y += ../../i386/pci/init.o | ||
26 | early-y += ../../i386/pci/early.o | ||
27 | mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o | ||
diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c deleted file mode 100644 index 9cc813e29706..000000000000 --- a/arch/x86_64/pci/k8-bus.c +++ /dev/null | |||
@@ -1,83 +0,0 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/pci.h> | ||
3 | #include <asm/mpspec.h> | ||
4 | #include <linux/cpumask.h> | ||
5 | |||
6 | /* | ||
7 | * This discovers the pcibus <-> node mapping on AMD K8. | ||
8 | * | ||
9 | * RED-PEN need to call this again on PCI hotplug | ||
10 | * RED-PEN empty cpus get reported wrong | ||
11 | */ | ||
12 | |||
13 | #define NODE_ID_REGISTER 0x60 | ||
14 | #define NODE_ID(dword) (dword & 0x07) | ||
15 | #define LDT_BUS_NUMBER_REGISTER_0 0x94 | ||
16 | #define LDT_BUS_NUMBER_REGISTER_1 0xB4 | ||
17 | #define LDT_BUS_NUMBER_REGISTER_2 0xD4 | ||
18 | #define NR_LDT_BUS_NUMBER_REGISTERS 3 | ||
19 | #define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF) | ||
20 | #define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) | ||
21 | #define PCI_DEVICE_ID_K8HTCONFIG 0x1100 | ||
22 | |||
23 | /** | ||
24 | * fill_mp_bus_to_cpumask() | ||
25 | * fills the mp_bus_to_cpumask array based according to the LDT Bus Number | ||
26 | * Registers found in the K8 northbridge | ||
27 | */ | ||
28 | __init static int | ||
29 | fill_mp_bus_to_cpumask(void) | ||
30 | { | ||
31 | struct pci_dev *nb_dev = NULL; | ||
32 | int i, j; | ||
33 | u32 ldtbus, nid; | ||
34 | static int lbnr[3] = { | ||
35 | LDT_BUS_NUMBER_REGISTER_0, | ||
36 | LDT_BUS_NUMBER_REGISTER_1, | ||
37 | LDT_BUS_NUMBER_REGISTER_2 | ||
38 | }; | ||
39 | |||
40 | while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD, | ||
41 | PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) { | ||
42 | pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid); | ||
43 | |||
44 | for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { | ||
45 | pci_read_config_dword(nb_dev, lbnr[i], &ldtbus); | ||
46 | /* | ||
47 | * if there are no busses hanging off of the current | ||
48 | * ldt link then both the secondary and subordinate | ||
49 | * bus number fields are set to 0. | ||
50 | * | ||
51 | * RED-PEN | ||
52 | * This is slightly broken because it assumes | ||
53 | * HT node IDs == Linux node ids, which is not always | ||
54 | * true. However it is probably mostly true. | ||
55 | */ | ||
56 | if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0 | ||
57 | && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) { | ||
58 | for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); | ||
59 | j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); | ||
60 | j++) { | ||
61 | struct pci_bus *bus; | ||
62 | struct pci_sysdata *sd; | ||
63 | |||
64 | long node = NODE_ID(nid); | ||
65 | /* Algorithm a bit dumb, but | ||
66 | it shouldn't matter here */ | ||
67 | bus = pci_find_bus(0, j); | ||
68 | if (!bus) | ||
69 | continue; | ||
70 | if (!node_online(node)) | ||
71 | node = 0; | ||
72 | |||
73 | sd = bus->sysdata; | ||
74 | sd->node = node; | ||
75 | } | ||
76 | } | ||
77 | } | ||
78 | } | ||
79 | |||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | fs_initcall(fill_mp_bus_to_cpumask); | ||
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c deleted file mode 100644 index 4095e4d66a1d..000000000000 --- a/arch/x86_64/pci/mmconfig.c +++ /dev/null | |||
@@ -1,157 +0,0 @@ | |||
1 | /* | ||
2 | * mmconfig.c - Low-level direct PCI config space access via MMCONFIG | ||
3 | * | ||
4 | * This is an 64bit optimized version that always keeps the full mmconfig | ||
5 | * space mapped. This allows lockless config space operation. | ||
6 | */ | ||
7 | |||
8 | #include <linux/pci.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/acpi.h> | ||
11 | #include <linux/bitmap.h> | ||
12 | #include <asm/e820.h> | ||
13 | |||
14 | #include "pci.h" | ||
15 | |||
16 | /* Static virtual mapping of the MMCONFIG aperture */ | ||
17 | struct mmcfg_virt { | ||
18 | struct acpi_mcfg_allocation *cfg; | ||
19 | char __iomem *virt; | ||
20 | }; | ||
21 | static struct mmcfg_virt *pci_mmcfg_virt; | ||
22 | |||
23 | static char __iomem *get_virt(unsigned int seg, unsigned bus) | ||
24 | { | ||
25 | struct acpi_mcfg_allocation *cfg; | ||
26 | int cfg_num; | ||
27 | |||
28 | for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { | ||
29 | cfg = pci_mmcfg_virt[cfg_num].cfg; | ||
30 | if (cfg->pci_segment == seg && | ||
31 | (cfg->start_bus_number <= bus) && | ||
32 | (cfg->end_bus_number >= bus)) | ||
33 | return pci_mmcfg_virt[cfg_num].virt; | ||
34 | } | ||
35 | |||
36 | /* Fall back to type 0 */ | ||
37 | return NULL; | ||
38 | } | ||
39 | |||
40 | static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) | ||
41 | { | ||
42 | char __iomem *addr; | ||
43 | if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && | ||
44 | test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots)) | ||
45 | return NULL; | ||
46 | addr = get_virt(seg, bus); | ||
47 | if (!addr) | ||
48 | return NULL; | ||
49 | return addr + ((bus << 20) | (devfn << 12)); | ||
50 | } | ||
51 | |||
52 | static int pci_mmcfg_read(unsigned int seg, unsigned int bus, | ||
53 | unsigned int devfn, int reg, int len, u32 *value) | ||
54 | { | ||
55 | char __iomem *addr; | ||
56 | |||
57 | /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ | ||
58 | if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) { | ||
59 | *value = -1; | ||
60 | return -EINVAL; | ||
61 | } | ||
62 | |||
63 | addr = pci_dev_base(seg, bus, devfn); | ||
64 | if (!addr) | ||
65 | return pci_conf1_read(seg,bus,devfn,reg,len,value); | ||
66 | |||
67 | switch (len) { | ||
68 | case 1: | ||
69 | *value = mmio_config_readb(addr + reg); | ||
70 | break; | ||
71 | case 2: | ||
72 | *value = mmio_config_readw(addr + reg); | ||
73 | break; | ||
74 | case 4: | ||
75 | *value = mmio_config_readl(addr + reg); | ||
76 | break; | ||
77 | } | ||
78 | |||
79 | return 0; | ||
80 | } | ||
81 | |||
82 | static int pci_mmcfg_write(unsigned int seg, unsigned int bus, | ||
83 | unsigned int devfn, int reg, int len, u32 value) | ||
84 | { | ||
85 | char __iomem *addr; | ||
86 | |||
87 | /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ | ||
88 | if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) | ||
89 | return -EINVAL; | ||
90 | |||
91 | addr = pci_dev_base(seg, bus, devfn); | ||
92 | if (!addr) | ||
93 | return pci_conf1_write(seg,bus,devfn,reg,len,value); | ||
94 | |||
95 | switch (len) { | ||
96 | case 1: | ||
97 | mmio_config_writeb(addr + reg, value); | ||
98 | break; | ||
99 | case 2: | ||
100 | mmio_config_writew(addr + reg, value); | ||
101 | break; | ||
102 | case 4: | ||
103 | mmio_config_writel(addr + reg, value); | ||
104 | break; | ||
105 | } | ||
106 | |||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | static struct pci_raw_ops pci_mmcfg = { | ||
111 | .read = pci_mmcfg_read, | ||
112 | .write = pci_mmcfg_write, | ||
113 | }; | ||
114 | |||
115 | static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) | ||
116 | { | ||
117 | void __iomem *addr; | ||
118 | u32 size; | ||
119 | |||
120 | size = (cfg->end_bus_number + 1) << 20; | ||
121 | addr = ioremap_nocache(cfg->address, size); | ||
122 | if (addr) { | ||
123 | printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", | ||
124 | cfg->address, cfg->address + size - 1); | ||
125 | } | ||
126 | return addr; | ||
127 | } | ||
128 | |||
129 | int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, | ||
130 | unsigned int devfn) | ||
131 | { | ||
132 | return pci_dev_base(seg, bus, devfn) != NULL; | ||
133 | } | ||
134 | |||
135 | int __init pci_mmcfg_arch_init(void) | ||
136 | { | ||
137 | int i; | ||
138 | pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * | ||
139 | pci_mmcfg_config_num, GFP_KERNEL); | ||
140 | if (pci_mmcfg_virt == NULL) { | ||
141 | printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); | ||
142 | return 0; | ||
143 | } | ||
144 | |||
145 | for (i = 0; i < pci_mmcfg_config_num; ++i) { | ||
146 | pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; | ||
147 | pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); | ||
148 | if (!pci_mmcfg_virt[i].virt) { | ||
149 | printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " | ||
150 | "segment %d\n", | ||
151 | pci_mmcfg_config[i].pci_segment); | ||
152 | return 0; | ||
153 | } | ||
154 | } | ||
155 | raw_pci_ops = &pci_mmcfg; | ||
156 | return 1; | ||
157 | } | ||
diff --git a/arch/x86_64/vdso/.gitignore b/arch/x86_64/vdso/.gitignore deleted file mode 100644 index f8b69d84238e..000000000000 --- a/arch/x86_64/vdso/.gitignore +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | vdso.lds | ||
diff --git a/arch/x86_64/vdso/Makefile b/arch/x86_64/vdso/Makefile deleted file mode 100644 index 8d03de029d9b..000000000000 --- a/arch/x86_64/vdso/Makefile +++ /dev/null | |||
@@ -1,49 +0,0 @@ | |||
1 | # | ||
2 | # x86-64 vDSO. | ||
3 | # | ||
4 | |||
5 | # files to link into the vdso | ||
6 | # vdso-start.o has to be first | ||
7 | vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o | ||
8 | |||
9 | # files to link into kernel | ||
10 | obj-y := vma.o vdso.o vdso-syms.o | ||
11 | |||
12 | vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) | ||
13 | |||
14 | $(obj)/vdso.o: $(obj)/vdso.so | ||
15 | |||
16 | targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o | ||
17 | |||
18 | # The DSO images are built using a special linker script. | ||
19 | quiet_cmd_syscall = SYSCALL $@ | ||
20 | cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \ | ||
21 | -Wl,-T,$(filter-out FORCE,$^) -o $@ | ||
22 | |||
23 | export CPPFLAGS_vdso.lds += -P -C -U$(ARCH) | ||
24 | |||
25 | vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ | ||
26 | $(call ld-option, -Wl$(comma)--hash-style=sysv) \ | ||
27 | -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 | ||
28 | SYSCFLAGS_vdso.so = $(vdso-flags) | ||
29 | |||
30 | $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so | ||
31 | |||
32 | $(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE | ||
33 | $(call if_changed,syscall) | ||
34 | |||
35 | CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 | ||
36 | |||
37 | $(obj)/vclock_gettime.o: CFLAGS = $(CFL) | ||
38 | $(obj)/vgetcpu.o: CFLAGS = $(CFL) | ||
39 | |||
40 | # We also create a special relocatable object that should mirror the symbol | ||
41 | # table and layout of the linked DSO. With ld -R we can then refer to | ||
42 | # these symbols in the kernel code rather than hand-coded addresses. | ||
43 | extra-y += vdso-syms.o | ||
44 | $(obj)/built-in.o: $(obj)/vdso-syms.o | ||
45 | $(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o | ||
46 | |||
47 | SYSCFLAGS_vdso-syms.o = -r -d | ||
48 | $(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE | ||
49 | $(call if_changed,syscall) | ||
diff --git a/arch/x86_64/vdso/vclock_gettime.c b/arch/x86_64/vdso/vclock_gettime.c deleted file mode 100644 index 5b54cdfb2b07..000000000000 --- a/arch/x86_64/vdso/vclock_gettime.c +++ /dev/null | |||
@@ -1,121 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2006 Andi Kleen, SUSE Labs. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Fast user context implementation of clock_gettime and gettimeofday. | ||
6 | * | ||
7 | * The code should have no internal unresolved relocations. | ||
8 | * Check with readelf after changing. | ||
9 | * Also alternative() doesn't work. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/posix-timers.h> | ||
14 | #include <linux/time.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <asm/vsyscall.h> | ||
17 | #include <asm/vgtod.h> | ||
18 | #include <asm/timex.h> | ||
19 | #include <asm/hpet.h> | ||
20 | #include <asm/unistd.h> | ||
21 | #include <asm/io.h> | ||
22 | #include <asm/vgtod.h> | ||
23 | #include "vextern.h" | ||
24 | |||
25 | #define gtod vdso_vsyscall_gtod_data | ||
26 | |||
27 | static long vdso_fallback_gettime(long clock, struct timespec *ts) | ||
28 | { | ||
29 | long ret; | ||
30 | asm("syscall" : "=a" (ret) : | ||
31 | "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); | ||
32 | return ret; | ||
33 | } | ||
34 | |||
35 | static inline long vgetns(void) | ||
36 | { | ||
37 | long v; | ||
38 | cycles_t (*vread)(void); | ||
39 | vread = gtod->clock.vread; | ||
40 | v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; | ||
41 | return (v * gtod->clock.mult) >> gtod->clock.shift; | ||
42 | } | ||
43 | |||
44 | static noinline int do_realtime(struct timespec *ts) | ||
45 | { | ||
46 | unsigned long seq, ns; | ||
47 | do { | ||
48 | seq = read_seqbegin(>od->lock); | ||
49 | ts->tv_sec = gtod->wall_time_sec; | ||
50 | ts->tv_nsec = gtod->wall_time_nsec; | ||
51 | ns = vgetns(); | ||
52 | } while (unlikely(read_seqretry(>od->lock, seq))); | ||
53 | timespec_add_ns(ts, ns); | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | /* Copy of the version in kernel/time.c which we cannot directly access */ | ||
58 | static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) | ||
59 | { | ||
60 | while (nsec >= NSEC_PER_SEC) { | ||
61 | nsec -= NSEC_PER_SEC; | ||
62 | ++sec; | ||
63 | } | ||
64 | while (nsec < 0) { | ||
65 | nsec += NSEC_PER_SEC; | ||
66 | --sec; | ||
67 | } | ||
68 | ts->tv_sec = sec; | ||
69 | ts->tv_nsec = nsec; | ||
70 | } | ||
71 | |||
72 | static noinline int do_monotonic(struct timespec *ts) | ||
73 | { | ||
74 | unsigned long seq, ns, secs; | ||
75 | do { | ||
76 | seq = read_seqbegin(>od->lock); | ||
77 | secs = gtod->wall_time_sec; | ||
78 | ns = gtod->wall_time_nsec + vgetns(); | ||
79 | secs += gtod->wall_to_monotonic.tv_sec; | ||
80 | ns += gtod->wall_to_monotonic.tv_nsec; | ||
81 | } while (unlikely(read_seqretry(>od->lock, seq))); | ||
82 | vset_normalized_timespec(ts, secs, ns); | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) | ||
87 | { | ||
88 | if (likely(gtod->sysctl_enabled && gtod->clock.vread)) | ||
89 | switch (clock) { | ||
90 | case CLOCK_REALTIME: | ||
91 | return do_realtime(ts); | ||
92 | case CLOCK_MONOTONIC: | ||
93 | return do_monotonic(ts); | ||
94 | } | ||
95 | return vdso_fallback_gettime(clock, ts); | ||
96 | } | ||
97 | int clock_gettime(clockid_t, struct timespec *) | ||
98 | __attribute__((weak, alias("__vdso_clock_gettime"))); | ||
99 | |||
100 | int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) | ||
101 | { | ||
102 | long ret; | ||
103 | if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { | ||
104 | BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != | ||
105 | offsetof(struct timespec, tv_nsec) || | ||
106 | sizeof(*tv) != sizeof(struct timespec)); | ||
107 | do_realtime((struct timespec *)tv); | ||
108 | tv->tv_usec /= 1000; | ||
109 | if (unlikely(tz != NULL)) { | ||
110 | /* This relies on gcc inlining the memcpy. We'll notice | ||
111 | if it ever fails to do so. */ | ||
112 | memcpy(tz, >od->sys_tz, sizeof(struct timezone)); | ||
113 | } | ||
114 | return 0; | ||
115 | } | ||
116 | asm("syscall" : "=a" (ret) : | ||
117 | "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); | ||
118 | return ret; | ||
119 | } | ||
120 | int gettimeofday(struct timeval *, struct timezone *) | ||
121 | __attribute__((weak, alias("__vdso_gettimeofday"))); | ||
diff --git a/arch/x86_64/vdso/vdso-note.S b/arch/x86_64/vdso/vdso-note.S deleted file mode 100644 index 79a071e4357e..000000000000 --- a/arch/x86_64/vdso/vdso-note.S +++ /dev/null | |||
@@ -1,12 +0,0 @@ | |||
1 | /* | ||
2 | * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. | ||
3 | * Here we can supply some information useful to userland. | ||
4 | */ | ||
5 | |||
6 | #include <linux/uts.h> | ||
7 | #include <linux/version.h> | ||
8 | #include <linux/elfnote.h> | ||
9 | |||
10 | ELFNOTE_START(Linux, 0, "a") | ||
11 | .long LINUX_VERSION_CODE | ||
12 | ELFNOTE_END | ||
diff --git a/arch/x86_64/vdso/vdso-start.S b/arch/x86_64/vdso/vdso-start.S deleted file mode 100644 index 2dc2cdb84d67..000000000000 --- a/arch/x86_64/vdso/vdso-start.S +++ /dev/null | |||
@@ -1,2 +0,0 @@ | |||
1 | .globl vdso_kernel_start | ||
2 | vdso_kernel_start: | ||
diff --git a/arch/x86_64/vdso/vdso.S b/arch/x86_64/vdso/vdso.S deleted file mode 100644 index 92e80c1972a7..000000000000 --- a/arch/x86_64/vdso/vdso.S +++ /dev/null | |||
@@ -1,2 +0,0 @@ | |||
1 | .section ".vdso","a" | ||
2 | .incbin "arch/x86_64/vdso/vdso.so" | ||
diff --git a/arch/x86_64/vdso/vdso.lds.S b/arch/x86_64/vdso/vdso.lds.S deleted file mode 100644 index b9a60e665d08..000000000000 --- a/arch/x86_64/vdso/vdso.lds.S +++ /dev/null | |||
@@ -1,77 +0,0 @@ | |||
1 | /* | ||
2 | * Linker script for vsyscall DSO. The vsyscall page is an ELF shared | ||
3 | * object prelinked to its virtual address, and with only one read-only | ||
4 | * segment (that fits in one page). This script controls its layout. | ||
5 | */ | ||
6 | #include <asm/asm-offsets.h> | ||
7 | #include "voffset.h" | ||
8 | |||
9 | #define VDSO_PRELINK 0xffffffffff700000 | ||
10 | |||
11 | SECTIONS | ||
12 | { | ||
13 | . = VDSO_PRELINK + SIZEOF_HEADERS; | ||
14 | |||
15 | .hash : { *(.hash) } :text | ||
16 | .gnu.hash : { *(.gnu.hash) } | ||
17 | .dynsym : { *(.dynsym) } | ||
18 | .dynstr : { *(.dynstr) } | ||
19 | .gnu.version : { *(.gnu.version) } | ||
20 | .gnu.version_d : { *(.gnu.version_d) } | ||
21 | .gnu.version_r : { *(.gnu.version_r) } | ||
22 | |||
23 | /* This linker script is used both with -r and with -shared. | ||
24 | For the layouts to match, we need to skip more than enough | ||
25 | space for the dynamic symbol table et al. If this amount | ||
26 | is insufficient, ld -shared will barf. Just increase it here. */ | ||
27 | . = VDSO_PRELINK + VDSO_TEXT_OFFSET; | ||
28 | |||
29 | .text : { *(.text) } :text | ||
30 | .text.ptr : { *(.text.ptr) } :text | ||
31 | . = VDSO_PRELINK + 0x900; | ||
32 | .data : { *(.data) } :text | ||
33 | .bss : { *(.bss) } :text | ||
34 | |||
35 | .altinstructions : { *(.altinstructions) } :text | ||
36 | .altinstr_replacement : { *(.altinstr_replacement) } :text | ||
37 | |||
38 | .note : { *(.note.*) } :text :note | ||
39 | .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr | ||
40 | .eh_frame : { KEEP (*(.eh_frame)) } :text | ||
41 | .dynamic : { *(.dynamic) } :text :dynamic | ||
42 | .useless : { | ||
43 | *(.got.plt) *(.got) | ||
44 | *(.gnu.linkonce.d.*) | ||
45 | *(.dynbss) | ||
46 | *(.gnu.linkonce.b.*) | ||
47 | } :text | ||
48 | } | ||
49 | |||
50 | /* | ||
51 | * We must supply the ELF program headers explicitly to get just one | ||
52 | * PT_LOAD segment, and set the flags explicitly to make segments read-only. | ||
53 | */ | ||
54 | PHDRS | ||
55 | { | ||
56 | text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ | ||
57 | dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ | ||
58 | note PT_NOTE FLAGS(4); /* PF_R */ | ||
59 | eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This controls what symbols we export from the DSO. | ||
64 | */ | ||
65 | VERSION | ||
66 | { | ||
67 | LINUX_2.6 { | ||
68 | global: | ||
69 | clock_gettime; | ||
70 | __vdso_clock_gettime; | ||
71 | gettimeofday; | ||
72 | __vdso_gettimeofday; | ||
73 | getcpu; | ||
74 | __vdso_getcpu; | ||
75 | local: *; | ||
76 | }; | ||
77 | } | ||
diff --git a/arch/x86_64/vdso/vextern.h b/arch/x86_64/vdso/vextern.h deleted file mode 100644 index 1683ba2ae3e8..000000000000 --- a/arch/x86_64/vdso/vextern.h +++ /dev/null | |||
@@ -1,16 +0,0 @@ | |||
1 | #ifndef VEXTERN | ||
2 | #include <asm/vsyscall.h> | ||
3 | #define VEXTERN(x) \ | ||
4 | extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden"))); | ||
5 | #endif | ||
6 | |||
7 | #define VMAGIC 0xfeedbabeabcdefabUL | ||
8 | |||
9 | /* Any kernel variables used in the vDSO must be exported in the main | ||
10 | kernel's vmlinux.lds.S/vsyscall.h/proper __section and | ||
11 | put into vextern.h and be referenced as a pointer with vdso prefix. | ||
12 | The main kernel later fills in the values. */ | ||
13 | |||
14 | VEXTERN(jiffies) | ||
15 | VEXTERN(vgetcpu_mode) | ||
16 | VEXTERN(vsyscall_gtod_data) | ||
diff --git a/arch/x86_64/vdso/vgetcpu.c b/arch/x86_64/vdso/vgetcpu.c deleted file mode 100644 index 91f6e85d0fc2..000000000000 --- a/arch/x86_64/vdso/vgetcpu.c +++ /dev/null | |||
@@ -1,50 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2006 Andi Kleen, SUSE Labs. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Fast user context implementation of getcpu() | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/getcpu.h> | ||
10 | #include <linux/jiffies.h> | ||
11 | #include <linux/time.h> | ||
12 | #include <asm/vsyscall.h> | ||
13 | #include <asm/vgtod.h> | ||
14 | #include "vextern.h" | ||
15 | |||
16 | long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) | ||
17 | { | ||
18 | unsigned int dummy, p; | ||
19 | unsigned long j = 0; | ||
20 | |||
21 | /* Fast cache - only recompute value once per jiffies and avoid | ||
22 | relatively costly rdtscp/cpuid otherwise. | ||
23 | This works because the scheduler usually keeps the process | ||
24 | on the same CPU and this syscall doesn't guarantee its | ||
25 | results anyways. | ||
26 | We do this here because otherwise user space would do it on | ||
27 | its own in a likely inferior way (no access to jiffies). | ||
28 | If you don't like it pass NULL. */ | ||
29 | if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) { | ||
30 | p = tcache->blob[1]; | ||
31 | } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { | ||
32 | /* Load per CPU data from RDTSCP */ | ||
33 | rdtscp(dummy, dummy, p); | ||
34 | } else { | ||
35 | /* Load per CPU data from GDT */ | ||
36 | asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); | ||
37 | } | ||
38 | if (tcache) { | ||
39 | tcache->blob[0] = j; | ||
40 | tcache->blob[1] = p; | ||
41 | } | ||
42 | if (cpu) | ||
43 | *cpu = p & 0xfff; | ||
44 | if (node) | ||
45 | *node = p >> 12; | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) | ||
50 | __attribute__((weak, alias("__vdso_getcpu"))); | ||
diff --git a/arch/x86_64/vdso/vma.c b/arch/x86_64/vdso/vma.c deleted file mode 100644 index ff9333e5fb08..000000000000 --- a/arch/x86_64/vdso/vma.c +++ /dev/null | |||
@@ -1,140 +0,0 @@ | |||
1 | /* | ||
2 | * Set up the VMAs to tell the VM about the vDSO. | ||
3 | * Copyright 2007 Andi Kleen, SUSE Labs. | ||
4 | * Subject to the GPL, v.2 | ||
5 | */ | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/err.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/random.h> | ||
11 | #include <asm/vsyscall.h> | ||
12 | #include <asm/vgtod.h> | ||
13 | #include <asm/proto.h> | ||
14 | #include "voffset.h" | ||
15 | |||
16 | int vdso_enabled = 1; | ||
17 | |||
18 | #define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x; | ||
19 | #include "vextern.h" | ||
20 | #undef VEXTERN | ||
21 | |||
22 | extern char vdso_kernel_start[], vdso_start[], vdso_end[]; | ||
23 | extern unsigned short vdso_sync_cpuid; | ||
24 | |||
25 | struct page **vdso_pages; | ||
26 | |||
27 | static inline void *var_ref(void *vbase, char *var, char *name) | ||
28 | { | ||
29 | unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET; | ||
30 | void *p = vbase + offset; | ||
31 | if (*(void **)p != (void *)VMAGIC) { | ||
32 | printk("VDSO: variable %s broken\n", name); | ||
33 | vdso_enabled = 0; | ||
34 | } | ||
35 | return p; | ||
36 | } | ||
37 | |||
38 | static int __init init_vdso_vars(void) | ||
39 | { | ||
40 | int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; | ||
41 | int i; | ||
42 | char *vbase; | ||
43 | |||
44 | vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); | ||
45 | if (!vdso_pages) | ||
46 | goto oom; | ||
47 | for (i = 0; i < npages; i++) { | ||
48 | struct page *p; | ||
49 | p = alloc_page(GFP_KERNEL); | ||
50 | if (!p) | ||
51 | goto oom; | ||
52 | vdso_pages[i] = p; | ||
53 | copy_page(page_address(p), vdso_start + i*PAGE_SIZE); | ||
54 | } | ||
55 | |||
56 | vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL); | ||
57 | if (!vbase) | ||
58 | goto oom; | ||
59 | |||
60 | if (memcmp(vbase, "\177ELF", 4)) { | ||
61 | printk("VDSO: I'm broken; not ELF\n"); | ||
62 | vdso_enabled = 0; | ||
63 | } | ||
64 | |||
65 | #define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x) | ||
66 | #define VEXTERN(x) \ | ||
67 | V(vdso_ ## x) = &__ ## x; | ||
68 | #include "vextern.h" | ||
69 | #undef VEXTERN | ||
70 | return 0; | ||
71 | |||
72 | oom: | ||
73 | printk("Cannot allocate vdso\n"); | ||
74 | vdso_enabled = 0; | ||
75 | return -ENOMEM; | ||
76 | } | ||
77 | __initcall(init_vdso_vars); | ||
78 | |||
79 | struct linux_binprm; | ||
80 | |||
81 | /* Put the vdso above the (randomized) stack with another randomized offset. | ||
82 | This way there is no hole in the middle of address space. | ||
83 | To save memory make sure it is still in the same PTE as the stack top. | ||
84 | This doesn't give that many random bits */ | ||
85 | static unsigned long vdso_addr(unsigned long start, unsigned len) | ||
86 | { | ||
87 | unsigned long addr, end; | ||
88 | unsigned offset; | ||
89 | end = (start + PMD_SIZE - 1) & PMD_MASK; | ||
90 | if (end >= TASK_SIZE64) | ||
91 | end = TASK_SIZE64; | ||
92 | end -= len; | ||
93 | /* This loses some more bits than a modulo, but is cheaper */ | ||
94 | offset = get_random_int() & (PTRS_PER_PTE - 1); | ||
95 | addr = start + (offset << PAGE_SHIFT); | ||
96 | if (addr >= end) | ||
97 | addr = end; | ||
98 | return addr; | ||
99 | } | ||
100 | |||
101 | /* Setup a VMA at program startup for the vsyscall page. | ||
102 | Not called for compat tasks */ | ||
103 | int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) | ||
104 | { | ||
105 | struct mm_struct *mm = current->mm; | ||
106 | unsigned long addr; | ||
107 | int ret; | ||
108 | unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); | ||
109 | |||
110 | if (!vdso_enabled) | ||
111 | return 0; | ||
112 | |||
113 | down_write(&mm->mmap_sem); | ||
114 | addr = vdso_addr(mm->start_stack, len); | ||
115 | addr = get_unmapped_area(NULL, addr, len, 0, 0); | ||
116 | if (IS_ERR_VALUE(addr)) { | ||
117 | ret = addr; | ||
118 | goto up_fail; | ||
119 | } | ||
120 | |||
121 | ret = install_special_mapping(mm, addr, len, | ||
122 | VM_READ|VM_EXEC| | ||
123 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| | ||
124 | VM_ALWAYSDUMP, | ||
125 | vdso_pages); | ||
126 | if (ret) | ||
127 | goto up_fail; | ||
128 | |||
129 | current->mm->context.vdso = (void *)addr; | ||
130 | up_fail: | ||
131 | up_write(&mm->mmap_sem); | ||
132 | return ret; | ||
133 | } | ||
134 | |||
135 | static __init int vdso_setup(char *s) | ||
136 | { | ||
137 | vdso_enabled = simple_strtoul(s, NULL, 0); | ||
138 | return 0; | ||
139 | } | ||
140 | __setup("vdso=", vdso_setup); | ||
diff --git a/arch/x86_64/vdso/voffset.h b/arch/x86_64/vdso/voffset.h deleted file mode 100644 index 4af67c79085f..000000000000 --- a/arch/x86_64/vdso/voffset.h +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | #define VDSO_TEXT_OFFSET 0x600 | ||
diff --git a/arch/x86_64/vdso/vvar.c b/arch/x86_64/vdso/vvar.c deleted file mode 100644 index 6fc22219a472..000000000000 --- a/arch/x86_64/vdso/vvar.c +++ /dev/null | |||
@@ -1,12 +0,0 @@ | |||
1 | /* Define pointer to external vDSO variables. | ||
2 | These are part of the vDSO. The kernel fills in the real addresses | ||
3 | at boot time. This is done because when the vdso is linked the | ||
4 | kernel isn't yet and we don't know the final addresses. */ | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/time.h> | ||
7 | #include <asm/vsyscall.h> | ||
8 | #include <asm/timex.h> | ||
9 | #include <asm/vgtod.h> | ||
10 | |||
11 | #define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC; | ||
12 | #include "vextern.h" | ||