Diffstat (limited to 'arch/x86_64/kernel')
53 files changed, 20636 insertions, 0 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
new file mode 100644
index 000000000000..0a3318e08ab6
--- /dev/null
+++ b/arch/x86_64/kernel/Makefile
@@ -0,0 +1,45 @@
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
5 | extra-y := head.o head64.o init_task.o vmlinux.lds | ||
6 | EXTRA_AFLAGS := -traditional | ||
7 | obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ | ||
8 | ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ | ||
9 | x8664_ksyms.o i387.o syscall.o vsyscall.o \ | ||
10 | setup64.o bootflag.o e820.o reboot.o quirks.o | ||
11 | |||
12 | obj-$(CONFIG_X86_MCE) += mce.o | ||
13 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o | ||
14 | obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ | ||
15 | obj-$(CONFIG_ACPI_BOOT) += acpi/ | ||
16 | obj-$(CONFIG_X86_MSR) += msr.o | ||
17 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
18 | obj-$(CONFIG_X86_CPUID) += cpuid.o | ||
19 | obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o | ||
20 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o | ||
21 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ | ||
22 | genapic.o genapic_cluster.o genapic_flat.o | ||
23 | obj-$(CONFIG_PM) += suspend.o | ||
24 | obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o | ||
25 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | ||
26 | obj-$(CONFIG_EARLY_PRINTK) += early_printk.o | ||
27 | obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o | ||
28 | obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o | ||
29 | obj-$(CONFIG_SWIOTLB) += swiotlb.o | ||
30 | obj-$(CONFIG_KPROBES) += kprobes.o | ||
31 | |||
32 | obj-$(CONFIG_MODULES) += module.o | ||
33 | |||
34 | obj-y += topology.o | ||
35 | obj-y += intel_cacheinfo.o | ||
36 | |||
37 | CFLAGS_vsyscall.o := $(PROFILING) -g0 | ||
38 | |||
39 | bootflag-y += ../../i386/kernel/bootflag.o | ||
40 | cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o | ||
41 | topology-y += ../../i386/mach-default/topology.o | ||
42 | swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o | ||
43 | microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o | ||
44 | intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o | ||
45 | quirks-y += ../../i386/kernel/quirks.o | ||
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
new file mode 100644
index 000000000000..d2c2ee5f9a88
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -0,0 +1,3 @@
1 | obj-$(CONFIG_ACPI_BOOT) := boot.o | ||
2 | boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o | ||
3 | obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o | ||
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..7a275de6df22
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -0,0 +1,132 @@
1 | /* | ||
2 | * acpi.c - Architecture-Specific Low-Level ACPI Support | ||
3 | * | ||
4 | * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> | ||
5 | * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> | ||
6 | * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> | ||
7 | * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) | ||
8 | * Copyright (C) 2003 Pavel Machek, SuSE Labs | ||
9 | * | ||
10 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License as published by | ||
14 | * the Free Software Foundation; either version 2 of the License, or | ||
15 | * (at your option) any later version. | ||
16 | * | ||
17 | * This program is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
20 | * GNU General Public License for more details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU General Public License | ||
23 | * along with this program; if not, write to the Free Software | ||
24 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
25 | * | ||
26 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
27 | */ | ||
28 | |||
29 | #include <linux/config.h> | ||
30 | #include <linux/kernel.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/types.h> | ||
33 | #include <linux/stddef.h> | ||
34 | #include <linux/slab.h> | ||
35 | #include <linux/pci.h> | ||
36 | #include <linux/bootmem.h> | ||
37 | #include <linux/irq.h> | ||
38 | #include <linux/acpi.h> | ||
39 | #include <asm/mpspec.h> | ||
40 | #include <asm/io.h> | ||
41 | #include <asm/apic.h> | ||
42 | #include <asm/apicdef.h> | ||
43 | #include <asm/page.h> | ||
44 | #include <asm/pgtable.h> | ||
45 | #include <asm/pgalloc.h> | ||
46 | #include <asm/io_apic.h> | ||
47 | #include <asm/proto.h> | ||
48 | #include <asm/tlbflush.h> | ||
49 | |||
50 | |||
51 | /* -------------------------------------------------------------------------- | ||
52 | Low-Level Sleep Support | ||
53 | -------------------------------------------------------------------------- */ | ||
54 | |||
55 | #ifdef CONFIG_ACPI_SLEEP | ||
56 | |||
57 | /* address in low memory of the wakeup routine. */ | ||
58 | unsigned long acpi_wakeup_address = 0; | ||
59 | unsigned long acpi_video_flags; | ||
60 | extern char wakeup_start, wakeup_end; | ||
61 | |||
62 | extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); | ||
63 | |||
64 | static pgd_t low_ptr; | ||
65 | |||
66 | static void init_low_mapping(void) | ||
67 | { | ||
68 | pgd_t *slot0 = pgd_offset(current->mm, 0UL); | ||
69 | low_ptr = *slot0; | ||
70 | set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); | ||
71 | flush_tlb_all(); | ||
72 | } | ||
73 | |||
74 | /** | ||
75 | * acpi_save_state_mem - save kernel state | ||
76 | * | ||
77 | * Create an identity mapped page table and copy the wakeup routine to | ||
78 | * low memory. | ||
79 | */ | ||
80 | int acpi_save_state_mem (void) | ||
81 | { | ||
82 | init_low_mapping(); | ||
83 | |||
84 | memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); | ||
85 | acpi_copy_wakeup_routine(acpi_wakeup_address); | ||
86 | |||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * acpi_restore_state | ||
92 | */ | ||
93 | void acpi_restore_state_mem (void) | ||
94 | { | ||
95 | set_pgd(pgd_offset(current->mm, 0UL), low_ptr); | ||
96 | flush_tlb_all(); | ||
97 | } | ||
98 | |||
99 | /** | ||
100 | * acpi_reserve_bootmem - do _very_ early ACPI initialisation | ||
101 | * | ||
102 | * We allocate a page in low memory for the wakeup | ||
103 | * routine for when we come back from a sleep state. The | ||
104 | * runtime allocator allows specification of <16M pages, but not | ||
105 | * <1M pages. | ||
106 | */ | ||
107 | void __init acpi_reserve_bootmem(void) | ||
108 | { | ||
109 | acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); | ||
110 | if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) | ||
111 | printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); | ||
112 | } | ||
113 | |||
114 | static int __init acpi_sleep_setup(char *str) | ||
115 | { | ||
116 | while ((str != NULL) && (*str != '\0')) { | ||
117 | if (strncmp(str, "s3_bios", 7) == 0) | ||
118 | acpi_video_flags = 1; | ||
119 | if (strncmp(str, "s3_mode", 7) == 0) | ||
120 | acpi_video_flags |= 2; | ||
121 | str = strchr(str, ','); | ||
122 | if (str != NULL) | ||
123 | str += strspn(str, ", \t"); | ||
124 | } | ||
125 | return 1; | ||
126 | } | ||
127 | |||
128 | __setup("acpi_sleep=", acpi_sleep_setup); | ||
129 | |||
130 | #endif /*CONFIG_ACPI_SLEEP*/ | ||
131 | |||
132 | void acpi_pci_link_exit(void) {} | ||
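Taken together, the three hooks above run at different points in a suspend cycle: acpi_reserve_bootmem() reserves a low page for the wakeup stub at boot, acpi_save_state_mem() builds the temporary identity mapping and copies the real-mode wakeup code just before the machine sleeps, and acpi_restore_state_mem() removes that mapping after resume. A minimal sketch of the ordering, assuming a hypothetical enter_s3_sketch() wrapper (the real caller is the generic ACPI sleep code):

	extern unsigned long acpi_wakeup_address;
	extern int acpi_save_state_mem(void);
	extern void acpi_restore_state_mem(void);

	static int enter_s3_sketch(void)	/* hypothetical caller, for illustration */
	{
		if (!acpi_wakeup_address)	/* acpi_reserve_bootmem() must have run at boot */
			return -1;
		if (acpi_save_state_mem())	/* map page 0, copy wakeup_start..wakeup_end low */
			return -1;
		/* firmware sleeps here; on wakeup the BIOS jumps, in real mode,
		   to the code copied to acpi_wakeup_address */
		acpi_restore_state_mem();	/* undo the temporary low mapping */
		return 0;
	}

The acpi_sleep= boot option parsed above ("s3_bios", "s3_mode", comma-separated) sets acpi_video_flags, which the real-mode stub in wakeup.S consults to re-run the video BIOS and/or restore the video mode on resume.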
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
new file mode 100644
index 000000000000..a4c630034cd4
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -0,0 +1,527 @@
1 | .text | ||
2 | #include <linux/linkage.h> | ||
3 | #include <asm/segment.h> | ||
4 | #include <asm/page.h> | ||
5 | #include <asm/msr.h> | ||
6 | |||
7 | # Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 | ||
8 | # | ||
9 | # wakeup_code runs in real mode, and at an unknown address (determined at run-time).
10 | # Therefore it must only use relative jumps/calls.
11 | #
12 | # Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled
13 | # | ||
14 | # If physical address of wakeup_code is 0x12345, BIOS should call us with | ||
15 | # cs = 0x1234, eip = 0x05 | ||
16 | # | ||
17 | |||
18 | |||
19 | ALIGN | ||
20 | .align 16 | ||
21 | ENTRY(wakeup_start) | ||
22 | wakeup_code: | ||
23 | wakeup_code_start = . | ||
24 | .code16 | ||
25 | |||
26 | # Running in *copy* of this code, somewhere in low 1MB. | ||
27 | |||
28 | movb $0xa1, %al ; outb %al, $0x80 | ||
29 | cli | ||
30 | cld | ||
31 | # setup data segment | ||
32 | movw %cs, %ax | ||
33 | movw %ax, %ds # Make ds:0 point to wakeup_start | ||
34 | movw %ax, %ss | ||
35 | mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board | ||
36 | |||
37 | pushl $0 # Kill any dangerous flags | ||
38 | popfl | ||
39 | |||
40 | movl real_magic - wakeup_code, %eax | ||
41 | cmpl $0x12345678, %eax | ||
42 | jne bogus_real_magic | ||
43 | |||
44 | testl $1, video_flags - wakeup_code | ||
45 | jz 1f | ||
46 | lcall $0xc000,$3 | ||
47 | movw %cs, %ax | ||
48 | movw %ax, %ds # Bios might have played with that | ||
49 | movw %ax, %ss | ||
50 | 1: | ||
51 | |||
52 | testl $2, video_flags - wakeup_code | ||
53 | jz 1f | ||
54 | mov video_mode - wakeup_code, %ax | ||
55 | call mode_seta | ||
56 | 1: | ||
57 | |||
58 | movw $0xb800, %ax | ||
59 | movw %ax,%fs | ||
60 | movw $0x0e00 + 'L', %fs:(0x10) | ||
61 | |||
62 | movb $0xa2, %al ; outb %al, $0x80 | ||
63 | |||
64 | lidt %ds:idt_48a - wakeup_code | ||
65 | xorl %eax, %eax | ||
66 | movw %ds, %ax # (Convert %ds:gdt to a linear ptr) | ||
67 | shll $4, %eax | ||
68 | addl $(gdta - wakeup_code), %eax | ||
69 | movl %eax, gdt_48a +2 - wakeup_code | ||
70 | lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is | ||
71 | # appropriate | ||
72 | |||
73 | movl $1, %eax # protected mode (PE) bit | ||
74 | lmsw %ax # This is it! | ||
75 | jmp 1f | ||
76 | 1: | ||
77 | |||
78 | .byte 0x66, 0xea # prefix + jmpi-opcode | ||
79 | .long wakeup_32 - __START_KERNEL_map | ||
80 | .word __KERNEL_CS | ||
81 | |||
82 | .code32 | ||
83 | wakeup_32: | ||
84 | # Running in this code, but at low address; paging is not yet turned on. | ||
85 | movb $0xa5, %al ; outb %al, $0x80 | ||
86 | |||
87 | /* Check if extended functions are implemented */ | ||
88 | movl $0x80000000, %eax | ||
89 | cpuid | ||
90 | cmpl $0x80000000, %eax | ||
91 | jbe bogus_cpu | ||
92 | wbinvd | ||
93 | mov $0x80000001, %eax | ||
94 | cpuid | ||
95 | btl $29, %edx | ||
96 | jnc bogus_cpu | ||
97 | movl %edx,%edi | ||
98 | |||
99 | movw $__KERNEL_DS, %ax | ||
100 | movw %ax, %ds | ||
101 | movw %ax, %es | ||
102 | movw %ax, %fs | ||
103 | movw %ax, %gs | ||
104 | |||
105 | movw $__KERNEL_DS, %ax | ||
106 | movw %ax, %ss | ||
107 | |||
108 | mov $(wakeup_stack - __START_KERNEL_map), %esp | ||
109 | movl saved_magic - __START_KERNEL_map, %eax | ||
110 | cmpl $0x9abcdef0, %eax | ||
111 | jne bogus_32_magic | ||
112 | |||
113 | /* | ||
114 | * Prepare for entering 64-bit mode
115 | */ | ||
116 | |||
117 | /* Enable PAE mode and PGE */ | ||
118 | xorl %eax, %eax | ||
119 | btsl $5, %eax | ||
120 | btsl $7, %eax | ||
121 | movl %eax, %cr4 | ||
122 | |||
123 | /* Setup early boot stage 4 level pagetables */ | ||
124 | movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax | ||
125 | movl %eax, %cr3 | ||
126 | |||
127 | /* Setup EFER (Extended Feature Enable Register) */ | ||
128 | movl $MSR_EFER, %ecx | ||
129 | rdmsr | ||
130 | /* Fool rdmsr and reset %eax to avoid dependences */ | ||
131 | xorl %eax, %eax | ||
132 | /* Enable Long Mode */ | ||
133 | btsl $_EFER_LME, %eax | ||
134 | /* Enable System Call */ | ||
135 | btsl $_EFER_SCE, %eax | ||
136 | |||
137 | /* No Execute supported? */ | ||
138 | btl $20,%edi | ||
139 | jnc 1f | ||
140 | btsl $_EFER_NX, %eax | ||
141 | 1: | ||
142 | |||
143 | /* Make changes effective */ | ||
144 | wrmsr | ||
145 | wbinvd | ||
146 | |||
147 | xorl %eax, %eax | ||
148 | btsl $31, %eax /* Enable paging and in turn activate Long Mode */ | ||
149 | btsl $0, %eax /* Enable protected mode */ | ||
150 | btsl $1, %eax /* Enable MP */ | ||
151 | btsl $4, %eax /* Enable ET */ | ||
152 | btsl $5, %eax /* Enable NE */ | ||
153 | btsl $16, %eax /* Enable WP */ | ||
154 | btsl $18, %eax /* Enable AM */ | ||
155 | |||
156 | /* Make changes effective */ | ||
157 | movl %eax, %cr0 | ||
158 | /* At this point: | ||
159 | CR4.PAE must be 1 | ||
160 | CS.L must be 0 | ||
161 | CR3 must point to PML4 | ||
162 | Next instruction must be a branch | ||
163 | This must be on identity-mapped page | ||
164 | */ | ||
165 | jmp reach_compatibility_mode | ||
166 | reach_compatibility_mode: | ||
167 | movw $0x0e00 + 'i', %ds:(0xb8012) | ||
168 | movb $0xa8, %al ; outb %al, $0x80; | ||
169 | |||
170 | /* | ||
171 | * At this point we're in long mode but in 32-bit compatibility mode
172 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
173 | * EFER.LMA = 1). Now we want to jump into 64-bit mode; to do that we load
174 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
175 | */ | ||
176 | |||
177 | movw $0x0e00 + 'n', %ds:(0xb8014) | ||
178 | movb $0xa9, %al ; outb %al, $0x80 | ||
179 | |||
180 | /* Load new GDT with the 64bit segment using 32bit descriptor */ | ||
181 | movl $(pGDT32 - __START_KERNEL_map), %eax | ||
182 | lgdt (%eax) | ||
183 | |||
184 | movl $(wakeup_jumpvector - __START_KERNEL_map), %eax | ||
185 | /* Finally jump in 64bit mode */ | ||
186 | ljmp *(%eax) | ||
187 | |||
188 | wakeup_jumpvector: | ||
189 | .long wakeup_long64 - __START_KERNEL_map | ||
190 | .word __KERNEL_CS | ||
191 | |||
192 | .code64 | ||
193 | |||
194 | /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ | ||
195 | wakeup_long64: | ||
196 | /* | ||
197 | * We must switch to a new descriptor in kernel space for the GDT
198 | * because soon the kernel will no longer have access to the userspace
199 | * addresses we are currently running at. We have to do that here
200 | * because in 32-bit mode we couldn't load a 64-bit linear address.
201 | */ | ||
202 | lgdt cpu_gdt_descr - __START_KERNEL_map | ||
203 | |||
204 | movw $0x0e00 + 'u', %ds:(0xb8016) | ||
205 | |||
206 | nop | ||
207 | nop | ||
208 | movw $__KERNEL_DS, %ax | ||
209 | movw %ax, %ss | ||
210 | movw %ax, %ds | ||
211 | movw %ax, %es | ||
212 | movw %ax, %fs | ||
213 | movw %ax, %gs | ||
214 | movq saved_esp, %rsp | ||
215 | |||
216 | movw $0x0e00 + 'x', %ds:(0xb8018) | ||
217 | movq saved_ebx, %rbx | ||
218 | movq saved_edi, %rdi | ||
219 | movq saved_esi, %rsi | ||
220 | movq saved_ebp, %rbp | ||
221 | |||
222 | movw $0x0e00 + '!', %ds:(0xb801a) | ||
223 | movq saved_eip, %rax | ||
224 | jmp *%rax | ||
225 | |||
226 | .code32 | ||
227 | |||
228 | .align 64 | ||
229 | gdta: | ||
230 | .word 0, 0, 0, 0 # dummy | ||
231 | |||
232 | .word 0, 0, 0, 0 # unused | ||
233 | |||
234 | .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) | ||
235 | .word 0 # base address = 0 | ||
236 | .word 0x9B00 # code read/exec. ??? Why is 0x9B00 (as opposed to 0x9A00) needed for this to work?
237 | .word 0x00CF # granularity = 4096, 386 | ||
238 | # (+5th nibble of limit) | ||
239 | |||
240 | .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) | ||
241 | .word 0 # base address = 0 | ||
242 | .word 0x9200 # data read/write | ||
243 | .word 0x00CF # granularity = 4096, 386 | ||
244 | # (+5th nibble of limit) | ||
245 | # this is 64bit descriptor for code | ||
246 | .word 0xFFFF | ||
247 | .word 0 | ||
248 | .word 0x9A00 # code read/exec | ||
249 | .word 0x00AF # as above, but it is long mode and with D=0 | ||
250 | |||
251 | idt_48a: | ||
252 | .word 0 # idt limit = 0 | ||
253 | .word 0, 0 # idt base = 0L | ||
254 | |||
255 | gdt_48a: | ||
256 | .word 0x8000 # gdt limit=2048, | ||
257 | # 256 GDT entries | ||
258 | .word 0, 0 # gdt base (filled in later) | ||
259 | |||
260 | |||
261 | real_save_gdt: .word 0 | ||
262 | .quad 0 | ||
263 | real_magic: .quad 0 | ||
264 | video_mode: .quad 0 | ||
265 | video_flags: .quad 0 | ||
266 | |||
267 | bogus_real_magic: | ||
268 | movb $0xba,%al ; outb %al,$0x80 | ||
269 | jmp bogus_real_magic | ||
270 | |||
271 | bogus_32_magic: | ||
272 | movb $0xb3,%al ; outb %al,$0x80 | ||
273 | jmp bogus_32_magic | ||
274 | |||
275 | bogus_31_magic: | ||
276 | movb $0xb1,%al ; outb %al,$0x80 | ||
277 | jmp bogus_31_magic | ||
278 | |||
279 | bogus_cpu: | ||
280 | movb $0xbc,%al ; outb %al,$0x80 | ||
281 | jmp bogus_cpu | ||
282 | |||
283 | |||
284 | /* This code uses an extended set of video mode numbers. These include: | ||
285 | * Aliases for standard modes | ||
286 | * NORMAL_VGA (-1) | ||
287 | * EXTENDED_VGA (-2) | ||
288 | * ASK_VGA (-3) | ||
289 | * Video modes numbered by menu position -- NOT RECOMMENDED because of lack | ||
290 | * of compatibility when extending the table. These are between 0x00 and 0xff. | ||
291 | */ | ||
292 | #define VIDEO_FIRST_MENU 0x0000 | ||
293 | |||
294 | /* Standard BIOS video modes (BIOS number + 0x0100) */ | ||
295 | #define VIDEO_FIRST_BIOS 0x0100 | ||
296 | |||
297 | /* VESA BIOS video modes (VESA number + 0x0200) */ | ||
298 | #define VIDEO_FIRST_VESA 0x0200 | ||
299 | |||
300 | /* Video7 special modes (BIOS number + 0x0900) */ | ||
301 | #define VIDEO_FIRST_V7 0x0900 | ||
302 | |||
303 | # Setting of user mode (AX=mode ID) => CF=success | ||
304 | mode_seta: | ||
305 | movw %ax, %bx | ||
306 | #if 0 | ||
307 | cmpb $0xff, %ah | ||
308 | jz setalias | ||
309 | |||
310 | testb $VIDEO_RECALC>>8, %ah | ||
311 | jnz _setrec | ||
312 | |||
313 | cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah | ||
314 | jnc setres | ||
315 | |||
316 | cmpb $VIDEO_FIRST_SPECIAL>>8, %ah | ||
317 | jz setspc | ||
318 | |||
319 | cmpb $VIDEO_FIRST_V7>>8, %ah | ||
320 | jz setv7 | ||
321 | #endif | ||
322 | |||
323 | cmpb $VIDEO_FIRST_VESA>>8, %ah | ||
324 | jnc check_vesaa | ||
325 | #if 0 | ||
326 | orb %ah, %ah | ||
327 | jz setmenu | ||
328 | #endif | ||
329 | |||
330 | decb %ah | ||
331 | # jz setbios Add bios modes later | ||
332 | |||
333 | setbada: clc | ||
334 | ret | ||
335 | |||
336 | check_vesaa: | ||
337 | subb $VIDEO_FIRST_VESA>>8, %bh | ||
338 | orw $0x4000, %bx # Use linear frame buffer | ||
339 | movw $0x4f02, %ax # VESA BIOS mode set call | ||
340 | int $0x10 | ||
341 | cmpw $0x004f, %ax # AL=4f if implemented | ||
342 | jnz _setbada # AH=0 if OK | ||
343 | |||
344 | stc | ||
345 | ret | ||
346 | |||
347 | _setbada: jmp setbada | ||
348 | |||
349 | .code64 | ||
350 | bogus_magic: | ||
351 | movw $0x0e00 + 'B', %ds:(0xb8018) | ||
352 | jmp bogus_magic | ||
353 | |||
354 | bogus_magic2: | ||
355 | movw $0x0e00 + '2', %ds:(0xb8018) | ||
356 | jmp bogus_magic2 | ||
357 | |||
358 | |||
359 | wakeup_stack_begin: # Stack grows down | ||
360 | |||
361 | .org 0xff0 | ||
362 | wakeup_stack: # Just below end of page | ||
363 | |||
364 | ENTRY(wakeup_end) | ||
365 | |||
366 | ## | ||
367 | # acpi_copy_wakeup_routine | ||
368 | # | ||
369 | # Copy the above routine to low memory. | ||
370 | # | ||
371 | # Parameters: | ||
372 | # %rdi: place to copy wakeup routine to | ||
373 | # | ||
374 | # Returned address is location of code in low memory (past data and stack) | ||
375 | # | ||
376 | ENTRY(acpi_copy_wakeup_routine) | ||
377 | pushq %rax | ||
378 | pushq %rcx | ||
379 | pushq %rdx | ||
380 | |||
381 | sgdt saved_gdt | ||
382 | sidt saved_idt | ||
383 | sldt saved_ldt | ||
384 | str saved_tss | ||
385 | |||
386 | movq %cr3, %rdx | ||
387 | movq %rdx, saved_cr3 | ||
388 | movq %cr4, %rdx | ||
389 | movq %rdx, saved_cr4 | ||
390 | movq %cr0, %rdx | ||
391 | movq %rdx, saved_cr0 | ||
392 | sgdt real_save_gdt - wakeup_start (,%rdi) | ||
393 | movl $MSR_EFER, %ecx | ||
394 | rdmsr | ||
395 | movl %eax, saved_efer | ||
396 | movl %edx, saved_efer2 | ||
397 | |||
398 | movl saved_video_mode, %edx | ||
399 | movl %edx, video_mode - wakeup_start (,%rdi) | ||
400 | movl acpi_video_flags, %edx | ||
401 | movl %edx, video_flags - wakeup_start (,%rdi) | ||
402 | movq $0x12345678, real_magic - wakeup_start (,%rdi) | ||
403 | movq $0x123456789abcdef0, %rdx | ||
404 | movq %rdx, saved_magic | ||
405 | |||
406 | movl saved_magic - __START_KERNEL_map, %eax | ||
407 | cmpl $0x9abcdef0, %eax | ||
408 | jne bogus_32_magic | ||
409 | |||
410 | # make sure %cr4 is set correctly (features, etc) | ||
411 | movl saved_cr4 - __START_KERNEL_map, %eax | ||
412 | movq %rax, %cr4 | ||
413 | |||
414 | movl saved_cr0 - __START_KERNEL_map, %eax | ||
415 | movq %rax, %cr0 | ||
416 | jmp 1f # Flush pipelines | ||
417 | 1: | ||
418 | # restore the regs we used | ||
419 | popq %rdx | ||
420 | popq %rcx | ||
421 | popq %rax | ||
422 | ENTRY(do_suspend_lowlevel_s4bios) | ||
423 | ret | ||
424 | |||
425 | .align 2 | ||
426 | .p2align 4,,15 | ||
427 | .globl do_suspend_lowlevel | ||
428 | .type do_suspend_lowlevel,@function | ||
429 | do_suspend_lowlevel: | ||
430 | .LFB5: | ||
431 | subq $8, %rsp | ||
432 | xorl %eax, %eax | ||
433 | call save_processor_state | ||
434 | |||
435 | movq %rsp, saved_context_esp(%rip) | ||
436 | movq %rax, saved_context_eax(%rip) | ||
437 | movq %rbx, saved_context_ebx(%rip) | ||
438 | movq %rcx, saved_context_ecx(%rip) | ||
439 | movq %rdx, saved_context_edx(%rip) | ||
440 | movq %rbp, saved_context_ebp(%rip) | ||
441 | movq %rsi, saved_context_esi(%rip) | ||
442 | movq %rdi, saved_context_edi(%rip) | ||
443 | movq %r8, saved_context_r08(%rip) | ||
444 | movq %r9, saved_context_r09(%rip) | ||
445 | movq %r10, saved_context_r10(%rip) | ||
446 | movq %r11, saved_context_r11(%rip) | ||
447 | movq %r12, saved_context_r12(%rip) | ||
448 | movq %r13, saved_context_r13(%rip) | ||
449 | movq %r14, saved_context_r14(%rip) | ||
450 | movq %r15, saved_context_r15(%rip) | ||
451 | pushfq ; popq saved_context_eflags(%rip) | ||
452 | |||
453 | movq $.L97, saved_eip(%rip) | ||
454 | |||
455 | movq %rsp,saved_esp | ||
456 | movq %rbp,saved_ebp | ||
457 | movq %rbx,saved_ebx | ||
458 | movq %rdi,saved_edi | ||
459 | movq %rsi,saved_esi | ||
460 | |||
461 | addq $8, %rsp | ||
462 | movl $3, %edi | ||
463 | xorl %eax, %eax | ||
464 | jmp acpi_enter_sleep_state | ||
465 | .L97: | ||
466 | .p2align 4,,7 | ||
467 | .L99: | ||
468 | .align 4 | ||
469 | movl $24, %eax | ||
470 | movw %ax, %ds | ||
471 | movq saved_context+58(%rip), %rax | ||
472 | movq %rax, %cr4 | ||
473 | movq saved_context+50(%rip), %rax | ||
474 | movq %rax, %cr3 | ||
475 | movq saved_context+42(%rip), %rax | ||
476 | movq %rax, %cr2 | ||
477 | movq saved_context+34(%rip), %rax | ||
478 | movq %rax, %cr0 | ||
479 | pushq saved_context_eflags(%rip) ; popfq | ||
480 | movq saved_context_esp(%rip), %rsp | ||
481 | movq saved_context_ebp(%rip), %rbp | ||
482 | movq saved_context_eax(%rip), %rax | ||
483 | movq saved_context_ebx(%rip), %rbx | ||
484 | movq saved_context_ecx(%rip), %rcx | ||
485 | movq saved_context_edx(%rip), %rdx | ||
486 | movq saved_context_esi(%rip), %rsi | ||
487 | movq saved_context_edi(%rip), %rdi | ||
488 | movq saved_context_r08(%rip), %r8 | ||
489 | movq saved_context_r09(%rip), %r9 | ||
490 | movq saved_context_r10(%rip), %r10 | ||
491 | movq saved_context_r11(%rip), %r11 | ||
492 | movq saved_context_r12(%rip), %r12 | ||
493 | movq saved_context_r13(%rip), %r13 | ||
494 | movq saved_context_r14(%rip), %r14 | ||
495 | movq saved_context_r15(%rip), %r15 | ||
496 | |||
497 | xorl %eax, %eax | ||
498 | addq $8, %rsp | ||
499 | jmp restore_processor_state | ||
500 | .LFE5: | ||
501 | .Lfe5: | ||
502 | .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel | ||
503 | |||
504 | .data | ||
505 | ALIGN | ||
506 | ENTRY(saved_ebp) .quad 0 | ||
507 | ENTRY(saved_esi) .quad 0 | ||
508 | ENTRY(saved_edi) .quad 0 | ||
509 | ENTRY(saved_ebx) .quad 0 | ||
510 | |||
511 | ENTRY(saved_eip) .quad 0 | ||
512 | ENTRY(saved_esp) .quad 0 | ||
513 | |||
514 | ENTRY(saved_magic) .quad 0 | ||
515 | |||
516 | ALIGN | ||
517 | # saved registers | ||
518 | saved_gdt: .quad 0,0 | ||
519 | saved_idt: .quad 0,0 | ||
520 | saved_ldt: .quad 0 | ||
521 | saved_tss: .quad 0 | ||
522 | |||
523 | saved_cr0: .quad 0 | ||
524 | saved_cr3: .quad 0 | ||
525 | saved_cr4: .quad 0 | ||
526 | saved_efer: .quad 0 | ||
527 | saved_efer2: .quad 0 | ||
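Stripped of the debug output, the 32-bit stub above performs the standard long-mode entry sequence: enable PAE, load CR3 with the wakeup PML4, set EFER.LME (plus SCE, and NX when CPUID reports it), enable paging, and finally far-jump through a 64-bit code segment. A C-style sketch of the same steps, with the architectural bit positions spelled out (the write_cr*/wrmsrl helpers are assumed here purely for illustration):

	#define CR4_PAE		(1UL << 5)	/* physical address extension */
	#define CR4_PGE		(1UL << 7)	/* global pages */
	#define EFER_LME	(1UL << 8)	/* long mode enable */
	#define EFER_SCE	(1UL << 0)	/* syscall/sysret enable */
	#define EFER_NX		(1UL << 11)	/* no-execute enable */
	#define CR0_PE		(1UL << 0)	/* protected mode */
	#define CR0_PG		(1UL << 31)	/* paging; with EFER.LME set this activates long mode */

	static void enter_long_mode_sketch(unsigned long pml4_phys, int have_nx)
	{
		write_cr4(CR4_PAE | CR4_PGE);		/* 1. PAE is mandatory for long mode */
		write_cr3(pml4_phys);			/* 2. point CR3 at wakeup_level4_pgt */
		wrmsrl(MSR_EFER, EFER_LME | EFER_SCE |
				 (have_nx ? EFER_NX : 0)); /* 3. request long mode */
		write_cr0(CR0_PE | CR0_PG);		/* 4. paging on -> EFER.LMA becomes 1
						              (the real code also sets MP, ET, NE, WP, AM) */
		/* 5. a far jump through a CS with L=1 (the 64-bit descriptor in gdta)
		      completes the switch, exactly as the ljmp in wakeup_32 does */
	}

Keeping the stub position-independent (relative jumps only, per the comment at the top of the file) is what allows the kernel to copy it to whatever low page acpi_reserve_bootmem() happened to obtain.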
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
new file mode 100644
index 000000000000..4baa99fe1e5c
--- /dev/null
+++ b/arch/x86_64/kernel/aperture.c
@@ -0,0 +1,286 @@
1 | /* | ||
2 | * Firmware replacement code. | ||
3 | * | ||
4 | * Work around broken BIOSes that don't set an aperture or only set the | ||
5 | * aperture in the AGP bridge. | ||
6 | * If all else fails, map the aperture over some low memory. This is cheaper than
7 | * doing bounce buffering. The memory is lost. This is done at early boot | ||
8 | * because only the bootmem allocator can allocate 32+MB. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ | ||
12 | */ | ||
13 | #include <linux/config.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/bootmem.h> | ||
18 | #include <linux/mmzone.h> | ||
19 | #include <linux/pci_ids.h> | ||
20 | #include <linux/pci.h> | ||
21 | #include <linux/bitops.h> | ||
22 | #include <asm/e820.h> | ||
23 | #include <asm/io.h> | ||
24 | #include <asm/proto.h> | ||
25 | #include <asm/pci-direct.h> | ||
26 | |||
27 | int iommu_aperture; | ||
28 | int iommu_aperture_disabled __initdata = 0; | ||
29 | int iommu_aperture_allowed __initdata = 0; | ||
30 | |||
31 | int fallback_aper_order __initdata = 1; /* 64MB */ | ||
32 | int fallback_aper_force __initdata = 0; | ||
33 | |||
34 | int fix_aperture __initdata = 1; | ||
35 | |||
36 | /* This code runs before the PCI subsystem is initialized, so just | ||
37 | access the northbridge directly. */ | ||
38 | |||
39 | #define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) | ||
40 | |||
41 | static u32 __init allocate_aperture(void) | ||
42 | { | ||
43 | #ifdef CONFIG_DISCONTIGMEM | ||
44 | pg_data_t *nd0 = NODE_DATA(0); | ||
45 | #else | ||
46 | pg_data_t *nd0 = &contig_page_data; | ||
47 | #endif | ||
48 | u32 aper_size; | ||
49 | void *p; | ||
50 | |||
51 | if (fallback_aper_order > 7) | ||
52 | fallback_aper_order = 7; | ||
53 | aper_size = (32 * 1024 * 1024) << fallback_aper_order; | ||
54 | |||
55 | /* | ||
56 | * Aperture has to be naturally aligned. This means a 2GB aperture won't
57 | * have much chance of finding a place in the lower 4GB of memory.
58 | * Unfortunately we cannot move it up because that would make the | ||
59 | * IOMMU useless. | ||
60 | */ | ||
61 | p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); | ||
62 | if (!p || __pa(p)+aper_size > 0xffffffff) { | ||
63 | printk("Cannot allocate aperture memory hole (%p,%uK)\n", | ||
64 | p, aper_size>>10); | ||
65 | if (p) | ||
66 | free_bootmem_node(nd0, (unsigned long)p, aper_size); | ||
67 | return 0; | ||
68 | } | ||
69 | printk("Mapping aperture over %d KB of RAM @ %lx\n", | ||
70 | aper_size >> 10, __pa(p)); | ||
71 | return (u32)__pa(p); | ||
72 | } | ||
73 | |||
74 | static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) | ||
75 | { | ||
76 | if (!aper_base) | ||
77 | return 0; | ||
78 | if (aper_size < 64*1024*1024) { | ||
79 | printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); | ||
80 | return 0; | ||
81 | } | ||
82 | if (aper_base + aper_size >= 0xffffffff) { | ||
83 | printk("Aperture from %s beyond 4GB. Ignoring.\n",name); | ||
84 | return 0; | ||
85 | } | ||
86 | if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { | ||
87 | printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); | ||
88 | return 0; | ||
89 | } | ||
90 | return 1; | ||
91 | } | ||
92 | |||
93 | /* Find a PCI capability */ | ||
94 | static __u32 __init find_cap(int num, int slot, int func, int cap) | ||
95 | { | ||
96 | u8 pos; | ||
97 | int bytes; | ||
98 | if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) | ||
99 | return 0; | ||
100 | pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); | ||
101 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { | ||
102 | u8 id; | ||
103 | pos &= ~3; | ||
104 | id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); | ||
105 | if (id == 0xff) | ||
106 | break; | ||
107 | if (id == cap) | ||
108 | return pos; | ||
109 | pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); | ||
110 | } | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | /* Read a standard AGPv3 bridge header */ | ||
115 | static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) | ||
116 | { | ||
117 | u32 apsize; | ||
118 | u32 apsizereg; | ||
119 | int nbits; | ||
120 | u32 aper_low, aper_hi; | ||
121 | u64 aper; | ||
122 | |||
123 | printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); | ||
124 | apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); | ||
125 | if (apsizereg == 0xffffffff) { | ||
126 | printk("APSIZE in AGP bridge unreadable\n"); | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | apsize = apsizereg & 0xfff; | ||
131 | /* Some BIOSes use weird encodings not in the AGPv3 table. */
132 | if (apsize & 0xff) | ||
133 | apsize |= 0xf00; | ||
134 | nbits = hweight16(apsize); | ||
135 | *order = 7 - nbits; | ||
136 | if ((int)*order < 0) /* < 32MB */ | ||
137 | *order = 0; | ||
138 | |||
139 | aper_low = read_pci_config(num,slot,func, 0x10); | ||
140 | aper_hi = read_pci_config(num,slot,func,0x14); | ||
141 | aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); | ||
142 | |||
143 | printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", | ||
144 | aper, 32 << *order, apsizereg); | ||
145 | |||
146 | if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) | ||
147 | return 0; | ||
148 | return (u32)aper; | ||
149 | } | ||
150 | |||
151 | /* Look for an AGP bridge. Windows only expects the aperture in the
152 | AGP bridge and some BIOSes forget to initialize the Northbridge too.
153 | Work around this here.
154 |
155 | Do a PCI bus scan by hand because we're running before the PCI
156 | subsystem.
157 |
158 | All K8 AGP bridges are AGPv3 compliant, so we can do this scan
159 | generically. It's probably overkill to always scan all slots because
160 | the AGP bridges should always be on a bus of their own in the HT hierarchy,
161 | but do it here for future safety. */
162 | static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) | ||
163 | { | ||
164 | int num, slot, func; | ||
165 | |||
166 | /* Poor man's PCI discovery */ | ||
167 | for (num = 0; num < 32; num++) { | ||
168 | for (slot = 0; slot < 32; slot++) { | ||
169 | for (func = 0; func < 8; func++) { | ||
170 | u32 class, cap; | ||
171 | u8 type; | ||
172 | class = read_pci_config(num,slot,func, | ||
173 | PCI_CLASS_REVISION); | ||
174 | if (class == 0xffffffff) | ||
175 | break; | ||
176 | |||
177 | switch (class >> 16) { | ||
178 | case PCI_CLASS_BRIDGE_HOST: | ||
179 | case PCI_CLASS_BRIDGE_OTHER: /* needed? */ | ||
180 | /* AGP bridge? */ | ||
181 | cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); | ||
182 | if (!cap) | ||
183 | break; | ||
184 | *valid_agp = 1; | ||
185 | return read_agp(num,slot,func,cap,order); | ||
186 | } | ||
187 | |||
188 | /* No multi-function device? */ | ||
189 | type = read_pci_config_byte(num,slot,func, | ||
190 | PCI_HEADER_TYPE); | ||
191 | if (!(type & 0x80)) | ||
192 | break; | ||
193 | } | ||
194 | } | ||
195 | } | ||
196 | printk("No AGP bridge found\n"); | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | void __init iommu_hole_init(void) | ||
201 | { | ||
202 | int fix, num; | ||
203 | u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; | ||
204 | u64 aper_base, last_aper_base = 0; | ||
205 | int valid_agp = 0; | ||
206 | |||
207 | if (iommu_aperture_disabled || !fix_aperture) | ||
208 | return; | ||
209 | |||
210 | printk("Checking aperture...\n"); | ||
211 | |||
212 | fix = 0; | ||
213 | for (num = 24; num < 32; num++) { | ||
214 | char name[30]; | ||
215 | if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) | ||
216 | continue; | ||
217 | |||
218 | iommu_aperture = 1; | ||
219 | |||
220 | aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; | ||
221 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
222 | aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; | ||
223 | aper_base <<= 25; | ||
224 | |||
225 | printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, | ||
226 | aper_base, aper_size>>20); | ||
227 | |||
228 | sprintf(name, "northbridge cpu %d", num-24); | ||
229 | |||
230 | if (!aperture_valid(name, aper_base, aper_size)) { | ||
231 | fix = 1; | ||
232 | break; | ||
233 | } | ||
234 | |||
235 | if ((last_aper_order && aper_order != last_aper_order) || | ||
236 | (last_aper_base && aper_base != last_aper_base)) { | ||
237 | fix = 1; | ||
238 | break; | ||
239 | } | ||
240 | last_aper_order = aper_order; | ||
241 | last_aper_base = aper_base; | ||
242 | } | ||
243 | |||
244 | if (!fix && !fallback_aper_force) | ||
245 | return; | ||
246 | |||
247 | if (!fallback_aper_force) | ||
248 | aper_alloc = search_agp_bridge(&aper_order, &valid_agp); | ||
249 | |||
250 | if (aper_alloc) { | ||
251 | /* Got the aperture from the AGP bridge */ | ||
252 | } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || | ||
253 | force_iommu || | ||
254 | valid_agp || | ||
255 | fallback_aper_force) { | ||
256 | printk("Your BIOS doesn't leave a aperture memory hole\n"); | ||
257 | printk("Please enable the IOMMU option in the BIOS setup\n"); | ||
258 | printk("This costs you %d MB of RAM\n", | ||
259 | 32 << fallback_aper_order); | ||
260 | |||
261 | aper_order = fallback_aper_order; | ||
262 | aper_alloc = allocate_aperture(); | ||
263 | if (!aper_alloc) { | ||
264 | /* Could disable AGP and IOMMU here, but it's probably | ||
265 | not worth it. But the later users cannot deal with | ||
266 | bad apertures and turning on the aperture over memory | ||
267 | causes very strange problems, so it's better to | ||
268 | panic early. */ | ||
269 | panic("Not enough memory for aperture"); | ||
270 | } | ||
271 | } else { | ||
272 | return; | ||
273 | } | ||
274 | |||
275 | /* Fix up the north bridges */ | ||
276 | for (num = 24; num < 32; num++) { | ||
277 | if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) | ||
278 | continue; | ||
279 | |||
280 | /* Don't enable translation yet. That is done later. | ||
281 | Assume this BIOS didn't initialise the GART so | ||
282 | just overwrite all previous bits */ | ||
283 | write_pci_config(0, num, 3, 0x90, aper_order<<1); | ||
284 | write_pci_config(0, num, 3, 0x94, aper_alloc>>25); | ||
285 | } | ||
286 | } | ||
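For reference, the northbridge registers manipulated above pack the GART aperture configuration tightly: bits 1-3 of config offset 0x90 hold the aperture order (size = 32 MB << order), and the low 15 bits of offset 0x94 hold the physical base shifted right by 25. A small decoding helper, mirroring the arithmetic in iommu_hole_init() (illustrative only, not part of the file):

	struct gart_aperture {
		u64 base;	/* physical base address */
		u64 size;	/* in bytes: 32 MB << order */
	};

	static struct gart_aperture decode_aperture(u32 reg90, u32 reg94)
	{
		struct gart_aperture ap;
		u32 order = (reg90 >> 1) & 7;			/* bits 1-3: aperture order */

		ap.size = (u64)(32 * 1024 * 1024) << order;	/* 32 MB ... 4 GB */
		ap.base = (u64)(reg94 & 0x7fff) << 25;		/* base address bits 39..25 */
		return ap;
	}

aperture_valid() then rejects anything smaller than 64 MB, reaching past 4 GB, or overlapping E820 RAM, and iommu_hole_init() falls back to allocate_aperture() when the northbridges disagree or hold junk.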
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
new file mode 100644
index 000000000000..9826f03dc9a3
--- /dev/null
+++ b/arch/x86_64/kernel/apic.c
@@ -0,0 +1,1088 @@
1 | /* | ||
2 | * Local APIC handling, local APIC timers | ||
3 | * | ||
4 | * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes | ||
7 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
8 | * thanks to Eric Gilmore | ||
9 | * and Rolf G. Tews | ||
10 | * for testing these extensively. | ||
11 | * Maciej W. Rozycki : Various updates and fixes. | ||
12 | * Mikael Pettersson : Power Management for UP-APIC. | ||
13 | * Pavel Machek and | ||
14 | * Mikael Pettersson : PM converted to driver model. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/init.h> | ||
19 | |||
20 | #include <linux/mm.h> | ||
21 | #include <linux/irq.h> | ||
22 | #include <linux/delay.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/smp_lock.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/mc146818rtc.h> | ||
27 | #include <linux/kernel_stat.h> | ||
28 | #include <linux/sysdev.h> | ||
29 | |||
30 | #include <asm/atomic.h> | ||
31 | #include <asm/smp.h> | ||
32 | #include <asm/mtrr.h> | ||
33 | #include <asm/mpspec.h> | ||
34 | #include <asm/pgalloc.h> | ||
35 | #include <asm/mach_apic.h> | ||
36 | |||
37 | int apic_verbosity; | ||
38 | |||
39 | int disable_apic_timer __initdata; | ||
40 | |||
41 | /* Using APIC to generate smp_local_timer_interrupt? */ | ||
42 | int using_apic_timer = 0; | ||
43 | |||
44 | static DEFINE_PER_CPU(int, prof_multiplier) = 1; | ||
45 | static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; | ||
46 | static DEFINE_PER_CPU(int, prof_counter) = 1; | ||
47 | |||
48 | static void apic_pm_activate(void); | ||
49 | |||
50 | void enable_NMI_through_LVT0 (void * dummy) | ||
51 | { | ||
52 | unsigned int v, ver; | ||
53 | |||
54 | ver = apic_read(APIC_LVR); | ||
55 | ver = GET_APIC_VERSION(ver); | ||
56 | v = APIC_DM_NMI; /* unmask and set to NMI */ | ||
57 | apic_write_around(APIC_LVT0, v); | ||
58 | } | ||
59 | |||
60 | int get_maxlvt(void) | ||
61 | { | ||
62 | unsigned int v, ver, maxlvt; | ||
63 | |||
64 | v = apic_read(APIC_LVR); | ||
65 | ver = GET_APIC_VERSION(v); | ||
66 | maxlvt = GET_APIC_MAXLVT(v); | ||
67 | return maxlvt; | ||
68 | } | ||
69 | |||
70 | void clear_local_APIC(void) | ||
71 | { | ||
72 | int maxlvt; | ||
73 | unsigned int v; | ||
74 | |||
75 | maxlvt = get_maxlvt(); | ||
76 | |||
77 | /* | ||
78 | * Masking an LVT entry on a P6 can trigger a local APIC error | ||
79 | * if the vector is zero. Mask LVTERR first to prevent this. | ||
80 | */ | ||
81 | if (maxlvt >= 3) { | ||
82 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ | ||
83 | apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); | ||
84 | } | ||
85 | /* | ||
86 | * Careful: we have to set masks only first to deassert | ||
87 | * any level-triggered sources. | ||
88 | */ | ||
89 | v = apic_read(APIC_LVTT); | ||
90 | apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | ||
91 | v = apic_read(APIC_LVT0); | ||
92 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | ||
93 | v = apic_read(APIC_LVT1); | ||
94 | apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); | ||
95 | if (maxlvt >= 4) { | ||
96 | v = apic_read(APIC_LVTPC); | ||
97 | apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Clean APIC state for other OSs: | ||
102 | */ | ||
103 | apic_write_around(APIC_LVTT, APIC_LVT_MASKED); | ||
104 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED); | ||
105 | apic_write_around(APIC_LVT1, APIC_LVT_MASKED); | ||
106 | if (maxlvt >= 3) | ||
107 | apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); | ||
108 | if (maxlvt >= 4) | ||
109 | apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); | ||
110 | v = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
111 | if (APIC_INTEGRATED(v)) { /* !82489DX */ | ||
112 | if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ | ||
113 | apic_write(APIC_ESR, 0); | ||
114 | apic_read(APIC_ESR); | ||
115 | } | ||
116 | } | ||
117 | |||
118 | void __init connect_bsp_APIC(void) | ||
119 | { | ||
120 | if (pic_mode) { | ||
121 | /* | ||
122 | * Do not trust the local APIC being empty at bootup. | ||
123 | */ | ||
124 | clear_local_APIC(); | ||
125 | /* | ||
126 | * PIC mode, enable APIC mode in the IMCR, i.e. | ||
127 | * connect BSP's local APIC to INT and NMI lines. | ||
128 | */ | ||
129 | apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); | ||
130 | outb(0x70, 0x22); | ||
131 | outb(0x01, 0x23); | ||
132 | } | ||
133 | } | ||
134 | |||
135 | void disconnect_bsp_APIC(void) | ||
136 | { | ||
137 | if (pic_mode) { | ||
138 | /* | ||
139 | * Put the board back into PIC mode (has an effect | ||
140 | * only on certain older boards). Note that APIC | ||
141 | * interrupts, including IPIs, won't work beyond | ||
142 | * this point! The only exception are INIT IPIs. | ||
143 | */ | ||
144 | apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); | ||
145 | outb(0x70, 0x22); | ||
146 | outb(0x00, 0x23); | ||
147 | } | ||
148 | } | ||
149 | |||
150 | void disable_local_APIC(void) | ||
151 | { | ||
152 | unsigned int value; | ||
153 | |||
154 | clear_local_APIC(); | ||
155 | |||
156 | /* | ||
157 | * Disable APIC (implies clearing of registers | ||
158 | * for 82489DX!). | ||
159 | */ | ||
160 | value = apic_read(APIC_SPIV); | ||
161 | value &= ~APIC_SPIV_APIC_ENABLED; | ||
162 | apic_write_around(APIC_SPIV, value); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * This is to verify that we're looking at a real local APIC. | ||
167 | * Check these against your board if the CPUs aren't getting | ||
168 | * started for no apparent reason. | ||
169 | */ | ||
170 | int __init verify_local_APIC(void) | ||
171 | { | ||
172 | unsigned int reg0, reg1; | ||
173 | |||
174 | /* | ||
175 | * The version register is read-only in a real APIC. | ||
176 | */ | ||
177 | reg0 = apic_read(APIC_LVR); | ||
178 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); | ||
179 | apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); | ||
180 | reg1 = apic_read(APIC_LVR); | ||
181 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); | ||
182 | |||
183 | /* | ||
184 | * The two version reads above should print the same | ||
185 | * numbers. If the second one is different, then we | ||
186 | * poke at a non-APIC. | ||
187 | */ | ||
188 | if (reg1 != reg0) | ||
189 | return 0; | ||
190 | |||
191 | /* | ||
192 | * Check if the version looks reasonable.
193 | */ | ||
194 | reg1 = GET_APIC_VERSION(reg0); | ||
195 | if (reg1 == 0x00 || reg1 == 0xff) | ||
196 | return 0; | ||
197 | reg1 = get_maxlvt(); | ||
198 | if (reg1 < 0x02 || reg1 == 0xff) | ||
199 | return 0; | ||
200 | |||
201 | /* | ||
202 | * The ID register is read/write in a real APIC. | ||
203 | */ | ||
204 | reg0 = apic_read(APIC_ID); | ||
205 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | ||
206 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); | ||
207 | reg1 = apic_read(APIC_ID); | ||
208 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | ||
209 | apic_write(APIC_ID, reg0); | ||
210 | if (reg1 != (reg0 ^ APIC_ID_MASK)) | ||
211 | return 0; | ||
212 | |||
213 | /* | ||
214 | * The next two are just to see if we have sane values. | ||
215 | * They're only really relevant if we're in Virtual Wire | ||
216 | * compatibility mode, but most boxes aren't anymore.
217 | */ | ||
218 | reg0 = apic_read(APIC_LVT0); | ||
219 | apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); | ||
220 | reg1 = apic_read(APIC_LVT1); | ||
221 | apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); | ||
222 | |||
223 | return 1; | ||
224 | } | ||
225 | |||
226 | void __init sync_Arb_IDs(void) | ||
227 | { | ||
228 | /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ | ||
229 | unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
230 | if (ver >= 0x14) /* P4 or higher */ | ||
231 | return; | ||
232 | |||
233 | /* | ||
234 | * Wait for idle. | ||
235 | */ | ||
236 | apic_wait_icr_idle(); | ||
237 | |||
238 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | ||
239 | apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | ||
240 | | APIC_DM_INIT); | ||
241 | } | ||
242 | |||
243 | extern void __error_in_apic_c (void); | ||
244 | |||
245 | /* | ||
246 | * An initial setup of the virtual wire mode. | ||
247 | */ | ||
248 | void __init init_bsp_APIC(void) | ||
249 | { | ||
250 | unsigned int value, ver; | ||
251 | |||
252 | /* | ||
253 | * Don't do the setup now if we have an SMP BIOS as the
254 | * through-I/O-APIC virtual wire mode might be active. | ||
255 | */ | ||
256 | if (smp_found_config || !cpu_has_apic) | ||
257 | return; | ||
258 | |||
259 | value = apic_read(APIC_LVR); | ||
260 | ver = GET_APIC_VERSION(value); | ||
261 | |||
262 | /* | ||
263 | * Do not trust the local APIC being empty at bootup. | ||
264 | */ | ||
265 | clear_local_APIC(); | ||
266 | |||
267 | /* | ||
268 | * Enable APIC. | ||
269 | */ | ||
270 | value = apic_read(APIC_SPIV); | ||
271 | value &= ~APIC_VECTOR_MASK; | ||
272 | value |= APIC_SPIV_APIC_ENABLED; | ||
273 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
274 | value |= SPURIOUS_APIC_VECTOR; | ||
275 | apic_write_around(APIC_SPIV, value); | ||
276 | |||
277 | /* | ||
278 | * Set up the virtual wire mode. | ||
279 | */ | ||
280 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | ||
281 | value = APIC_DM_NMI; | ||
282 | if (!APIC_INTEGRATED(ver)) /* 82489DX */ | ||
283 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
284 | apic_write_around(APIC_LVT1, value); | ||
285 | } | ||
286 | |||
287 | void __init setup_local_APIC (void) | ||
288 | { | ||
289 | unsigned int value, ver, maxlvt; | ||
290 | |||
291 | /* Pound the ESR really hard over the head with a big hammer - mbligh */ | ||
292 | if (esr_disable) { | ||
293 | apic_write(APIC_ESR, 0); | ||
294 | apic_write(APIC_ESR, 0); | ||
295 | apic_write(APIC_ESR, 0); | ||
296 | apic_write(APIC_ESR, 0); | ||
297 | } | ||
298 | |||
299 | value = apic_read(APIC_LVR); | ||
300 | ver = GET_APIC_VERSION(value); | ||
301 | |||
302 | if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) | ||
303 | __error_in_apic_c(); | ||
304 | |||
305 | /* | ||
306 | * Double-check whether this APIC is really registered. | ||
307 | * This is meaningless in clustered apic mode, so we skip it. | ||
308 | */ | ||
309 | if (!apic_id_registered()) | ||
310 | BUG(); | ||
311 | |||
312 | /* | ||
313 | * Intel recommends setting DFR, LDR and TPR before enabling
314 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
315 | * document number 292116). So here it goes... | ||
316 | */ | ||
317 | init_apic_ldr(); | ||
318 | |||
319 | /* | ||
320 | * Set Task Priority to 'accept all'. We never change this | ||
321 | * later on. | ||
322 | */ | ||
323 | value = apic_read(APIC_TASKPRI); | ||
324 | value &= ~APIC_TPRI_MASK; | ||
325 | apic_write_around(APIC_TASKPRI, value); | ||
326 | |||
327 | /* | ||
328 | * Now that we are all set up, enable the APIC | ||
329 | */ | ||
330 | value = apic_read(APIC_SPIV); | ||
331 | value &= ~APIC_VECTOR_MASK; | ||
332 | /* | ||
333 | * Enable APIC | ||
334 | */ | ||
335 | value |= APIC_SPIV_APIC_ENABLED; | ||
336 | |||
337 | /* | ||
338 | * Some unknown Intel IO/APIC (or APIC) erratum is biting us with
339 | * certain networking cards. If high frequency interrupts are
340 | * happening on a particular IOAPIC pin, plus the IOAPIC routing
341 | * entry is masked/unmasked at a high rate as well, then sooner or
342 | * later the IOAPIC line gets 'stuck' and no more interrupts are received
343 | * from the device. If the focus CPU is disabled then the hang goes
344 | * away, oh well :-( | ||
345 | * | ||
346 | * [ This bug can be reproduced easily with a level-triggered | ||
347 | * PCI Ne2000 networking cards and PII/PIII processors, dual | ||
348 | * BX chipset. ] | ||
349 | */ | ||
350 | /* | ||
351 | * Actually disabling the focus CPU check just makes the hang less | ||
352 | * frequent, as it makes the interrupt distribution model more
353 | * like LRU than MRU (the short-term load is more even across CPUs). | ||
354 | * See also the comment in end_level_ioapic_irq(). --macro | ||
355 | */ | ||
356 | #if 1 | ||
357 | /* Enable focus processor (bit==0) */ | ||
358 | value &= ~APIC_SPIV_FOCUS_DISABLED; | ||
359 | #else | ||
360 | /* Disable focus processor (bit==1) */ | ||
361 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
362 | #endif | ||
363 | /* | ||
364 | * Set spurious IRQ vector | ||
365 | */ | ||
366 | value |= SPURIOUS_APIC_VECTOR; | ||
367 | apic_write_around(APIC_SPIV, value); | ||
368 | |||
369 | /* | ||
370 | * Set up LVT0, LVT1: | ||
371 | * | ||
372 | * set up through-local-APIC on the BP's LINT0. This is not | ||
373 | * strictly necessary in pure symmetric-IO mode, but sometimes | ||
374 | * we delegate interrupts to the 8259A. | ||
375 | */ | ||
376 | /* | ||
377 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro | ||
378 | */ | ||
379 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; | ||
380 | if (!smp_processor_id() && (pic_mode || !value)) { | ||
381 | value = APIC_DM_EXTINT; | ||
382 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); | ||
383 | } else { | ||
384 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; | ||
385 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); | ||
386 | } | ||
387 | apic_write_around(APIC_LVT0, value); | ||
388 | |||
389 | /* | ||
390 | * only the BP should see the LINT1 NMI signal, obviously. | ||
391 | */ | ||
392 | if (!smp_processor_id()) | ||
393 | value = APIC_DM_NMI; | ||
394 | else | ||
395 | value = APIC_DM_NMI | APIC_LVT_MASKED; | ||
396 | if (!APIC_INTEGRATED(ver)) /* 82489DX */ | ||
397 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
398 | apic_write_around(APIC_LVT1, value); | ||
399 | |||
400 | if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ | ||
401 | unsigned oldvalue; | ||
402 | maxlvt = get_maxlvt(); | ||
403 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
404 | apic_write(APIC_ESR, 0); | ||
405 | oldvalue = apic_read(APIC_ESR); | ||
406 | value = ERROR_APIC_VECTOR; // enables sending errors | ||
407 | apic_write_around(APIC_LVTERR, value); | ||
408 | /* | ||
409 | * spec says clear errors after enabling vector. | ||
410 | */ | ||
411 | if (maxlvt > 3) | ||
412 | apic_write(APIC_ESR, 0); | ||
413 | value = apic_read(APIC_ESR); | ||
414 | if (value != oldvalue) | ||
415 | apic_printk(APIC_VERBOSE, | ||
416 | "ESR value after enabling vector: %08x, after %08x\n", | ||
417 | oldvalue, value); | ||
418 | } else { | ||
419 | if (esr_disable) | ||
420 | /* | ||
421 | * Something untraceable is creating bad interrupts on
422 | * secondary quads ... for the moment, just leave the | ||
423 | * ESR disabled - we can't do anything useful with the | ||
424 | * errors anyway - mbligh | ||
425 | */ | ||
426 | apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n"); | ||
427 | else | ||
428 | apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n"); | ||
429 | } | ||
430 | |||
431 | nmi_watchdog_default(); | ||
432 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
433 | setup_apic_nmi_watchdog(); | ||
434 | apic_pm_activate(); | ||
435 | } | ||
436 | |||
437 | #ifdef CONFIG_PM | ||
438 | |||
439 | static struct { | ||
440 | /* 'active' is true if the local APIC was enabled by us and | ||
441 | not the BIOS; this signifies that we are also responsible | ||
442 | for disabling it before entering apm/acpi suspend */ | ||
443 | int active; | ||
444 | /* r/w apic fields */ | ||
445 | unsigned int apic_id; | ||
446 | unsigned int apic_taskpri; | ||
447 | unsigned int apic_ldr; | ||
448 | unsigned int apic_dfr; | ||
449 | unsigned int apic_spiv; | ||
450 | unsigned int apic_lvtt; | ||
451 | unsigned int apic_lvtpc; | ||
452 | unsigned int apic_lvt0; | ||
453 | unsigned int apic_lvt1; | ||
454 | unsigned int apic_lvterr; | ||
455 | unsigned int apic_tmict; | ||
456 | unsigned int apic_tdcr; | ||
457 | unsigned int apic_thmr; | ||
458 | } apic_pm_state; | ||
459 | |||
460 | static int lapic_suspend(struct sys_device *dev, u32 state) | ||
461 | { | ||
462 | unsigned long flags; | ||
463 | |||
464 | if (!apic_pm_state.active) | ||
465 | return 0; | ||
466 | |||
467 | apic_pm_state.apic_id = apic_read(APIC_ID); | ||
468 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); | ||
469 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); | ||
470 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); | ||
471 | apic_pm_state.apic_spiv = apic_read(APIC_SPIV); | ||
472 | apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); | ||
473 | apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); | ||
474 | apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); | ||
475 | apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); | ||
476 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | ||
477 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | ||
478 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | ||
479 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | ||
480 | local_save_flags(flags); | ||
481 | local_irq_disable(); | ||
482 | disable_local_APIC(); | ||
483 | local_irq_restore(flags); | ||
484 | return 0; | ||
485 | } | ||
486 | |||
487 | static int lapic_resume(struct sys_device *dev) | ||
488 | { | ||
489 | unsigned int l, h; | ||
490 | unsigned long flags; | ||
491 | |||
492 | if (!apic_pm_state.active) | ||
493 | return 0; | ||
494 | |||
495 | /* XXX: Pavel needs this for S3 resume, but can't explain why */ | ||
496 | set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); | ||
497 | |||
498 | local_irq_save(flags); | ||
499 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
500 | l &= ~MSR_IA32_APICBASE_BASE; | ||
501 | l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; | ||
502 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
503 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | ||
504 | apic_write(APIC_ID, apic_pm_state.apic_id); | ||
505 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); | ||
506 | apic_write(APIC_LDR, apic_pm_state.apic_ldr); | ||
507 | apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); | ||
508 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | ||
509 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | ||
510 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | ||
511 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | ||
512 | apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); | ||
513 | apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); | ||
514 | apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); | ||
515 | apic_write(APIC_TMICT, apic_pm_state.apic_tmict); | ||
516 | apic_write(APIC_ESR, 0); | ||
517 | apic_read(APIC_ESR); | ||
518 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | ||
519 | apic_write(APIC_ESR, 0); | ||
520 | apic_read(APIC_ESR); | ||
521 | local_irq_restore(flags); | ||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | static struct sysdev_class lapic_sysclass = { | ||
526 | set_kset_name("lapic"), | ||
527 | .resume = lapic_resume, | ||
528 | .suspend = lapic_suspend, | ||
529 | }; | ||
530 | |||
531 | static struct sys_device device_lapic = { | ||
532 | .id = 0, | ||
533 | .cls = &lapic_sysclass, | ||
534 | }; | ||
535 | |||
536 | static void __init apic_pm_activate(void) | ||
537 | { | ||
538 | apic_pm_state.active = 1; | ||
539 | } | ||
540 | |||
541 | static int __init init_lapic_sysfs(void) | ||
542 | { | ||
543 | int error; | ||
544 | if (!cpu_has_apic) | ||
545 | return 0; | ||
546 | /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ | ||
547 | error = sysdev_class_register(&lapic_sysclass); | ||
548 | if (!error) | ||
549 | error = sysdev_register(&device_lapic); | ||
550 | return error; | ||
551 | } | ||
552 | device_initcall(init_lapic_sysfs); | ||
553 | |||
554 | #else /* CONFIG_PM */ | ||
555 | |||
556 | static void apic_pm_activate(void) { } | ||
557 | |||
558 | #endif /* CONFIG_PM */ | ||
559 | |||
560 | static int __init apic_set_verbosity(char *str) | ||
561 | { | ||
562 | if (strcmp("debug", str) == 0) | ||
563 | apic_verbosity = APIC_DEBUG; | ||
564 | else if (strcmp("verbose", str) == 0) | ||
565 | apic_verbosity = APIC_VERBOSE; | ||
566 | else | ||
567 | printk(KERN_WARNING "APIC Verbosity level %s not recognised" | ||
568 | " use apic=verbose or apic=debug", str); | ||
569 | |||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | __setup("apic=", apic_set_verbosity); | ||
574 | |||
575 | /* | ||
576 | * Detect and enable local APICs on non-SMP boards. | ||
577 | * Original code written by Keir Fraser. | ||
578 | * On AMD64 we trust the BIOS - if it says there is no APIC, the APIC is likely
579 | * not correctly set up (usually the APIC timer won't work, etc.)
580 | */ | ||
581 | |||
582 | static int __init detect_init_APIC (void) | ||
583 | { | ||
584 | if (!cpu_has_apic) { | ||
585 | printk(KERN_INFO "No local APIC present\n"); | ||
586 | return -1; | ||
587 | } | ||
588 | |||
589 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
590 | boot_cpu_id = 0; | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | void __init init_apic_mappings(void) | ||
595 | { | ||
596 | unsigned long apic_phys; | ||
597 | |||
598 | /* | ||
599 | * If no local APIC can be found then set up a fake all | ||
600 | * zeroes page to simulate the local APIC and another | ||
601 | * one for the IO-APIC. | ||
602 | */ | ||
603 | if (!smp_found_config && detect_init_APIC()) { | ||
604 | apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
605 | apic_phys = __pa(apic_phys); | ||
606 | } else | ||
607 | apic_phys = mp_lapic_addr; | ||
608 | |||
609 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | ||
610 | apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); | ||
611 | |||
612 | /* | ||
613 | * Fetch the APIC ID of the BSP in case we have a | ||
614 | * default configuration (or the MP table is broken). | ||
615 | */ | ||
616 | if (boot_cpu_id == -1U) | ||
617 | boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
618 | |||
619 | #ifdef CONFIG_X86_IO_APIC | ||
620 | { | ||
621 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | ||
622 | int i; | ||
623 | |||
624 | for (i = 0; i < nr_ioapics; i++) { | ||
625 | if (smp_found_config) { | ||
626 | ioapic_phys = mp_ioapics[i].mpc_apicaddr; | ||
627 | } else { | ||
628 | ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
629 | ioapic_phys = __pa(ioapic_phys); | ||
630 | } | ||
631 | set_fixmap_nocache(idx, ioapic_phys); | ||
632 | apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", | ||
633 | __fix_to_virt(idx), ioapic_phys); | ||
634 | idx++; | ||
635 | } | ||
636 | } | ||
637 | #endif | ||
638 | } | ||
639 | |||
640 | /* | ||
641 | * This function sets up the local APIC timer, with a timeout of | ||
642 | * 'clocks' APIC bus clock. During calibration we actually call | ||
643 | * this function twice on the boot CPU, once with a bogus timeout | ||
644 | * value, second time for real. The other (noncalibrating) CPUs | ||
645 | * call this function only once, with the real, calibrated value. | ||
646 | * | ||
647 | * We do reads before writes even if unnecessary, to get around the | ||
648 | * P5 APIC double write bug. | ||
649 | */ | ||
650 | |||
651 | #define APIC_DIVISOR 16 | ||
652 | |||
653 | static void __setup_APIC_LVTT(unsigned int clocks) | ||
654 | { | ||
655 | unsigned int lvtt_value, tmp_value, ver; | ||
656 | |||
657 | ver = GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
658 | lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; | ||
659 | if (!APIC_INTEGRATED(ver)) | ||
660 | lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); | ||
661 | apic_write_around(APIC_LVTT, lvtt_value); | ||
662 | |||
663 | /* | ||
664 | * Divide PICLK by 16 | ||
665 | */ | ||
666 | tmp_value = apic_read(APIC_TDCR); | ||
667 | apic_write_around(APIC_TDCR, (tmp_value | ||
668 | & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | ||
669 | | APIC_TDR_DIV_16); | ||
670 | |||
671 | apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); | ||
672 | } | ||
673 | |||
674 | static void setup_APIC_timer(unsigned int clocks) | ||
675 | { | ||
676 | unsigned long flags; | ||
677 | |||
678 | local_irq_save(flags); | ||
679 | |||
680 | /* For some reason this doesn't work on Simics, so fake it for now */ | ||
681 | if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { | ||
682 | __setup_APIC_LVTT(clocks); | ||
683 | return; | ||
684 | } | ||
685 | |||
686 | /* wait for irq slice */ | ||
687 | if (vxtime.hpet_address) { | ||
688 | int trigger = hpet_readl(HPET_T0_CMP); | ||
689 | while (hpet_readl(HPET_COUNTER) >= trigger) | ||
690 | /* do nothing */ ; | ||
691 | while (hpet_readl(HPET_COUNTER) < trigger) | ||
692 | /* do nothing */ ; | ||
693 | } else { | ||
694 | int c1, c2; | ||
695 | outb_p(0x00, 0x43); | ||
696 | c2 = inb_p(0x40); | ||
697 | c2 |= inb_p(0x40) << 8; | ||
698 | do { | ||
699 | c1 = c2; | ||
700 | outb_p(0x00, 0x43); | ||
701 | c2 = inb_p(0x40); | ||
702 | c2 |= inb_p(0x40) << 8; | ||
703 | } while (c2 - c1 < 300); | ||
704 | } | ||
705 | |||
706 | __setup_APIC_LVTT(clocks); | ||
707 | |||
708 | local_irq_restore(flags); | ||
709 | } | ||
710 | |||
711 | /* | ||
712 | * In this function we calibrate APIC bus clocks to the external | ||
713 | * timer. Unfortunately we cannot use jiffies and the timer irq | ||
714 | * to calibrate, since some later bootup code depends on getting | ||
715 | * the first irq? Ugh. | ||
716 | * | ||
717 | * We want to do the calibration only once, since we | ||
718 | * want the local timer irqs to be in sync. CPUs connected | ||
719 | * by the same APIC bus have the very same bus frequency. | ||
720 | * And we want to have irqs off anyway; no accidental | ||
721 | * APIC irq that way. | ||
722 | */ | ||
723 | |||
724 | #define TICK_COUNT 100000000 | ||
725 | |||
726 | static int __init calibrate_APIC_clock(void) | ||
727 | { | ||
728 | int apic, apic_start, tsc, tsc_start; | ||
729 | int result; | ||
730 | /* | ||
731 | * Put whatever arbitrary (but long enough) timeout | ||
732 | * value into the APIC clock, we just want to get the | ||
733 | * counter running for calibration. | ||
734 | */ | ||
735 | __setup_APIC_LVTT(1000000000); | ||
736 | |||
737 | apic_start = apic_read(APIC_TMCCT); | ||
738 | rdtscl(tsc_start); | ||
739 | |||
740 | do { | ||
741 | apic = apic_read(APIC_TMCCT); | ||
742 | rdtscl(tsc); | ||
743 | } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); | ||
744 | |||
745 | result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); | ||
746 | |||
747 | printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", | ||
748 | result / 1000 / 1000, result / 1000 % 1000); | ||
749 | |||
750 | return result * APIC_DIVISOR / HZ; | ||
751 | } | ||
752 | |||
753 | static unsigned int calibration_result; | ||
754 | |||
755 | void __init setup_boot_APIC_clock (void) | ||
756 | { | ||
757 | if (disable_apic_timer) { | ||
758 | printk(KERN_INFO "Disabling APIC timer\n"); | ||
759 | return; | ||
760 | } | ||
761 | |||
762 | printk(KERN_INFO "Using local APIC timer interrupts.\n"); | ||
763 | using_apic_timer = 1; | ||
764 | |||
765 | local_irq_disable(); | ||
766 | |||
767 | calibration_result = calibrate_APIC_clock(); | ||
768 | /* | ||
769 | * Now set up the timer for real. | ||
770 | */ | ||
771 | setup_APIC_timer(calibration_result); | ||
772 | |||
773 | local_irq_enable(); | ||
774 | } | ||
775 | |||
776 | void __init setup_secondary_APIC_clock(void) | ||
777 | { | ||
778 | local_irq_disable(); /* FIXME: Do we need this? --RR */ | ||
779 | setup_APIC_timer(calibration_result); | ||
780 | local_irq_enable(); | ||
781 | } | ||
782 | |||
783 | void __init disable_APIC_timer(void) | ||
784 | { | ||
785 | if (using_apic_timer) { | ||
786 | unsigned long v; | ||
787 | |||
788 | v = apic_read(APIC_LVTT); | ||
789 | apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); | ||
790 | } | ||
791 | } | ||
792 | |||
793 | void enable_APIC_timer(void) | ||
794 | { | ||
795 | if (using_apic_timer) { | ||
796 | unsigned long v; | ||
797 | |||
798 | v = apic_read(APIC_LVTT); | ||
799 | apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); | ||
800 | } | ||
801 | } | ||
802 | |||
803 | /* | ||
804 | * the frequency of the profiling timer can be changed | ||
805 | * by writing a multiplier value into /proc/profile. | ||
806 | */ | ||
807 | int setup_profiling_timer(unsigned int multiplier) | ||
808 | { | ||
809 | int i; | ||
810 | |||
811 | /* | ||
812 | * Sanity check. [at least 500 APIC cycles should be | ||
813 | * between APIC interrupts as a rule of thumb, to avoid | ||
814 | * irqs flooding us] | ||
815 | */ | ||
816 | if ( (!multiplier) || (calibration_result/multiplier < 500)) | ||
817 | return -EINVAL; | ||
818 | |||
819 | /* | ||
820 | * Set the new multiplier for each CPU. CPUs don't start using the | ||
821 | * new values until the next timer interrupt in which they do process | ||
822 | * accounting. At that time they also adjust their APIC timers | ||
823 | * accordingly. | ||
824 | */ | ||
825 | for (i = 0; i < NR_CPUS; ++i) | ||
826 | per_cpu(prof_multiplier, i) = multiplier; | ||
827 | |||
828 | return 0; | ||
829 | } | ||
830 | |||
831 | #undef APIC_DIVISOR | ||
832 | |||
833 | /* | ||
834 | * Local timer interrupt handler. It does both profiling and | ||
835 | * process statistics/rescheduling. | ||
836 | * | ||
837 | * We do profiling on every local tick; statistics/rescheduling | ||
838 | * happen only every 'profiling multiplier' ticks. The default | ||
839 | * multiplier is 1 and it can be changed by writing the new multiplier | ||
840 | * value into /proc/profile. | ||
841 | */ | ||
842 | |||
843 | void smp_local_timer_interrupt(struct pt_regs *regs) | ||
844 | { | ||
845 | int cpu = smp_processor_id(); | ||
846 | |||
847 | profile_tick(CPU_PROFILING, regs); | ||
848 | if (--per_cpu(prof_counter, cpu) <= 0) { | ||
849 | /* | ||
850 | * The multiplier may have changed since the last time we got | ||
851 | * to this point as a result of the user writing to | ||
852 | * /proc/profile. In this case we need to adjust the APIC | ||
853 | * timer accordingly. | ||
854 | * | ||
855 | * Interrupts are already masked off at this point. | ||
856 | */ | ||
857 | per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); | ||
858 | if (per_cpu(prof_counter, cpu) != | ||
859 | per_cpu(prof_old_multiplier, cpu)) { | ||
860 | __setup_APIC_LVTT(calibration_result/ | ||
861 | per_cpu(prof_counter, cpu)); | ||
862 | per_cpu(prof_old_multiplier, cpu) = | ||
863 | per_cpu(prof_counter, cpu); | ||
864 | } | ||
865 | |||
866 | #ifdef CONFIG_SMP | ||
867 | update_process_times(user_mode(regs)); | ||
868 | #endif | ||
869 | } | ||
870 | |||
871 | /* | ||
872 | * We take the 'long' return path, and there every subsystem | ||
873 | * grabs the appropriate locks (kernel lock/ irq lock). | ||
874 | * | ||
875 | * we might want to decouple profiling from the 'long path', | ||
876 | * and do the profiling totally in assembly. | ||
877 | * | ||
878 | * Currently this isn't too much of an issue (performance wise), | ||
879 | * we can take more than 100K local irqs per second on a 100 MHz P5. | ||
880 | */ | ||
881 | } | ||
882 | |||
883 | /* | ||
884 | * Local APIC timer interrupt. This is the most natural way for doing | ||
885 | * local interrupts, but local timer interrupts can be emulated by | ||
886 | * broadcast interrupts too. [in case the hw doesn't support APIC timers] | ||
887 | * | ||
888 | * [ if a single-CPU system runs an SMP kernel then we call the local | ||
889 | * interrupt as well. Thus we cannot inline the local irq ... ] | ||
890 | */ | ||
891 | void smp_apic_timer_interrupt(struct pt_regs *regs) | ||
892 | { | ||
893 | /* | ||
894 | * the NMI deadlock-detector uses this. | ||
895 | */ | ||
896 | add_pda(apic_timer_irqs, 1); | ||
897 | |||
898 | /* | ||
899 | * NOTE! We'd better ACK the irq immediately, | ||
900 | * because timer handling can be slow. | ||
901 | */ | ||
902 | ack_APIC_irq(); | ||
903 | /* | ||
904 | * update_process_times() expects us to have done irq_enter(). | ||
905 | * Besides, if we don't, timer interrupts ignore the global | ||
906 | * interrupt lock, which is the WrongThing (tm) to do. | ||
907 | */ | ||
908 | irq_enter(); | ||
909 | smp_local_timer_interrupt(regs); | ||
910 | irq_exit(); | ||
911 | } | ||
912 | |||
913 | /* | ||
914 | * oem_force_hpet_timer -- force HPET mode for some boxes. | ||
915 | * | ||
916 | * Thus far, the major user of this is IBM's Summit2 series: | ||
917 | * | ||
918 | * Clustered boxes may have unsynced TSC problems if they are | ||
919 | * multi-chassis. Use available data to take a good guess. | ||
920 | * If in doubt, go HPET. | ||
921 | */ | ||
922 | __init int oem_force_hpet_timer(void) | ||
923 | { | ||
924 | int i, clusters, zeros; | ||
925 | unsigned id; | ||
926 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); | ||
927 | |||
928 | bitmap_empty(clustermap, NUM_APIC_CLUSTERS); | ||
929 | |||
930 | for (i = 0; i < NR_CPUS; i++) { | ||
931 | id = bios_cpu_apicid[i]; | ||
932 | if (id != BAD_APICID) | ||
933 | __set_bit(APIC_CLUSTERID(id), clustermap); | ||
934 | } | ||
935 | |||
936 | /* Problem: Partially populated chassis may not have CPUs in some of | ||
937 | * the APIC clusters they have been allocated. Only present CPUs have | ||
938 | * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since | ||
939 | * clusters are allocated sequentially, count zeros only if they are | ||
940 | * bounded by ones. | ||
941 | */ | ||
942 | clusters = 0; | ||
943 | zeros = 0; | ||
944 | for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | ||
945 | if (test_bit(i, clustermap)) { | ||
946 | clusters += 1 + zeros; | ||
947 | zeros = 0; | ||
948 | } else | ||
949 | ++zeros; | ||
950 | } | ||
951 | |||
952 | /* | ||
953 | * If clusters > 2, then should be multi-chassis. Return 1 for HPET. | ||
954 | * Else return 0 to use TSC. | ||
955 | * May have to revisit this when multi-core + hyperthreaded CPUs come | ||
956 | * out, but AFAIK this will work even for them. | ||
957 | */ | ||
958 | return (clusters > 2); | ||
959 | } | ||
960 | |||
961 | /* | ||
962 | * This interrupt should _never_ happen with our APIC/SMP architecture | ||
963 | */ | ||
964 | asmlinkage void smp_spurious_interrupt(void) | ||
965 | { | ||
966 | unsigned int v; | ||
967 | irq_enter(); | ||
968 | /* | ||
969 | * Check if this really is a spurious interrupt and ACK it | ||
970 | * if it is a vectored one. Just in case... | ||
971 | * Spurious interrupts should not be ACKed. | ||
972 | */ | ||
973 | v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); | ||
974 | if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) | ||
975 | ack_APIC_irq(); | ||
976 | |||
977 | #if 0 | ||
978 | static unsigned long last_warning; | ||
979 | static unsigned long skipped; | ||
980 | |||
981 | /* see sw-dev-man vol 3, chapter 7.4.13.5 */ | ||
982 | if (time_before(last_warning+30*HZ,jiffies)) { | ||
983 | printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", | ||
984 | smp_processor_id(), skipped); | ||
985 | last_warning = jiffies; | ||
986 | skipped = 0; | ||
987 | } else { | ||
988 | skipped++; | ||
989 | } | ||
990 | #endif | ||
991 | irq_exit(); | ||
992 | } | ||
993 | |||
994 | /* | ||
995 | * This interrupt should never happen with our APIC/SMP architecture | ||
996 | */ | ||
997 | |||
998 | asmlinkage void smp_error_interrupt(void) | ||
999 | { | ||
1000 | unsigned int v, v1; | ||
1001 | |||
1002 | irq_enter(); | ||
1003 | /* First tickle the hardware, only then report what went on. -- REW */ | ||
1004 | v = apic_read(APIC_ESR); | ||
1005 | apic_write(APIC_ESR, 0); | ||
1006 | v1 = apic_read(APIC_ESR); | ||
1007 | ack_APIC_irq(); | ||
1008 | atomic_inc(&irq_err_count); | ||
1009 | |||
1010 | /* Here is what the APIC error bits mean: | ||
1011 | 0: Send CS error | ||
1012 | 1: Receive CS error | ||
1013 | 2: Send accept error | ||
1014 | 3: Receive accept error | ||
1015 | 4: Reserved | ||
1016 | 5: Send illegal vector | ||
1017 | 6: Received illegal vector | ||
1018 | 7: Illegal register address | ||
1019 | */ | ||
1020 | printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", | ||
1021 | smp_processor_id(), v, v1); | ||
1022 | irq_exit(); | ||
1023 | } | ||
1024 | |||
1025 | int disable_apic; | ||
1026 | |||
1027 | /* | ||
1028 | * This initializes the IO-APIC and APIC hardware if this is | ||
1029 | * a UP kernel. | ||
1030 | */ | ||
1031 | int __init APIC_init_uniprocessor (void) | ||
1032 | { | ||
1033 | if (disable_apic) { | ||
1034 | printk(KERN_INFO "Apic disabled\n"); | ||
1035 | return -1; | ||
1036 | } | ||
1037 | if (!cpu_has_apic) { | ||
1038 | disable_apic = 1; | ||
1039 | printk(KERN_INFO "Apic disabled by BIOS\n"); | ||
1040 | return -1; | ||
1041 | } | ||
1042 | |||
1043 | verify_local_APIC(); | ||
1044 | |||
1045 | connect_bsp_APIC(); | ||
1046 | |||
1047 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
1048 | apic_write_around(APIC_ID, boot_cpu_id); | ||
1049 | |||
1050 | setup_local_APIC(); | ||
1051 | |||
1052 | #ifdef CONFIG_X86_IO_APIC | ||
1053 | if (smp_found_config && !skip_ioapic_setup && nr_ioapics) | ||
1054 | setup_IO_APIC(); | ||
1055 | else | ||
1056 | nr_ioapics = 0; | ||
1057 | #endif | ||
1058 | setup_boot_APIC_clock(); | ||
1059 | |||
1060 | return 0; | ||
1061 | } | ||
1062 | |||
1063 | static __init int setup_disableapic(char *str) | ||
1064 | { | ||
1065 | disable_apic = 1; | ||
1066 | return 0; | ||
1067 | } | ||
1068 | |||
1069 | static __init int setup_nolapic(char *str) | ||
1070 | { | ||
1071 | disable_apic = 1; | ||
1072 | return 0; | ||
1073 | } | ||
1074 | |||
1075 | static __init int setup_noapictimer(char *str) | ||
1076 | { | ||
1077 | disable_apic_timer = 1; | ||
1078 | return 0; | ||
1079 | } | ||
1080 | |||
1081 | /* dummy parsing: see setup.c */ | ||
1082 | |||
1083 | __setup("disableapic", setup_disableapic); | ||
1084 | __setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ | ||
1085 | |||
1086 | __setup("noapictimer", setup_noapictimer); | ||
1087 | |||
1088 | /* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ | ||
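The calibration arithmetic in calibrate_APIC_clock() above can be checked in isolation: given the APIC-counter and TSC deltas observed over the busy-wait window plus the CPU clock, the detected APIC timer frequency falls out, and scaling it by APIC_DIVISOR/HZ gives the value later handed to setup_APIC_timer(). A minimal user-space sketch of just that math follows; the deltas and the 2 GHz cpu_khz are made-up sample numbers, not measurements, and HZ=1000 is assumed.

#include <stdio.h>

#define APIC_DIVISOR 16
#define HZ           1000

int main(void)
{
	long long cpu_khz    = 2000000;    /* assumed 2 GHz TSC */
	long long tsc_delta  = 100000000;  /* TSC cycles elapsed (one TICK_COUNT worth) */
	long long apic_delta = 625000;     /* APIC_TMCCT decrements seen in the same window */

	/* Same expression as in calibrate_APIC_clock() */
	long long result = apic_delta * 1000LL * cpu_khz / tsc_delta;

	/* Value returned to setup_APIC_timer(); __setup_APIC_LVTT() divides it
	   by APIC_DIVISOR again before programming APIC_TMICT. */
	long long clocks = result * APIC_DIVISOR / HZ;

	printf("Detected %lld.%03lld MHz APIC timer, TMICT per tick = %lld\n",
	       result / 1000 / 1000, result / 1000 % 1000, clocks / APIC_DIVISOR);
	return 0;
}

With these sample numbers this prints "Detected 12.500 MHz APIC timer, TMICT per tick = 12500", i.e. one local timer interrupt every 12500 divided-by-16 bus clocks.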
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c new file mode 100644 index 000000000000..35b4c3fcbb37 --- /dev/null +++ b/arch/x86_64/kernel/asm-offsets.c | |||
@@ -0,0 +1,69 @@ | |||
1 | /* | ||
2 | * Generate definitions needed by assembly language modules. | ||
3 | * This code generates raw asm output which is post-processed to extract | ||
4 | * and format the required data. | ||
5 | */ | ||
6 | |||
7 | #include <linux/sched.h> | ||
8 | #include <linux/stddef.h> | ||
9 | #include <linux/errno.h> | ||
10 | #include <linux/hardirq.h> | ||
11 | #include <linux/suspend.h> | ||
12 | #include <asm/pda.h> | ||
13 | #include <asm/processor.h> | ||
14 | #include <asm/segment.h> | ||
15 | #include <asm/thread_info.h> | ||
16 | #include <asm/ia32.h> | ||
17 | |||
18 | #define DEFINE(sym, val) \ | ||
19 | asm volatile("\n->" #sym " %0 " #val : : "i" (val)) | ||
20 | |||
21 | #define BLANK() asm volatile("\n->" : : ) | ||
22 | |||
23 | int main(void) | ||
24 | { | ||
25 | #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) | ||
26 | ENTRY(state); | ||
27 | ENTRY(flags); | ||
28 | ENTRY(thread); | ||
29 | ENTRY(pid); | ||
30 | BLANK(); | ||
31 | #undef ENTRY | ||
32 | #define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) | ||
33 | ENTRY(flags); | ||
34 | ENTRY(addr_limit); | ||
35 | ENTRY(preempt_count); | ||
36 | BLANK(); | ||
37 | #undef ENTRY | ||
38 | #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) | ||
39 | ENTRY(kernelstack); | ||
40 | ENTRY(oldrsp); | ||
41 | ENTRY(pcurrent); | ||
42 | ENTRY(irqrsp); | ||
43 | ENTRY(irqcount); | ||
44 | ENTRY(cpunumber); | ||
45 | ENTRY(irqstackptr); | ||
46 | BLANK(); | ||
47 | #undef ENTRY | ||
48 | #ifdef CONFIG_IA32_EMULATION | ||
49 | #define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) | ||
50 | ENTRY(eax); | ||
51 | ENTRY(ebx); | ||
52 | ENTRY(ecx); | ||
53 | ENTRY(edx); | ||
54 | ENTRY(esi); | ||
55 | ENTRY(edi); | ||
56 | ENTRY(ebp); | ||
57 | ENTRY(esp); | ||
58 | ENTRY(eip); | ||
59 | BLANK(); | ||
60 | #undef ENTRY | ||
61 | DEFINE(IA32_RT_SIGFRAME_sigcontext, | ||
62 | offsetof (struct rt_sigframe32, uc.uc_mcontext)); | ||
63 | BLANK(); | ||
64 | #endif | ||
65 | DEFINE(pbe_address, offsetof(struct pbe, address)); | ||
66 | DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); | ||
67 | DEFINE(pbe_next, offsetof(struct pbe, next)); | ||
68 | return 0; | ||
69 | } | ||
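The DEFINE() macro above works because the "i" (immediate) asm constraint makes the compiler print the constant into the assembly text: asm-offsets.c is only ever compiled with -S, never assembled, and the resulting "->sym value" marker lines are turned into #define lines in a generated header by a later build step (that step is not part of this file). A stand-alone illustration of the same trick, using a hypothetical struct rather than the kernel's task_struct or pda:

/* GNU C; inspect the markers with: cc -S offsets_demo.c -o - | grep '^->' */
#include <stddef.h>

struct demo {                       /* hypothetical structure */
	long flags;
	void *stack;
};

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

int main(void)
{
	DEFINE(demo_flags, offsetof(struct demo, flags));
	DEFINE(demo_stack, offsetof(struct demo, stack));
	/*
	 * The .s output now contains marker lines such as
	 *   ->demo_flags $0 offsetof(struct demo, flags)
	 * (the exact immediate spelling is target-dependent), which a sed
	 * script can rewrite into "#define demo_flags 0" and so on.
	 */
	return 0;
}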
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig new file mode 100644 index 000000000000..81f1562e5393 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Kconfig | |||
@@ -0,0 +1,96 @@ | |||
1 | # | ||
2 | # CPU Frequency scaling | ||
3 | # | ||
4 | |||
5 | menu "CPU Frequency scaling" | ||
6 | |||
7 | source "drivers/cpufreq/Kconfig" | ||
8 | |||
9 | if CPU_FREQ | ||
10 | |||
11 | comment "CPUFreq processor drivers" | ||
12 | |||
13 | config X86_POWERNOW_K8 | ||
14 | tristate "AMD Opteron/Athlon64 PowerNow!" | ||
15 | select CPU_FREQ_TABLE | ||
16 | help | ||
17 | This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. | ||
18 | |||
19 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
20 | |||
21 | If in doubt, say N. | ||
22 | |||
23 | config X86_POWERNOW_K8_ACPI | ||
24 | bool | ||
25 | depends on X86_POWERNOW_K8 && ACPI_PROCESSOR | ||
26 | depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) | ||
27 | default y | ||
28 | |||
29 | config X86_SPEEDSTEP_CENTRINO | ||
30 | tristate "Intel Enhanced SpeedStep" | ||
31 | select CPU_FREQ_TABLE | ||
32 | depends on ACPI_PROCESSOR | ||
33 | help | ||
34 | This adds the CPUFreq driver for Enhanced SpeedStep enabled | ||
35 | mobile CPUs. This means Intel Pentium M (Centrino) CPUs | ||
36 | or 64-bit enabled Intel Xeons. | ||
37 | |||
38 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
39 | |||
40 | If in doubt, say N. | ||
41 | |||
42 | config X86_SPEEDSTEP_CENTRINO_ACPI | ||
43 | bool | ||
44 | depends on X86_SPEEDSTEP_CENTRINO | ||
45 | default y | ||
46 | |||
47 | config X86_ACPI_CPUFREQ | ||
48 | tristate "ACPI Processor P-States driver" | ||
49 | depends on ACPI_PROCESSOR | ||
50 | help | ||
51 | This driver adds a CPUFreq driver which utilizes the ACPI | ||
52 | Processor Performance States. | ||
53 | |||
54 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
55 | |||
56 | If in doubt, say N. | ||
57 | |||
58 | comment "shared options" | ||
59 | |||
60 | config X86_ACPI_CPUFREQ_PROC_INTF | ||
61 | bool "/proc/acpi/processor/../performance interface (deprecated)" | ||
62 | depends on PROC_FS | ||
63 | depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI | ||
64 | help | ||
65 | This enables the deprecated /proc/acpi/processor/../performance | ||
66 | interface. While it is helpful for debugging, the generic, | ||
67 | cross-architecture cpufreq interfaces should be used. | ||
68 | |||
69 | If in doubt, say N. | ||
70 | |||
71 | config X86_P4_CLOCKMOD | ||
72 | tristate "Intel Pentium 4 clock modulation" | ||
73 | depends on EMBEDDED | ||
74 | help | ||
75 | This adds the clock modulation driver for Intel Pentium 4 / XEON | ||
76 | processors. When enabled it will lower CPU temperature by skipping | ||
77 | clocks. | ||
78 | |||
79 | This driver should only be used in exceptional | ||
80 | circumstances, when very low power is needed, because it causes severe | ||
81 | slowdowns and noticeable latencies. Normally SpeedStep should be used | ||
82 | instead. | ||
83 | |||
84 | For details, take a look at <file:Documentation/cpu-freq/>. | ||
85 | |||
86 | Unless you are absolutely sure say N. | ||
87 | |||
88 | |||
89 | config X86_SPEEDSTEP_LIB | ||
90 | tristate | ||
91 | default X86_P4_CLOCKMOD | ||
92 | |||
93 | endif | ||
94 | |||
95 | endmenu | ||
96 | |||
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile new file mode 100644 index 000000000000..d8b593879224 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Makefile | |||
@@ -0,0 +1,17 @@ | |||
1 | # | ||
2 | # Reuse the i386 cpufreq drivers | ||
3 | # | ||
4 | |||
5 | SRCDIR := ../../../i386/kernel/cpu/cpufreq | ||
6 | |||
7 | obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o | ||
8 | obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o | ||
9 | obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o | ||
10 | obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o | ||
11 | obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o | ||
12 | |||
13 | powernow-k8-objs := ${SRCDIR}/powernow-k8.o | ||
14 | speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o | ||
15 | acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o | ||
16 | p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o | ||
17 | speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o | ||
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c new file mode 100644 index 000000000000..657003e461e6 --- /dev/null +++ b/arch/x86_64/kernel/e820.c | |||
@@ -0,0 +1,513 @@ | |||
1 | /* | ||
2 | * Handle the memory map. | ||
3 | * The functions here do the job until bootmem takes over. | ||
4 | * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ | ||
5 | */ | ||
6 | #include <linux/config.h> | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/bootmem.h> | ||
11 | #include <linux/ioport.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <asm/page.h> | ||
14 | #include <asm/e820.h> | ||
15 | #include <asm/proto.h> | ||
16 | #include <asm/bootsetup.h> | ||
17 | |||
18 | extern char _end[]; | ||
19 | |||
20 | /* | ||
21 | * PFN of last memory page. | ||
22 | */ | ||
23 | unsigned long end_pfn; | ||
24 | |||
25 | /* | ||
26 | * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. | ||
27 | * The direct mapping extends to end_pfn_map, so that we can directly access | ||
28 | * apertures, ACPI and other tables without having to play with fixmaps. | ||
29 | */ | ||
30 | unsigned long end_pfn_map; | ||
31 | |||
32 | /* | ||
33 | * Last pfn which the user wants to use. | ||
34 | */ | ||
35 | unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; | ||
36 | |||
37 | extern struct resource code_resource, data_resource; | ||
38 | |||
39 | /* Check for some hardcoded bad areas that early boot is not allowed to touch */ | ||
40 | static inline int bad_addr(unsigned long *addrp, unsigned long size) | ||
41 | { | ||
42 | unsigned long addr = *addrp, last = addr + size; | ||
43 | |||
44 | /* various gunk below that needed for SMP startup */ | ||
45 | if (addr < 0x8000) { | ||
46 | *addrp = 0x8000; | ||
47 | return 1; | ||
48 | } | ||
49 | |||
50 | /* direct mapping tables of the kernel */ | ||
51 | if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { | ||
52 | *addrp = table_end << PAGE_SHIFT; | ||
53 | return 1; | ||
54 | } | ||
55 | |||
56 | /* initrd */ | ||
57 | #ifdef CONFIG_BLK_DEV_INITRD | ||
58 | if (LOADER_TYPE && INITRD_START && last >= INITRD_START && | ||
59 | addr < INITRD_START+INITRD_SIZE) { | ||
60 | *addrp = INITRD_START + INITRD_SIZE; | ||
61 | return 1; | ||
62 | } | ||
63 | #endif | ||
64 | /* kernel code + 640k memory hole (later should not be needed, but | ||
65 | be paranoid for now) */ | ||
66 | if (last >= 640*1024 && addr < __pa_symbol(&_end)) { | ||
67 | *addrp = __pa_symbol(&_end); | ||
68 | return 1; | ||
69 | } | ||
70 | /* XXX ramdisk image here? */ | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) | ||
75 | { | ||
76 | int i; | ||
77 | for (i = 0; i < e820.nr_map; i++) { | ||
78 | struct e820entry *ei = &e820.map[i]; | ||
79 | if (type && ei->type != type) | ||
80 | continue; | ||
81 | if (ei->addr >= end || ei->addr + ei->size < start) | ||
82 | continue; | ||
83 | return 1; | ||
84 | } | ||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Find a free area in a specific range. | ||
90 | */ | ||
91 | unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) | ||
92 | { | ||
93 | int i; | ||
94 | for (i = 0; i < e820.nr_map; i++) { | ||
95 | struct e820entry *ei = &e820.map[i]; | ||
96 | unsigned long addr = ei->addr, last; | ||
97 | if (ei->type != E820_RAM) | ||
98 | continue; | ||
99 | if (addr < start) | ||
100 | addr = start; | ||
101 | if (addr > ei->addr + ei->size) | ||
102 | continue; | ||
103 | while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) | ||
104 | ; | ||
105 | last = addr + size; | ||
106 | if (last > ei->addr + ei->size) | ||
107 | continue; | ||
108 | if (last > end) | ||
109 | continue; | ||
110 | return addr; | ||
111 | } | ||
112 | return -1UL; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Free bootmem based on the e820 table for a node. | ||
117 | */ | ||
118 | void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) | ||
119 | { | ||
120 | int i; | ||
121 | for (i = 0; i < e820.nr_map; i++) { | ||
122 | struct e820entry *ei = &e820.map[i]; | ||
123 | unsigned long last, addr; | ||
124 | |||
125 | if (ei->type != E820_RAM || | ||
126 | ei->addr+ei->size <= start || | ||
127 | ei->addr > end) | ||
128 | continue; | ||
129 | |||
130 | addr = round_up(ei->addr, PAGE_SIZE); | ||
131 | if (addr < start) | ||
132 | addr = start; | ||
133 | |||
134 | last = round_down(ei->addr + ei->size, PAGE_SIZE); | ||
135 | if (last >= end) | ||
136 | last = end; | ||
137 | |||
138 | if (last > addr && last-addr >= PAGE_SIZE) | ||
139 | free_bootmem_node(pgdat, addr, last-addr); | ||
140 | } | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * Find the highest page frame number we have available | ||
145 | */ | ||
146 | unsigned long __init e820_end_of_ram(void) | ||
147 | { | ||
148 | int i; | ||
149 | unsigned long end_pfn = 0; | ||
150 | |||
151 | for (i = 0; i < e820.nr_map; i++) { | ||
152 | struct e820entry *ei = &e820.map[i]; | ||
153 | unsigned long start, end; | ||
154 | |||
155 | start = round_up(ei->addr, PAGE_SIZE); | ||
156 | end = round_down(ei->addr + ei->size, PAGE_SIZE); | ||
157 | if (start >= end) | ||
158 | continue; | ||
159 | if (ei->type == E820_RAM) { | ||
160 | if (end > end_pfn<<PAGE_SHIFT) | ||
161 | end_pfn = end>>PAGE_SHIFT; | ||
162 | } else { | ||
163 | if (end > end_pfn_map<<PAGE_SHIFT) | ||
164 | end_pfn_map = end>>PAGE_SHIFT; | ||
165 | } | ||
166 | } | ||
167 | |||
168 | if (end_pfn > end_pfn_map) | ||
169 | end_pfn_map = end_pfn; | ||
170 | if (end_pfn_map > MAXMEM>>PAGE_SHIFT) | ||
171 | end_pfn_map = MAXMEM>>PAGE_SHIFT; | ||
172 | if (end_pfn > end_user_pfn) | ||
173 | end_pfn = end_user_pfn; | ||
174 | if (end_pfn > end_pfn_map) | ||
175 | end_pfn = end_pfn_map; | ||
176 | |||
177 | return end_pfn; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Mark e820 reserved areas as busy for the resource manager. | ||
182 | */ | ||
183 | void __init e820_reserve_resources(void) | ||
184 | { | ||
185 | int i; | ||
186 | for (i = 0; i < e820.nr_map; i++) { | ||
187 | struct resource *res; | ||
188 | if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) | ||
189 | continue; | ||
190 | res = alloc_bootmem_low(sizeof(struct resource)); | ||
191 | switch (e820.map[i].type) { | ||
192 | case E820_RAM: res->name = "System RAM"; break; | ||
193 | case E820_ACPI: res->name = "ACPI Tables"; break; | ||
194 | case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; | ||
195 | default: res->name = "reserved"; | ||
196 | } | ||
197 | res->start = e820.map[i].addr; | ||
198 | res->end = res->start + e820.map[i].size - 1; | ||
199 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
200 | request_resource(&iomem_resource, res); | ||
201 | if (e820.map[i].type == E820_RAM) { | ||
202 | /* | ||
203 | * We don't know which RAM region contains kernel data, | ||
204 | * so we try it repeatedly and let the resource manager | ||
205 | * test it. | ||
206 | */ | ||
207 | request_resource(res, &code_resource); | ||
208 | request_resource(res, &data_resource); | ||
209 | } | ||
210 | } | ||
211 | } | ||
212 | |||
213 | /* | ||
214 | * Add a memory region to the kernel e820 map. | ||
215 | */ | ||
216 | void __init add_memory_region(unsigned long start, unsigned long size, int type) | ||
217 | { | ||
218 | int x = e820.nr_map; | ||
219 | |||
220 | if (x == E820MAX) { | ||
221 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | ||
222 | return; | ||
223 | } | ||
224 | |||
225 | e820.map[x].addr = start; | ||
226 | e820.map[x].size = size; | ||
227 | e820.map[x].type = type; | ||
228 | e820.nr_map++; | ||
229 | } | ||
230 | |||
231 | void __init e820_print_map(char *who) | ||
232 | { | ||
233 | int i; | ||
234 | |||
235 | for (i = 0; i < e820.nr_map; i++) { | ||
236 | printk(" %s: %016Lx - %016Lx ", who, | ||
237 | (unsigned long long) e820.map[i].addr, | ||
238 | (unsigned long long) (e820.map[i].addr + e820.map[i].size)); | ||
239 | switch (e820.map[i].type) { | ||
240 | case E820_RAM: printk("(usable)\n"); | ||
241 | break; | ||
242 | case E820_RESERVED: | ||
243 | printk("(reserved)\n"); | ||
244 | break; | ||
245 | case E820_ACPI: | ||
246 | printk("(ACPI data)\n"); | ||
247 | break; | ||
248 | case E820_NVS: | ||
249 | printk("(ACPI NVS)\n"); | ||
250 | break; | ||
251 | default: printk("type %u\n", e820.map[i].type); | ||
252 | break; | ||
253 | } | ||
254 | } | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Sanitize the BIOS e820 map. | ||
259 | * | ||
260 | * Some e820 responses include overlapping entries. The following | ||
261 | * replaces the original e820 map with a new one, removing overlaps. | ||
262 | * | ||
263 | */ | ||
264 | static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) | ||
265 | { | ||
266 | struct change_member { | ||
267 | struct e820entry *pbios; /* pointer to original bios entry */ | ||
268 | unsigned long long addr; /* address for this change point */ | ||
269 | }; | ||
270 | static struct change_member change_point_list[2*E820MAX] __initdata; | ||
271 | static struct change_member *change_point[2*E820MAX] __initdata; | ||
272 | static struct e820entry *overlap_list[E820MAX] __initdata; | ||
273 | static struct e820entry new_bios[E820MAX] __initdata; | ||
274 | struct change_member *change_tmp; | ||
275 | unsigned long current_type, last_type; | ||
276 | unsigned long long last_addr; | ||
277 | int chgidx, still_changing; | ||
278 | int overlap_entries; | ||
279 | int new_bios_entry; | ||
280 | int old_nr, new_nr; | ||
281 | int i; | ||
282 | |||
283 | /* | ||
284 | Visually we're performing the following (1,2,3,4 = memory types)... | ||
285 | |||
286 | Sample memory map (w/overlaps): | ||
287 | ____22__________________ | ||
288 | ______________________4_ | ||
289 | ____1111________________ | ||
290 | _44_____________________ | ||
291 | 11111111________________ | ||
292 | ____________________33__ | ||
293 | ___________44___________ | ||
294 | __________33333_________ | ||
295 | ______________22________ | ||
296 | ___________________2222_ | ||
297 | _________111111111______ | ||
298 | _____________________11_ | ||
299 | _________________4______ | ||
300 | |||
301 | Sanitized equivalent (no overlap): | ||
302 | 1_______________________ | ||
303 | _44_____________________ | ||
304 | ___1____________________ | ||
305 | ____22__________________ | ||
306 | ______11________________ | ||
307 | _________1______________ | ||
308 | __________3_____________ | ||
309 | ___________44___________ | ||
310 | _____________33_________ | ||
311 | _______________2________ | ||
312 | ________________1_______ | ||
313 | _________________4______ | ||
314 | ___________________2____ | ||
315 | ____________________33__ | ||
316 | ______________________4_ | ||
317 | */ | ||
318 | |||
319 | /* if there's only one memory region, don't bother */ | ||
320 | if (*pnr_map < 2) | ||
321 | return -1; | ||
322 | |||
323 | old_nr = *pnr_map; | ||
324 | |||
325 | /* bail out if we find any unreasonable addresses in bios map */ | ||
326 | for (i=0; i<old_nr; i++) | ||
327 | if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) | ||
328 | return -1; | ||
329 | |||
330 | /* create pointers for initial change-point information (for sorting) */ | ||
331 | for (i=0; i < 2*old_nr; i++) | ||
332 | change_point[i] = &change_point_list[i]; | ||
333 | |||
334 | /* record all known change-points (starting and ending addresses) */ | ||
335 | chgidx = 0; | ||
336 | for (i=0; i < old_nr; i++) { | ||
337 | change_point[chgidx]->addr = biosmap[i].addr; | ||
338 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
339 | change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; | ||
340 | change_point[chgidx++]->pbios = &biosmap[i]; | ||
341 | } | ||
342 | |||
343 | /* sort change-point list by memory addresses (low -> high) */ | ||
344 | still_changing = 1; | ||
345 | while (still_changing) { | ||
346 | still_changing = 0; | ||
347 | for (i=1; i < 2*old_nr; i++) { | ||
348 | /* if <current_addr> > <last_addr>, swap */ | ||
349 | /* or, if current=<start_addr> & last=<end_addr>, swap */ | ||
350 | if ((change_point[i]->addr < change_point[i-1]->addr) || | ||
351 | ((change_point[i]->addr == change_point[i-1]->addr) && | ||
352 | (change_point[i]->addr == change_point[i]->pbios->addr) && | ||
353 | (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) | ||
354 | ) | ||
355 | { | ||
356 | change_tmp = change_point[i]; | ||
357 | change_point[i] = change_point[i-1]; | ||
358 | change_point[i-1] = change_tmp; | ||
359 | still_changing=1; | ||
360 | } | ||
361 | } | ||
362 | } | ||
363 | |||
364 | /* create a new bios memory map, removing overlaps */ | ||
365 | overlap_entries=0; /* number of entries in the overlap table */ | ||
366 | new_bios_entry=0; /* index for creating new bios map entries */ | ||
367 | last_type = 0; /* start with undefined memory type */ | ||
368 | last_addr = 0; /* start with 0 as last starting address */ | ||
369 | /* loop through change-points, determining the effect on the new bios map */ | ||
370 | for (chgidx=0; chgidx < 2*old_nr; chgidx++) | ||
371 | { | ||
372 | /* keep track of all overlapping bios entries */ | ||
373 | if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) | ||
374 | { | ||
375 | /* add map entry to overlap list (> 1 entry implies an overlap) */ | ||
376 | overlap_list[overlap_entries++]=change_point[chgidx]->pbios; | ||
377 | } | ||
378 | else | ||
379 | { | ||
380 | /* remove entry from list (order independent, so swap with last) */ | ||
381 | for (i=0; i<overlap_entries; i++) | ||
382 | { | ||
383 | if (overlap_list[i] == change_point[chgidx]->pbios) | ||
384 | overlap_list[i] = overlap_list[overlap_entries-1]; | ||
385 | } | ||
386 | overlap_entries--; | ||
387 | } | ||
388 | /* if there are overlapping entries, decide which "type" to use */ | ||
389 | /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ | ||
390 | current_type = 0; | ||
391 | for (i=0; i<overlap_entries; i++) | ||
392 | if (overlap_list[i]->type > current_type) | ||
393 | current_type = overlap_list[i]->type; | ||
394 | /* continue building up new bios map based on this information */ | ||
395 | if (current_type != last_type) { | ||
396 | if (last_type != 0) { | ||
397 | new_bios[new_bios_entry].size = | ||
398 | change_point[chgidx]->addr - last_addr; | ||
399 | /* move forward only if the new size was non-zero */ | ||
400 | if (new_bios[new_bios_entry].size != 0) | ||
401 | if (++new_bios_entry >= E820MAX) | ||
402 | break; /* no more space left for new bios entries */ | ||
403 | } | ||
404 | if (current_type != 0) { | ||
405 | new_bios[new_bios_entry].addr = change_point[chgidx]->addr; | ||
406 | new_bios[new_bios_entry].type = current_type; | ||
407 | last_addr=change_point[chgidx]->addr; | ||
408 | } | ||
409 | last_type = current_type; | ||
410 | } | ||
411 | } | ||
412 | new_nr = new_bios_entry; /* retain count for new bios entries */ | ||
413 | |||
414 | /* copy new bios mapping into original location */ | ||
415 | memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); | ||
416 | *pnr_map = new_nr; | ||
417 | |||
418 | return 0; | ||
419 | } | ||
420 | |||
421 | /* | ||
422 | * Copy the BIOS e820 map into a safe place. | ||
423 | * | ||
424 | * Sanity-check it while we're at it.. | ||
425 | * | ||
426 | * If we're lucky and live on a modern system, the setup code | ||
427 | * will have given us a memory map that we can use to properly | ||
428 | * set up memory. If we aren't, we'll fake a memory map. | ||
429 | * | ||
430 | * We check to see that the memory map contains at least 2 elements | ||
431 | * before we'll use it, because the detection code in setup.S may | ||
432 | * not be perfect and almost every PC known to man has two memory | ||
433 | * regions: one from 0 to 640k, and one from 1mb up. (The IBM | ||
434 | * thinkpad 560x, for example, does not cooperate with the memory | ||
435 | * detection code.) | ||
436 | */ | ||
437 | static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) | ||
438 | { | ||
439 | /* Only one memory region (or negative)? Ignore it */ | ||
440 | if (nr_map < 2) | ||
441 | return -1; | ||
442 | |||
443 | do { | ||
444 | unsigned long start = biosmap->addr; | ||
445 | unsigned long size = biosmap->size; | ||
446 | unsigned long end = start + size; | ||
447 | unsigned long type = biosmap->type; | ||
448 | |||
449 | /* Overflow in 64 bits? Ignore the memory map. */ | ||
450 | if (start > end) | ||
451 | return -1; | ||
452 | |||
453 | /* | ||
454 | * Some BIOSes claim RAM in the 640k - 1M region. | ||
455 | * Not right. Fix it up. | ||
456 | * | ||
457 | * This should be removed on Hammer which is supposed to not | ||
458 | * have non e820 covered ISA mappings there, but I had some strange | ||
459 | * problems so it stays for now. -AK | ||
460 | */ | ||
461 | if (type == E820_RAM) { | ||
462 | if (start < 0x100000ULL && end > 0xA0000ULL) { | ||
463 | if (start < 0xA0000ULL) | ||
464 | add_memory_region(start, 0xA0000ULL-start, type); | ||
465 | if (end <= 0x100000ULL) | ||
466 | continue; | ||
467 | start = 0x100000ULL; | ||
468 | size = end - start; | ||
469 | } | ||
470 | } | ||
471 | |||
472 | add_memory_region(start, size, type); | ||
473 | } while (biosmap++,--nr_map); | ||
474 | return 0; | ||
475 | } | ||
476 | |||
477 | void __init setup_memory_region(void) | ||
478 | { | ||
479 | char *who = "BIOS-e820"; | ||
480 | |||
481 | /* | ||
482 | * Try to copy the BIOS-supplied E820-map. | ||
483 | * | ||
484 | * Otherwise fake a memory map; one section from 0k->640k, | ||
485 | * the next section from 1mb->appropriate_mem_k | ||
486 | */ | ||
487 | sanitize_e820_map(E820_MAP, &E820_MAP_NR); | ||
488 | if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { | ||
489 | unsigned long mem_size; | ||
490 | |||
491 | /* compare results from other methods and take the greater */ | ||
492 | if (ALT_MEM_K < EXT_MEM_K) { | ||
493 | mem_size = EXT_MEM_K; | ||
494 | who = "BIOS-88"; | ||
495 | } else { | ||
496 | mem_size = ALT_MEM_K; | ||
497 | who = "BIOS-e801"; | ||
498 | } | ||
499 | |||
500 | e820.nr_map = 0; | ||
501 | add_memory_region(0, LOWMEMSIZE(), E820_RAM); | ||
502 | add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); | ||
503 | } | ||
504 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | ||
505 | e820_print_map(who); | ||
506 | } | ||
507 | |||
508 | void __init parse_memopt(char *p, char **from) | ||
509 | { | ||
510 | end_user_pfn = memparse(p, from); | ||
511 | end_user_pfn >>= PAGE_SHIFT; | ||
512 | } | ||
513 | |||
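The 640k-1M clipping in copy_e820_map() above is easy to exercise on its own: any E820_RAM entry that crosses the legacy hole is trimmed to end at 0xA0000, and whatever lies above 1MB is re-added as a separate region. A small user-space sketch of just that clipping rule; the BIOS entry and the printing add_memory_region() stand-in are illustrative only.

#include <stdio.h>

#define E820_RAM 1

/* Stand-in for the kernel's add_memory_region(); it just prints the region. */
static void add_memory_region(unsigned long start, unsigned long size, int type)
{
	printf("  %#lx - %#lx (type %d)\n", start, start + size, type);
}

/* Same clipping rule as copy_e820_map(): RAM claimed in 640k-1M is dropped. */
static void add_ram_clipped(unsigned long start, unsigned long end)
{
	if (start < 0x100000UL && end > 0xA0000UL) {
		if (start < 0xA0000UL)
			add_memory_region(start, 0xA0000UL - start, E820_RAM);
		if (end <= 0x100000UL)
			return;
		start = 0x100000UL;
	}
	add_memory_region(start, end - start, E820_RAM);
}

int main(void)
{
	/* A BIOS that (wrongly) reports 0 .. 1MB+128MB as one usable region. */
	add_ram_clipped(0x0, 0x8100000UL);
	return 0;
}

Running it yields the two regions 0-0xa0000 and 0x100000-0x8100000, matching what the kernel would add to its e820 map for such an entry.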
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c new file mode 100644 index 000000000000..750bcd0655dc --- /dev/null +++ b/arch/x86_64/kernel/early_printk.c | |||
@@ -0,0 +1,220 @@ | |||
1 | #include <linux/console.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <asm/io.h> | ||
6 | #include <asm/processor.h> | ||
7 | |||
8 | /* Simple VGA output */ | ||
9 | |||
10 | #ifdef __i386__ | ||
11 | #define VGABASE (__ISA_IO_base + 0xb8000) | ||
12 | #else | ||
13 | #define VGABASE ((void __iomem *)0xffffffff800b8000UL) | ||
14 | #endif | ||
15 | |||
16 | #define MAX_YPOS 25 | ||
17 | #define MAX_XPOS 80 | ||
18 | |||
19 | static int current_ypos = 1, current_xpos = 0; | ||
20 | |||
21 | static void early_vga_write(struct console *con, const char *str, unsigned n) | ||
22 | { | ||
23 | char c; | ||
24 | int i, k, j; | ||
25 | |||
26 | while ((c = *str++) != '\0' && n-- > 0) { | ||
27 | if (current_ypos >= MAX_YPOS) { | ||
28 | /* scroll 1 line up */ | ||
29 | for (k = 1, j = 0; k < MAX_YPOS; k++, j++) { | ||
30 | for (i = 0; i < MAX_XPOS; i++) { | ||
31 | writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), | ||
32 | VGABASE + 2*(MAX_XPOS*j + i)); | ||
33 | } | ||
34 | } | ||
35 | for (i = 0; i < MAX_XPOS; i++) | ||
36 | writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); | ||
37 | current_ypos = MAX_YPOS-1; | ||
38 | } | ||
39 | if (c == '\n') { | ||
40 | current_xpos = 0; | ||
41 | current_ypos++; | ||
42 | } else if (c != '\r') { | ||
43 | writew(((0x7 << 8) | (unsigned short) c), | ||
44 | VGABASE + 2*(MAX_XPOS*current_ypos + | ||
45 | current_xpos++)); | ||
46 | if (current_xpos >= MAX_XPOS) { | ||
47 | current_xpos = 0; | ||
48 | current_ypos++; | ||
49 | } | ||
50 | } | ||
51 | } | ||
52 | } | ||
53 | |||
54 | static struct console early_vga_console = { | ||
55 | .name = "earlyvga", | ||
56 | .write = early_vga_write, | ||
57 | .flags = CON_PRINTBUFFER, | ||
58 | .index = -1, | ||
59 | }; | ||
60 | |||
61 | /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ | ||
62 | |||
63 | int early_serial_base = 0x3f8; /* ttyS0 */ | ||
64 | |||
65 | #define XMTRDY 0x20 | ||
66 | |||
67 | #define DLAB 0x80 | ||
68 | |||
69 | #define TXR 0 /* Transmit register (WRITE) */ | ||
70 | #define RXR 0 /* Receive register (READ) */ | ||
71 | #define IER 1 /* Interrupt Enable */ | ||
72 | #define IIR 2 /* Interrupt ID */ | ||
73 | #define FCR 2 /* FIFO control */ | ||
74 | #define LCR 3 /* Line control */ | ||
75 | #define MCR 4 /* Modem control */ | ||
76 | #define LSR 5 /* Line Status */ | ||
77 | #define MSR 6 /* Modem Status */ | ||
78 | #define DLL 0 /* Divisor Latch Low */ | ||
79 | #define DLH 1 /* Divisor latch High */ | ||
80 | |||
81 | static int early_serial_putc(unsigned char ch) | ||
82 | { | ||
83 | unsigned timeout = 0xffff; | ||
84 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) | ||
85 | cpu_relax(); | ||
86 | outb(ch, early_serial_base + TXR); | ||
87 | return timeout ? 0 : -1; | ||
88 | } | ||
89 | |||
90 | static void early_serial_write(struct console *con, const char *s, unsigned n) | ||
91 | { | ||
92 | while (*s && n-- > 0) { | ||
93 | early_serial_putc(*s); | ||
94 | if (*s == '\n') | ||
95 | early_serial_putc('\r'); | ||
96 | s++; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | #define DEFAULT_BAUD 9600 | ||
101 | |||
102 | static __init void early_serial_init(char *s) | ||
103 | { | ||
104 | unsigned char c; | ||
105 | unsigned divisor; | ||
106 | unsigned baud = DEFAULT_BAUD; | ||
107 | char *e; | ||
108 | |||
109 | if (*s == ',') | ||
110 | ++s; | ||
111 | |||
112 | if (*s) { | ||
113 | unsigned port; | ||
114 | if (!strncmp(s,"0x",2)) { | ||
115 | early_serial_base = simple_strtoul(s, &e, 16); | ||
116 | } else { | ||
117 | static int bases[] = { 0x3f8, 0x2f8 }; | ||
118 | |||
119 | if (!strncmp(s,"ttyS",4)) | ||
120 | s += 4; | ||
121 | port = simple_strtoul(s, &e, 10); | ||
122 | if (port > 1 || s == e) | ||
123 | port = 0; | ||
124 | early_serial_base = bases[port]; | ||
125 | } | ||
126 | s += strcspn(s, ","); | ||
127 | if (*s == ',') | ||
128 | s++; | ||
129 | } | ||
130 | |||
131 | outb(0x3, early_serial_base + LCR); /* 8n1 */ | ||
132 | outb(0, early_serial_base + IER); /* no interrupt */ | ||
133 | outb(0, early_serial_base + FCR); /* no fifo */ | ||
134 | outb(0x3, early_serial_base + MCR); /* DTR + RTS */ | ||
135 | |||
136 | if (*s) { | ||
137 | baud = simple_strtoul(s, &e, 0); | ||
138 | if (baud == 0 || s == e) | ||
139 | baud = DEFAULT_BAUD; | ||
140 | } | ||
141 | |||
142 | divisor = 115200 / baud; | ||
143 | c = inb(early_serial_base + LCR); | ||
144 | outb(c | DLAB, early_serial_base + LCR); | ||
145 | outb(divisor & 0xff, early_serial_base + DLL); | ||
146 | outb((divisor >> 8) & 0xff, early_serial_base + DLH); | ||
147 | outb(c & ~DLAB, early_serial_base + LCR); | ||
148 | } | ||
149 | |||
150 | static struct console early_serial_console = { | ||
151 | .name = "earlyser", | ||
152 | .write = early_serial_write, | ||
153 | .flags = CON_PRINTBUFFER, | ||
154 | .index = -1, | ||
155 | }; | ||
156 | |||
157 | /* Direct interface for emergencies */ | ||
158 | struct console *early_console = &early_vga_console; | ||
159 | static int early_console_initialized = 0; | ||
160 | |||
161 | void early_printk(const char *fmt, ...) | ||
162 | { | ||
163 | char buf[512]; | ||
164 | int n; | ||
165 | va_list ap; | ||
166 | |||
167 | va_start(ap,fmt); | ||
168 | n = vscnprintf(buf,512,fmt,ap); | ||
169 | early_console->write(early_console,buf,n); | ||
170 | va_end(ap); | ||
171 | } | ||
172 | |||
173 | static int keep_early; | ||
174 | |||
175 | int __init setup_early_printk(char *opt) | ||
176 | { | ||
177 | char *space; | ||
178 | char buf[256]; | ||
179 | |||
180 | if (early_console_initialized) | ||
181 | return -1; | ||
182 | |||
183 | opt = strchr(opt, '=') + 1; | ||
184 | |||
185 | strlcpy(buf,opt,sizeof(buf)); | ||
186 | space = strchr(buf, ' '); | ||
187 | if (space) | ||
188 | *space = 0; | ||
189 | |||
190 | if (strstr(buf,"keep")) | ||
191 | keep_early = 1; | ||
192 | |||
193 | if (!strncmp(buf, "serial", 6)) { | ||
194 | early_serial_init(buf + 6); | ||
195 | early_console = &early_serial_console; | ||
196 | } else if (!strncmp(buf, "ttyS", 4)) { | ||
197 | early_serial_init(buf); | ||
198 | early_console = &early_serial_console; | ||
199 | } else if (!strncmp(buf, "vga", 3)) { | ||
200 | early_console = &early_vga_console; | ||
201 | } | ||
202 | early_console_initialized = 1; | ||
203 | register_console(early_console); | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | void __init disable_early_printk(void) | ||
208 | { | ||
209 | if (!early_console_initialized || !early_console) | ||
210 | return; | ||
211 | if (!keep_early) { | ||
212 | printk("disabling early console\n"); | ||
213 | unregister_console(early_console); | ||
214 | early_console_initialized = 0; | ||
215 | } else { | ||
216 | printk("keeping early console\n"); | ||
217 | } | ||
218 | } | ||
219 | |||
220 | __setup("earlyprintk=", setup_early_printk); | ||
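The baud-rate programming in early_serial_init() above comes down to one division: with DLAB set in LCR, the 16550 divisor latch (DLL/DLH) is loaded with 115200/baud, which assumes the standard PC 1.8432 MHz UART input clock. A short host-side sketch of just that computation for a few common rates (no port I/O, purely the arithmetic):

#include <stdio.h>

int main(void)
{
	unsigned rates[] = { 9600, 19200, 38400, 57600, 115200 };
	unsigned i;

	for (i = 0; i < sizeof(rates) / sizeof(rates[0]); i++) {
		unsigned divisor = 115200 / rates[i];
		/* DLL takes the low byte, DLH the high byte, while DLAB is set. */
		printf("%6u baud -> divisor %2u (DLL=0x%02x, DLH=0x%02x)\n",
		       rates[i], divisor, divisor & 0xff, (divisor >> 8) & 0xff);
	}
	return 0;
}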
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S new file mode 100644 index 000000000000..e126284db7a8 --- /dev/null +++ b/arch/x86_64/kernel/entry.S | |||
@@ -0,0 +1,920 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/entry.S | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
7 | * | ||
8 | * $Id$ | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * entry.S contains the system-call and fault low-level handling routines. | ||
13 | * | ||
14 | * NOTE: This code handles signal-recognition, which happens every time | ||
15 | * after an interrupt and after each system call. | ||
16 | * | ||
17 | * Normal syscalls and interrupts don't save a full stack frame; this is | ||
18 | * only done for syscall tracing, signals or fork/exec et al. | ||
19 | * | ||
20 | * A note on terminology: | ||
21 | * - top of stack: Architecture defined interrupt frame from SS to RIP | ||
22 | * at the top of the kernel process stack. | ||
23 | * - partial stack frame: partially saved registers up to R11. | ||
24 | * - full stack frame: Like partial stack frame, but all register saved. | ||
25 | * | ||
26 | * TODO: | ||
27 | * - schedule it carefully for the final hardware. | ||
28 | */ | ||
29 | |||
30 | #define ASSEMBLY 1 | ||
31 | #include <linux/config.h> | ||
32 | #include <linux/linkage.h> | ||
33 | #include <asm/segment.h> | ||
34 | #include <asm/smp.h> | ||
35 | #include <asm/cache.h> | ||
36 | #include <asm/errno.h> | ||
37 | #include <asm/dwarf2.h> | ||
38 | #include <asm/calling.h> | ||
39 | #include <asm/offset.h> | ||
40 | #include <asm/msr.h> | ||
41 | #include <asm/unistd.h> | ||
42 | #include <asm/thread_info.h> | ||
43 | #include <asm/hw_irq.h> | ||
44 | |||
45 | .code64 | ||
46 | |||
47 | #ifdef CONFIG_PREEMPT | ||
48 | #define preempt_stop cli | ||
49 | #else | ||
50 | #define preempt_stop | ||
51 | #define retint_kernel retint_restore_args | ||
52 | #endif | ||
53 | |||
54 | /* | ||
55 | * C code is not supposed to know about undefined top of stack. Every time | ||
56 | * a C function with a pt_regs argument is called from the SYSCALL-based | ||
57 | * fast path FIXUP_TOP_OF_STACK is needed. | ||
58 | * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs | ||
59 | * manipulation. | ||
60 | */ | ||
61 | |||
62 | /* %rsp:at FRAMEEND */ | ||
63 | .macro FIXUP_TOP_OF_STACK tmp | ||
64 | movq %gs:pda_oldrsp,\tmp | ||
65 | movq \tmp,RSP(%rsp) | ||
66 | movq $__USER_DS,SS(%rsp) | ||
67 | movq $__USER_CS,CS(%rsp) | ||
68 | movq $-1,RCX(%rsp) | ||
69 | movq R11(%rsp),\tmp /* get eflags */ | ||
70 | movq \tmp,EFLAGS(%rsp) | ||
71 | .endm | ||
72 | |||
73 | .macro RESTORE_TOP_OF_STACK tmp,offset=0 | ||
74 | movq RSP-\offset(%rsp),\tmp | ||
75 | movq \tmp,%gs:pda_oldrsp | ||
76 | movq EFLAGS-\offset(%rsp),\tmp | ||
77 | movq \tmp,R11-\offset(%rsp) | ||
78 | .endm | ||
79 | |||
80 | .macro FAKE_STACK_FRAME child_rip | ||
81 | /* push in order ss, rsp, eflags, cs, rip */ | ||
82 | xorq %rax, %rax | ||
83 | pushq %rax /* ss */ | ||
84 | CFI_ADJUST_CFA_OFFSET 8 | ||
85 | pushq %rax /* rsp */ | ||
86 | CFI_ADJUST_CFA_OFFSET 8 | ||
87 | CFI_OFFSET rip,0 | ||
88 | pushq $(1<<9) /* eflags - interrupts on */ | ||
89 | CFI_ADJUST_CFA_OFFSET 8 | ||
90 | pushq $__KERNEL_CS /* cs */ | ||
91 | CFI_ADJUST_CFA_OFFSET 8 | ||
92 | pushq \child_rip /* rip */ | ||
93 | CFI_ADJUST_CFA_OFFSET 8 | ||
94 | CFI_OFFSET rip,0 | ||
95 | pushq %rax /* orig rax */ | ||
96 | CFI_ADJUST_CFA_OFFSET 8 | ||
97 | .endm | ||
98 | |||
99 | .macro UNFAKE_STACK_FRAME | ||
100 | addq $8*6, %rsp | ||
101 | CFI_ADJUST_CFA_OFFSET -(6*8) | ||
102 | .endm | ||
103 | |||
104 | .macro CFI_DEFAULT_STACK | ||
105 | CFI_ADJUST_CFA_OFFSET (SS) | ||
106 | CFI_OFFSET r15,R15-SS | ||
107 | CFI_OFFSET r14,R14-SS | ||
108 | CFI_OFFSET r13,R13-SS | ||
109 | CFI_OFFSET r12,R12-SS | ||
110 | CFI_OFFSET rbp,RBP-SS | ||
111 | CFI_OFFSET rbx,RBX-SS | ||
112 | CFI_OFFSET r11,R11-SS | ||
113 | CFI_OFFSET r10,R10-SS | ||
114 | CFI_OFFSET r9,R9-SS | ||
115 | CFI_OFFSET r8,R8-SS | ||
116 | CFI_OFFSET rax,RAX-SS | ||
117 | CFI_OFFSET rcx,RCX-SS | ||
118 | CFI_OFFSET rdx,RDX-SS | ||
119 | CFI_OFFSET rsi,RSI-SS | ||
120 | CFI_OFFSET rdi,RDI-SS | ||
121 | CFI_OFFSET rsp,RSP-SS | ||
122 | CFI_OFFSET rip,RIP-SS | ||
123 | .endm | ||
124 | /* | ||
125 | * A newly forked process directly context switches into this. | ||
126 | */ | ||
127 | /* rdi: prev */ | ||
128 | ENTRY(ret_from_fork) | ||
129 | CFI_STARTPROC | ||
130 | CFI_DEFAULT_STACK | ||
131 | call schedule_tail | ||
132 | GET_THREAD_INFO(%rcx) | ||
133 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) | ||
134 | jnz rff_trace | ||
135 | rff_action: | ||
136 | RESTORE_REST | ||
137 | testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? | ||
138 | je int_ret_from_sys_call | ||
139 | testl $_TIF_IA32,threadinfo_flags(%rcx) | ||
140 | jnz int_ret_from_sys_call | ||
141 | RESTORE_TOP_OF_STACK %rdi,ARGOFFSET | ||
142 | jmp ret_from_sys_call | ||
143 | rff_trace: | ||
144 | movq %rsp,%rdi | ||
145 | call syscall_trace_leave | ||
146 | GET_THREAD_INFO(%rcx) | ||
147 | jmp rff_action | ||
148 | CFI_ENDPROC | ||
149 | |||
150 | /* | ||
151 | * System call entry. Up to 6 arguments in registers are supported. | ||
152 | * | ||
153 | * SYSCALL does not save anything on the stack and does not change the | ||
154 | * stack pointer. | ||
155 | */ | ||
156 | |||
157 | /* | ||
158 | * Register setup: | ||
159 | * rax system call number | ||
160 | * rdi arg0 | ||
161 | * rcx return address for syscall/sysret, C arg3 | ||
162 | * rsi arg1 | ||
163 | * rdx arg2 | ||
164 | * r10 arg3 (--> moved to rcx for C) | ||
165 | * r8 arg4 | ||
166 | * r9 arg5 | ||
167 | * r11 eflags for syscall/sysret, temporary for C | ||
168 | * r12-r15,rbp,rbx saved by C code, not touched. | ||
169 | * | ||
170 | * Interrupts are off on entry. | ||
171 | * Only called from user space. | ||
172 | * | ||
173 | * XXX if we had a free scratch register we could save the RSP into the stack frame | ||
174 | * and report it properly in ps. Unfortunately we haven't. | ||
175 | */ | ||
176 | |||
177 | ENTRY(system_call) | ||
178 | CFI_STARTPROC | ||
179 | swapgs | ||
180 | movq %rsp,%gs:pda_oldrsp | ||
181 | movq %gs:pda_kernelstack,%rsp | ||
182 | sti | ||
183 | SAVE_ARGS 8,1 | ||
184 | movq %rax,ORIG_RAX-ARGOFFSET(%rsp) | ||
185 | movq %rcx,RIP-ARGOFFSET(%rsp) | ||
186 | GET_THREAD_INFO(%rcx) | ||
187 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) | ||
188 | jnz tracesys | ||
189 | cmpq $__NR_syscall_max,%rax | ||
190 | ja badsys | ||
191 | movq %r10,%rcx | ||
192 | call *sys_call_table(,%rax,8) # XXX: rip relative | ||
193 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
194 | /* | ||
195 | * Syscall return path ending with SYSRET (fast path) | ||
196 | * Has incomplete stack frame and undefined top of stack. | ||
197 | */ | ||
198 | .globl ret_from_sys_call | ||
199 | ret_from_sys_call: | ||
200 | movl $_TIF_WORK_MASK,%edi | ||
201 | /* edi: flagmask */ | ||
202 | sysret_check: | ||
203 | GET_THREAD_INFO(%rcx) | ||
204 | cli | ||
205 | movl threadinfo_flags(%rcx),%edx | ||
206 | andl %edi,%edx | ||
207 | jnz sysret_careful | ||
208 | movq RIP-ARGOFFSET(%rsp),%rcx | ||
209 | RESTORE_ARGS 0,-ARG_SKIP,1 | ||
210 | movq %gs:pda_oldrsp,%rsp | ||
211 | swapgs | ||
212 | sysretq | ||
213 | |||
214 | /* Handle reschedules */ | ||
215 | /* edx: work, edi: workmask */ | ||
216 | sysret_careful: | ||
217 | bt $TIF_NEED_RESCHED,%edx | ||
218 | jnc sysret_signal | ||
219 | sti | ||
220 | pushq %rdi | ||
221 | call schedule | ||
222 | popq %rdi | ||
223 | jmp sysret_check | ||
224 | |||
225 | /* Handle a signal */ | ||
226 | sysret_signal: | ||
227 | sti | ||
228 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
229 | jz 1f | ||
230 | |||
231 | /* Really a signal */ | ||
232 | /* edx: work flags (arg3) */ | ||
233 | leaq do_notify_resume(%rip),%rax | ||
234 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | ||
235 | xorl %esi,%esi # oldset -> arg2 | ||
236 | call ptregscall_common | ||
237 | 1: movl $_TIF_NEED_RESCHED,%edi | ||
238 | jmp sysret_check | ||
239 | |||
240 | /* Do syscall tracing */ | ||
241 | tracesys: | ||
242 | SAVE_REST | ||
243 | movq $-ENOSYS,RAX(%rsp) | ||
244 | FIXUP_TOP_OF_STACK %rdi | ||
245 | movq %rsp,%rdi | ||
246 | call syscall_trace_enter | ||
247 | LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ | ||
248 | RESTORE_REST | ||
249 | cmpq $__NR_syscall_max,%rax | ||
250 | ja 1f | ||
251 | movq %r10,%rcx /* fixup for C */ | ||
252 | call *sys_call_table(,%rax,8) | ||
253 | movq %rax,RAX-ARGOFFSET(%rsp) | ||
254 | 1: SAVE_REST | ||
255 | movq %rsp,%rdi | ||
256 | call syscall_trace_leave | ||
257 | RESTORE_TOP_OF_STACK %rbx | ||
258 | RESTORE_REST | ||
259 | jmp ret_from_sys_call | ||
260 | |||
261 | badsys: | ||
262 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | ||
263 | jmp ret_from_sys_call | ||
264 | |||
265 | /* | ||
266 | * Syscall return path ending with IRET. | ||
267 | * Has correct top of stack, but partial stack frame. | ||
268 | */ | ||
269 | ENTRY(int_ret_from_sys_call) | ||
270 | cli | ||
271 | testl $3,CS-ARGOFFSET(%rsp) | ||
272 | je retint_restore_args | ||
273 | movl $_TIF_ALLWORK_MASK,%edi | ||
274 | /* edi: mask to check */ | ||
275 | int_with_check: | ||
276 | GET_THREAD_INFO(%rcx) | ||
277 | movl threadinfo_flags(%rcx),%edx | ||
278 | andl %edi,%edx | ||
279 | jnz int_careful | ||
280 | jmp retint_swapgs | ||
281 | |||
282 | /* Either reschedule or signal or syscall exit tracking needed. */ | ||
283 | /* First do a reschedule test. */ | ||
284 | /* edx: work, edi: workmask */ | ||
285 | int_careful: | ||
286 | bt $TIF_NEED_RESCHED,%edx | ||
287 | jnc int_very_careful | ||
288 | sti | ||
289 | pushq %rdi | ||
290 | call schedule | ||
291 | popq %rdi | ||
292 | jmp int_with_check | ||
293 | |||
294 | /* handle signals and tracing -- both require a full stack frame */ | ||
295 | int_very_careful: | ||
296 | sti | ||
297 | SAVE_REST | ||
298 | /* Check for syscall exit trace */ | ||
299 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx | ||
300 | jz int_signal | ||
301 | pushq %rdi | ||
302 | leaq 8(%rsp),%rdi # &ptregs -> arg1 | ||
303 | call syscall_trace_leave | ||
304 | popq %rdi | ||
305 | btr $TIF_SYSCALL_TRACE,%edi | ||
306 | btr $TIF_SYSCALL_AUDIT,%edi | ||
307 | btr $TIF_SINGLESTEP,%edi | ||
308 | jmp int_restore_rest | ||
309 | |||
310 | int_signal: | ||
311 | testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx | ||
312 | jz 1f | ||
313 | movq %rsp,%rdi # &ptregs -> arg1 | ||
314 | xorl %esi,%esi # oldset -> arg2 | ||
315 | call do_notify_resume | ||
316 | 1: movl $_TIF_NEED_RESCHED,%edi | ||
317 | int_restore_rest: | ||
318 | RESTORE_REST | ||
319 | jmp int_with_check | ||
320 | CFI_ENDPROC | ||
321 | |||
322 | /* | ||
323 | * Certain special system calls need to save a complete stack frame. | ||
324 | */ | ||
325 | |||
326 | .macro PTREGSCALL label,func,arg | ||
327 | .globl \label | ||
328 | \label: | ||
329 | leaq \func(%rip),%rax | ||
330 | leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ | ||
331 | jmp ptregscall_common | ||
332 | .endm | ||
333 | |||
334 | PTREGSCALL stub_clone, sys_clone, %r8 | ||
335 | PTREGSCALL stub_fork, sys_fork, %rdi | ||
336 | PTREGSCALL stub_vfork, sys_vfork, %rdi | ||
337 | PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx | ||
338 | PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx | ||
339 | PTREGSCALL stub_iopl, sys_iopl, %rsi | ||
340 | |||
341 | ENTRY(ptregscall_common) | ||
342 | CFI_STARTPROC | ||
343 | popq %r11 | ||
344 | CFI_ADJUST_CFA_OFFSET -8 | ||
345 | SAVE_REST | ||
346 | movq %r11, %r15 | ||
347 | FIXUP_TOP_OF_STACK %r11 | ||
348 | call *%rax | ||
349 | RESTORE_TOP_OF_STACK %r11 | ||
350 | movq %r15, %r11 | ||
351 | RESTORE_REST | ||
352 | pushq %r11 | ||
353 | CFI_ADJUST_CFA_OFFSET 8 | ||
354 | ret | ||
355 | CFI_ENDPROC | ||
356 | |||
357 | ENTRY(stub_execve) | ||
358 | CFI_STARTPROC | ||
359 | popq %r11 | ||
360 | CFI_ADJUST_CFA_OFFSET -8 | ||
361 | SAVE_REST | ||
362 | movq %r11, %r15 | ||
363 | FIXUP_TOP_OF_STACK %r11 | ||
364 | call sys_execve | ||
365 | GET_THREAD_INFO(%rcx) | ||
366 | bt $TIF_IA32,threadinfo_flags(%rcx) | ||
367 | jc exec_32bit | ||
368 | RESTORE_TOP_OF_STACK %r11 | ||
369 | movq %r15, %r11 | ||
370 | RESTORE_REST | ||
371 | push %r11 | ||
372 | ret | ||
373 | |||
374 | exec_32bit: | ||
375 | CFI_ADJUST_CFA_OFFSET REST_SKIP | ||
376 | movq %rax,RAX(%rsp) | ||
377 | RESTORE_REST | ||
378 | jmp int_ret_from_sys_call | ||
379 | CFI_ENDPROC | ||
380 | |||
381 | /* | ||
382 | * sigreturn is special because it needs to restore all registers on return. | ||
383 | * This cannot be done with SYSRET, so use the IRET return path instead. | ||
384 | */ | ||
385 | ENTRY(stub_rt_sigreturn) | ||
386 | CFI_STARTPROC | ||
387 | addq $8, %rsp | ||
388 | SAVE_REST | ||
389 | movq %rsp,%rdi | ||
390 | FIXUP_TOP_OF_STACK %r11 | ||
391 | call sys_rt_sigreturn | ||
392 | movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer | ||
393 | RESTORE_REST | ||
394 | jmp int_ret_from_sys_call | ||
395 | CFI_ENDPROC | ||
396 | |||
397 | /* | ||
398 | * Interrupt entry/exit. | ||
399 | * | ||
400 | * Interrupt entry points save only callee-clobbered registers in the fast path. | ||
401 | * | ||
402 | * Entry runs with interrupts off. | ||
403 | */ | ||
404 | |||
405 | /* 0(%rsp): interrupt number */ | ||
406 | .macro interrupt func | ||
407 | CFI_STARTPROC simple | ||
408 | CFI_DEF_CFA rsp,(SS-RDI) | ||
409 | CFI_REL_OFFSET rsp,(RSP-ORIG_RAX) | ||
410 | CFI_REL_OFFSET rip,(RIP-ORIG_RAX) | ||
411 | cld | ||
412 | #ifdef CONFIG_DEBUG_INFO | ||
413 | SAVE_ALL | ||
414 | movq %rsp,%rdi | ||
415 | /* | ||
416 | * Set up a stack frame pointer. This allows gdb to trace | ||
417 | * back to the original stack. | ||
418 | */ | ||
419 | movq %rsp,%rbp | ||
420 | CFI_DEF_CFA_REGISTER rbp | ||
421 | #else | ||
422 | SAVE_ARGS | ||
423 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler | ||
424 | #endif | ||
425 | testl $3,CS(%rdi) | ||
426 | je 1f | ||
427 | swapgs | ||
428 | 1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count | ||
429 | movq %gs:pda_irqstackptr,%rax | ||
430 | cmoveq %rax,%rsp | ||
431 | pushq %rdi # save old stack | ||
432 | call \func | ||
433 | .endm | ||
434 | |||
435 | ENTRY(common_interrupt) | ||
436 | interrupt do_IRQ | ||
437 | /* 0(%rsp): oldrsp-ARGOFFSET */ | ||
438 | ret_from_intr: | ||
439 | popq %rdi | ||
440 | cli | ||
441 | subl $1,%gs:pda_irqcount | ||
442 | #ifdef CONFIG_DEBUG_INFO | ||
443 | movq RBP(%rdi),%rbp | ||
444 | #endif | ||
445 | leaq ARGOFFSET(%rdi),%rsp | ||
446 | exit_intr: | ||
447 | GET_THREAD_INFO(%rcx) | ||
448 | testl $3,CS-ARGOFFSET(%rsp) | ||
449 | je retint_kernel | ||
450 | |||
451 | /* Interrupt came from user space */ | ||
452 | /* | ||
453 | * Has a correct top of stack, but a partial stack frame | ||
454 | * %rcx: thread info. Interrupts off. | ||
455 | */ | ||
456 | retint_with_reschedule: | ||
457 | movl $_TIF_WORK_MASK,%edi | ||
458 | retint_check: | ||
459 | movl threadinfo_flags(%rcx),%edx | ||
460 | andl %edi,%edx | ||
461 | jnz retint_careful | ||
462 | retint_swapgs: | ||
463 | cli | ||
464 | swapgs | ||
465 | retint_restore_args: | ||
466 | cli | ||
467 | RESTORE_ARGS 0,8,0 | ||
468 | iret_label: | ||
469 | iretq | ||
470 | |||
471 | .section __ex_table,"a" | ||
472 | .quad iret_label,bad_iret | ||
473 | .previous | ||
474 | .section .fixup,"ax" | ||
475 | /* force a signal here? this matches i386 behaviour */ | ||
476 | /* running with kernel gs */ | ||
477 | bad_iret: | ||
478 | movq $-9999,%rdi /* better code? */ | ||
479 | jmp do_exit | ||
480 | .previous | ||
481 | |||
482 | /* edi: workmask, edx: work */ | ||
483 | retint_careful: | ||
484 | bt $TIF_NEED_RESCHED,%edx | ||
485 | jnc retint_signal | ||
486 | sti | ||
487 | pushq %rdi | ||
488 | call schedule | ||
489 | popq %rdi | ||
490 | GET_THREAD_INFO(%rcx) | ||
491 | cli | ||
492 | jmp retint_check | ||
493 | |||
494 | retint_signal: | ||
495 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
496 | jz retint_swapgs | ||
497 | sti | ||
498 | SAVE_REST | ||
499 | movq $-1,ORIG_RAX(%rsp) | ||
500 | xorq %rsi,%rsi # oldset | ||
501 | movq %rsp,%rdi # &pt_regs | ||
502 | call do_notify_resume | ||
503 | RESTORE_REST | ||
504 | cli | ||
505 | movl $_TIF_NEED_RESCHED,%edi | ||
506 | GET_THREAD_INFO(%rcx) | ||
507 | jmp retint_check | ||
508 | |||
509 | #ifdef CONFIG_PREEMPT | ||
510 | /* Returning to kernel space. Check if we need preemption */ | ||
511 | /* rcx: threadinfo. interrupts off. */ | ||
512 | .p2align | ||
513 | retint_kernel: | ||
514 | cmpl $0,threadinfo_preempt_count(%rcx) | ||
515 | jnz retint_restore_args | ||
516 | bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) | ||
517 | jnc retint_restore_args | ||
518 | bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ | ||
519 | jnc retint_restore_args | ||
520 | call preempt_schedule_irq | ||
521 | jmp exit_intr | ||
522 | #endif | ||
523 | CFI_ENDPROC | ||
524 | |||
525 | /* | ||
526 | * APIC interrupts. | ||
527 | */ | ||
528 | .macro apicinterrupt num,func | ||
529 | pushq $\num-256 | ||
530 | interrupt \func | ||
531 | jmp ret_from_intr | ||
532 | CFI_ENDPROC | ||
533 | .endm | ||
534 | |||
535 | ENTRY(thermal_interrupt) | ||
536 | apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt | ||
537 | |||
538 | #ifdef CONFIG_SMP | ||
539 | ENTRY(reschedule_interrupt) | ||
540 | apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt | ||
541 | |||
542 | ENTRY(invalidate_interrupt) | ||
543 | apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt | ||
544 | |||
545 | ENTRY(call_function_interrupt) | ||
546 | apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt | ||
547 | #endif | ||
548 | |||
549 | #ifdef CONFIG_X86_LOCAL_APIC | ||
550 | ENTRY(apic_timer_interrupt) | ||
551 | apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt | ||
552 | |||
553 | ENTRY(error_interrupt) | ||
554 | apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt | ||
555 | |||
556 | ENTRY(spurious_interrupt) | ||
557 | apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt | ||
558 | #endif | ||
559 | |||
560 | /* | ||
561 | * Exception entry points. | ||
562 | */ | ||
563 | .macro zeroentry sym | ||
564 | pushq $0 /* push error code/oldrax */ | ||
565 | pushq %rax /* push real oldrax to the rdi slot */ | ||
566 | leaq \sym(%rip),%rax | ||
567 | jmp error_entry | ||
568 | .endm | ||
569 | |||
570 | .macro errorentry sym | ||
571 | pushq %rax | ||
572 | leaq \sym(%rip),%rax | ||
573 | jmp error_entry | ||
574 | .endm | ||
575 | |||
576 | /* error code is on the stack already */ | ||
577 | /* handle NMI-like exceptions that can happen everywhere */ | ||
578 | .macro paranoidentry sym | ||
579 | SAVE_ALL | ||
580 | cld | ||
581 | movl $1,%ebx | ||
582 | movl $MSR_GS_BASE,%ecx | ||
583 | rdmsr | ||
584 | testl %edx,%edx | ||
585 | js 1f | ||
586 | swapgs | ||
587 | xorl %ebx,%ebx | ||
588 | 1: movq %rsp,%rdi | ||
589 | movq ORIG_RAX(%rsp),%rsi | ||
590 | movq $-1,ORIG_RAX(%rsp) | ||
591 | call \sym | ||
592 | .endm | ||
593 | |||
594 | /* | ||
595 | * Exception entry point. This expects an error code/orig_rax on the stack | ||
596 | * and the exception handler in %rax. | ||
597 | */ | ||
598 | ENTRY(error_entry) | ||
599 | CFI_STARTPROC simple | ||
600 | CFI_DEF_CFA rsp,(SS-RDI) | ||
601 | CFI_REL_OFFSET rsp,(RSP-RDI) | ||
602 | CFI_REL_OFFSET rip,(RIP-RDI) | ||
603 | /* rdi slot contains rax, oldrax contains error code */ | ||
604 | cld | ||
605 | subq $14*8,%rsp | ||
606 | CFI_ADJUST_CFA_OFFSET (14*8) | ||
607 | movq %rsi,13*8(%rsp) | ||
608 | CFI_REL_OFFSET rsi,RSI | ||
609 | movq 14*8(%rsp),%rsi /* load rax from rdi slot */ | ||
610 | movq %rdx,12*8(%rsp) | ||
611 | CFI_REL_OFFSET rdx,RDX | ||
612 | movq %rcx,11*8(%rsp) | ||
613 | CFI_REL_OFFSET rcx,RCX | ||
614 | movq %rsi,10*8(%rsp) /* store rax */ | ||
615 | CFI_REL_OFFSET rax,RAX | ||
616 | movq %r8, 9*8(%rsp) | ||
617 | CFI_REL_OFFSET r8,R8 | ||
618 | movq %r9, 8*8(%rsp) | ||
619 | CFI_REL_OFFSET r9,R9 | ||
620 | movq %r10,7*8(%rsp) | ||
621 | CFI_REL_OFFSET r10,R10 | ||
622 | movq %r11,6*8(%rsp) | ||
623 | CFI_REL_OFFSET r11,R11 | ||
624 | movq %rbx,5*8(%rsp) | ||
625 | CFI_REL_OFFSET rbx,RBX | ||
626 | movq %rbp,4*8(%rsp) | ||
627 | CFI_REL_OFFSET rbp,RBP | ||
628 | movq %r12,3*8(%rsp) | ||
629 | CFI_REL_OFFSET r12,R12 | ||
630 | movq %r13,2*8(%rsp) | ||
631 | CFI_REL_OFFSET r13,R13 | ||
632 | movq %r14,1*8(%rsp) | ||
633 | CFI_REL_OFFSET r14,R14 | ||
634 | movq %r15,(%rsp) | ||
635 | CFI_REL_OFFSET r15,R15 | ||
636 | xorl %ebx,%ebx | ||
637 | testl $3,CS(%rsp) | ||
638 | je error_kernelspace | ||
639 | error_swapgs: | ||
640 | swapgs | ||
641 | error_sti: | ||
642 | movq %rdi,RDI(%rsp) | ||
643 | movq %rsp,%rdi | ||
644 | movq ORIG_RAX(%rsp),%rsi /* get error code */ | ||
645 | movq $-1,ORIG_RAX(%rsp) | ||
646 | call *%rax | ||
647 | /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ | ||
648 | error_exit: | ||
649 | movl %ebx,%eax | ||
650 | RESTORE_REST | ||
651 | cli | ||
652 | GET_THREAD_INFO(%rcx) | ||
653 | testl %eax,%eax | ||
654 | jne retint_kernel | ||
655 | movl threadinfo_flags(%rcx),%edx | ||
656 | movl $_TIF_WORK_MASK,%edi | ||
657 | andl %edi,%edx | ||
658 | jnz retint_careful | ||
659 | swapgs | ||
660 | RESTORE_ARGS 0,8,0 | ||
661 | iretq | ||
662 | CFI_ENDPROC | ||
663 | |||
664 | error_kernelspace: | ||
665 | incl %ebx | ||
666 | /* There are two places in the kernel that can potentially fault with | ||
667 | usergs. Handle them here. The exception handlers after | ||
668 | iret run with kernel gs again, so don't set the user space flag. | ||
669 | B stepping K8s sometimes report a truncated RIP for IRET | ||
670 | exceptions returning to compat mode. Check for these here too. */ | ||
671 | leaq iret_label(%rip),%rbp | ||
672 | cmpq %rbp,RIP(%rsp) | ||
673 | je error_swapgs | ||
674 | movl %ebp,%ebp /* zero extend */ | ||
675 | cmpq %rbp,RIP(%rsp) | ||
676 | je error_swapgs | ||
677 | cmpq $gs_change,RIP(%rsp) | ||
678 | je error_swapgs | ||
679 | jmp error_sti | ||
680 | |||
681 | /* Reload gs selector with exception handling */ | ||
682 | /* edi: new selector */ | ||
683 | ENTRY(load_gs_index) | ||
684 | pushf | ||
685 | cli | ||
686 | swapgs | ||
687 | gs_change: | ||
688 | movl %edi,%gs | ||
689 | 2: mfence /* workaround */ | ||
690 | swapgs | ||
691 | popf | ||
692 | ret | ||
693 | |||
694 | .section __ex_table,"a" | ||
695 | .align 8 | ||
696 | .quad gs_change,bad_gs | ||
697 | .previous | ||
698 | .section .fixup,"ax" | ||
699 | /* running with kernelgs */ | ||
700 | bad_gs: | ||
701 | swapgs /* switch back to user gs */ | ||
702 | xorl %eax,%eax | ||
703 | movl %eax,%gs | ||
704 | jmp 2b | ||
705 | .previous | ||
706 | |||
707 | /* | ||
708 | * Create a kernel thread. | ||
709 | * | ||
710 | * C extern interface: | ||
711 | * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | ||
712 | * | ||
713 | * asm input arguments: | ||
714 | * rdi: fn, rsi: arg, rdx: flags | ||
715 | */ | ||
716 | ENTRY(kernel_thread) | ||
717 | CFI_STARTPROC | ||
718 | FAKE_STACK_FRAME $child_rip | ||
719 | SAVE_ALL | ||
720 | |||
721 | # rdi: flags, rsi: usp, rdx: will be &pt_regs | ||
722 | movq %rdx,%rdi | ||
723 | orq kernel_thread_flags(%rip),%rdi | ||
724 | movq $-1, %rsi | ||
725 | movq %rsp, %rdx | ||
726 | |||
727 | xorl %r8d,%r8d | ||
728 | xorl %r9d,%r9d | ||
729 | |||
730 | # clone now | ||
731 | call do_fork | ||
732 | movq %rax,RAX(%rsp) | ||
733 | xorl %edi,%edi | ||
734 | |||
735 | /* | ||
736 | * It isn't worth checking for a reschedule here, | ||
737 | * so internally to the x86_64 port you can rely on kernel_thread() | ||
738 | * not to reschedule the child before returning. This avoids the need | ||
739 | * for hacks, for example to fork off the per-CPU idle tasks. | ||
740 | * [Hopefully no generic code relies on the reschedule -AK] | ||
741 | */ | ||
742 | RESTORE_ALL | ||
743 | UNFAKE_STACK_FRAME | ||
744 | ret | ||
745 | CFI_ENDPROC | ||
746 | |||
747 | |||
748 | child_rip: | ||
749 | /* | ||
750 | * Here we are in the child and the registers are set as they were | ||
751 | * at kernel_thread() invocation in the parent. | ||
752 | */ | ||
753 | movq %rdi, %rax | ||
754 | movq %rsi, %rdi | ||
755 | call *%rax | ||
756 | # exit | ||
757 | xorq %rdi, %rdi | ||
758 | call do_exit | ||
759 | |||
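A rough sketch (not from this patch) of how in-kernel callers use the kernel_thread() interface documented above; the worker function and the clone flags chosen here are purely illustrative, assuming the usual <linux/sched.h> and <linux/kernel.h> declarations:

	#include <linux/sched.h>
	#include <linux/kernel.h>

	static int worker_fn(void *arg)
	{
		/* Runs in the child via child_rip; returning here ends in do_exit(). */
		return 0;
	}

	static void start_worker(void)
	{
		long pid = kernel_thread(worker_fn, NULL, CLONE_FS | CLONE_FILES);

		if (pid < 0)
			printk(KERN_ERR "kernel_thread failed: %ld\n", pid);
	}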
760 | /* | ||
761 | * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. | ||
762 | * | ||
763 | * C extern interface: | ||
764 | * extern long execve(char *name, char **argv, char **envp) | ||
765 | * | ||
766 | * asm input arguments: | ||
767 | * rdi: name, rsi: argv, rdx: envp | ||
768 | * | ||
769 | * We want to fall back into: | ||
770 | * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) | ||
771 | * | ||
772 | * do_sys_execve asm fallback arguments: | ||
773 | * rdi: name, rsi: argv, rdx: envp, fake frame on the stack | ||
774 | */ | ||
775 | ENTRY(execve) | ||
776 | CFI_STARTPROC | ||
777 | FAKE_STACK_FRAME $0 | ||
778 | SAVE_ALL | ||
779 | call sys_execve | ||
780 | movq %rax, RAX(%rsp) | ||
781 | RESTORE_REST | ||
782 | testq %rax,%rax | ||
783 | je int_ret_from_sys_call | ||
784 | RESTORE_ARGS | ||
785 | UNFAKE_STACK_FRAME | ||
786 | ret | ||
787 | CFI_ENDPROC | ||
788 | |||
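For reference, this wrapper is what lets early boot code start user space with a plain C call, along the lines of execve("/sbin/init", argv_init, envp_init) as done from init/main.c (path and argument names shown only for illustration).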
789 | ENTRY(page_fault) | ||
790 | errorentry do_page_fault | ||
791 | |||
792 | ENTRY(coprocessor_error) | ||
793 | zeroentry do_coprocessor_error | ||
794 | |||
795 | ENTRY(simd_coprocessor_error) | ||
796 | zeroentry do_simd_coprocessor_error | ||
797 | |||
798 | ENTRY(device_not_available) | ||
799 | zeroentry math_state_restore | ||
800 | |||
801 | /* runs on exception stack */ | ||
802 | ENTRY(debug) | ||
803 | CFI_STARTPROC | ||
804 | pushq $0 | ||
805 | CFI_ADJUST_CFA_OFFSET 8 | ||
806 | paranoidentry do_debug | ||
807 | /* switch back to process stack to restore the state ptrace touched */ | ||
808 | movq %rax,%rsp | ||
809 | testl $3,CS(%rsp) | ||
810 | jnz paranoid_userspace | ||
811 | jmp paranoid_exit | ||
812 | CFI_ENDPROC | ||
813 | |||
814 | /* runs on exception stack */ | ||
815 | ENTRY(nmi) | ||
816 | CFI_STARTPROC | ||
817 | pushq $-1 | ||
818 | CFI_ADJUST_CFA_OFFSET 8 | ||
819 | paranoidentry do_nmi | ||
820 | /* ebx: no swapgs flag */ | ||
821 | paranoid_exit: | ||
822 | testl %ebx,%ebx /* swapgs needed? */ | ||
823 | jnz paranoid_restore | ||
824 | paranoid_swapgs: | ||
825 | cli | ||
826 | swapgs | ||
827 | paranoid_restore: | ||
828 | RESTORE_ALL 8 | ||
829 | iretq | ||
830 | paranoid_userspace: | ||
831 | cli | ||
832 | GET_THREAD_INFO(%rcx) | ||
833 | movl threadinfo_flags(%rcx),%edx | ||
834 | testl $_TIF_NEED_RESCHED,%edx | ||
835 | jnz paranoid_resched | ||
836 | testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx | ||
837 | jnz paranoid_signal | ||
838 | jmp paranoid_swapgs | ||
839 | paranoid_resched: | ||
840 | sti | ||
841 | call schedule | ||
842 | jmp paranoid_exit | ||
843 | paranoid_signal: | ||
844 | sti | ||
845 | xorl %esi,%esi /* oldset */ | ||
846 | movq %rsp,%rdi /* &pt_regs */ | ||
847 | call do_notify_resume | ||
848 | jmp paranoid_exit | ||
849 | CFI_ENDPROC | ||
850 | |||
851 | ENTRY(int3) | ||
852 | zeroentry do_int3 | ||
853 | |||
854 | ENTRY(overflow) | ||
855 | zeroentry do_overflow | ||
856 | |||
857 | ENTRY(bounds) | ||
858 | zeroentry do_bounds | ||
859 | |||
860 | ENTRY(invalid_op) | ||
861 | zeroentry do_invalid_op | ||
862 | |||
863 | ENTRY(coprocessor_segment_overrun) | ||
864 | zeroentry do_coprocessor_segment_overrun | ||
865 | |||
866 | ENTRY(reserved) | ||
867 | zeroentry do_reserved | ||
868 | |||
869 | /* runs on exception stack */ | ||
870 | ENTRY(double_fault) | ||
871 | CFI_STARTPROC | ||
872 | paranoidentry do_double_fault | ||
873 | movq %rax,%rsp | ||
874 | testl $3,CS(%rsp) | ||
875 | jnz paranoid_userspace | ||
876 | jmp paranoid_exit | ||
877 | CFI_ENDPROC | ||
878 | |||
879 | ENTRY(invalid_TSS) | ||
880 | errorentry do_invalid_TSS | ||
881 | |||
882 | ENTRY(segment_not_present) | ||
883 | errorentry do_segment_not_present | ||
884 | |||
885 | /* runs on exception stack */ | ||
886 | ENTRY(stack_segment) | ||
887 | CFI_STARTPROC | ||
888 | paranoidentry do_stack_segment | ||
889 | movq %rax,%rsp | ||
890 | testl $3,CS(%rsp) | ||
891 | jnz paranoid_userspace | ||
892 | jmp paranoid_exit | ||
893 | CFI_ENDPROC | ||
894 | |||
895 | ENTRY(general_protection) | ||
896 | errorentry do_general_protection | ||
897 | |||
898 | ENTRY(alignment_check) | ||
899 | errorentry do_alignment_check | ||
900 | |||
901 | ENTRY(divide_error) | ||
902 | zeroentry do_divide_error | ||
903 | |||
904 | ENTRY(spurious_interrupt_bug) | ||
905 | zeroentry do_spurious_interrupt_bug | ||
906 | |||
907 | #ifdef CONFIG_X86_MCE | ||
908 | /* runs on exception stack */ | ||
909 | ENTRY(machine_check) | ||
910 | CFI_STARTPROC | ||
911 | pushq $0 | ||
912 | CFI_ADJUST_CFA_OFFSET 8 | ||
913 | paranoidentry do_machine_check | ||
914 | jmp paranoid_exit | ||
915 | CFI_ENDPROC | ||
916 | #endif | ||
917 | |||
918 | ENTRY(call_debug) | ||
919 | zeroentry do_call_debug | ||
920 | |||
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c new file mode 100644 index 000000000000..d2c42fb99cb3 --- /dev/null +++ b/arch/x86_64/kernel/genapic.c | |||
@@ -0,0 +1,89 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Generic APIC sub-arch probe layer. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/threads.h> | ||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/module.h> | ||
19 | |||
20 | #include <asm/smp.h> | ||
21 | #include <asm/ipi.h> | ||
22 | |||
23 | /* which logical CPU number maps to which CPU (physical APIC ID) */ | ||
24 | u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
25 | EXPORT_SYMBOL(x86_cpu_to_apicid); | ||
26 | u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
27 | |||
28 | extern struct genapic apic_cluster; | ||
29 | extern struct genapic apic_flat; | ||
30 | |||
31 | struct genapic *genapic = &apic_flat; | ||
32 | |||
33 | |||
34 | /* | ||
35 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | ||
36 | */ | ||
37 | void __init clustered_apic_check(void) | ||
38 | { | ||
39 | long i; | ||
40 | u8 clusters, max_cluster; | ||
41 | u8 id; | ||
42 | u8 cluster_cnt[NUM_APIC_CLUSTERS]; | ||
43 | |||
44 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | ||
45 | /* AMD always uses flat mode right now */ | ||
46 | genapic = &apic_flat; | ||
47 | goto print; | ||
48 | } | ||
49 | |||
50 | memset(cluster_cnt, 0, sizeof(cluster_cnt)); | ||
51 | |||
52 | for (i = 0; i < NR_CPUS; i++) { | ||
53 | id = bios_cpu_apicid[i]; | ||
54 | if (id != BAD_APICID) | ||
55 | cluster_cnt[APIC_CLUSTERID(id)]++; | ||
56 | } | ||
57 | |||
58 | clusters = 0; | ||
59 | max_cluster = 0; | ||
60 | for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | ||
61 | if (cluster_cnt[i] > 0) { | ||
62 | ++clusters; | ||
63 | if (cluster_cnt[i] > max_cluster) | ||
64 | max_cluster = cluster_cnt[i]; | ||
65 | } | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode; | ||
70 | * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical; | ||
71 | * else physical mode. | ||
72 | * (We don't use lowest priority delivery + HW APIC IRQ steering, so | ||
73 | * can ignore the clustered logical case and go straight to physical.) | ||
74 | */ | ||
75 | if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) | ||
76 | genapic = &apic_flat; | ||
77 | else | ||
78 | genapic = &apic_cluster; | ||
79 | |||
80 | print: | ||
81 | printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); | ||
82 | } | ||
83 | |||
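Two worked examples of the check above, assuming APIC_CLUSTERID() extracts the high nibble of the 8-bit APIC ID: a box whose BIOS numbers its CPUs 0x00-0x03 has one non-empty cluster (clusters == 1, max_cluster == 4 == cluster_cnt[0]), so the flat driver is kept; a box with APIC IDs spread over 0x00-0x17 populates clusters 0 and 1, so clusters == 2 and the clustered driver is chosen.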
84 | /* Same for both flat and clustered. */ | ||
85 | |||
86 | void send_IPI_self(int vector) | ||
87 | { | ||
88 | __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); | ||
89 | } | ||
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c new file mode 100644 index 000000000000..9703da7202e3 --- /dev/null +++ b/arch/x86_64/kernel/genapic_cluster.c | |||
@@ -0,0 +1,130 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. | ||
6 | * (A more realistic maximum is around 230 CPUs.) | ||
7 | * | ||
8 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
9 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
10 | * James Cleverdon. | ||
11 | */ | ||
12 | #include <linux/config.h> | ||
13 | #include <linux/threads.h> | ||
14 | #include <linux/cpumask.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/ctype.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <asm/smp.h> | ||
20 | #include <asm/ipi.h> | ||
21 | |||
22 | |||
23 | /* | ||
24 | * Set up the logical destination ID. | ||
25 | * | ||
26 | * Intel recommends setting DFR, LDR and TPR before enabling | ||
27 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
28 | * document number 292116). So here it goes... | ||
29 | */ | ||
30 | static void cluster_init_apic_ldr(void) | ||
31 | { | ||
32 | unsigned long val, id; | ||
33 | long i, count; | ||
34 | u8 lid; | ||
35 | u8 my_id = hard_smp_processor_id(); | ||
36 | u8 my_cluster = APIC_CLUSTER(my_id); | ||
37 | |||
38 | /* Create logical APIC IDs by counting CPUs already in cluster. */ | ||
39 | for (count = 0, i = NR_CPUS; --i >= 0; ) { | ||
40 | lid = x86_cpu_to_log_apicid[i]; | ||
41 | if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) | ||
42 | ++count; | ||
43 | } | ||
44 | /* | ||
45 | * We only have a 4 wide bitmap in cluster mode. There's no way | ||
46 | * to get above 60 CPUs and still give each one its own bit. | ||
47 | * But, we're using physical IRQ delivery, so we don't care. | ||
48 | * Use bit 3 for the 4th through Nth CPU in each cluster. | ||
49 | */ | ||
50 | if (count >= XAPIC_DEST_CPUS_SHIFT) | ||
51 | count = 3; | ||
52 | id = my_cluster | (1UL << count); | ||
53 | x86_cpu_to_log_apicid[smp_processor_id()] = id; | ||
54 | apic_write_around(APIC_DFR, APIC_DFR_CLUSTER); | ||
55 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; | ||
56 | val |= SET_APIC_LOGICAL_ID(id); | ||
57 | apic_write_around(APIC_LDR, val); | ||
58 | } | ||
59 | |||
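A worked example of the logical ID computation above, assuming APIC_CLUSTER() keeps the high nibble of the hardware ID: a CPU with hardware APIC ID 0x23 is in cluster 0x20; if two CPUs of that cluster already have logical IDs (count == 2), this one gets id = 0x20 | (1 << 2) = 0x24. From the fourth CPU of a cluster onwards the ID saturates at 0x20 | (1 << 3) = 0x28, which is harmless since delivery is physical anyway.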
60 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | ||
61 | |||
62 | static cpumask_t cluster_target_cpus(void) | ||
63 | { | ||
64 | return cpumask_of_cpu(0); | ||
65 | } | ||
66 | |||
67 | static void cluster_send_IPI_mask(cpumask_t mask, int vector) | ||
68 | { | ||
69 | send_IPI_mask_sequence(mask, vector); | ||
70 | } | ||
71 | |||
72 | static void cluster_send_IPI_allbutself(int vector) | ||
73 | { | ||
74 | cpumask_t mask = cpu_online_map; | ||
75 | cpu_clear(smp_processor_id(), mask); | ||
76 | |||
77 | if (!cpus_empty(mask)) | ||
78 | cluster_send_IPI_mask(mask, vector); | ||
79 | } | ||
80 | |||
81 | static void cluster_send_IPI_all(int vector) | ||
82 | { | ||
83 | cluster_send_IPI_mask(cpu_online_map, vector); | ||
84 | } | ||
85 | |||
86 | static int cluster_apic_id_registered(void) | ||
87 | { | ||
88 | return 1; | ||
89 | } | ||
90 | |||
91 | static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) | ||
92 | { | ||
93 | int cpu; | ||
94 | |||
95 | /* | ||
96 | * We're using fixed IRQ delivery, can only return one phys APIC ID. | ||
97 | * May as well be the first. | ||
98 | */ | ||
99 | cpu = first_cpu(cpumask); | ||
100 | if ((unsigned)cpu < NR_CPUS) | ||
101 | return x86_cpu_to_apicid[cpu]; | ||
102 | else | ||
103 | return BAD_APICID; | ||
104 | } | ||
105 | |||
106 | /* cpuid returns the value latched in the HW at reset, not the APIC ID | ||
107 | * register's value. For any box whose BIOS changes APIC IDs, like | ||
108 | * clustered APIC systems, we must use hard_smp_processor_id. | ||
109 | * | ||
110 | * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. | ||
111 | */ | ||
112 | static unsigned int phys_pkg_id(int index_msb) | ||
113 | { | ||
114 | return hard_smp_processor_id() >> index_msb; | ||
115 | } | ||
116 | |||
117 | struct genapic apic_cluster = { | ||
118 | .name = "clustered", | ||
119 | .int_delivery_mode = dest_Fixed, | ||
120 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | ||
121 | .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, | ||
122 | .target_cpus = cluster_target_cpus, | ||
123 | .apic_id_registered = cluster_apic_id_registered, | ||
124 | .init_apic_ldr = cluster_init_apic_ldr, | ||
125 | .send_IPI_all = cluster_send_IPI_all, | ||
126 | .send_IPI_allbutself = cluster_send_IPI_allbutself, | ||
127 | .send_IPI_mask = cluster_send_IPI_mask, | ||
128 | .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, | ||
129 | .phys_pkg_id = phys_pkg_id, | ||
130 | }; | ||
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c new file mode 100644 index 000000000000..b4cbbad04226 --- /dev/null +++ b/arch/x86_64/kernel/genapic_flat.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * Copyright 2004 James Cleverdon, IBM. | ||
3 | * Subject to the GNU Public License, v.2 | ||
4 | * | ||
5 | * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. | ||
6 | * | ||
7 | * Hacked for x86-64 by James Cleverdon from i386 architecture code by | ||
8 | * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and | ||
9 | * James Cleverdon. | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/threads.h> | ||
13 | #include <linux/cpumask.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <asm/smp.h> | ||
19 | #include <asm/ipi.h> | ||
20 | |||
21 | |||
22 | static cpumask_t flat_target_cpus(void) | ||
23 | { | ||
24 | return cpu_online_map; | ||
25 | } | ||
26 | |||
27 | /* | ||
28 | * Set up the logical destination ID. | ||
29 | * | ||
30 | * Intel recommends setting DFR, LDR and TPR before enabling | ||
31 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
32 | * document number 292116). So here it goes... | ||
33 | */ | ||
34 | static void flat_init_apic_ldr(void) | ||
35 | { | ||
36 | unsigned long val; | ||
37 | unsigned long num, id; | ||
38 | |||
39 | num = smp_processor_id(); | ||
40 | id = 1UL << num; | ||
41 | x86_cpu_to_log_apicid[num] = id; | ||
42 | apic_write_around(APIC_DFR, APIC_DFR_FLAT); | ||
43 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; | ||
44 | val |= SET_APIC_LOGICAL_ID(id); | ||
45 | apic_write_around(APIC_LDR, val); | ||
46 | } | ||
47 | |||
48 | static void flat_send_IPI_allbutself(int vector) | ||
49 | { | ||
50 | /* | ||
51 | * If there are no other CPUs in the system then | ||
52 | * we get an APIC send error if we try to broadcast, | ||
53 | * so we have to avoid sending IPIs in this case. | ||
54 | */ | ||
55 | if (num_online_cpus() > 1) | ||
56 | __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); | ||
57 | } | ||
58 | |||
59 | static void flat_send_IPI_all(int vector) | ||
60 | { | ||
61 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | ||
62 | } | ||
63 | |||
64 | static void flat_send_IPI_mask(cpumask_t cpumask, int vector) | ||
65 | { | ||
66 | unsigned long mask = cpus_addr(cpumask)[0]; | ||
67 | unsigned long cfg; | ||
68 | unsigned long flags; | ||
69 | |||
70 | local_save_flags(flags); | ||
71 | local_irq_disable(); | ||
72 | |||
73 | /* | ||
74 | * Wait for idle. | ||
75 | */ | ||
76 | apic_wait_icr_idle(); | ||
77 | |||
78 | /* | ||
79 | * prepare target chip field | ||
80 | */ | ||
81 | cfg = __prepare_ICR2(mask); | ||
82 | apic_write_around(APIC_ICR2, cfg); | ||
83 | |||
84 | /* | ||
85 | * program the ICR | ||
86 | */ | ||
87 | cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); | ||
88 | |||
89 | /* | ||
90 | * Send the IPI. The write to APIC_ICR fires this off. | ||
91 | */ | ||
92 | apic_write_around(APIC_ICR, cfg); | ||
93 | local_irq_restore(flags); | ||
94 | } | ||
95 | |||
96 | static int flat_apic_id_registered(void) | ||
97 | { | ||
98 | return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); | ||
99 | } | ||
100 | |||
101 | static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) | ||
102 | { | ||
103 | return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; | ||
104 | } | ||
105 | |||
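As a quick example of the mapping above: flat_init_apic_ldr() gives CPU n the logical ID (1 << n), so a cpumask containing CPUs 1 and 3 becomes the destination value 0x0a and a single IPI reaches both CPUs.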
106 | static unsigned int phys_pkg_id(int index_msb) | ||
107 | { | ||
108 | u32 ebx; | ||
109 | |||
110 | ebx = cpuid_ebx(1); | ||
111 | return ((ebx >> 24) & 0xFF) >> index_msb; | ||
112 | } | ||
113 | |||
114 | struct genapic apic_flat = { | ||
115 | .name = "flat", | ||
116 | .int_delivery_mode = dest_LowestPrio, | ||
117 | .int_dest_mode = (APIC_DEST_LOGICAL != 0), | ||
118 | .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, | ||
119 | .target_cpus = flat_target_cpus, | ||
120 | .apic_id_registered = flat_apic_id_registered, | ||
121 | .init_apic_ldr = flat_init_apic_ldr, | ||
122 | .send_IPI_all = flat_send_IPI_all, | ||
123 | .send_IPI_allbutself = flat_send_IPI_allbutself, | ||
124 | .send_IPI_mask = flat_send_IPI_mask, | ||
125 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, | ||
126 | .phys_pkg_id = phys_pkg_id, | ||
127 | }; | ||
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S new file mode 100644 index 000000000000..b6d8725c1f61 --- /dev/null +++ b/arch/x86_64/kernel/head.S | |||
@@ -0,0 +1,396 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> | ||
7 | * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> | ||
8 | * | ||
9 | * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ | ||
10 | */ | ||
11 | |||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | #include <linux/threads.h> | ||
15 | #include <asm/desc.h> | ||
16 | #include <asm/segment.h> | ||
17 | #include <asm/page.h> | ||
18 | #include <asm/msr.h> | ||
19 | #include <asm/cache.h> | ||
20 | |||
21 | /* We are not able to switch in one step to the final KERNEL ADDRESS SPACE | ||
22 | * because we need identity-mapped pages on setup, so define __START_KERNEL to | ||
23 | * 0x100000 for this stage. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | .text | ||
28 | .code32 | ||
29 | .globl startup_32 | ||
30 | /* %bx: 1 if coming from smp trampoline on secondary cpu */ | ||
31 | startup_32: | ||
32 | |||
33 | /* | ||
34 | * At this point the CPU runs in 32bit protected mode (CS.D = 1) with | ||
35 | * paging disabled and the point of this file is to switch to 64bit | ||
36 | * long mode with a kernel mapping for kernel land, and to jump into the | ||
37 | * kernel virtual addresses. | ||
38 | * There is no stack until we set one up. | ||
39 | */ | ||
40 | |||
41 | /* Initialize the %ds segment register */ | ||
42 | movl $__KERNEL_DS,%eax | ||
43 | movl %eax,%ds | ||
44 | |||
45 | /* Load new GDT with the 64bit segments using 32bit descriptor */ | ||
46 | lgdt pGDT32 - __START_KERNEL_map | ||
47 | |||
48 | /* If the CPU doesn't support CPUID this will double fault. | ||
49 | * Unfortunately it is hard to check for CPUID without a stack. | ||
50 | */ | ||
51 | |||
52 | /* Check if extended functions are implemented */ | ||
53 | movl $0x80000000, %eax | ||
54 | cpuid | ||
55 | cmpl $0x80000000, %eax | ||
56 | jbe no_long_mode | ||
57 | /* Check if long mode is implemented */ | ||
58 | mov $0x80000001, %eax | ||
59 | cpuid | ||
60 | btl $29, %edx | ||
61 | jnc no_long_mode | ||
62 | |||
63 | /* | ||
64 | * Prepare for entering 64bit mode | ||
65 | */ | ||
66 | |||
67 | /* Enable PAE mode */ | ||
68 | xorl %eax, %eax | ||
69 | btsl $5, %eax | ||
70 | movl %eax, %cr4 | ||
71 | |||
72 | /* Setup early boot stage 4 level pagetables */ | ||
73 | movl $(init_level4_pgt - __START_KERNEL_map), %eax | ||
74 | movl %eax, %cr3 | ||
75 | |||
76 | /* Setup EFER (Extended Feature Enable Register) */ | ||
77 | movl $MSR_EFER, %ecx | ||
78 | rdmsr | ||
79 | |||
80 | /* Enable Long Mode */ | ||
81 | btsl $_EFER_LME, %eax | ||
82 | |||
83 | /* Make changes effective */ | ||
84 | wrmsr | ||
85 | |||
86 | xorl %eax, %eax | ||
87 | btsl $31, %eax /* Enable paging and in turn activate Long Mode */ | ||
88 | btsl $0, %eax /* Enable protected mode */ | ||
89 | /* Make changes effective */ | ||
90 | movl %eax, %cr0 | ||
91 | /* | ||
92 | * At this point we're in long mode but in 32bit compatibility mode | ||
93 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn | ||
94 | * EFER.LMA = 1). Now we want to jump into 64bit mode; to do that we use | ||
95 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
96 | */ | ||
97 | ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) | ||
98 | |||
99 | .code64 | ||
100 | .org 0x100 | ||
101 | .globl startup_64 | ||
102 | startup_64: | ||
103 | /* We come here either from startup_32 | ||
104 | * or directly from a 64bit bootloader. | ||
105 | * Since we may have come directly from a bootloader we | ||
106 | * reload the page tables here. | ||
107 | */ | ||
108 | |||
109 | /* Enable PAE mode and PGE */ | ||
110 | xorq %rax, %rax | ||
111 | btsq $5, %rax | ||
112 | btsq $7, %rax | ||
113 | movq %rax, %cr4 | ||
114 | |||
115 | /* Setup early boot stage 4 level pagetables. */ | ||
116 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | ||
117 | movq %rax, %cr3 | ||
118 | |||
119 | /* Check if nx is implemented */ | ||
120 | movl $0x80000001, %eax | ||
121 | cpuid | ||
122 | movl %edx,%edi | ||
123 | |||
124 | /* Setup EFER (Extended Feature Enable Register) */ | ||
125 | movl $MSR_EFER, %ecx | ||
126 | rdmsr | ||
127 | |||
128 | /* Enable System Call */ | ||
129 | btsl $_EFER_SCE, %eax | ||
130 | |||
131 | /* No Execute supported? */ | ||
132 | btl $20,%edi | ||
133 | jnc 1f | ||
134 | btsl $_EFER_NX, %eax | ||
135 | 1: | ||
136 | /* Make changes effective */ | ||
137 | wrmsr | ||
138 | |||
139 | /* Setup cr0 */ | ||
140 | xorq %rax, %rax | ||
141 | btsq $31, %rax /* Enable paging */ | ||
142 | btsq $0, %rax /* Enable protected mode */ | ||
143 | btsq $1, %rax /* Enable MP */ | ||
144 | btsq $4, %rax /* Enable ET */ | ||
145 | btsq $5, %rax /* Enable NE */ | ||
146 | btsq $16, %rax /* Enable WP */ | ||
147 | btsq $18, %rax /* Enable AM */ | ||
148 | /* Make changes effective */ | ||
149 | movq %rax, %cr0 | ||
150 | |||
151 | /* Setup a boot time stack */ | ||
152 | movq init_rsp(%rip),%rsp | ||
153 | |||
154 | /* zero EFLAGS after setting rsp */ | ||
155 | pushq $0 | ||
156 | popfq | ||
157 | |||
158 | /* | ||
159 | * We must switch to a new descriptor in kernel space for the GDT | ||
160 | * because soon the kernel won't have access anymore to the userspace | ||
161 | * addresses we're currently running at. We have to do that here | ||
162 | * because in 32bit we couldn't load a 64bit linear address. | ||
163 | */ | ||
164 | lgdt cpu_gdt_descr | ||
165 | |||
166 | /* | ||
167 | * Set up a dummy PDA. This is just for some early bootup code | ||
168 | * that does in_interrupt() | ||
169 | */ | ||
170 | movl $MSR_GS_BASE,%ecx | ||
171 | movq $empty_zero_page,%rax | ||
172 | movq %rax,%rdx | ||
173 | shrq $32,%rdx | ||
174 | wrmsr | ||
175 | |||
176 | /* Set up data segments. Actually 0 would do too. */ | ||
177 | movl $__KERNEL_DS,%eax | ||
178 | movl %eax,%ds | ||
179 | movl %eax,%ss | ||
180 | movl %eax,%es | ||
181 | |||
182 | /* esi is a pointer to the real mode structure with interesting info. | ||
183 | Pass it to C. */ | ||
184 | movl %esi, %edi | ||
185 | |||
186 | /* Finally jump to run C code on the real kernel address. | ||
187 | * Since we are running in identity-mapped space we have to jump | ||
188 | * to the full 64bit address; this is only possible with an indirect | ||
189 | * jump. | ||
190 | */ | ||
191 | movq initial_code(%rip),%rax | ||
192 | jmp *%rax | ||
193 | |||
194 | /* SMP bootup changes these two */ | ||
195 | .globl initial_code | ||
196 | initial_code: | ||
197 | .quad x86_64_start_kernel | ||
198 | .globl init_rsp | ||
199 | init_rsp: | ||
200 | .quad init_thread_union+THREAD_SIZE-8 | ||
201 | |||
202 | ENTRY(early_idt_handler) | ||
203 | xorl %eax,%eax | ||
204 | movq 8(%rsp),%rsi # get rip | ||
205 | movq (%rsp),%rdx | ||
206 | movq %cr2,%rcx | ||
207 | leaq early_idt_msg(%rip),%rdi | ||
208 | call early_printk | ||
209 | 1: hlt | ||
210 | jmp 1b | ||
211 | |||
212 | early_idt_msg: | ||
213 | .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" | ||
214 | |||
215 | .code32 | ||
216 | ENTRY(no_long_mode) | ||
217 | /* This isn't an x86-64 CPU so hang */ | ||
218 | 1: | ||
219 | jmp 1b | ||
220 | |||
221 | .org 0xf00 | ||
222 | .globl pGDT32 | ||
223 | pGDT32: | ||
224 | .word gdt_end-cpu_gdt_table | ||
225 | .long cpu_gdt_table-__START_KERNEL_map | ||
226 | |||
227 | .org 0xf10 | ||
228 | ljumpvector: | ||
229 | .long startup_64-__START_KERNEL_map | ||
230 | .word __KERNEL_CS | ||
231 | |||
232 | ENTRY(stext) | ||
233 | ENTRY(_stext) | ||
234 | |||
235 | /* | ||
236 | * This default setting generates an ident mapping at address 0x100000 | ||
237 | * and a mapping for the kernel that precisely maps virtual address | ||
238 | * 0xffffffff80000000 to physical address 0x000000. (always using | ||
239 | * 2Mbyte large pages provided by PAE mode) | ||
240 | */ | ||
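A small C sketch (illustration only, using the standard x86-64 page-table flag bits) of how the raw .quad values in the tables below decompose:

	#define PG_PRESENT	(1UL << 0)
	#define PG_RW		(1UL << 1)
	#define PG_USER		(1UL << 2)
	#define PG_PSE		(1UL << 7)	/* 2MB page */
	#define PG_GLOBAL	(1UL << 8)

	/* 0x183 == PG_PRESENT | PG_RW | PG_PSE | PG_GLOBAL: a global, writable
	 * 2MB mapping.  The 0x...007 entries are PG_PRESENT | PG_RW | PG_USER
	 * plus the physical address of the next-level table, e.g. 0x102007
	 * points at the table assembled at .org 0x2000 (physical 0x102000
	 * once the kernel is loaded at 1MB). */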
241 | .org 0x1000 | ||
242 | ENTRY(init_level4_pgt) | ||
243 | .quad 0x0000000000102007 /* -> level3_ident_pgt */ | ||
244 | .fill 255,8,0 | ||
245 | .quad 0x000000000010a007 | ||
246 | .fill 254,8,0 | ||
247 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | ||
248 | .quad 0x0000000000103007 /* -> level3_kernel_pgt */ | ||
249 | |||
250 | .org 0x2000 | ||
251 | ENTRY(level3_ident_pgt) | ||
252 | .quad 0x0000000000104007 | ||
253 | .fill 511,8,0 | ||
254 | |||
255 | .org 0x3000 | ||
256 | ENTRY(level3_kernel_pgt) | ||
257 | .fill 510,8,0 | ||
258 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ | ||
259 | .quad 0x0000000000105007 /* -> level2_kernel_pgt */ | ||
260 | .fill 1,8,0 | ||
261 | |||
262 | .org 0x4000 | ||
263 | ENTRY(level2_ident_pgt) | ||
264 | /* 40MB for bootup. */ | ||
265 | .quad 0x0000000000000283 | ||
266 | .quad 0x0000000000200183 | ||
267 | .quad 0x0000000000400183 | ||
268 | .quad 0x0000000000600183 | ||
269 | .quad 0x0000000000800183 | ||
270 | .quad 0x0000000000A00183 | ||
271 | .quad 0x0000000000C00183 | ||
272 | .quad 0x0000000000E00183 | ||
273 | .quad 0x0000000001000183 | ||
274 | .quad 0x0000000001200183 | ||
275 | .quad 0x0000000001400183 | ||
276 | .quad 0x0000000001600183 | ||
277 | .quad 0x0000000001800183 | ||
278 | .quad 0x0000000001A00183 | ||
279 | .quad 0x0000000001C00183 | ||
280 | .quad 0x0000000001E00183 | ||
281 | .quad 0x0000000002000183 | ||
282 | .quad 0x0000000002200183 | ||
283 | .quad 0x0000000002400183 | ||
284 | .quad 0x0000000002600183 | ||
285 | /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ | ||
286 | .globl temp_boot_pmds | ||
287 | temp_boot_pmds: | ||
288 | .fill 492,8,0 | ||
289 | |||
290 | .org 0x5000 | ||
291 | ENTRY(level2_kernel_pgt) | ||
292 | /* 40MB kernel mapping. The kernel code cannot be bigger than that. | ||
293 | When you change this, change KERNEL_TEXT_SIZE in page.h too. | ||
294 | /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ | ||
295 | .quad 0x0000000000000183 | ||
296 | .quad 0x0000000000200183 | ||
297 | .quad 0x0000000000400183 | ||
298 | .quad 0x0000000000600183 | ||
299 | .quad 0x0000000000800183 | ||
300 | .quad 0x0000000000A00183 | ||
301 | .quad 0x0000000000C00183 | ||
302 | .quad 0x0000000000E00183 | ||
303 | .quad 0x0000000001000183 | ||
304 | .quad 0x0000000001200183 | ||
305 | .quad 0x0000000001400183 | ||
306 | .quad 0x0000000001600183 | ||
307 | .quad 0x0000000001800183 | ||
308 | .quad 0x0000000001A00183 | ||
309 | .quad 0x0000000001C00183 | ||
310 | .quad 0x0000000001E00183 | ||
311 | .quad 0x0000000002000183 | ||
312 | .quad 0x0000000002200183 | ||
313 | .quad 0x0000000002400183 | ||
314 | .quad 0x0000000002600183 | ||
315 | /* Module mapping starts here */ | ||
316 | .fill 492,8,0 | ||
317 | |||
318 | .org 0x6000 | ||
319 | ENTRY(empty_zero_page) | ||
320 | |||
321 | .org 0x7000 | ||
322 | ENTRY(empty_bad_page) | ||
323 | |||
324 | .org 0x8000 | ||
325 | ENTRY(empty_bad_pte_table) | ||
326 | |||
327 | .org 0x9000 | ||
328 | ENTRY(empty_bad_pmd_table) | ||
329 | |||
330 | .org 0xa000 | ||
331 | ENTRY(level3_physmem_pgt) | ||
332 | .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ | ||
333 | |||
334 | .org 0xb000 | ||
335 | #ifdef CONFIG_ACPI_SLEEP | ||
336 | ENTRY(wakeup_level4_pgt) | ||
337 | .quad 0x0000000000102007 /* -> level3_ident_pgt */ | ||
338 | .fill 255,8,0 | ||
339 | .quad 0x000000000010a007 | ||
340 | .fill 254,8,0 | ||
341 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ | ||
342 | .quad 0x0000000000103007 /* -> level3_kernel_pgt */ | ||
343 | #endif | ||
344 | |||
345 | .data | ||
346 | |||
347 | .align 16 | ||
348 | .globl cpu_gdt_descr | ||
349 | cpu_gdt_descr: | ||
350 | .word gdt_end-cpu_gdt_table | ||
351 | gdt: | ||
352 | .quad cpu_gdt_table | ||
353 | #ifdef CONFIG_SMP | ||
354 | .rept NR_CPUS-1 | ||
355 | .word 0 | ||
356 | .quad 0 | ||
357 | .endr | ||
358 | #endif | ||
359 | |||
360 | /* We need valid kernel segments for data and code in long mode too; | ||
361 | * IRET will check the segment types. kkeil 2000/10/28 | ||
362 | * Also, sysret mandates a special GDT layout. | ||
363 | */ | ||
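For reference, the sysret constraint comes from how SYSRET reloads segments: SS is taken from STAR.SYSRET_CS + 8 and, when returning to 64-bit mode, CS from STAR.SYSRET_CS + 16, so __USER32_CS, __USER_DS and __USER_CS below must stay adjacent and in exactly this order.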
364 | |||
365 | .align L1_CACHE_BYTES | ||
366 | |||
367 | /* The TLS descriptors are currently at a different place compared to i386. | ||
368 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
369 | |||
370 | ENTRY(cpu_gdt_table) | ||
371 | .quad 0x0000000000000000 /* NULL descriptor */ | ||
372 | .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */ | ||
373 | .quad 0x00af9a000000ffff /* __KERNEL_CS */ | ||
374 | .quad 0x00cf92000000ffff /* __KERNEL_DS */ | ||
375 | .quad 0x00cffa000000ffff /* __USER32_CS */ | ||
376 | .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ | ||
377 | .quad 0x00affa000000ffff /* __USER_CS */ | ||
378 | .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ | ||
379 | .quad 0,0 /* TSS */ | ||
380 | .quad 0,0 /* LDT */ | ||
381 | .quad 0,0,0 /* three TLS descriptors */ | ||
382 | .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */ | ||
383 | /* base must be patched for real base address. */ | ||
384 | gdt_end: | ||
385 | /* asm/segment.h:GDT_ENTRIES must match this */ | ||
386 | /* This should be a multiple of the cache line size */ | ||
387 | /* GDTs of other CPUs: */ | ||
388 | .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) | ||
389 | |||
390 | .align L1_CACHE_BYTES | ||
391 | ENTRY(idt_table) | ||
392 | .rept 256 | ||
393 | .quad 0 | ||
394 | .quad 0 | ||
395 | .endr | ||
396 | |||
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c new file mode 100644 index 000000000000..6cad46c98a23 --- /dev/null +++ b/arch/x86_64/kernel/head64.c | |||
@@ -0,0 +1,117 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/head64.c -- prepare to run common code | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * | ||
6 | * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ | ||
7 | */ | ||
8 | |||
9 | #include <linux/init.h> | ||
10 | #include <linux/linkage.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/percpu.h> | ||
15 | |||
16 | #include <asm/processor.h> | ||
17 | #include <asm/proto.h> | ||
18 | #include <asm/smp.h> | ||
19 | #include <asm/bootsetup.h> | ||
20 | #include <asm/setup.h> | ||
21 | #include <asm/desc.h> | ||
22 | |||
23 | /* Don't add a printk in there. printk relies on the PDA which is not initialized | ||
24 | yet. */ | ||
25 | static void __init clear_bss(void) | ||
26 | { | ||
27 | extern char __bss_start[], __bss_end[]; | ||
28 | memset(__bss_start, 0, | ||
29 | (unsigned long) __bss_end - (unsigned long) __bss_start); | ||
30 | } | ||
31 | |||
32 | extern char x86_boot_params[2048]; | ||
33 | |||
34 | #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ | ||
35 | #define OLD_CL_MAGIC_ADDR 0x90020 | ||
36 | #define OLD_CL_MAGIC 0xA33F | ||
37 | #define OLD_CL_BASE_ADDR 0x90000 | ||
38 | #define OLD_CL_OFFSET 0x90022 | ||
39 | |||
40 | extern char saved_command_line[]; | ||
41 | |||
42 | static void __init copy_bootdata(char *real_mode_data) | ||
43 | { | ||
44 | int new_data; | ||
45 | char * command_line; | ||
46 | |||
47 | memcpy(x86_boot_params, real_mode_data, 2048); | ||
48 | new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); | ||
49 | if (!new_data) { | ||
50 | if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { | ||
51 | printk("so old bootloader that it does not support commandline?!\n"); | ||
52 | return; | ||
53 | } | ||
54 | new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; | ||
55 | printk("old bootloader convention, maybe loadlin?\n"); | ||
56 | } | ||
57 | command_line = (char *) ((u64)(new_data)); | ||
58 | memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); | ||
59 | printk("Bootdata ok (command line is %s)\n", saved_command_line); | ||
60 | } | ||
61 | |||
62 | static void __init setup_boot_cpu_data(void) | ||
63 | { | ||
64 | unsigned int dummy, eax; | ||
65 | |||
66 | /* get vendor info */ | ||
67 | cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, | ||
68 | (unsigned int *)&boot_cpu_data.x86_vendor_id[0], | ||
69 | (unsigned int *)&boot_cpu_data.x86_vendor_id[8], | ||
70 | (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); | ||
71 | |||
72 | /* get cpu type */ | ||
73 | cpuid(1, &eax, &dummy, &dummy, | ||
74 | (unsigned int *) &boot_cpu_data.x86_capability); | ||
75 | boot_cpu_data.x86 = (eax >> 8) & 0xf; | ||
76 | boot_cpu_data.x86_model = (eax >> 4) & 0xf; | ||
77 | boot_cpu_data.x86_mask = eax & 0xf; | ||
78 | } | ||
79 | |||
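A worked example of the decode above: a CPU returning eax = 0x0f48 from CPUID leaf 1 is recorded as family 0xf, model 4, stepping 8 (the extended family/model fields in the upper bits are not examined by this early path).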
80 | extern char _end[]; | ||
81 | |||
82 | void __init x86_64_start_kernel(char * real_mode_data) | ||
83 | { | ||
84 | char *s; | ||
85 | int i; | ||
86 | |||
87 | for (i = 0; i < 256; i++) | ||
88 | set_intr_gate(i, early_idt_handler); | ||
89 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
90 | clear_bss(); | ||
91 | pda_init(0); | ||
92 | copy_bootdata(real_mode_data); | ||
93 | #ifdef CONFIG_SMP | ||
94 | cpu_set(0, cpu_online_map); | ||
95 | #endif | ||
96 | /* default console: */ | ||
97 | if (!strstr(saved_command_line, "console=")) | ||
98 | strcat(saved_command_line, " console=tty0"); | ||
99 | s = strstr(saved_command_line, "earlyprintk="); | ||
100 | if (s != NULL) | ||
101 | setup_early_printk(s); | ||
102 | #ifdef CONFIG_DISCONTIGMEM | ||
103 | s = strstr(saved_command_line, "numa="); | ||
104 | if (s != NULL) | ||
105 | numa_setup(s+5); | ||
106 | #endif | ||
107 | #ifdef CONFIG_X86_IO_APIC | ||
108 | if (strstr(saved_command_line, "disableapic")) | ||
109 | disable_apic = 1; | ||
110 | #endif | ||
111 | /* You need early console to see that */ | ||
112 | if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) | ||
113 | panic("Kernel too big for kernel mapping\n"); | ||
114 | |||
115 | setup_boot_cpu_data(); | ||
116 | start_kernel(); | ||
117 | } | ||
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c new file mode 100644 index 000000000000..ba139cac57ce --- /dev/null +++ b/arch/x86_64/kernel/i387.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/i387.c | ||
3 | * | ||
4 | * Copyright (C) 1994 Linus Torvalds | ||
5 | * Copyright (C) 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * General FPU state handling cleanups | ||
9 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
10 | * | ||
11 | * x86-64 rework 2002 Andi Kleen. | ||
12 | * Does direct fxsave in and out of user space now for signal handlers. | ||
13 | * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, | ||
14 | * the 64bit user space sees a FXSAVE frame directly. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <asm/processor.h> | ||
21 | #include <asm/i387.h> | ||
22 | #include <asm/sigcontext.h> | ||
23 | #include <asm/user.h> | ||
24 | #include <asm/ptrace.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | |||
27 | unsigned int mxcsr_feature_mask = 0xffffffff; | ||
28 | |||
29 | void mxcsr_feature_mask_init(void) | ||
30 | { | ||
31 | unsigned int mask; | ||
32 | clts(); | ||
33 | memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
34 | asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); | ||
35 | mask = current->thread.i387.fxsave.mxcsr_mask; | ||
36 | if (mask == 0) mask = 0x0000ffbf; | ||
37 | mxcsr_feature_mask &= mask; | ||
38 | stts(); | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Called at bootup to set up the initial FPU state that is later cloned | ||
43 | * into all processes. | ||
44 | */ | ||
45 | void __init fpu_init(void) | ||
46 | { | ||
47 | unsigned long oldcr0 = read_cr0(); | ||
48 | extern void __bad_fxsave_alignment(void); | ||
49 | |||
50 | if (offsetof(struct task_struct, thread.i387.fxsave) & 15) | ||
51 | __bad_fxsave_alignment(); | ||
52 | set_in_cr4(X86_CR4_OSFXSR); | ||
53 | set_in_cr4(X86_CR4_OSXMMEXCPT); | ||
54 | |||
55 | write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ | ||
56 | |||
57 | mxcsr_feature_mask_init(); | ||
58 | /* clean state in init */ | ||
59 | current_thread_info()->status = 0; | ||
60 | clear_used_math(); | ||
61 | } | ||
62 | |||
63 | void init_fpu(struct task_struct *child) | ||
64 | { | ||
65 | if (tsk_used_math(child)) { | ||
66 | if (child == current) | ||
67 | unlazy_fpu(child); | ||
68 | return; | ||
69 | } | ||
70 | memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); | ||
71 | child->thread.i387.fxsave.cwd = 0x37f; | ||
72 | child->thread.i387.fxsave.mxcsr = 0x1f80; | ||
73 | /* only the device not available exception or ptrace can call init_fpu */ | ||
74 | set_stopped_child_used_math(child); | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * Signal frame handlers. | ||
79 | */ | ||
80 | |||
81 | int save_i387(struct _fpstate __user *buf) | ||
82 | { | ||
83 | struct task_struct *tsk = current; | ||
84 | int err = 0; | ||
85 | |||
86 | { | ||
87 | extern void bad_user_i387_struct(void); | ||
88 | if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave)) | ||
89 | bad_user_i387_struct(); | ||
90 | } | ||
91 | |||
92 | if ((unsigned long)buf % 16) | ||
93 | printk("save_i387: bad fpstate %p\n",buf); | ||
94 | |||
95 | if (!used_math()) | ||
96 | return 0; | ||
97 | clear_used_math(); /* trigger finit */ | ||
98 | if (tsk->thread_info->status & TS_USEDFPU) { | ||
99 | err = save_i387_checking((struct i387_fxsave_struct __user *)buf); | ||
100 | if (err) return err; | ||
101 | stts(); | ||
102 | } else { | ||
103 | if (__copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
104 | sizeof(struct i387_fxsave_struct))) | ||
105 | return -1; | ||
106 | } | ||
107 | return 1; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * ptrace request handlers. | ||
112 | */ | ||
113 | |||
114 | int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) | ||
115 | { | ||
116 | init_fpu(tsk); | ||
117 | return __copy_to_user(buf, &tsk->thread.i387.fxsave, | ||
118 | sizeof(struct user_i387_struct)) ? -EFAULT : 0; | ||
119 | } | ||
120 | |||
121 | int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) | ||
122 | { | ||
123 | if (__copy_from_user(&tsk->thread.i387.fxsave, buf, | ||
124 | sizeof(struct user_i387_struct))) | ||
125 | return -EFAULT; | ||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * FPU state for core dumps. | ||
131 | */ | ||
132 | |||
133 | int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) | ||
134 | { | ||
135 | struct task_struct *tsk = current; | ||
136 | |||
137 | if (!used_math()) | ||
138 | return 0; | ||
139 | |||
140 | unlazy_fpu(tsk); | ||
141 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
142 | return 1; | ||
143 | } | ||
144 | |||
145 | int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) | ||
146 | { | ||
147 | int fpvalid = !!tsk_used_math(tsk); | ||
148 | |||
149 | if (fpvalid) { | ||
150 | if (tsk == current) | ||
151 | unlazy_fpu(tsk); | ||
152 | memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); | ||
153 | } | ||
154 | return fpvalid; | ||
155 | } | ||
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c new file mode 100644 index 000000000000..d52701246010 --- /dev/null +++ b/arch/x86_64/kernel/i8259.c | |||
@@ -0,0 +1,579 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/config.h> | ||
3 | #include <linux/errno.h> | ||
4 | #include <linux/signal.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/ioport.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/timex.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/random.h> | ||
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/kernel_stat.h> | ||
14 | #include <linux/sysdev.h> | ||
15 | #include <linux/bitops.h> | ||
16 | |||
17 | #include <asm/acpi.h> | ||
18 | #include <asm/atomic.h> | ||
19 | #include <asm/system.h> | ||
20 | #include <asm/io.h> | ||
21 | #include <asm/irq.h> | ||
22 | #include <asm/hw_irq.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/delay.h> | ||
25 | #include <asm/desc.h> | ||
26 | #include <asm/apic.h> | ||
27 | |||
28 | #include <linux/irq.h> | ||
29 | |||
30 | /* | ||
31 | * Common place to define all x86 IRQ vectors | ||
32 | * | ||
33 | * This builds up the IRQ handler stubs using some ugly macros in irq.h | ||
34 | * | ||
35 | * These macros create the low-level assembly IRQ routines that save | ||
36 | * register context and call do_IRQ(). do_IRQ() then does all the | ||
37 | * operations that are needed to keep the AT (or SMP IOAPIC) | ||
38 | * interrupt-controller happy. | ||
39 | */ | ||
40 | |||
41 | #define BI(x,y) \ | ||
42 | BUILD_IRQ(x##y) | ||
43 | |||
44 | #define BUILD_16_IRQS(x) \ | ||
45 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
46 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
47 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
48 | BI(x,c) BI(x,d) BI(x,e) BI(x,f) | ||
49 | |||
50 | #define BUILD_14_IRQS(x) \ | ||
51 | BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ | ||
52 | BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ | ||
53 | BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ | ||
54 | BI(x,c) BI(x,d) | ||
55 | |||
56 | /* | ||
57 | * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: | ||
58 | * (these are usually mapped to vectors 0x20-0x2f) | ||
59 | */ | ||
60 | BUILD_16_IRQS(0x0) | ||
61 | |||
62 | #ifdef CONFIG_X86_LOCAL_APIC | ||
63 | /* | ||
64 | * The IO-APIC gives us many more interrupt sources. Most of these | ||
65 | * are unused but an SMP system is supposed to have enough memory ... | ||
66 | * sometimes (mostly with respect to hardware bugs) we get corrupted | ||
67 | * vectors all across the spectrum, so we really want to be prepared | ||
68 | * to handle all of them. Plus, more powerful systems might have more | ||
69 | * than 64 IO-APIC registers. | ||
70 | * | ||
71 | * (these are usually mapped into the 0x30-0xff vector range) | ||
72 | */ | ||
73 | BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) | ||
74 | BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) | ||
75 | BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) | ||
76 | BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) | ||
77 | |||
78 | #ifdef CONFIG_PCI_MSI | ||
79 | BUILD_14_IRQS(0xe) | ||
80 | #endif | ||
81 | |||
82 | #endif | ||
83 | |||
84 | #undef BUILD_16_IRQS | ||
85 | #undef BUILD_14_IRQS | ||
86 | #undef BI | ||
87 | |||
88 | |||
89 | #define IRQ(x,y) \ | ||
90 | IRQ##x##y##_interrupt | ||
91 | |||
92 | #define IRQLIST_16(x) \ | ||
93 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
94 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
95 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
96 | IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) | ||
97 | |||
98 | #define IRQLIST_14(x) \ | ||
99 | IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ | ||
100 | IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ | ||
101 | IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ | ||
102 | IRQ(x,c), IRQ(x,d) | ||
103 | |||
104 | void (*interrupt[NR_IRQS])(void) = { | ||
105 | IRQLIST_16(0x0), | ||
106 | |||
107 | #ifdef CONFIG_X86_IO_APIC | ||
108 | IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), | ||
109 | IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), | ||
110 | IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), | ||
111 | IRQLIST_16(0xc), IRQLIST_16(0xd) | ||
112 | |||
113 | #ifdef CONFIG_PCI_MSI | ||
114 | , IRQLIST_14(0xe) | ||
115 | #endif | ||
116 | |||
117 | #endif | ||
118 | }; | ||
119 | |||
120 | #undef IRQ | ||
121 | #undef IRQLIST_16 | ||
122 | #undef IRQLIST_14 | ||
123 | |||
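[Editor's note] As a reading aid, the token pasting above expands as follows (this is just the expansion spelled out, not new code):

        /* BI(0x2,a)           ->  BUILD_IRQ(0x2a)
         * BUILD_16_IRQS(0x2)  ->  BUILD_IRQ(0x20) BUILD_IRQ(0x21) ... BUILD_IRQ(0x2f)
         * IRQ(0x2,a)          ->  IRQ0x2a_interrupt
         * IRQLIST_16(0x2)     ->  IRQ0x20_interrupt, IRQ0x21_interrupt, ..., IRQ0x2f_interrupt
         */

so interrupt[NR_IRQS] ends up holding one IRQ0xNN_interrupt stub per vector, and each stub (emitted by BUILD_IRQ() in the irq headers, as the comment above says) saves register context and ends up in do_IRQ().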
124 | /* | ||
125 | * This is the 'legacy' 8259A Programmable Interrupt Controller, | ||
126 | * present in the majority of PC/AT boxes, | ||
127 | * plus some generic x86-specific things, if generic specifics make | ||
128 | * any sense at all. | ||
129 | * This file should become arch/i386/kernel/irq.c when the old irq.c | ||
130 | * moves to arch-independent land. | ||
131 | */ | ||
132 | |||
133 | DEFINE_SPINLOCK(i8259A_lock); | ||
134 | |||
135 | static void end_8259A_irq (unsigned int irq) | ||
136 | { | ||
137 | if (irq > 256) { | ||
138 | char var; | ||
139 | printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info); | ||
140 | |||
141 | BUG(); | ||
142 | } | ||
143 | |||
144 | if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && | ||
145 | irq_desc[irq].action) | ||
146 | enable_8259A_irq(irq); | ||
147 | } | ||
148 | |||
149 | #define shutdown_8259A_irq disable_8259A_irq | ||
150 | |||
151 | static void mask_and_ack_8259A(unsigned int); | ||
152 | |||
153 | static unsigned int startup_8259A_irq(unsigned int irq) | ||
154 | { | ||
155 | enable_8259A_irq(irq); | ||
156 | return 0; /* never anything pending */ | ||
157 | } | ||
158 | |||
159 | static struct hw_interrupt_type i8259A_irq_type = { | ||
160 | "XT-PIC", | ||
161 | startup_8259A_irq, | ||
162 | shutdown_8259A_irq, | ||
163 | enable_8259A_irq, | ||
164 | disable_8259A_irq, | ||
165 | mask_and_ack_8259A, | ||
166 | end_8259A_irq, | ||
167 | NULL | ||
168 | }; | ||
169 | |||
170 | /* | ||
171 | * 8259A PIC functions to handle ISA devices: | ||
172 | */ | ||
173 | |||
174 | /* | ||
175 | * This contains the IRQ mask for both 8259A interrupt controllers. | ||
176 | */ | ||
177 | static unsigned int cached_irq_mask = 0xffff; | ||
178 | |||
179 | #define __byte(x,y) (((unsigned char *)&(y))[x]) | ||
180 | #define cached_21 (__byte(0,cached_irq_mask)) | ||
181 | #define cached_A1 (__byte(1,cached_irq_mask)) | ||
182 | |||
183 | /* | ||
184 | * Not all IRQs can be routed through the IO-APIC; e.g. on certain (older) | ||
185 | * boards the timer interrupt is not really connected to any IO-APIC pin, | ||
186 | * it's fed to the master 8259A's IR0 line only. | ||
187 | * | ||
188 | * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. | ||
189 | * This 'mixed mode' IRQ handling costs nothing because it's only used | ||
190 | * at IRQ setup time. | ||
191 | */ | ||
192 | unsigned long io_apic_irqs; | ||
193 | |||
194 | void disable_8259A_irq(unsigned int irq) | ||
195 | { | ||
196 | unsigned int mask = 1 << irq; | ||
197 | unsigned long flags; | ||
198 | |||
199 | spin_lock_irqsave(&i8259A_lock, flags); | ||
200 | cached_irq_mask |= mask; | ||
201 | if (irq & 8) | ||
202 | outb(cached_A1,0xA1); | ||
203 | else | ||
204 | outb(cached_21,0x21); | ||
205 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
206 | } | ||
207 | |||
208 | void enable_8259A_irq(unsigned int irq) | ||
209 | { | ||
210 | unsigned int mask = ~(1 << irq); | ||
211 | unsigned long flags; | ||
212 | |||
213 | spin_lock_irqsave(&i8259A_lock, flags); | ||
214 | cached_irq_mask &= mask; | ||
215 | if (irq & 8) | ||
216 | outb(cached_A1,0xA1); | ||
217 | else | ||
218 | outb(cached_21,0x21); | ||
219 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
220 | } | ||
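[Editor's note] A worked example of the byte split used above (illustrative only): disabling IRQ 10 sets bit 10 of cached_irq_mask; __byte(1, ...) picks out the high byte, so cached_A1 gains bit 2 and the write goes to the slave PIC's mask register:

        /* disable_8259A_irq(10), step by step */
        cached_irq_mask |= 1 << 10;   /* mask word: bit 10 now set           */
        /* irq & 8 is non-zero, so the slave is reprogrammed:                */
        outb(cached_A1, 0xA1);        /* high byte, bit 2 = slave input IR2  */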
221 | |||
222 | int i8259A_irq_pending(unsigned int irq) | ||
223 | { | ||
224 | unsigned int mask = 1<<irq; | ||
225 | unsigned long flags; | ||
226 | int ret; | ||
227 | |||
228 | spin_lock_irqsave(&i8259A_lock, flags); | ||
229 | if (irq < 8) | ||
230 | ret = inb(0x20) & mask; | ||
231 | else | ||
232 | ret = inb(0xA0) & (mask >> 8); | ||
233 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
234 | |||
235 | return ret; | ||
236 | } | ||
237 | |||
238 | void make_8259A_irq(unsigned int irq) | ||
239 | { | ||
240 | disable_irq_nosync(irq); | ||
241 | io_apic_irqs &= ~(1<<irq); | ||
242 | irq_desc[irq].handler = &i8259A_irq_type; | ||
243 | enable_irq(irq); | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * This function is assumed to be called rarely. Switching between | ||
248 | * 8259A registers is slow. | ||
249 | * The caller must already hold the irq controller spinlock | ||
250 | * when calling this. | ||
251 | */ | ||
252 | static inline int i8259A_irq_real(unsigned int irq) | ||
253 | { | ||
254 | int value; | ||
255 | int irqmask = 1<<irq; | ||
256 | |||
257 | if (irq < 8) { | ||
258 | outb(0x0B,0x20); /* ISR register */ | ||
259 | value = inb(0x20) & irqmask; | ||
260 | outb(0x0A,0x20); /* back to the IRR register */ | ||
261 | return value; | ||
262 | } | ||
263 | outb(0x0B,0xA0); /* ISR register */ | ||
264 | value = inb(0xA0) & (irqmask >> 8); | ||
265 | outb(0x0A,0xA0); /* back to the IRR register */ | ||
266 | return value; | ||
267 | } | ||
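[Editor's note] The 0x0B/0x0A writes above are OCW3 commands: they select whether the next read from the PIC's command port returns the in-service register (ISR) or the interrupt-request register (IRR). Condensed, the slave-side check for, say, IRQ 12 reads (illustrative only):

        outb(0x0B, 0xA0);                        /* OCW3: next read returns the ISR */
        value = inb(0xA0) & ((1 << 12) >> 8);    /* slave bit for IRQ 12            */
        outb(0x0A, 0xA0);                        /* OCW3: back to the IRR           */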
268 | |||
269 | /* | ||
270 | * Careful! The 8259A is a fragile beast; it pretty | ||
271 | * much _has_ to be done exactly like this (mask it | ||
272 | * first, _then_ send the EOI, and the order of EOI | ||
273 | * to the two 8259s is important!). | ||
274 | */ | ||
275 | static void mask_and_ack_8259A(unsigned int irq) | ||
276 | { | ||
277 | unsigned int irqmask = 1 << irq; | ||
278 | unsigned long flags; | ||
279 | |||
280 | spin_lock_irqsave(&i8259A_lock, flags); | ||
281 | /* | ||
282 | * Lightweight spurious IRQ detection. We do not want | ||
283 | * to overdo spurious IRQ handling - it's usually a sign | ||
284 | * of hardware problems, so we only do the checks we can | ||
285 | * do without slowing down good hardware unnecessarily. | ||
286 | * | ||
287 | * Note that IRQ7 and IRQ15 (the two spurious IRQs | ||
288 | * usually resulting from the 8259A-1|2 PICs) occur | ||
289 | * even if the IRQ is masked in the 8259A. Thus we | ||
290 | * can check spurious 8259A IRQs without doing the | ||
291 | * quite slow i8259A_irq_real() call for every IRQ. | ||
292 | * This does not cover 100% of spurious interrupts, | ||
293 | * but should be enough to warn the user that there | ||
294 | * is something bad going on ... | ||
295 | */ | ||
296 | if (cached_irq_mask & irqmask) | ||
297 | goto spurious_8259A_irq; | ||
298 | cached_irq_mask |= irqmask; | ||
299 | |||
300 | handle_real_irq: | ||
301 | if (irq & 8) { | ||
302 | inb(0xA1); /* DUMMY - (do we need this?) */ | ||
303 | outb(cached_A1,0xA1); | ||
304 | outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ | ||
305 | outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ | ||
306 | } else { | ||
307 | inb(0x21); /* DUMMY - (do we need this?) */ | ||
308 | outb(cached_21,0x21); | ||
309 | outb(0x60+irq,0x20); /* 'Specific EOI' to master */ | ||
310 | } | ||
311 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
312 | return; | ||
313 | |||
314 | spurious_8259A_irq: | ||
315 | /* | ||
316 | * this is the slow path - should happen rarely. | ||
317 | */ | ||
318 | if (i8259A_irq_real(irq)) | ||
319 | /* | ||
320 | * oops, the IRQ _is_ in service according to the | ||
321 | * 8259A - not spurious, go handle it. | ||
322 | */ | ||
323 | goto handle_real_irq; | ||
324 | |||
325 | { | ||
326 | static int spurious_irq_mask; | ||
327 | /* | ||
328 | * At this point we can be sure the IRQ is spurious, | ||
329 | * let's ACK and report it. [once per IRQ] | ||
330 | */ | ||
331 | if (!(spurious_irq_mask & irqmask)) { | ||
332 | printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); | ||
333 | spurious_irq_mask |= irqmask; | ||
334 | } | ||
335 | atomic_inc(&irq_err_count); | ||
336 | /* | ||
337 | * Theoretically we do not have to handle this IRQ, | ||
338 | * but in Linux this does not cause problems and is | ||
339 | * simpler for us. | ||
340 | */ | ||
341 | goto handle_real_irq; | ||
342 | } | ||
343 | } | ||
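[Editor's note] A worked instance of the handle_real_irq path above, for a slave interrupt (illustrative only): for IRQ 11 the specific-EOI command byte is 0x60 plus the in-service level on that PIC, and the cascade line (master IR2) must be EOIed afterwards, in exactly this order:

        outb(cached_A1, 0xA1);   /* mask it first: bit 3 of the slave mask    */
        outb(0x60 + 3, 0xA0);    /* specific EOI, slave level 3 (= IRQ 11)    */
        outb(0x62, 0x20);        /* specific EOI for the cascade (master IR2) */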
344 | |||
345 | void init_8259A(int auto_eoi) | ||
346 | { | ||
347 | unsigned long flags; | ||
348 | |||
349 | spin_lock_irqsave(&i8259A_lock, flags); | ||
350 | |||
351 | outb(0xff, 0x21); /* mask all of 8259A-1 */ | ||
352 | outb(0xff, 0xA1); /* mask all of 8259A-2 */ | ||
353 | |||
354 | /* | ||
355 | * outb_p - this has to work on a wide range of PC hardware. | ||
356 | */ | ||
357 | outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ | ||
358 | outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ | ||
359 | outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ | ||
360 | if (auto_eoi) | ||
361 | outb_p(0x03, 0x21); /* master does Auto EOI */ | ||
362 | else | ||
363 | outb_p(0x01, 0x21); /* master expects normal EOI */ | ||
364 | |||
365 | outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ | ||
366 | outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ | ||
367 | outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ | ||
368 | outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode | ||
369 | is to be investigated) */ | ||
370 | |||
371 | if (auto_eoi) | ||
372 | /* | ||
373 | * in AEOI mode we just have to mask the interrupt | ||
374 | * when acking. | ||
375 | */ | ||
376 | i8259A_irq_type.ack = disable_8259A_irq; | ||
377 | else | ||
378 | i8259A_irq_type.ack = mask_and_ack_8259A; | ||
379 | |||
380 | udelay(100); /* wait for 8259A to initialize */ | ||
381 | |||
382 | outb(cached_21, 0x21); /* restore master IRQ mask */ | ||
383 | outb(cached_A1, 0xA1); /* restore slave IRQ mask */ | ||
384 | |||
385 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
386 | } | ||
387 | |||
388 | static char irq_trigger[2]; | ||
389 | /** | ||
390 | * ELCR registers (0x4d0, 0x4d1) control the edge/level trigger mode of each IRQ | ||
391 | */ | ||
392 | static void restore_ELCR(char *trigger) | ||
393 | { | ||
394 | outb(trigger[0], 0x4d0); | ||
395 | outb(trigger[1], 0x4d1); | ||
396 | } | ||
397 | |||
398 | static void save_ELCR(char *trigger) | ||
399 | { | ||
400 | /* IRQ 0,1,2,8,13 are marked as reserved */ | ||
401 | trigger[0] = inb(0x4d0) & 0xF8; | ||
402 | trigger[1] = inb(0x4d1) & 0xDE; | ||
403 | } | ||
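[Editor's note] The same two registers are consulted per IRQ by EISA_ELCR() further down in io_apic.c; an equivalent one-line helper (editor's sketch) makes the layout explicit, one bit per IRQ with 1 meaning level-triggered:

        static int elcr_level_triggered(unsigned int irq)
        {
                return (inb(0x4d0 + (irq >> 3)) >> (irq & 7)) & 1;
        }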
404 | |||
405 | static int i8259A_resume(struct sys_device *dev) | ||
406 | { | ||
407 | init_8259A(0); | ||
408 | restore_ELCR(irq_trigger); | ||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | static int i8259A_suspend(struct sys_device *dev, u32 state) | ||
413 | { | ||
414 | save_ELCR(irq_trigger); | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | static struct sysdev_class i8259_sysdev_class = { | ||
419 | set_kset_name("i8259"), | ||
420 | .suspend = i8259A_suspend, | ||
421 | .resume = i8259A_resume, | ||
422 | }; | ||
423 | |||
424 | static struct sys_device device_i8259A = { | ||
425 | .id = 0, | ||
426 | .cls = &i8259_sysdev_class, | ||
427 | }; | ||
428 | |||
429 | static int __init i8259A_init_sysfs(void) | ||
430 | { | ||
431 | int error = sysdev_class_register(&i8259_sysdev_class); | ||
432 | if (!error) | ||
433 | error = sysdev_register(&device_i8259A); | ||
434 | return error; | ||
435 | } | ||
436 | |||
437 | device_initcall(i8259A_init_sysfs); | ||
438 | |||
439 | /* | ||
440 | * IRQ2 is the cascade interrupt to the second interrupt controller | ||
441 | */ | ||
442 | |||
443 | static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; | ||
444 | |||
445 | void __init init_ISA_irqs (void) | ||
446 | { | ||
447 | int i; | ||
448 | |||
449 | #ifdef CONFIG_X86_LOCAL_APIC | ||
450 | init_bsp_APIC(); | ||
451 | #endif | ||
452 | init_8259A(0); | ||
453 | |||
454 | for (i = 0; i < NR_IRQS; i++) { | ||
455 | irq_desc[i].status = IRQ_DISABLED; | ||
456 | irq_desc[i].action = NULL; | ||
457 | irq_desc[i].depth = 1; | ||
458 | |||
459 | if (i < 16) { | ||
460 | /* | ||
461 | * 16 old-style INTA-cycle interrupts: | ||
462 | */ | ||
463 | irq_desc[i].handler = &i8259A_irq_type; | ||
464 | } else { | ||
465 | /* | ||
466 | * 'high' PCI IRQs filled in on demand | ||
467 | */ | ||
468 | irq_desc[i].handler = &no_irq_type; | ||
469 | } | ||
470 | } | ||
471 | } | ||
472 | |||
473 | void apic_timer_interrupt(void); | ||
474 | void spurious_interrupt(void); | ||
475 | void error_interrupt(void); | ||
476 | void reschedule_interrupt(void); | ||
477 | void call_function_interrupt(void); | ||
478 | void invalidate_interrupt(void); | ||
479 | void thermal_interrupt(void); | ||
480 | void i8254_timer_resume(void); | ||
481 | |||
482 | static void setup_timer(void) | ||
483 | { | ||
484 | outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ | ||
485 | udelay(10); | ||
486 | outb_p(LATCH & 0xff , 0x40); /* LSB */ | ||
487 | udelay(10); | ||
488 | outb(LATCH >> 8 , 0x40); /* MSB */ | ||
489 | } | ||
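[Editor's note] LATCH is defined in the generic headers, roughly as (CLOCK_TICK_RATE + HZ/2) / HZ. With the PC's ~1.193182 MHz PIT input clock and HZ = 1000 that works out to:

        /* LATCH = (1193182 + 500) / 1000 = 1193, so channel 0 in mode 2
         * (rate generator) raises IRQ0 about once per millisecond, i.e. HZ
         * times per second -- what init_IRQ() below calls setting the
         * clock to HZ Hz. */

(The exact constants are assumptions for illustration; they live in the timex/jiffies headers, not in this file.)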
490 | |||
491 | static int timer_resume(struct sys_device *dev) | ||
492 | { | ||
493 | setup_timer(); | ||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | void i8254_timer_resume(void) | ||
498 | { | ||
499 | setup_timer(); | ||
500 | } | ||
501 | |||
502 | static struct sysdev_class timer_sysclass = { | ||
503 | set_kset_name("timer"), | ||
504 | .resume = timer_resume, | ||
505 | }; | ||
506 | |||
507 | static struct sys_device device_timer = { | ||
508 | .id = 0, | ||
509 | .cls = &timer_sysclass, | ||
510 | }; | ||
511 | |||
512 | static int __init init_timer_sysfs(void) | ||
513 | { | ||
514 | int error = sysdev_class_register(&timer_sysclass); | ||
515 | if (!error) | ||
516 | error = sysdev_register(&device_timer); | ||
517 | return error; | ||
518 | } | ||
519 | |||
520 | device_initcall(init_timer_sysfs); | ||
521 | |||
522 | void __init init_IRQ(void) | ||
523 | { | ||
524 | int i; | ||
525 | |||
526 | init_ISA_irqs(); | ||
527 | /* | ||
528 | * Cover the whole vector space, no vector can escape | ||
529 | * us. (some of these will be overridden and become | ||
530 | * 'special' SMP interrupts) | ||
531 | */ | ||
532 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
533 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
534 | if (i >= NR_IRQS) | ||
535 | break; | ||
536 | if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) { | ||
537 | set_intr_gate(vector, interrupt[i]); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | #ifdef CONFIG_SMP | ||
542 | /* | ||
543 | * IRQ0 must be given a fixed assignment and initialized, | ||
544 | * because it's used before the IO-APIC is set up. | ||
545 | */ | ||
546 | set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); | ||
547 | |||
548 | /* | ||
549 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
550 | * IPI, driven by wakeup. | ||
551 | */ | ||
552 | set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
553 | |||
554 | /* IPI for invalidation */ | ||
555 | set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | ||
556 | |||
557 | /* IPI for generic function call */ | ||
558 | set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
559 | #endif | ||
560 | set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
561 | |||
562 | #ifdef CONFIG_X86_LOCAL_APIC | ||
563 | /* self generated IPI for local APIC timer */ | ||
564 | set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
565 | |||
566 | /* IPI vectors for APIC spurious and error interrupts */ | ||
567 | set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
568 | set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
569 | #endif | ||
570 | |||
571 | /* | ||
572 | * Set the clock to HZ Hz; we already have a valid | ||
573 | * vector now: | ||
574 | */ | ||
575 | setup_timer(); | ||
576 | |||
577 | if (!acpi_ioapic) | ||
578 | setup_irq(2, &irq2); | ||
579 | } | ||
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c new file mode 100644 index 000000000000..c4dc91491157 --- /dev/null +++ b/arch/x86_64/kernel/init_task.c | |||
@@ -0,0 +1,49 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/init_task.h> | ||
6 | #include <linux/fs.h> | ||
7 | #include <linux/mqueue.h> | ||
8 | |||
9 | #include <asm/uaccess.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/desc.h> | ||
12 | |||
13 | static struct fs_struct init_fs = INIT_FS; | ||
14 | static struct files_struct init_files = INIT_FILES; | ||
15 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | ||
16 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | ||
17 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
18 | |||
19 | EXPORT_SYMBOL(init_mm); | ||
20 | |||
21 | /* | ||
22 | * Initial task structure. | ||
23 | * | ||
24 | * We need to make sure that this is 8192-byte aligned due to the | ||
25 | * way process stacks are handled. This is done by having a special | ||
26 | * "init_task" linker map entry. | ||
27 | */ | ||
28 | union thread_union init_thread_union | ||
29 | __attribute__((__section__(".data.init_task"))) = | ||
30 | { INIT_THREAD_INFO(init_task) }; | ||
31 | |||
32 | /* | ||
33 | * Initial task structure. | ||
34 | * | ||
35 | * All other task structs will be allocated on slabs in fork.c | ||
36 | */ | ||
37 | struct task_struct init_task = INIT_TASK(init_task); | ||
38 | |||
39 | EXPORT_SYMBOL(init_task); | ||
40 | /* | ||
41 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
42 | * no more per-task TSS's. Each TSS is kept cacheline-aligned | ||
43 | * so it is allowed to end up in the .data.cacheline_aligned | ||
44 | * section. Since TSS's are completely CPU-local, we want them | ||
45 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | ||
46 | */ | ||
47 | DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; | ||
48 | |||
49 | #define ALIGN_TO_4K __attribute__((section(".data.init_task"))) | ||
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c new file mode 100644 index 000000000000..adebe70c180e --- /dev/null +++ b/arch/x86_64/kernel/io_apic.c | |||
@@ -0,0 +1,1982 @@ | |||
1 | /* | ||
2 | * Intel IO-APIC support for multi-Pentium hosts. | ||
3 | * | ||
4 | * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | ||
5 | * | ||
6 | * Many thanks to Stig Venaas for trying out countless experimental | ||
7 | * patches and reporting/debugging problems patiently! | ||
8 | * | ||
9 | * (c) 1999, Multiple IO-APIC support, developed by | ||
10 | * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | ||
11 | * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | ||
12 | * further tested and cleaned up by Zach Brown <zab@redhat.com> | ||
13 | * and Ingo Molnar <mingo@redhat.com> | ||
14 | * | ||
15 | * Fixes | ||
16 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
17 | * thanks to Eric Gilmore | ||
18 | * and Rolf G. Tews | ||
19 | * for testing these extensively | ||
20 | * Paul Diefenbaugh : Added full ACPI support | ||
21 | */ | ||
22 | |||
23 | #include <linux/mm.h> | ||
24 | #include <linux/irq.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/delay.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/config.h> | ||
30 | #include <linux/smp_lock.h> | ||
31 | #include <linux/mc146818rtc.h> | ||
32 | #include <linux/acpi.h> | ||
33 | #include <linux/sysdev.h> | ||
34 | |||
35 | #include <asm/io.h> | ||
36 | #include <asm/smp.h> | ||
37 | #include <asm/desc.h> | ||
38 | #include <asm/proto.h> | ||
39 | #include <asm/mach_apic.h> | ||
40 | |||
41 | #define __apicdebuginit __init | ||
42 | |||
43 | int sis_apic_bug; /* not actually supported, dummy for compile */ | ||
44 | |||
45 | static DEFINE_SPINLOCK(ioapic_lock); | ||
46 | |||
47 | /* | ||
48 | * # of IRQ routing registers | ||
49 | */ | ||
50 | int nr_ioapic_registers[MAX_IO_APICS]; | ||
51 | |||
52 | /* | ||
53 | * A rough estimate of how many shared IRQs there are; can | ||
54 | * be changed anytime. | ||
55 | */ | ||
56 | #define MAX_PLUS_SHARED_IRQS NR_IRQS | ||
57 | #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | ||
58 | |||
59 | /* | ||
60 | * This is performance-critical; we want to do it O(1). | ||
61 | * | ||
62 | * The indexing order of this array favors 1:1 mappings | ||
63 | * between pins and IRQs. | ||
64 | */ | ||
65 | |||
66 | static struct irq_pin_list { | ||
67 | short apic, pin, next; | ||
68 | } irq_2_pin[PIN_MAP_SIZE]; | ||
69 | |||
70 | int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; | ||
71 | #ifdef CONFIG_PCI_MSI | ||
72 | #define vector_to_irq(vector) \ | ||
73 | (platform_legacy_irq(vector) ? vector : vector_irq[vector]) | ||
74 | #else | ||
75 | #define vector_to_irq(vector) (vector) | ||
76 | #endif | ||
77 | |||
78 | /* | ||
79 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | ||
80 | * shared ISA-space IRQs, so we have to support them. We are super | ||
81 | * fast in the common case, and fast for shared ISA-space IRQs. | ||
82 | */ | ||
83 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) | ||
84 | { | ||
85 | static int first_free_entry = NR_IRQS; | ||
86 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
87 | |||
88 | while (entry->next) | ||
89 | entry = irq_2_pin + entry->next; | ||
90 | |||
91 | if (entry->pin != -1) { | ||
92 | entry->next = first_free_entry; | ||
93 | entry = irq_2_pin + entry->next; | ||
94 | if (++first_free_entry >= PIN_MAP_SIZE) | ||
95 | panic("io_apic.c: whoops"); | ||
96 | } | ||
97 | entry->apic = apic; | ||
98 | entry->pin = pin; | ||
99 | } | ||
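[Editor's note] A short walkthrough of the list handling above (illustrative only; it assumes the pin fields have been initialised to -1 during IO-APIC bring-up): the first pin registered for an IRQ lands directly in irq_2_pin[irq], and a second pin for the same IRQ chains an overflow slot taken from the top half of the array:

        add_pin_to_irq(9, 0, 11);  /* irq_2_pin[9] = { .apic = 0, .pin = 11 }        */
        add_pin_to_irq(9, 1, 3);   /* sees .pin != -1, so it links a new slot:       */
                                   /*   irq_2_pin[9].next = NR_IRQS                  */
                                   /*   irq_2_pin[NR_IRQS] = { .apic = 1, .pin = 3 } */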
100 | |||
101 | #define __DO_ACTION(R, ACTION, FINAL) \ | ||
102 | \ | ||
103 | { \ | ||
104 | int pin; \ | ||
105 | struct irq_pin_list *entry = irq_2_pin + irq; \ | ||
106 | \ | ||
107 | for (;;) { \ | ||
108 | unsigned int reg; \ | ||
109 | pin = entry->pin; \ | ||
110 | if (pin == -1) \ | ||
111 | break; \ | ||
112 | reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ | ||
113 | reg ACTION; \ | ||
114 | io_apic_modify(entry->apic, reg); \ | ||
115 | if (!entry->next) \ | ||
116 | break; \ | ||
117 | entry = irq_2_pin + entry->next; \ | ||
118 | } \ | ||
119 | FINAL; \ | ||
120 | } | ||
121 | |||
122 | #define DO_ACTION(name,R,ACTION, FINAL) \ | ||
123 | \ | ||
124 | static void name##_IO_APIC_irq (unsigned int irq) \ | ||
125 | __DO_ACTION(R, ACTION, FINAL) | ||
126 | |||
127 | DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) | ||
128 | /* mask = 1 */ | ||
129 | DO_ACTION( __unmask, 0, &= 0xfffeffff, ) | ||
130 | /* mask = 0 */ | ||
131 | |||
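[Editor's note] Spelled out, DO_ACTION(__mask, 0, |= 0x00010000, io_apic_sync(entry->apic)) expands to the function below; bit 16 of an I/O APIC redirection-table entry is its mask bit. This is purely the macro expansion from above, reproduced for readability:

        static void __mask_IO_APIC_irq(unsigned int irq)
        {
                int pin;
                struct irq_pin_list *entry = irq_2_pin + irq;

                for (;;) {
                        unsigned int reg;
                        pin = entry->pin;
                        if (pin == -1)
                                break;
                        reg = io_apic_read(entry->apic, 0x10 + 0 + pin*2);
                        reg |= 0x00010000;               /* set the mask bit */
                        io_apic_modify(entry->apic, reg);
                        if (!entry->next)
                                break;
                        entry = irq_2_pin + entry->next;
                }
                io_apic_sync(entry->apic);               /* the FINAL action */
        }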
132 | static void mask_IO_APIC_irq (unsigned int irq) | ||
133 | { | ||
134 | unsigned long flags; | ||
135 | |||
136 | spin_lock_irqsave(&ioapic_lock, flags); | ||
137 | __mask_IO_APIC_irq(irq); | ||
138 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
139 | } | ||
140 | |||
141 | static void unmask_IO_APIC_irq (unsigned int irq) | ||
142 | { | ||
143 | unsigned long flags; | ||
144 | |||
145 | spin_lock_irqsave(&ioapic_lock, flags); | ||
146 | __unmask_IO_APIC_irq(irq); | ||
147 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
148 | } | ||
149 | |||
150 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | ||
151 | { | ||
152 | struct IO_APIC_route_entry entry; | ||
153 | unsigned long flags; | ||
154 | |||
155 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ | ||
156 | spin_lock_irqsave(&ioapic_lock, flags); | ||
157 | *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); | ||
158 | *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); | ||
159 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
160 | if (entry.delivery_mode == dest_SMI) | ||
161 | return; | ||
162 | /* | ||
163 | * Disable it in the IO-APIC irq-routing table: | ||
164 | */ | ||
165 | memset(&entry, 0, sizeof(entry)); | ||
166 | entry.mask = 1; | ||
167 | spin_lock_irqsave(&ioapic_lock, flags); | ||
168 | io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); | ||
169 | io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); | ||
170 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
171 | } | ||
172 | |||
173 | static void clear_IO_APIC (void) | ||
174 | { | ||
175 | int apic, pin; | ||
176 | |||
177 | for (apic = 0; apic < nr_ioapics; apic++) | ||
178 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | ||
179 | clear_IO_APIC_pin(apic, pin); | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to | ||
184 | * specific CPU-side IRQs. | ||
185 | */ | ||
186 | |||
187 | #define MAX_PIRQS 8 | ||
188 | static int pirq_entries [MAX_PIRQS]; | ||
189 | static int pirqs_enabled; | ||
190 | int skip_ioapic_setup; | ||
191 | int ioapic_force; | ||
192 | |||
193 | /* dummy parsing: see setup.c */ | ||
194 | |||
195 | static int __init disable_ioapic_setup(char *str) | ||
196 | { | ||
197 | skip_ioapic_setup = 1; | ||
198 | return 1; | ||
199 | } | ||
200 | |||
201 | static int __init enable_ioapic_setup(char *str) | ||
202 | { | ||
203 | ioapic_force = 1; | ||
204 | skip_ioapic_setup = 0; | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | __setup("noapic", disable_ioapic_setup); | ||
209 | __setup("apic", enable_ioapic_setup); | ||
210 | |||
211 | #include <asm/pci-direct.h> | ||
212 | #include <linux/pci_ids.h> | ||
213 | #include <linux/pci.h> | ||
214 | |||
215 | /* Temporary hack. Nvidia and VIA boards currently only work with the IO-APIC | ||
216 | off. Check for an Nvidia or VIA PCI bridge and, if one is found, leave the IO-APIC disabled. | ||
217 | Use the PCI direct infrastructure because this runs before the PCI subsystem is up. | ||
218 | |||
219 | Can be overridden with "apic". | ||
220 | |||
221 | And another hack to disable the IOMMU on VIA chipsets. | ||
222 | |||
223 | Kludge-O-Rama. */ | ||
224 | void __init check_ioapic(void) | ||
225 | { | ||
226 | int num,slot,func; | ||
227 | if (ioapic_force) | ||
228 | return; | ||
229 | |||
230 | /* Poor man's PCI discovery */ | ||
231 | for (num = 0; num < 32; num++) { | ||
232 | for (slot = 0; slot < 32; slot++) { | ||
233 | for (func = 0; func < 8; func++) { | ||
234 | u32 class; | ||
235 | u32 vendor; | ||
236 | u8 type; | ||
237 | class = read_pci_config(num,slot,func, | ||
238 | PCI_CLASS_REVISION); | ||
239 | if (class == 0xffffffff) | ||
240 | break; | ||
241 | |||
242 | if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) | ||
243 | continue; | ||
244 | |||
245 | vendor = read_pci_config(num, slot, func, | ||
246 | PCI_VENDOR_ID); | ||
247 | vendor &= 0xffff; | ||
248 | switch (vendor) { | ||
249 | case PCI_VENDOR_ID_VIA: | ||
250 | #ifdef CONFIG_GART_IOMMU | ||
251 | if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) || | ||
252 | force_iommu) && | ||
253 | !iommu_aperture_allowed) { | ||
254 | printk(KERN_INFO | ||
255 | "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n"); | ||
256 | iommu_aperture_disabled = 1; | ||
257 | } | ||
258 | #endif | ||
259 | return; | ||
260 | case PCI_VENDOR_ID_NVIDIA: | ||
261 | #ifdef CONFIG_ACPI | ||
262 | /* All timer overrides on Nvidia | ||
263 | seem to be wrong. Skip them. */ | ||
264 | acpi_skip_timer_override = 1; | ||
265 | printk(KERN_INFO | ||
266 | "Nvidia board detected. Ignoring ACPI timer override.\n"); | ||
267 | #endif | ||
268 | /* RED-PEN skip them on mptables too? */ | ||
269 | return; | ||
270 | } | ||
271 | |||
272 | /* No multi-function device? */ | ||
273 | type = read_pci_config_byte(num,slot,func, | ||
274 | PCI_HEADER_TYPE); | ||
275 | if (!(type & 0x80)) | ||
276 | break; | ||
277 | } | ||
278 | } | ||
279 | } | ||
280 | } | ||
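[Editor's note] read_pci_config() and read_pci_config_byte() come from asm/pci-direct.h and poke the hardware directly, since this code runs before the PCI subsystem is up. For reference, a sketch of PCI configuration mechanism #1, which such helpers are built on (illustrative only; the real helpers may differ in detail):

        static u32 pci_conf1_read(u8 bus, u8 slot, u8 func, u8 offset)
        {
                /* 0xCF8: config address = enable | bus | device | function | register */
                outl(0x80000000 | (bus << 16) | (slot << 11) | (func << 8)
                     | (offset & 0xfc), 0xCF8);
                return inl(0xCFC);     /* 0xCFC: config data */
        }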
281 | |||
282 | static int __init ioapic_pirq_setup(char *str) | ||
283 | { | ||
284 | int i, max; | ||
285 | int ints[MAX_PIRQS+1]; | ||
286 | |||
287 | get_options(str, ARRAY_SIZE(ints), ints); | ||
288 | |||
289 | for (i = 0; i < MAX_PIRQS; i++) | ||
290 | pirq_entries[i] = -1; | ||
291 | |||
292 | pirqs_enabled = 1; | ||
293 | apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); | ||
294 | max = MAX_PIRQS; | ||
295 | if (ints[0] < MAX_PIRQS) | ||
296 | max = ints[0]; | ||
297 | |||
298 | for (i = 0; i < max; i++) { | ||
299 | apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); | ||
300 | /* | ||
301 | * PIRQs are mapped upside down, usually. | ||
302 | */ | ||
303 | pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; | ||
304 | } | ||
305 | return 1; | ||
306 | } | ||
307 | |||
308 | __setup("pirq=", ioapic_pirq_setup); | ||
309 | |||
310 | /* | ||
311 | * Find the IRQ entry number of a certain pin. | ||
312 | */ | ||
313 | static int find_irq_entry(int apic, int pin, int type) | ||
314 | { | ||
315 | int i; | ||
316 | |||
317 | for (i = 0; i < mp_irq_entries; i++) | ||
318 | if (mp_irqs[i].mpc_irqtype == type && | ||
319 | (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || | ||
320 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && | ||
321 | mp_irqs[i].mpc_dstirq == pin) | ||
322 | return i; | ||
323 | |||
324 | return -1; | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * Find the pin to which IRQ[irq] (ISA) is connected | ||
329 | */ | ||
330 | static int __init find_isa_irq_pin(int irq, int type) | ||
331 | { | ||
332 | int i; | ||
333 | |||
334 | for (i = 0; i < mp_irq_entries; i++) { | ||
335 | int lbus = mp_irqs[i].mpc_srcbus; | ||
336 | |||
337 | if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || | ||
338 | mp_bus_id_to_type[lbus] == MP_BUS_EISA || | ||
339 | mp_bus_id_to_type[lbus] == MP_BUS_MCA) && | ||
340 | (mp_irqs[i].mpc_irqtype == type) && | ||
341 | (mp_irqs[i].mpc_srcbusirq == irq)) | ||
342 | |||
343 | return mp_irqs[i].mpc_dstirq; | ||
344 | } | ||
345 | return -1; | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * Find a specific PCI IRQ entry. | ||
350 | * Not an __init, possibly needed by modules | ||
351 | */ | ||
352 | static int pin_2_irq(int idx, int apic, int pin); | ||
353 | |||
354 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | ||
355 | { | ||
356 | int apic, i, best_guess = -1; | ||
357 | |||
358 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", | ||
359 | bus, slot, pin); | ||
360 | if (mp_bus_id_to_pci_bus[bus] == -1) { | ||
361 | apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
362 | return -1; | ||
363 | } | ||
364 | for (i = 0; i < mp_irq_entries; i++) { | ||
365 | int lbus = mp_irqs[i].mpc_srcbus; | ||
366 | |||
367 | for (apic = 0; apic < nr_ioapics; apic++) | ||
368 | if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || | ||
369 | mp_irqs[i].mpc_dstapic == MP_APIC_ALL) | ||
370 | break; | ||
371 | |||
372 | if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && | ||
373 | !mp_irqs[i].mpc_irqtype && | ||
374 | (bus == lbus) && | ||
375 | (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { | ||
376 | int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); | ||
377 | |||
378 | if (!(apic || IO_APIC_IRQ(irq))) | ||
379 | continue; | ||
380 | |||
381 | if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) | ||
382 | return irq; | ||
383 | /* | ||
384 | * Use the first all-but-pin matching entry as a | ||
385 | * best-guess fuzzy result for broken mptables. | ||
386 | */ | ||
387 | if (best_guess < 0) | ||
388 | best_guess = irq; | ||
389 | } | ||
390 | } | ||
391 | return best_guess; | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * EISA Edge/Level control register, ELCR | ||
396 | */ | ||
397 | static int EISA_ELCR(unsigned int irq) | ||
398 | { | ||
399 | if (irq < 16) { | ||
400 | unsigned int port = 0x4d0 + (irq >> 3); | ||
401 | return (inb(port) >> (irq & 7)) & 1; | ||
402 | } | ||
403 | apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); | ||
404 | return 0; | ||
405 | } | ||
406 | |||
407 | /* EISA interrupts are always polarity zero and can be edge or level | ||
408 | * triggered depending on the ELCR value. If an interrupt is listed as | ||
409 | * EISA conforming in the MP table, that means its trigger type must | ||
410 | * be read in from the ELCR. */ | ||
411 | |||
412 | #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) | ||
413 | #define default_EISA_polarity(idx) (0) | ||
414 | |||
415 | /* ISA interrupts are always polarity zero edge triggered, | ||
416 | * when listed as conforming in the MP table. */ | ||
417 | |||
418 | #define default_ISA_trigger(idx) (0) | ||
419 | #define default_ISA_polarity(idx) (0) | ||
420 | |||
421 | /* PCI interrupts are always polarity one level triggered, | ||
422 | * when listed as conforming in the MP table. */ | ||
423 | |||
424 | #define default_PCI_trigger(idx) (1) | ||
425 | #define default_PCI_polarity(idx) (1) | ||
426 | |||
427 | /* MCA interrupts are always polarity zero level triggered, | ||
428 | * when listed as conforming in the MP table. */ | ||
429 | |||
430 | #define default_MCA_trigger(idx) (1) | ||
431 | #define default_MCA_polarity(idx) (0) | ||
432 | |||
433 | static int __init MPBIOS_polarity(int idx) | ||
434 | { | ||
435 | int bus = mp_irqs[idx].mpc_srcbus; | ||
436 | int polarity; | ||
437 | |||
438 | /* | ||
439 | * Determine IRQ line polarity (high active or low active): | ||
440 | */ | ||
441 | switch (mp_irqs[idx].mpc_irqflag & 3) | ||
442 | { | ||
443 | case 0: /* conforms, ie. bus-type dependent polarity */ | ||
444 | { | ||
445 | switch (mp_bus_id_to_type[bus]) | ||
446 | { | ||
447 | case MP_BUS_ISA: /* ISA pin */ | ||
448 | { | ||
449 | polarity = default_ISA_polarity(idx); | ||
450 | break; | ||
451 | } | ||
452 | case MP_BUS_EISA: /* EISA pin */ | ||
453 | { | ||
454 | polarity = default_EISA_polarity(idx); | ||
455 | break; | ||
456 | } | ||
457 | case MP_BUS_PCI: /* PCI pin */ | ||
458 | { | ||
459 | polarity = default_PCI_polarity(idx); | ||
460 | break; | ||
461 | } | ||
462 | case MP_BUS_MCA: /* MCA pin */ | ||
463 | { | ||
464 | polarity = default_MCA_polarity(idx); | ||
465 | break; | ||
466 | } | ||
467 | default: | ||
468 | { | ||
469 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
470 | polarity = 1; | ||
471 | break; | ||
472 | } | ||
473 | } | ||
474 | break; | ||
475 | } | ||
476 | case 1: /* high active */ | ||
477 | { | ||
478 | polarity = 0; | ||
479 | break; | ||
480 | } | ||
481 | case 2: /* reserved */ | ||
482 | { | ||
483 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
484 | polarity = 1; | ||
485 | break; | ||
486 | } | ||
487 | case 3: /* low active */ | ||
488 | { | ||
489 | polarity = 1; | ||
490 | break; | ||
491 | } | ||
492 | default: /* invalid */ | ||
493 | { | ||
494 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
495 | polarity = 1; | ||
496 | break; | ||
497 | } | ||
498 | } | ||
499 | return polarity; | ||
500 | } | ||
501 | |||
502 | static int MPBIOS_trigger(int idx) | ||
503 | { | ||
504 | int bus = mp_irqs[idx].mpc_srcbus; | ||
505 | int trigger; | ||
506 | |||
507 | /* | ||
508 | * Determine IRQ trigger mode (edge or level sensitive): | ||
509 | */ | ||
510 | switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) | ||
511 | { | ||
512 | case 0: /* conforms, ie. bus-type dependent */ | ||
513 | { | ||
514 | switch (mp_bus_id_to_type[bus]) | ||
515 | { | ||
516 | case MP_BUS_ISA: /* ISA pin */ | ||
517 | { | ||
518 | trigger = default_ISA_trigger(idx); | ||
519 | break; | ||
520 | } | ||
521 | case MP_BUS_EISA: /* EISA pin */ | ||
522 | { | ||
523 | trigger = default_EISA_trigger(idx); | ||
524 | break; | ||
525 | } | ||
526 | case MP_BUS_PCI: /* PCI pin */ | ||
527 | { | ||
528 | trigger = default_PCI_trigger(idx); | ||
529 | break; | ||
530 | } | ||
531 | case MP_BUS_MCA: /* MCA pin */ | ||
532 | { | ||
533 | trigger = default_MCA_trigger(idx); | ||
534 | break; | ||
535 | } | ||
536 | default: | ||
537 | { | ||
538 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
539 | trigger = 1; | ||
540 | break; | ||
541 | } | ||
542 | } | ||
543 | break; | ||
544 | } | ||
545 | case 1: /* edge */ | ||
546 | { | ||
547 | trigger = 0; | ||
548 | break; | ||
549 | } | ||
550 | case 2: /* reserved */ | ||
551 | { | ||
552 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
553 | trigger = 1; | ||
554 | break; | ||
555 | } | ||
556 | case 3: /* level */ | ||
557 | { | ||
558 | trigger = 1; | ||
559 | break; | ||
560 | } | ||
561 | default: /* invalid */ | ||
562 | { | ||
563 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
564 | trigger = 0; | ||
565 | break; | ||
566 | } | ||
567 | } | ||
568 | return trigger; | ||
569 | } | ||
570 | |||
571 | static inline int irq_polarity(int idx) | ||
572 | { | ||
573 | return MPBIOS_polarity(idx); | ||
574 | } | ||
575 | |||
576 | static inline int irq_trigger(int idx) | ||
577 | { | ||
578 | return MPBIOS_trigger(idx); | ||
579 | } | ||
580 | |||
581 | static int pin_2_irq(int idx, int apic, int pin) | ||
582 | { | ||
583 | int irq, i; | ||
584 | int bus = mp_irqs[idx].mpc_srcbus; | ||
585 | |||
586 | /* | ||
587 | * Debugging check, we are in big trouble if this message pops up! | ||
588 | */ | ||
589 | if (mp_irqs[idx].mpc_dstirq != pin) | ||
590 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | ||
591 | |||
592 | switch (mp_bus_id_to_type[bus]) | ||
593 | { | ||
594 | case MP_BUS_ISA: /* ISA pin */ | ||
595 | case MP_BUS_EISA: | ||
596 | case MP_BUS_MCA: | ||
597 | { | ||
598 | irq = mp_irqs[idx].mpc_srcbusirq; | ||
599 | break; | ||
600 | } | ||
601 | case MP_BUS_PCI: /* PCI pin */ | ||
602 | { | ||
603 | /* | ||
604 | * PCI IRQs are mapped in order | ||
605 | */ | ||
606 | i = irq = 0; | ||
607 | while (i < apic) | ||
608 | irq += nr_ioapic_registers[i++]; | ||
609 | irq += pin; | ||
610 | break; | ||
611 | } | ||
612 | default: | ||
613 | { | ||
614 | printk(KERN_ERR "unknown bus type %d.\n",bus); | ||
615 | irq = 0; | ||
616 | break; | ||
617 | } | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | * PCI IRQ command line redirection. Yes, limits are hardcoded. | ||
622 | */ | ||
623 | if ((pin >= 16) && (pin <= 23)) { | ||
624 | if (pirq_entries[pin-16] != -1) { | ||
625 | if (!pirq_entries[pin-16]) { | ||
626 | apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); | ||
627 | } else { | ||
628 | irq = pirq_entries[pin-16]; | ||
629 | apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", | ||
630 | pin-16, irq); | ||
631 | } | ||
632 | } | ||
633 | } | ||
634 | return irq; | ||
635 | } | ||
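[Editor's note] A worked example of the PCI branch above (illustrative only): with two I/O APICs of 24 pins each, i.e. nr_ioapic_registers = { 24, 24 }, a PCI interrupt entering apic 1 at pin 5 yields

        irq = 24 /* pins of apic 0 */ + 5 /* pin */;   /* = 29 */

while ISA/EISA/MCA entries simply return mpc_srcbusirq, and pins 16-23 can still be rerouted afterwards through the pirq= command-line table handled just above.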
636 | |||
637 | static inline int IO_APIC_irq_trigger(int irq) | ||
638 | { | ||
639 | int apic, idx, pin; | ||
640 | |||
641 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
642 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
643 | idx = find_irq_entry(apic,pin,mp_INT); | ||
644 | if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) | ||
645 | return irq_trigger(idx); | ||
646 | } | ||
647 | } | ||
648 | /* | ||
649 | * nonexistent IRQs are edge default | ||
650 | */ | ||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ | ||
655 | u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; | ||
656 | |||
657 | int assign_irq_vector(int irq) | ||
658 | { | ||
659 | static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; | ||
660 | |||
661 | BUG_ON(irq >= NR_IRQ_VECTORS); | ||
662 | if (IO_APIC_VECTOR(irq) > 0) | ||
663 | return IO_APIC_VECTOR(irq); | ||
664 | next: | ||
665 | current_vector += 8; | ||
666 | if (current_vector == IA32_SYSCALL_VECTOR) | ||
667 | goto next; | ||
668 | |||
669 | if (current_vector >= FIRST_SYSTEM_VECTOR) { | ||
670 | offset++; | ||
671 | if (!(offset%8)) | ||
672 | return -ENOSPC; | ||
673 | current_vector = FIRST_DEVICE_VECTOR + offset; | ||
674 | } | ||
675 | |||
676 | vector_irq[current_vector] = irq; | ||
677 | if (irq != AUTO_ASSIGN) | ||
678 | IO_APIC_VECTOR(irq) = current_vector; | ||
679 | |||
680 | return current_vector; | ||
681 | } | ||
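[Editor's note] To make the allocation policy above concrete (the actual FIRST_DEVICE_VECTOR and FIRST_SYSTEM_VECTOR values live in asm/hw_irq.h; 0x31 and 0xef are assumed here for illustration): vectors are handed out eight apart so that consecutive devices land in different local APIC priority classes, IA32_SYSCALL_VECTOR (0x80) is skipped if the walk would land on it, and once the walk passes FIRST_SYSTEM_VECTOR it restarts near the bottom with an increasing offset to fill in the remaining slots:

        /* first callers:   0x39, 0x41, 0x49, 0x51, ...  (FIRST_DEVICE_VECTOR + 8k) */
        /* after wrapping:  FIRST_DEVICE_VECTOR + offset, again in steps of 8       */
        /* -ENOSPC once all eight offsets have been exhausted                       */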
682 | |||
683 | extern void (*interrupt[NR_IRQS])(void); | ||
684 | static struct hw_interrupt_type ioapic_level_type; | ||
685 | static struct hw_interrupt_type ioapic_edge_type; | ||
686 | |||
687 | #define IOAPIC_AUTO -1 | ||
688 | #define IOAPIC_EDGE 0 | ||
689 | #define IOAPIC_LEVEL 1 | ||
690 | |||
691 | static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) | ||
692 | { | ||
693 | if (use_pci_vector() && !platform_legacy_irq(irq)) { | ||
694 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | ||
695 | trigger == IOAPIC_LEVEL) | ||
696 | irq_desc[vector].handler = &ioapic_level_type; | ||
697 | else | ||
698 | irq_desc[vector].handler = &ioapic_edge_type; | ||
699 | set_intr_gate(vector, interrupt[vector]); | ||
700 | } else { | ||
701 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | ||
702 | trigger == IOAPIC_LEVEL) | ||
703 | irq_desc[irq].handler = &ioapic_level_type; | ||
704 | else | ||
705 | irq_desc[irq].handler = &ioapic_edge_type; | ||
706 | set_intr_gate(vector, interrupt[irq]); | ||
707 | } | ||
708 | } | ||
709 | |||
710 | static void __init setup_IO_APIC_irqs(void) | ||
711 | { | ||
712 | struct IO_APIC_route_entry entry; | ||
713 | int apic, pin, idx, irq, first_notcon = 1, vector; | ||
714 | unsigned long flags; | ||
715 | |||
716 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | ||
717 | |||
718 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
719 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
720 | |||
721 | /* | ||
722 | * add it to the IO-APIC irq-routing table: | ||
723 | */ | ||
724 | memset(&entry,0,sizeof(entry)); | ||
725 | |||
726 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
727 | entry.dest_mode = INT_DEST_MODE; | ||
728 | entry.mask = 0; /* enable IRQ */ | ||
729 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
730 | |||
731 | idx = find_irq_entry(apic,pin,mp_INT); | ||
732 | if (idx == -1) { | ||
733 | if (first_notcon) { | ||
734 | apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
735 | first_notcon = 0; | ||
736 | } else | ||
737 | apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); | ||
738 | continue; | ||
739 | } | ||
740 | |||
741 | entry.trigger = irq_trigger(idx); | ||
742 | entry.polarity = irq_polarity(idx); | ||
743 | |||
744 | if (irq_trigger(idx)) { | ||
745 | entry.trigger = 1; | ||
746 | entry.mask = 1; | ||
747 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
748 | } | ||
749 | |||
750 | irq = pin_2_irq(idx, apic, pin); | ||
751 | add_pin_to_irq(irq, apic, pin); | ||
752 | |||
753 | if (!apic && !IO_APIC_IRQ(irq)) | ||
754 | continue; | ||
755 | |||
756 | if (IO_APIC_IRQ(irq)) { | ||
757 | vector = assign_irq_vector(irq); | ||
758 | entry.vector = vector; | ||
759 | |||
760 | ioapic_register_intr(irq, vector, IOAPIC_AUTO); | ||
761 | if (!apic && (irq < 16)) | ||
762 | disable_8259A_irq(irq); | ||
763 | } | ||
764 | spin_lock_irqsave(&ioapic_lock, flags); | ||
765 | io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); | ||
766 | io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); | ||
767 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
768 | } | ||
769 | } | ||
770 | |||
771 | if (!first_notcon) | ||
772 | apic_printk(APIC_VERBOSE," not connected.\n"); | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * Set up the 8259A-master output pin as broadcast to all | ||
777 | * CPUs. | ||
778 | */ | ||
779 | static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) | ||
780 | { | ||
781 | struct IO_APIC_route_entry entry; | ||
782 | unsigned long flags; | ||
783 | |||
784 | memset(&entry,0,sizeof(entry)); | ||
785 | |||
786 | disable_8259A_irq(0); | ||
787 | |||
788 | /* mask LVT0 */ | ||
789 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
790 | |||
791 | /* | ||
792 | * We use logical delivery to get the timer IRQ | ||
793 | * to the first CPU. | ||
794 | */ | ||
795 | entry.dest_mode = INT_DEST_MODE; | ||
796 | entry.mask = 0; /* unmask IRQ now */ | ||
797 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
798 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
799 | entry.polarity = 0; | ||
800 | entry.trigger = 0; | ||
801 | entry.vector = vector; | ||
802 | |||
803 | /* | ||
804 | * The timer IRQ doesn't have to know that behind the | ||
805 | * scenes we have an 8259A master in AEOI mode ... | ||
806 | */ | ||
807 | irq_desc[0].handler = &ioapic_edge_type; | ||
808 | |||
809 | /* | ||
810 | * Add it to the IO-APIC irq-routing table: | ||
811 | */ | ||
812 | spin_lock_irqsave(&ioapic_lock, flags); | ||
813 | io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); | ||
814 | io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); | ||
815 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
816 | |||
817 | enable_8259A_irq(0); | ||
818 | } | ||
819 | |||
820 | void __init UNEXPECTED_IO_APIC(void) | ||
821 | { | ||
822 | } | ||
823 | |||
824 | void __apicdebuginit print_IO_APIC(void) | ||
825 | { | ||
826 | int apic, i; | ||
827 | union IO_APIC_reg_00 reg_00; | ||
828 | union IO_APIC_reg_01 reg_01; | ||
829 | union IO_APIC_reg_02 reg_02; | ||
830 | unsigned long flags; | ||
831 | |||
832 | if (apic_verbosity == APIC_QUIET) | ||
833 | return; | ||
834 | |||
835 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | ||
836 | for (i = 0; i < nr_ioapics; i++) | ||
837 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | ||
838 | mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); | ||
839 | |||
840 | /* | ||
841 | * We are a bit conservative about what we expect. We have to | ||
842 | * know about every hardware change ASAP. | ||
843 | */ | ||
844 | printk(KERN_INFO "testing the IO APIC.......................\n"); | ||
845 | |||
846 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
847 | |||
848 | spin_lock_irqsave(&ioapic_lock, flags); | ||
849 | reg_00.raw = io_apic_read(apic, 0); | ||
850 | reg_01.raw = io_apic_read(apic, 1); | ||
851 | if (reg_01.bits.version >= 0x10) | ||
852 | reg_02.raw = io_apic_read(apic, 2); | ||
853 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
854 | |||
855 | printk("\n"); | ||
856 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); | ||
857 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | ||
858 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | ||
859 | if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) | ||
860 | UNEXPECTED_IO_APIC(); | ||
861 | |||
862 | printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); | ||
863 | printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | ||
864 | if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ | ||
865 | (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ | ||
866 | (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ | ||
867 | (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ | ||
868 | (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ | ||
869 | (reg_01.bits.entries != 0x2E) && | ||
870 | (reg_01.bits.entries != 0x3F) && | ||
871 | (reg_01.bits.entries != 0x03) | ||
872 | ) | ||
873 | UNEXPECTED_IO_APIC(); | ||
874 | |||
875 | printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | ||
876 | printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | ||
877 | if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ | ||
878 | (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ | ||
879 | (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ | ||
880 | (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ | ||
881 | (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ | ||
882 | (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ | ||
883 | ) | ||
884 | UNEXPECTED_IO_APIC(); | ||
885 | if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) | ||
886 | UNEXPECTED_IO_APIC(); | ||
887 | |||
888 | if (reg_01.bits.version >= 0x10) { | ||
889 | printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | ||
890 | printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | ||
891 | if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) | ||
892 | UNEXPECTED_IO_APIC(); | ||
893 | } | ||
894 | |||
895 | printk(KERN_DEBUG ".... IRQ redirection table:\n"); | ||
896 | |||
897 | printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" | ||
898 | " Stat Dest Deli Vect: \n"); | ||
899 | |||
900 | for (i = 0; i <= reg_01.bits.entries; i++) { | ||
901 | struct IO_APIC_route_entry entry; | ||
902 | |||
903 | spin_lock_irqsave(&ioapic_lock, flags); | ||
904 | *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); | ||
905 | *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); | ||
906 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
907 | |||
908 | printk(KERN_DEBUG " %02x %03X %02X ", | ||
909 | i, | ||
910 | entry.dest.logical.logical_dest, | ||
911 | entry.dest.physical.physical_dest | ||
912 | ); | ||
913 | |||
914 | printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", | ||
915 | entry.mask, | ||
916 | entry.trigger, | ||
917 | entry.irr, | ||
918 | entry.polarity, | ||
919 | entry.delivery_status, | ||
920 | entry.dest_mode, | ||
921 | entry.delivery_mode, | ||
922 | entry.vector | ||
923 | ); | ||
924 | } | ||
925 | } | ||
926 | if (use_pci_vector()) | ||
927 | printk(KERN_INFO "Using vector-based indexing\n"); | ||
928 | printk(KERN_DEBUG "IRQ to pin mappings:\n"); | ||
929 | for (i = 0; i < NR_IRQS; i++) { | ||
930 | struct irq_pin_list *entry = irq_2_pin + i; | ||
931 | if (entry->pin < 0) | ||
932 | continue; | ||
933 | if (use_pci_vector() && !platform_legacy_irq(i)) | ||
934 | printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); | ||
935 | else | ||
936 | printk(KERN_DEBUG "IRQ%d ", i); | ||
937 | for (;;) { | ||
938 | printk("-> %d:%d", entry->apic, entry->pin); | ||
939 | if (!entry->next) | ||
940 | break; | ||
941 | entry = irq_2_pin + entry->next; | ||
942 | } | ||
943 | printk("\n"); | ||
944 | } | ||
945 | |||
946 | printk(KERN_INFO ".................................... done.\n"); | ||
947 | |||
948 | return; | ||
949 | } | ||
950 | |||
951 | #if 0 | ||
952 | |||
953 | static __apicdebuginit void print_APIC_bitfield (int base) | ||
954 | { | ||
955 | unsigned int v; | ||
956 | int i, j; | ||
957 | |||
958 | if (apic_verbosity == APIC_QUIET) | ||
959 | return; | ||
960 | |||
961 | printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | ||
962 | for (i = 0; i < 8; i++) { | ||
963 | v = apic_read(base + i*0x10); | ||
964 | for (j = 0; j < 32; j++) { | ||
965 | if (v & (1<<j)) | ||
966 | printk("1"); | ||
967 | else | ||
968 | printk("0"); | ||
969 | } | ||
970 | printk("\n"); | ||
971 | } | ||
972 | } | ||
973 | |||
974 | void __apicdebuginit print_local_APIC(void * dummy) | ||
975 | { | ||
976 | unsigned int v, ver, maxlvt; | ||
977 | |||
978 | if (apic_verbosity == APIC_QUIET) | ||
979 | return; | ||
980 | |||
981 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | ||
982 | smp_processor_id(), hard_smp_processor_id()); | ||
983 | v = apic_read(APIC_ID); | ||
984 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); | ||
985 | v = apic_read(APIC_LVR); | ||
986 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | ||
987 | ver = GET_APIC_VERSION(v); | ||
988 | maxlvt = get_maxlvt(); | ||
989 | |||
990 | v = apic_read(APIC_TASKPRI); | ||
991 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | ||
992 | |||
993 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
994 | v = apic_read(APIC_ARBPRI); | ||
995 | printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | ||
996 | v & APIC_ARBPRI_MASK); | ||
997 | v = apic_read(APIC_PROCPRI); | ||
998 | printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | ||
999 | } | ||
1000 | |||
1001 | v = apic_read(APIC_EOI); | ||
1002 | printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | ||
1003 | v = apic_read(APIC_RRR); | ||
1004 | printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | ||
1005 | v = apic_read(APIC_LDR); | ||
1006 | printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | ||
1007 | v = apic_read(APIC_DFR); | ||
1008 | printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | ||
1009 | v = apic_read(APIC_SPIV); | ||
1010 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | ||
1011 | |||
1012 | printk(KERN_DEBUG "... APIC ISR field:\n"); | ||
1013 | print_APIC_bitfield(APIC_ISR); | ||
1014 | printk(KERN_DEBUG "... APIC TMR field:\n"); | ||
1015 | print_APIC_bitfield(APIC_TMR); | ||
1016 | printk(KERN_DEBUG "... APIC IRR field:\n"); | ||
1017 | print_APIC_bitfield(APIC_IRR); | ||
1018 | |||
1019 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
1020 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
1021 | apic_write(APIC_ESR, 0); | ||
1022 | v = apic_read(APIC_ESR); | ||
1023 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | ||
1024 | } | ||
1025 | |||
1026 | v = apic_read(APIC_ICR); | ||
1027 | printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | ||
1028 | v = apic_read(APIC_ICR2); | ||
1029 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | ||
1030 | |||
1031 | v = apic_read(APIC_LVTT); | ||
1032 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | ||
1033 | |||
1034 | if (maxlvt > 3) { /* PC is LVT#4. */ | ||
1035 | v = apic_read(APIC_LVTPC); | ||
1036 | printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); | ||
1037 | } | ||
1038 | v = apic_read(APIC_LVT0); | ||
1039 | printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); | ||
1040 | v = apic_read(APIC_LVT1); | ||
1041 | printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); | ||
1042 | |||
1043 | if (maxlvt > 2) { /* ERR is LVT#3. */ | ||
1044 | v = apic_read(APIC_LVTERR); | ||
1045 | printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); | ||
1046 | } | ||
1047 | |||
1048 | v = apic_read(APIC_TMICT); | ||
1049 | printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); | ||
1050 | v = apic_read(APIC_TMCCT); | ||
1051 | printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | ||
1052 | v = apic_read(APIC_TDCR); | ||
1053 | printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | ||
1054 | printk("\n"); | ||
1055 | } | ||
1056 | |||
1057 | void print_all_local_APICs (void) | ||
1058 | { | ||
1059 | on_each_cpu(print_local_APIC, NULL, 1, 1); | ||
1060 | } | ||
1061 | |||
1062 | void __apicdebuginit print_PIC(void) | ||
1063 | { | ||
1064 | extern spinlock_t i8259A_lock; | ||
1065 | unsigned int v; | ||
1066 | unsigned long flags; | ||
1067 | |||
1068 | if (apic_verbosity == APIC_QUIET) | ||
1069 | return; | ||
1070 | |||
1071 | printk(KERN_DEBUG "\nprinting PIC contents\n"); | ||
1072 | |||
1073 | spin_lock_irqsave(&i8259A_lock, flags); | ||
1074 | |||
1075 | v = inb(0xa1) << 8 | inb(0x21); | ||
1076 | printk(KERN_DEBUG "... PIC IMR: %04x\n", v); | ||
1077 | |||
1078 | v = inb(0xa0) << 8 | inb(0x20); | ||
1079 | printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | ||
1080 | |||
1081 | outb(0x0b,0xa0); | ||
1082 | outb(0x0b,0x20); | ||
1083 | v = inb(0xa0) << 8 | inb(0x20); | ||
1084 | outb(0x0a,0xa0); | ||
1085 | outb(0x0a,0x20); | ||
1086 | |||
1087 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
1088 | |||
1089 | printk(KERN_DEBUG "... PIC ISR: %04x\n", v); | ||
1090 | |||
1091 | v = inb(0x4d1) << 8 | inb(0x4d0); | ||
1092 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | ||
1093 | } | ||
1094 | |||
1095 | #endif /* 0 */ | ||
1096 | |||
1097 | static void __init enable_IO_APIC(void) | ||
1098 | { | ||
1099 | union IO_APIC_reg_01 reg_01; | ||
1100 | int i; | ||
1101 | unsigned long flags; | ||
1102 | |||
1103 | for (i = 0; i < PIN_MAP_SIZE; i++) { | ||
1104 | irq_2_pin[i].pin = -1; | ||
1105 | irq_2_pin[i].next = 0; | ||
1106 | } | ||
1107 | if (!pirqs_enabled) | ||
1108 | for (i = 0; i < MAX_PIRQS; i++) | ||
1109 | pirq_entries[i] = -1; | ||
1110 | |||
1111 | /* | ||
1112 | * The number of IO-APIC IRQ registers (== #pins): | ||
1113 | */ | ||
1114 | for (i = 0; i < nr_ioapics; i++) { | ||
1115 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1116 | reg_01.raw = io_apic_read(i, 1); | ||
1117 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1118 | nr_ioapic_registers[i] = reg_01.bits.entries+1; | ||
1119 | } | ||
1120 | |||
1121 | /* | ||
1122 | * Do not trust the IO-APIC being empty at bootup | ||
1123 | */ | ||
1124 | clear_IO_APIC(); | ||
1125 | } | ||
1126 | |||
1127 | /* | ||
1128 | * Not an __init, needed by the reboot code | ||
1129 | */ | ||
1130 | void disable_IO_APIC(void) | ||
1131 | { | ||
1132 | /* | ||
1133 | * Clear the IO-APIC before rebooting: | ||
1134 | */ | ||
1135 | clear_IO_APIC(); | ||
1136 | |||
1137 | disconnect_bsp_APIC(); | ||
1138 | } | ||
1139 | |||
1140 | /* | ||
1141 | * function to set the IO-APIC physical IDs based on the | ||
1142 | * values stored in the MPC table. | ||
1143 | * | ||
1144 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | ||
1145 | */ | ||
1146 | |||
1147 | static void __init setup_ioapic_ids_from_mpc (void) | ||
1148 | { | ||
1149 | union IO_APIC_reg_00 reg_00; | ||
1150 | int apic; | ||
1151 | int i; | ||
1152 | unsigned char old_id; | ||
1153 | unsigned long flags; | ||
1154 | |||
1155 | /* | ||
1156 | * Set the IOAPIC ID to the value stored in the MPC table. | ||
1157 | */ | ||
1158 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1159 | |||
1160 | /* Read the register 0 value */ | ||
1161 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1162 | reg_00.raw = io_apic_read(apic, 0); | ||
1163 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1164 | |||
1165 | old_id = mp_ioapics[apic].mpc_apicid; | ||
1166 | |||
1167 | |||
1168 | printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); | ||
1169 | |||
1170 | |||
1171 | /* | ||
1172 | * We need to adjust the IRQ routing table | ||
1173 | * if the ID changed. | ||
1174 | */ | ||
1175 | if (old_id != mp_ioapics[apic].mpc_apicid) | ||
1176 | for (i = 0; i < mp_irq_entries; i++) | ||
1177 | if (mp_irqs[i].mpc_dstapic == old_id) | ||
1178 | mp_irqs[i].mpc_dstapic | ||
1179 | = mp_ioapics[apic].mpc_apicid; | ||
1180 | |||
1181 | /* | ||
1182 | * Read the right value from the MPC table and | ||
1183 | * write it into the ID register. | ||
1184 | */ | ||
1185 | apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", | ||
1186 | mp_ioapics[apic].mpc_apicid); | ||
1187 | |||
1188 | reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; | ||
1189 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1190 | io_apic_write(apic, 0, reg_00.raw); | ||
1191 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1192 | |||
1193 | /* | ||
1194 | * Sanity check | ||
1195 | */ | ||
1196 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1197 | reg_00.raw = io_apic_read(apic, 0); | ||
1198 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1199 | if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) | ||
1200 | printk("could not set ID!\n"); | ||
1201 | else | ||
1202 | apic_printk(APIC_VERBOSE," ok.\n"); | ||
1203 | } | ||
1204 | } | ||
1205 | |||
1206 | /* | ||
1207 | * There is a nasty bug in some older SMP boards: their mptable lies | ||
1208 | * about the timer IRQ. We do the following to work around the situation: | ||
1209 | * | ||
1210 | * - timer IRQ defaults to IO-APIC IRQ | ||
1211 | * - if this function detects that timer IRQs are defunct, then we fall | ||
1212 | * back to ISA timer IRQs | ||
1213 | */ | ||
1214 | static int __init timer_irq_works(void) | ||
1215 | { | ||
1216 | unsigned long t1 = jiffies; | ||
1217 | |||
1218 | local_irq_enable(); | ||
1219 | /* Let ten ticks pass... */ | ||
1220 | mdelay((10 * 1000) / HZ); | ||
1221 | |||
1222 | /* | ||
1223 | * Expect a few ticks at least, to be sure some possible | ||
1224 | * glue logic does not lock up after the first one or two | ||
1225 | * ticks in a non-ExtINT mode. Also the local APIC | ||
1226 | * might have cached one ExtINT interrupt. Finally, at | ||
1227 | * least one tick may be lost due to delays. | ||
1228 | */ | ||
1229 | |||
1230 | /* jiffies wrap? */ | ||
1231 | if (jiffies - t1 > 4) | ||
1232 | return 1; | ||
1233 | return 0; | ||
1234 | } | ||
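/*
 * Worked example for timer_irq_works() above (illustrative, assuming
 * HZ == 1000): mdelay((10 * 1000) / HZ) becomes mdelay(10), a ~10 ms busy
 * wait, i.e. roughly 10 timer ticks. If the timer interrupt is being
 * delivered, jiffies advances by about 10 and "jiffies - t1 > 4" passes;
 * if the timer is dead, jiffies stays put and the function returns 0.
 */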
1235 | |||
1236 | /* | ||
1237 | * In the SMP+IOAPIC case it might happen that there are an unspecified | ||
1238 | * number of pending IRQ events unhandled. These cases are very rare, | ||
1239 | * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | ||
1240 | * better to do it this way, since then we do not have to be aware of | ||
1241 | * 'pending' interrupts in the IRQ path, except at this point. | ||
1242 | */ | ||
1243 | /* | ||
1244 | * Edge-triggered interrupts need to resend any interrupt | ||
1245 | * that was delayed, but this is now handled in the device- | ||
1246 | * independent code. | ||
1247 | */ | ||
1248 | |||
1249 | /* | ||
1250 | * Starting up an edge-triggered IO-APIC interrupt is | ||
1251 | * nasty - we need to make sure that we get the edge. | ||
1252 | * If it is already asserted for some reason, we need to | ||
1253 | * return 1 to indicate that it was pending. | ||
1254 | * | ||
1255 | * This is not complete - we should be able to fake | ||
1256 | * an edge even if it isn't on the 8259A... | ||
1257 | */ | ||
1258 | |||
1259 | static unsigned int startup_edge_ioapic_irq(unsigned int irq) | ||
1260 | { | ||
1261 | int was_pending = 0; | ||
1262 | unsigned long flags; | ||
1263 | |||
1264 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1265 | if (irq < 16) { | ||
1266 | disable_8259A_irq(irq); | ||
1267 | if (i8259A_irq_pending(irq)) | ||
1268 | was_pending = 1; | ||
1269 | } | ||
1270 | __unmask_IO_APIC_irq(irq); | ||
1271 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1272 | |||
1273 | return was_pending; | ||
1274 | } | ||
1275 | |||
1276 | /* | ||
1277 | * Once we have recorded IRQ_PENDING already, we can mask the | ||
1278 | * interrupt for real. This prevents IRQ storms from unhandled | ||
1279 | * devices. | ||
1280 | */ | ||
1281 | static void ack_edge_ioapic_irq(unsigned int irq) | ||
1282 | { | ||
1283 | if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) | ||
1284 | == (IRQ_PENDING | IRQ_DISABLED)) | ||
1285 | mask_IO_APIC_irq(irq); | ||
1286 | ack_APIC_irq(); | ||
1287 | } | ||
1288 | |||
1289 | /* | ||
1290 | * Level triggered interrupts can just be masked, | ||
1291 | * and shutting down and starting up the interrupt | ||
1292 | * is the same as enabling and disabling them -- except | ||
1293 | * with a startup need to return a "was pending" value. | ||
1294 | * | ||
1295 | * Level triggered interrupts are special because we | ||
1296 | * do not touch any IO-APIC register while handling | ||
1297 | * them. We ack the APIC in the end-IRQ handler, not | ||
1298 | * in the start-IRQ-handler. Protection against reentrance | ||
1299 | * from the same interrupt is still provided, both by the | ||
1300 | * generic IRQ layer and by the fact that an unacked local | ||
1301 | * APIC does not accept IRQs. | ||
1302 | */ | ||
1303 | static unsigned int startup_level_ioapic_irq (unsigned int irq) | ||
1304 | { | ||
1305 | unmask_IO_APIC_irq(irq); | ||
1306 | |||
1307 | return 0; /* don't check for pending */ | ||
1308 | } | ||
1309 | |||
1310 | static void end_level_ioapic_irq (unsigned int irq) | ||
1311 | { | ||
1312 | ack_APIC_irq(); | ||
1313 | } | ||
1314 | |||
1315 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | ||
1316 | { | ||
1317 | unsigned long flags; | ||
1318 | unsigned int dest; | ||
1319 | |||
1320 | dest = cpu_mask_to_apicid(mask); | ||
1321 | |||
1322 | /* | ||
1323 | * Only the high 8 bits are valid. | ||
1324 | */ | ||
1325 | dest = SET_APIC_LOGICAL_ID(dest); | ||
1326 | |||
1327 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1328 | __DO_ACTION(1, = dest, ) | ||
1329 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1330 | } | ||
1331 | |||
1332 | #ifdef CONFIG_PCI_MSI | ||
1333 | static unsigned int startup_edge_ioapic_vector(unsigned int vector) | ||
1334 | { | ||
1335 | int irq = vector_to_irq(vector); | ||
1336 | |||
1337 | return startup_edge_ioapic_irq(irq); | ||
1338 | } | ||
1339 | |||
1340 | static void ack_edge_ioapic_vector(unsigned int vector) | ||
1341 | { | ||
1342 | int irq = vector_to_irq(vector); | ||
1343 | |||
1344 | ack_edge_ioapic_irq(irq); | ||
1345 | } | ||
1346 | |||
1347 | static unsigned int startup_level_ioapic_vector (unsigned int vector) | ||
1348 | { | ||
1349 | int irq = vector_to_irq(vector); | ||
1350 | |||
1351 | return startup_level_ioapic_irq (irq); | ||
1352 | } | ||
1353 | |||
1354 | static void end_level_ioapic_vector (unsigned int vector) | ||
1355 | { | ||
1356 | int irq = vector_to_irq(vector); | ||
1357 | |||
1358 | end_level_ioapic_irq(irq); | ||
1359 | } | ||
1360 | |||
1361 | static void mask_IO_APIC_vector (unsigned int vector) | ||
1362 | { | ||
1363 | int irq = vector_to_irq(vector); | ||
1364 | |||
1365 | mask_IO_APIC_irq(irq); | ||
1366 | } | ||
1367 | |||
1368 | static void unmask_IO_APIC_vector (unsigned int vector) | ||
1369 | { | ||
1370 | int irq = vector_to_irq(vector); | ||
1371 | |||
1372 | unmask_IO_APIC_irq(irq); | ||
1373 | } | ||
1374 | |||
1375 | static void set_ioapic_affinity_vector (unsigned int vector, | ||
1376 | cpumask_t cpu_mask) | ||
1377 | { | ||
1378 | int irq = vector_to_irq(vector); | ||
1379 | |||
1380 | set_ioapic_affinity_irq(irq, cpu_mask); | ||
1381 | } | ||
1382 | #endif | ||
1383 | |||
1384 | /* | ||
1385 | * Level and edge triggered IO-APIC interrupts need different handling, | ||
1386 | * so we use two separate IRQ descriptors. Edge triggered IRQs can be | ||
1387 | * handled with the level-triggered descriptor, but that one has slightly | ||
1388 | * more overhead. Level-triggered interrupts cannot be handled with the | ||
1389 | * edge-triggered handler, without risking IRQ storms and other ugly | ||
1390 | * races. | ||
1391 | */ | ||
1392 | |||
1393 | static struct hw_interrupt_type ioapic_edge_type = { | ||
1394 | .typename = "IO-APIC-edge", | ||
1395 | .startup = startup_edge_ioapic, | ||
1396 | .shutdown = shutdown_edge_ioapic, | ||
1397 | .enable = enable_edge_ioapic, | ||
1398 | .disable = disable_edge_ioapic, | ||
1399 | .ack = ack_edge_ioapic, | ||
1400 | .end = end_edge_ioapic, | ||
1401 | .set_affinity = set_ioapic_affinity, | ||
1402 | }; | ||
1403 | |||
1404 | static struct hw_interrupt_type ioapic_level_type = { | ||
1405 | .typename = "IO-APIC-level", | ||
1406 | .startup = startup_level_ioapic, | ||
1407 | .shutdown = shutdown_level_ioapic, | ||
1408 | .enable = enable_level_ioapic, | ||
1409 | .disable = disable_level_ioapic, | ||
1410 | .ack = mask_and_ack_level_ioapic, | ||
1411 | .end = end_level_ioapic, | ||
1412 | .set_affinity = set_ioapic_affinity, | ||
1413 | }; | ||
1414 | |||
1415 | static inline void init_IO_APIC_traps(void) | ||
1416 | { | ||
1417 | int irq; | ||
1418 | |||
1419 | /* | ||
1420 | * NOTE! The local APIC isn't very good at handling | ||
1421 | * multiple interrupts at the same interrupt level. | ||
1422 | * As the interrupt level is determined by taking the | ||
1423 | * vector number and shifting that right by 4, we | ||
1424 | * want to spread these out a bit so that they don't | ||
1425 | * all fall in the same interrupt level. | ||
1426 | * | ||
1427 | * Also, we've got to be careful not to trash gate | ||
1428 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
1429 | */ | ||
1430 | for (irq = 0; irq < NR_IRQS ; irq++) { | ||
1431 | int tmp = irq; | ||
1432 | if (use_pci_vector()) { | ||
1433 | if (!platform_legacy_irq(tmp)) | ||
1434 | if ((tmp = vector_to_irq(tmp)) == -1) | ||
1435 | continue; | ||
1436 | } | ||
1437 | if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { | ||
1438 | /* | ||
1439 | * Hmm.. We don't have an entry for this, | ||
1440 | * so default to an old-fashioned 8259 | ||
1441 | * interrupt if we can.. | ||
1442 | */ | ||
1443 | if (irq < 16) | ||
1444 | make_8259A_irq(irq); | ||
1445 | else | ||
1446 | /* Strange. Oh, well.. */ | ||
1447 | irq_desc[irq].handler = &no_irq_type; | ||
1448 | } | ||
1449 | } | ||
1450 | } | ||
1451 | |||
1452 | static void enable_lapic_irq (unsigned int irq) | ||
1453 | { | ||
1454 | unsigned long v; | ||
1455 | |||
1456 | v = apic_read(APIC_LVT0); | ||
1457 | apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); | ||
1458 | } | ||
1459 | |||
1460 | static void disable_lapic_irq (unsigned int irq) | ||
1461 | { | ||
1462 | unsigned long v; | ||
1463 | |||
1464 | v = apic_read(APIC_LVT0); | ||
1465 | apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); | ||
1466 | } | ||
1467 | |||
1468 | static void ack_lapic_irq (unsigned int irq) | ||
1469 | { | ||
1470 | ack_APIC_irq(); | ||
1471 | } | ||
1472 | |||
1473 | static void end_lapic_irq (unsigned int i) { /* nothing */ } | ||
1474 | |||
1475 | static struct hw_interrupt_type lapic_irq_type = { | ||
1476 | .typename = "local-APIC-edge", | ||
1477 | .startup = NULL, /* startup_irq() not used for IRQ0 */ | ||
1478 | .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ | ||
1479 | .enable = enable_lapic_irq, | ||
1480 | .disable = disable_lapic_irq, | ||
1481 | .ack = ack_lapic_irq, | ||
1482 | .end = end_lapic_irq, | ||
1483 | }; | ||
1484 | |||
1485 | static void setup_nmi (void) | ||
1486 | { | ||
1487 | /* | ||
1488 | * Dirty trick to enable the NMI watchdog ... | ||
1489 | * We put the 8259A master into AEOI mode and | ||
1490 | * unmask on all local APICs LVT0 as NMI. | ||
1491 | * | ||
1492 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | ||
1493 | * is from Maciej W. Rozycki - so we do not have to EOI from | ||
1494 | * the NMI handler or the timer interrupt. | ||
1495 | */ | ||
1496 | printk(KERN_INFO "activating NMI Watchdog ..."); | ||
1497 | |||
1498 | enable_NMI_through_LVT0(NULL); | ||
1499 | |||
1500 | printk(" done.\n"); | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * This looks a bit hackish, but it's about the only way of sending | ||
1505 | * a few INTA cycles to 8259As and any associated glue logic. ICR does | ||
1506 | * not support the ExtINT mode, unfortunately. We need to send these | ||
1507 | * cycles as some i82489DX-based boards have glue logic that keeps the | ||
1508 | * 8259A interrupt line asserted until INTA. --macro | ||
1509 | */ | ||
1510 | static inline void unlock_ExtINT_logic(void) | ||
1511 | { | ||
1512 | int pin, i; | ||
1513 | struct IO_APIC_route_entry entry0, entry1; | ||
1514 | unsigned char save_control, save_freq_select; | ||
1515 | unsigned long flags; | ||
1516 | |||
1517 | pin = find_isa_irq_pin(8, mp_INT); | ||
1518 | if (pin == -1) | ||
1519 | return; | ||
1520 | |||
1521 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1522 | *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); | ||
1523 | *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); | ||
1524 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1525 | clear_IO_APIC_pin(0, pin); | ||
1526 | |||
1527 | memset(&entry1, 0, sizeof(entry1)); | ||
1528 | |||
1529 | entry1.dest_mode = 0; /* physical delivery */ | ||
1530 | entry1.mask = 0; /* unmask IRQ now */ | ||
1531 | entry1.dest.physical.physical_dest = hard_smp_processor_id(); | ||
1532 | entry1.delivery_mode = dest_ExtINT; | ||
1533 | entry1.polarity = entry0.polarity; | ||
1534 | entry1.trigger = 0; | ||
1535 | entry1.vector = 0; | ||
1536 | |||
1537 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1538 | io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); | ||
1539 | io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); | ||
1540 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1541 | |||
1542 | save_control = CMOS_READ(RTC_CONTROL); | ||
1543 | save_freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
1544 | CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, | ||
1545 | RTC_FREQ_SELECT); | ||
1546 | CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); | ||
1547 | |||
1548 | i = 100; | ||
1549 | while (i-- > 0) { | ||
1550 | mdelay(10); | ||
1551 | if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) | ||
1552 | i -= 10; | ||
1553 | } | ||
1554 | |||
1555 | CMOS_WRITE(save_control, RTC_CONTROL); | ||
1556 | CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | ||
1557 | clear_IO_APIC_pin(0, pin); | ||
1558 | |||
1559 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1560 | io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); | ||
1561 | io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); | ||
1562 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1563 | } | ||
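/*
 * In short, the routine above temporarily points the IRQ8 (RTC) pin of
 * IO-APIC #0 at this CPU in ExtINT mode, programs the RTC for periodic
 * interrupts (rate select 0x6, about 1 kHz) and busy-waits up to roughly
 * a second while they are delivered, which generates the INTA cycles the
 * glue logic needs; the RTC settings and the original routing entry are
 * then restored.
 */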
1564 | |||
1565 | /* | ||
1566 | * This code may look a bit paranoid, but it's supposed to cooperate with | ||
1567 | * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | ||
1568 | * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | ||
1569 | * fanatically on his truly buggy board. | ||
1570 | */ | ||
1571 | static inline void check_timer(void) | ||
1572 | { | ||
1573 | int pin1, pin2; | ||
1574 | int vector; | ||
1575 | |||
1576 | /* | ||
1577 | * get/set the timer IRQ vector: | ||
1578 | */ | ||
1579 | disable_8259A_irq(0); | ||
1580 | vector = assign_irq_vector(0); | ||
1581 | set_intr_gate(vector, interrupt[0]); | ||
1582 | |||
1583 | /* | ||
1584 | * Subtle, code in do_timer_interrupt() expects an AEOI | ||
1585 | * mode for the 8259A whenever interrupts are routed | ||
1586 | * through I/O APICs. Also IRQ0 has to be enabled in | ||
1587 | * the 8259A which implies the virtual wire has to be | ||
1588 | * disabled in the local APIC. | ||
1589 | */ | ||
1590 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
1591 | init_8259A(1); | ||
1592 | enable_8259A_irq(0); | ||
1593 | |||
1594 | pin1 = find_isa_irq_pin(0, mp_INT); | ||
1595 | pin2 = find_isa_irq_pin(0, mp_ExtINT); | ||
1596 | |||
1597 | apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); | ||
1598 | |||
1599 | if (pin1 != -1) { | ||
1600 | /* | ||
1601 | * Ok, does IRQ0 through the IOAPIC work? | ||
1602 | */ | ||
1603 | unmask_IO_APIC_irq(0); | ||
1604 | if (timer_irq_works()) { | ||
1605 | nmi_watchdog_default(); | ||
1606 | if (nmi_watchdog == NMI_IO_APIC) { | ||
1607 | disable_8259A_irq(0); | ||
1608 | setup_nmi(); | ||
1609 | enable_8259A_irq(0); | ||
1610 | check_nmi_watchdog(); | ||
1611 | } | ||
1612 | return; | ||
1613 | } | ||
1614 | clear_IO_APIC_pin(0, pin1); | ||
1615 | apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); | ||
1616 | } | ||
1617 | |||
1618 | apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); | ||
1619 | if (pin2 != -1) { | ||
1620 | apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2); | ||
1621 | /* | ||
1622 | * legacy devices should be connected to IO APIC #0 | ||
1623 | */ | ||
1624 | setup_ExtINT_IRQ0_pin(pin2, vector); | ||
1625 | if (timer_irq_works()) { | ||
1626 | printk("works.\n"); | ||
1627 | nmi_watchdog_default(); | ||
1628 | if (nmi_watchdog == NMI_IO_APIC) { | ||
1629 | setup_nmi(); | ||
1630 | check_nmi_watchdog(); | ||
1631 | } | ||
1632 | return; | ||
1633 | } | ||
1634 | /* | ||
1635 | * Cleanup, just in case ... | ||
1636 | */ | ||
1637 | clear_IO_APIC_pin(0, pin2); | ||
1638 | } | ||
1639 | printk(" failed.\n"); | ||
1640 | |||
1641 | if (nmi_watchdog) { | ||
1642 | printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); | ||
1643 | nmi_watchdog = 0; | ||
1644 | } | ||
1645 | |||
1646 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); | ||
1647 | |||
1648 | disable_8259A_irq(0); | ||
1649 | irq_desc[0].handler = &lapic_irq_type; | ||
1650 | apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ | ||
1651 | enable_8259A_irq(0); | ||
1652 | |||
1653 | if (timer_irq_works()) { | ||
1654 | apic_printk(APIC_QUIET, " works.\n"); | ||
1655 | return; | ||
1656 | } | ||
1657 | apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); | ||
1658 | apic_printk(APIC_VERBOSE," failed.\n"); | ||
1659 | |||
1660 | apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); | ||
1661 | |||
1662 | init_8259A(0); | ||
1663 | make_8259A_irq(0); | ||
1664 | apic_write_around(APIC_LVT0, APIC_DM_EXTINT); | ||
1665 | |||
1666 | unlock_ExtINT_logic(); | ||
1667 | |||
1668 | if (timer_irq_works()) { | ||
1669 | apic_printk(APIC_VERBOSE," works.\n"); | ||
1670 | return; | ||
1671 | } | ||
1672 | apic_printk(APIC_VERBOSE," failed :(.\n"); | ||
1673 | panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); | ||
1674 | } | ||
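/*
 * Summary of the fallback order tried above; each step is only attempted
 * if the previous one fails the timer_irq_works() test:
 *
 *  1. IRQ0 through the IO-APIC pin reported by the MP table (pin1).
 *  2. IRQ0 through the ExtINT pin (pin2), i.e. the 8259A cascaded into
 *     the IO-APIC.
 *  3. The timer as a local APIC "Virtual Wire" interrupt (LVT0 in fixed
 *     mode, handled by lapic_irq_type).
 *  4. The timer as a plain ExtINT interrupt through the 8259A, after
 *     unlock_ExtINT_logic().
 *
 * If all of these fail, the kernel panics and suggests booting with
 * 'noapic'.
 */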
1675 | |||
1676 | /* | ||
1677 | * | ||
1678 | * IRQs that are handled by the PIC in the MPS IOAPIC case. | ||
1679 | * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ. | ||
1680 | * Linux doesn't really care, as it's not actually used | ||
1681 | * for any interrupt handling anyway. | ||
1682 | */ | ||
1683 | #define PIC_IRQS (1<<2) | ||
1684 | |||
1685 | void __init setup_IO_APIC(void) | ||
1686 | { | ||
1687 | enable_IO_APIC(); | ||
1688 | |||
1689 | if (acpi_ioapic) | ||
1690 | io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ | ||
1691 | else | ||
1692 | io_apic_irqs = ~PIC_IRQS; | ||
1693 | |||
1694 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); | ||
1695 | |||
1696 | /* | ||
1697 | * Set up the IO-APIC IRQ routing table. | ||
1698 | */ | ||
1699 | if (!acpi_ioapic) | ||
1700 | setup_ioapic_ids_from_mpc(); | ||
1701 | sync_Arb_IDs(); | ||
1702 | setup_IO_APIC_irqs(); | ||
1703 | init_IO_APIC_traps(); | ||
1704 | check_timer(); | ||
1705 | if (!acpi_ioapic) | ||
1706 | print_IO_APIC(); | ||
1707 | } | ||
1708 | |||
1709 | struct sysfs_ioapic_data { | ||
1710 | struct sys_device dev; | ||
1711 | struct IO_APIC_route_entry entry[0]; | ||
1712 | }; | ||
1713 | static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; | ||
1714 | |||
1715 | static int ioapic_suspend(struct sys_device *dev, u32 state) | ||
1716 | { | ||
1717 | struct IO_APIC_route_entry *entry; | ||
1718 | struct sysfs_ioapic_data *data; | ||
1719 | unsigned long flags; | ||
1720 | int i; | ||
1721 | |||
1722 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
1723 | entry = data->entry; | ||
1724 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1725 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | ||
1726 | *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); | ||
1727 | *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); | ||
1728 | } | ||
1729 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1730 | |||
1731 | return 0; | ||
1732 | } | ||
1733 | |||
1734 | static int ioapic_resume(struct sys_device *dev) | ||
1735 | { | ||
1736 | struct IO_APIC_route_entry *entry; | ||
1737 | struct sysfs_ioapic_data *data; | ||
1738 | unsigned long flags; | ||
1739 | union IO_APIC_reg_00 reg_00; | ||
1740 | int i; | ||
1741 | |||
1742 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
1743 | entry = data->entry; | ||
1744 | |||
1745 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1746 | reg_00.raw = io_apic_read(dev->id, 0); | ||
1747 | if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { | ||
1748 | reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; | ||
1749 | io_apic_write(dev->id, 0, reg_00.raw); | ||
1750 | } | ||
1751 | for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { | ||
1752 | io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); | ||
1753 | io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); | ||
1754 | } | ||
1755 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1756 | |||
1757 | return 0; | ||
1758 | } | ||
1759 | |||
1760 | static struct sysdev_class ioapic_sysdev_class = { | ||
1761 | set_kset_name("ioapic"), | ||
1762 | .suspend = ioapic_suspend, | ||
1763 | .resume = ioapic_resume, | ||
1764 | }; | ||
1765 | |||
1766 | static int __init ioapic_init_sysfs(void) | ||
1767 | { | ||
1768 | struct sys_device * dev; | ||
1769 | int i, size, error = 0; | ||
1770 | |||
1771 | error = sysdev_class_register(&ioapic_sysdev_class); | ||
1772 | if (error) | ||
1773 | return error; | ||
1774 | |||
1775 | for (i = 0; i < nr_ioapics; i++ ) { | ||
1776 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] | ||
1777 | * sizeof(struct IO_APIC_route_entry); | ||
1778 | mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); | ||
1779 | if (!mp_ioapic_data[i]) { | ||
1780 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1781 | continue; | ||
1782 | } | ||
1783 | memset(mp_ioapic_data[i], 0, size); | ||
1784 | dev = &mp_ioapic_data[i]->dev; | ||
1785 | dev->id = i; | ||
1786 | dev->cls = &ioapic_sysdev_class; | ||
1787 | error = sysdev_register(dev); | ||
1788 | if (error) { | ||
1789 | kfree(mp_ioapic_data[i]); | ||
1790 | mp_ioapic_data[i] = NULL; | ||
1791 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
1792 | continue; | ||
1793 | } | ||
1794 | } | ||
1795 | |||
1796 | return 0; | ||
1797 | } | ||
1798 | |||
1799 | device_initcall(ioapic_init_sysfs); | ||
1800 | |||
1801 | /* -------------------------------------------------------------------------- | ||
1802 | ACPI-based IOAPIC Configuration | ||
1803 | -------------------------------------------------------------------------- */ | ||
1804 | |||
1805 | #ifdef CONFIG_ACPI_BOOT | ||
1806 | |||
1807 | #define IO_APIC_MAX_ID 0xFE | ||
1808 | |||
1809 | int __init io_apic_get_unique_id (int ioapic, int apic_id) | ||
1810 | { | ||
1811 | union IO_APIC_reg_00 reg_00; | ||
1812 | static physid_mask_t apic_id_map; | ||
1813 | unsigned long flags; | ||
1814 | int i = 0; | ||
1815 | |||
1816 | /* | ||
1817 | * The P4 platform supports up to 256 APIC IDs on two separate APIC | ||
1818 | * buses (one for LAPICs, one for IOAPICs), where predecessors only | ||
1819 | * support up to 16 on one shared APIC bus. | ||
1820 | * | ||
1821 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | ||
1822 | * advantage of new APIC bus architecture. | ||
1823 | */ | ||
1824 | |||
1825 | if (physids_empty(apic_id_map)) | ||
1826 | apic_id_map = phys_cpu_present_map; | ||
1827 | |||
1828 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1829 | reg_00.raw = io_apic_read(ioapic, 0); | ||
1830 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1831 | |||
1832 | if (apic_id >= IO_APIC_MAX_ID) { | ||
1833 | apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " | ||
1834 | "%d\n", ioapic, apic_id, reg_00.bits.ID); | ||
1835 | apic_id = reg_00.bits.ID; | ||
1836 | } | ||
1837 | |||
1838 | /* | ||
1839 | * Every APIC in a system must have a unique ID or we get lots of nice | ||
1840 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
1841 | */ | ||
1842 | if (physid_isset(apic_id, apic_id_map)) { | ||
1843 | |||
1844 | for (i = 0; i < IO_APIC_MAX_ID; i++) { | ||
1845 | if (!physid_isset(i, apic_id_map)) | ||
1846 | break; | ||
1847 | } | ||
1848 | |||
1849 | if (i == IO_APIC_MAX_ID) | ||
1850 | panic("Max apic_id exceeded!\n"); | ||
1851 | |||
1852 | apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " | ||
1853 | "trying %d\n", ioapic, apic_id, i); | ||
1854 | |||
1855 | apic_id = i; | ||
1856 | } | ||
1857 | |||
1858 | physid_set(apic_id, apic_id_map); | ||
1859 | |||
1860 | if (reg_00.bits.ID != apic_id) { | ||
1861 | reg_00.bits.ID = apic_id; | ||
1862 | |||
1863 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1864 | io_apic_write(ioapic, 0, reg_00.raw); | ||
1865 | reg_00.raw = io_apic_read(ioapic, 0); | ||
1866 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1867 | |||
1868 | /* Sanity check */ | ||
1869 | if (reg_00.bits.ID != apic_id) | ||
1870 | panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); | ||
1871 | } | ||
1872 | |||
1873 | apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); | ||
1874 | |||
1875 | return apic_id; | ||
1876 | } | ||
1877 | |||
1878 | |||
1879 | int __init io_apic_get_version (int ioapic) | ||
1880 | { | ||
1881 | union IO_APIC_reg_01 reg_01; | ||
1882 | unsigned long flags; | ||
1883 | |||
1884 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1885 | reg_01.raw = io_apic_read(ioapic, 1); | ||
1886 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1887 | |||
1888 | return reg_01.bits.version; | ||
1889 | } | ||
1890 | |||
1891 | |||
1892 | int __init io_apic_get_redir_entries (int ioapic) | ||
1893 | { | ||
1894 | union IO_APIC_reg_01 reg_01; | ||
1895 | unsigned long flags; | ||
1896 | |||
1897 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1898 | reg_01.raw = io_apic_read(ioapic, 1); | ||
1899 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1900 | |||
1901 | return reg_01.bits.entries; | ||
1902 | } | ||
1903 | |||
1904 | |||
1905 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) | ||
1906 | { | ||
1907 | struct IO_APIC_route_entry entry; | ||
1908 | unsigned long flags; | ||
1909 | |||
1910 | if (!IO_APIC_IRQ(irq)) { | ||
1911 | apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
1912 | ioapic); | ||
1913 | return -EINVAL; | ||
1914 | } | ||
1915 | |||
1916 | /* | ||
1917 | * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. | ||
1918 | * Note that we mask (disable) IRQs now -- these get enabled when the | ||
1919 | * corresponding device driver registers for this IRQ. | ||
1920 | */ | ||
1921 | |||
1922 | memset(&entry,0,sizeof(entry)); | ||
1923 | |||
1924 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
1925 | entry.dest_mode = INT_DEST_MODE; | ||
1926 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
1927 | entry.trigger = edge_level; | ||
1928 | entry.polarity = active_high_low; | ||
1929 | entry.mask = 1; /* Disabled (masked) */ | ||
1930 | |||
1931 | /* | ||
1932 | * IRQs < 16 are already in the irq_2_pin[] map | ||
1933 | */ | ||
1934 | if (irq >= 16) | ||
1935 | add_pin_to_irq(irq, ioapic, pin); | ||
1936 | |||
1937 | entry.vector = assign_irq_vector(irq); | ||
1938 | |||
1939 | apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " | ||
1940 | "IRQ %d Mode:%i Active:%i)\n", ioapic, | ||
1941 | mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, | ||
1942 | edge_level, active_high_low); | ||
1943 | |||
1944 | ioapic_register_intr(irq, entry.vector, edge_level); | ||
1945 | |||
1946 | if (!ioapic && (irq < 16)) | ||
1947 | disable_8259A_irq(irq); | ||
1948 | |||
1949 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1950 | io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); | ||
1951 | io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); | ||
1952 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1953 | |||
1954 | return 0; | ||
1955 | } | ||
1956 | |||
1957 | #endif /*CONFIG_ACPI_BOOT*/ | ||
1958 | |||
1959 | |||
1960 | /* | ||
1961 | * This function is currently only a helper for the i386 smp boot process, where | ||
1962 | * we need to reprogram the ioredtbls to cater for the cpus which have come online, | ||
1963 | * so the mask in all cases should simply be TARGET_CPUS. | ||
1964 | */ | ||
1965 | void __init setup_ioapic_dest(void) | ||
1966 | { | ||
1967 | int pin, ioapic, irq, irq_entry; | ||
1968 | |||
1969 | if (skip_ioapic_setup == 1) | ||
1970 | return; | ||
1971 | |||
1972 | for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | ||
1973 | for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | ||
1974 | irq_entry = find_irq_entry(ioapic, pin, mp_INT); | ||
1975 | if (irq_entry == -1) | ||
1976 | continue; | ||
1977 | irq = pin_2_irq(irq_entry, ioapic, pin); | ||
1978 | set_ioapic_affinity_irq(irq, TARGET_CPUS); | ||
1979 | } | ||
1980 | |||
1981 | } | ||
1982 | } | ||
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c new file mode 100644 index 000000000000..cde0e868efe3 --- /dev/null +++ b/arch/x86_64/kernel/ioport.c | |||
@@ -0,0 +1,117 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ioport.c | ||
3 | * | ||
4 | * This contains the io-permission bitmap code - written by obz, with changes | ||
5 | * by Linus. | ||
6 | */ | ||
7 | |||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/errno.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/ioport.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/smp_lock.h> | ||
15 | #include <linux/stddef.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/thread_info.h> | ||
18 | |||
19 | /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | ||
20 | static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) | ||
21 | { | ||
22 | int i; | ||
23 | if (new_value) | ||
24 | for (i = base; i < base + extent; i++) | ||
25 | __set_bit(i, bitmap); | ||
26 | else | ||
27 | for (i = base; i < base + extent; i++) | ||
28 | clear_bit(i, bitmap); | ||
29 | } | ||
30 | |||
31 | /* | ||
32 | * this changes the io permissions bitmap in the current task. | ||
33 | */ | ||
34 | asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) | ||
35 | { | ||
36 | unsigned int i, max_long, bytes, bytes_updated; | ||
37 | struct thread_struct * t = ¤t->thread; | ||
38 | struct tss_struct * tss; | ||
39 | unsigned long *bitmap; | ||
40 | |||
41 | if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) | ||
42 | return -EINVAL; | ||
43 | if (turn_on && !capable(CAP_SYS_RAWIO)) | ||
44 | return -EPERM; | ||
45 | |||
46 | /* | ||
47 | * If it's the first ioperm() call in this thread's lifetime, set the | ||
48 | * IO bitmap up. ioperm() is much less timing critical than clone(), | ||
49 | * this is why we delay this operation until now: | ||
50 | */ | ||
51 | if (!t->io_bitmap_ptr) { | ||
52 | bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
53 | if (!bitmap) | ||
54 | return -ENOMEM; | ||
55 | |||
56 | memset(bitmap, 0xff, IO_BITMAP_BYTES); | ||
57 | t->io_bitmap_ptr = bitmap; | ||
58 | } | ||
59 | |||
60 | /* | ||
61 | * do it in the per-thread copy and in the TSS ... | ||
62 | * | ||
63 | * Disable preemption via get_cpu() - we must not switch away | ||
64 | * because the ->io_bitmap_max value must match the bitmap | ||
65 | * contents: | ||
66 | */ | ||
67 | tss = &per_cpu(init_tss, get_cpu()); | ||
68 | |||
69 | set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); | ||
70 | |||
71 | /* | ||
72 | * Search for a (possibly new) maximum. This is simple and stupid, | ||
73 | * to keep it obviously correct: | ||
74 | */ | ||
75 | max_long = 0; | ||
76 | for (i = 0; i < IO_BITMAP_LONGS; i++) | ||
77 | if (t->io_bitmap_ptr[i] != ~0UL) | ||
78 | max_long = i; | ||
79 | |||
80 | bytes = (max_long + 1) * sizeof(long); | ||
81 | bytes_updated = max(bytes, t->io_bitmap_max); | ||
82 | |||
83 | t->io_bitmap_max = bytes; | ||
84 | |||
85 | /* Update the TSS: */ | ||
86 | memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); | ||
87 | |||
88 | put_cpu(); | ||
89 | |||
90 | return 0; | ||
91 | } | ||
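/*
 * A minimal userspace sketch (illustrative only, assuming glibc's
 * <sys/io.h> wrapper and a process with CAP_SYS_RAWIO): enable direct
 * access to the three parallel-port registers at 0x378 and write a byte.
 *
 *	#include <sys/io.h>
 *
 *	int main(void)
 *	{
 *		if (ioperm(0x378, 3, 1))	// turn_on = 1: grant access
 *			return 1;
 *		outb(0x55, 0x378);		// now permitted from ring 3
 *		return 0;
 *	}
 */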
92 | |||
93 | /* | ||
94 | * sys_iopl has to be used when you want to access the IO ports | ||
95 | * beyond the 0x3ff range: to get the full 65536 ports bitmapped | ||
96 | * you'd need 8kB of bitmaps/process, which is a bit excessive. | ||
97 | * | ||
98 | * Here we just change the eflags value on the stack: we allow | ||
99 | * only the super-user to do it. This depends on the stack-layout | ||
100 | * on system-call entry - see also fork() and the signal handling | ||
101 | * code. | ||
102 | */ | ||
103 | |||
104 | asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) | ||
105 | { | ||
106 | unsigned int old = (regs->eflags >> 12) & 3; | ||
107 | |||
108 | if (level > 3) | ||
109 | return -EINVAL; | ||
110 | /* Trying to gain more privileges? */ | ||
111 | if (level > old) { | ||
112 | if (!capable(CAP_SYS_RAWIO)) | ||
113 | return -EPERM; | ||
114 | } | ||
115 | regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); | ||
116 | return 0; | ||
117 | } | ||
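/*
 * Corresponding userspace sketch (illustrative, assuming glibc's
 * <sys/io.h>): iopl(3) raises the I/O privilege level in EFLAGS so that
 * all 65536 ports become accessible, which is why the check above
 * requires CAP_SYS_RAWIO.
 *
 *	#include <sys/io.h>
 *
 *	if (iopl(3))
 *		return 1;	// needs root / CAP_SYS_RAWIO
 */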
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c new file mode 100644 index 000000000000..62b112e4deb4 --- /dev/null +++ b/arch/x86_64/kernel/irq.c | |||
@@ -0,0 +1,108 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/irq.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the lowest level x86_64-specific interrupt | ||
7 | * entry and irq statistics code. All the remaining irq logic is | ||
8 | * done by the generic kernel/irq/ code and in the | ||
9 | * x86_64-specific irq controller code. (e.g. i8259.c and | ||
10 | * io_apic.c.) | ||
11 | */ | ||
12 | |||
13 | #include <linux/kernel_stat.h> | ||
14 | #include <linux/interrupt.h> | ||
15 | #include <linux/seq_file.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/io_apic.h> | ||
19 | |||
20 | atomic_t irq_err_count; | ||
21 | #ifdef CONFIG_X86_IO_APIC | ||
22 | #ifdef APIC_MISMATCH_DEBUG | ||
23 | atomic_t irq_mis_count; | ||
24 | #endif | ||
25 | #endif | ||
26 | |||
27 | /* | ||
28 | * Generic, controller-independent functions: | ||
29 | */ | ||
30 | |||
31 | int show_interrupts(struct seq_file *p, void *v) | ||
32 | { | ||
33 | int i = *(loff_t *) v, j; | ||
34 | struct irqaction * action; | ||
35 | unsigned long flags; | ||
36 | |||
37 | if (i == 0) { | ||
38 | seq_printf(p, " "); | ||
39 | for (j=0; j<NR_CPUS; j++) | ||
40 | if (cpu_online(j)) | ||
41 | seq_printf(p, "CPU%d ",j); | ||
42 | seq_putc(p, '\n'); | ||
43 | } | ||
44 | |||
45 | if (i < NR_IRQS) { | ||
46 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
47 | action = irq_desc[i].action; | ||
48 | if (!action) | ||
49 | goto skip; | ||
50 | seq_printf(p, "%3d: ",i); | ||
51 | #ifndef CONFIG_SMP | ||
52 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
53 | #else | ||
54 | for (j=0; j<NR_CPUS; j++) | ||
55 | if (cpu_online(j)) | ||
56 | seq_printf(p, "%10u ", | ||
57 | kstat_cpu(j).irqs[i]); | ||
58 | #endif | ||
59 | seq_printf(p, " %14s", irq_desc[i].handler->typename); | ||
60 | |||
61 | seq_printf(p, " %s", action->name); | ||
62 | for (action=action->next; action; action = action->next) | ||
63 | seq_printf(p, ", %s", action->name); | ||
64 | seq_putc(p, '\n'); | ||
65 | skip: | ||
66 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
67 | } else if (i == NR_IRQS) { | ||
68 | seq_printf(p, "NMI: "); | ||
69 | for (j = 0; j < NR_CPUS; j++) | ||
70 | if (cpu_online(j)) | ||
71 | seq_printf(p, "%10u ", cpu_pda[j].__nmi_count); | ||
72 | seq_putc(p, '\n'); | ||
73 | #ifdef CONFIG_X86_LOCAL_APIC | ||
74 | seq_printf(p, "LOC: "); | ||
75 | for (j = 0; j < NR_CPUS; j++) | ||
76 | if (cpu_online(j)) | ||
77 | seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs); | ||
78 | seq_putc(p, '\n'); | ||
79 | #endif | ||
80 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
81 | #ifdef CONFIG_X86_IO_APIC | ||
82 | #ifdef APIC_MISMATCH_DEBUG | ||
83 | seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | ||
84 | #endif | ||
85 | #endif | ||
86 | } | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * do_IRQ handles all normal device IRQ's (the special | ||
92 | * SMP cross-CPU interrupts have their own specific | ||
93 | * handlers). | ||
94 | */ | ||
95 | asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | ||
96 | { | ||
97 | /* high bits used in ret_from_ code */ | ||
98 | unsigned irq = regs->orig_rax & 0xff; | ||
99 | |||
100 | irq_enter(); | ||
101 | BUG_ON(irq > 256); | ||
102 | |||
103 | __do_IRQ(irq, regs); | ||
104 | irq_exit(); | ||
105 | |||
106 | return 1; | ||
107 | } | ||
108 | |||
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c new file mode 100644 index 000000000000..4f2a852299b6 --- /dev/null +++ b/arch/x86_64/kernel/kprobes.c | |||
@@ -0,0 +1,631 @@ | |||
1 | /* | ||
2 | * Kernel Probes (KProbes) | ||
3 | * arch/x86_64/kernel/kprobes.c | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
20 | * | ||
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | ||
22 | * Probes initial implementation ( includes contributions from | ||
23 | * Rusty Russell). | ||
24 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | ||
25 | * interface to access function arguments. | ||
26 | * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi | ||
27 | * <prasanna@in.ibm.com> adapted for x86_64 | ||
28 | * 2005-Mar Roland McGrath <roland@redhat.com> | ||
29 | * Fixed to handle %rip-relative addressing mode correctly. | ||
30 | */ | ||
31 | |||
32 | #include <linux/config.h> | ||
33 | #include <linux/kprobes.h> | ||
34 | #include <linux/ptrace.h> | ||
35 | #include <linux/spinlock.h> | ||
36 | #include <linux/string.h> | ||
37 | #include <linux/slab.h> | ||
38 | #include <linux/preempt.h> | ||
39 | #include <linux/moduleloader.h> | ||
40 | |||
41 | #include <asm/pgtable.h> | ||
42 | #include <asm/kdebug.h> | ||
43 | |||
44 | static DECLARE_MUTEX(kprobe_mutex); | ||
45 | |||
46 | /* kprobe_status settings */ | ||
47 | #define KPROBE_HIT_ACTIVE 0x00000001 | ||
48 | #define KPROBE_HIT_SS 0x00000002 | ||
49 | |||
50 | static struct kprobe *current_kprobe; | ||
51 | static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; | ||
52 | static struct pt_regs jprobe_saved_regs; | ||
53 | static long *jprobe_saved_rsp; | ||
54 | static kprobe_opcode_t *get_insn_slot(void); | ||
55 | static void free_insn_slot(kprobe_opcode_t *slot); | ||
56 | void jprobe_return_end(void); | ||
57 | |||
58 | /* copy of the kernel stack at the probe fire time */ | ||
59 | static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; | ||
60 | |||
61 | /* | ||
62 | * returns non-zero if opcode modifies the interrupt flag. | ||
63 | */ | ||
64 | static inline int is_IF_modifier(kprobe_opcode_t *insn) | ||
65 | { | ||
66 | switch (*insn) { | ||
67 | case 0xfa: /* cli */ | ||
68 | case 0xfb: /* sti */ | ||
69 | case 0xcf: /* iret/iretd */ | ||
70 | case 0x9d: /* popf/popfd */ | ||
71 | return 1; | ||
72 | } | ||
73 | |||
74 | if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) | ||
75 | return 1; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | int arch_prepare_kprobe(struct kprobe *p) | ||
80 | { | ||
81 | /* insn: must be on a special executable page on x86_64. */ | ||
82 | up(&kprobe_mutex); | ||
83 | p->ainsn.insn = get_insn_slot(); | ||
84 | down(&kprobe_mutex); | ||
85 | if (!p->ainsn.insn) { | ||
86 | return -ENOMEM; | ||
87 | } | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Determine if the instruction uses the %rip-relative addressing mode. | ||
93 | * If it does, return the address of the 32-bit displacement word. | ||
94 | * If not, return null. | ||
95 | */ | ||
96 | static inline s32 *is_riprel(u8 *insn) | ||
97 | { | ||
98 | #define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ | ||
99 | (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ | ||
100 | (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ | ||
101 | (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ | ||
102 | (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ | ||
103 | << (row % 64)) | ||
104 | static const u64 onebyte_has_modrm[256 / 64] = { | ||
105 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
106 | /* ------------------------------- */ | ||
107 | W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ | ||
108 | W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ | ||
109 | W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ | ||
110 | W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ | ||
111 | W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ | ||
112 | W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ | ||
113 | W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ | ||
114 | W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ | ||
115 | W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ | ||
116 | W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ | ||
117 | W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ | ||
118 | W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ | ||
119 | W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ | ||
120 | W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ | ||
121 | W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ | ||
122 | W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ | ||
123 | /* ------------------------------- */ | ||
124 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
125 | }; | ||
126 | static const u64 twobyte_has_modrm[256 / 64] = { | ||
127 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
128 | /* ------------------------------- */ | ||
129 | W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ | ||
130 | W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ | ||
131 | W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ | ||
132 | W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ | ||
133 | W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ | ||
134 | W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ | ||
135 | W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ | ||
136 | W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ | ||
137 | W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ | ||
138 | W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ | ||
139 | W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ | ||
140 | W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ | ||
141 | W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ | ||
142 | W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ | ||
143 | W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ | ||
144 | W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ | ||
145 | /* ------------------------------- */ | ||
146 | /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ | ||
147 | }; | ||
148 | #undef W | ||
149 | int need_modrm; | ||
150 | |||
151 | /* Skip legacy instruction prefixes. */ | ||
152 | while (1) { | ||
153 | switch (*insn) { | ||
154 | case 0x66: | ||
155 | case 0x67: | ||
156 | case 0x2e: | ||
157 | case 0x3e: | ||
158 | case 0x26: | ||
159 | case 0x64: | ||
160 | case 0x65: | ||
161 | case 0x36: | ||
162 | case 0xf0: | ||
163 | case 0xf3: | ||
164 | case 0xf2: | ||
165 | ++insn; | ||
166 | continue; | ||
167 | } | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | /* Skip REX instruction prefix. */ | ||
172 | if ((*insn & 0xf0) == 0x40) | ||
173 | ++insn; | ||
174 | |||
175 | if (*insn == 0x0f) { /* Two-byte opcode. */ | ||
176 | ++insn; | ||
177 | need_modrm = test_bit(*insn, twobyte_has_modrm); | ||
178 | } else { /* One-byte opcode. */ | ||
179 | need_modrm = test_bit(*insn, onebyte_has_modrm); | ||
180 | } | ||
181 | |||
182 | if (need_modrm) { | ||
183 | u8 modrm = *++insn; | ||
184 | if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ | ||
185 | /* Displacement follows ModRM byte. */ | ||
186 | return (s32 *) ++insn; | ||
187 | } | ||
188 | } | ||
189 | |||
190 | /* No %rip-relative addressing mode here. */ | ||
191 | return NULL; | ||
192 | } | ||
193 | |||
194 | void arch_copy_kprobe(struct kprobe *p) | ||
195 | { | ||
196 | s32 *ripdisp; | ||
197 | memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); | ||
198 | ripdisp = is_riprel(p->ainsn.insn); | ||
199 | if (ripdisp) { | ||
200 | /* | ||
201 | * The copied instruction uses the %rip-relative | ||
202 | * addressing mode. Adjust the displacement for the | ||
203 | * difference between the original location of this | ||
204 | * instruction and the location of the copy that will | ||
205 | * actually be run. The tricky bit here is making sure | ||
206 | * that the sign extension happens correctly in this | ||
207 | * calculation, since we need a signed 32-bit result to | ||
208 | * be sign-extended to 64 bits when it's added to the | ||
209 | * %rip value and yield the same 64-bit result that the | ||
210 | * sign-extension of the original signed 32-bit | ||
211 | * displacement would have given. | ||
212 | */ | ||
213 | s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; | ||
214 | BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ | ||
215 | *ripdisp = disp; | ||
216 | } | ||
217 | } | ||
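/*
 * Worked example of the displacement fix-up above (addresses are made up):
 * if the probed instruction sits at p->addr = 0xffffffff80100000 with an
 * original displacement of +0x1000, it references
 * 0xffffffff80100000 + insn_len + 0x1000. With the copy in the insn slot
 * at p->ainsn.insn = 0xffffffff80200000, the new displacement becomes
 *
 *	disp = p->addr + 0x1000 - p->ainsn.insn
 *	     = 0xffffffff80100000 + 0x1000 - 0xffffffff80200000
 *	     = -0xff000
 *
 * so the copy still references the same absolute address when executed
 * out of the slot; the BUG_ON() verifies the result still fits in a
 * signed 32-bit field.
 */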
218 | |||
219 | void arch_remove_kprobe(struct kprobe *p) | ||
220 | { | ||
221 | up(&kprobe_mutex); | ||
222 | free_insn_slot(p->ainsn.insn); | ||
223 | down(&kprobe_mutex); | ||
224 | } | ||
225 | |||
226 | static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) | ||
227 | { | ||
228 | *p->addr = p->opcode; | ||
229 | regs->rip = (unsigned long)p->addr; | ||
230 | } | ||
231 | |||
232 | static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) | ||
233 | { | ||
234 | regs->eflags |= TF_MASK; | ||
235 | regs->eflags &= ~IF_MASK; | ||
236 | /* single-step inline if the instruction is an int3 */ | ||
237 | if (p->opcode == BREAKPOINT_INSTRUCTION) | ||
238 | regs->rip = (unsigned long)p->addr; | ||
239 | else | ||
240 | regs->rip = (unsigned long)p->ainsn.insn; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Interrupts are disabled on entry as trap3 is an interrupt gate and they | ||
245 | * remain disabled throughout this function. | ||
246 | */ | ||
247 | int kprobe_handler(struct pt_regs *regs) | ||
248 | { | ||
249 | struct kprobe *p; | ||
250 | int ret = 0; | ||
251 | kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); | ||
252 | |||
253 | /* We're in an interrupt, but this is clear and BUG()-safe. */ | ||
254 | preempt_disable(); | ||
255 | |||
256 | /* Check we're not actually recursing */ | ||
257 | if (kprobe_running()) { | ||
258 | /* We *are* holding the lock here, so this is safe. | ||
259 | Disarm the probe we just hit, and ignore it. */ | ||
260 | p = get_kprobe(addr); | ||
261 | if (p) { | ||
262 | if (kprobe_status == KPROBE_HIT_SS) { | ||
263 | regs->eflags &= ~TF_MASK; | ||
264 | regs->eflags |= kprobe_saved_rflags; | ||
265 | unlock_kprobes(); | ||
266 | goto no_kprobe; | ||
267 | } | ||
268 | disarm_kprobe(p, regs); | ||
269 | ret = 1; | ||
270 | } else { | ||
271 | p = current_kprobe; | ||
272 | if (p->break_handler && p->break_handler(p, regs)) { | ||
273 | goto ss_probe; | ||
274 | } | ||
275 | } | ||
276 | /* If it's not ours, it can't be a delete race (we hold the lock). */ | ||
277 | goto no_kprobe; | ||
278 | } | ||
279 | |||
280 | lock_kprobes(); | ||
281 | p = get_kprobe(addr); | ||
282 | if (!p) { | ||
283 | unlock_kprobes(); | ||
284 | if (*addr != BREAKPOINT_INSTRUCTION) { | ||
285 | /* | ||
286 | * The breakpoint instruction was removed right | ||
287 | * after we hit it. Another cpu has removed | ||
288 | * either a probepoint or a debugger breakpoint | ||
289 | * at this address. In either case, no further | ||
290 | * handling of this interrupt is appropriate. | ||
291 | */ | ||
292 | ret = 1; | ||
293 | } | ||
294 | /* Not one of ours: let kernel handle it */ | ||
295 | goto no_kprobe; | ||
296 | } | ||
297 | |||
298 | kprobe_status = KPROBE_HIT_ACTIVE; | ||
299 | current_kprobe = p; | ||
300 | kprobe_saved_rflags = kprobe_old_rflags | ||
301 | = (regs->eflags & (TF_MASK | IF_MASK)); | ||
302 | if (is_IF_modifier(p->ainsn.insn)) | ||
303 | kprobe_saved_rflags &= ~IF_MASK; | ||
304 | |||
305 | if (p->pre_handler && p->pre_handler(p, regs)) | ||
306 | /* handler has already set things up, so skip ss setup */ | ||
307 | return 1; | ||
308 | |||
309 | ss_probe: | ||
310 | prepare_singlestep(p, regs); | ||
311 | kprobe_status = KPROBE_HIT_SS; | ||
312 | return 1; | ||
313 | |||
314 | no_kprobe: | ||
315 | preempt_enable_no_resched(); | ||
316 | return ret; | ||
317 | } | ||
318 | |||
319 | /* | ||
320 | * Called after single-stepping. p->addr is the address of the | ||
321 | * instruction whose first byte has been replaced by the "int 3" | ||
322 | * instruction. To avoid the SMP problems that can occur when we | ||
323 | * temporarily put back the original opcode to single-step, we | ||
324 | * single-stepped a copy of the instruction. The address of this | ||
325 | * copy is p->ainsn.insn. | ||
326 | * | ||
327 | * This function prepares to return from the post-single-step | ||
328 | * interrupt. We have to fix up the stack as follows: | ||
329 | * | ||
330 | * 0) Except in the case of absolute or indirect jump or call instructions, | ||
331 | * the new rip is relative to the copied instruction. We need to make | ||
332 | * it relative to the original instruction. | ||
333 | * | ||
334 | * 1) If the single-stepped instruction was pushfl, then the TF and IF | ||
335 | * flags are set in the just-pushed eflags, and may need to be cleared. | ||
336 | * | ||
337 | * 2) If the single-stepped instruction was a call, the return address | ||
338 | * that is atop the stack is the address following the copied instruction. | ||
339 | * We need to make it the address following the original instruction. | ||
340 | */ | ||
341 | static void resume_execution(struct kprobe *p, struct pt_regs *regs) | ||
342 | { | ||
343 | unsigned long *tos = (unsigned long *)regs->rsp; | ||
344 | unsigned long next_rip = 0; | ||
345 | unsigned long copy_rip = (unsigned long)p->ainsn.insn; | ||
346 | unsigned long orig_rip = (unsigned long)p->addr; | ||
347 | kprobe_opcode_t *insn = p->ainsn.insn; | ||
348 | |||
349 | /*skip the REX prefix*/ | ||
350 | if (*insn >= 0x40 && *insn <= 0x4f) | ||
351 | insn++; | ||
352 | |||
353 | switch (*insn) { | ||
354 | case 0x9c: /* pushfl */ | ||
355 | *tos &= ~(TF_MASK | IF_MASK); | ||
356 | *tos |= kprobe_old_rflags; | ||
357 | break; | ||
358 | case 0xe8: /* call relative - Fix return addr */ | ||
359 | *tos = orig_rip + (*tos - copy_rip); | ||
360 | break; | ||
361 | case 0xff: | ||
362 | if ((*insn & 0x30) == 0x10) { | ||
363 | /* call absolute, indirect */ | ||
364 | /* Fix return addr; rip is correct. */ | ||
365 | next_rip = regs->rip; | ||
366 | *tos = orig_rip + (*tos - copy_rip); | ||
367 | } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ | ||
368 | ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ | ||
369 | /* rip is correct. */ | ||
370 | next_rip = regs->rip; | ||
371 | } | ||
372 | break; | ||
373 | case 0xea: /* jmp absolute -- rip is correct */ | ||
374 | next_rip = regs->rip; | ||
375 | break; | ||
376 | default: | ||
377 | break; | ||
378 | } | ||
379 | |||
380 | regs->eflags &= ~TF_MASK; | ||
381 | if (next_rip) { | ||
382 | regs->rip = next_rip; | ||
383 | } else { | ||
384 | regs->rip = orig_rip + (regs->rip - copy_rip); | ||
385 | } | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * Interrupts are disabled on entry as trap1 is an interrupt gate and they | ||
390 | * remain disabled throughout this function. And we hold the kprobe lock. | ||
391 | */ | ||
392 | int post_kprobe_handler(struct pt_regs *regs) | ||
393 | { | ||
394 | if (!kprobe_running()) | ||
395 | return 0; | ||
396 | |||
397 | if (current_kprobe->post_handler) | ||
398 | current_kprobe->post_handler(current_kprobe, regs, 0); | ||
399 | |||
400 | resume_execution(current_kprobe, regs); | ||
401 | regs->eflags |= kprobe_saved_rflags; | ||
402 | |||
403 | unlock_kprobes(); | ||
404 | preempt_enable_no_resched(); | ||
405 | |||
406 | /* | ||
407 | * if somebody else is singlestepping across a probe point, eflags | ||
408 | * will have TF set, in which case, continue the remaining processing | ||
409 | * of do_debug, as if this is not a probe hit. | ||
410 | */ | ||
411 | if (regs->eflags & TF_MASK) | ||
412 | return 0; | ||
413 | |||
414 | return 1; | ||
415 | } | ||
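
Taken together, kprobe_handler() and post_kprobe_handler() are what a kprobes client exercises: the int3 handler runs the client's pre_handler before single-stepping the copied instruction, and the debug handler runs its post_handler once resume_execution() has fixed up rip. A minimal sketch of such a client, assuming the generic register_kprobe()/unregister_kprobe() API from kernel/kprobes.c; the probe address (normally taken from System.map or kallsyms) is deliberately left unset here:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Runs from kprobe_handler() above, before the probed instruction is
 * single-stepped out of its instruction slot. */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "pre_handler: rip=%lx rsp=%lx\n",
	       regs->rip, regs->rsp);
	return 0;	/* 0: continue with the normal single-step setup */
}

/* Runs from post_kprobe_handler() after resume_execution() fixed rip. */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
{
	printk(KERN_INFO "post_handler: eflags=%lx\n", regs->eflags);
}

static struct kprobe kp = {
	.pre_handler = handler_pre,
	.post_handler = handler_post,
	/* .addr must be set to the probed instruction's address;
	 * left NULL in this sketch. */
};

static int __init kprobe_example_init(void)
{
	if (!kp.addr)
		return -EINVAL;
	return register_kprobe(&kp);
}

static void __exit kprobe_example_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kprobe_example_init);
module_exit(kprobe_example_exit);
MODULE_LICENSE("GPL");
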
416 | |||
417 | /* Interrupts disabled, kprobe_lock held. */ | ||
418 | int kprobe_fault_handler(struct pt_regs *regs, int trapnr) | ||
419 | { | ||
420 | if (current_kprobe->fault_handler | ||
421 | && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) | ||
422 | return 1; | ||
423 | |||
424 | if (kprobe_status & KPROBE_HIT_SS) { | ||
425 | resume_execution(current_kprobe, regs); | ||
426 | regs->eflags |= kprobe_old_rflags; | ||
427 | |||
428 | unlock_kprobes(); | ||
429 | preempt_enable_no_resched(); | ||
430 | } | ||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Wrapper routine for handling exceptions. | ||
436 | */ | ||
437 | int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, | ||
438 | void *data) | ||
439 | { | ||
440 | struct die_args *args = (struct die_args *)data; | ||
441 | switch (val) { | ||
442 | case DIE_INT3: | ||
443 | if (kprobe_handler(args->regs)) | ||
444 | return NOTIFY_STOP; | ||
445 | break; | ||
446 | case DIE_DEBUG: | ||
447 | if (post_kprobe_handler(args->regs)) | ||
448 | return NOTIFY_STOP; | ||
449 | break; | ||
450 | case DIE_GPF: | ||
451 | if (kprobe_running() && | ||
452 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
453 | return NOTIFY_STOP; | ||
454 | break; | ||
455 | case DIE_PAGE_FAULT: | ||
456 | if (kprobe_running() && | ||
457 | kprobe_fault_handler(args->regs, args->trapnr)) | ||
458 | return NOTIFY_STOP; | ||
459 | break; | ||
460 | default: | ||
461 | break; | ||
462 | } | ||
463 | return NOTIFY_DONE; | ||
464 | } | ||
465 | |||
466 | int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
467 | { | ||
468 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
469 | unsigned long addr; | ||
470 | |||
471 | jprobe_saved_regs = *regs; | ||
472 | jprobe_saved_rsp = (long *) regs->rsp; | ||
473 | addr = (unsigned long)jprobe_saved_rsp; | ||
474 | /* | ||
475 | * As Linus pointed out, gcc assumes that the callee | ||
476 | * owns the argument space and could overwrite it, e.g. | ||
477 | * tailcall optimization. So, to be absolutely safe | ||
478 | * we also save and restore enough stack bytes to cover | ||
479 | * the argument area. | ||
480 | */ | ||
481 | memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); | ||
482 | regs->eflags &= ~IF_MASK; | ||
483 | regs->rip = (unsigned long)(jp->entry); | ||
484 | return 1; | ||
485 | } | ||
486 | |||
487 | void jprobe_return(void) | ||
488 | { | ||
489 | preempt_enable_no_resched(); | ||
490 | asm volatile (" xchg %%rbx,%%rsp \n" | ||
491 | " int3 \n" | ||
492 | " .globl jprobe_return_end \n" | ||
493 | " jprobe_return_end: \n" | ||
494 | " nop \n"::"b" | ||
495 | (jprobe_saved_rsp):"memory"); | ||
496 | } | ||
497 | |||
498 | int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
499 | { | ||
500 | u8 *addr = (u8 *) (regs->rip - 1); | ||
501 | unsigned long stack_addr = (unsigned long)jprobe_saved_rsp; | ||
502 | struct jprobe *jp = container_of(p, struct jprobe, kp); | ||
503 | |||
504 | if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { | ||
505 | if ((long *)regs->rsp != jprobe_saved_rsp) { | ||
506 | struct pt_regs *saved_regs = | ||
507 | container_of(jprobe_saved_rsp, struct pt_regs, rsp); | ||
508 | printk("current rsp %p does not match saved rsp %p\n", | ||
509 | (long *)regs->rsp, jprobe_saved_rsp); | ||
510 | printk("Saved registers for jprobe %p\n", jp); | ||
511 | show_registers(saved_regs); | ||
512 | printk("Current registers\n"); | ||
513 | show_registers(regs); | ||
514 | BUG(); | ||
515 | } | ||
516 | *regs = jprobe_saved_regs; | ||
517 | memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, | ||
518 | MIN_STACK_SIZE(stack_addr)); | ||
519 | return 1; | ||
520 | } | ||
521 | return 0; | ||
522 | } | ||
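
setjmp_pre_handler(), jprobe_return() and longjmp_break_handler() together implement jprobes on x86-64: the pre-handler redirects rip to a mirror function that sees the probed function's arguments, and the break handler triggered by jprobe_return() restores the saved registers and stack. A rough sketch of a jprobes client, assuming the generic register_jprobe()/unregister_jprobe() API; the do_fork()-style signature is used purely as an illustration and the probe address is again left to be filled in:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Mirror function: must have the same signature as the probed function so
 * that the arguments saved by setjmp_pre_handler() line up.  The signature
 * below mirrors do_fork() of this era, chosen only for illustration. */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     struct pt_regs *regs, unsigned long stack_size,
		     int __user *parent_tidptr, int __user *child_tidptr)
{
	printk(KERN_INFO "fork: clone_flags=%lx\n", clone_flags);
	jprobe_return();	/* mandatory: control never returns here */
	return 0;		/* not reached */
}

static struct jprobe my_jprobe = {
	.entry = (kprobe_opcode_t *)jdo_fork,
	/* .kp.addr must be set to the probed function's address, e.g. the
	 * do_fork value from System.map; left NULL in this sketch. */
};

static int __init jprobe_example_init(void)
{
	if (!my_jprobe.kp.addr)
		return -EINVAL;
	return register_jprobe(&my_jprobe);
}

static void __exit jprobe_example_exit(void)
{
	unregister_jprobe(&my_jprobe);
}

module_init(jprobe_example_init);
module_exit(jprobe_example_exit);
MODULE_LICENSE("GPL");
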
523 | |||
524 | /* | ||
525 | * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. | ||
526 | * By default on x86_64, pages we get from kmalloc or vmalloc are not | ||
527 | * executable. Single-stepping an instruction on such a page yields an | ||
528 | * oops. So instead of storing the instruction copies in their respective | ||
529 | * kprobe objects, we allocate a page, map it executable, and store all the | ||
530 | * instruction copies there. (We can allocate additional pages if somebody | ||
531 | * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE | ||
532 | * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) | ||
533 | * bytes. | ||
534 | */ | ||
535 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) | ||
536 | struct kprobe_insn_page { | ||
537 | struct hlist_node hlist; | ||
538 | kprobe_opcode_t *insns; /* page of instruction slots */ | ||
539 | char slot_used[INSNS_PER_PAGE]; | ||
540 | int nused; | ||
541 | }; | ||
542 | |||
543 | static struct hlist_head kprobe_insn_pages; | ||
544 | |||
545 | /** | ||
546 | * get_insn_slot() - Find a slot on an executable page for an instruction. | ||
547 | * We allocate an executable page if there's no room on existing ones. | ||
548 | */ | ||
549 | static kprobe_opcode_t *get_insn_slot(void) | ||
550 | { | ||
551 | struct kprobe_insn_page *kip; | ||
552 | struct hlist_node *pos; | ||
553 | |||
554 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
555 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
556 | if (kip->nused < INSNS_PER_PAGE) { | ||
557 | int i; | ||
558 | for (i = 0; i < INSNS_PER_PAGE; i++) { | ||
559 | if (!kip->slot_used[i]) { | ||
560 | kip->slot_used[i] = 1; | ||
561 | kip->nused++; | ||
562 | return kip->insns + (i*MAX_INSN_SIZE); | ||
563 | } | ||
564 | } | ||
565 | /* Surprise! No unused slots. Fix kip->nused. */ | ||
566 | kip->nused = INSNS_PER_PAGE; | ||
567 | } | ||
568 | } | ||
569 | |||
570 | /* All out of space. Need to allocate a new page. Use slot 0.*/ | ||
571 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | ||
572 | if (!kip) { | ||
573 | return NULL; | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * For the %rip-relative displacement fixups to be doable, we | ||
578 | * need our instruction copy to be within +/- 2GB of any data it | ||
579 | * might access via %rip. That is, within 2GB of where the | ||
580 | * kernel image and loaded module images reside. So we allocate | ||
581 | * a page in the module loading area. | ||
582 | */ | ||
583 | kip->insns = module_alloc(PAGE_SIZE); | ||
584 | if (!kip->insns) { | ||
585 | kfree(kip); | ||
586 | return NULL; | ||
587 | } | ||
588 | INIT_HLIST_NODE(&kip->hlist); | ||
589 | hlist_add_head(&kip->hlist, &kprobe_insn_pages); | ||
590 | memset(kip->slot_used, 0, INSNS_PER_PAGE); | ||
591 | kip->slot_used[0] = 1; | ||
592 | kip->nused = 1; | ||
593 | return kip->insns; | ||
594 | } | ||
595 | |||
596 | /** | ||
597 | * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). | ||
598 | */ | ||
599 | static void free_insn_slot(kprobe_opcode_t *slot) | ||
600 | { | ||
601 | struct kprobe_insn_page *kip; | ||
602 | struct hlist_node *pos; | ||
603 | |||
604 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
605 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
606 | if (kip->insns <= slot | ||
607 | && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { | ||
608 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | ||
609 | kip->slot_used[i] = 0; | ||
610 | kip->nused--; | ||
611 | if (kip->nused == 0) { | ||
612 | /* | ||
613 | * Page is no longer in use. Free it unless | ||
614 | * it's the last one. We keep the last one | ||
615 | * so as not to have to set it up again the | ||
616 | * next time somebody inserts a probe. | ||
617 | */ | ||
618 | hlist_del(&kip->hlist); | ||
619 | if (hlist_empty(&kprobe_insn_pages)) { | ||
620 | INIT_HLIST_NODE(&kip->hlist); | ||
621 | hlist_add_head(&kip->hlist, | ||
622 | &kprobe_insn_pages); | ||
623 | } else { | ||
624 | module_free(NULL, kip->insns); | ||
625 | kfree(kip); | ||
626 | } | ||
627 | } | ||
628 | return; | ||
629 | } | ||
630 | } | ||
631 | } | ||
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c new file mode 100644 index 000000000000..d7e5d0cf4285 --- /dev/null +++ b/arch/x86_64/kernel/ldt.c | |||
@@ -0,0 +1,253 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/ldt.c | ||
3 | * | ||
4 | * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds | ||
5 | * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright (C) 2002 Andi Kleen | ||
7 | * | ||
8 | * This handles calls from both 32bit and 64bit mode. | ||
9 | */ | ||
10 | |||
11 | #include <linux/errno.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/smp.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/vmalloc.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/system.h> | ||
22 | #include <asm/ldt.h> | ||
23 | #include <asm/desc.h> | ||
24 | #include <asm/proto.h> | ||
25 | |||
26 | #ifdef CONFIG_SMP /* avoids "defined but not used" warning */ | ||
27 | static void flush_ldt(void *null) | ||
28 | { | ||
29 | if (current->active_mm) | ||
30 | load_LDT(¤t->active_mm->context); | ||
31 | } | ||
32 | #endif | ||
33 | |||
34 | static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) | ||
35 | { | ||
36 | void *oldldt; | ||
37 | void *newldt; | ||
38 | unsigned oldsize; | ||
39 | |||
40 | if (mincount <= (unsigned)pc->size) | ||
41 | return 0; | ||
42 | oldsize = pc->size; | ||
43 | mincount = (mincount+511)&(~511); | ||
44 | if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
45 | newldt = vmalloc(mincount*LDT_ENTRY_SIZE); | ||
46 | else | ||
47 | newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); | ||
48 | |||
49 | if (!newldt) | ||
50 | return -ENOMEM; | ||
51 | |||
52 | if (oldsize) | ||
53 | memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); | ||
54 | oldldt = pc->ldt; | ||
55 | memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); | ||
56 | wmb(); | ||
57 | pc->ldt = newldt; | ||
58 | wmb(); | ||
59 | pc->size = mincount; | ||
60 | wmb(); | ||
61 | if (reload) { | ||
62 | #ifdef CONFIG_SMP | ||
63 | cpumask_t mask; | ||
64 | |||
65 | preempt_disable(); | ||
66 | mask = cpumask_of_cpu(smp_processor_id()); | ||
67 | load_LDT(pc); | ||
68 | if (!cpus_equal(current->mm->cpu_vm_mask, mask)) | ||
69 | smp_call_function(flush_ldt, NULL, 1, 1); | ||
70 | preempt_enable(); | ||
71 | #else | ||
72 | load_LDT(pc); | ||
73 | #endif | ||
74 | } | ||
75 | if (oldsize) { | ||
76 | if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
77 | vfree(oldldt); | ||
78 | else | ||
79 | kfree(oldldt); | ||
80 | } | ||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | ||
85 | { | ||
86 | int err = alloc_ldt(new, old->size, 0); | ||
87 | if (err < 0) | ||
88 | return err; | ||
89 | memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * we do not have to muck with descriptors here, that is | ||
95 | * done in switch_mm() as needed. | ||
96 | */ | ||
97 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | ||
98 | { | ||
99 | struct mm_struct * old_mm; | ||
100 | int retval = 0; | ||
101 | |||
102 | init_MUTEX(&mm->context.sem); | ||
103 | mm->context.size = 0; | ||
104 | old_mm = current->mm; | ||
105 | if (old_mm && old_mm->context.size > 0) { | ||
106 | down(&old_mm->context.sem); | ||
107 | retval = copy_ldt(&mm->context, &old_mm->context); | ||
108 | up(&old_mm->context.sem); | ||
109 | } | ||
110 | return retval; | ||
111 | } | ||
112 | |||
113 | /* | ||
114 | * | ||
115 | * Don't touch the LDT register - we're already in the next thread. | ||
116 | */ | ||
117 | void destroy_context(struct mm_struct *mm) | ||
118 | { | ||
119 | if (mm->context.size) { | ||
120 | if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) | ||
121 | vfree(mm->context.ldt); | ||
122 | else | ||
123 | kfree(mm->context.ldt); | ||
124 | mm->context.size = 0; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | static int read_ldt(void __user * ptr, unsigned long bytecount) | ||
129 | { | ||
130 | int err; | ||
131 | unsigned long size; | ||
132 | struct mm_struct * mm = current->mm; | ||
133 | |||
134 | if (!mm->context.size) | ||
135 | return 0; | ||
136 | if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) | ||
137 | bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; | ||
138 | |||
139 | down(&mm->context.sem); | ||
140 | size = mm->context.size*LDT_ENTRY_SIZE; | ||
141 | if (size > bytecount) | ||
142 | size = bytecount; | ||
143 | |||
144 | err = 0; | ||
145 | if (copy_to_user(ptr, mm->context.ldt, size)) | ||
146 | err = -EFAULT; | ||
147 | up(&mm->context.sem); | ||
148 | if (err < 0) | ||
149 | goto error_return; | ||
150 | if (size != bytecount) { | ||
151 | /* zero-fill the rest */ | ||
152 | if (clear_user(ptr+size, bytecount-size) != 0) { | ||
153 | err = -EFAULT; | ||
154 | goto error_return; | ||
155 | } | ||
156 | } | ||
157 | return bytecount; | ||
158 | error_return: | ||
159 | return err; | ||
160 | } | ||
161 | |||
162 | static int read_default_ldt(void __user * ptr, unsigned long bytecount) | ||
163 | { | ||
164 | /* Arbitrary number */ | ||
165 | /* x86-64 default LDT is all zeros */ | ||
166 | if (bytecount > 128) | ||
167 | bytecount = 128; | ||
168 | if (clear_user(ptr, bytecount)) | ||
169 | return -EFAULT; | ||
170 | return bytecount; | ||
171 | } | ||
172 | |||
173 | static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) | ||
174 | { | ||
175 | struct task_struct *me = current; | ||
176 | struct mm_struct * mm = me->mm; | ||
177 | __u32 entry_1, entry_2, *lp; | ||
178 | int error; | ||
179 | struct user_desc ldt_info; | ||
180 | |||
181 | error = -EINVAL; | ||
182 | |||
183 | if (bytecount != sizeof(ldt_info)) | ||
184 | goto out; | ||
185 | error = -EFAULT; | ||
186 | if (copy_from_user(&ldt_info, ptr, bytecount)) | ||
187 | goto out; | ||
188 | |||
189 | error = -EINVAL; | ||
190 | if (ldt_info.entry_number >= LDT_ENTRIES) | ||
191 | goto out; | ||
192 | if (ldt_info.contents == 3) { | ||
193 | if (oldmode) | ||
194 | goto out; | ||
195 | if (ldt_info.seg_not_present == 0) | ||
196 | goto out; | ||
197 | } | ||
198 | |||
199 | down(&mm->context.sem); | ||
200 | if (ldt_info.entry_number >= (unsigned)mm->context.size) { | ||
201 | error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); | ||
202 | if (error < 0) | ||
203 | goto out_unlock; | ||
204 | } | ||
205 | |||
206 | lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); | ||
207 | |||
208 | /* Allow LDTs to be cleared by the user. */ | ||
209 | if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { | ||
210 | if (oldmode || LDT_empty(&ldt_info)) { | ||
211 | entry_1 = 0; | ||
212 | entry_2 = 0; | ||
213 | goto install; | ||
214 | } | ||
215 | } | ||
216 | |||
217 | entry_1 = LDT_entry_a(&ldt_info); | ||
218 | entry_2 = LDT_entry_b(&ldt_info); | ||
219 | if (oldmode) | ||
220 | entry_2 &= ~(1 << 20); | ||
221 | |||
222 | /* Install the new entry ... */ | ||
223 | install: | ||
224 | *lp = entry_1; | ||
225 | *(lp+1) = entry_2; | ||
226 | error = 0; | ||
227 | |||
228 | out_unlock: | ||
229 | up(&mm->context.sem); | ||
230 | out: | ||
231 | return error; | ||
232 | } | ||
233 | |||
234 | asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) | ||
235 | { | ||
236 | int ret = -ENOSYS; | ||
237 | |||
238 | switch (func) { | ||
239 | case 0: | ||
240 | ret = read_ldt(ptr, bytecount); | ||
241 | break; | ||
242 | case 1: | ||
243 | ret = write_ldt(ptr, bytecount, 1); | ||
244 | break; | ||
245 | case 2: | ||
246 | ret = read_default_ldt(ptr, bytecount); | ||
247 | break; | ||
248 | case 0x11: | ||
249 | ret = write_ldt(ptr, bytecount, 0); | ||
250 | break; | ||
251 | } | ||
252 | return ret; | ||
253 | } | ||
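
For reference, the sys_modify_ldt() multiplexer above is normally driven from userspace. A hedged sketch of a caller that installs one data segment through the new-mode write path (func 0x11); it assumes the usual userspace <asm/ldt.h> header and the modify_ldt syscall number being available as SYS_modify_ldt, and the base/limit values are purely illustrative:

/* Userspace sketch, not part of the kernel patch. */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/ldt.h>		/* struct user_desc, MODIFY_LDT_CONTENTS_DATA */

int main(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = 0;		/* LDT slot to fill */
	desc.base_addr = 0x1000;	/* illustrative values only */
	desc.limit = 0xfff;
	desc.seg_32bit = 1;
	desc.contents = MODIFY_LDT_CONTENTS_DATA;

	/* func 0x11: write_ldt() in new mode; bytecount must equal sizeof(desc) */
	if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0) {
		perror("modify_ldt");
		return 1;
	}
	printf("LDT entry %u installed\n", desc.entry_number);
	return 0;
}
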
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c new file mode 100644 index 000000000000..86f9fd85016a --- /dev/null +++ b/arch/x86_64/kernel/mce.c | |||
@@ -0,0 +1,548 @@ | |||
1 | /* | ||
2 | * Machine check handler. | ||
3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | * Rest from unknown author(s). | ||
5 | * 2004 Andi Kleen. Rewrote most of it. | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/rcupdate.h> | ||
14 | #include <linux/kallsyms.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/miscdevice.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <asm/processor.h> | ||
19 | #include <asm/msr.h> | ||
20 | #include <asm/mce.h> | ||
21 | #include <asm/kdebug.h> | ||
22 | #include <asm/uaccess.h> | ||
23 | |||
24 | #define MISC_MCELOG_MINOR 227 | ||
25 | #define NR_BANKS 5 | ||
26 | |||
27 | static int mce_dont_init; | ||
28 | |||
29 | /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, | ||
30 | 3: never panic or exit (for testing only) */ | ||
31 | static int tolerant = 1; | ||
32 | static int banks; | ||
33 | static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; | ||
34 | static unsigned long console_logged; | ||
35 | static int notify_user; | ||
36 | |||
37 | /* | ||
38 | * Lockless MCE logging infrastructure. | ||
39 | * This avoids deadlocks on printk locks without having to break locks. Also | ||
40 | * separate MCEs from kernel messages to avoid bogus bug reports. | ||
41 | */ | ||
42 | |||
43 | struct mce_log mcelog = { | ||
44 | MCE_LOG_SIGNATURE, | ||
45 | MCE_LOG_LEN, | ||
46 | }; | ||
47 | |||
48 | void mce_log(struct mce *mce) | ||
49 | { | ||
50 | unsigned next, entry; | ||
51 | mce->finished = 0; | ||
52 | smp_wmb(); | ||
53 | for (;;) { | ||
54 | entry = rcu_dereference(mcelog.next); | ||
55 | /* When the buffer fills up discard new entries. Assume | ||
56 | that the earlier errors are the more interesting. */ | ||
57 | if (entry >= MCE_LOG_LEN) { | ||
58 | set_bit(MCE_OVERFLOW, &mcelog.flags); | ||
59 | return; | ||
60 | } | ||
61 | /* Old left over entry. Skip. */ | ||
62 | if (mcelog.entry[entry].finished) | ||
63 | continue; | ||
64 | smp_rmb(); | ||
65 | next = entry + 1; | ||
66 | if (cmpxchg(&mcelog.next, entry, next) == entry) | ||
67 | break; | ||
68 | } | ||
69 | memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); | ||
70 | smp_wmb(); | ||
71 | mcelog.entry[entry].finished = 1; | ||
72 | smp_wmb(); | ||
73 | |||
74 | if (!test_and_set_bit(0, &console_logged)) | ||
75 | notify_user = 1; | ||
76 | } | ||
77 | |||
78 | static void print_mce(struct mce *m) | ||
79 | { | ||
80 | printk(KERN_EMERG "\n" | ||
81 | KERN_EMERG | ||
82 | "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", | ||
83 | m->cpu, m->mcgstatus, m->bank, m->status); | ||
84 | if (m->rip) { | ||
85 | printk(KERN_EMERG | ||
86 | "RIP%s %02x:<%016Lx> ", | ||
87 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
88 | m->cs, m->rip); | ||
89 | if (m->cs == __KERNEL_CS) | ||
90 | print_symbol("{%s}", m->rip); | ||
91 | printk("\n"); | ||
92 | } | ||
93 | printk(KERN_EMERG "TSC %Lx ", m->tsc); | ||
94 | if (m->addr) | ||
95 | printk("ADDR %Lx ", m->addr); | ||
96 | if (m->misc) | ||
97 | printk("MISC %Lx ", m->misc); | ||
98 | printk("\n"); | ||
99 | } | ||
100 | |||
101 | static void mce_panic(char *msg, struct mce *backup, unsigned long start) | ||
102 | { | ||
103 | int i; | ||
104 | oops_begin(); | ||
105 | for (i = 0; i < MCE_LOG_LEN; i++) { | ||
106 | unsigned long tsc = mcelog.entry[i].tsc; | ||
107 | if (time_before(tsc, start)) | ||
108 | continue; | ||
109 | print_mce(&mcelog.entry[i]); | ||
110 | if (backup && mcelog.entry[i].tsc == backup->tsc) | ||
111 | backup = NULL; | ||
112 | } | ||
113 | if (backup) | ||
114 | print_mce(backup); | ||
115 | if (tolerant >= 3) | ||
116 | printk("Fake panic: %s\n", msg); | ||
117 | else | ||
118 | panic(msg); | ||
119 | } | ||
120 | |||
121 | static int mce_available(struct cpuinfo_x86 *c) | ||
122 | { | ||
123 | return test_bit(X86_FEATURE_MCE, &c->x86_capability) && | ||
124 | test_bit(X86_FEATURE_MCA, &c->x86_capability); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * The actual machine check handler | ||
129 | */ | ||
130 | |||
131 | void do_machine_check(struct pt_regs * regs, long error_code) | ||
132 | { | ||
133 | struct mce m, panicm; | ||
134 | int nowayout = (tolerant < 1); | ||
135 | int kill_it = 0; | ||
136 | u64 mcestart = 0; | ||
137 | int i; | ||
138 | int panicm_found = 0; | ||
139 | |||
140 | if (regs) | ||
141 | notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); | ||
142 | if (!banks) | ||
143 | return; | ||
144 | |||
145 | memset(&m, 0, sizeof(struct mce)); | ||
146 | m.cpu = hard_smp_processor_id(); | ||
147 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
148 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | ||
149 | kill_it = 1; | ||
150 | |||
151 | rdtscll(mcestart); | ||
152 | barrier(); | ||
153 | |||
154 | for (i = 0; i < banks; i++) { | ||
155 | if (!bank[i]) | ||
156 | continue; | ||
157 | |||
158 | m.misc = 0; | ||
159 | m.addr = 0; | ||
160 | m.bank = i; | ||
161 | m.tsc = 0; | ||
162 | |||
163 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
164 | if ((m.status & MCI_STATUS_VAL) == 0) | ||
165 | continue; | ||
166 | |||
167 | if (m.status & MCI_STATUS_EN) { | ||
168 | /* In theory _OVER could be a nowayout too, but | ||
169 | assume any overflowed errors were not fatal. */ | ||
170 | nowayout |= !!(m.status & MCI_STATUS_PCC); | ||
171 | kill_it |= !!(m.status & MCI_STATUS_UC); | ||
172 | } | ||
173 | |||
174 | if (m.status & MCI_STATUS_MISCV) | ||
175 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
176 | if (m.status & MCI_STATUS_ADDRV) | ||
177 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
178 | |||
179 | if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) { | ||
180 | m.rip = regs->rip; | ||
181 | m.cs = regs->cs; | ||
182 | } else { | ||
183 | m.rip = 0; | ||
184 | m.cs = 0; | ||
185 | } | ||
186 | |||
187 | if (error_code != -1) | ||
188 | rdtscll(m.tsc); | ||
189 | wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); | ||
190 | mce_log(&m); | ||
191 | |||
192 | /* Did this bank cause the exception? */ | ||
193 | /* Assume that the bank with uncorrectable errors did it, | ||
194 | and that there is only a single one. */ | ||
195 | if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { | ||
196 | panicm = m; | ||
197 | panicm_found = 1; | ||
198 | } | ||
199 | |||
200 | tainted |= TAINT_MACHINE_CHECK; | ||
201 | } | ||
202 | |||
203 | /* Never do anything final in the polling timer */ | ||
204 | if (!regs) | ||
205 | goto out; | ||
206 | |||
207 | /* If we didn't find an uncorrectable error, pick | ||
208 | the last one (shouldn't happen, just being safe). */ | ||
209 | if (!panicm_found) | ||
210 | panicm = m; | ||
211 | if (nowayout) | ||
212 | mce_panic("Machine check", &panicm, mcestart); | ||
213 | if (kill_it) { | ||
214 | int user_space = 0; | ||
215 | |||
216 | if (m.mcgstatus & MCG_STATUS_RIPV) | ||
217 | user_space = panicm.rip && (panicm.cs & 3); | ||
218 | |||
219 | /* When the machine was in user space and the CPU didn't get | ||
220 | confused, it's normally not necessary to panic, unless you | ||
221 | are paranoid (tolerant == 0). | ||
222 | |||
223 | RED-PEN could be more tolerant for MCEs in idle, | ||
224 | but most likely they occur at boot anyways, where | ||
225 | it is best to just halt the machine. */ | ||
226 | if ((!user_space && (panic_on_oops || tolerant < 2)) || | ||
227 | (unsigned)current->pid <= 1) | ||
228 | mce_panic("Uncorrected machine check", &panicm, mcestart); | ||
229 | |||
230 | /* do_exit takes an awful lot of locks and has a | ||
231 | slight risk of deadlocking. If you don't want that | ||
232 | don't set tolerant >= 2 */ | ||
233 | if (tolerant < 3) | ||
234 | do_exit(SIGBUS); | ||
235 | } | ||
236 | |||
237 | out: | ||
238 | /* Last thing done in the machine check exception to clear state. */ | ||
239 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * Periodic polling timer for "silent" machine check errors. | ||
244 | */ | ||
245 | |||
246 | static int check_interval = 5 * 60; /* 5 minutes */ | ||
247 | static void mcheck_timer(void *data); | ||
248 | static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); | ||
249 | |||
250 | static void mcheck_check_cpu(void *info) | ||
251 | { | ||
252 | if (mce_available(¤t_cpu_data)) | ||
253 | do_machine_check(NULL, 0); | ||
254 | } | ||
255 | |||
256 | static void mcheck_timer(void *data) | ||
257 | { | ||
258 | on_each_cpu(mcheck_check_cpu, NULL, 1, 1); | ||
259 | schedule_delayed_work(&mcheck_work, check_interval * HZ); | ||
260 | |||
261 | /* | ||
262 | * It's ok to read stale data here for notify_user and | ||
263 | * console_logged as we'll simply get the updated versions | ||
264 | * on the next mcheck_timer execution and atomic operations | ||
265 | * on console_logged act as synchronization for notify_user | ||
266 | * writes. | ||
267 | */ | ||
268 | if (notify_user && console_logged) { | ||
269 | notify_user = 0; | ||
270 | clear_bit(0, &console_logged); | ||
271 | printk(KERN_INFO "Machine check events logged\n"); | ||
272 | } | ||
273 | } | ||
274 | |||
275 | |||
276 | static __init int periodic_mcheck_init(void) | ||
277 | { | ||
278 | if (check_interval) | ||
279 | schedule_delayed_work(&mcheck_work, check_interval*HZ); | ||
280 | return 0; | ||
281 | } | ||
282 | __initcall(periodic_mcheck_init); | ||
283 | |||
284 | |||
285 | /* | ||
286 | * Initialize Machine Checks for a CPU. | ||
287 | */ | ||
288 | static void mce_init(void *dummy) | ||
289 | { | ||
290 | u64 cap; | ||
291 | int i; | ||
292 | |||
293 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
294 | banks = cap & 0xff; | ||
295 | if (banks > NR_BANKS) { | ||
296 | printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); | ||
297 | banks = NR_BANKS; | ||
298 | } | ||
299 | |||
300 | /* Log the machine checks left over from the previous reset. | ||
301 | This also clears all registers */ | ||
302 | do_machine_check(NULL, -1); | ||
303 | |||
304 | set_in_cr4(X86_CR4_MCE); | ||
305 | |||
306 | if (cap & MCG_CTL_P) | ||
307 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
308 | |||
309 | for (i = 0; i < banks; i++) { | ||
310 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
311 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
312 | } | ||
313 | } | ||
314 | |||
315 | /* Add per CPU specific workarounds here */ | ||
316 | static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) | ||
317 | { | ||
318 | /* This should be disabled by the BIOS, but isn't always */ | ||
319 | if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { | ||
320 | /* disable GART TBL walk error reporting, which trips off | ||
321 | incorrectly with the IOMMU & 3ware & Cerberus. */ | ||
322 | clear_bit(10, &bank[4]); | ||
323 | } | ||
324 | } | ||
325 | |||
326 | static void __init mce_cpu_features(struct cpuinfo_x86 *c) | ||
327 | { | ||
328 | switch (c->x86_vendor) { | ||
329 | case X86_VENDOR_INTEL: | ||
330 | mce_intel_feature_init(c); | ||
331 | break; | ||
332 | default: | ||
333 | break; | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Called for each booted CPU to set up machine checks. | ||
339 | * Must be called with preempt off. | ||
340 | */ | ||
341 | void __init mcheck_init(struct cpuinfo_x86 *c) | ||
342 | { | ||
343 | static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; | ||
344 | |||
345 | mce_cpu_quirks(c); | ||
346 | |||
347 | if (mce_dont_init || | ||
348 | cpu_test_and_set(smp_processor_id(), mce_cpus) || | ||
349 | !mce_available(c)) | ||
350 | return; | ||
351 | |||
352 | mce_init(NULL); | ||
353 | mce_cpu_features(c); | ||
354 | } | ||
355 | |||
356 | /* | ||
357 | * Character device to read and clear the MCE log. | ||
358 | */ | ||
359 | |||
360 | static void collect_tscs(void *data) | ||
361 | { | ||
362 | unsigned long *cpu_tsc = (unsigned long *)data; | ||
363 | rdtscll(cpu_tsc[smp_processor_id()]); | ||
364 | } | ||
365 | |||
366 | static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) | ||
367 | { | ||
368 | unsigned long cpu_tsc[NR_CPUS]; | ||
369 | static DECLARE_MUTEX(mce_read_sem); | ||
370 | unsigned next; | ||
371 | char __user *buf = ubuf; | ||
372 | int i, err; | ||
373 | |||
374 | down(&mce_read_sem); | ||
375 | next = rcu_dereference(mcelog.next); | ||
376 | |||
377 | /* Only supports full reads right now */ | ||
378 | if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { | ||
379 | up(&mce_read_sem); | ||
380 | return -EINVAL; | ||
381 | } | ||
382 | |||
383 | err = 0; | ||
384 | for (i = 0; i < next; i++) { | ||
385 | if (!mcelog.entry[i].finished) | ||
386 | continue; | ||
387 | smp_rmb(); | ||
388 | err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); | ||
389 | buf += sizeof(struct mce); | ||
390 | } | ||
391 | |||
392 | memset(mcelog.entry, 0, next * sizeof(struct mce)); | ||
393 | mcelog.next = 0; | ||
394 | |||
395 | synchronize_kernel(); | ||
396 | |||
397 | /* Collect entries that were still getting written before the synchronize. */ | ||
398 | |||
399 | on_each_cpu(collect_tscs, cpu_tsc, 1, 1); | ||
400 | for (i = next; i < MCE_LOG_LEN; i++) { | ||
401 | if (mcelog.entry[i].finished && | ||
402 | mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { | ||
403 | err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); | ||
404 | smp_rmb(); | ||
405 | buf += sizeof(struct mce); | ||
406 | memset(&mcelog.entry[i], 0, sizeof(struct mce)); | ||
407 | } | ||
408 | } | ||
409 | up(&mce_read_sem); | ||
410 | return err ? -EFAULT : buf - ubuf; | ||
411 | } | ||
412 | |||
413 | static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) | ||
414 | { | ||
415 | int __user *p = (int __user *)arg; | ||
416 | if (!capable(CAP_SYS_ADMIN)) | ||
417 | return -EPERM; | ||
418 | switch (cmd) { | ||
419 | case MCE_GET_RECORD_LEN: | ||
420 | return put_user(sizeof(struct mce), p); | ||
421 | case MCE_GET_LOG_LEN: | ||
422 | return put_user(MCE_LOG_LEN, p); | ||
423 | case MCE_GETCLEAR_FLAGS: { | ||
424 | unsigned flags; | ||
425 | do { | ||
426 | flags = mcelog.flags; | ||
427 | } while (cmpxchg(&mcelog.flags, flags, 0) != flags); | ||
428 | return put_user(flags, p); | ||
429 | } | ||
430 | default: | ||
431 | return -ENOTTY; | ||
432 | } | ||
433 | } | ||
434 | |||
435 | static struct file_operations mce_chrdev_ops = { | ||
436 | .read = mce_read, | ||
437 | .ioctl = mce_ioctl, | ||
438 | }; | ||
439 | |||
440 | static struct miscdevice mce_log_device = { | ||
441 | MISC_MCELOG_MINOR, | ||
442 | "mcelog", | ||
443 | &mce_chrdev_ops, | ||
444 | }; | ||
445 | |||
446 | /* | ||
447 | * Old style boot options parsing. Only for compatibility. | ||
448 | */ | ||
449 | |||
450 | static int __init mcheck_disable(char *str) | ||
451 | { | ||
452 | mce_dont_init = 1; | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | /* mce=off disables machine check. Note you can reenable it later | ||
457 | using sysfs */ | ||
458 | static int __init mcheck_enable(char *str) | ||
459 | { | ||
460 | if (!strcmp(str, "off")) | ||
461 | mce_dont_init = 1; | ||
462 | else | ||
463 | printk("mce= argument %s ignored. Please use /sys\n", str); | ||
464 | return 0; | ||
465 | } | ||
466 | |||
467 | __setup("nomce", mcheck_disable); | ||
468 | __setup("mce", mcheck_enable); | ||
469 | |||
470 | /* | ||
471 | * Sysfs support | ||
472 | */ | ||
473 | |||
474 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ | ||
475 | static int mce_resume(struct sys_device *dev) | ||
476 | { | ||
477 | on_each_cpu(mce_init, NULL, 1, 1); | ||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | /* Reinit MCEs after user configuration changes */ | ||
482 | static void mce_restart(void) | ||
483 | { | ||
484 | if (check_interval) | ||
485 | cancel_delayed_work(&mcheck_work); | ||
486 | /* Timer race is harmless here */ | ||
487 | on_each_cpu(mce_init, NULL, 1, 1); | ||
488 | if (check_interval) | ||
489 | schedule_delayed_work(&mcheck_work, check_interval*HZ); | ||
490 | } | ||
491 | |||
492 | static struct sysdev_class mce_sysclass = { | ||
493 | .resume = mce_resume, | ||
494 | set_kset_name("machinecheck"), | ||
495 | }; | ||
496 | |||
497 | static struct sys_device device_mce = { | ||
498 | .id = 0, | ||
499 | .cls = &mce_sysclass, | ||
500 | }; | ||
501 | |||
502 | /* Why are there no generic functions for this? */ | ||
503 | #define ACCESSOR(name, var, start) \ | ||
504 | static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ | ||
505 | return sprintf(buf, "%lx\n", (unsigned long)var); \ | ||
506 | } \ | ||
507 | static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ | ||
508 | char *end; \ | ||
509 | unsigned long new = simple_strtoul(buf, &end, 0); \ | ||
510 | if (end == buf) return -EINVAL; \ | ||
511 | var = new; \ | ||
512 | start; \ | ||
513 | return end-buf; \ | ||
514 | } \ | ||
515 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | ||
516 | |||
517 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | ||
518 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | ||
519 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | ||
520 | ACCESSOR(bank3ctl,bank[3],mce_restart()) | ||
521 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | ||
522 | ACCESSOR(tolerant,tolerant,) | ||
523 | ACCESSOR(check_interval,check_interval,mce_restart()) | ||
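
Spelled out, the check_interval accessor generated by the ACCESSOR() macro above is roughly equivalent to:

static ssize_t show_check_interval(struct sys_device *s, char *buf) {
	return sprintf(buf, "%lx\n", (unsigned long)check_interval);
}
static ssize_t set_check_interval(struct sys_device *s, const char *buf, size_t siz) {
	char *end;
	unsigned long new = simple_strtoul(buf, &end, 0);
	if (end == buf) return -EINVAL;
	check_interval = new;
	mce_restart();
	return end-buf;
}
static SYSDEV_ATTR(check_interval, 0644, show_check_interval, set_check_interval);
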
524 | |||
525 | static __init int mce_init_device(void) | ||
526 | { | ||
527 | int err; | ||
528 | if (!mce_available(&boot_cpu_data)) | ||
529 | return -EIO; | ||
530 | err = sysdev_class_register(&mce_sysclass); | ||
531 | if (!err) | ||
532 | err = sysdev_register(&device_mce); | ||
533 | if (!err) { | ||
534 | /* could create per CPU objects, but it is not worth it. */ | ||
535 | sysdev_create_file(&device_mce, &attr_bank0ctl); | ||
536 | sysdev_create_file(&device_mce, &attr_bank1ctl); | ||
537 | sysdev_create_file(&device_mce, &attr_bank2ctl); | ||
538 | sysdev_create_file(&device_mce, &attr_bank3ctl); | ||
539 | sysdev_create_file(&device_mce, &attr_bank4ctl); | ||
540 | sysdev_create_file(&device_mce, &attr_tolerant); | ||
541 | sysdev_create_file(&device_mce, &attr_check_interval); | ||
542 | } | ||
543 | |||
544 | misc_register(&mce_log_device); | ||
545 | return err; | ||
546 | |||
547 | } | ||
548 | device_initcall(mce_init_device); | ||
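
The mcelog misc device registered above is meant to be drained from userspace; mce_read() only accepts a full-log read, which also clears the kernel-side buffer. A sketch of such a reader, assuming the struct mce and MCE_LOG_LEN definitions from <asm/mce.h> are visible to userspace and that /dev/mcelog exists as char device 10:227:

/* Userspace sketch, not part of the kernel patch. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <asm/mce.h>

int main(void)
{
	struct mce log[MCE_LOG_LEN];
	ssize_t len;
	int fd, i;

	fd = open("/dev/mcelog", O_RDONLY);
	if (fd < 0) {
		perror("open /dev/mcelog");
		return 1;
	}
	len = read(fd, log, sizeof(log));	/* also clears the kernel buffer */
	close(fd);
	if (len < 0) {
		perror("read");
		return 1;
	}
	for (i = 0; i < len / (ssize_t)sizeof(struct mce); i++)
		printf("CPU %d bank %d status %016llx addr %016llx\n",
		       log[i].cpu, log[i].bank,
		       (unsigned long long)log[i].status,
		       (unsigned long long)log[i].addr);
	return 0;
}
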
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c new file mode 100644 index 000000000000..4db9a640069f --- /dev/null +++ b/arch/x86_64/kernel/mce_intel.c | |||
@@ -0,0 +1,99 @@ | |||
1 | /* | ||
2 | * Intel specific MCE features. | ||
3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | ||
4 | */ | ||
5 | |||
6 | #include <linux/init.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/percpu.h> | ||
9 | #include <asm/processor.h> | ||
10 | #include <asm/msr.h> | ||
11 | #include <asm/mce.h> | ||
12 | #include <asm/hw_irq.h> | ||
13 | |||
14 | static DEFINE_PER_CPU(unsigned long, next_check); | ||
15 | |||
16 | asmlinkage void smp_thermal_interrupt(void) | ||
17 | { | ||
18 | struct mce m; | ||
19 | |||
20 | ack_APIC_irq(); | ||
21 | |||
22 | irq_enter(); | ||
23 | if (time_before(jiffies, __get_cpu_var(next_check))) | ||
24 | goto done; | ||
25 | |||
26 | __get_cpu_var(next_check) = jiffies + HZ*300; | ||
27 | memset(&m, 0, sizeof(m)); | ||
28 | m.cpu = smp_processor_id(); | ||
29 | m.bank = MCE_THERMAL_BANK; | ||
30 | rdtscll(m.tsc); | ||
31 | rdmsrl(MSR_IA32_THERM_STATUS, m.status); | ||
32 | if (m.status & 0x1) { | ||
33 | printk(KERN_EMERG | ||
34 | "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); | ||
35 | add_taint(TAINT_MACHINE_CHECK); | ||
36 | } else { | ||
37 | printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); | ||
38 | } | ||
39 | |||
40 | mce_log(&m); | ||
41 | done: | ||
42 | irq_exit(); | ||
43 | } | ||
44 | |||
45 | static void __init intel_init_thermal(struct cpuinfo_x86 *c) | ||
46 | { | ||
47 | u32 l, h; | ||
48 | int tm2 = 0; | ||
49 | unsigned int cpu = smp_processor_id(); | ||
50 | |||
51 | if (!cpu_has(c, X86_FEATURE_ACPI)) | ||
52 | return; | ||
53 | |||
54 | if (!cpu_has(c, X86_FEATURE_ACC)) | ||
55 | return; | ||
56 | |||
57 | /* first check if TM1 is already enabled by the BIOS, in which | ||
58 | * case there may be SMM code already handling it, so we should not | ||
59 | * install our own handler, since events may be delivered via SMI instead. | ||
60 | */ | ||
61 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
62 | h = apic_read(APIC_LVTTHMR); | ||
63 | if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { | ||
64 | printk(KERN_DEBUG | ||
65 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
66 | return; | ||
67 | } | ||
68 | |||
69 | if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) | ||
70 | tm2 = 1; | ||
71 | |||
72 | if (h & APIC_VECTOR_MASK) { | ||
73 | printk(KERN_DEBUG | ||
74 | "CPU%d: Thermal LVT vector (%#x) already " | ||
75 | "installed\n", cpu, (h & APIC_VECTOR_MASK)); | ||
76 | return; | ||
77 | } | ||
78 | |||
79 | h = THERMAL_APIC_VECTOR; | ||
80 | h |= (APIC_DM_FIXED | APIC_LVT_MASKED); | ||
81 | apic_write_around(APIC_LVTTHMR, h); | ||
82 | |||
83 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
84 | wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); | ||
85 | |||
86 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
87 | wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); | ||
88 | |||
89 | l = apic_read(APIC_LVTTHMR); | ||
90 | apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
91 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
92 | cpu, tm2 ? "TM2" : "TM1"); | ||
93 | return; | ||
94 | } | ||
95 | |||
96 | void __init mce_intel_feature_init(struct cpuinfo_x86 *c) | ||
97 | { | ||
98 | intel_init_thermal(c); | ||
99 | } | ||
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c new file mode 100644 index 000000000000..c2ffea8845ed --- /dev/null +++ b/arch/x86_64/kernel/module.c | |||
@@ -0,0 +1,166 @@ | |||
1 | /* Kernel module help for x86-64 | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 2 of the License, or | ||
8 | (at your option) any later version. | ||
9 | |||
10 | This program is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | GNU General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with this program; if not, write to the Free Software | ||
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #include <linux/moduleloader.h> | ||
20 | #include <linux/elf.h> | ||
21 | #include <linux/vmalloc.h> | ||
22 | #include <linux/fs.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/slab.h> | ||
26 | |||
27 | #include <asm/system.h> | ||
28 | #include <asm/page.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | |||
31 | #define DEBUGP(fmt...) | ||
32 | |||
33 | void module_free(struct module *mod, void *module_region) | ||
34 | { | ||
35 | vfree(module_region); | ||
36 | } | ||
37 | |||
38 | void *module_alloc(unsigned long size) | ||
39 | { | ||
40 | struct vm_struct *area; | ||
41 | |||
42 | if (!size) | ||
43 | return NULL; | ||
44 | size = PAGE_ALIGN(size); | ||
45 | if (size > MODULES_LEN) | ||
46 | return NULL; | ||
47 | |||
48 | area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); | ||
49 | if (!area) | ||
50 | return NULL; | ||
51 | |||
52 | return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); | ||
53 | } | ||
54 | |||
55 | /* We don't need anything special. */ | ||
56 | int module_frob_arch_sections(Elf_Ehdr *hdr, | ||
57 | Elf_Shdr *sechdrs, | ||
58 | char *secstrings, | ||
59 | struct module *mod) | ||
60 | { | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | int apply_relocate_add(Elf64_Shdr *sechdrs, | ||
65 | const char *strtab, | ||
66 | unsigned int symindex, | ||
67 | unsigned int relsec, | ||
68 | struct module *me) | ||
69 | { | ||
70 | unsigned int i; | ||
71 | Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; | ||
72 | Elf64_Sym *sym; | ||
73 | void *loc; | ||
74 | u64 val; | ||
75 | |||
76 | DEBUGP("Applying relocate section %u to %u\n", relsec, | ||
77 | sechdrs[relsec].sh_info); | ||
78 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | ||
79 | /* This is where to make the change */ | ||
80 | loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr | ||
81 | + rel[i].r_offset; | ||
82 | |||
83 | /* This is the symbol it is referring to. Note that all | ||
84 | undefined symbols have been resolved. */ | ||
85 | sym = (Elf64_Sym *)sechdrs[symindex].sh_addr | ||
86 | + ELF64_R_SYM(rel[i].r_info); | ||
87 | |||
88 | DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", | ||
89 | (int)ELF64_R_TYPE(rel[i].r_info), | ||
90 | sym->st_value, rel[i].r_addend, (u64)loc); | ||
91 | |||
92 | val = sym->st_value + rel[i].r_addend; | ||
93 | |||
94 | switch (ELF64_R_TYPE(rel[i].r_info)) { | ||
95 | case R_X86_64_NONE: | ||
96 | break; | ||
97 | case R_X86_64_64: | ||
98 | *(u64 *)loc = val; | ||
99 | break; | ||
100 | case R_X86_64_32: | ||
101 | *(u32 *)loc = val; | ||
102 | if (val != *(u32 *)loc) | ||
103 | goto overflow; | ||
104 | break; | ||
105 | case R_X86_64_32S: | ||
106 | *(s32 *)loc = val; | ||
107 | if ((s64)val != *(s32 *)loc) | ||
108 | goto overflow; | ||
109 | break; | ||
110 | case R_X86_64_PC32: | ||
111 | val -= (u64)loc; | ||
112 | *(u32 *)loc = val; | ||
113 | #if 0 | ||
114 | if ((s64)val != *(s32 *)loc) | ||
115 | goto overflow; | ||
116 | #endif | ||
117 | break; | ||
118 | default: | ||
119 | printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", | ||
120 | me->name, ELF64_R_TYPE(rel[i].r_info)); | ||
121 | return -ENOEXEC; | ||
122 | } | ||
123 | } | ||
124 | return 0; | ||
125 | |||
126 | overflow: | ||
127 | printk(KERN_ERR "overflow in relocation type %d val %Lx\n", | ||
128 | (int)ELF64_R_TYPE(rel[i].r_info), val); | ||
129 | printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", | ||
130 | me->name); | ||
131 | return -ENOEXEC; | ||
132 | } | ||
133 | |||
134 | int apply_relocate(Elf_Shdr *sechdrs, | ||
135 | const char *strtab, | ||
136 | unsigned int symindex, | ||
137 | unsigned int relsec, | ||
138 | struct module *me) | ||
139 | { | ||
140 | printk("non-add relocation not supported\n"); | ||
141 | return -ENOSYS; | ||
142 | } | ||
143 | |||
144 | extern void apply_alternatives(void *start, void *end); | ||
145 | |||
146 | int module_finalize(const Elf_Ehdr *hdr, | ||
147 | const Elf_Shdr *sechdrs, | ||
148 | struct module *me) | ||
149 | { | ||
150 | const Elf_Shdr *s; | ||
151 | char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
152 | |||
153 | /* look for .altinstructions to patch */ | ||
154 | for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { | ||
155 | void *seg; | ||
156 | if (strcmp(".altinstructions", secstrings + s->sh_name)) | ||
157 | continue; | ||
158 | seg = (void *)s->sh_addr; | ||
159 | apply_alternatives(seg, seg + s->sh_size); | ||
160 | } | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | void module_arch_cleanup(struct module *mod) | ||
165 | { | ||
166 | } | ||
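
The R_X86_64_32S case in apply_relocate_add() is the one that usually catches misbuilt modules: the relocated value must survive truncation to a sign-extended 32-bit field, which only holds for addresses within the kernel's -mcmodel=kernel range. A small standalone sketch of that round-trip test (not kernel code; the sample values are purely illustrative):

#include <stdio.h>
#include <stdint.h>

/* Mirrors the overflow check: store the low 32 bits, sign-extend them
 * back, and require the round trip to reproduce the original value. */
static int fits_32s(uint64_t val)
{
	int32_t truncated = (int32_t)val;		/* *(s32 *)loc = val       */
	return (int64_t)val == (int64_t)truncated;	/* !((s64)val != *(s32 *)loc) */
}

int main(void)
{
	printf("%d\n", fits_32s(0xffffffff80000000ULL));	/* 1: typical kernel-range address */
	printf("%d\n", fits_32s(0x0000000100000000ULL));	/* 0: would overflow, -ENOEXEC     */
	return 0;
}
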
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c new file mode 100644 index 000000000000..7ec031c6ca10 --- /dev/null +++ b/arch/x86_64/kernel/mpparse.c | |||
@@ -0,0 +1,949 @@ | |||
1 | /* | ||
2 | * Intel Multiprocessor Specification 1.1 and 1.4 | ||
3 | * compliant MP-table parsing routines. | ||
4 | * | ||
5 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
6 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes | ||
9 | * Erich Boleyn : MP v1.4 and additional changes. | ||
10 | * Alan Cox : Added EBDA scanning | ||
11 | * Ingo Molnar : various cleanups and rewrites | ||
12 | * Maciej W. Rozycki: Bits for default MP configurations | ||
13 | * Paul Diefenbaugh: Added full ACPI support | ||
14 | */ | ||
15 | |||
16 | #include <linux/mm.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/delay.h> | ||
20 | #include <linux/config.h> | ||
21 | #include <linux/bootmem.h> | ||
22 | #include <linux/smp_lock.h> | ||
23 | #include <linux/kernel_stat.h> | ||
24 | #include <linux/mc146818rtc.h> | ||
25 | #include <linux/acpi.h> | ||
26 | |||
27 | #include <asm/smp.h> | ||
28 | #include <asm/mtrr.h> | ||
29 | #include <asm/mpspec.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/io_apic.h> | ||
32 | #include <asm/proto.h> | ||
33 | |||
34 | /* Have we found an MP table */ | ||
35 | int smp_found_config; | ||
36 | unsigned int __initdata maxcpus = NR_CPUS; | ||
37 | |||
38 | int acpi_found_madt; | ||
39 | |||
40 | /* | ||
41 | * Various Linux-internal data structures created from the | ||
42 | * MP-table. | ||
43 | */ | ||
44 | int apic_version [MAX_APICS]; | ||
45 | unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | ||
46 | int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; | ||
47 | cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL }; | ||
48 | |||
49 | static int mp_current_pci_id = 0; | ||
50 | /* I/O APIC entries */ | ||
51 | struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; | ||
52 | |||
53 | /* # of MP IRQ source entries */ | ||
54 | struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | ||
55 | |||
56 | /* MP IRQ source entries */ | ||
57 | int mp_irq_entries; | ||
58 | |||
59 | int nr_ioapics; | ||
60 | int pic_mode; | ||
61 | unsigned long mp_lapic_addr = 0; | ||
62 | |||
63 | |||
64 | |||
65 | /* Processor that is doing the boot up */ | ||
66 | unsigned int boot_cpu_id = -1U; | ||
67 | /* Internal processor count */ | ||
68 | static unsigned int num_processors = 0; | ||
69 | |||
70 | /* Bitmask of physically existing CPUs */ | ||
71 | physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; | ||
72 | |||
73 | /* ACPI MADT entry parsing functions */ | ||
74 | #ifdef CONFIG_ACPI_BOOT | ||
75 | extern struct acpi_boot_flags acpi_boot; | ||
76 | #ifdef CONFIG_X86_LOCAL_APIC | ||
77 | extern int acpi_parse_lapic (acpi_table_entry_header *header); | ||
78 | extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); | ||
79 | extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); | ||
80 | #endif /*CONFIG_X86_LOCAL_APIC*/ | ||
81 | #ifdef CONFIG_X86_IO_APIC | ||
82 | extern int acpi_parse_ioapic (acpi_table_entry_header *header); | ||
83 | #endif /*CONFIG_X86_IO_APIC*/ | ||
84 | #endif /*CONFIG_ACPI_BOOT*/ | ||
85 | |||
86 | u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
87 | |||
88 | |||
89 | /* | ||
90 | * Intel MP BIOS table parsing routines: | ||
91 | */ | ||
92 | |||
93 | /* | ||
94 | * Checksum an MP configuration block. | ||
95 | */ | ||
96 | |||
97 | static int __init mpf_checksum(unsigned char *mp, int len) | ||
98 | { | ||
99 | int sum = 0; | ||
100 | |||
101 | while (len--) | ||
102 | sum += *mp++; | ||
103 | |||
104 | return sum & 0xFF; | ||
105 | } | ||
106 | |||
107 | static void __init MP_processor_info (struct mpc_config_processor *m) | ||
108 | { | ||
109 | int ver; | ||
110 | |||
111 | if (!(m->mpc_cpuflag & CPU_ENABLED)) | ||
112 | return; | ||
113 | |||
114 | printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", | ||
115 | m->mpc_apicid, | ||
116 | (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, | ||
117 | (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, | ||
118 | m->mpc_apicver); | ||
119 | |||
120 | if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { | ||
121 | Dprintk(" Bootup CPU\n"); | ||
122 | boot_cpu_id = m->mpc_apicid; | ||
123 | } | ||
124 | if (num_processors >= NR_CPUS) { | ||
125 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | ||
126 | " Processor ignored.\n", NR_CPUS); | ||
127 | return; | ||
128 | } | ||
129 | if (num_processors >= maxcpus) { | ||
130 | printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." | ||
131 | " Processor ignored.\n", maxcpus); | ||
132 | return; | ||
133 | } | ||
134 | |||
135 | num_processors++; | ||
136 | |||
137 | if (m->mpc_apicid > MAX_APICS) { | ||
138 | printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", | ||
139 | m->mpc_apicid, MAX_APICS); | ||
140 | return; | ||
141 | } | ||
142 | ver = m->mpc_apicver; | ||
143 | |||
144 | physid_set(m->mpc_apicid, phys_cpu_present_map); | ||
145 | /* | ||
146 | * Validate version | ||
147 | */ | ||
148 | if (ver == 0x0) { | ||
149 | printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); | ||
150 | ver = 0x10; | ||
151 | } | ||
152 | apic_version[m->mpc_apicid] = ver; | ||
153 | bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; | ||
154 | } | ||
155 | |||
156 | static void __init MP_bus_info (struct mpc_config_bus *m) | ||
157 | { | ||
158 | char str[7]; | ||
159 | |||
160 | memcpy(str, m->mpc_bustype, 6); | ||
161 | str[6] = 0; | ||
162 | Dprintk("Bus #%d is %s\n", m->mpc_busid, str); | ||
163 | |||
164 | if (strncmp(str, "ISA", 3) == 0) { | ||
165 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; | ||
166 | } else if (strncmp(str, "EISA", 4) == 0) { | ||
167 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; | ||
168 | } else if (strncmp(str, "PCI", 3) == 0) { | ||
169 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; | ||
170 | mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; | ||
171 | mp_current_pci_id++; | ||
172 | } else if (strncmp(str, "MCA", 3) == 0) { | ||
173 | mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; | ||
174 | } else { | ||
175 | printk(KERN_ERR "Unknown bustype %s\n", str); | ||
176 | } | ||
177 | } | ||
178 | |||
179 | static void __init MP_ioapic_info (struct mpc_config_ioapic *m) | ||
180 | { | ||
181 | if (!(m->mpc_flags & MPC_APIC_USABLE)) | ||
182 | return; | ||
183 | |||
184 | printk("I/O APIC #%d Version %d at 0x%X.\n", | ||
185 | m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); | ||
186 | if (nr_ioapics >= MAX_IO_APICS) { | ||
187 | printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", | ||
188 | MAX_IO_APICS, nr_ioapics); | ||
189 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
190 | } | ||
191 | if (!m->mpc_apicaddr) { | ||
192 | printk(KERN_ERR "WARNING: bogus zero I/O APIC address" | ||
193 | " found in MP table, skipping!\n"); | ||
194 | return; | ||
195 | } | ||
196 | mp_ioapics[nr_ioapics] = *m; | ||
197 | nr_ioapics++; | ||
198 | } | ||
199 | |||
200 | static void __init MP_intsrc_info (struct mpc_config_intsrc *m) | ||
201 | { | ||
202 | mp_irqs [mp_irq_entries] = *m; | ||
203 | Dprintk("Int: type %d, pol %d, trig %d, bus %d," | ||
204 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | ||
205 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
206 | (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, | ||
207 | m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); | ||
208 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
209 | panic("Max # of irq sources exceeded!!\n"); | ||
210 | } | ||
211 | |||
212 | static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) | ||
213 | { | ||
214 | Dprintk("Lint: type %d, pol %d, trig %d, bus %d," | ||
215 | " IRQ %02x, APIC ID %x, APIC LINT %02x\n", | ||
216 | m->mpc_irqtype, m->mpc_irqflag & 3, | ||
217 | (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, | ||
218 | m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); | ||
219 | /* | ||
220 | * Well it seems all SMP boards in existence | ||
221 | * use ExtINT/LVT1 == LINT0 and | ||
222 | * NMI/LVT2 == LINT1 - the following check | ||
223 | * will show us if this assumption is false. | ||
224 | * Until then we do not have to add baggage. | ||
225 | */ | ||
226 | if ((m->mpc_irqtype == mp_ExtINT) && | ||
227 | (m->mpc_destapiclint != 0)) | ||
228 | BUG(); | ||
229 | if ((m->mpc_irqtype == mp_NMI) && | ||
230 | (m->mpc_destapiclint != 1)) | ||
231 | BUG(); | ||
232 | } | ||
233 | |||
234 | /* | ||
235 | * Read/parse the MPC | ||
236 | */ | ||
237 | |||
238 | static int __init smp_read_mpc(struct mp_config_table *mpc) | ||
239 | { | ||
240 | char str[16]; | ||
241 | int count=sizeof(*mpc); | ||
242 | unsigned char *mpt=((unsigned char *)mpc)+count; | ||
243 | |||
244 | if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { | ||
245 | printk("SMP mptable: bad signature [%c%c%c%c]!\n", | ||
246 | mpc->mpc_signature[0], | ||
247 | mpc->mpc_signature[1], | ||
248 | mpc->mpc_signature[2], | ||
249 | mpc->mpc_signature[3]); | ||
250 | return 0; | ||
251 | } | ||
252 | if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { | ||
253 | printk("SMP mptable: checksum error!\n"); | ||
254 | return 0; | ||
255 | } | ||
256 | if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { | ||
257 | printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", | ||
258 | mpc->mpc_spec); | ||
259 | return 0; | ||
260 | } | ||
261 | if (!mpc->mpc_lapic) { | ||
262 | printk(KERN_ERR "SMP mptable: null local APIC address!\n"); | ||
263 | return 0; | ||
264 | } | ||
265 | memcpy(str,mpc->mpc_oem,8); | ||
266 | str[8]=0; | ||
267 | printk(KERN_INFO "OEM ID: %s ",str); | ||
268 | |||
269 | memcpy(str,mpc->mpc_productid,12); | ||
270 | str[12]=0; | ||
271 | printk(KERN_INFO "Product ID: %s ",str); | ||
272 | |||
273 | printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic); | ||
274 | |||
275 | /* save the local APIC address, it might be non-default */ | ||
276 | if (!acpi_lapic) | ||
277 | mp_lapic_addr = mpc->mpc_lapic; | ||
278 | |||
279 | /* | ||
280 | * Now process the configuration blocks. | ||
281 | */ | ||
282 | while (count < mpc->mpc_length) { | ||
283 | switch(*mpt) { | ||
284 | case MP_PROCESSOR: | ||
285 | { | ||
286 | struct mpc_config_processor *m= | ||
287 | (struct mpc_config_processor *)mpt; | ||
288 | if (!acpi_lapic) | ||
289 | MP_processor_info(m); | ||
290 | mpt += sizeof(*m); | ||
291 | count += sizeof(*m); | ||
292 | break; | ||
293 | } | ||
294 | case MP_BUS: | ||
295 | { | ||
296 | struct mpc_config_bus *m= | ||
297 | (struct mpc_config_bus *)mpt; | ||
298 | MP_bus_info(m); | ||
299 | mpt += sizeof(*m); | ||
300 | count += sizeof(*m); | ||
301 | break; | ||
302 | } | ||
303 | case MP_IOAPIC: | ||
304 | { | ||
305 | struct mpc_config_ioapic *m= | ||
306 | (struct mpc_config_ioapic *)mpt; | ||
307 | MP_ioapic_info(m); | ||
308 | mpt+=sizeof(*m); | ||
309 | count+=sizeof(*m); | ||
310 | break; | ||
311 | } | ||
312 | case MP_INTSRC: | ||
313 | { | ||
314 | struct mpc_config_intsrc *m= | ||
315 | (struct mpc_config_intsrc *)mpt; | ||
316 | |||
317 | MP_intsrc_info(m); | ||
318 | mpt+=sizeof(*m); | ||
319 | count+=sizeof(*m); | ||
320 | break; | ||
321 | } | ||
322 | case MP_LINTSRC: | ||
323 | { | ||
324 | struct mpc_config_lintsrc *m= | ||
325 | (struct mpc_config_lintsrc *)mpt; | ||
326 | MP_lintsrc_info(m); | ||
327 | mpt+=sizeof(*m); | ||
328 | count+=sizeof(*m); | ||
329 | break; | ||
330 | } | ||
331 | } | ||
332 | } | ||
333 | clustered_apic_check(); | ||
334 | if (!num_processors) | ||
335 | printk(KERN_ERR "SMP mptable: no processors registered!\n"); | ||
336 | return num_processors; | ||
337 | } | ||
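smp_read_mpc() above trusts an MP configuration table only when its signature, spec revision, and checksum all validate. mpf_checksum() itself is defined earlier in this file and is not shown here; as a minimal sketch of what the MP specification requires (all bytes of a structure sum to zero modulo 256), an equivalent check would look roughly like this:

/* Illustrative sketch only -- the real mpf_checksum() lives earlier in
 * mpparse.c.  The MP spec requires all bytes of a structure to sum to
 * zero mod 256, so any non-zero result marks the table as corrupt. */
static int mp_table_checksum(unsigned char *p, int len)
{
	int sum = 0;

	while (len--)
		sum += *p++;

	return sum & 0xFF;		/* 0 means the checksum is good */
}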
338 | |||
339 | static int __init ELCR_trigger(unsigned int irq) | ||
340 | { | ||
341 | unsigned int port; | ||
342 | |||
343 | port = 0x4d0 + (irq >> 3); | ||
344 | return (inb(port) >> (irq & 7)) & 1; | ||
345 | } | ||
346 | |||
347 | static void __init construct_default_ioirq_mptable(int mpc_default_type) | ||
348 | { | ||
349 | struct mpc_config_intsrc intsrc; | ||
350 | int i; | ||
351 | int ELCR_fallback = 0; | ||
352 | |||
353 | intsrc.mpc_type = MP_INTSRC; | ||
354 | intsrc.mpc_irqflag = 0; /* conforming */ | ||
355 | intsrc.mpc_srcbus = 0; | ||
356 | intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; | ||
357 | |||
358 | intsrc.mpc_irqtype = mp_INT; | ||
359 | |||
360 | /* | ||
361 | * If true, we have an ISA/PCI system with no IRQ entries | ||
362 | * in the MP table. To prevent the PCI interrupts from being set up | ||
363 | * incorrectly, we try to use the ELCR. The sanity check to see if | ||
364 | * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can | ||
365 | * never be level sensitive, so we simply see if the ELCR agrees. | ||
366 | * If it does, we assume it's valid. | ||
367 | */ | ||
368 | if (mpc_default_type == 5) { | ||
369 | printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); | ||
370 | |||
371 | if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) | ||
372 | printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); | ||
373 | else { | ||
374 | printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); | ||
375 | ELCR_fallback = 1; | ||
376 | } | ||
377 | } | ||
378 | |||
379 | for (i = 0; i < 16; i++) { | ||
380 | switch (mpc_default_type) { | ||
381 | case 2: | ||
382 | if (i == 0 || i == 13) | ||
383 | continue; /* IRQ0 & IRQ13 not connected */ | ||
384 | /* fall through */ | ||
385 | default: | ||
386 | if (i == 2) | ||
387 | continue; /* IRQ2 is never connected */ | ||
388 | } | ||
389 | |||
390 | if (ELCR_fallback) { | ||
391 | /* | ||
392 | * If the ELCR indicates a level-sensitive interrupt, we | ||
393 | * copy that information over to the MP table in the | ||
394 | * irqflag field (level sensitive, active high polarity). | ||
395 | */ | ||
396 | if (ELCR_trigger(i)) | ||
397 | intsrc.mpc_irqflag = 13; | ||
398 | else | ||
399 | intsrc.mpc_irqflag = 0; | ||
400 | } | ||
401 | |||
402 | intsrc.mpc_srcbusirq = i; | ||
403 | intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | ||
404 | MP_intsrc_info(&intsrc); | ||
405 | } | ||
406 | |||
407 | intsrc.mpc_irqtype = mp_ExtINT; | ||
408 | intsrc.mpc_srcbusirq = 0; | ||
409 | intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ | ||
410 | MP_intsrc_info(&intsrc); | ||
411 | } | ||
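The ELCR fallback above relies on ELCR_trigger(), which reads one bit per IRQ from the chipset's Edge/Level Control Registers at I/O ports 0x4d0 and 0x4d1 (a set bit means level-triggered). As a small illustration (this helper is not part of the file), the trigger mode of all 16 legacy IRQs could be dumped like so:

/* Hypothetical debug helper, not in the original file: print the ELCR
 * trigger mode of every legacy IRQ using ELCR_trigger() defined above. */
static void __init dump_elcr(void)
{
	int irq;

	for (irq = 0; irq < 16; irq++)
		printk(KERN_DEBUG "IRQ%2d: %s triggered\n", irq,
		       ELCR_trigger(irq) ? "level" : "edge");
}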
412 | |||
413 | static inline void __init construct_default_ISA_mptable(int mpc_default_type) | ||
414 | { | ||
415 | struct mpc_config_processor processor; | ||
416 | struct mpc_config_bus bus; | ||
417 | struct mpc_config_ioapic ioapic; | ||
418 | struct mpc_config_lintsrc lintsrc; | ||
419 | int linttypes[2] = { mp_ExtINT, mp_NMI }; | ||
420 | int i; | ||
421 | |||
422 | /* | ||
423 | * local APIC has default address | ||
424 | */ | ||
425 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
426 | |||
427 | /* | ||
428 | * 2 CPUs, numbered 0 & 1. | ||
429 | */ | ||
430 | processor.mpc_type = MP_PROCESSOR; | ||
431 | /* Either an integrated APIC or a discrete 82489DX. */ | ||
432 | processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
433 | processor.mpc_cpuflag = CPU_ENABLED; | ||
434 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
435 | (boot_cpu_data.x86_model << 4) | | ||
436 | boot_cpu_data.x86_mask; | ||
437 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
438 | processor.mpc_reserved[0] = 0; | ||
439 | processor.mpc_reserved[1] = 0; | ||
440 | for (i = 0; i < 2; i++) { | ||
441 | processor.mpc_apicid = i; | ||
442 | MP_processor_info(&processor); | ||
443 | } | ||
444 | |||
445 | bus.mpc_type = MP_BUS; | ||
446 | bus.mpc_busid = 0; | ||
447 | switch (mpc_default_type) { | ||
448 | default: | ||
449 | printk(KERN_ERR "???\nUnknown standard configuration %d\n", | ||
450 | mpc_default_type); | ||
451 | /* fall through */ | ||
452 | case 1: | ||
453 | case 5: | ||
454 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
455 | break; | ||
456 | case 2: | ||
457 | case 6: | ||
458 | case 3: | ||
459 | memcpy(bus.mpc_bustype, "EISA ", 6); | ||
460 | break; | ||
461 | case 4: | ||
462 | case 7: | ||
463 | memcpy(bus.mpc_bustype, "MCA ", 6); | ||
464 | } | ||
465 | MP_bus_info(&bus); | ||
466 | if (mpc_default_type > 4) { | ||
467 | bus.mpc_busid = 1; | ||
468 | memcpy(bus.mpc_bustype, "PCI ", 6); | ||
469 | MP_bus_info(&bus); | ||
470 | } | ||
471 | |||
472 | ioapic.mpc_type = MP_IOAPIC; | ||
473 | ioapic.mpc_apicid = 2; | ||
474 | ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; | ||
475 | ioapic.mpc_flags = MPC_APIC_USABLE; | ||
476 | ioapic.mpc_apicaddr = 0xFEC00000; | ||
477 | MP_ioapic_info(&ioapic); | ||
478 | |||
479 | /* | ||
480 | * We set up most of the low 16 IO-APIC pins according to MPS rules. | ||
481 | */ | ||
482 | construct_default_ioirq_mptable(mpc_default_type); | ||
483 | |||
484 | lintsrc.mpc_type = MP_LINTSRC; | ||
485 | lintsrc.mpc_irqflag = 0; /* conforming */ | ||
486 | lintsrc.mpc_srcbusid = 0; | ||
487 | lintsrc.mpc_srcbusirq = 0; | ||
488 | lintsrc.mpc_destapic = MP_APIC_ALL; | ||
489 | for (i = 0; i < 2; i++) { | ||
490 | lintsrc.mpc_irqtype = linttypes[i]; | ||
491 | lintsrc.mpc_destapiclint = i; | ||
492 | MP_lintsrc_info(&lintsrc); | ||
493 | } | ||
494 | } | ||
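For reference, the switch above follows the MP specification's table of "default configurations": types 1 and 5 describe an ISA machine, 2/3/6 EISA, 4/7 MCA, and every type above 4 additionally carries a PCI bus and an integrated (version 0x10) local APIC. A compact, purely illustrative summary of that mapping:

/* Illustrative summary of the MPS default-configuration types handled
 * above; not part of the original file. */
static const char *const mps_default_bus[8] = {
	[1] = "ISA",  [2] = "EISA", [3] = "EISA", [4] = "MCA",
	[5] = "ISA",  [6] = "EISA", [7] = "MCA",	/* 5-7 also add PCI */
};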
495 | |||
496 | static struct intel_mp_floating *mpf_found; | ||
497 | |||
498 | /* | ||
499 | * Scan the memory blocks for an SMP configuration block. | ||
500 | */ | ||
501 | void __init get_smp_config (void) | ||
502 | { | ||
503 | struct intel_mp_floating *mpf = mpf_found; | ||
504 | |||
505 | /* | ||
506 | * ACPI may be used to obtain the entire SMP configuration or just to | ||
507 | * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that | ||
508 | * ACPI supports both logical (e.g. Hyper-Threading) and physical | ||
509 | * processors, where MPS only supports physical. | ||
510 | */ | ||
511 | if (acpi_lapic && acpi_ioapic) { | ||
512 | printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); | ||
513 | return; | ||
514 | } | ||
515 | else if (acpi_lapic) | ||
516 | printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); | ||
517 | |||
518 | printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); | ||
519 | if (mpf->mpf_feature2 & (1<<7)) { | ||
520 | printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); | ||
521 | pic_mode = 1; | ||
522 | } else { | ||
523 | printk(KERN_INFO " Virtual Wire compatibility mode.\n"); | ||
524 | pic_mode = 0; | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Now see if we need to read further. | ||
529 | */ | ||
530 | if (mpf->mpf_feature1 != 0) { | ||
531 | |||
532 | printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); | ||
533 | construct_default_ISA_mptable(mpf->mpf_feature1); | ||
534 | |||
535 | } else if (mpf->mpf_physptr) { | ||
536 | |||
537 | /* | ||
538 | * Read the physical hardware table. Anything here will | ||
539 | * override the defaults. | ||
540 | */ | ||
541 | if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { | ||
542 | smp_found_config = 0; | ||
543 | printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); | ||
544 | printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); | ||
545 | return; | ||
546 | } | ||
547 | /* | ||
548 | * If there are no explicit MP IRQ entries, then we are | ||
549 | * broken. We set up most of the low 16 IO-APIC pins to | ||
550 | * ISA defaults and hope it will work. | ||
551 | */ | ||
552 | if (!mp_irq_entries) { | ||
553 | struct mpc_config_bus bus; | ||
554 | |||
555 | printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); | ||
556 | |||
557 | bus.mpc_type = MP_BUS; | ||
558 | bus.mpc_busid = 0; | ||
559 | memcpy(bus.mpc_bustype, "ISA ", 6); | ||
560 | MP_bus_info(&bus); | ||
561 | |||
562 | construct_default_ioirq_mptable(0); | ||
563 | } | ||
564 | |||
565 | } else | ||
566 | BUG(); | ||
567 | |||
568 | printk(KERN_INFO "Processors: %d\n", num_processors); | ||
569 | /* | ||
570 | * Only use the first configuration found. | ||
571 | */ | ||
572 | } | ||
573 | |||
574 | static int __init smp_scan_config (unsigned long base, unsigned long length) | ||
575 | { | ||
576 | extern void __bad_mpf_size(void); | ||
577 | unsigned int *bp = phys_to_virt(base); | ||
578 | struct intel_mp_floating *mpf; | ||
579 | |||
580 | Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); | ||
581 | if (sizeof(*mpf) != 16) | ||
582 | __bad_mpf_size(); | ||
583 | |||
584 | while (length > 0) { | ||
585 | mpf = (struct intel_mp_floating *)bp; | ||
586 | if ((*bp == SMP_MAGIC_IDENT) && | ||
587 | (mpf->mpf_length == 1) && | ||
588 | !mpf_checksum((unsigned char *)bp, 16) && | ||
589 | ((mpf->mpf_specification == 1) | ||
590 | || (mpf->mpf_specification == 4)) ) { | ||
591 | |||
592 | smp_found_config = 1; | ||
593 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); | ||
594 | if (mpf->mpf_physptr) | ||
595 | reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); | ||
596 | mpf_found = mpf; | ||
597 | return 1; | ||
598 | } | ||
599 | bp += 4; | ||
600 | length -= 16; | ||
601 | } | ||
602 | return 0; | ||
603 | } | ||
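smp_scan_config() accepts a 16-byte candidate only when its "_MP_" signature (SMP_MAGIC_IDENT), length field, checksum, and spec revision (MPS 1.1 or 1.4) all match. The authoritative struct intel_mp_floating is declared in <asm/mpspec.h>; as a sketch, the layout being matched follows the MP 1.4 specification:

/* Sketch of the MP floating pointer structure (field names per the
 * MP 1.4 spec; the real definition is struct intel_mp_floating in
 * <asm/mpspec.h>). */
struct mp_floating_sketch {
	char signature[4];		/* "_MP_" == SMP_MAGIC_IDENT        */
	unsigned int physptr;		/* phys addr of the MP config table */
	unsigned char length;		/* 1, in units of 16 bytes          */
	unsigned char specification;	/* 1 or 4 (MPS 1.1 / 1.4)           */
	unsigned char checksum;		/* all 16 bytes sum to 0 mod 256    */
	unsigned char feature1;		/* default config type, 0 if table  */
	unsigned char feature2;		/* bit 7: IMCR/PIC compat mode      */
	unsigned char feature3, feature4, feature5;
};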
604 | |||
605 | void __init find_intel_smp (void) | ||
606 | { | ||
607 | unsigned int address; | ||
608 | |||
609 | /* | ||
610 | * FIXME: Linux assumes you have 640K of base ram.. | ||
611 | * this continues the error... | ||
612 | * | ||
613 | * 1) Scan the bottom 1K for a signature | ||
614 | * 2) Scan the top 1K of base RAM | ||
615 | * 3) Scan the 64K of bios | ||
616 | */ | ||
617 | if (smp_scan_config(0x0,0x400) || | ||
618 | smp_scan_config(639*0x400,0x400) || | ||
619 | smp_scan_config(0xF0000,0x10000)) | ||
620 | return; | ||
621 | /* | ||
622 | * If it is an SMP machine we should know now, unless the | ||
623 | * configuration is in an EISA/MCA bus machine with an | ||
624 | * extended bios data area. | ||
625 | * | ||
626 | * There is a real-mode segmented pointer to the 4K EBDA | ||
627 | * area at 0x40E; calculate and scan it here. | ||
628 | * | ||
629 | * NOTE! There are Linux loaders that will corrupt the EBDA | ||
630 | * area, and as such this kind of SMP config may be less | ||
631 | * trustworthy, simply because the SMP table may have been | ||
632 | * stomped on during early boot. These loaders are buggy and | ||
633 | * should be fixed. | ||
634 | */ | ||
635 | |||
636 | address = *(unsigned short *)phys_to_virt(0x40E); | ||
637 | address <<= 4; | ||
638 | if (smp_scan_config(address, 0x1000)) | ||
639 | return; | ||
640 | |||
641 | /* If we have come this far, we did not find an MP table */ | ||
642 | printk(KERN_INFO "No mptable found.\n"); | ||
643 | } | ||
644 | |||
645 | /* | ||
646 | * - Intel MP Configuration Table | ||
647 | */ | ||
648 | void __init find_smp_config (void) | ||
649 | { | ||
650 | #ifdef CONFIG_X86_LOCAL_APIC | ||
651 | find_intel_smp(); | ||
652 | #endif | ||
653 | } | ||
654 | |||
655 | |||
656 | /* -------------------------------------------------------------------------- | ||
657 | ACPI-based MP Configuration | ||
658 | -------------------------------------------------------------------------- */ | ||
659 | |||
660 | #ifdef CONFIG_ACPI_BOOT | ||
661 | |||
662 | void __init mp_register_lapic_address ( | ||
663 | u64 address) | ||
664 | { | ||
665 | mp_lapic_addr = (unsigned long) address; | ||
666 | |||
667 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
668 | |||
669 | if (boot_cpu_id == -1U) | ||
670 | boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
671 | |||
672 | Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); | ||
673 | } | ||
674 | |||
675 | |||
676 | void __init mp_register_lapic ( | ||
677 | u8 id, | ||
678 | u8 enabled) | ||
679 | { | ||
680 | struct mpc_config_processor processor; | ||
681 | int boot_cpu = 0; | ||
682 | |||
683 | if (id >= MAX_APICS) { | ||
684 | printk(KERN_WARNING "Processor #%d invalid (max %d)\n", | ||
685 | id, MAX_APICS); | ||
686 | return; | ||
687 | } | ||
688 | |||
689 | if (id == boot_cpu_physical_apicid) | ||
690 | boot_cpu = 1; | ||
691 | |||
692 | processor.mpc_type = MP_PROCESSOR; | ||
693 | processor.mpc_apicid = id; | ||
694 | processor.mpc_apicver = 0x10; /* TBD: lapic version */ | ||
695 | processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); | ||
696 | processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); | ||
697 | processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | | ||
698 | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; | ||
699 | processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; | ||
700 | processor.mpc_reserved[0] = 0; | ||
701 | processor.mpc_reserved[1] = 0; | ||
702 | |||
703 | MP_processor_info(&processor); | ||
704 | } | ||
705 | |||
706 | #ifdef CONFIG_X86_IO_APIC | ||
707 | |||
708 | #define MP_ISA_BUS 0 | ||
709 | #define MP_MAX_IOAPIC_PIN 127 | ||
710 | |||
711 | static struct mp_ioapic_routing { | ||
712 | int apic_id; | ||
713 | int gsi_start; | ||
714 | int gsi_end; | ||
715 | u32 pin_programmed[4]; | ||
716 | } mp_ioapic_routing[MAX_IO_APICS]; | ||
717 | |||
718 | |||
719 | static int mp_find_ioapic ( | ||
720 | int gsi) | ||
721 | { | ||
722 | int i = 0; | ||
723 | |||
724 | /* Find the IOAPIC that manages this GSI. */ | ||
725 | for (i = 0; i < nr_ioapics; i++) { | ||
726 | if ((gsi >= mp_ioapic_routing[i].gsi_start) | ||
727 | && (gsi <= mp_ioapic_routing[i].gsi_end)) | ||
728 | return i; | ||
729 | } | ||
730 | |||
731 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | ||
732 | |||
733 | return -1; | ||
734 | } | ||
735 | |||
736 | |||
737 | void __init mp_register_ioapic ( | ||
738 | u8 id, | ||
739 | u32 address, | ||
740 | u32 gsi_base) | ||
741 | { | ||
742 | int idx = 0; | ||
743 | |||
744 | if (nr_ioapics >= MAX_IO_APICS) { | ||
745 | printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | ||
746 | "(found %d)\n", MAX_IO_APICS, nr_ioapics); | ||
747 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
748 | } | ||
749 | if (!address) { | ||
750 | printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | ||
751 | " found in MADT table, skipping!\n"); | ||
752 | return; | ||
753 | } | ||
754 | |||
755 | idx = nr_ioapics++; | ||
756 | |||
757 | mp_ioapics[idx].mpc_type = MP_IOAPIC; | ||
758 | mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; | ||
759 | mp_ioapics[idx].mpc_apicaddr = address; | ||
760 | |||
761 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | ||
762 | mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); | ||
763 | mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); | ||
764 | |||
765 | /* | ||
766 | * Build basic IRQ lookup table to facilitate gsi->io_apic lookups | ||
767 | * and to prevent reprogramming of IOAPIC pins (PCI IRQs). | ||
768 | */ | ||
769 | mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; | ||
770 | mp_ioapic_routing[idx].gsi_start = gsi_base; | ||
771 | mp_ioapic_routing[idx].gsi_end = gsi_base + | ||
772 | io_apic_get_redir_entries(idx); | ||
773 | |||
774 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | ||
775 | "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, | ||
776 | mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, | ||
777 | mp_ioapic_routing[idx].gsi_start, | ||
778 | mp_ioapic_routing[idx].gsi_end); | ||
779 | |||
780 | return; | ||
781 | } | ||
782 | |||
783 | |||
784 | void __init mp_override_legacy_irq ( | ||
785 | u8 bus_irq, | ||
786 | u8 polarity, | ||
787 | u8 trigger, | ||
788 | u32 gsi) | ||
789 | { | ||
790 | struct mpc_config_intsrc intsrc; | ||
791 | int ioapic = -1; | ||
792 | int pin = -1; | ||
793 | |||
794 | /* | ||
795 | * Convert 'gsi' to 'ioapic.pin'. | ||
796 | */ | ||
797 | ioapic = mp_find_ioapic(gsi); | ||
798 | if (ioapic < 0) | ||
799 | return; | ||
800 | pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
801 | |||
802 | /* | ||
803 | * TBD: This check is for faulty timer entries, where the override | ||
804 | * erroneously sets the trigger to level, resulting in a HUGE | ||
805 | * increase of timer interrupts! | ||
806 | */ | ||
807 | if ((bus_irq == 0) && (trigger == 3)) | ||
808 | trigger = 1; | ||
809 | |||
810 | intsrc.mpc_type = MP_INTSRC; | ||
811 | intsrc.mpc_irqtype = mp_INT; | ||
812 | intsrc.mpc_irqflag = (trigger << 2) | polarity; | ||
813 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
814 | intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ | ||
815 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ | ||
816 | intsrc.mpc_dstirq = pin; /* INTIN# */ | ||
817 | |||
818 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", | ||
819 | intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
820 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
821 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); | ||
822 | |||
823 | mp_irqs[mp_irq_entries] = intsrc; | ||
824 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
825 | panic("Max # of irq sources exceeded!\n"); | ||
826 | |||
827 | return; | ||
828 | } | ||
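mp_override_legacy_irq() packs the ACPI trigger and polarity into the two 2-bit MPS fields of mpc_irqflag, where 0 means "conforms to bus", 1 means edge/active-high, and 3 means level/active-low. A tiny illustrative helper makes the encoding explicit; for example a level-triggered, active-low override encodes to (3 << 2) | 3 == 0xf:

/* Illustrative only: the same encoding that mp_override_legacy_irq()
 * computes inline above. */
static inline unsigned short mps_irqflag(unsigned int trigger,
					 unsigned int polarity)
{
	return (trigger << 2) | polarity;	/* e.g. (3 << 2) | 3 == 0xf */
}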
829 | |||
830 | |||
831 | void __init mp_config_acpi_legacy_irqs (void) | ||
832 | { | ||
833 | struct mpc_config_intsrc intsrc; | ||
834 | int i = 0; | ||
835 | int ioapic = -1; | ||
836 | |||
837 | /* | ||
838 | * Fabricate the legacy ISA bus (bus MP_ISA_BUS). | ||
839 | */ | ||
840 | mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; | ||
841 | Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); | ||
842 | |||
843 | /* | ||
844 | * Locate the IOAPIC that manages the ISA IRQs (0-15). | ||
845 | */ | ||
846 | ioapic = mp_find_ioapic(0); | ||
847 | if (ioapic < 0) | ||
848 | return; | ||
849 | |||
850 | intsrc.mpc_type = MP_INTSRC; | ||
851 | intsrc.mpc_irqflag = 0; /* Conforming */ | ||
852 | intsrc.mpc_srcbus = MP_ISA_BUS; | ||
853 | intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; | ||
854 | |||
855 | /* | ||
856 | * Use the default configuration for the IRQs 0-15. Unless | ||
857 | * overridden by (MADT) interrupt source override entries. | ||
858 | */ | ||
859 | for (i = 0; i < 16; i++) { | ||
860 | int idx; | ||
861 | |||
862 | for (idx = 0; idx < mp_irq_entries; idx++) { | ||
863 | struct mpc_config_intsrc *irq = mp_irqs + idx; | ||
864 | |||
865 | /* Do we already have a mapping for this ISA IRQ? */ | ||
866 | if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) | ||
867 | break; | ||
868 | |||
869 | /* Do we already have a mapping for this IOAPIC pin? */ | ||
870 | if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && | ||
871 | (irq->mpc_dstirq == i)) | ||
872 | break; | ||
873 | } | ||
874 | |||
875 | if (idx != mp_irq_entries) { | ||
876 | printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); | ||
877 | continue; /* IRQ already used */ | ||
878 | } | ||
879 | |||
880 | intsrc.mpc_irqtype = mp_INT; | ||
881 | intsrc.mpc_srcbusirq = i; /* Identity mapped */ | ||
882 | intsrc.mpc_dstirq = i; | ||
883 | |||
884 | Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " | ||
885 | "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, | ||
886 | (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, | ||
887 | intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, | ||
888 | intsrc.mpc_dstirq); | ||
889 | |||
890 | mp_irqs[mp_irq_entries] = intsrc; | ||
891 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
892 | panic("Max # of irq sources exceeded!\n"); | ||
893 | } | ||
894 | |||
895 | return; | ||
896 | } | ||
897 | |||
898 | int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) | ||
899 | { | ||
900 | int ioapic = -1; | ||
901 | int ioapic_pin = 0; | ||
902 | int idx, bit = 0; | ||
903 | |||
904 | if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) | ||
905 | return gsi; | ||
906 | |||
907 | #ifdef CONFIG_ACPI_BUS | ||
908 | /* Don't set up the ACPI SCI because it's already set up */ | ||
909 | if (acpi_fadt.sci_int == gsi) | ||
910 | return gsi; | ||
911 | #endif | ||
912 | |||
913 | ioapic = mp_find_ioapic(gsi); | ||
914 | if (ioapic < 0) { | ||
915 | printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); | ||
916 | return gsi; | ||
917 | } | ||
918 | |||
919 | ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; | ||
920 | |||
921 | /* | ||
922 | * Avoid pin reprogramming. PRTs typically include entries | ||
923 | * with redundant pin->gsi mappings (but unique PCI devices); | ||
924 | * we only program the IOAPIC on the first. | ||
925 | */ | ||
926 | bit = ioapic_pin % 32; | ||
927 | idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); | ||
928 | if (idx > 3) { | ||
929 | printk(KERN_ERR "Invalid reference to IOAPIC pin " | ||
930 | "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, | ||
931 | ioapic_pin); | ||
932 | return gsi; | ||
933 | } | ||
934 | if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { | ||
935 | Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", | ||
936 | mp_ioapic_routing[ioapic].apic_id, ioapic_pin); | ||
937 | return gsi; | ||
938 | } | ||
939 | |||
940 | mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); | ||
941 | |||
942 | io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, | ||
943 | edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, | ||
944 | active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); | ||
945 | return gsi; | ||
946 | } | ||
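The pin_programmed[] handling in mp_register_gsi() is a hand-rolled 128-bit bitmap (four u32 words, matching MP_MAX_IOAPIC_PIN): bit pin%32 of word pin/32 records that a pin has already been routed, so redundant PRT entries do not reprogram it. An equivalent, purely illustrative pair of helpers:

/* Illustrative equivalents of the open-coded bitmap logic in
 * mp_register_gsi(); not part of the original file. */
static inline int gsi_pin_programmed(const struct mp_ioapic_routing *r, int pin)
{
	return (r->pin_programmed[pin / 32] >> (pin % 32)) & 1;
}

static inline void gsi_pin_set_programmed(struct mp_ioapic_routing *r, int pin)
{
	r->pin_programmed[pin / 32] |= 1U << (pin % 32);
}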
947 | |||
948 | #endif /*CONFIG_X86_IO_APIC*/ | ||
949 | #endif /*CONFIG_ACPI_BOOT*/ | ||
diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c new file mode 100644 index 000000000000..598953ab0154 --- /dev/null +++ b/arch/x86_64/kernel/msr.c | |||
@@ -0,0 +1,279 @@ | |||
1 | /* ----------------------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2000 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, | ||
8 | * USA; either version 2 of the License, or (at your option) any later | ||
9 | * version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * msr.c | ||
15 | * | ||
16 | * x86 MSR access device | ||
17 | * | ||
18 | * This device is accessed by lseek() to the appropriate register number | ||
19 | * and then read/write in chunks of 8 bytes. A larger size means multiple | ||
20 | * reads or writes of the same register. | ||
21 | * | ||
22 | * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on | ||
23 | * an SMP box will direct the access to CPU %d. | ||
24 | */ | ||
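The comment above describes the user-space contract: the file offset selects the MSR number, the per-CPU minor selects the CPU, and transfers happen in 8-byte units. A minimal user-space reader might look like the sketch below (requires root; MSR 0x10, the IA32 time-stamp counter, is only an example register):

/* Hypothetical user-space client of /dev/cpu/N/msr, not part of the driver. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 1;
	/* The pread offset is interpreted by the driver as the MSR number. */
	if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val))
		return 1;
	printf("MSR 0x10 on CPU0 = %#llx\n", (unsigned long long)val);
	close(fd);
	return 0;
}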
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/config.h> | ||
28 | |||
29 | #include <linux/types.h> | ||
30 | #include <linux/errno.h> | ||
31 | #include <linux/fcntl.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/poll.h> | ||
34 | #include <linux/smp.h> | ||
35 | #include <linux/smp_lock.h> | ||
36 | #include <linux/major.h> | ||
37 | #include <linux/fs.h> | ||
38 | |||
39 | #include <asm/processor.h> | ||
40 | #include <asm/msr.h> | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/system.h> | ||
43 | |||
44 | /* Note: "err" is handled in a funny way below. Otherwise one version | ||
45 | of gcc or another breaks. */ | ||
46 | |||
47 | static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) | ||
48 | { | ||
49 | int err; | ||
50 | |||
51 | asm volatile ("1: wrmsr\n" | ||
52 | "2:\n" | ||
53 | ".section .fixup,\"ax\"\n" | ||
54 | "3: movl %4,%0\n" | ||
55 | " jmp 2b\n" | ||
56 | ".previous\n" | ||
57 | ".section __ex_table,\"a\"\n" | ||
58 | " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err) | ||
59 | :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); | ||
60 | |||
61 | return err; | ||
62 | } | ||
63 | |||
64 | static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) | ||
65 | { | ||
66 | int err; | ||
67 | |||
68 | asm volatile ("1: rdmsr\n" | ||
69 | "2:\n" | ||
70 | ".section .fixup,\"ax\"\n" | ||
71 | "3: movl %4,%0\n" | ||
72 | " jmp 2b\n" | ||
73 | ".previous\n" | ||
74 | ".section __ex_table,\"a\"\n" | ||
75 | " .align 8\n" | ||
76 | " .quad 1b,3b\n" | ||
77 | ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) | ||
78 | :"c"(reg), "i"(-EIO), "0"(0)); | ||
79 | |||
80 | return err; | ||
81 | } | ||
82 | |||
83 | #ifdef CONFIG_SMP | ||
84 | |||
85 | struct msr_command { | ||
86 | int cpu; | ||
87 | int err; | ||
88 | u32 reg; | ||
89 | u32 data[2]; | ||
90 | }; | ||
91 | |||
92 | static void msr_smp_wrmsr(void *cmd_block) | ||
93 | { | ||
94 | struct msr_command *cmd = (struct msr_command *)cmd_block; | ||
95 | |||
96 | if (cmd->cpu == smp_processor_id()) | ||
97 | cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); | ||
98 | } | ||
99 | |||
100 | static void msr_smp_rdmsr(void *cmd_block) | ||
101 | { | ||
102 | struct msr_command *cmd = (struct msr_command *)cmd_block; | ||
103 | |||
104 | if (cmd->cpu == smp_processor_id()) | ||
105 | cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); | ||
106 | } | ||
107 | |||
108 | static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) | ||
109 | { | ||
110 | struct msr_command cmd; | ||
111 | int ret; | ||
112 | |||
113 | preempt_disable(); | ||
114 | if (cpu == smp_processor_id()) { | ||
115 | ret = wrmsr_eio(reg, eax, edx); | ||
116 | } else { | ||
117 | cmd.cpu = cpu; | ||
118 | cmd.reg = reg; | ||
119 | cmd.data[0] = eax; | ||
120 | cmd.data[1] = edx; | ||
121 | |||
122 | smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); | ||
123 | ret = cmd.err; | ||
124 | } | ||
125 | preempt_enable(); | ||
126 | return ret; | ||
127 | } | ||
128 | |||
129 | static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) | ||
130 | { | ||
131 | struct msr_command cmd; | ||
132 | int ret; | ||
133 | |||
134 | preempt_disable(); | ||
135 | if (cpu == smp_processor_id()) { | ||
136 | ret = rdmsr_eio(reg, eax, edx); | ||
137 | } else { | ||
138 | cmd.cpu = cpu; | ||
139 | cmd.reg = reg; | ||
140 | |||
141 | smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); | ||
142 | |||
143 | *eax = cmd.data[0]; | ||
144 | *edx = cmd.data[1]; | ||
145 | |||
146 | ret = cmd.err; | ||
147 | } | ||
148 | preempt_enable(); | ||
149 | return ret; | ||
150 | } | ||
151 | |||
152 | #else /* ! CONFIG_SMP */ | ||
153 | |||
154 | static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) | ||
155 | { | ||
156 | return wrmsr_eio(reg, eax, edx); | ||
157 | } | ||
158 | |||
159 | static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) | ||
160 | { | ||
161 | return rdmsr_eio(reg, eax, edx); | ||
162 | } | ||
163 | |||
164 | #endif /* ! CONFIG_SMP */ | ||
165 | |||
166 | static loff_t msr_seek(struct file *file, loff_t offset, int orig) | ||
167 | { | ||
168 | loff_t ret = -EINVAL; | ||
169 | |||
170 | lock_kernel(); | ||
171 | switch (orig) { | ||
172 | case 0: | ||
173 | file->f_pos = offset; | ||
174 | ret = file->f_pos; | ||
175 | break; | ||
176 | case 1: | ||
177 | file->f_pos += offset; | ||
178 | ret = file->f_pos; | ||
179 | } | ||
180 | unlock_kernel(); | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | static ssize_t msr_read(struct file *file, char __user * buf, | ||
185 | size_t count, loff_t * ppos) | ||
186 | { | ||
187 | u32 __user *tmp = (u32 __user *) buf; | ||
188 | u32 data[2]; | ||
189 | size_t rv; | ||
190 | u32 reg = *ppos; | ||
191 | int cpu = iminor(file->f_dentry->d_inode); | ||
192 | int err; | ||
193 | |||
194 | if (count % 8) | ||
195 | return -EINVAL; /* Invalid chunk size */ | ||
196 | |||
197 | for (rv = 0; count; count -= 8) { | ||
198 | err = do_rdmsr(cpu, reg, &data[0], &data[1]); | ||
199 | if (err) | ||
200 | return err; | ||
201 | if (copy_to_user(tmp, &data, 8)) | ||
202 | return -EFAULT; | ||
203 | tmp += 2; | ||
204 | } | ||
205 | |||
206 | return ((char __user *)tmp) - buf; | ||
207 | } | ||
208 | |||
209 | static ssize_t msr_write(struct file *file, const char __user *buf, | ||
210 | size_t count, loff_t *ppos) | ||
211 | { | ||
212 | const u32 __user *tmp = (const u32 __user *)buf; | ||
213 | u32 data[2]; | ||
214 | size_t rv; | ||
215 | u32 reg = *ppos; | ||
216 | int cpu = iminor(file->f_dentry->d_inode); | ||
217 | int err; | ||
218 | |||
219 | if (count % 8) | ||
220 | return -EINVAL; /* Invalid chunk size */ | ||
221 | |||
222 | for (rv = 0; count; count -= 8) { | ||
223 | if (copy_from_user(&data, tmp, 8)) | ||
224 | return -EFAULT; | ||
225 | err = do_wrmsr(cpu, reg, data[0], data[1]); | ||
226 | if (err) | ||
227 | return err; | ||
228 | tmp += 2; | ||
229 | } | ||
230 | |||
231 | return ((char __user *)tmp) - buf; | ||
232 | } | ||
233 | |||
234 | static int msr_open(struct inode *inode, struct file *file) | ||
235 | { | ||
236 | unsigned int cpu = iminor(file->f_dentry->d_inode); | ||
237 | struct cpuinfo_x86 *c = &(cpu_data)[cpu]; | ||
238 | |||
239 | if (cpu >= NR_CPUS || !cpu_online(cpu)) | ||
240 | return -ENXIO; /* No such CPU */ | ||
241 | if (!cpu_has(c, X86_FEATURE_MSR)) | ||
242 | return -EIO; /* MSR not supported */ | ||
243 | |||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * File operations we support | ||
249 | */ | ||
250 | static struct file_operations msr_fops = { | ||
251 | .owner = THIS_MODULE, | ||
252 | .llseek = msr_seek, | ||
253 | .read = msr_read, | ||
254 | .write = msr_write, | ||
255 | .open = msr_open, | ||
256 | }; | ||
257 | |||
258 | static int __init msr_init(void) | ||
259 | { | ||
260 | if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { | ||
261 | printk(KERN_ERR "msr: unable to get major %d for msr\n", | ||
262 | MSR_MAJOR); | ||
263 | return -EBUSY; | ||
264 | } | ||
265 | |||
266 | return 0; | ||
267 | } | ||
268 | |||
269 | static void __exit msr_exit(void) | ||
270 | { | ||
271 | unregister_chrdev(MSR_MAJOR, "cpu/msr"); | ||
272 | } | ||
273 | |||
274 | module_init(msr_init); | ||
275 | module_exit(msr_exit) | ||
276 | |||
277 | MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); | ||
278 | MODULE_DESCRIPTION("x86 generic MSR driver"); | ||
279 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c new file mode 100644 index 000000000000..d9867de6a626 --- /dev/null +++ b/arch/x86_64/kernel/nmi.c | |||
@@ -0,0 +1,488 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/nmi.c | ||
3 | * | ||
4 | * NMI watchdog support on APIC systems | ||
5 | * | ||
6 | * Started by Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * Fixes: | ||
9 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | ||
10 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | ||
11 | * Pavel Machek and | ||
12 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/delay.h> | ||
19 | #include <linux/bootmem.h> | ||
20 | #include <linux/smp_lock.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | #include <linux/mc146818rtc.h> | ||
23 | #include <linux/kernel_stat.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/nmi.h> | ||
27 | #include <linux/sysctl.h> | ||
28 | |||
29 | #include <asm/smp.h> | ||
30 | #include <asm/mtrr.h> | ||
31 | #include <asm/mpspec.h> | ||
32 | #include <asm/nmi.h> | ||
33 | #include <asm/msr.h> | ||
34 | #include <asm/proto.h> | ||
35 | #include <asm/kdebug.h> | ||
36 | |||
37 | /* | ||
38 | * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: | ||
39 | * - it may be reserved by some other driver, or not | ||
40 | * - when not reserved by some other driver, it may be used for | ||
41 | * the NMI watchdog, or not | ||
42 | * | ||
43 | * This is maintained separately from nmi_active because the NMI | ||
44 | * watchdog may also be driven from the I/O APIC timer. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(lapic_nmi_owner_lock); | ||
47 | static unsigned int lapic_nmi_owner; | ||
48 | #define LAPIC_NMI_WATCHDOG (1<<0) | ||
49 | #define LAPIC_NMI_RESERVED (1<<1) | ||
50 | |||
51 | /* nmi_active: | ||
52 | * +1: the lapic NMI watchdog is active, but can be disabled | ||
53 | * 0: the lapic NMI watchdog has not been set up, and cannot | ||
54 | * be enabled | ||
55 | * -1: the lapic NMI watchdog is disabled, but can be enabled | ||
56 | */ | ||
57 | int nmi_active; /* oprofile uses this */ | ||
58 | int panic_on_timeout; | ||
59 | |||
60 | unsigned int nmi_watchdog = NMI_DEFAULT; | ||
61 | static unsigned int nmi_hz = HZ; | ||
62 | unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ | ||
63 | |||
64 | /* Note that these events don't tick when the CPU idles. This means | ||
65 | the frequency varies with CPU load. */ | ||
66 | |||
67 | #define K7_EVNTSEL_ENABLE (1 << 22) | ||
68 | #define K7_EVNTSEL_INT (1 << 20) | ||
69 | #define K7_EVNTSEL_OS (1 << 17) | ||
70 | #define K7_EVNTSEL_USR (1 << 16) | ||
71 | #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 | ||
72 | #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING | ||
73 | |||
74 | #define P6_EVNTSEL0_ENABLE (1 << 22) | ||
75 | #define P6_EVNTSEL_INT (1 << 20) | ||
76 | #define P6_EVNTSEL_OS (1 << 17) | ||
77 | #define P6_EVNTSEL_USR (1 << 16) | ||
78 | #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 | ||
79 | #define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED | ||
80 | |||
81 | /* Run after command line and cpu_init init, but before all other checks */ | ||
82 | void __init nmi_watchdog_default(void) | ||
83 | { | ||
84 | if (nmi_watchdog != NMI_DEFAULT) | ||
85 | return; | ||
86 | |||
87 | /* For some reason the IO APIC watchdog doesn't work on the AMD | ||
88 | 8111 chipset. For now switch to local APIC mode using | ||
89 | perfctr0 there. On Intel CPUs we don't have code to handle | ||
90 | the perfctr and the IO-APIC seems to work, so use that. */ | ||
91 | |||
92 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | ||
93 | nmi_watchdog = NMI_LOCAL_APIC; | ||
94 | printk(KERN_INFO | ||
95 | "Using local APIC NMI watchdog using perfctr0\n"); | ||
96 | } else { | ||
97 | printk(KERN_INFO "Using IO APIC NMI watchdog\n"); | ||
98 | nmi_watchdog = NMI_IO_APIC; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | /* Why is there no CPUID flag for this? */ | ||
103 | static __init int cpu_has_lapic(void) | ||
104 | { | ||
105 | switch (boot_cpu_data.x86_vendor) { | ||
106 | case X86_VENDOR_INTEL: | ||
107 | case X86_VENDOR_AMD: | ||
108 | return boot_cpu_data.x86 >= 6; | ||
109 | /* .... add more cpus here or find a different way to figure this out. */ | ||
110 | default: | ||
111 | return 0; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | int __init check_nmi_watchdog (void) | ||
116 | { | ||
117 | int counts[NR_CPUS]; | ||
118 | int cpu; | ||
119 | |||
120 | if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { | ||
121 | nmi_watchdog = NMI_NONE; | ||
122 | return -1; | ||
123 | } | ||
124 | |||
125 | printk(KERN_INFO "testing NMI watchdog ... "); | ||
126 | |||
127 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
128 | counts[cpu] = cpu_pda[cpu].__nmi_count; | ||
129 | local_irq_enable(); | ||
130 | mdelay((10*1000)/nmi_hz); // wait 10 ticks | ||
131 | |||
132 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
133 | #ifdef CONFIG_SMP | ||
134 | /* Check cpu_callin_map here because that is set | ||
135 | after the timer is started. */ | ||
136 | if (!cpu_isset(cpu, cpu_callin_map)) | ||
137 | continue; | ||
138 | #endif | ||
139 | if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { | ||
140 | printk("CPU#%d: NMI appears to be stuck (%d)!\n", | ||
141 | cpu, | ||
142 | cpu_pda[cpu].__nmi_count); | ||
143 | nmi_active = 0; | ||
144 | lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; | ||
145 | return -1; | ||
146 | } | ||
147 | } | ||
148 | printk("OK.\n"); | ||
149 | |||
150 | /* now that we know it works we can reduce NMI frequency to | ||
151 | something more reasonable; makes a difference in some configs */ | ||
152 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
153 | nmi_hz = 1; | ||
154 | |||
155 | return 0; | ||
156 | } | ||
157 | |||
158 | int __init setup_nmi_watchdog(char *str) | ||
159 | { | ||
160 | int nmi; | ||
161 | |||
162 | if (!strncmp(str,"panic",5)) { | ||
163 | panic_on_timeout = 1; | ||
164 | str = strchr(str, ','); | ||
165 | if (!str) | ||
166 | return 1; | ||
167 | ++str; | ||
168 | } | ||
169 | |||
170 | get_option(&str, &nmi); | ||
171 | |||
172 | if (nmi >= NMI_INVALID) | ||
173 | return 0; | ||
174 | nmi_watchdog = nmi; | ||
175 | return 1; | ||
176 | } | ||
177 | |||
178 | __setup("nmi_watchdog=", setup_nmi_watchdog); | ||
179 | |||
180 | static void disable_lapic_nmi_watchdog(void) | ||
181 | { | ||
182 | if (nmi_active <= 0) | ||
183 | return; | ||
184 | switch (boot_cpu_data.x86_vendor) { | ||
185 | case X86_VENDOR_AMD: | ||
186 | wrmsr(MSR_K7_EVNTSEL0, 0, 0); | ||
187 | break; | ||
188 | case X86_VENDOR_INTEL: | ||
189 | wrmsr(MSR_IA32_EVNTSEL0, 0, 0); | ||
190 | break; | ||
191 | } | ||
192 | nmi_active = -1; | ||
193 | /* tell do_nmi() and others that we're not active any more */ | ||
194 | nmi_watchdog = 0; | ||
195 | } | ||
196 | |||
197 | static void enable_lapic_nmi_watchdog(void) | ||
198 | { | ||
199 | if (nmi_active < 0) { | ||
200 | nmi_watchdog = NMI_LOCAL_APIC; | ||
201 | setup_apic_nmi_watchdog(); | ||
202 | } | ||
203 | } | ||
204 | |||
205 | int reserve_lapic_nmi(void) | ||
206 | { | ||
207 | unsigned int old_owner; | ||
208 | |||
209 | spin_lock(&lapic_nmi_owner_lock); | ||
210 | old_owner = lapic_nmi_owner; | ||
211 | lapic_nmi_owner |= LAPIC_NMI_RESERVED; | ||
212 | spin_unlock(&lapic_nmi_owner_lock); | ||
213 | if (old_owner & LAPIC_NMI_RESERVED) | ||
214 | return -EBUSY; | ||
215 | if (old_owner & LAPIC_NMI_WATCHDOG) | ||
216 | disable_lapic_nmi_watchdog(); | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | void release_lapic_nmi(void) | ||
221 | { | ||
222 | unsigned int new_owner; | ||
223 | |||
224 | spin_lock(&lapic_nmi_owner_lock); | ||
225 | new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; | ||
226 | lapic_nmi_owner = new_owner; | ||
227 | spin_unlock(&lapic_nmi_owner_lock); | ||
228 | if (new_owner & LAPIC_NMI_WATCHDOG) | ||
229 | enable_lapic_nmi_watchdog(); | ||
230 | } | ||
231 | |||
232 | void disable_timer_nmi_watchdog(void) | ||
233 | { | ||
234 | if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) | ||
235 | return; | ||
236 | |||
237 | disable_irq(0); | ||
238 | unset_nmi_callback(); | ||
239 | nmi_active = -1; | ||
240 | nmi_watchdog = NMI_NONE; | ||
241 | } | ||
242 | |||
243 | void enable_timer_nmi_watchdog(void) | ||
244 | { | ||
245 | if (nmi_active < 0) { | ||
246 | nmi_watchdog = NMI_IO_APIC; | ||
247 | touch_nmi_watchdog(); | ||
248 | nmi_active = 1; | ||
249 | enable_irq(0); | ||
250 | } | ||
251 | } | ||
252 | |||
253 | #ifdef CONFIG_PM | ||
254 | |||
255 | static int nmi_pm_active; /* nmi_active before suspend */ | ||
256 | |||
257 | static int lapic_nmi_suspend(struct sys_device *dev, u32 state) | ||
258 | { | ||
259 | nmi_pm_active = nmi_active; | ||
260 | disable_lapic_nmi_watchdog(); | ||
261 | return 0; | ||
262 | } | ||
263 | |||
264 | static int lapic_nmi_resume(struct sys_device *dev) | ||
265 | { | ||
266 | if (nmi_pm_active > 0) | ||
267 | enable_lapic_nmi_watchdog(); | ||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | static struct sysdev_class nmi_sysclass = { | ||
272 | set_kset_name("lapic_nmi"), | ||
273 | .resume = lapic_nmi_resume, | ||
274 | .suspend = lapic_nmi_suspend, | ||
275 | }; | ||
276 | |||
277 | static struct sys_device device_lapic_nmi = { | ||
278 | .id = 0, | ||
279 | .cls = &nmi_sysclass, | ||
280 | }; | ||
281 | |||
282 | static int __init init_lapic_nmi_sysfs(void) | ||
283 | { | ||
284 | int error; | ||
285 | |||
286 | if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) | ||
287 | return 0; | ||
288 | |||
289 | error = sysdev_class_register(&nmi_sysclass); | ||
290 | if (!error) | ||
291 | error = sysdev_register(&device_lapic_nmi); | ||
292 | return error; | ||
293 | } | ||
294 | /* must come after the local APIC's device_initcall() */ | ||
295 | late_initcall(init_lapic_nmi_sysfs); | ||
296 | |||
297 | #endif /* CONFIG_PM */ | ||
298 | |||
299 | /* | ||
300 | * Activate the NMI watchdog via the local APIC. | ||
301 | * Original code written by Keith Owens. | ||
302 | */ | ||
303 | |||
304 | static void setup_k7_watchdog(void) | ||
305 | { | ||
306 | int i; | ||
307 | unsigned int evntsel; | ||
308 | |||
309 | /* No check, so can start with slow frequency */ | ||
310 | nmi_hz = 1; | ||
311 | |||
312 | /* XXX should check these in EFER */ | ||
313 | |||
314 | nmi_perfctr_msr = MSR_K7_PERFCTR0; | ||
315 | |||
316 | for(i = 0; i < 4; ++i) { | ||
317 | /* Simulator may not support it */ | ||
318 | if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) | ||
319 | return; | ||
320 | wrmsrl(MSR_K7_PERFCTR0+i, 0UL); | ||
321 | } | ||
322 | |||
323 | evntsel = K7_EVNTSEL_INT | ||
324 | | K7_EVNTSEL_OS | ||
325 | | K7_EVNTSEL_USR | ||
326 | | K7_NMI_EVENT; | ||
327 | |||
328 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); | ||
329 | wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); | ||
330 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
331 | evntsel |= K7_EVNTSEL_ENABLE; | ||
332 | wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); | ||
333 | } | ||
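To make the preload written to MSR_K7_PERFCTR0 above concrete: the counter counts "processor running" cycles and raises an NMI through the LVTPC entry when it overflows, so loading it with minus one period's worth of cycles yields roughly nmi_hz NMIs per second. A rough worked example, assuming a hypothetical 2 GHz CPU and the nmi_hz = 1 set at the top of the function:

/* Worked example (illustrative numbers only):
 *   cpu_khz            = 2000000            (hypothetical 2 GHz part)
 *   cycles per period  = cpu_khz * 1000 / nmi_hz = 2e9
 *   PERFCTR0 preload   = -2e9
 * The counter overflows about once per second, each overflow
 * delivering one watchdog NMI via APIC_LVTPC. */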
334 | |||
335 | void setup_apic_nmi_watchdog(void) | ||
336 | { | ||
337 | switch (boot_cpu_data.x86_vendor) { | ||
338 | case X86_VENDOR_AMD: | ||
339 | if (boot_cpu_data.x86 < 6) | ||
340 | return; | ||
341 | if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) | ||
342 | return; | ||
343 | setup_k7_watchdog(); | ||
344 | break; | ||
345 | default: | ||
346 | return; | ||
347 | } | ||
348 | lapic_nmi_owner = LAPIC_NMI_WATCHDOG; | ||
349 | nmi_active = 1; | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * The best way to detect whether a CPU has a 'hard lockup' problem | ||
354 | * is to check its local APIC timer IRQ counts. If they are not | ||
355 | * changing then that CPU has some problem. | ||
356 | * | ||
357 | * As these watchdog NMI IRQs are generated on every CPU, we only | ||
358 | * have to check the current processor. | ||
359 | * | ||
360 | * Since NMIs don't listen to _any_ locks, we have to be extremely | ||
361 | * careful not to rely on unsafe variables. The printk might lock | ||
362 | * up though, so we have to break up any console locks first ... | ||
363 | * [if more tty-related locks appear, break them up | ||
364 | * here too!] | ||
365 | */ | ||
366 | |||
367 | static unsigned int | ||
368 | last_irq_sums [NR_CPUS], | ||
369 | alert_counter [NR_CPUS]; | ||
370 | |||
371 | void touch_nmi_watchdog (void) | ||
372 | { | ||
373 | int i; | ||
374 | |||
375 | /* | ||
376 | * Just reset the alert counters, (other CPUs might be | ||
377 | * spinning on locks we hold): | ||
378 | */ | ||
379 | for (i = 0; i < NR_CPUS; i++) | ||
380 | alert_counter[i] = 0; | ||
381 | } | ||
382 | |||
383 | void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) | ||
384 | { | ||
385 | int sum, cpu; | ||
386 | |||
387 | cpu = safe_smp_processor_id(); | ||
388 | sum = read_pda(apic_timer_irqs); | ||
389 | if (last_irq_sums[cpu] == sum) { | ||
390 | /* | ||
391 | * Ayiee, looks like this CPU is stuck ... | ||
392 | * wait a few IRQs (5 seconds) before doing the oops ... | ||
393 | */ | ||
394 | alert_counter[cpu]++; | ||
395 | if (alert_counter[cpu] == 5*nmi_hz) { | ||
396 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | ||
397 | == NOTIFY_STOP) { | ||
398 | alert_counter[cpu] = 0; | ||
399 | return; | ||
400 | } | ||
401 | die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); | ||
402 | } | ||
403 | } else { | ||
404 | last_irq_sums[cpu] = sum; | ||
405 | alert_counter[cpu] = 0; | ||
406 | } | ||
407 | if (nmi_perfctr_msr) | ||
408 | wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); | ||
409 | } | ||
410 | |||
411 | static int dummy_nmi_callback(struct pt_regs * regs, int cpu) | ||
412 | { | ||
413 | return 0; | ||
414 | } | ||
415 | |||
416 | static nmi_callback_t nmi_callback = dummy_nmi_callback; | ||
417 | |||
418 | asmlinkage void do_nmi(struct pt_regs * regs, long error_code) | ||
419 | { | ||
420 | int cpu = safe_smp_processor_id(); | ||
421 | |||
422 | nmi_enter(); | ||
423 | add_pda(__nmi_count,1); | ||
424 | if (!nmi_callback(regs, cpu)) | ||
425 | default_do_nmi(regs); | ||
426 | nmi_exit(); | ||
427 | } | ||
428 | |||
429 | void set_nmi_callback(nmi_callback_t callback) | ||
430 | { | ||
431 | nmi_callback = callback; | ||
432 | } | ||
433 | |||
434 | void unset_nmi_callback(void) | ||
435 | { | ||
436 | nmi_callback = dummy_nmi_callback; | ||
437 | } | ||
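do_nmi() above hands every NMI to nmi_callback() first and only falls back to default_do_nmi() when the callback returns 0, which is how in-kernel users such as oprofile hook the watchdog hardware. A hypothetical client (not part of this file) would reserve the local APIC NMI and install its handler roughly like this:

/* Hypothetical example client of the callback API above. */
static int example_nmi_handler(struct pt_regs *regs, int cpu)
{
	/* ... e.g. record regs->rip for a profiler ... */
	return 1;			/* non-zero: NMI fully handled */
}

static int example_hook_nmi(void)
{
	int err = reserve_lapic_nmi();	/* keep the watchdog from racing us */

	if (err < 0)
		return err;
	set_nmi_callback(example_nmi_handler);
	return 0;
}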
438 | |||
439 | #ifdef CONFIG_SYSCTL | ||
440 | |||
441 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | ||
442 | { | ||
443 | unsigned char reason = get_nmi_reason(); | ||
444 | char buf[64]; | ||
445 | |||
446 | if (!(reason & 0xc0)) { | ||
447 | sprintf(buf, "NMI received for unknown reason %02x\n", reason); | ||
448 | die_nmi(buf,regs); | ||
449 | } | ||
450 | return 0; | ||
451 | } | ||
452 | |||
453 | /* | ||
454 | * proc handler for /proc/sys/kernel/unknown_nmi_panic | ||
455 | */ | ||
456 | int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, | ||
457 | void __user *buffer, size_t *length, loff_t *ppos) | ||
458 | { | ||
459 | int old_state; | ||
460 | |||
461 | old_state = unknown_nmi_panic; | ||
462 | proc_dointvec(table, write, file, buffer, length, ppos); | ||
463 | if (!!old_state == !!unknown_nmi_panic) | ||
464 | return 0; | ||
465 | |||
466 | if (unknown_nmi_panic) { | ||
467 | if (reserve_lapic_nmi() < 0) { | ||
468 | unknown_nmi_panic = 0; | ||
469 | return -EBUSY; | ||
470 | } else { | ||
471 | set_nmi_callback(unknown_nmi_panic_callback); | ||
472 | } | ||
473 | } else { | ||
474 | release_lapic_nmi(); | ||
475 | unset_nmi_callback(); | ||
476 | } | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | #endif | ||
481 | |||
482 | EXPORT_SYMBOL(nmi_active); | ||
483 | EXPORT_SYMBOL(nmi_watchdog); | ||
484 | EXPORT_SYMBOL(reserve_lapic_nmi); | ||
485 | EXPORT_SYMBOL(release_lapic_nmi); | ||
486 | EXPORT_SYMBOL(disable_timer_nmi_watchdog); | ||
487 | EXPORT_SYMBOL(enable_timer_nmi_watchdog); | ||
488 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c new file mode 100644 index 000000000000..cab471cf3edb --- /dev/null +++ b/arch/x86_64/kernel/pci-dma.c | |||
@@ -0,0 +1,60 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support. | ||
3 | */ | ||
4 | |||
5 | #include <linux/types.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/pci.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <asm/io.h> | ||
11 | |||
12 | /* Map a set of buffers described by scatterlist in streaming | ||
13 | * mode for DMA. This is the scatter-gather version of the | ||
14 | * above pci_map_single interface. Here the scatter gather list | ||
15 | * elements are each tagged with the appropriate dma address | ||
16 | * and length. They are obtained via sg_dma_{address,length}(SG). | ||
17 | * | ||
18 | * NOTE: An implementation may be able to use a smaller number of | ||
19 | * DMA address/length pairs than there are SG table elements. | ||
20 | * (for example via virtual mapping capabilities) | ||
21 | * The routine returns the number of addr/length pairs actually | ||
22 | * used, at most nents. | ||
23 | * | ||
24 | * Device ownership issues as mentioned above for pci_map_single are | ||
25 | * the same here. | ||
26 | */ | ||
27 | int dma_map_sg(struct device *hwdev, struct scatterlist *sg, | ||
28 | int nents, int direction) | ||
29 | { | ||
30 | int i; | ||
31 | |||
32 | BUG_ON(direction == DMA_NONE); | ||
33 | for (i = 0; i < nents; i++ ) { | ||
34 | struct scatterlist *s = &sg[i]; | ||
35 | BUG_ON(!s->page); | ||
36 | s->dma_address = virt_to_bus(page_address(s->page) +s->offset); | ||
37 | s->dma_length = s->length; | ||
38 | } | ||
39 | return nents; | ||
40 | } | ||
41 | |||
42 | EXPORT_SYMBOL(dma_map_sg); | ||
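A typical caller fills a scatterlist, maps it, and then walks the returned entries with sg_dma_address()/sg_dma_length() to program its hardware. The sketch below is hypothetical driver code, not part of this file:

/* Hypothetical caller of dma_map_sg(); illustrative only. */
static void example_start_io(struct device *dev, struct scatterlist *sg,
			     int nents)
{
	int i, mapped = dma_map_sg(dev, sg, nents, DMA_FROM_DEVICE);

	for (i = 0; i < mapped; i++)
		printk(KERN_DEBUG "seg %d: bus %#llx len %u\n", i,
		       (unsigned long long)sg_dma_address(&sg[i]),
		       sg_dma_length(&sg[i]));
	/* ... program one hardware descriptor per (bus, len) pair ... */
}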
43 | |||
44 | /* Unmap a set of streaming mode DMA translations. | ||
45 | * Again, cpu read rules concerning calls here are the same as for | ||
46 | * pci_unmap_single() above. | ||
47 | */ | ||
48 | void dma_unmap_sg(struct device *dev, struct scatterlist *sg, | ||
49 | int nents, int dir) | ||
50 | { | ||
51 | int i; | ||
52 | for (i = 0; i < nents; i++) { | ||
53 | struct scatterlist *s = &sg[i]; | ||
54 | BUG_ON(s->page == NULL); | ||
55 | BUG_ON(s->dma_address == 0); | ||
56 | dma_unmap_single(dev, s->dma_address, s->dma_length, dir); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | EXPORT_SYMBOL(dma_unmap_sg); | ||
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c new file mode 100644 index 000000000000..57f35c68aa34 --- /dev/null +++ b/arch/x86_64/kernel/pci-gart.c | |||
@@ -0,0 +1,980 @@ | |||
1 | /* | ||
2 | * Dynamic DMA mapping support for AMD Hammer. | ||
3 | * | ||
4 | * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. | ||
5 | * This makes it possible to use PCI devices that only support 32-bit | ||
6 | * addresses on systems with more than 4GB of memory. | ||
7 | * | ||
8 | * See Documentation/DMA-mapping.txt for the interface specification. | ||
9 | * | ||
10 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/ctype.h> | ||
16 | #include <linux/agp_backend.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/spinlock.h> | ||
21 | #include <linux/pci.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/topology.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/bitops.h> | ||
26 | #include <asm/atomic.h> | ||
27 | #include <asm/io.h> | ||
28 | #include <asm/mtrr.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/cacheflush.h> | ||
32 | #include <asm/kdebug.h> | ||
33 | |||
34 | dma_addr_t bad_dma_address; | ||
35 | |||
36 | unsigned long iommu_bus_base; /* GART remapping area (physical) */ | ||
37 | static unsigned long iommu_size; /* size of remapping area bytes */ | ||
38 | static unsigned long iommu_pages; /* .. and in pages */ | ||
39 | |||
40 | u32 *iommu_gatt_base; /* Remapping table */ | ||
41 | |||
42 | int no_iommu; | ||
43 | static int no_agp; | ||
44 | #ifdef CONFIG_IOMMU_DEBUG | ||
45 | int panic_on_overflow = 1; | ||
46 | int force_iommu = 1; | ||
47 | #else | ||
48 | int panic_on_overflow = 0; | ||
49 | int force_iommu = 0; | ||
50 | #endif | ||
51 | int iommu_merge = 1; | ||
52 | int iommu_sac_force = 0; | ||
53 | |||
54 | /* If this is disabled the IOMMU will use an optimized flushing strategy | ||
55 | of only flushing when a mapping is reused. With it true the GART is flushed | ||
56 | for every mapping. Problem is that doing the lazy flush seems to trigger | ||
57 | bugs with some popular PCI cards, in particular 3ware (but it has also | ||
58 | been seen with Qlogic at least). */ | ||
59 | int iommu_fullflush = 1; | ||
60 | |||
61 | /* This tells the BIO block layer to assume merging. Default to off | ||
62 | because we cannot guarantee merging later. */ | ||
63 | int iommu_bio_merge = 0; | ||
64 | |||
65 | #define MAX_NB 8 | ||
66 | |||
67 | /* Allocation bitmap for the remapping area */ | ||
68 | static DEFINE_SPINLOCK(iommu_bitmap_lock); | ||
69 | static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ | ||
70 | |||
71 | static u32 gart_unmapped_entry; | ||
72 | |||
73 | #define GPTE_VALID 1 | ||
74 | #define GPTE_COHERENT 2 | ||
75 | #define GPTE_ENCODE(x) \ | ||
76 | (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) | ||
77 | #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) | ||
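The GPTE_ENCODE()/GPTE_DECODE() pair above packs a physical page address into a 32-bit GART PTE: bits 12-31 stay in place, bits 32-39 of the address are tucked into PTE bits 4-11, and the two low bits mark the entry valid and coherent. A worked example with an illustrative 36-bit address:

/* Worked example (illustrative address):
 *   phys = 0x123456000
 *   GPTE_ENCODE(phys) = (phys & 0xfffff000)        -> 0x23456000
 *                     | ((phys >> 32) << 4)        -> 0x00000010
 *                     | GPTE_VALID | GPTE_COHERENT -> 0x00000003
 *                     = 0x23456013
 *   GPTE_DECODE(0x23456013) = 0x123456000          (round-trips)
 */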
78 | |||
79 | #define to_pages(addr,size) \ | ||
80 | (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) | ||
81 | |||
82 | #define for_all_nb(dev) \ | ||
83 | dev = NULL; \ | ||
84 | while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\ | ||
85 | if (dev->bus->number == 0 && \ | ||
86 | (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31)) | ||
87 | |||
88 | static struct pci_dev *northbridges[MAX_NB]; | ||
89 | static u32 northbridge_flush_word[MAX_NB]; | ||
90 | |||
91 | #define EMERGENCY_PAGES 32 /* = 128KB */ | ||
92 | |||
93 | #ifdef CONFIG_AGP | ||
94 | #define AGPEXTERN extern | ||
95 | #else | ||
96 | #define AGPEXTERN | ||
97 | #endif | ||
98 | |||
99 | /* backdoor interface to AGP driver */ | ||
100 | AGPEXTERN int agp_memory_reserved; | ||
101 | AGPEXTERN __u32 *agp_gatt_table; | ||
102 | |||
103 | static unsigned long next_bit; /* protected by iommu_bitmap_lock */ | ||
104 | static int need_flush; /* global flush state. set for each gart wrap */ | ||
105 | static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, | ||
106 | size_t size, int dir, int do_panic); | ||
107 | |||
108 | /* Dummy device used for NULL arguments (normally ISA). A smaller DMA mask | ||
109 | would probably be better, but this is bug-to-bug compatible with i386. */ | ||
110 | static struct device fallback_dev = { | ||
111 | .bus_id = "fallback device", | ||
112 | .coherent_dma_mask = 0xffffffff, | ||
113 | .dma_mask = &fallback_dev.coherent_dma_mask, | ||
114 | }; | ||
115 | |||
116 | static unsigned long alloc_iommu(int size) | ||
117 | { | ||
118 | unsigned long offset, flags; | ||
119 | |||
120 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
121 | offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); | ||
122 | if (offset == -1) { | ||
123 | need_flush = 1; | ||
124 | offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); | ||
125 | } | ||
126 | if (offset != -1) { | ||
127 | set_bit_string(iommu_gart_bitmap, offset, size); | ||
128 | next_bit = offset+size; | ||
129 | if (next_bit >= iommu_pages) { | ||
130 | next_bit = 0; | ||
131 | need_flush = 1; | ||
132 | } | ||
133 | } | ||
134 | if (iommu_fullflush) | ||
135 | need_flush = 1; | ||
136 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
137 | return offset; | ||
138 | } | ||
139 | |||
140 | static void free_iommu(unsigned long offset, int size) | ||
141 | { | ||
142 | unsigned long flags; | ||
143 | if (size == 1) { | ||
144 | clear_bit(offset, iommu_gart_bitmap); | ||
145 | return; | ||
146 | } | ||
147 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
148 | __clear_bit_string(iommu_gart_bitmap, offset, size); | ||
149 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * Use global flush state to avoid races with multiple flushers. | ||
154 | */ | ||
155 | static void flush_gart(struct device *dev) | ||
156 | { | ||
157 | unsigned long flags; | ||
158 | int flushed = 0; | ||
159 | int i, max; | ||
160 | |||
161 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | ||
162 | if (need_flush) { | ||
163 | max = 0; | ||
164 | for (i = 0; i < MAX_NB; i++) { | ||
165 | if (!northbridges[i]) | ||
166 | continue; | ||
167 | pci_write_config_dword(northbridges[i], 0x9c, | ||
168 | northbridge_flush_word[i] | 1); | ||
169 | flushed++; | ||
170 | max = i; | ||
171 | } | ||
172 | for (i = 0; i <= max; i++) { | ||
173 | u32 w; | ||
174 | if (!northbridges[i]) | ||
175 | continue; | ||
176 | /* Make sure the hardware actually executed the flush. */ | ||
177 | do { | ||
178 | pci_read_config_dword(northbridges[i], 0x9c, &w); | ||
179 | } while (w & 1); | ||
180 | } | ||
181 | if (!flushed) | ||
182 | printk("nothing to flush?\n"); | ||
183 | need_flush = 0; | ||
184 | } | ||
185 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | ||
186 | } | ||
187 | |||
188 | /* Allocate DMA memory on node near device */ | ||
189 | noinline | ||
190 | static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) | ||
191 | { | ||
192 | struct page *page; | ||
193 | int node; | ||
194 | if (dev->bus == &pci_bus_type) { | ||
195 | cpumask_t mask; | ||
196 | mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); | ||
197 | node = cpu_to_node(first_cpu(mask)); | ||
198 | } else | ||
199 | node = numa_node_id(); | ||
200 | page = alloc_pages_node(node, gfp, order); | ||
201 | return page ? page_address(page) : NULL; | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * Allocate memory for a coherent mapping. | ||
206 | */ | ||
207 | void * | ||
208 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | ||
209 | unsigned gfp) | ||
210 | { | ||
211 | void *memory; | ||
212 | unsigned long dma_mask = 0; | ||
213 | u64 bus; | ||
214 | |||
215 | if (!dev) | ||
216 | dev = &fallback_dev; | ||
217 | dma_mask = dev->coherent_dma_mask; | ||
218 | if (dma_mask == 0) | ||
219 | dma_mask = 0xffffffff; | ||
220 | |||
221 | /* Kludge to make it bug-to-bug compatible with i386. i386 | ||
222 | uses the normal dma_mask for alloc_coherent. */ | ||
223 | dma_mask &= *dev->dma_mask; | ||
224 | |||
225 | again: | ||
226 | memory = dma_alloc_pages(dev, gfp, get_order(size)); | ||
227 | if (memory == NULL) | ||
228 | return NULL; | ||
229 | |||
230 | { | ||
231 | int high, mmu; | ||
232 | bus = virt_to_bus(memory); | ||
233 | high = (bus + size) >= dma_mask; | ||
234 | mmu = high; | ||
235 | if (force_iommu && !(gfp & GFP_DMA)) | ||
236 | mmu = 1; | ||
237 | if (no_iommu || dma_mask < 0xffffffffUL) { | ||
238 | if (high) { | ||
239 | free_pages((unsigned long)memory, | ||
240 | get_order(size)); | ||
241 | |||
242 | if (swiotlb) { | ||
243 | return | ||
244 | swiotlb_alloc_coherent(dev, size, | ||
245 | dma_handle, | ||
246 | gfp); | ||
247 | } | ||
248 | |||
249 | if (!(gfp & GFP_DMA)) { | ||
250 | gfp |= GFP_DMA; | ||
251 | goto again; | ||
252 | } | ||
253 | return NULL; | ||
254 | } | ||
255 | mmu = 0; | ||
256 | } | ||
257 | memset(memory, 0, size); | ||
258 | if (!mmu) { | ||
259 | *dma_handle = virt_to_bus(memory); | ||
260 | return memory; | ||
261 | } | ||
262 | } | ||
263 | |||
264 | *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0); | ||
265 | if (*dma_handle == bad_dma_address) | ||
266 | goto error; | ||
267 | flush_gart(dev); | ||
268 | return memory; | ||
269 | |||
270 | error: | ||
271 | if (panic_on_overflow) | ||
272 | panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size); | ||
273 | free_pages((unsigned long)memory, get_order(size)); | ||
274 | return NULL; | ||
275 | } | ||
276 | |||
277 | /* | ||
278 | * Unmap coherent memory. | ||
279 | * The caller must ensure that the device has finished accessing the mapping. | ||
280 | */ | ||
281 | void dma_free_coherent(struct device *dev, size_t size, | ||
282 | void *vaddr, dma_addr_t bus) | ||
283 | { | ||
284 | if (swiotlb) { | ||
285 | swiotlb_free_coherent(dev, size, vaddr, bus); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | dma_unmap_single(dev, bus, size, 0); | ||
290 | free_pages((unsigned long)vaddr, get_order(size)); | ||
291 | } | ||
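As a usage note (hypothetical driver code, not part of this patch, assuming the usual <linux/dma-mapping.h> declarations): a driver allocates a device-visible buffer with dma_alloc_coherent() and frees it with dma_free_coherent(), keeping both the CPU pointer and the bus address. The structure and function names below are made up for illustration.

	struct example_ring {
		void		*cpu_addr;	/* CPU virtual address */
		dma_addr_t	bus_addr;	/* address programmed into the device */
		size_t		bytes;
	};

	static int example_ring_alloc(struct device *dev, struct example_ring *r,
				      size_t bytes)
	{
		r->bytes    = bytes;
		r->cpu_addr = dma_alloc_coherent(dev, bytes, &r->bus_addr, GFP_KERNEL);
		if (!r->cpu_addr)
			return -ENOMEM;
		/* hand r->bus_addr to the device here (device specific) */
		return 0;
	}

	static void example_ring_free(struct device *dev, struct example_ring *r)
	{
		dma_free_coherent(dev, r->bytes, r->cpu_addr, r->bus_addr);
	}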
292 | |||
293 | #ifdef CONFIG_IOMMU_LEAK | ||
294 | |||
295 | #define SET_LEAK(x) if (iommu_leak_tab) \ | ||
296 | iommu_leak_tab[x] = __builtin_return_address(0); | ||
297 | #define CLEAR_LEAK(x) if (iommu_leak_tab) \ | ||
298 | iommu_leak_tab[x] = NULL; | ||
299 | |||
300 | /* Debugging aid for drivers that don't free their IOMMU tables */ | ||
301 | static void **iommu_leak_tab; | ||
302 | static int leak_trace; | ||
303 | int iommu_leak_pages = 20; | ||
304 | void dump_leak(void) | ||
305 | { | ||
306 | int i; | ||
307 | static int dump; | ||
308 | if (dump || !iommu_leak_tab) return; | ||
309 | dump = 1; | ||
310 | show_stack(NULL,NULL); | ||
311 | /* Very crude. dump some from the end of the table too */ | ||
312 | printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); | ||
313 | for (i = 0; i < iommu_leak_pages; i+=2) { | ||
314 | printk("%lu: ", iommu_pages-i); | ||
315 | printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); | ||
316 | printk("%c", (i+1)%2 == 0 ? '\n' : ' '); | ||
317 | } | ||
318 | printk("\n"); | ||
319 | } | ||
320 | #else | ||
321 | #define SET_LEAK(x) | ||
322 | #define CLEAR_LEAK(x) | ||
323 | #endif | ||
324 | |||
325 | static void iommu_full(struct device *dev, size_t size, int dir, int do_panic) | ||
326 | { | ||
327 | /* | ||
328 | * Ran out of IOMMU space for this operation. This is very bad. | ||
329 | * Unfortunately the drivers cannot handle this operation properly. | ||
330 | * Return some unmapped, pre-reserved space in the aperture and | ||
331 | * let the Northbridge deal with it. This will result in garbage | ||
332 | * in the IO operation. When the size exceeds the pre-reserved space, | ||
333 | * memory corruption will occur or random memory will be DMAed | ||
334 | * out. Hopefully no network devices use single mappings that big. | ||
335 | */ | ||
336 | |||
337 | printk(KERN_ERR | ||
338 | "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", | ||
339 | size, dev->bus_id); | ||
340 | |||
341 | if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) { | ||
342 | if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
343 | panic("PCI-DMA: Memory would be corrupted\n"); | ||
344 | if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) | ||
345 | panic("PCI-DMA: Random memory would be DMAed\n"); | ||
346 | } | ||
347 | |||
348 | #ifdef CONFIG_IOMMU_LEAK | ||
349 | dump_leak(); | ||
350 | #endif | ||
351 | } | ||
352 | |||
353 | static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) | ||
354 | { | ||
355 | u64 mask = *dev->dma_mask; | ||
356 | int high = addr + size >= mask; | ||
357 | int mmu = high; | ||
358 | if (force_iommu) | ||
359 | mmu = 1; | ||
360 | if (no_iommu) { | ||
361 | if (high) | ||
362 | panic("PCI-DMA: high address but no IOMMU.\n"); | ||
363 | mmu = 0; | ||
364 | } | ||
365 | return mmu; | ||
366 | } | ||
367 | |||
368 | static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | ||
369 | { | ||
370 | u64 mask = *dev->dma_mask; | ||
371 | int high = addr + size >= mask; | ||
372 | int mmu = high; | ||
373 | if (no_iommu) { | ||
374 | if (high) | ||
375 | panic("PCI-DMA: high address but no IOMMU.\n"); | ||
376 | mmu = 0; | ||
377 | } | ||
378 | return mmu; | ||
379 | } | ||
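In other words, a mapping goes through the GART when the buffer lies beyond what the device's dma_mask can address (or when force_iommu is set). A trivial stand-alone illustration of that test, with made-up addresses:

	#include <stdio.h>
	#include <stdint.h>

	/* the same comparison used by need_iommu()/nonforced_iommu() above,
	   minus the force_iommu/no_iommu knobs */
	static int needs_gart(uint64_t dma_mask, uint64_t addr, uint64_t size)
	{
		return addr + size >= dma_mask;
	}

	int main(void)
	{
		uint64_t mask32 = 0xffffffffULL;	/* a 32-bit-only device */

		printf("%d\n", needs_gart(mask32, 0x120000000ULL, 4096)); /* 1: above 4GB */
		printf("%d\n", needs_gart(mask32, 0x7fff0000ULL, 4096));  /* 0: reachable */
		return 0;
	}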
380 | |||
381 | /* Map a single contiguous physical area into the IOMMU. | ||
382 | * Caller needs to check if the iommu is needed and flush. | ||
383 | */ | ||
384 | static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, | ||
385 | size_t size, int dir, int do_panic) | ||
386 | { | ||
387 | unsigned long npages = to_pages(phys_mem, size); | ||
388 | unsigned long iommu_page = alloc_iommu(npages); | ||
389 | int i; | ||
390 | if (iommu_page == -1) { | ||
391 | if (!nonforced_iommu(dev, phys_mem, size)) | ||
392 | return phys_mem; | ||
393 | if (panic_on_overflow) | ||
394 | panic("dma_map_area overflow %lu bytes\n", size); | ||
395 | iommu_full(dev, size, dir, do_panic); | ||
396 | return bad_dma_address; | ||
397 | } | ||
398 | |||
399 | for (i = 0; i < npages; i++) { | ||
400 | iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); | ||
401 | SET_LEAK(iommu_page + i); | ||
402 | phys_mem += PAGE_SIZE; | ||
403 | } | ||
404 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); | ||
405 | } | ||
406 | |||
407 | /* Map a single area into the IOMMU */ | ||
408 | dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir) | ||
409 | { | ||
410 | unsigned long phys_mem, bus; | ||
411 | |||
412 | BUG_ON(dir == DMA_NONE); | ||
413 | |||
414 | if (swiotlb) | ||
415 | return swiotlb_map_single(dev,addr,size,dir); | ||
416 | if (!dev) | ||
417 | dev = &fallback_dev; | ||
418 | |||
419 | phys_mem = virt_to_phys(addr); | ||
420 | if (!need_iommu(dev, phys_mem, size)) | ||
421 | return phys_mem; | ||
422 | |||
423 | bus = dma_map_area(dev, phys_mem, size, dir, 1); | ||
424 | flush_gart(dev); | ||
425 | return bus; | ||
426 | } | ||
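A hedged usage sketch (hypothetical driver code): streaming DMA maps an existing kernel buffer only for the duration of one transfer. bad_dma_address is the error marker this file returns on IOMMU overflow; the buffer, length and direction below are illustrative.

	static int example_send(struct device *dev, void *buf, size_t len)
	{
		dma_addr_t bus = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

		if (bus == bad_dma_address)
			return -EIO;	/* mapping failed (IOMMU overflow) */

		/* ... start the device DMA from 'bus' and wait for it to finish ... */

		dma_unmap_single(dev, bus, len, DMA_TO_DEVICE);
		return 0;
	}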
427 | |||
428 | /* Fallback for dma_map_sg in case of overflow */ | ||
429 | static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, | ||
430 | int nents, int dir) | ||
431 | { | ||
432 | int i; | ||
433 | |||
434 | #ifdef CONFIG_IOMMU_DEBUG | ||
435 | printk(KERN_DEBUG "dma_map_sg overflow\n"); | ||
436 | #endif | ||
437 | |||
438 | for (i = 0; i < nents; i++ ) { | ||
439 | struct scatterlist *s = &sg[i]; | ||
440 | unsigned long addr = page_to_phys(s->page) + s->offset; | ||
441 | if (nonforced_iommu(dev, addr, s->length)) { | ||
442 | addr = dma_map_area(dev, addr, s->length, dir, 0); | ||
443 | if (addr == bad_dma_address) { | ||
444 | if (i > 0) | ||
445 | dma_unmap_sg(dev, sg, i, dir); | ||
446 | nents = 0; | ||
447 | sg[0].dma_length = 0; | ||
448 | break; | ||
449 | } | ||
450 | } | ||
451 | s->dma_address = addr; | ||
452 | s->dma_length = s->length; | ||
453 | } | ||
454 | flush_gart(dev); | ||
455 | return nents; | ||
456 | } | ||
457 | |||
458 | /* Map multiple scatterlist entries contiguously into the first. */ | ||
459 | static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
460 | struct scatterlist *sout, unsigned long pages) | ||
461 | { | ||
462 | unsigned long iommu_start = alloc_iommu(pages); | ||
463 | unsigned long iommu_page = iommu_start; | ||
464 | int i; | ||
465 | |||
466 | if (iommu_start == -1) | ||
467 | return -1; | ||
468 | |||
469 | for (i = start; i < stopat; i++) { | ||
470 | struct scatterlist *s = &sg[i]; | ||
471 | unsigned long pages, addr; | ||
472 | unsigned long phys_addr = s->dma_address; | ||
473 | |||
474 | BUG_ON(i > start && s->offset); | ||
475 | if (i == start) { | ||
476 | *sout = *s; | ||
477 | sout->dma_address = iommu_bus_base; | ||
478 | sout->dma_address += iommu_page*PAGE_SIZE + s->offset; | ||
479 | sout->dma_length = s->length; | ||
480 | } else { | ||
481 | sout->dma_length += s->length; | ||
482 | } | ||
483 | |||
484 | addr = phys_addr; | ||
485 | pages = to_pages(s->offset, s->length); | ||
486 | while (pages--) { | ||
487 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | ||
488 | SET_LEAK(iommu_page); | ||
489 | addr += PAGE_SIZE; | ||
490 | iommu_page++; | ||
491 | } | ||
492 | } | ||
493 | BUG_ON(iommu_page - iommu_start != pages); | ||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, | ||
498 | struct scatterlist *sout, | ||
499 | unsigned long pages, int need) | ||
500 | { | ||
501 | if (!need) { | ||
502 | BUG_ON(stopat - start != 1); | ||
503 | *sout = sg[start]; | ||
504 | sout->dma_length = sg[start].length; | ||
505 | return 0; | ||
506 | } | ||
507 | return __dma_map_cont(sg, start, stopat, sout, pages); | ||
508 | } | ||
509 | |||
510 | /* | ||
511 | * DMA map all entries in a scatterlist. | ||
512 | * Merge chunks that have page-aligned sizes into a contiguous mapping. | ||
513 | */ | ||
514 | int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | ||
515 | { | ||
516 | int i; | ||
517 | int out; | ||
518 | int start; | ||
519 | unsigned long pages = 0; | ||
520 | int need = 0, nextneed; | ||
521 | |||
522 | BUG_ON(dir == DMA_NONE); | ||
523 | if (nents == 0) | ||
524 | return 0; | ||
525 | |||
526 | if (swiotlb) | ||
527 | return swiotlb_map_sg(dev,sg,nents,dir); | ||
528 | if (!dev) | ||
529 | dev = &fallback_dev; | ||
530 | |||
531 | out = 0; | ||
532 | start = 0; | ||
533 | for (i = 0; i < nents; i++) { | ||
534 | struct scatterlist *s = &sg[i]; | ||
535 | dma_addr_t addr = page_to_phys(s->page) + s->offset; | ||
536 | s->dma_address = addr; | ||
537 | BUG_ON(s->length == 0); | ||
538 | |||
539 | nextneed = need_iommu(dev, addr, s->length); | ||
540 | |||
541 | /* Handle the previous not yet processed entries */ | ||
542 | if (i > start) { | ||
543 | struct scatterlist *ps = &sg[i-1]; | ||
544 | /* Can only merge when the last chunk ends on a page | ||
545 | boundary and the new one doesn't have an offset. */ | ||
546 | if (!iommu_merge || !nextneed || !need || s->offset || | ||
547 | (ps->offset + ps->length) % PAGE_SIZE) { | ||
548 | if (dma_map_cont(sg, start, i, sg+out, pages, | ||
549 | need) < 0) | ||
550 | goto error; | ||
551 | out++; | ||
552 | pages = 0; | ||
553 | start = i; | ||
554 | } | ||
555 | } | ||
556 | |||
557 | need = nextneed; | ||
558 | pages += to_pages(s->offset, s->length); | ||
559 | } | ||
560 | if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) | ||
561 | goto error; | ||
562 | out++; | ||
563 | flush_gart(dev); | ||
564 | if (out < nents) | ||
565 | sg[out].dma_length = 0; | ||
566 | return out; | ||
567 | |||
568 | error: | ||
569 | flush_gart(NULL); | ||
570 | dma_unmap_sg(dev, sg, nents, dir); | ||
571 | /* If it was forced, try again unforced */ | ||
572 | if (force_iommu) | ||
573 | return dma_map_sg_nonforce(dev, sg, nents, dir); | ||
574 | if (panic_on_overflow) | ||
575 | panic("dma_map_sg: overflow on %lu pages\n", pages); | ||
576 | iommu_full(dev, pages << PAGE_SHIFT, dir, 0); | ||
577 | for (i = 0; i < nents; i++) | ||
578 | sg[i].dma_address = bad_dma_address; | ||
579 | return 0; | ||
580 | } | ||
581 | |||
582 | /* | ||
583 | * Free a DMA mapping. | ||
584 | */ | ||
585 | void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, | ||
586 | size_t size, int direction) | ||
587 | { | ||
588 | unsigned long iommu_page; | ||
589 | int npages; | ||
590 | int i; | ||
591 | |||
592 | if (swiotlb) { | ||
593 | swiotlb_unmap_single(dev,dma_addr,size,direction); | ||
594 | return; | ||
595 | } | ||
596 | |||
597 | if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || | ||
598 | dma_addr >= iommu_bus_base + iommu_size) | ||
599 | return; | ||
600 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | ||
601 | npages = to_pages(dma_addr, size); | ||
602 | for (i = 0; i < npages; i++) { | ||
603 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | ||
604 | CLEAR_LEAK(iommu_page + i); | ||
605 | } | ||
606 | free_iommu(iommu_page, npages); | ||
607 | } | ||
608 | |||
609 | /* | ||
610 | * Wrapper for pci_unmap_single working with scatterlists. | ||
611 | */ | ||
612 | void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | ||
613 | { | ||
614 | int i; | ||
615 | if (swiotlb) { | ||
616 | swiotlb_unmap_sg(dev,sg,nents,dir); | ||
617 | return; | ||
618 | } | ||
619 | for (i = 0; i < nents; i++) { | ||
620 | struct scatterlist *s = &sg[i]; | ||
621 | if (!s->dma_length || !s->length) | ||
622 | break; | ||
623 | dma_unmap_single(dev, s->dma_address, s->dma_length, dir); | ||
624 | } | ||
625 | } | ||
626 | |||
627 | int dma_supported(struct device *dev, u64 mask) | ||
628 | { | ||
629 | /* Copied from i386. Doesn't make much sense, because it will | ||
630 | only work for pci_alloc_coherent. | ||
631 | The caller just has to use GFP_DMA in this case. */ | ||
632 | if (mask < 0x00ffffff) | ||
633 | return 0; | ||
634 | |||
635 | /* Tell the device to use SAC when IOMMU force is on. | ||
636 | This allows the driver to use cheaper accesses in some cases. | ||
637 | |||
638 | The problem with this is that if we overflow the IOMMU area | ||
639 | and return DAC as a fallback address, the device may not handle it correctly. | ||
640 | |||
641 | As a special case some controllers have a 39bit address mode | ||
642 | that is as efficient as 32bit (aic79xx). Don't force SAC for these. | ||
643 | Assume all masks <= 40 bits are of this type. Normally this doesn't | ||
644 | make any difference, but gives more gentle handling of IOMMU overflow. */ | ||
645 | if (iommu_sac_force && (mask >= 0xffffffffffULL)) { | ||
646 | printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); | ||
647 | return 0; | ||
648 | } | ||
649 | |||
650 | return 1; | ||
651 | } | ||
652 | |||
653 | int dma_get_cache_alignment(void) | ||
654 | { | ||
655 | return boot_cpu_data.x86_clflush_size; | ||
656 | } | ||
657 | |||
658 | EXPORT_SYMBOL(dma_unmap_sg); | ||
659 | EXPORT_SYMBOL(dma_map_sg); | ||
660 | EXPORT_SYMBOL(dma_map_single); | ||
661 | EXPORT_SYMBOL(dma_unmap_single); | ||
662 | EXPORT_SYMBOL(dma_supported); | ||
663 | EXPORT_SYMBOL(no_iommu); | ||
664 | EXPORT_SYMBOL(force_iommu); | ||
665 | EXPORT_SYMBOL(bad_dma_address); | ||
666 | EXPORT_SYMBOL(iommu_bio_merge); | ||
667 | EXPORT_SYMBOL(iommu_sac_force); | ||
668 | EXPORT_SYMBOL(dma_get_cache_alignment); | ||
669 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
670 | EXPORT_SYMBOL(dma_free_coherent); | ||
671 | |||
672 | static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) | ||
673 | { | ||
674 | unsigned long a; | ||
675 | if (!iommu_size) { | ||
676 | iommu_size = aper_size; | ||
677 | if (!no_agp) | ||
678 | iommu_size /= 2; | ||
679 | } | ||
680 | |||
681 | a = aper + iommu_size; | ||
682 | iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; | ||
683 | |||
684 | if (iommu_size < 64*1024*1024) | ||
685 | printk(KERN_WARNING | ||
686 | "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); | ||
687 | |||
688 | return iommu_size; | ||
689 | } | ||
690 | |||
691 | static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) | ||
692 | { | ||
693 | unsigned aper_size = 0, aper_base_32; | ||
694 | u64 aper_base; | ||
695 | unsigned aper_order; | ||
696 | |||
697 | pci_read_config_dword(dev, 0x94, &aper_base_32); | ||
698 | pci_read_config_dword(dev, 0x90, &aper_order); | ||
699 | aper_order = (aper_order >> 1) & 7; | ||
700 | |||
701 | aper_base = aper_base_32 & 0x7fff; | ||
702 | aper_base <<= 25; | ||
703 | |||
704 | aper_size = (32 * 1024 * 1024) << aper_order; | ||
705 | if (aper_base + aper_size >= 0xffffffff || !aper_size) | ||
706 | aper_base = 0; | ||
707 | |||
708 | *size = aper_size; | ||
709 | return aper_base; | ||
710 | } | ||
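read_aperture() decodes two K8 northbridge config registers: bits 1-3 of register 0x90 give the aperture order (size = 32MB << order) and the low bits of register 0x94 give the base in 32MB units, hence the shift by 25. A worked stand-alone example with made-up register values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int reg90 = 0x3;	/* order field in bits 1-3 -> order 1 */
		unsigned int reg94 = 0x40;	/* base field, in units of 32MB */

		unsigned int order = (reg90 >> 1) & 7;
		unsigned long long base = (unsigned long long)(reg94 & 0x7fff) << 25;
		unsigned long long size = (32ULL << 20) << order;

		/* prints: base 0x80000000 size 64MB */
		printf("base %#llx size %lluMB\n", base, size >> 20);
		return 0;
	}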
711 | |||
712 | /* | ||
713 | * Private Northbridge GATT initialization in case we cannot use the | ||
714 | * AGP driver for some reason. | ||
715 | */ | ||
716 | static __init int init_k8_gatt(struct agp_kern_info *info) | ||
717 | { | ||
718 | struct pci_dev *dev; | ||
719 | void *gatt; | ||
720 | unsigned aper_base, new_aper_base; | ||
721 | unsigned aper_size, gatt_size, new_aper_size; | ||
722 | |||
723 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); | ||
724 | aper_size = aper_base = info->aper_size = 0; | ||
725 | for_all_nb(dev) { | ||
726 | new_aper_base = read_aperture(dev, &new_aper_size); | ||
727 | if (!new_aper_base) | ||
728 | goto nommu; | ||
729 | |||
730 | if (!aper_base) { | ||
731 | aper_size = new_aper_size; | ||
732 | aper_base = new_aper_base; | ||
733 | } | ||
734 | if (aper_size != new_aper_size || aper_base != new_aper_base) | ||
735 | goto nommu; | ||
736 | } | ||
737 | if (!aper_base) | ||
738 | goto nommu; | ||
739 | info->aper_base = aper_base; | ||
740 | info->aper_size = aper_size>>20; | ||
741 | |||
742 | gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); | ||
743 | gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); | ||
744 | if (!gatt) | ||
745 | panic("Cannot allocate GATT table"); | ||
746 | memset(gatt, 0, gatt_size); | ||
747 | agp_gatt_table = gatt; | ||
748 | |||
749 | for_all_nb(dev) { | ||
750 | u32 ctl; | ||
751 | u32 gatt_reg; | ||
752 | |||
753 | gatt_reg = __pa(gatt) >> 12; | ||
754 | gatt_reg <<= 4; | ||
755 | pci_write_config_dword(dev, 0x98, gatt_reg); | ||
756 | pci_read_config_dword(dev, 0x90, &ctl); | ||
757 | |||
758 | ctl |= 1; | ||
759 | ctl &= ~((1<<4) | (1<<5)); | ||
760 | |||
761 | pci_write_config_dword(dev, 0x90, ctl); | ||
762 | } | ||
763 | flush_gart(NULL); | ||
764 | |||
765 | printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); | ||
766 | return 0; | ||
767 | |||
768 | nommu: | ||
769 | /* Should not happen anymore */ | ||
770 | printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" | ||
771 | KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); | ||
772 | return -1; | ||
773 | } | ||
774 | |||
775 | extern int agp_amd64_init(void); | ||
776 | |||
777 | static int __init pci_iommu_init(void) | ||
778 | { | ||
779 | struct agp_kern_info info; | ||
780 | unsigned long aper_size; | ||
781 | unsigned long iommu_start; | ||
782 | struct pci_dev *dev; | ||
783 | unsigned long scratch; | ||
784 | long i; | ||
785 | |||
786 | #ifndef CONFIG_AGP_AMD64 | ||
787 | no_agp = 1; | ||
788 | #else | ||
789 | /* Makefile puts PCI initialization via subsys_initcall first. */ | ||
790 | /* Add other K8 AGP bridge drivers here */ | ||
791 | no_agp = no_agp || | ||
792 | (agp_amd64_init() < 0) || | ||
793 | (agp_copy_info(agp_bridge, &info) < 0); | ||
794 | #endif | ||
795 | |||
796 | if (swiotlb) { | ||
797 | no_iommu = 1; | ||
798 | printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); | ||
799 | return -1; | ||
800 | } | ||
801 | |||
802 | if (no_iommu || | ||
803 | (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) || | ||
804 | !iommu_aperture || | ||
805 | (no_agp && init_k8_gatt(&info) < 0)) { | ||
806 | printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); | ||
807 | no_iommu = 1; | ||
808 | return -1; | ||
809 | } | ||
810 | |||
811 | aper_size = info.aper_size * 1024 * 1024; | ||
812 | iommu_size = check_iommu_size(info.aper_base, aper_size); | ||
813 | iommu_pages = iommu_size >> PAGE_SHIFT; | ||
814 | |||
815 | iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, | ||
816 | get_order(iommu_pages/8)); | ||
817 | if (!iommu_gart_bitmap) | ||
818 | panic("Cannot allocate iommu bitmap\n"); | ||
819 | memset(iommu_gart_bitmap, 0, iommu_pages/8); | ||
820 | |||
821 | #ifdef CONFIG_IOMMU_LEAK | ||
822 | if (leak_trace) { | ||
823 | iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, | ||
824 | get_order(iommu_pages*sizeof(void *))); | ||
825 | if (iommu_leak_tab) | ||
826 | memset(iommu_leak_tab, 0, iommu_pages * sizeof(void *)); | ||
827 | else | ||
828 | printk("PCI-DMA: Cannot allocate leak trace area\n"); | ||
829 | } | ||
830 | #endif | ||
831 | |||
832 | /* | ||
833 | * Out of IOMMU space handling. | ||
834 | * Reserve some invalid pages at the beginning of the GART. | ||
835 | */ | ||
836 | set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); | ||
837 | |||
838 | agp_memory_reserved = iommu_size; | ||
839 | printk(KERN_INFO | ||
840 | "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", | ||
841 | iommu_size>>20); | ||
842 | |||
843 | iommu_start = aper_size - iommu_size; | ||
844 | iommu_bus_base = info.aper_base + iommu_start; | ||
845 | bad_dma_address = iommu_bus_base; | ||
846 | iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); | ||
847 | |||
848 | /* | ||
849 | * Unmap the IOMMU part of the GART. The alias of the page is | ||
850 | * always mapped with cache enabled and there is no full cache | ||
851 | * coherency across the GART remapping. The unmapping avoids | ||
852 | * automatic prefetches from the CPU allocating cache lines in | ||
853 | * there. All CPU accesses are done via the direct mapping to | ||
854 | * the backing memory. The GART address is only used by PCI | ||
855 | * devices. | ||
856 | */ | ||
857 | clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); | ||
858 | |||
859 | /* | ||
860 | * Try to workaround a bug (thanks to BenH) | ||
861 | * Set unmapped entries to a scratch page instead of 0. | ||
862 | * Any prefetches that hit unmapped entries won't get a bus abort | ||
863 | * then. | ||
864 | */ | ||
865 | scratch = get_zeroed_page(GFP_KERNEL); | ||
866 | if (!scratch) | ||
867 | panic("Cannot allocate iommu scratch page"); | ||
868 | gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); | ||
869 | for (i = EMERGENCY_PAGES; i < iommu_pages; i++) | ||
870 | iommu_gatt_base[i] = gart_unmapped_entry; | ||
871 | |||
872 | for_all_nb(dev) { | ||
873 | u32 flag; | ||
874 | int cpu = PCI_SLOT(dev->devfn) - 24; | ||
875 | if (cpu >= MAX_NB) | ||
876 | continue; | ||
877 | northbridges[cpu] = dev; | ||
878 | pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ | ||
879 | northbridge_flush_word[cpu] = flag; | ||
880 | } | ||
881 | |||
882 | flush_gart(NULL); | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | |||
887 | /* Must execute after PCI subsystem */ | ||
888 | fs_initcall(pci_iommu_init); | ||
889 | |||
890 | /* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] | ||
891 | [,forcesac][,fullflush][,nofullflush][,nomerge][,biomerge][,allowed][,soft][,noaperture] | ||
892 | size set size of iommu (in bytes) | ||
893 | noagp don't initialize the AGP driver and use full aperture. | ||
894 | off don't use the IOMMU | ||
895 | leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) | ||
896 | memaper[=order] allocate its own aperture over RAM with size 32MB << order. | ||
897 | noforce don't force IOMMU usage. Default. | ||
898 | force Force IOMMU. | ||
899 | merge Do lazy merging. This may improve performance on some block devices. | ||
900 | Implies force (experimental) | ||
901 | biomerge Do merging at the BIO layer. This is more efficient than merge, | ||
902 | but should only be done with very big IOMMUs. Implies merge,force. | ||
903 | nomerge Don't do SG merging. | ||
904 | forcesac Force SAC mode for masks <40bits (experimental) | ||
905 | fullflush Flush IOMMU on each allocation (default) | ||
906 | nofullflush Don't use IOMMU fullflush | ||
907 | allowed override iommu-off workarounds for specific chipsets. | ||
908 | soft Use software bounce buffering (default for Intel machines) | ||
909 | noaperture Don't touch the aperture for AGP. | ||
910 | */ | ||
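For example (values chosen purely for illustration), a boot command line combining several of the options above would force the IOMMU, enable lazy scatter-gather merging, allocate a 128MB fallback aperture (32MB << 2) and panic on overflow:

	iommu=force,merge,memaper=2,panic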
911 | __init int iommu_setup(char *p) | ||
912 | { | ||
913 | int arg; | ||
914 | |||
915 | while (*p) { | ||
916 | if (!strncmp(p,"noagp",5)) | ||
917 | no_agp = 1; | ||
918 | if (!strncmp(p,"off",3)) | ||
919 | no_iommu = 1; | ||
920 | if (!strncmp(p,"force",5)) { | ||
921 | force_iommu = 1; | ||
922 | iommu_aperture_allowed = 1; | ||
923 | } | ||
924 | if (!strncmp(p,"allowed",7)) | ||
925 | iommu_aperture_allowed = 1; | ||
926 | if (!strncmp(p,"noforce",7)) { | ||
927 | iommu_merge = 0; | ||
928 | force_iommu = 0; | ||
929 | } | ||
930 | if (!strncmp(p, "memaper", 7)) { | ||
931 | fallback_aper_force = 1; | ||
932 | p += 7; | ||
933 | if (*p == '=') { | ||
934 | ++p; | ||
935 | if (get_option(&p, &arg)) | ||
936 | fallback_aper_order = arg; | ||
937 | } | ||
938 | } | ||
939 | if (!strncmp(p, "biomerge",8)) { | ||
940 | iommu_bio_merge = 4096; | ||
941 | iommu_merge = 1; | ||
942 | force_iommu = 1; | ||
943 | } | ||
944 | if (!strncmp(p, "panic",5)) | ||
945 | panic_on_overflow = 1; | ||
946 | if (!strncmp(p, "nopanic",7)) | ||
947 | panic_on_overflow = 0; | ||
948 | if (!strncmp(p, "merge",5)) { | ||
949 | iommu_merge = 1; | ||
950 | force_iommu = 1; | ||
951 | } | ||
952 | if (!strncmp(p, "nomerge",7)) | ||
953 | iommu_merge = 0; | ||
954 | if (!strncmp(p, "forcesac",8)) | ||
955 | iommu_sac_force = 1; | ||
957 | if (!strncmp(p, "fullflush",9)) | ||
957 | iommu_fullflush = 1; | ||
958 | if (!strncmp(p, "nofullflush",11)) | ||
959 | iommu_fullflush = 0; | ||
960 | if (!strncmp(p, "soft",4)) | ||
961 | swiotlb = 1; | ||
962 | if (!strncmp(p, "noaperture",10)) | ||
963 | fix_aperture = 0; | ||
964 | #ifdef CONFIG_IOMMU_LEAK | ||
965 | if (!strncmp(p,"leak",4)) { | ||
966 | leak_trace = 1; | ||
967 | p += 4; | ||
968 | if (*p == '=') ++p; | ||
969 | if (isdigit(*p) && get_option(&p, &arg)) | ||
970 | iommu_leak_pages = arg; | ||
971 | } else | ||
972 | #endif | ||
973 | if (isdigit(*p) && get_option(&p, &arg)) | ||
974 | iommu_size = arg; | ||
975 | p += strcspn(p, ","); | ||
976 | if (*p == ',') | ||
977 | ++p; | ||
978 | } | ||
979 | return 1; | ||
980 | } | ||
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c new file mode 100644 index 000000000000..67d90b89af0b --- /dev/null +++ b/arch/x86_64/kernel/pci-nommu.c | |||
@@ -0,0 +1,94 @@ | |||
1 | /* Fallback functions when the main IOMMU code is not compiled in. This | ||
2 | code is roughly equivalent to i386. */ | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/pci.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <asm/proto.h> | ||
8 | #include <asm/processor.h> | ||
9 | |||
10 | int iommu_merge = 0; | ||
11 | EXPORT_SYMBOL(iommu_merge); | ||
12 | |||
13 | dma_addr_t bad_dma_address; | ||
14 | EXPORT_SYMBOL(bad_dma_address); | ||
15 | |||
16 | int iommu_bio_merge = 0; | ||
17 | EXPORT_SYMBOL(iommu_bio_merge); | ||
18 | |||
19 | int iommu_sac_force = 0; | ||
20 | EXPORT_SYMBOL(iommu_sac_force); | ||
21 | |||
22 | /* | ||
23 | * Dummy IO MMU functions | ||
24 | */ | ||
25 | |||
26 | void *dma_alloc_coherent(struct device *hwdev, size_t size, | ||
27 | dma_addr_t *dma_handle, unsigned gfp) | ||
28 | { | ||
29 | void *ret; | ||
30 | u64 mask; | ||
31 | int order = get_order(size); | ||
32 | |||
33 | if (hwdev) | ||
34 | mask = hwdev->coherent_dma_mask & *hwdev->dma_mask; | ||
35 | else | ||
36 | mask = 0xffffffff; | ||
37 | for (;;) { | ||
38 | ret = (void *)__get_free_pages(gfp, order); | ||
39 | if (ret == NULL) | ||
40 | return NULL; | ||
41 | *dma_handle = virt_to_bus(ret); | ||
42 | if ((*dma_handle & ~mask) == 0) | ||
43 | break; | ||
44 | free_pages((unsigned long)ret, order); | ||
45 | if (gfp & GFP_DMA) | ||
46 | return NULL; | ||
47 | gfp |= GFP_DMA; | ||
48 | } | ||
49 | |||
50 | memset(ret, 0, size); | ||
51 | return ret; | ||
52 | } | ||
53 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
54 | |||
55 | void dma_free_coherent(struct device *hwdev, size_t size, | ||
56 | void *vaddr, dma_addr_t dma_handle) | ||
57 | { | ||
58 | free_pages((unsigned long)vaddr, get_order(size)); | ||
59 | } | ||
60 | EXPORT_SYMBOL(dma_free_coherent); | ||
61 | |||
62 | int dma_supported(struct device *hwdev, u64 mask) | ||
63 | { | ||
64 | /* | ||
65 | * we fall back to GFP_DMA when the mask isn't all 1s, | ||
66 | * so we can't guarantee allocations that must be | ||
67 | * within a tighter range than GFP_DMA.. | ||
68 | * RED-PEN this won't work for pci_map_single. Caller has to | ||
69 | * use GFP_DMA in the first place. | ||
70 | */ | ||
71 | if (mask < 0x00ffffff) | ||
72 | return 0; | ||
73 | |||
74 | return 1; | ||
75 | } | ||
76 | EXPORT_SYMBOL(dma_supported); | ||
77 | |||
78 | int dma_get_cache_alignment(void) | ||
79 | { | ||
80 | return boot_cpu_data.x86_clflush_size; | ||
81 | } | ||
82 | EXPORT_SYMBOL(dma_get_cache_alignment); | ||
83 | |||
84 | static int __init check_ram(void) | ||
85 | { | ||
86 | if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { | ||
87 | printk( | ||
88 | KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n" | ||
89 | KERN_ERR "WARNING 32bit PCI may malfunction.\n"); | ||
90 | } | ||
91 | return 0; | ||
92 | } | ||
93 | __initcall(check_ram); | ||
94 | |||
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c new file mode 100644 index 000000000000..9922d2ba24a3 --- /dev/null +++ b/arch/x86_64/kernel/process.c | |||
@@ -0,0 +1,770 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/process.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Pentium III FXSR, SSE support | ||
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
8 | * | ||
9 | * X86-64 port | ||
10 | * Andi Kleen. | ||
11 | * | ||
12 | * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * This file handles the architecture-dependent parts of process handling.. | ||
17 | */ | ||
18 | |||
19 | #include <stdarg.h> | ||
20 | |||
21 | #include <linux/errno.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/elfcore.h> | ||
26 | #include <linux/smp.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/user.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/a.out.h> | ||
31 | #include <linux/interrupt.h> | ||
32 | #include <linux/delay.h> | ||
33 | #include <linux/irq.h> | ||
34 | #include <linux/ptrace.h> | ||
35 | #include <linux/utsname.h> | ||
36 | #include <linux/random.h> | ||
37 | |||
38 | #include <asm/uaccess.h> | ||
39 | #include <asm/pgtable.h> | ||
40 | #include <asm/system.h> | ||
41 | #include <asm/io.h> | ||
42 | #include <asm/processor.h> | ||
43 | #include <asm/i387.h> | ||
44 | #include <asm/mmu_context.h> | ||
45 | #include <asm/pda.h> | ||
46 | #include <asm/prctl.h> | ||
47 | #include <asm/kdebug.h> | ||
48 | #include <asm/desc.h> | ||
49 | #include <asm/proto.h> | ||
50 | #include <asm/ia32.h> | ||
51 | |||
52 | asmlinkage extern void ret_from_fork(void); | ||
53 | |||
54 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; | ||
55 | |||
56 | static atomic_t hlt_counter = ATOMIC_INIT(0); | ||
57 | |||
58 | unsigned long boot_option_idle_override = 0; | ||
59 | EXPORT_SYMBOL(boot_option_idle_override); | ||
60 | |||
61 | /* | ||
62 | * Powermanagement idle function, if any.. | ||
63 | */ | ||
64 | void (*pm_idle)(void); | ||
65 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | ||
66 | |||
67 | void disable_hlt(void) | ||
68 | { | ||
69 | atomic_inc(&hlt_counter); | ||
70 | } | ||
71 | |||
72 | EXPORT_SYMBOL(disable_hlt); | ||
73 | |||
74 | void enable_hlt(void) | ||
75 | { | ||
76 | atomic_dec(&hlt_counter); | ||
77 | } | ||
78 | |||
79 | EXPORT_SYMBOL(enable_hlt); | ||
80 | |||
81 | /* | ||
82 | * We use this if we don't have any better | ||
83 | * idle routine.. | ||
84 | */ | ||
85 | void default_idle(void) | ||
86 | { | ||
87 | if (!atomic_read(&hlt_counter)) { | ||
88 | local_irq_disable(); | ||
89 | if (!need_resched()) | ||
90 | safe_halt(); | ||
91 | else | ||
92 | local_irq_enable(); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * On SMP it's slightly faster (but much more power-consuming!) | ||
98 | * to poll the ->need_resched flag instead of waiting for the | ||
99 | * cross-CPU IPI to arrive. Use this option with caution. | ||
100 | */ | ||
101 | static void poll_idle (void) | ||
102 | { | ||
103 | int oldval; | ||
104 | |||
105 | local_irq_enable(); | ||
106 | |||
107 | /* | ||
108 | * Deal with another CPU just having chosen a thread to | ||
109 | * run here: | ||
110 | */ | ||
111 | oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); | ||
112 | |||
113 | if (!oldval) { | ||
114 | set_thread_flag(TIF_POLLING_NRFLAG); | ||
115 | asm volatile( | ||
116 | "2:" | ||
117 | "testl %0,%1;" | ||
118 | "rep; nop;" | ||
119 | "je 2b;" | ||
120 | : : | ||
121 | "i" (_TIF_NEED_RESCHED), | ||
122 | "m" (current_thread_info()->flags)); | ||
123 | } else { | ||
124 | set_need_resched(); | ||
125 | } | ||
126 | } | ||
127 | |||
128 | void cpu_idle_wait(void) | ||
129 | { | ||
130 | unsigned int cpu, this_cpu = get_cpu(); | ||
131 | cpumask_t map; | ||
132 | |||
133 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | ||
134 | put_cpu(); | ||
135 | |||
136 | cpus_clear(map); | ||
137 | for_each_online_cpu(cpu) { | ||
138 | per_cpu(cpu_idle_state, cpu) = 1; | ||
139 | cpu_set(cpu, map); | ||
140 | } | ||
141 | |||
142 | __get_cpu_var(cpu_idle_state) = 0; | ||
143 | |||
144 | wmb(); | ||
145 | do { | ||
146 | ssleep(1); | ||
147 | for_each_online_cpu(cpu) { | ||
148 | if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) | ||
149 | cpu_clear(cpu, map); | ||
150 | } | ||
151 | cpus_and(map, map, cpu_online_map); | ||
152 | } while (!cpus_empty(map)); | ||
153 | } | ||
154 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | ||
155 | |||
156 | /* | ||
157 | * The idle thread. There's no useful work to be | ||
158 | * done, so just try to conserve power and have a | ||
159 | * low exit latency (ie sit in a loop waiting for | ||
160 | * somebody to say that they'd like to reschedule) | ||
161 | */ | ||
162 | void cpu_idle (void) | ||
163 | { | ||
164 | /* endless idle loop with no priority at all */ | ||
165 | while (1) { | ||
166 | while (!need_resched()) { | ||
167 | void (*idle)(void); | ||
168 | |||
169 | if (__get_cpu_var(cpu_idle_state)) | ||
170 | __get_cpu_var(cpu_idle_state) = 0; | ||
171 | |||
172 | rmb(); | ||
173 | idle = pm_idle; | ||
174 | if (!idle) | ||
175 | idle = default_idle; | ||
176 | idle(); | ||
177 | } | ||
178 | |||
179 | schedule(); | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
185 | * which can obviate the IPI used to trigger checking of need_resched. | ||
186 | * We execute MONITOR against need_resched and enter optimized wait state | ||
187 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
188 | * up from MWAIT (without an IPI). | ||
189 | */ | ||
190 | static void mwait_idle(void) | ||
191 | { | ||
192 | local_irq_enable(); | ||
193 | |||
194 | if (!need_resched()) { | ||
195 | set_thread_flag(TIF_POLLING_NRFLAG); | ||
196 | do { | ||
197 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
198 | if (need_resched()) | ||
199 | break; | ||
200 | __mwait(0, 0); | ||
201 | } while (!need_resched()); | ||
202 | clear_thread_flag(TIF_POLLING_NRFLAG); | ||
203 | } | ||
204 | } | ||
205 | |||
206 | void __init select_idle_routine(const struct cpuinfo_x86 *c) | ||
207 | { | ||
208 | static int printed; | ||
209 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | ||
210 | /* | ||
211 | * Skip, if setup has overridden idle. | ||
212 | * One CPU supports mwait => all CPUs support mwait | ||
213 | */ | ||
214 | if (!pm_idle) { | ||
215 | if (!printed) { | ||
216 | printk("using mwait in idle threads.\n"); | ||
217 | printed = 1; | ||
218 | } | ||
219 | pm_idle = mwait_idle; | ||
220 | } | ||
221 | } | ||
222 | } | ||
223 | |||
224 | static int __init idle_setup (char *str) | ||
225 | { | ||
226 | if (!strncmp(str, "poll", 4)) { | ||
227 | printk("using polling idle threads.\n"); | ||
228 | pm_idle = poll_idle; | ||
229 | } | ||
230 | |||
231 | boot_option_idle_override = 1; | ||
232 | return 1; | ||
233 | } | ||
234 | |||
235 | __setup("idle=", idle_setup); | ||
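So, as an illustrative example, booting with the parameter below selects the polling idle loop instead of the HLT/MWAIT based ones:

	idle=poll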
236 | |||
237 | /* Prints also some state that isn't saved in the pt_regs */ | ||
238 | void __show_regs(struct pt_regs * regs) | ||
239 | { | ||
240 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; | ||
241 | unsigned int fsindex,gsindex; | ||
242 | unsigned int ds,cs,es; | ||
243 | |||
244 | printk("\n"); | ||
245 | print_modules(); | ||
246 | printk("Pid: %d, comm: %.20s %s %s\n", | ||
247 | current->pid, current->comm, print_tainted(), system_utsname.release); | ||
248 | printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); | ||
249 | printk_address(regs->rip); | ||
250 | printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); | ||
251 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | ||
252 | regs->rax, regs->rbx, regs->rcx); | ||
253 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | ||
254 | regs->rdx, regs->rsi, regs->rdi); | ||
255 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", | ||
256 | regs->rbp, regs->r8, regs->r9); | ||
257 | printk("R10: %016lx R11: %016lx R12: %016lx\n", | ||
258 | regs->r10, regs->r11, regs->r12); | ||
259 | printk("R13: %016lx R14: %016lx R15: %016lx\n", | ||
260 | regs->r13, regs->r14, regs->r15); | ||
261 | |||
262 | asm("movl %%ds,%0" : "=r" (ds)); | ||
263 | asm("movl %%cs,%0" : "=r" (cs)); | ||
264 | asm("movl %%es,%0" : "=r" (es)); | ||
265 | asm("movl %%fs,%0" : "=r" (fsindex)); | ||
266 | asm("movl %%gs,%0" : "=r" (gsindex)); | ||
267 | |||
268 | rdmsrl(MSR_FS_BASE, fs); | ||
269 | rdmsrl(MSR_GS_BASE, gs); | ||
270 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | ||
271 | |||
272 | asm("movq %%cr0, %0": "=r" (cr0)); | ||
273 | asm("movq %%cr2, %0": "=r" (cr2)); | ||
274 | asm("movq %%cr3, %0": "=r" (cr3)); | ||
275 | asm("movq %%cr4, %0": "=r" (cr4)); | ||
276 | |||
277 | printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | ||
278 | fs,fsindex,gs,gsindex,shadowgs); | ||
279 | printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); | ||
280 | printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); | ||
281 | } | ||
282 | |||
283 | void show_regs(struct pt_regs *regs) | ||
284 | { | ||
285 | __show_regs(regs); | ||
286 | show_trace(®s->rsp); | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * Free current thread data structures etc.. | ||
291 | */ | ||
292 | void exit_thread(void) | ||
293 | { | ||
294 | struct task_struct *me = current; | ||
295 | struct thread_struct *t = &me->thread; | ||
296 | if (me->thread.io_bitmap_ptr) { | ||
297 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | ||
298 | |||
299 | kfree(t->io_bitmap_ptr); | ||
300 | t->io_bitmap_ptr = NULL; | ||
301 | /* | ||
302 | * Careful, clear this in the TSS too: | ||
303 | */ | ||
304 | memset(tss->io_bitmap, 0xff, t->io_bitmap_max); | ||
305 | t->io_bitmap_max = 0; | ||
306 | put_cpu(); | ||
307 | } | ||
308 | } | ||
309 | |||
310 | void flush_thread(void) | ||
311 | { | ||
312 | struct task_struct *tsk = current; | ||
313 | struct thread_info *t = current_thread_info(); | ||
314 | |||
315 | if (t->flags & _TIF_ABI_PENDING) | ||
316 | t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); | ||
317 | |||
318 | tsk->thread.debugreg0 = 0; | ||
319 | tsk->thread.debugreg1 = 0; | ||
320 | tsk->thread.debugreg2 = 0; | ||
321 | tsk->thread.debugreg3 = 0; | ||
322 | tsk->thread.debugreg6 = 0; | ||
323 | tsk->thread.debugreg7 = 0; | ||
324 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | ||
325 | /* | ||
326 | * Forget coprocessor state.. | ||
327 | */ | ||
328 | clear_fpu(tsk); | ||
329 | clear_used_math(); | ||
330 | } | ||
331 | |||
332 | void release_thread(struct task_struct *dead_task) | ||
333 | { | ||
334 | if (dead_task->mm) { | ||
335 | if (dead_task->mm->context.size) { | ||
336 | printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", | ||
337 | dead_task->comm, | ||
338 | dead_task->mm->context.ldt, | ||
339 | dead_task->mm->context.size); | ||
340 | BUG(); | ||
341 | } | ||
342 | } | ||
343 | } | ||
344 | |||
345 | static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) | ||
346 | { | ||
347 | struct user_desc ud = { | ||
348 | .base_addr = addr, | ||
349 | .limit = 0xfffff, | ||
350 | .seg_32bit = 1, | ||
351 | .limit_in_pages = 1, | ||
352 | .useable = 1, | ||
353 | }; | ||
354 | struct n_desc_struct *desc = (void *)t->thread.tls_array; | ||
355 | desc += tls; | ||
356 | desc->a = LDT_entry_a(&ud); | ||
357 | desc->b = LDT_entry_b(&ud); | ||
358 | } | ||
359 | |||
360 | static inline u32 read_32bit_tls(struct task_struct *t, int tls) | ||
361 | { | ||
362 | struct desc_struct *desc = (void *)t->thread.tls_array; | ||
363 | desc += tls; | ||
364 | return desc->base0 | | ||
365 | (((u32)desc->base1) << 16) | | ||
366 | (((u32)desc->base2) << 24); | ||
367 | } | ||
368 | |||
369 | /* | ||
370 | * This gets called before we allocate a new thread and copy | ||
371 | * the current task into it. | ||
372 | */ | ||
373 | void prepare_to_copy(struct task_struct *tsk) | ||
374 | { | ||
375 | unlazy_fpu(tsk); | ||
376 | } | ||
377 | |||
378 | int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, | ||
379 | unsigned long unused, | ||
380 | struct task_struct * p, struct pt_regs * regs) | ||
381 | { | ||
382 | int err; | ||
383 | struct pt_regs * childregs; | ||
384 | struct task_struct *me = current; | ||
385 | |||
386 | childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; | ||
387 | |||
388 | *childregs = *regs; | ||
389 | |||
390 | childregs->rax = 0; | ||
391 | childregs->rsp = rsp; | ||
392 | if (rsp == ~0UL) { | ||
393 | childregs->rsp = (unsigned long)childregs; | ||
394 | } | ||
395 | |||
396 | p->thread.rsp = (unsigned long) childregs; | ||
397 | p->thread.rsp0 = (unsigned long) (childregs+1); | ||
398 | p->thread.userrsp = me->thread.userrsp; | ||
399 | |||
400 | set_ti_thread_flag(p->thread_info, TIF_FORK); | ||
401 | |||
402 | p->thread.fs = me->thread.fs; | ||
403 | p->thread.gs = me->thread.gs; | ||
404 | |||
405 | asm("movl %%gs,%0" : "=m" (p->thread.gsindex)); | ||
406 | asm("movl %%fs,%0" : "=m" (p->thread.fsindex)); | ||
407 | asm("movl %%es,%0" : "=m" (p->thread.es)); | ||
408 | asm("movl %%ds,%0" : "=m" (p->thread.ds)); | ||
409 | |||
410 | if (unlikely(me->thread.io_bitmap_ptr != NULL)) { | ||
411 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | ||
412 | if (!p->thread.io_bitmap_ptr) { | ||
413 | p->thread.io_bitmap_max = 0; | ||
414 | return -ENOMEM; | ||
415 | } | ||
416 | memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); | ||
417 | } | ||
418 | |||
419 | /* | ||
420 | * Set a new TLS for the child thread? | ||
421 | */ | ||
422 | if (clone_flags & CLONE_SETTLS) { | ||
423 | #ifdef CONFIG_IA32_EMULATION | ||
424 | if (test_thread_flag(TIF_IA32)) | ||
425 | err = ia32_child_tls(p, childregs); | ||
426 | else | ||
427 | #endif | ||
428 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | ||
429 | if (err) | ||
430 | goto out; | ||
431 | } | ||
432 | err = 0; | ||
433 | out: | ||
434 | if (err && p->thread.io_bitmap_ptr) { | ||
435 | kfree(p->thread.io_bitmap_ptr); | ||
436 | p->thread.io_bitmap_max = 0; | ||
437 | } | ||
438 | return err; | ||
439 | } | ||
440 | |||
441 | /* | ||
442 | * This special macro can be used to load a debugging register | ||
443 | */ | ||
444 | #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) | ||
445 | |||
446 | /* | ||
447 | * switch_to(x,y) should switch tasks from x to y. | ||
448 | * | ||
449 | * This could still be optimized: | ||
450 | * - fold all the options into a flag word and test it with a single test. | ||
451 | * - could test fs/gs bitsliced | ||
452 | */ | ||
453 | struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
454 | { | ||
455 | struct thread_struct *prev = &prev_p->thread, | ||
456 | *next = &next_p->thread; | ||
457 | int cpu = smp_processor_id(); | ||
458 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
459 | |||
460 | unlazy_fpu(prev_p); | ||
461 | |||
462 | /* | ||
463 | * Reload esp0, LDT and the page table pointer: | ||
464 | */ | ||
465 | tss->rsp0 = next->rsp0; | ||
466 | |||
467 | /* | ||
468 | * Switch DS and ES. | ||
469 | * This won't pick up thread selector changes, but I guess that is ok. | ||
470 | */ | ||
471 | asm volatile("movl %%es,%0" : "=m" (prev->es)); | ||
472 | if (unlikely(next->es | prev->es)) | ||
473 | loadsegment(es, next->es); | ||
474 | |||
475 | asm volatile ("movl %%ds,%0" : "=m" (prev->ds)); | ||
476 | if (unlikely(next->ds | prev->ds)) | ||
477 | loadsegment(ds, next->ds); | ||
478 | |||
479 | load_TLS(next, cpu); | ||
480 | |||
481 | /* | ||
482 | * Switch FS and GS. | ||
483 | */ | ||
484 | { | ||
485 | unsigned fsindex; | ||
486 | asm volatile("movl %%fs,%0" : "=r" (fsindex)); | ||
487 | /* A segment register != 0 always requires a reload. | ||
488 | Also reload when it has changed. | ||
489 | When the previous process used a 64bit base, always reload | ||
490 | to avoid an information leak. */ | ||
491 | if (unlikely(fsindex | next->fsindex | prev->fs)) { | ||
492 | loadsegment(fs, next->fsindex); | ||
493 | /* Check if the user used a selector != 0; | ||
494 | * if so, clear the 64bit base, since the overloaded base | ||
495 | * is always mapped to the null selector. | ||
496 | */ | ||
497 | if (fsindex) | ||
498 | prev->fs = 0; | ||
499 | } | ||
500 | /* when next process has a 64bit base use it */ | ||
501 | if (next->fs) | ||
502 | wrmsrl(MSR_FS_BASE, next->fs); | ||
503 | prev->fsindex = fsindex; | ||
504 | } | ||
505 | { | ||
506 | unsigned gsindex; | ||
507 | asm volatile("movl %%gs,%0" : "=r" (gsindex)); | ||
508 | if (unlikely(gsindex | next->gsindex | prev->gs)) { | ||
509 | load_gs_index(next->gsindex); | ||
510 | if (gsindex) | ||
511 | prev->gs = 0; | ||
512 | } | ||
513 | if (next->gs) | ||
514 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | ||
515 | prev->gsindex = gsindex; | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * Switch the PDA context. | ||
520 | */ | ||
521 | prev->userrsp = read_pda(oldrsp); | ||
522 | write_pda(oldrsp, next->userrsp); | ||
523 | write_pda(pcurrent, next_p); | ||
524 | write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); | ||
525 | |||
526 | /* | ||
527 | * Now maybe reload the debug registers | ||
528 | */ | ||
529 | if (unlikely(next->debugreg7)) { | ||
530 | loaddebug(next, 0); | ||
531 | loaddebug(next, 1); | ||
532 | loaddebug(next, 2); | ||
533 | loaddebug(next, 3); | ||
534 | /* no 4 and 5 */ | ||
535 | loaddebug(next, 6); | ||
536 | loaddebug(next, 7); | ||
537 | } | ||
538 | |||
539 | |||
540 | /* | ||
541 | * Handle the IO bitmap | ||
542 | */ | ||
543 | if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { | ||
544 | if (next->io_bitmap_ptr) | ||
545 | /* | ||
546 | * Copy the relevant range of the IO bitmap. | ||
547 | * Normally this is 128 bytes or less: | ||
548 | */ | ||
549 | memcpy(tss->io_bitmap, next->io_bitmap_ptr, | ||
550 | max(prev->io_bitmap_max, next->io_bitmap_max)); | ||
551 | else { | ||
552 | /* | ||
553 | * Clear any possible leftover bits: | ||
554 | */ | ||
555 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); | ||
556 | } | ||
557 | } | ||
558 | |||
559 | return prev_p; | ||
560 | } | ||
561 | |||
562 | /* | ||
563 | * sys_execve() executes a new program. | ||
564 | */ | ||
565 | asmlinkage | ||
566 | long sys_execve(char __user *name, char __user * __user *argv, | ||
567 | char __user * __user *envp, struct pt_regs regs) | ||
568 | { | ||
569 | long error; | ||
570 | char * filename; | ||
571 | |||
572 | filename = getname(name); | ||
573 | error = PTR_ERR(filename); | ||
574 | if (IS_ERR(filename)) | ||
575 | return error; | ||
576 | error = do_execve(filename, argv, envp, ®s); | ||
577 | if (error == 0) { | ||
578 | task_lock(current); | ||
579 | current->ptrace &= ~PT_DTRACE; | ||
580 | task_unlock(current); | ||
581 | } | ||
582 | putname(filename); | ||
583 | return error; | ||
584 | } | ||
585 | |||
586 | void set_personality_64bit(void) | ||
587 | { | ||
588 | /* inherit personality from parent */ | ||
589 | |||
590 | /* Make sure to be in 64bit mode */ | ||
591 | clear_thread_flag(TIF_IA32); | ||
592 | |||
593 | /* TBD: overwrites user setup. Should have two bits. | ||
594 | But 64bit processes have always behaved this way, | ||
595 | so it's not too bad. The main problem is just that | ||
596 | 32bit children are affected again. */ | ||
597 | current->personality &= ~READ_IMPLIES_EXEC; | ||
598 | } | ||
599 | |||
600 | asmlinkage long sys_fork(struct pt_regs *regs) | ||
601 | { | ||
602 | return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); | ||
603 | } | ||
604 | |||
605 | asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | ||
606 | { | ||
607 | if (!newsp) | ||
608 | newsp = regs->rsp; | ||
609 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | ||
610 | } | ||
611 | |||
612 | /* | ||
613 | * This is trivial, and on the face of it looks like it | ||
614 | * could equally well be done in user mode. | ||
615 | * | ||
616 | * Not so, for quite unobvious reasons - register pressure. | ||
617 | * In user mode vfork() cannot have a stack frame, and if | ||
618 | * done by calling the "clone()" system call directly, you | ||
619 | * do not have enough call-clobbered registers to hold all | ||
620 | * the information you need. | ||
621 | */ | ||
622 | asmlinkage long sys_vfork(struct pt_regs *regs) | ||
623 | { | ||
624 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, | ||
625 | NULL, NULL); | ||
626 | } | ||
627 | |||
628 | unsigned long get_wchan(struct task_struct *p) | ||
629 | { | ||
630 | unsigned long stack; | ||
631 | u64 fp,rip; | ||
632 | int count = 0; | ||
633 | |||
634 | if (!p || p == current || p->state==TASK_RUNNING) | ||
635 | return 0; | ||
636 | stack = (unsigned long)p->thread_info; | ||
637 | if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) | ||
638 | return 0; | ||
639 | fp = *(u64 *)(p->thread.rsp); | ||
640 | do { | ||
641 | if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) | ||
642 | return 0; | ||
643 | rip = *(u64 *)(fp+8); | ||
644 | if (!in_sched_functions(rip)) | ||
645 | return rip; | ||
646 | fp = *(u64 *)fp; | ||
647 | } while (count++ < 16); | ||
648 | return 0; | ||
649 | } | ||
650 | |||
651 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | ||
652 | { | ||
653 | int ret = 0; | ||
654 | int doit = task == current; | ||
655 | int cpu; | ||
656 | |||
657 | switch (code) { | ||
658 | case ARCH_SET_GS: | ||
659 | if (addr >= TASK_SIZE) | ||
660 | return -EPERM; | ||
661 | cpu = get_cpu(); | ||
662 | /* handle small bases via the GDT because that's faster to | ||
663 | switch. */ | ||
664 | if (addr <= 0xffffffff) { | ||
665 | set_32bit_tls(task, GS_TLS, addr); | ||
666 | if (doit) { | ||
667 | load_TLS(&task->thread, cpu); | ||
668 | load_gs_index(GS_TLS_SEL); | ||
669 | } | ||
670 | task->thread.gsindex = GS_TLS_SEL; | ||
671 | task->thread.gs = 0; | ||
672 | } else { | ||
673 | task->thread.gsindex = 0; | ||
674 | task->thread.gs = addr; | ||
675 | if (doit) { | ||
676 | load_gs_index(0); | ||
677 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); | ||
678 | } | ||
679 | } | ||
680 | put_cpu(); | ||
681 | break; | ||
682 | case ARCH_SET_FS: | ||
683 | /* Not strictly needed for fs, but do it for symmetry | ||
684 | with gs */ | ||
685 | if (addr >= TASK_SIZE) | ||
686 | return -EPERM; | ||
687 | cpu = get_cpu(); | ||
688 | /* handle small bases via the GDT because that's faster to | ||
689 | switch. */ | ||
690 | if (addr <= 0xffffffff) { | ||
691 | set_32bit_tls(task, FS_TLS, addr); | ||
692 | if (doit) { | ||
693 | load_TLS(&task->thread, cpu); | ||
694 | asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); | ||
695 | } | ||
696 | task->thread.fsindex = FS_TLS_SEL; | ||
697 | task->thread.fs = 0; | ||
698 | } else { | ||
699 | task->thread.fsindex = 0; | ||
700 | task->thread.fs = addr; | ||
701 | if (doit) { | ||
702 | /* set the selector to 0 to not confuse | ||
703 | __switch_to */ | ||
704 | asm volatile("movl %0,%%fs" :: "r" (0)); | ||
705 | ret = checking_wrmsrl(MSR_FS_BASE, addr); | ||
706 | } | ||
707 | } | ||
708 | put_cpu(); | ||
709 | break; | ||
710 | case ARCH_GET_FS: { | ||
711 | unsigned long base; | ||
712 | if (task->thread.fsindex == FS_TLS_SEL) | ||
713 | base = read_32bit_tls(task, FS_TLS); | ||
714 | else if (doit) { | ||
715 | rdmsrl(MSR_FS_BASE, base); | ||
716 | } else | ||
717 | base = task->thread.fs; | ||
718 | ret = put_user(base, (unsigned long __user *)addr); | ||
719 | break; | ||
720 | } | ||
721 | case ARCH_GET_GS: { | ||
722 | unsigned long base; | ||
723 | if (task->thread.gsindex == GS_TLS_SEL) | ||
724 | base = read_32bit_tls(task, GS_TLS); | ||
725 | else if (doit) { | ||
726 | rdmsrl(MSR_KERNEL_GS_BASE, base); | ||
727 | } else | ||
728 | base = task->thread.gs; | ||
729 | ret = put_user(base, (unsigned long __user *)addr); | ||
730 | break; | ||
731 | } | ||
732 | |||
733 | default: | ||
734 | ret = -EINVAL; | ||
735 | break; | ||
736 | } | ||
737 | |||
738 | return ret; | ||
739 | } | ||
740 | |||
741 | long sys_arch_prctl(int code, unsigned long addr) | ||
742 | { | ||
743 | return do_arch_prctl(current, code, addr); | ||
744 | } | ||
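From user space this interface is reached through the arch_prctl() system call. A minimal sketch, assuming an x86-64 Linux system where <asm/prctl.h> provides the ARCH_* codes and <sys/syscall.h> provides SYS_arch_prctl:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <asm/prctl.h>

	int main(void)
	{
		unsigned long fsbase = 0;

		/* ARCH_GET_FS writes the current FS base into the supplied pointer */
		if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) != 0) {
			perror("arch_prctl");
			return 1;
		}
		printf("FS base: %#lx\n", fsbase);	/* usually the glibc TLS block */
		return 0;
	}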
745 | |||
746 | /* | ||
747 | * Capture the user space registers if the task is not running (in user space) | ||
748 | */ | ||
749 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | ||
750 | { | ||
751 | struct pt_regs *pp, ptregs; | ||
752 | |||
753 | pp = (struct pt_regs *)(tsk->thread.rsp0); | ||
754 | --pp; | ||
755 | |||
756 | ptregs = *pp; | ||
757 | ptregs.cs &= 0xffff; | ||
758 | ptregs.ss &= 0xffff; | ||
759 | |||
760 | elf_core_copy_regs(regs, &ptregs); | ||
761 | |||
762 | return 1; | ||
763 | } | ||
764 | |||
765 | unsigned long arch_align_stack(unsigned long sp) | ||
766 | { | ||
767 | if (randomize_va_space) | ||
768 | sp -= get_random_int() % 8192; | ||
769 | return sp & ~0xf; | ||
770 | } | ||
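The ARCH_SET_*/ARCH_GET_* cases in do_arch_prctl() above are driven from user space through the arch_prctl() system call. Below is a minimal sketch of such a caller, assuming a libc without an arch_prctl() wrapper, so the raw syscall() interface and the constants from <asm/prctl.h> are used; the variable names are hypothetical.

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <asm/prctl.h>          /* ARCH_GET_GS, ARCH_SET_GS, ... */

    int main(void)
    {
        unsigned long base = 0;
        static unsigned long scratch[8];    /* just something to point GS at */

        /* ARCH_GET_GS: the kernel put_user()s the base into &base. */
        if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) < 0) {
            perror("arch_prctl(ARCH_GET_GS)");
            return 1;
        }
        printf("GS base: %#lx\n", base);

        /* A base below 4GB takes the GDT/TLS path in do_arch_prctl();
           anything larger is written to MSR_KERNEL_GS_BASE. */
        if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)scratch) < 0)
            perror("arch_prctl(ARCH_SET_GS)");
        return 0;
    }

FS is handled the same way but is normally left to the threading library, since it carries the TLS base.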
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c new file mode 100644 index 000000000000..0b7b101debdf --- /dev/null +++ b/arch/x86_64/kernel/ptrace.c | |||
@@ -0,0 +1,547 @@ | |||
1 | /* ptrace.c */ | ||
2 | /* By Ross Biro 1/23/92 */ | ||
3 | /* | ||
4 | * Pentium III FXSR, SSE support | ||
5 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
6 | * | ||
7 | * x86-64 port 2000-2002 Andi Kleen | ||
8 | */ | ||
9 | |||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/smp.h> | ||
14 | #include <linux/smp_lock.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/user.h> | ||
18 | #include <linux/security.h> | ||
19 | #include <linux/audit.h> | ||
20 | #include <linux/seccomp.h> | ||
21 | |||
22 | #include <asm/uaccess.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/system.h> | ||
25 | #include <asm/processor.h> | ||
26 | #include <asm/i387.h> | ||
27 | #include <asm/debugreg.h> | ||
28 | #include <asm/ldt.h> | ||
29 | #include <asm/desc.h> | ||
30 | #include <asm/proto.h> | ||
31 | #include <asm/ia32.h> | ||
32 | |||
33 | /* | ||
34 | * does not yet catch signals sent when the child | ||
35 | * dies, in exit.c or in signal.c. | ||
36 | */ | ||
37 | |||
38 | /* determines which flags the user has access to. */ | ||
39 | /* 1 = access 0 = no access */ | ||
40 | #define FLAG_MASK 0x44dd5UL | ||
41 | |||
42 | /* sets the trap flag. */ | ||
43 | #define TRAP_FLAG 0x100UL | ||
44 | |||
45 | /* | ||
46 | * eflags and offset of eflags on child stack.. | ||
47 | */ | ||
48 | #define EFLAGS offsetof(struct pt_regs, eflags) | ||
49 | #define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) | ||
50 | |||
51 | /* | ||
52 | * this routine will get a word off the process's privileged stack. | ||
53 | * the offset is how far from the base addr as stored in the TSS. | ||
54 | * this routine assumes that all the privileged stacks are in our | ||
55 | * data space. | ||
56 | */ | ||
57 | static inline unsigned long get_stack_long(struct task_struct *task, int offset) | ||
58 | { | ||
59 | unsigned char *stack; | ||
60 | |||
61 | stack = (unsigned char *)task->thread.rsp0; | ||
62 | stack += offset; | ||
63 | return (*((unsigned long *)stack)); | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * this routine will put a word on the process's privileged stack. | ||
68 | * the offset is how far from the base addr as stored in the TSS. | ||
69 | * this routine assumes that all the privileged stacks are in our | ||
70 | * data space. | ||
71 | */ | ||
72 | static inline long put_stack_long(struct task_struct *task, int offset, | ||
73 | unsigned long data) | ||
74 | { | ||
75 | unsigned char * stack; | ||
76 | |||
77 | stack = (unsigned char *) task->thread.rsp0; | ||
78 | stack += offset; | ||
79 | *(unsigned long *) stack = data; | ||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Called by kernel/ptrace.c when detaching.. | ||
85 | * | ||
86 | * Make sure the single step bit is not set. | ||
87 | */ | ||
88 | void ptrace_disable(struct task_struct *child) | ||
89 | { | ||
90 | long tmp; | ||
91 | |||
92 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
93 | tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; | ||
94 | put_stack_long(child, EFL_OFFSET, tmp); | ||
95 | } | ||
96 | |||
97 | static int putreg(struct task_struct *child, | ||
98 | unsigned long regno, unsigned long value) | ||
99 | { | ||
100 | unsigned long tmp; | ||
101 | |||
102 | /* Some code in the 64bit emulation may not be 64bit clean. | ||
103 | Don't take any chances. */ | ||
104 | if (test_tsk_thread_flag(child, TIF_IA32)) | ||
105 | value &= 0xffffffff; | ||
106 | switch (regno) { | ||
107 | case offsetof(struct user_regs_struct,fs): | ||
108 | if (value && (value & 3) != 3) | ||
109 | return -EIO; | ||
110 | child->thread.fsindex = value & 0xffff; | ||
111 | return 0; | ||
112 | case offsetof(struct user_regs_struct,gs): | ||
113 | if (value && (value & 3) != 3) | ||
114 | return -EIO; | ||
115 | child->thread.gsindex = value & 0xffff; | ||
116 | return 0; | ||
117 | case offsetof(struct user_regs_struct,ds): | ||
118 | if (value && (value & 3) != 3) | ||
119 | return -EIO; | ||
120 | child->thread.ds = value & 0xffff; | ||
121 | return 0; | ||
122 | case offsetof(struct user_regs_struct,es): | ||
123 | if (value && (value & 3) != 3) | ||
124 | return -EIO; | ||
125 | child->thread.es = value & 0xffff; | ||
126 | return 0; | ||
127 | case offsetof(struct user_regs_struct,ss): | ||
128 | if ((value & 3) != 3) | ||
129 | return -EIO; | ||
130 | value &= 0xffff; | ||
131 | return 0; | ||
132 | case offsetof(struct user_regs_struct,fs_base): | ||
133 | if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) | ||
134 | return -EIO; | ||
135 | child->thread.fs = value; | ||
136 | return 0; | ||
137 | case offsetof(struct user_regs_struct,gs_base): | ||
138 | if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) | ||
139 | return -EIO; | ||
140 | child->thread.gs = value; | ||
141 | return 0; | ||
142 | case offsetof(struct user_regs_struct, eflags): | ||
143 | value &= FLAG_MASK; | ||
144 | tmp = get_stack_long(child, EFL_OFFSET); | ||
145 | tmp &= ~FLAG_MASK; | ||
146 | value |= tmp; | ||
147 | break; | ||
148 | case offsetof(struct user_regs_struct,cs): | ||
149 | if ((value & 3) != 3) | ||
150 | return -EIO; | ||
151 | value &= 0xffff; | ||
152 | break; | ||
153 | } | ||
154 | put_stack_long(child, regno - sizeof(struct pt_regs), value); | ||
155 | return 0; | ||
156 | } | ||
157 | |||
158 | static unsigned long getreg(struct task_struct *child, unsigned long regno) | ||
159 | { | ||
160 | unsigned long val; | ||
161 | switch (regno) { | ||
162 | case offsetof(struct user_regs_struct, fs): | ||
163 | return child->thread.fsindex; | ||
164 | case offsetof(struct user_regs_struct, gs): | ||
165 | return child->thread.gsindex; | ||
166 | case offsetof(struct user_regs_struct, ds): | ||
167 | return child->thread.ds; | ||
168 | case offsetof(struct user_regs_struct, es): | ||
169 | return child->thread.es; | ||
170 | case offsetof(struct user_regs_struct, fs_base): | ||
171 | return child->thread.fs; | ||
172 | case offsetof(struct user_regs_struct, gs_base): | ||
173 | return child->thread.gs; | ||
174 | default: | ||
175 | regno = regno - sizeof(struct pt_regs); | ||
176 | val = get_stack_long(child, regno); | ||
177 | if (test_tsk_thread_flag(child, TIF_IA32)) | ||
178 | val &= 0xffffffff; | ||
179 | return val; | ||
180 | } | ||
181 | |||
182 | } | ||
183 | |||
184 | asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data) | ||
185 | { | ||
186 | struct task_struct *child; | ||
187 | long i, ret; | ||
188 | unsigned ui; | ||
189 | |||
190 | /* This lock_kernel fixes a subtle race with suid exec */ | ||
191 | lock_kernel(); | ||
192 | ret = -EPERM; | ||
193 | if (request == PTRACE_TRACEME) { | ||
194 | /* are we already being traced? */ | ||
195 | if (current->ptrace & PT_PTRACED) | ||
196 | goto out; | ||
197 | ret = security_ptrace(current->parent, current); | ||
198 | if (ret) | ||
199 | goto out; | ||
200 | /* set the ptrace bit in the process flags. */ | ||
201 | current->ptrace |= PT_PTRACED; | ||
202 | ret = 0; | ||
203 | goto out; | ||
204 | } | ||
205 | ret = -ESRCH; | ||
206 | read_lock(&tasklist_lock); | ||
207 | child = find_task_by_pid(pid); | ||
208 | if (child) | ||
209 | get_task_struct(child); | ||
210 | read_unlock(&tasklist_lock); | ||
211 | if (!child) | ||
212 | goto out; | ||
213 | |||
214 | ret = -EPERM; | ||
215 | if (pid == 1) /* you may not mess with init */ | ||
216 | goto out_tsk; | ||
217 | |||
218 | if (request == PTRACE_ATTACH) { | ||
219 | ret = ptrace_attach(child); | ||
220 | goto out_tsk; | ||
221 | } | ||
222 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
223 | if (ret < 0) | ||
224 | goto out_tsk; | ||
225 | |||
226 | switch (request) { | ||
227 | /* when I and D space are separate, these will need to be fixed. */ | ||
228 | case PTRACE_PEEKTEXT: /* read word at location addr. */ | ||
229 | case PTRACE_PEEKDATA: { | ||
230 | unsigned long tmp; | ||
231 | int copied; | ||
232 | |||
233 | copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); | ||
234 | ret = -EIO; | ||
235 | if (copied != sizeof(tmp)) | ||
236 | break; | ||
237 | ret = put_user(tmp,(unsigned long __user *) data); | ||
238 | break; | ||
239 | } | ||
240 | |||
241 | /* read the word at location addr in the USER area. */ | ||
242 | case PTRACE_PEEKUSR: { | ||
243 | unsigned long tmp; | ||
244 | |||
245 | ret = -EIO; | ||
246 | if ((addr & 7) || | ||
247 | addr > sizeof(struct user) - 7) | ||
248 | break; | ||
249 | |||
250 | switch (addr) { | ||
251 | case 0 ... sizeof(struct user_regs_struct): | ||
252 | tmp = getreg(child, addr); | ||
253 | break; | ||
254 | case offsetof(struct user, u_debugreg[0]): | ||
255 | tmp = child->thread.debugreg0; | ||
256 | break; | ||
257 | case offsetof(struct user, u_debugreg[1]): | ||
258 | tmp = child->thread.debugreg1; | ||
259 | break; | ||
260 | case offsetof(struct user, u_debugreg[2]): | ||
261 | tmp = child->thread.debugreg2; | ||
262 | break; | ||
263 | case offsetof(struct user, u_debugreg[3]): | ||
264 | tmp = child->thread.debugreg3; | ||
265 | break; | ||
266 | case offsetof(struct user, u_debugreg[6]): | ||
267 | tmp = child->thread.debugreg6; | ||
268 | break; | ||
269 | case offsetof(struct user, u_debugreg[7]): | ||
270 | tmp = child->thread.debugreg7; | ||
271 | break; | ||
272 | default: | ||
273 | tmp = 0; | ||
274 | break; | ||
275 | } | ||
276 | ret = put_user(tmp,(unsigned long __user *) data); | ||
277 | break; | ||
278 | } | ||
279 | |||
280 | /* when I and D space are separate, this will have to be fixed. */ | ||
281 | case PTRACE_POKETEXT: /* write the word at location addr. */ | ||
282 | case PTRACE_POKEDATA: | ||
283 | ret = 0; | ||
284 | if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) | ||
285 | break; | ||
286 | ret = -EIO; | ||
287 | break; | ||
288 | |||
289 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ | ||
290 | ret = -EIO; | ||
291 | if ((addr & 7) || | ||
292 | addr > sizeof(struct user) - 7) | ||
293 | break; | ||
294 | |||
295 | switch (addr) { | ||
296 | case 0 ... sizeof(struct user_regs_struct): | ||
297 | ret = putreg(child, addr, data); | ||
298 | break; | ||
299 | /* Disallow setting a breakpoint in the vsyscall area */ | ||
300 | case offsetof(struct user, u_debugreg[0]): | ||
301 | if (data >= TASK_SIZE-7) break; | ||
302 | child->thread.debugreg0 = data; | ||
303 | ret = 0; | ||
304 | break; | ||
305 | case offsetof(struct user, u_debugreg[1]): | ||
306 | if (data >= TASK_SIZE-7) break; | ||
307 | child->thread.debugreg1 = data; | ||
308 | ret = 0; | ||
309 | break; | ||
310 | case offsetof(struct user, u_debugreg[2]): | ||
311 | if (data >= TASK_SIZE-7) break; | ||
312 | child->thread.debugreg2 = data; | ||
313 | ret = 0; | ||
314 | break; | ||
315 | case offsetof(struct user, u_debugreg[3]): | ||
316 | if (data >= TASK_SIZE-7) break; | ||
317 | child->thread.debugreg3 = data; | ||
318 | ret = 0; | ||
319 | break; | ||
320 | case offsetof(struct user, u_debugreg[6]): | ||
321 | if (data >> 32) | ||
322 | break; | ||
323 | child->thread.debugreg6 = data; | ||
324 | ret = 0; | ||
325 | break; | ||
326 | case offsetof(struct user, u_debugreg[7]): | ||
327 | /* See arch/i386/kernel/ptrace.c for an explanation of | ||
328 | * this awkward check.*/ | ||
329 | data &= ~DR_CONTROL_RESERVED; | ||
330 | for(i=0; i<4; i++) | ||
331 | if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1) | ||
332 | break; | ||
333 | if (i == 4) { | ||
334 | child->thread.debugreg7 = data; | ||
335 | ret = 0; | ||
336 | } | ||
337 | break; | ||
338 | } | ||
339 | break; | ||
340 | case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ | ||
341 | case PTRACE_CONT: { /* restart after signal. */ | ||
342 | long tmp; | ||
343 | |||
344 | ret = -EIO; | ||
345 | if ((unsigned long) data > _NSIG) | ||
346 | break; | ||
347 | if (request == PTRACE_SYSCALL) | ||
348 | set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
349 | else | ||
350 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
351 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
352 | child->exit_code = data; | ||
353 | /* make sure the single step bit is not set. */ | ||
354 | tmp = get_stack_long(child, EFL_OFFSET); | ||
355 | tmp &= ~TRAP_FLAG; | ||
356 | put_stack_long(child, EFL_OFFSET,tmp); | ||
357 | wake_up_process(child); | ||
358 | ret = 0; | ||
359 | break; | ||
360 | } | ||
361 | |||
362 | #ifdef CONFIG_IA32_EMULATION | ||
363 | /* These only make sense with 32bit programs. Allow a | ||
364 | 64bit debugger to fully examine them too. Don't | ||
365 | use them against 64bit processes; use | ||
366 | PTRACE_ARCH_PRCTL instead. */ | ||
367 | case PTRACE_SET_THREAD_AREA: { | ||
368 | struct user_desc __user *p; | ||
369 | int old; | ||
370 | p = (struct user_desc __user *)data; | ||
371 | get_user(old, &p->entry_number); | ||
372 | put_user(addr, &p->entry_number); | ||
373 | ret = do_set_thread_area(&child->thread, p); | ||
374 | put_user(old, &p->entry_number); | ||
375 | break; | ||
376 | case PTRACE_GET_THREAD_AREA: | ||
377 | p = (struct user_desc __user *)data; | ||
378 | get_user(old, &p->entry_number); | ||
379 | put_user(addr, &p->entry_number); | ||
380 | ret = do_get_thread_area(&child->thread, p); | ||
381 | put_user(old, &p->entry_number); | ||
382 | break; | ||
383 | } | ||
384 | #endif | ||
385 | /* normal 64bit interface to access TLS data. | ||
386 | Works just like arch_prctl, except that the arguments | ||
387 | are reversed. */ | ||
388 | case PTRACE_ARCH_PRCTL: | ||
389 | ret = do_arch_prctl(child, data, addr); | ||
390 | break; | ||
391 | |||
392 | /* | ||
393 | * make the child exit. Best I can do is send it a sigkill. | ||
394 | * perhaps it should be put in the status that it wants to | ||
395 | * exit. | ||
396 | */ | ||
397 | case PTRACE_KILL: { | ||
398 | long tmp; | ||
399 | |||
400 | ret = 0; | ||
401 | if (child->exit_state == EXIT_ZOMBIE) /* already dead */ | ||
402 | break; | ||
403 | clear_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
404 | child->exit_code = SIGKILL; | ||
405 | /* make sure the single step bit is not set. */ | ||
406 | tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; | ||
407 | put_stack_long(child, EFL_OFFSET, tmp); | ||
408 | wake_up_process(child); | ||
409 | break; | ||
410 | } | ||
411 | |||
412 | case PTRACE_SINGLESTEP: { /* set the trap flag. */ | ||
413 | long tmp; | ||
414 | |||
415 | ret = -EIO; | ||
416 | if ((unsigned long) data > _NSIG) | ||
417 | break; | ||
418 | clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); | ||
419 | if ((child->ptrace & PT_DTRACE) == 0) { | ||
420 | /* Spurious delayed TF traps may occur */ | ||
421 | child->ptrace |= PT_DTRACE; | ||
422 | } | ||
423 | tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG; | ||
424 | put_stack_long(child, EFL_OFFSET, tmp); | ||
425 | set_tsk_thread_flag(child, TIF_SINGLESTEP); | ||
426 | child->exit_code = data; | ||
427 | /* give it a chance to run. */ | ||
428 | wake_up_process(child); | ||
429 | ret = 0; | ||
430 | break; | ||
431 | } | ||
432 | |||
433 | case PTRACE_DETACH: | ||
434 | /* detach a process that was attached. */ | ||
435 | ret = ptrace_detach(child, data); | ||
436 | break; | ||
437 | |||
438 | case PTRACE_GETREGS: { /* Get all gp regs from the child. */ | ||
439 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
440 | sizeof(struct user_regs_struct))) { | ||
441 | ret = -EIO; | ||
442 | break; | ||
443 | } | ||
444 | ret = 0; | ||
445 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
446 | ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); | ||
447 | data += sizeof(long); | ||
448 | } | ||
449 | break; | ||
450 | } | ||
451 | |||
452 | case PTRACE_SETREGS: { /* Set all gp regs in the child. */ | ||
453 | unsigned long tmp; | ||
454 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
455 | sizeof(struct user_regs_struct))) { | ||
456 | ret = -EIO; | ||
457 | break; | ||
458 | } | ||
459 | ret = 0; | ||
460 | for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { | ||
461 | ret |= __get_user(tmp, (unsigned long __user *) data); | ||
462 | putreg(child, ui, tmp); | ||
463 | data += sizeof(long); | ||
464 | } | ||
465 | break; | ||
466 | } | ||
467 | |||
468 | case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ | ||
469 | if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, | ||
470 | sizeof(struct user_i387_struct))) { | ||
471 | ret = -EIO; | ||
472 | break; | ||
473 | } | ||
474 | ret = get_fpregs((struct user_i387_struct __user *)data, child); | ||
475 | break; | ||
476 | } | ||
477 | |||
478 | case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ | ||
479 | if (!access_ok(VERIFY_READ, (unsigned __user *)data, | ||
480 | sizeof(struct user_i387_struct))) { | ||
481 | ret = -EIO; | ||
482 | break; | ||
483 | } | ||
484 | set_stopped_child_used_math(child); | ||
485 | ret = set_fpregs(child, (struct user_i387_struct __user *)data); | ||
486 | break; | ||
487 | } | ||
488 | |||
489 | default: | ||
490 | ret = ptrace_request(child, request, addr, data); | ||
491 | break; | ||
492 | } | ||
493 | out_tsk: | ||
494 | put_task_struct(child); | ||
495 | out: | ||
496 | unlock_kernel(); | ||
497 | return ret; | ||
498 | } | ||
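The PTRACE_PEEKUSR path above takes addr as a byte offset into struct user, which getreg() then maps onto a saved stack slot or a thread-struct field. A rough user-space sketch of a tracer using that interface follows; error handling is mostly omitted, and note the glibc spelling is PTRACE_PEEKUSER.

    #include <stdio.h>
    #include <stddef.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <sys/user.h>
    #include <sys/wait.h>

    int main(void)
    {
        pid_t pid = fork();

        if (pid == 0) {
            /* child: ask to be traced, then exec something harmless */
            ptrace(PTRACE_TRACEME, 0, NULL, NULL);
            execl("/bin/true", "true", (char *)NULL);
            _exit(1);
        }
        waitpid(pid, NULL, 0);              /* child stops after exec */

        /* addr = offset into struct user; handled by getreg() above */
        long rip = ptrace(PTRACE_PEEKUSER, pid,
                          offsetof(struct user_regs_struct, rip), NULL);
        long fsbase = ptrace(PTRACE_PEEKUSER, pid,
                             offsetof(struct user_regs_struct, fs_base), NULL);
        printf("child rip %#lx, fs_base %#lx\n", rip, fsbase);

        ptrace(PTRACE_CONT, pid, NULL, NULL);
        waitpid(pid, NULL, 0);
        return 0;
    }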
499 | |||
500 | static void syscall_trace(struct pt_regs *regs) | ||
501 | { | ||
502 | |||
503 | #if 0 | ||
504 | printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", | ||
505 | current->comm, | ||
506 | regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), | ||
507 | current_thread_info()->flags, current->ptrace); | ||
508 | #endif | ||
509 | |||
510 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) | ||
511 | ? 0x80 : 0)); | ||
512 | /* | ||
513 | * this isn't the same as continuing with a signal, but it will do | ||
514 | * for normal use. strace only continues with a signal if the | ||
515 | * stopping signal is not SIGTRAP. -brl | ||
516 | */ | ||
517 | if (current->exit_code) { | ||
518 | send_sig(current->exit_code, current, 1); | ||
519 | current->exit_code = 0; | ||
520 | } | ||
521 | } | ||
522 | |||
523 | asmlinkage void syscall_trace_enter(struct pt_regs *regs) | ||
524 | { | ||
525 | /* do the secure computing check first */ | ||
526 | secure_computing(regs->orig_rax); | ||
527 | |||
528 | if (unlikely(current->audit_context)) | ||
529 | audit_syscall_entry(current, regs->orig_rax, | ||
530 | regs->rdi, regs->rsi, | ||
531 | regs->rdx, regs->r10); | ||
532 | |||
533 | if (test_thread_flag(TIF_SYSCALL_TRACE) | ||
534 | && (current->ptrace & PT_PTRACED)) | ||
535 | syscall_trace(regs); | ||
536 | } | ||
537 | |||
538 | asmlinkage void syscall_trace_leave(struct pt_regs *regs) | ||
539 | { | ||
540 | if (unlikely(current->audit_context)) | ||
541 | audit_syscall_exit(current, regs->rax); | ||
542 | |||
543 | if ((test_thread_flag(TIF_SYSCALL_TRACE) | ||
544 | || test_thread_flag(TIF_SINGLESTEP)) | ||
545 | && (current->ptrace & PT_PTRACED)) | ||
546 | syscall_trace(regs); | ||
547 | } | ||
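syscall_trace_enter()/syscall_trace_leave() fire once per syscall boundary when the tracer resumes the child with PTRACE_SYSCALL; on the tracer side the strace-style loop looks roughly like the sketch below. This is a sketch only, with a hypothetical helper name, and it assumes the child is already traced and stopped.

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <sys/user.h>
    #include <sys/wait.h>

    static void trace_syscalls(pid_t pid)
    {
        struct user_regs_struct regs;
        int status;

        for (;;) {
            /* run until the next syscall entry or exit */
            if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0)
                break;
            if (waitpid(pid, &status, 0) < 0 || WIFEXITED(status))
                break;
            /* orig_rax still holds the syscall number */
            ptrace(PTRACE_GETREGS, pid, NULL, &regs);
            printf("syscall %ld, rax %#lx\n",
                   (long)regs.orig_rax, (unsigned long)regs.rax);
        }
    }

Without PT_TRACESYSGOOD the stop signal is a plain SIGTRAP for both the entry and the exit stop, which is why ptrace_notify() above ors in 0x80 only when that option is set.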
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c new file mode 100644 index 000000000000..be4b36f762cf --- /dev/null +++ b/arch/x86_64/kernel/reboot.c | |||
@@ -0,0 +1,163 @@ | |||
1 | /* Various gunk just to reboot the machine. */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/reboot.h> | ||
4 | #include <linux/init.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/ctype.h> | ||
8 | #include <linux/string.h> | ||
9 | #include <asm/io.h> | ||
10 | #include <asm/kdebug.h> | ||
11 | #include <asm/delay.h> | ||
12 | #include <asm/hw_irq.h> | ||
13 | #include <asm/system.h> | ||
14 | #include <asm/pgtable.h> | ||
15 | #include <asm/tlbflush.h> | ||
16 | #include <asm/apic.h> | ||
17 | |||
18 | /* | ||
19 | * Power off function, if any | ||
20 | */ | ||
21 | void (*pm_power_off)(void); | ||
22 | |||
23 | static long no_idt[3]; | ||
24 | static enum { | ||
25 | BOOT_TRIPLE = 't', | ||
26 | BOOT_KBD = 'k' | ||
27 | } reboot_type = BOOT_KBD; | ||
28 | static int reboot_mode = 0; | ||
29 | int reboot_force; | ||
30 | |||
31 | /* reboot=t[riple] | k[bd] [, [w]arm | [c]old] | ||
32 | warm Don't set the cold reboot flag | ||
33 | cold Set the cold reboot flag | ||
34 | triple Force a triple fault (init) | ||
35 | kbd Use the keyboard controller; cold reset (default) | ||
36 | force Avoid anything that could hang. | ||
37 | */ | ||
38 | static int __init reboot_setup(char *str) | ||
39 | { | ||
40 | for (;;) { | ||
41 | switch (*str) { | ||
42 | case 'w': | ||
43 | reboot_mode = 0x1234; | ||
44 | break; | ||
45 | |||
46 | case 'c': | ||
47 | reboot_mode = 0; | ||
48 | break; | ||
49 | |||
50 | case 't': | ||
51 | case 'b': | ||
52 | case 'k': | ||
53 | reboot_type = *str; | ||
54 | break; | ||
55 | case 'f': | ||
56 | reboot_force = 1; | ||
57 | break; | ||
58 | } | ||
59 | if((str = strchr(str,',')) != NULL) | ||
60 | str++; | ||
61 | else | ||
62 | break; | ||
63 | } | ||
64 | return 1; | ||
65 | } | ||
66 | |||
67 | __setup("reboot=", reboot_setup); | ||
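reboot_setup() looks only at the first character of each comma-separated token, so "reboot=warm" and "reboot=w" are equivalent. A stand-alone sketch of the same walk, as hypothetical user-space code for illustration only:

    #include <stdio.h>
    #include <string.h>

    /* Mirror reboot_setup(): inspect only the first character of each
       comma-separated token, so "warm", "w" and "wombat" all mean 'w'. */
    static void parse_reboot(const char *str)
    {
        for (;;) {
            switch (*str) {
            case 'w': printf("warm reboot\n"); break;
            case 'c': printf("cold reboot\n"); break;
            case 't': case 'b': case 'k':
                printf("reboot type '%c'\n", *str); break;
            case 'f': printf("force\n"); break;
            }
            str = strchr(str, ',');
            if (!str)
                break;
            str++;
        }
    }

    int main(void)
    {
        parse_reboot("kbd,warm,force");     /* like reboot=kbd,warm,force */
        return 0;
    }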
68 | |||
69 | #ifdef CONFIG_SMP | ||
70 | static void smp_halt(void) | ||
71 | { | ||
72 | int cpuid = safe_smp_processor_id(); | ||
73 | static int first_entry = 1; | ||
74 | |||
75 | if (reboot_force) | ||
76 | return; | ||
77 | |||
78 | if (first_entry) { | ||
79 | first_entry = 0; | ||
80 | smp_call_function((void *)machine_restart, NULL, 1, 0); | ||
81 | } | ||
82 | |||
83 | smp_stop_cpu(); | ||
84 | |||
85 | /* AP calling this. Just halt */ | ||
86 | if (cpuid != boot_cpu_id) { | ||
87 | for (;;) | ||
88 | asm("hlt"); | ||
89 | } | ||
90 | |||
91 | /* Wait for all other CPUs to have run smp_stop_cpu */ | ||
92 | while (!cpus_empty(cpu_online_map)) | ||
93 | rep_nop(); | ||
94 | } | ||
95 | #endif | ||
96 | |||
97 | static inline void kb_wait(void) | ||
98 | { | ||
99 | int i; | ||
100 | |||
101 | for (i=0; i<0x10000; i++) | ||
102 | if ((inb_p(0x64) & 0x02) == 0) | ||
103 | break; | ||
104 | } | ||
105 | |||
106 | void machine_restart(char * __unused) | ||
107 | { | ||
108 | int i; | ||
109 | |||
110 | printk("machine restart\n"); | ||
111 | |||
112 | #ifdef CONFIG_SMP | ||
113 | smp_halt(); | ||
114 | #endif | ||
115 | |||
116 | if (!reboot_force) { | ||
117 | local_irq_disable(); | ||
118 | #ifndef CONFIG_SMP | ||
119 | disable_local_APIC(); | ||
120 | #endif | ||
121 | disable_IO_APIC(); | ||
122 | local_irq_enable(); | ||
123 | } | ||
124 | |||
125 | /* Tell the BIOS if we want cold or warm reboot */ | ||
126 | *((unsigned short *)__va(0x472)) = reboot_mode; | ||
127 | |||
128 | for (;;) { | ||
129 | /* Could also try the reset bit in the Hammer NB */ | ||
130 | switch (reboot_type) { | ||
131 | case BOOT_KBD: | ||
132 | for (i=0; i<100; i++) { | ||
133 | kb_wait(); | ||
134 | udelay(50); | ||
135 | outb(0xfe,0x64); /* pulse reset low */ | ||
136 | udelay(50); | ||
137 | } | ||
138 | |||
139 | case BOOT_TRIPLE: | ||
140 | __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); | ||
141 | __asm__ __volatile__("int3"); | ||
142 | |||
143 | reboot_type = BOOT_KBD; | ||
144 | break; | ||
145 | } | ||
146 | } | ||
147 | } | ||
148 | |||
149 | EXPORT_SYMBOL(machine_restart); | ||
150 | |||
151 | void machine_halt(void) | ||
152 | { | ||
153 | } | ||
154 | |||
155 | EXPORT_SYMBOL(machine_halt); | ||
156 | |||
157 | void machine_power_off(void) | ||
158 | { | ||
159 | if (pm_power_off) | ||
160 | pm_power_off(); | ||
161 | } | ||
162 | |||
163 | EXPORT_SYMBOL(machine_power_off); | ||
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c new file mode 100644 index 000000000000..48f7c18172b9 --- /dev/null +++ b/arch/x86_64/kernel/semaphore.c | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | * x86_64 semaphore implementation. | ||
3 | * | ||
4 | * (C) Copyright 1999 Linus Torvalds | ||
5 | * | ||
6 | * Portions Copyright 1999 Red Hat, Inc. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version | ||
11 | * 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> | ||
14 | */ | ||
15 | #include <linux/config.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <asm/errno.h> | ||
19 | |||
20 | #include <asm/semaphore.h> | ||
21 | |||
22 | /* | ||
23 | * Semaphores are implemented using a two-way counter: | ||
24 | * The "count" variable is decremented for each process | ||
25 | * that tries to acquire the semaphore, while the "sleeping" | ||
26 | * variable is a count of such acquires. | ||
27 | * | ||
28 | * Notably, the inline "up()" and "down()" functions can | ||
29 | * efficiently test if they need to do any extra work (up | ||
30 | * needs to do something only if count was negative before | ||
31 | * the increment operation). | ||
32 | * | ||
33 | * "sleeping" and the contention routine ordering is protected | ||
34 | * by the spinlock in the semaphore's waitqueue head. | ||
35 | * | ||
36 | * Note that these functions are only called when there is | ||
37 | * contention on the lock, and as such all this is the | ||
38 | * "non-critical" part of the whole semaphore business. The | ||
39 | * critical part is the inline stuff in <asm/semaphore.h> | ||
40 | * where we want to avoid any extra jumps and calls. | ||
41 | */ | ||
42 | |||
43 | /* | ||
44 | * Logic: | ||
45 | * - only on a boundary condition do we need to care. When we go | ||
46 | * from a negative count to a non-negative, we wake people up. | ||
47 | * - when we go from a non-negative count to a negative, we | ||
48 | * (a) synchronize with the "sleeper" count and (b) make sure | ||
49 | * that we're on the wakeup list before we synchronize so that | ||
50 | * we cannot lose wakeup events. | ||
51 | */ | ||
52 | |||
53 | void __up(struct semaphore *sem) | ||
54 | { | ||
55 | wake_up(&sem->wait); | ||
56 | } | ||
57 | |||
58 | void __sched __down(struct semaphore * sem) | ||
59 | { | ||
60 | struct task_struct *tsk = current; | ||
61 | DECLARE_WAITQUEUE(wait, tsk); | ||
62 | unsigned long flags; | ||
63 | |||
64 | tsk->state = TASK_UNINTERRUPTIBLE; | ||
65 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
66 | add_wait_queue_exclusive_locked(&sem->wait, &wait); | ||
67 | |||
68 | sem->sleepers++; | ||
69 | for (;;) { | ||
70 | int sleepers = sem->sleepers; | ||
71 | |||
72 | /* | ||
73 | * Add "everybody else" into it. They aren't | ||
74 | * playing, because we own the spinlock in | ||
75 | * the wait_queue_head. | ||
76 | */ | ||
77 | if (!atomic_add_negative(sleepers - 1, &sem->count)) { | ||
78 | sem->sleepers = 0; | ||
79 | break; | ||
80 | } | ||
81 | sem->sleepers = 1; /* us - see -1 above */ | ||
82 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
83 | |||
84 | schedule(); | ||
85 | |||
86 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
87 | tsk->state = TASK_UNINTERRUPTIBLE; | ||
88 | } | ||
89 | remove_wait_queue_locked(&sem->wait, &wait); | ||
90 | wake_up_locked(&sem->wait); | ||
91 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
92 | tsk->state = TASK_RUNNING; | ||
93 | } | ||
94 | |||
95 | int __sched __down_interruptible(struct semaphore * sem) | ||
96 | { | ||
97 | int retval = 0; | ||
98 | struct task_struct *tsk = current; | ||
99 | DECLARE_WAITQUEUE(wait, tsk); | ||
100 | unsigned long flags; | ||
101 | |||
102 | tsk->state = TASK_INTERRUPTIBLE; | ||
103 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
104 | add_wait_queue_exclusive_locked(&sem->wait, &wait); | ||
105 | |||
106 | sem->sleepers++; | ||
107 | for (;;) { | ||
108 | int sleepers = sem->sleepers; | ||
109 | |||
110 | /* | ||
111 | * With signals pending, this turns into | ||
112 | * the trylock failure case - we won't be | ||
113 | * sleeping, and we can't get the lock as | ||
114 | * it has contention. Just correct the count | ||
115 | * and exit. | ||
116 | */ | ||
117 | if (signal_pending(current)) { | ||
118 | retval = -EINTR; | ||
119 | sem->sleepers = 0; | ||
120 | atomic_add(sleepers, &sem->count); | ||
121 | break; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Add "everybody else" into it. They aren't | ||
126 | * playing, because we own the spinlock in | ||
127 | * wait_queue_head. The "-1" is because we're | ||
128 | * still hoping to get the semaphore. | ||
129 | */ | ||
130 | if (!atomic_add_negative(sleepers - 1, &sem->count)) { | ||
131 | sem->sleepers = 0; | ||
132 | break; | ||
133 | } | ||
134 | sem->sleepers = 1; /* us - see -1 above */ | ||
135 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
136 | |||
137 | schedule(); | ||
138 | |||
139 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
140 | tsk->state = TASK_INTERRUPTIBLE; | ||
141 | } | ||
142 | remove_wait_queue_locked(&sem->wait, &wait); | ||
143 | wake_up_locked(&sem->wait); | ||
144 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
145 | |||
146 | tsk->state = TASK_RUNNING; | ||
147 | return retval; | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Trylock failed - make sure we correct for | ||
152 | * having decremented the count. | ||
153 | * | ||
154 | * We could have done the trylock with a | ||
155 | * single "cmpxchg" without failure cases, | ||
156 | * but then it wouldn't work on a 386. | ||
157 | */ | ||
158 | int __down_trylock(struct semaphore * sem) | ||
159 | { | ||
160 | int sleepers; | ||
161 | unsigned long flags; | ||
162 | |||
163 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
164 | sleepers = sem->sleepers + 1; | ||
165 | sem->sleepers = 0; | ||
166 | |||
167 | /* | ||
168 | * Add "everybody else" and us into it. They aren't | ||
169 | * playing, because we own the spinlock in the | ||
170 | * wait_queue_head. | ||
171 | */ | ||
172 | if (!atomic_add_negative(sleepers, &sem->count)) { | ||
173 | wake_up_locked(&sem->wait); | ||
174 | } | ||
175 | |||
176 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
177 | return 1; | ||
178 | } | ||
179 | |||
180 | |||
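These are only the contended slow paths; the fast paths are the inline down()/up() in <asm/semaphore.h>. A typical in-kernel user of that era would look roughly like the hypothetical driver fragment below (names mydev_sem and mydev_do_io are made up for illustration).

    #include <linux/errno.h>
    #include <asm/semaphore.h>

    static DECLARE_MUTEX(mydev_sem);        /* struct semaphore, count = 1 */

    static int mydev_do_io(void)
    {
        /* sleeps in __down_interruptible() above if contended */
        if (down_interruptible(&mydev_sem))
            return -ERESTARTSYS;

        /* ... exclusive section ... */

        up(&mydev_sem);                     /* wakes a waiter via __up() */
        return 0;
    }

Note that down_trylock() returns non-zero when the semaphore could not be taken, the opposite convention of most try-lock interfaces.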
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c new file mode 100644 index 000000000000..e50fc41de6b9 --- /dev/null +++ b/arch/x86_64/kernel/setup.c | |||
@@ -0,0 +1,1189 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/setup.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Nov 2001 Dave Jones <davej@suse.de> | ||
7 | * Forked from i386 setup code. | ||
8 | * | ||
9 | * $Id$ | ||
10 | */ | ||
11 | |||
12 | /* | ||
13 | * This file handles the architecture-dependent parts of initialization | ||
14 | */ | ||
15 | |||
16 | #include <linux/errno.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/stddef.h> | ||
21 | #include <linux/unistd.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/user.h> | ||
25 | #include <linux/a.out.h> | ||
26 | #include <linux/tty.h> | ||
27 | #include <linux/ioport.h> | ||
28 | #include <linux/delay.h> | ||
29 | #include <linux/config.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <linux/initrd.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/bootmem.h> | ||
34 | #include <linux/module.h> | ||
35 | #include <asm/processor.h> | ||
36 | #include <linux/console.h> | ||
37 | #include <linux/seq_file.h> | ||
38 | #include <linux/root_dev.h> | ||
39 | #include <linux/pci.h> | ||
40 | #include <linux/acpi.h> | ||
41 | #include <linux/kallsyms.h> | ||
42 | #include <linux/edd.h> | ||
43 | #include <asm/mtrr.h> | ||
44 | #include <asm/uaccess.h> | ||
45 | #include <asm/system.h> | ||
46 | #include <asm/io.h> | ||
47 | #include <asm/smp.h> | ||
48 | #include <asm/msr.h> | ||
49 | #include <asm/desc.h> | ||
50 | #include <video/edid.h> | ||
51 | #include <asm/e820.h> | ||
52 | #include <asm/dma.h> | ||
53 | #include <asm/mpspec.h> | ||
54 | #include <asm/mmu_context.h> | ||
55 | #include <asm/bootsetup.h> | ||
56 | #include <asm/proto.h> | ||
57 | #include <asm/setup.h> | ||
58 | #include <asm/mach_apic.h> | ||
59 | #include <asm/numa.h> | ||
60 | |||
61 | /* | ||
62 | * Machine setup.. | ||
63 | */ | ||
64 | |||
65 | struct cpuinfo_x86 boot_cpu_data; | ||
66 | |||
67 | unsigned long mmu_cr4_features; | ||
68 | |||
69 | int acpi_disabled; | ||
70 | EXPORT_SYMBOL(acpi_disabled); | ||
71 | #ifdef CONFIG_ACPI_BOOT | ||
72 | extern int __initdata acpi_ht; | ||
73 | extern acpi_interrupt_flags acpi_sci_flags; | ||
74 | int __initdata acpi_force = 0; | ||
75 | #endif | ||
76 | |||
77 | int acpi_numa __initdata; | ||
78 | |||
79 | /* For PCI or other memory-mapped resources */ | ||
80 | unsigned long pci_mem_start = 0x10000000; | ||
81 | |||
82 | /* Boot loader ID as an integer, for the benefit of proc_dointvec */ | ||
83 | int bootloader_type; | ||
84 | |||
85 | unsigned long saved_video_mode; | ||
86 | |||
87 | #ifdef CONFIG_SWIOTLB | ||
88 | int swiotlb; | ||
89 | EXPORT_SYMBOL(swiotlb); | ||
90 | #endif | ||
91 | |||
92 | /* | ||
93 | * Setup options | ||
94 | */ | ||
95 | struct drive_info_struct { char dummy[32]; } drive_info; | ||
96 | struct screen_info screen_info; | ||
97 | struct sys_desc_table_struct { | ||
98 | unsigned short length; | ||
99 | unsigned char table[0]; | ||
100 | }; | ||
101 | |||
102 | struct edid_info edid_info; | ||
103 | struct e820map e820; | ||
104 | |||
105 | extern int root_mountflags; | ||
106 | extern char _text, _etext, _edata, _end; | ||
107 | |||
108 | char command_line[COMMAND_LINE_SIZE]; | ||
109 | |||
110 | struct resource standard_io_resources[] = { | ||
111 | { .name = "dma1", .start = 0x00, .end = 0x1f, | ||
112 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
113 | { .name = "pic1", .start = 0x20, .end = 0x21, | ||
114 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
115 | { .name = "timer0", .start = 0x40, .end = 0x43, | ||
116 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
117 | { .name = "timer1", .start = 0x50, .end = 0x53, | ||
118 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
119 | { .name = "keyboard", .start = 0x60, .end = 0x6f, | ||
120 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
121 | { .name = "dma page reg", .start = 0x80, .end = 0x8f, | ||
122 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
123 | { .name = "pic2", .start = 0xa0, .end = 0xa1, | ||
124 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
125 | { .name = "dma2", .start = 0xc0, .end = 0xdf, | ||
126 | .flags = IORESOURCE_BUSY | IORESOURCE_IO }, | ||
127 | { .name = "fpu", .start = 0xf0, .end = 0xff, | ||
128 | .flags = IORESOURCE_BUSY | IORESOURCE_IO } | ||
129 | }; | ||
130 | |||
131 | #define STANDARD_IO_RESOURCES \ | ||
132 | (sizeof standard_io_resources / sizeof standard_io_resources[0]) | ||
133 | |||
134 | #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) | ||
135 | |||
136 | struct resource data_resource = { | ||
137 | .name = "Kernel data", | ||
138 | .start = 0, | ||
139 | .end = 0, | ||
140 | .flags = IORESOURCE_RAM, | ||
141 | }; | ||
142 | struct resource code_resource = { | ||
143 | .name = "Kernel code", | ||
144 | .start = 0, | ||
145 | .end = 0, | ||
146 | .flags = IORESOURCE_RAM, | ||
147 | }; | ||
148 | |||
149 | #define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) | ||
150 | |||
151 | static struct resource system_rom_resource = { | ||
152 | .name = "System ROM", | ||
153 | .start = 0xf0000, | ||
154 | .end = 0xfffff, | ||
155 | .flags = IORESOURCE_ROM, | ||
156 | }; | ||
157 | |||
158 | static struct resource extension_rom_resource = { | ||
159 | .name = "Extension ROM", | ||
160 | .start = 0xe0000, | ||
161 | .end = 0xeffff, | ||
162 | .flags = IORESOURCE_ROM, | ||
163 | }; | ||
164 | |||
165 | static struct resource adapter_rom_resources[] = { | ||
166 | { .name = "Adapter ROM", .start = 0xc8000, .end = 0, | ||
167 | .flags = IORESOURCE_ROM }, | ||
168 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
169 | .flags = IORESOURCE_ROM }, | ||
170 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
171 | .flags = IORESOURCE_ROM }, | ||
172 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
173 | .flags = IORESOURCE_ROM }, | ||
174 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
175 | .flags = IORESOURCE_ROM }, | ||
176 | { .name = "Adapter ROM", .start = 0, .end = 0, | ||
177 | .flags = IORESOURCE_ROM } | ||
178 | }; | ||
179 | |||
180 | #define ADAPTER_ROM_RESOURCES \ | ||
181 | (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) | ||
182 | |||
183 | static struct resource video_rom_resource = { | ||
184 | .name = "Video ROM", | ||
185 | .start = 0xc0000, | ||
186 | .end = 0xc7fff, | ||
187 | .flags = IORESOURCE_ROM, | ||
188 | }; | ||
189 | |||
190 | static struct resource video_ram_resource = { | ||
191 | .name = "Video RAM area", | ||
192 | .start = 0xa0000, | ||
193 | .end = 0xbffff, | ||
194 | .flags = IORESOURCE_RAM, | ||
195 | }; | ||
196 | |||
197 | #define romsignature(x) (*(unsigned short *)(x) == 0xaa55) | ||
198 | |||
199 | static int __init romchecksum(unsigned char *rom, unsigned long length) | ||
200 | { | ||
201 | unsigned char *p, sum = 0; | ||
202 | |||
203 | for (p = rom; p < rom + length; p++) | ||
204 | sum += *p; | ||
205 | return sum == 0; | ||
206 | } | ||
207 | |||
208 | static void __init probe_roms(void) | ||
209 | { | ||
210 | unsigned long start, length, upper; | ||
211 | unsigned char *rom; | ||
212 | int i; | ||
213 | |||
214 | /* video rom */ | ||
215 | upper = adapter_rom_resources[0].start; | ||
216 | for (start = video_rom_resource.start; start < upper; start += 2048) { | ||
217 | rom = isa_bus_to_virt(start); | ||
218 | if (!romsignature(rom)) | ||
219 | continue; | ||
220 | |||
221 | video_rom_resource.start = start; | ||
222 | |||
223 | /* 0 < length <= 0x7f * 512, historically */ | ||
224 | length = rom[2] * 512; | ||
225 | |||
226 | /* if checksum okay, trust length byte */ | ||
227 | if (length && romchecksum(rom, length)) | ||
228 | video_rom_resource.end = start + length - 1; | ||
229 | |||
230 | request_resource(&iomem_resource, &video_rom_resource); | ||
231 | break; | ||
232 | } | ||
233 | |||
234 | start = (video_rom_resource.end + 1 + 2047) & ~2047UL; | ||
235 | if (start < upper) | ||
236 | start = upper; | ||
237 | |||
238 | /* system rom */ | ||
239 | request_resource(&iomem_resource, &system_rom_resource); | ||
240 | upper = system_rom_resource.start; | ||
241 | |||
242 | /* check for extension rom (ignore length byte!) */ | ||
243 | rom = isa_bus_to_virt(extension_rom_resource.start); | ||
244 | if (romsignature(rom)) { | ||
245 | length = extension_rom_resource.end - extension_rom_resource.start + 1; | ||
246 | if (romchecksum(rom, length)) { | ||
247 | request_resource(&iomem_resource, &extension_rom_resource); | ||
248 | upper = extension_rom_resource.start; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | /* check for adapter roms on 2k boundaries */ | ||
253 | for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { | ||
254 | rom = isa_bus_to_virt(start); | ||
255 | if (!romsignature(rom)) | ||
256 | continue; | ||
257 | |||
258 | /* 0 < length <= 0x7f * 512, historically */ | ||
259 | length = rom[2] * 512; | ||
260 | |||
261 | /* but accept any length that fits if checksum okay */ | ||
262 | if (!length || start + length > upper || !romchecksum(rom, length)) | ||
263 | continue; | ||
264 | |||
265 | adapter_rom_resources[i].start = start; | ||
266 | adapter_rom_resources[i].end = start + length - 1; | ||
267 | request_resource(&iomem_resource, &adapter_rom_resources[i]); | ||
268 | |||
269 | start = adapter_rom_resources[i++].end & ~2047UL; | ||
270 | } | ||
271 | } | ||
272 | |||
273 | static __init void parse_cmdline_early (char ** cmdline_p) | ||
274 | { | ||
275 | char c = ' ', *to = command_line, *from = COMMAND_LINE; | ||
276 | int len = 0; | ||
277 | |||
278 | /* Save unparsed command line copy for /proc/cmdline */ | ||
279 | memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); | ||
280 | saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; | ||
281 | |||
282 | for (;;) { | ||
283 | if (c != ' ') | ||
284 | goto next_char; | ||
285 | |||
286 | #ifdef CONFIG_SMP | ||
287 | /* | ||
288 | * If the BIOS enumerates physical processors before logical, | ||
289 | * maxcpus=N at enumeration-time can be used to disable HT. | ||
290 | */ | ||
291 | else if (!memcmp(from, "maxcpus=", 8)) { | ||
292 | extern unsigned int maxcpus; | ||
293 | |||
294 | maxcpus = simple_strtoul(from + 8, NULL, 0); | ||
295 | } | ||
296 | #endif | ||
297 | #ifdef CONFIG_ACPI_BOOT | ||
298 | /* "acpi=off" disables both ACPI table parsing and interpreter init */ | ||
299 | if (!memcmp(from, "acpi=off", 8)) | ||
300 | disable_acpi(); | ||
301 | |||
302 | if (!memcmp(from, "acpi=force", 10)) { | ||
303 | /* add later when we do DMI horrors: */ | ||
304 | acpi_force = 1; | ||
305 | acpi_disabled = 0; | ||
306 | } | ||
307 | |||
308 | /* acpi=ht just means: do ACPI MADT parsing | ||
309 | at bootup, but don't enable the full ACPI interpreter */ | ||
310 | if (!memcmp(from, "acpi=ht", 7)) { | ||
311 | if (!acpi_force) | ||
312 | disable_acpi(); | ||
313 | acpi_ht = 1; | ||
314 | } | ||
315 | else if (!memcmp(from, "pci=noacpi", 10)) | ||
316 | acpi_disable_pci(); | ||
317 | else if (!memcmp(from, "acpi=noirq", 10)) | ||
318 | acpi_noirq_set(); | ||
319 | |||
320 | else if (!memcmp(from, "acpi_sci=edge", 13)) | ||
321 | acpi_sci_flags.trigger = 1; | ||
322 | else if (!memcmp(from, "acpi_sci=level", 14)) | ||
323 | acpi_sci_flags.trigger = 3; | ||
324 | else if (!memcmp(from, "acpi_sci=high", 13)) | ||
325 | acpi_sci_flags.polarity = 1; | ||
326 | else if (!memcmp(from, "acpi_sci=low", 12)) | ||
327 | acpi_sci_flags.polarity = 3; | ||
328 | |||
329 | /* acpi=strict disables out-of-spec workarounds */ | ||
330 | else if (!memcmp(from, "acpi=strict", 11)) { | ||
331 | acpi_strict = 1; | ||
332 | } | ||
333 | #endif | ||
334 | |||
335 | if (!memcmp(from, "nolapic", 7) || | ||
336 | !memcmp(from, "disableapic", 11)) | ||
337 | disable_apic = 1; | ||
338 | |||
339 | if (!memcmp(from, "noapic", 6)) | ||
340 | skip_ioapic_setup = 1; | ||
341 | |||
342 | if (!memcmp(from, "apic", 4)) { | ||
343 | skip_ioapic_setup = 0; | ||
344 | ioapic_force = 1; | ||
345 | } | ||
346 | |||
347 | if (!memcmp(from, "mem=", 4)) | ||
348 | parse_memopt(from+4, &from); | ||
349 | |||
350 | #ifdef CONFIG_DISCONTIGMEM | ||
351 | if (!memcmp(from, "numa=", 5)) | ||
352 | numa_setup(from+5); | ||
353 | #endif | ||
354 | |||
355 | #ifdef CONFIG_GART_IOMMU | ||
356 | if (!memcmp(from,"iommu=",6)) { | ||
357 | iommu_setup(from+6); | ||
358 | } | ||
359 | #endif | ||
360 | |||
361 | if (!memcmp(from,"oops=panic", 10)) | ||
362 | panic_on_oops = 1; | ||
363 | |||
364 | if (!memcmp(from, "noexec=", 7)) | ||
365 | nonx_setup(from + 7); | ||
366 | |||
367 | next_char: | ||
368 | c = *(from++); | ||
369 | if (!c) | ||
370 | break; | ||
371 | if (COMMAND_LINE_SIZE <= ++len) | ||
372 | break; | ||
373 | *(to++) = c; | ||
374 | } | ||
375 | *to = '\0'; | ||
376 | *cmdline_p = command_line; | ||
377 | } | ||
378 | |||
379 | #ifndef CONFIG_DISCONTIGMEM | ||
380 | static void __init contig_initmem_init(void) | ||
381 | { | ||
382 | unsigned long bootmap_size, bootmap; | ||
383 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | ||
384 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); | ||
385 | if (bootmap == -1L) | ||
386 | panic("Cannot find bootmem map of size %ld\n",bootmap_size); | ||
387 | bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); | ||
388 | e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); | ||
389 | reserve_bootmem(bootmap, bootmap_size); | ||
390 | } | ||
391 | #endif | ||
392 | |||
393 | /* Use inline assembly to define this because the nops are defined | ||
394 | as inline assembly strings in the include files and we cannot | ||
395 | get them easily into strings. */ | ||
396 | asm("\t.data\nk8nops: " | ||
397 | K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 | ||
398 | K8_NOP7 K8_NOP8); | ||
399 | |||
400 | extern unsigned char k8nops[]; | ||
401 | static unsigned char *k8_nops[ASM_NOP_MAX+1] = { | ||
402 | NULL, | ||
403 | k8nops, | ||
404 | k8nops + 1, | ||
405 | k8nops + 1 + 2, | ||
406 | k8nops + 1 + 2 + 3, | ||
407 | k8nops + 1 + 2 + 3 + 4, | ||
408 | k8nops + 1 + 2 + 3 + 4 + 5, | ||
409 | k8nops + 1 + 2 + 3 + 4 + 5 + 6, | ||
410 | k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, | ||
411 | }; | ||
412 | |||
413 | /* Replace instructions with better alternatives for this CPU type. | ||
414 | |||
415 | This runs before SMP is initialized to avoid SMP problems with | ||
416 | self modifying code. This implies that asymmetric systems where | ||
417 | APs have fewer capabilities than the boot processor are not handled. | ||
418 | In this case boot with "noreplacement". */ | ||
419 | void apply_alternatives(void *start, void *end) | ||
420 | { | ||
421 | struct alt_instr *a; | ||
422 | int diff, i, k; | ||
423 | for (a = start; (void *)a < end; a++) { | ||
424 | if (!boot_cpu_has(a->cpuid)) | ||
425 | continue; | ||
426 | |||
427 | BUG_ON(a->replacementlen > a->instrlen); | ||
428 | __inline_memcpy(a->instr, a->replacement, a->replacementlen); | ||
429 | diff = a->instrlen - a->replacementlen; | ||
430 | |||
431 | /* Pad the rest with nops */ | ||
432 | for (i = a->replacementlen; diff > 0; diff -= k, i += k) { | ||
433 | k = diff; | ||
434 | if (k > ASM_NOP_MAX) | ||
435 | k = ASM_NOP_MAX; | ||
436 | __inline_memcpy(a->instr + i, k8_nops[k], k); | ||
437 | } | ||
438 | } | ||
439 | } | ||
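The padding loop at the end of apply_alternatives() covers the leftover bytes with the longest nops available, at most ASM_NOP_MAX at a time. Below is a small stand-alone sketch of just that arithmetic, assuming ASM_NOP_MAX is 8 (the longest entry in the k8_nops[] table above); the helper name is hypothetical.

    #include <stdio.h>

    #define ASM_NOP_MAX 8   /* longest nop in the k8_nops[] table */

    /* Show how an (instrlen - replacementlen) gap is covered with nops,
       mirroring the loop at the end of apply_alternatives(). */
    static void show_padding(int instrlen, int replacementlen)
    {
        int diff = instrlen - replacementlen;
        int i = replacementlen, k;

        for (; diff > 0; diff -= k, i += k) {
            k = diff > ASM_NOP_MAX ? ASM_NOP_MAX : diff;
            printf("  pad %d byte(s) at offset %d\n", k, i);
        }
    }

    int main(void)
    {
        show_padding(12, 3);    /* 8-byte nop at offset 3, 1-byte nop at 11 */
        return 0;
    }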
440 | |||
441 | static int no_replacement __initdata = 0; | ||
442 | |||
443 | void __init alternative_instructions(void) | ||
444 | { | ||
445 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | ||
446 | if (no_replacement) | ||
447 | return; | ||
448 | apply_alternatives(__alt_instructions, __alt_instructions_end); | ||
449 | } | ||
450 | |||
451 | static int __init noreplacement_setup(char *s) | ||
452 | { | ||
453 | no_replacement = 1; | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | __setup("noreplacement", noreplacement_setup); | ||
458 | |||
459 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | ||
460 | struct edd edd; | ||
461 | #ifdef CONFIG_EDD_MODULE | ||
462 | EXPORT_SYMBOL(edd); | ||
463 | #endif | ||
464 | /** | ||
465 | * copy_edd() - Copy the BIOS EDD information | ||
466 | * from boot_params into a safe place. | ||
467 | * | ||
468 | */ | ||
469 | static inline void copy_edd(void) | ||
470 | { | ||
471 | memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); | ||
472 | memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); | ||
473 | edd.mbr_signature_nr = EDD_MBR_SIG_NR; | ||
474 | edd.edd_info_nr = EDD_NR; | ||
475 | } | ||
476 | #else | ||
477 | static inline void copy_edd(void) | ||
478 | { | ||
479 | } | ||
480 | #endif | ||
481 | |||
482 | #define EBDA_ADDR_POINTER 0x40E | ||
483 | static void __init reserve_ebda_region(void) | ||
484 | { | ||
485 | unsigned int addr; | ||
486 | /** | ||
487 | * there is a real-mode segmented pointer pointing to the | ||
488 | * 4K EBDA area at 0x40E | ||
489 | */ | ||
490 | addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); | ||
491 | addr <<= 4; | ||
492 | if (addr) | ||
493 | reserve_bootmem_generic(addr, PAGE_SIZE); | ||
494 | } | ||
495 | |||
496 | void __init setup_arch(char **cmdline_p) | ||
497 | { | ||
498 | unsigned long low_mem_size; | ||
499 | unsigned long kernel_end; | ||
500 | |||
501 | ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); | ||
502 | drive_info = DRIVE_INFO; | ||
503 | screen_info = SCREEN_INFO; | ||
504 | edid_info = EDID_INFO; | ||
505 | saved_video_mode = SAVED_VIDEO_MODE; | ||
506 | bootloader_type = LOADER_TYPE; | ||
507 | |||
508 | #ifdef CONFIG_BLK_DEV_RAM | ||
509 | rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; | ||
510 | rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); | ||
511 | rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); | ||
512 | #endif | ||
513 | setup_memory_region(); | ||
514 | copy_edd(); | ||
515 | |||
516 | if (!MOUNT_ROOT_RDONLY) | ||
517 | root_mountflags &= ~MS_RDONLY; | ||
518 | init_mm.start_code = (unsigned long) &_text; | ||
519 | init_mm.end_code = (unsigned long) &_etext; | ||
520 | init_mm.end_data = (unsigned long) &_edata; | ||
521 | init_mm.brk = (unsigned long) &_end; | ||
522 | |||
523 | code_resource.start = virt_to_phys(&_text); | ||
524 | code_resource.end = virt_to_phys(&_etext)-1; | ||
525 | data_resource.start = virt_to_phys(&_etext); | ||
526 | data_resource.end = virt_to_phys(&_edata)-1; | ||
527 | |||
528 | parse_cmdline_early(cmdline_p); | ||
529 | |||
530 | early_identify_cpu(&boot_cpu_data); | ||
531 | |||
532 | /* | ||
533 | * partially used pages are not usable - thus | ||
534 | * we are rounding upwards: | ||
535 | */ | ||
536 | end_pfn = e820_end_of_ram(); | ||
537 | |||
538 | check_efer(); | ||
539 | |||
540 | init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); | ||
541 | |||
542 | #ifdef CONFIG_ACPI_BOOT | ||
543 | /* | ||
544 | * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). | ||
545 | * Call this early for SRAT node setup. | ||
546 | */ | ||
547 | acpi_boot_table_init(); | ||
548 | #endif | ||
549 | |||
550 | #ifdef CONFIG_ACPI_NUMA | ||
551 | /* | ||
552 | * Parse SRAT to discover nodes. | ||
553 | */ | ||
554 | acpi_numa_init(); | ||
555 | #endif | ||
556 | |||
557 | #ifdef CONFIG_DISCONTIGMEM | ||
558 | numa_initmem_init(0, end_pfn); | ||
559 | #else | ||
560 | contig_initmem_init(); | ||
561 | #endif | ||
562 | |||
563 | /* Reserve direct mapping */ | ||
564 | reserve_bootmem_generic(table_start << PAGE_SHIFT, | ||
565 | (table_end - table_start) << PAGE_SHIFT); | ||
566 | |||
567 | /* reserve kernel */ | ||
568 | kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); | ||
569 | reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY); | ||
570 | |||
571 | /* | ||
572 | * reserve physical page 0 - it's a special BIOS page on many boxes, | ||
573 | * enabling clean reboots, SMP operation, laptop functions. | ||
574 | */ | ||
575 | reserve_bootmem_generic(0, PAGE_SIZE); | ||
576 | |||
577 | /* reserve ebda region */ | ||
578 | reserve_ebda_region(); | ||
579 | |||
580 | #ifdef CONFIG_SMP | ||
581 | /* | ||
582 | * But first pinch a few for the stack/trampoline stuff | ||
583 | * FIXME: Don't need the extra page at 4K, but need to fix | ||
584 | * trampoline before removing it. (see the GDT stuff) | ||
585 | */ | ||
586 | reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); | ||
587 | |||
588 | /* Reserve SMP trampoline */ | ||
589 | reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); | ||
590 | #endif | ||
591 | |||
592 | #ifdef CONFIG_ACPI_SLEEP | ||
593 | /* | ||
594 | * Reserve low memory region for sleep support. | ||
595 | */ | ||
596 | acpi_reserve_bootmem(); | ||
597 | #endif | ||
598 | #ifdef CONFIG_X86_LOCAL_APIC | ||
599 | /* | ||
600 | * Find and reserve possible boot-time SMP configuration: | ||
601 | */ | ||
602 | find_smp_config(); | ||
603 | #endif | ||
604 | #ifdef CONFIG_BLK_DEV_INITRD | ||
605 | if (LOADER_TYPE && INITRD_START) { | ||
606 | if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { | ||
607 | reserve_bootmem_generic(INITRD_START, INITRD_SIZE); | ||
608 | initrd_start = | ||
609 | INITRD_START ? INITRD_START + PAGE_OFFSET : 0; | ||
610 | initrd_end = initrd_start+INITRD_SIZE; | ||
611 | } | ||
612 | else { | ||
613 | printk(KERN_ERR "initrd extends beyond end of memory " | ||
614 | "(0x%08lx > 0x%08lx)\ndisabling initrd\n", | ||
615 | (unsigned long)(INITRD_START + INITRD_SIZE), | ||
616 | (unsigned long)(end_pfn << PAGE_SHIFT)); | ||
617 | initrd_start = 0; | ||
618 | } | ||
619 | } | ||
620 | #endif | ||
621 | paging_init(); | ||
622 | |||
623 | check_ioapic(); | ||
624 | |||
625 | #ifdef CONFIG_ACPI_BOOT | ||
626 | /* | ||
627 | * Read APIC and some other early information from ACPI tables. | ||
628 | */ | ||
629 | acpi_boot_init(); | ||
630 | #endif | ||
631 | |||
632 | #ifdef CONFIG_X86_LOCAL_APIC | ||
633 | /* | ||
634 | * get boot-time SMP configuration: | ||
635 | */ | ||
636 | if (smp_found_config) | ||
637 | get_smp_config(); | ||
638 | init_apic_mappings(); | ||
639 | #endif | ||
640 | |||
641 | /* | ||
642 | * Request address space for all standard RAM and ROM resources | ||
643 | * and also for regions reported as reserved by the e820. | ||
644 | */ | ||
645 | probe_roms(); | ||
646 | e820_reserve_resources(); | ||
647 | |||
648 | request_resource(&iomem_resource, &video_ram_resource); | ||
649 | |||
650 | { | ||
651 | unsigned i; | ||
652 | /* request I/O space for devices used on all i[345]86 PCs */ | ||
653 | for (i = 0; i < STANDARD_IO_RESOURCES; i++) | ||
654 | request_resource(&ioport_resource, &standard_io_resources[i]); | ||
655 | } | ||
656 | |||
657 | /* Will likely break when you have unassigned resources with more | ||
658 | than 4GB memory and bridges that don't support more than 4GB. | ||
659 | Doing it properly would require to use pci_alloc_consistent | ||
660 | in this case. */ | ||
661 | low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; | ||
662 | if (low_mem_size > pci_mem_start) | ||
663 | pci_mem_start = low_mem_size; | ||
664 | |||
665 | #ifdef CONFIG_GART_IOMMU | ||
666 | iommu_hole_init(); | ||
667 | #endif | ||
668 | |||
669 | #ifdef CONFIG_VT | ||
670 | #if defined(CONFIG_VGA_CONSOLE) | ||
671 | conswitchp = &vga_con; | ||
672 | #elif defined(CONFIG_DUMMY_CONSOLE) | ||
673 | conswitchp = &dummy_con; | ||
674 | #endif | ||
675 | #endif | ||
676 | } | ||
677 | |||
678 | static int __init get_model_name(struct cpuinfo_x86 *c) | ||
679 | { | ||
680 | unsigned int *v; | ||
681 | |||
682 | if (c->x86_cpuid_level < 0x80000004) | ||
683 | return 0; | ||
684 | |||
685 | v = (unsigned int *) c->x86_model_id; | ||
686 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | ||
687 | cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | ||
688 | cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | ||
689 | c->x86_model_id[48] = 0; | ||
690 | return 1; | ||
691 | } | ||
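get_model_name() stitches the 48-byte brand string together from CPUID leaves 0x80000002..0x80000004. The same data can be read from user space, for example with GCC's <cpuid.h> helper; this is a sketch, not kernel code.

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int v[13] = { 0 };     /* 48 string bytes + trailing NUL */
        unsigned int i;

        for (i = 0; i < 3; i++)
            __get_cpuid(0x80000002 + i,
                        &v[4 * i], &v[4 * i + 1],
                        &v[4 * i + 2], &v[4 * i + 3]);
        printf("brand string: %s\n", (char *)v);
        return 0;
    }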
692 | |||
693 | |||
694 | static void __init display_cacheinfo(struct cpuinfo_x86 *c) | ||
695 | { | ||
696 | unsigned int n, dummy, eax, ebx, ecx, edx; | ||
697 | |||
698 | n = c->x86_cpuid_level; | ||
699 | |||
700 | if (n >= 0x80000005) { | ||
701 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | ||
702 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | ||
703 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | ||
704 | c->x86_cache_size=(ecx>>24)+(edx>>24); | ||
705 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
706 | c->x86_tlbsize = 0; | ||
707 | } | ||
708 | |||
709 | if (n >= 0x80000006) { | ||
710 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | ||
711 | ecx = cpuid_ecx(0x80000006); | ||
712 | c->x86_cache_size = ecx >> 16; | ||
713 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
714 | |||
715 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | ||
716 | c->x86_cache_size, ecx & 0xFF); | ||
717 | } | ||
718 | |||
719 | if (n >= 0x80000007) | ||
720 | cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); | ||
721 | if (n >= 0x80000008) { | ||
722 | cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); | ||
723 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
724 | c->x86_phys_bits = eax & 0xff; | ||
725 | } | ||
726 | } | ||
727 | |||
728 | |||
729 | static int __init init_amd(struct cpuinfo_x86 *c) | ||
730 | { | ||
731 | int r; | ||
732 | int level; | ||
733 | #ifdef CONFIG_NUMA | ||
734 | int cpu; | ||
735 | #endif | ||
736 | |||
737 | /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; | ||
738 | 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ | ||
739 | clear_bit(0*32+31, &c->x86_capability); | ||
740 | |||
741 | /* C-stepping K8? */ | ||
742 | level = cpuid_eax(1); | ||
743 | if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) | ||
744 | set_bit(X86_FEATURE_K8_C, &c->x86_capability); | ||
745 | |||
746 | r = get_model_name(c); | ||
747 | if (!r) { | ||
748 | switch (c->x86) { | ||
749 | case 15: | ||
750 | /* Should distinguish models here, but this is only | ||
751 | a fallback anyway. */ | ||
752 | strcpy(c->x86_model_id, "Hammer"); | ||
753 | break; | ||
754 | } | ||
755 | } | ||
756 | display_cacheinfo(c); | ||
757 | |||
758 | if (c->x86_cpuid_level >= 0x80000008) { | ||
759 | c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; | ||
760 | if (c->x86_num_cores & (c->x86_num_cores - 1)) | ||
761 | c->x86_num_cores = 1; | ||
762 | |||
763 | #ifdef CONFIG_NUMA | ||
764 | /* On a dual core setup the lower bits of apic id | ||
765 | distinguish the cores. Fix up the CPU<->node mappings | ||
766 | here based on that. | ||
767 | Assumes number of cores is a power of two. | ||
768 | When using SRAT use mapping from SRAT. */ | ||
769 | cpu = c->x86_apicid; | ||
770 | if (acpi_numa <= 0 && c->x86_num_cores > 1) { | ||
771 | cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1); | ||
772 | if (!node_online(cpu_to_node[cpu])) | ||
773 | cpu_to_node[cpu] = first_node(node_online_map); | ||
774 | } | ||
775 | printk(KERN_INFO "CPU %d(%d) -> Node %d\n", | ||
776 | cpu, c->x86_num_cores, cpu_to_node[cpu]); | ||
777 | #endif | ||
778 | } | ||
779 | |||
780 | return r; | ||
781 | } | ||
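The cpu_to_node fixup in init_amd() above works because hweight32(num_cores - 1) equals log2(num_cores) when the core count is a power of two, so shifting the APIC ID right by that many bits drops the per-core bits and leaves a per-package index. A standalone sketch of that arithmetic (the core count and APIC IDs below are made-up sample values):

#include <stdio.h>

/* Userspace stand-in for the kernel's hweight32() (population count). */
static unsigned int popcount32(unsigned int v)
{
	unsigned int bits = 0;

	for (; v; v &= v - 1)
		bits++;
	return bits;
}

int main(void)
{
	unsigned int num_cores = 2;	/* assumed to be a power of two, as in init_amd() */
	unsigned int apicid;

	for (apicid = 0; apicid < 8; apicid++)
		printf("apicid %u -> node %u\n",
		       apicid, apicid >> popcount32(num_cores - 1));
	return 0;
}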
782 | |||
783 | static void __init detect_ht(struct cpuinfo_x86 *c) | ||
784 | { | ||
785 | #ifdef CONFIG_SMP | ||
786 | u32 eax, ebx, ecx, edx; | ||
787 | int index_lsb, index_msb, tmp; | ||
788 | int cpu = smp_processor_id(); | ||
789 | |||
790 | if (!cpu_has(c, X86_FEATURE_HT)) | ||
791 | return; | ||
792 | |||
793 | cpuid(1, &eax, &ebx, &ecx, &edx); | ||
794 | smp_num_siblings = (ebx & 0xff0000) >> 16; | ||
795 | |||
796 | if (smp_num_siblings == 1) { | ||
797 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | ||
798 | } else if (smp_num_siblings > 1) { | ||
799 | index_lsb = 0; | ||
800 | index_msb = 31; | ||
801 | /* | ||
802 | * At this point we only support two siblings per | ||
803 | * processor package. | ||
804 | */ | ||
805 | if (smp_num_siblings > NR_CPUS) { | ||
806 | printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings); | ||
807 | smp_num_siblings = 1; | ||
808 | return; | ||
809 | } | ||
810 | tmp = smp_num_siblings; | ||
811 | while ((tmp & 1) == 0) { | ||
812 | tmp >>= 1; | ||
813 | index_lsb++; | ||
814 | } | ||
815 | tmp = smp_num_siblings; | ||
816 | while ((tmp & 0x80000000) == 0) { | ||
817 | tmp <<= 1; | ||
818 | index_msb--; | ||
819 | } | ||
820 | if (index_lsb != index_msb) | ||
821 | index_msb++; | ||
822 | phys_proc_id[cpu] = phys_pkg_id(index_msb); | ||
823 | |||
824 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | ||
825 | phys_proc_id[cpu]); | ||
826 | } | ||
827 | #endif | ||
828 | } | ||
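The two loops in detect_ht() find the lowest and highest set bit of smp_num_siblings and bump the result by one when they differ, i.e. they round the sibling count up to the next power of two; the result is the number of low APIC-ID bits used for sibling numbering and is what gets passed to phys_pkg_id(). A userspace sketch of the same computation (sample sibling counts only):

#include <stdio.h>

/* Mirrors detect_ht()'s index_lsb/index_msb computation: ceil(log2(siblings)). */
static int sibling_id_bits(unsigned int siblings)
{
	int index_lsb = 0, index_msb = 31;
	unsigned int tmp;

	for (tmp = siblings; (tmp & 1) == 0; tmp >>= 1)
		index_lsb++;
	for (tmp = siblings; (tmp & 0x80000000u) == 0; tmp <<= 1)
		index_msb--;
	if (index_lsb != index_msb)	/* not a power of two: round up */
		index_msb++;
	return index_msb;
}

int main(void)
{
	unsigned int s;

	for (s = 1; s <= 8; s++)
		printf("%u siblings -> shift APIC ID by %d bits\n", s, sibling_id_bits(s));
	return 0;
}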
829 | |||
830 | static void __init sched_cmp_hack(struct cpuinfo_x86 *c) | ||
831 | { | ||
832 | #ifdef CONFIG_SMP | ||
833 | /* AMD dual core looks like HT but isn't really. Hide it from the | ||
834 | scheduler. This works around problems with the domain scheduler. | ||
835 | Also probably gives slightly better scheduling and disables | ||
836 | SMT nice which is harmful on dual core. | ||
837 | TBD tune the domain scheduler for dual core. */ | ||
838 | if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY)) | ||
839 | smp_num_siblings = 1; | ||
840 | #endif | ||
841 | } | ||
842 | |||
843 | static void __init init_intel(struct cpuinfo_x86 *c) | ||
844 | { | ||
845 | /* Cache sizes */ | ||
846 | unsigned n; | ||
847 | |||
848 | init_intel_cacheinfo(c); | ||
849 | n = c->x86_cpuid_level; | ||
850 | if (n >= 0x80000008) { | ||
851 | unsigned eax = cpuid_eax(0x80000008); | ||
852 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
853 | c->x86_phys_bits = eax & 0xff; | ||
854 | } | ||
855 | |||
856 | if (c->x86 == 15) | ||
857 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
858 | } | ||
859 | |||
860 | void __init get_cpu_vendor(struct cpuinfo_x86 *c) | ||
861 | { | ||
862 | char *v = c->x86_vendor_id; | ||
863 | |||
864 | if (!strcmp(v, "AuthenticAMD")) | ||
865 | c->x86_vendor = X86_VENDOR_AMD; | ||
866 | else if (!strcmp(v, "GenuineIntel")) | ||
867 | c->x86_vendor = X86_VENDOR_INTEL; | ||
868 | else | ||
869 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
870 | } | ||
871 | |||
872 | struct cpu_model_info { | ||
873 | int vendor; | ||
874 | int family; | ||
875 | char *model_names[16]; | ||
876 | }; | ||
877 | |||
878 | /* Do some early cpuid on the boot CPU to get some parameters that are | ||
879 | needed before check_bugs. Everything advanced is in identify_cpu | ||
880 | below. */ | ||
881 | void __init early_identify_cpu(struct cpuinfo_x86 *c) | ||
882 | { | ||
883 | u32 tfms; | ||
884 | |||
885 | c->loops_per_jiffy = loops_per_jiffy; | ||
886 | c->x86_cache_size = -1; | ||
887 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
888 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ | ||
889 | c->x86_vendor_id[0] = '\0'; /* Unset */ | ||
890 | c->x86_model_id[0] = '\0'; /* Unset */ | ||
891 | c->x86_clflush_size = 64; | ||
892 | c->x86_cache_alignment = c->x86_clflush_size; | ||
893 | c->x86_num_cores = 1; | ||
894 | c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data; | ||
895 | c->x86_cpuid_level = 0; | ||
896 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | ||
897 | |||
898 | /* Get vendor name */ | ||
899 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | ||
900 | (unsigned int *)&c->x86_vendor_id[0], | ||
901 | (unsigned int *)&c->x86_vendor_id[8], | ||
902 | (unsigned int *)&c->x86_vendor_id[4]); | ||
903 | |||
904 | get_cpu_vendor(c); | ||
905 | |||
906 | /* Initialize the standard set of capabilities */ | ||
907 | /* Note that the vendor-specific code below might override */ | ||
908 | |||
909 | /* Intel-defined flags: level 0x00000001 */ | ||
910 | if (c->cpuid_level >= 0x00000001) { | ||
911 | __u32 misc; | ||
912 | cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], | ||
913 | &c->x86_capability[0]); | ||
914 | c->x86 = (tfms >> 8) & 0xf; | ||
915 | c->x86_model = (tfms >> 4) & 0xf; | ||
916 | c->x86_mask = tfms & 0xf; | ||
917 | if (c->x86 == 0xf) { | ||
918 | c->x86 += (tfms >> 20) & 0xff; | ||
919 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | ||
920 | } | ||
921 | if (c->x86_capability[0] & (1<<19)) | ||
922 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | ||
923 | c->x86_apicid = misc >> 24; | ||
924 | } else { | ||
925 | /* Have CPUID level 0 only - unheard of */ | ||
926 | c->x86 = 4; | ||
927 | } | ||
928 | } | ||
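CPUID leaf 1 returns the family in bits 11:8 of EAX, the model in bits 7:4 and the stepping in bits 3:0, with extended family and model fields in bits 27:20 and 19:16 that early_identify_cpu() folds in only for family 0xf. A small worked decode of that layout, using an invented EAX value:

#include <stdio.h>

int main(void)
{
	/* Hypothetical CPUID.1:EAX value (not from real hardware). */
	unsigned int tfms = 0x00020f48;
	unsigned int family = (tfms >> 8) & 0xf;
	unsigned int model = (tfms >> 4) & 0xf;
	unsigned int stepping = tfms & 0xf;

	if (family == 0xf) {
		family += (tfms >> 20) & 0xff;		/* extended family */
		model += ((tfms >> 16) & 0xf) << 4;	/* extended model */
	}
	printf("family %u, model %u, stepping %u\n", family, model, stepping);
	return 0;
}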
929 | |||
930 | /* | ||
931 | * This does the hard work of actually picking apart the CPU stuff... | ||
932 | */ | ||
933 | void __init identify_cpu(struct cpuinfo_x86 *c) | ||
934 | { | ||
935 | int i; | ||
936 | u32 xlvl; | ||
937 | |||
938 | early_identify_cpu(c); | ||
939 | |||
940 | /* AMD-defined flags: level 0x80000001 */ | ||
941 | xlvl = cpuid_eax(0x80000000); | ||
942 | c->x86_cpuid_level = xlvl; | ||
943 | if ((xlvl & 0xffff0000) == 0x80000000) { | ||
944 | if (xlvl >= 0x80000001) { | ||
945 | c->x86_capability[1] = cpuid_edx(0x80000001); | ||
946 | c->x86_capability[5] = cpuid_ecx(0x80000001); | ||
947 | } | ||
948 | if (xlvl >= 0x80000004) | ||
949 | get_model_name(c); /* Default name */ | ||
950 | } | ||
951 | |||
952 | /* Transmeta-defined flags: level 0x80860001 */ | ||
953 | xlvl = cpuid_eax(0x80860000); | ||
954 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
955 | /* Don't set x86_cpuid_level here for now, to avoid confusion. */ | ||
956 | if (xlvl >= 0x80860001) | ||
957 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Vendor-specific initialization. In this section we | ||
962 | * canonicalize the feature flags, meaning if there are | ||
963 | * features a certain CPU supports which CPUID doesn't | ||
964 | * tell us, CPUID claiming incorrect flags, or other bugs, | ||
965 | * we handle them here. | ||
966 | * | ||
967 | * At the end of this section, c->x86_capability better | ||
968 | * indicate the features this CPU genuinely supports! | ||
969 | */ | ||
970 | switch (c->x86_vendor) { | ||
971 | case X86_VENDOR_AMD: | ||
972 | init_amd(c); | ||
973 | break; | ||
974 | |||
975 | case X86_VENDOR_INTEL: | ||
976 | init_intel(c); | ||
977 | break; | ||
978 | |||
979 | case X86_VENDOR_UNKNOWN: | ||
980 | default: | ||
981 | display_cacheinfo(c); | ||
982 | break; | ||
983 | } | ||
984 | |||
985 | select_idle_routine(c); | ||
986 | detect_ht(c); | ||
987 | sched_cmp_hack(c); | ||
988 | |||
989 | /* | ||
990 | * On SMP, boot_cpu_data holds the common feature set between | ||
991 | * all CPUs; so make sure that we indicate which features are | ||
992 | * common between the CPUs. The first time this routine gets | ||
993 | * executed, c == &boot_cpu_data. | ||
994 | */ | ||
995 | if (c != &boot_cpu_data) { | ||
996 | /* AND the already accumulated flags with these */ | ||
997 | for (i = 0 ; i < NCAPINTS ; i++) | ||
998 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | ||
999 | } | ||
1000 | |||
1001 | #ifdef CONFIG_X86_MCE | ||
1002 | mcheck_init(c); | ||
1003 | #endif | ||
1004 | #ifdef CONFIG_NUMA | ||
1005 | if (c != &boot_cpu_data) | ||
1006 | numa_add_cpu(c - cpu_data); | ||
1007 | #endif | ||
1008 | } | ||
1009 | |||
1010 | |||
1011 | void __init print_cpu_info(struct cpuinfo_x86 *c) | ||
1012 | { | ||
1013 | if (c->x86_model_id[0]) | ||
1014 | printk("%s", c->x86_model_id); | ||
1015 | |||
1016 | if (c->x86_mask || c->cpuid_level >= 0) | ||
1017 | printk(" stepping %02x\n", c->x86_mask); | ||
1018 | else | ||
1019 | printk("\n"); | ||
1020 | } | ||
1021 | |||
1022 | /* | ||
1023 | * Get CPU information for use by the procfs. | ||
1024 | */ | ||
1025 | |||
1026 | static int show_cpuinfo(struct seq_file *m, void *v) | ||
1027 | { | ||
1028 | struct cpuinfo_x86 *c = v; | ||
1029 | |||
1030 | /* | ||
1031 | * These flag bits must match the definitions in <asm/cpufeature.h>. | ||
1032 | * NULL means this bit is undefined or reserved; either way it doesn't | ||
1033 | * have meaning as far as Linux is concerned. Note that it's important | ||
1034 | * to realize there is a difference between this table and CPUID -- if | ||
1035 | * applications want to get the raw CPUID data, they should access | ||
1036 | * /dev/cpu/<cpu_nr>/cpuid instead. | ||
1037 | */ | ||
1038 | static char *x86_cap_flags[] = { | ||
1039 | /* Intel-defined */ | ||
1040 | "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", | ||
1041 | "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", | ||
1042 | "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", | ||
1043 | "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, | ||
1044 | |||
1045 | /* AMD-defined */ | ||
1046 | "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1047 | NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, | ||
1048 | NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, | ||
1049 | NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow", | ||
1050 | |||
1051 | /* Transmeta-defined */ | ||
1052 | "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, | ||
1053 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1054 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1055 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1056 | |||
1057 | /* Other (Linux-defined) */ | ||
1058 | "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL, | ||
1059 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1060 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1061 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1062 | |||
1063 | /* Intel-defined (#2) */ | ||
1064 | "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", | ||
1065 | "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, | ||
1066 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1067 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1068 | |||
1069 | /* AMD-defined (#2) */ | ||
1070 | "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL, | ||
1071 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1072 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
1073 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL | ||
1074 | }; | ||
1075 | static char *x86_power_flags[] = { | ||
1076 | "ts", /* temperature sensor */ | ||
1077 | "fid", /* frequency id control */ | ||
1078 | "vid", /* voltage id control */ | ||
1079 | "ttp", /* thermal trip */ | ||
1080 | "tm", | ||
1081 | "stc" | ||
1082 | }; | ||
1083 | |||
1084 | |||
1085 | #ifdef CONFIG_SMP | ||
1086 | if (!cpu_online(c-cpu_data)) | ||
1087 | return 0; | ||
1088 | #endif | ||
1089 | |||
1090 | seq_printf(m,"processor\t: %u\n" | ||
1091 | "vendor_id\t: %s\n" | ||
1092 | "cpu family\t: %d\n" | ||
1093 | "model\t\t: %d\n" | ||
1094 | "model name\t: %s\n", | ||
1095 | (unsigned)(c-cpu_data), | ||
1096 | c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", | ||
1097 | c->x86, | ||
1098 | (int)c->x86_model, | ||
1099 | c->x86_model_id[0] ? c->x86_model_id : "unknown"); | ||
1100 | |||
1101 | if (c->x86_mask || c->cpuid_level >= 0) | ||
1102 | seq_printf(m, "stepping\t: %d\n", c->x86_mask); | ||
1103 | else | ||
1104 | seq_printf(m, "stepping\t: unknown\n"); | ||
1105 | |||
1106 | if (cpu_has(c,X86_FEATURE_TSC)) { | ||
1107 | seq_printf(m, "cpu MHz\t\t: %u.%03u\n", | ||
1108 | cpu_khz / 1000, (cpu_khz % 1000)); | ||
1109 | } | ||
1110 | |||
1111 | /* Cache size */ | ||
1112 | if (c->x86_cache_size >= 0) | ||
1113 | seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); | ||
1114 | |||
1115 | #ifdef CONFIG_SMP | ||
1116 | seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]); | ||
1117 | seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings); | ||
1118 | #endif | ||
1119 | |||
1120 | seq_printf(m, | ||
1121 | "fpu\t\t: yes\n" | ||
1122 | "fpu_exception\t: yes\n" | ||
1123 | "cpuid level\t: %d\n" | ||
1124 | "wp\t\t: yes\n" | ||
1125 | "flags\t\t:", | ||
1126 | c->cpuid_level); | ||
1127 | |||
1128 | { | ||
1129 | int i; | ||
1130 | for ( i = 0 ; i < 32*NCAPINTS ; i++ ) | ||
1131 | if ( test_bit(i, &c->x86_capability) && | ||
1132 | x86_cap_flags[i] != NULL ) | ||
1133 | seq_printf(m, " %s", x86_cap_flags[i]); | ||
1134 | } | ||
1135 | |||
1136 | seq_printf(m, "\nbogomips\t: %lu.%02lu\n", | ||
1137 | c->loops_per_jiffy/(500000/HZ), | ||
1138 | (c->loops_per_jiffy/(5000/HZ)) % 100); | ||
1139 | |||
1140 | if (c->x86_tlbsize > 0) | ||
1141 | seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); | ||
1142 | seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); | ||
1143 | seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); | ||
1144 | |||
1145 | seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | ||
1146 | c->x86_phys_bits, c->x86_virt_bits); | ||
1147 | |||
1148 | seq_printf(m, "power management:"); | ||
1149 | { | ||
1150 | unsigned i; | ||
1151 | for (i = 0; i < 32; i++) | ||
1152 | if (c->x86_power & (1 << i)) { | ||
1153 | if (i < ARRAY_SIZE(x86_power_flags)) | ||
1154 | seq_printf(m, " %s", x86_power_flags[i]); | ||
1155 | else | ||
1156 | seq_printf(m, " [%d]", i); | ||
1157 | } | ||
1158 | } | ||
1159 | seq_printf(m, "\n"); | ||
1160 | |||
1161 | if (c->x86_num_cores > 1) | ||
1162 | seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); | ||
1163 | |||
1164 | seq_printf(m, "\n\n"); | ||
1165 | |||
1166 | return 0; | ||
1167 | } | ||
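The bogomips line printed by show_cpuinfo() is just loops_per_jiffy rescaled: the integer part is loops_per_jiffy / (500000 / HZ) and the two decimals come from (loops_per_jiffy / (5000 / HZ)) % 100, i.e. bogomips = loops_per_jiffy * HZ / 500000. A quick check of that arithmetic, with an assumed tick rate and calibration value:

#include <stdio.h>

#define HZ 1000				/* assumed tick rate for this example */

int main(void)
{
	unsigned long loops_per_jiffy = 2394935;	/* hypothetical calibration result */

	printf("bogomips: %lu.%02lu\n",
	       loops_per_jiffy / (500000 / HZ),
	       (loops_per_jiffy / (5000 / HZ)) % 100);
	return 0;
}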
1168 | |||
1169 | static void *c_start(struct seq_file *m, loff_t *pos) | ||
1170 | { | ||
1171 | return *pos < NR_CPUS ? cpu_data + *pos : NULL; | ||
1172 | } | ||
1173 | |||
1174 | static void *c_next(struct seq_file *m, void *v, loff_t *pos) | ||
1175 | { | ||
1176 | ++*pos; | ||
1177 | return c_start(m, pos); | ||
1178 | } | ||
1179 | |||
1180 | static void c_stop(struct seq_file *m, void *v) | ||
1181 | { | ||
1182 | } | ||
1183 | |||
1184 | struct seq_operations cpuinfo_op = { | ||
1185 | .start =c_start, | ||
1186 | .next = c_next, | ||
1187 | .stop = c_stop, | ||
1188 | .show = show_cpuinfo, | ||
1189 | }; | ||
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c new file mode 100644 index 000000000000..b5305b04bc40 --- /dev/null +++ b/arch/x86_64/kernel/setup64.c | |||
@@ -0,0 +1,292 @@ | |||
1 | /* | ||
2 | * X86-64 specific CPU setup. | ||
3 | * Copyright (C) 1995 Linus Torvalds | ||
4 | * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. | ||
5 | * See setup.c for older changelog. | ||
6 | * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ | ||
7 | */ | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/bootmem.h> | ||
14 | #include <linux/bitops.h> | ||
15 | #include <asm/pda.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/processor.h> | ||
18 | #include <asm/desc.h> | ||
19 | #include <asm/atomic.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/smp.h> | ||
22 | #include <asm/i387.h> | ||
23 | #include <asm/percpu.h> | ||
24 | #include <asm/mtrr.h> | ||
25 | #include <asm/proto.h> | ||
26 | #include <asm/mman.h> | ||
27 | #include <asm/numa.h> | ||
28 | |||
29 | char x86_boot_params[2048] __initdata = {0,}; | ||
30 | |||
31 | cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; | ||
32 | |||
33 | struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; | ||
34 | |||
35 | extern struct task_struct init_task; | ||
36 | |||
37 | extern unsigned char __per_cpu_start[], __per_cpu_end[]; | ||
38 | |||
39 | extern struct desc_ptr cpu_gdt_descr[]; | ||
40 | struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; | ||
41 | |||
42 | char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); | ||
43 | |||
44 | unsigned long __supported_pte_mask = ~0UL; | ||
45 | static int do_not_nx __initdata = 0; | ||
46 | |||
47 | /* noexec=on|off | ||
48 | Control non-executable mappings for 64-bit processes. | ||
49 | |||
50 | on Enable (default) | ||
51 | off Disable | ||
52 | */ | ||
53 | int __init nonx_setup(char *str) | ||
54 | { | ||
55 | if (!strncmp(str, "on", 2)) { | ||
56 | __supported_pte_mask |= _PAGE_NX; | ||
57 | do_not_nx = 0; | ||
58 | } else if (!strncmp(str, "off", 3)) { | ||
59 | do_not_nx = 1; | ||
60 | __supported_pte_mask &= ~_PAGE_NX; | ||
61 | } | ||
62 | return 0; | ||
63 | } | ||
64 | __setup("noexec=", nonx_setup); /* parsed early actually */ | ||
65 | |||
66 | int force_personality32 = READ_IMPLIES_EXEC; | ||
67 | |||
68 | /* noexec32=on|off | ||
69 | Control non-executable heap for 32-bit processes. | ||
70 | To control the stack too, use noexec=off | ||
71 | |||
72 | on PROT_READ does not imply PROT_EXEC for 32bit processes | ||
73 | off PROT_READ implies PROT_EXEC (default) | ||
74 | */ | ||
75 | static int __init nonx32_setup(char *str) | ||
76 | { | ||
77 | if (!strcmp(str, "on")) | ||
78 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
79 | else if (!strcmp(str, "off")) | ||
80 | force_personality32 |= READ_IMPLIES_EXEC; | ||
81 | return 0; | ||
82 | } | ||
83 | __setup("noexec32=", nonx32_setup); | ||
84 | |||
85 | /* | ||
86 | * Great future plan: | ||
87 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | ||
88 | * Always point %gs to its beginning | ||
89 | */ | ||
90 | void __init setup_per_cpu_areas(void) | ||
91 | { | ||
92 | int i; | ||
93 | unsigned long size; | ||
94 | |||
95 | /* Copy section for each CPU (we discard the original) */ | ||
96 | size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); | ||
97 | #ifdef CONFIG_MODULES | ||
98 | if (size < PERCPU_ENOUGH_ROOM) | ||
99 | size = PERCPU_ENOUGH_ROOM; | ||
100 | #endif | ||
101 | |||
102 | for (i = 0; i < NR_CPUS; i++) { | ||
103 | unsigned char *ptr; | ||
104 | |||
105 | if (!NODE_DATA(cpu_to_node(i))) { | ||
106 | printk("cpu with no node %d, num_online_nodes %d\n", | ||
107 | i, num_online_nodes()); | ||
108 | ptr = alloc_bootmem(size); | ||
109 | } else { | ||
110 | ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); | ||
111 | } | ||
112 | if (!ptr) | ||
113 | panic("Cannot allocate cpu data for CPU %d\n", i); | ||
114 | cpu_pda[i].data_offset = ptr - __per_cpu_start; | ||
115 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
116 | } | ||
117 | } | ||
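After setup_per_cpu_areas() runs, CPU i's private copy of the per-cpu data section lives at __per_cpu_start + cpu_pda[i].data_offset, so a per-cpu variable is reached by adding that per-CPU offset to its link-time address. A rough userspace sketch of the same offset bookkeeping (array names and sizes are invented):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 4

int main(void)
{
	/* Stand-ins for the __per_cpu_start..__per_cpu_end section and cpu_pda[].data_offset. */
	char section_template[64] = "per-cpu template";
	size_t size = sizeof(section_template);
	char *copies = malloc(size * NCPUS);
	intptr_t data_offset[NCPUS];
	int i;

	if (!copies)
		return 1;

	for (i = 0; i < NCPUS; i++) {
		char *ptr = copies + i * size;		/* alloc_bootmem_node() in the kernel */
		memcpy(ptr, section_template, size);	/* copy of the original section */
		data_offset[i] = (intptr_t)ptr - (intptr_t)section_template;
	}

	/* A variable at offset 8 within the section, as seen from CPU 2: */
	printf("%p\n", (void *)((intptr_t)(section_template + 8) + data_offset[2]));

	free(copies);
	return 0;
}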
118 | |||
119 | void pda_init(int cpu) | ||
120 | { | ||
121 | struct x8664_pda *pda = &cpu_pda[cpu]; | ||
122 | |||
123 | /* Set up data that may be needed in __get_free_pages early */ | ||
124 | asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); | ||
125 | wrmsrl(MSR_GS_BASE, cpu_pda + cpu); | ||
126 | |||
127 | pda->me = pda; | ||
128 | pda->cpunumber = cpu; | ||
129 | pda->irqcount = -1; | ||
130 | pda->kernelstack = | ||
131 | (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; | ||
132 | pda->active_mm = &init_mm; | ||
133 | pda->mmu_state = 0; | ||
134 | |||
135 | if (cpu == 0) { | ||
136 | /* others are initialized in smpboot.c */ | ||
137 | pda->pcurrent = &init_task; | ||
138 | pda->irqstackptr = boot_cpu_stack; | ||
139 | } else { | ||
140 | pda->irqstackptr = (char *) | ||
141 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
142 | if (!pda->irqstackptr) | ||
143 | panic("cannot allocate irqstack for cpu %d", cpu); | ||
144 | } | ||
145 | |||
146 | asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); | ||
147 | |||
148 | pda->irqstackptr += IRQSTACKSIZE-64; | ||
149 | } | ||
150 | |||
151 | char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] | ||
152 | __attribute__((section(".bss.page_aligned"))); | ||
153 | |||
154 | /* May not be marked __init: used by software suspend */ | ||
155 | void syscall_init(void) | ||
156 | { | ||
157 | /* | ||
158 | * LSTAR and STAR live in a somewhat strange symbiosis. | ||
159 | * They both write to the same internal register. STAR allows setting CS/DS, | ||
160 | * but only a 32-bit target. LSTAR sets the 64-bit rip. | ||
161 | */ | ||
162 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
163 | wrmsrl(MSR_LSTAR, system_call); | ||
164 | |||
165 | #ifdef CONFIG_IA32_EMULATION | ||
166 | syscall32_cpu_init (); | ||
167 | #endif | ||
168 | |||
169 | /* Flags to clear on syscall */ | ||
170 | wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); | ||
171 | } | ||
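The MSR_STAR value written in syscall_init() packs the base selector used on SYSRET into bits 63:48 and the kernel selector used on SYSCALL into bits 47:32. A sketch of that packing; the selector values below are illustrative only, not taken from this tree's GDT layout:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Illustrative selector values only. */
	uint64_t user32_cs = 0x23;
	uint64_t kernel_cs = 0x10;
	uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

	printf("MSR_STAR = %#018llx (sysret base %#llx, syscall base %#llx)\n",
	       (unsigned long long)star,
	       (unsigned long long)((star >> 48) & 0xffff),
	       (unsigned long long)((star >> 32) & 0xffff));
	return 0;
}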
172 | |||
173 | void __init check_efer(void) | ||
174 | { | ||
175 | unsigned long efer; | ||
176 | |||
177 | rdmsrl(MSR_EFER, efer); | ||
178 | if (!(efer & EFER_NX) || do_not_nx) { | ||
179 | __supported_pte_mask &= ~_PAGE_NX; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * cpu_init() initializes state that is per-CPU. Some data is already | ||
185 | * initialized (naturally) in the bootstrap process, such as the GDT | ||
186 | * and IDT. We reload them nevertheless, this function acts as a | ||
187 | * 'CPU state barrier', nothing should get across. | ||
188 | * A lot of state is already set up in PDA init. | ||
189 | */ | ||
190 | void __init cpu_init (void) | ||
191 | { | ||
192 | #ifdef CONFIG_SMP | ||
193 | int cpu = stack_smp_processor_id(); | ||
194 | #else | ||
195 | int cpu = smp_processor_id(); | ||
196 | #endif | ||
197 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
198 | unsigned long v; | ||
199 | char *estacks = NULL; | ||
200 | struct task_struct *me; | ||
201 | int i; | ||
202 | |||
203 | /* CPU 0 is initialised in head64.c */ | ||
204 | if (cpu != 0) { | ||
205 | pda_init(cpu); | ||
206 | } else | ||
207 | estacks = boot_exception_stacks; | ||
208 | |||
209 | me = current; | ||
210 | |||
211 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
212 | panic("CPU#%d already initialized!\n", cpu); | ||
213 | |||
214 | printk("Initializing CPU#%d\n", cpu); | ||
215 | |||
216 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
217 | |||
218 | /* | ||
219 | * Initialize the per-CPU GDT with the boot GDT, | ||
220 | * and set up the GDT descriptor: | ||
221 | */ | ||
222 | if (cpu) { | ||
223 | memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); | ||
224 | } | ||
225 | |||
226 | cpu_gdt_descr[cpu].size = GDT_SIZE; | ||
227 | cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; | ||
228 | asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); | ||
229 | asm volatile("lidt %0" :: "m" (idt_descr)); | ||
230 | |||
231 | memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); | ||
232 | |||
233 | /* | ||
234 | * Delete NT | ||
235 | */ | ||
236 | |||
237 | asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax"); | ||
238 | |||
239 | syscall_init(); | ||
240 | |||
241 | wrmsrl(MSR_FS_BASE, 0); | ||
242 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
243 | barrier(); | ||
244 | |||
245 | check_efer(); | ||
246 | |||
247 | /* | ||
248 | * set up and load the per-CPU TSS | ||
249 | */ | ||
250 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
251 | if (cpu) { | ||
252 | estacks = (char *)__get_free_pages(GFP_ATOMIC, | ||
253 | EXCEPTION_STACK_ORDER); | ||
254 | if (!estacks) | ||
255 | panic("Cannot allocate exception stack %ld %d\n", | ||
256 | v, cpu); | ||
257 | } | ||
258 | estacks += EXCEPTION_STKSZ; | ||
259 | t->ist[v] = (unsigned long)estacks; | ||
260 | } | ||
261 | |||
262 | t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
263 | /* | ||
264 | * <= is required because the CPU will access up to | ||
265 | * 8 bits beyond the end of the IO permission bitmap. | ||
266 | */ | ||
267 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
268 | t->io_bitmap[i] = ~0UL; | ||
269 | |||
270 | atomic_inc(&init_mm.mm_count); | ||
271 | me->active_mm = &init_mm; | ||
272 | if (me->mm) | ||
273 | BUG(); | ||
274 | enter_lazy_tlb(&init_mm, me); | ||
275 | |||
276 | set_tss_desc(cpu, t); | ||
277 | load_TR_desc(); | ||
278 | load_LDT(&init_mm.context); | ||
279 | |||
280 | /* | ||
281 | * Clear all 6 debug registers: | ||
282 | */ | ||
283 | |||
284 | set_debug(0UL, 0); | ||
285 | set_debug(0UL, 1); | ||
286 | set_debug(0UL, 2); | ||
287 | set_debug(0UL, 3); | ||
288 | set_debug(0UL, 6); | ||
289 | set_debug(0UL, 7); | ||
290 | |||
291 | fpu_init(); | ||
292 | } | ||
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c new file mode 100644 index 000000000000..5ace32c6ac5c --- /dev/null +++ b/arch/x86_64/kernel/signal.c | |||
@@ -0,0 +1,486 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs | ||
6 | * | ||
7 | * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson | ||
8 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | ||
9 | * 2000-2002 x86-64 support by Andi Kleen | ||
10 | * | ||
11 | * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ | ||
12 | */ | ||
13 | |||
14 | #include <linux/sched.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/smp.h> | ||
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/signal.h> | ||
20 | #include <linux/errno.h> | ||
21 | #include <linux/wait.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/unistd.h> | ||
24 | #include <linux/stddef.h> | ||
25 | #include <linux/personality.h> | ||
26 | #include <linux/compiler.h> | ||
27 | #include <asm/ucontext.h> | ||
28 | #include <asm/uaccess.h> | ||
29 | #include <asm/i387.h> | ||
30 | #include <asm/proto.h> | ||
31 | |||
32 | /* #define DEBUG_SIG 1 */ | ||
33 | |||
34 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | ||
35 | |||
36 | void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
37 | sigset_t *set, struct pt_regs * regs); | ||
38 | void ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
39 | sigset_t *set, struct pt_regs * regs); | ||
40 | |||
41 | asmlinkage long | ||
42 | sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) | ||
43 | { | ||
44 | sigset_t saveset, newset; | ||
45 | |||
46 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
47 | if (sigsetsize != sizeof(sigset_t)) | ||
48 | return -EINVAL; | ||
49 | |||
50 | if (copy_from_user(&newset, unewset, sizeof(newset))) | ||
51 | return -EFAULT; | ||
52 | sigdelsetmask(&newset, ~_BLOCKABLE); | ||
53 | |||
54 | spin_lock_irq(¤t->sighand->siglock); | ||
55 | saveset = current->blocked; | ||
56 | current->blocked = newset; | ||
57 | recalc_sigpending(); | ||
58 | spin_unlock_irq(¤t->sighand->siglock); | ||
59 | #ifdef DEBUG_SIG | ||
60 | printk("rt_sigsuspend saveset(%lx) newset(%lx) regs(%p) rip(%lx)\n", | ||
61 | saveset, newset, regs, regs->rip); | ||
62 | #endif | ||
63 | regs->rax = -EINTR; | ||
64 | while (1) { | ||
65 | current->state = TASK_INTERRUPTIBLE; | ||
66 | schedule(); | ||
67 | if (do_signal(regs, &saveset)) | ||
68 | return -EINTR; | ||
69 | } | ||
70 | } | ||
71 | |||
72 | asmlinkage long | ||
73 | sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | ||
74 | struct pt_regs *regs) | ||
75 | { | ||
76 | return do_sigaltstack(uss, uoss, regs->rsp); | ||
77 | } | ||
78 | |||
79 | |||
80 | /* | ||
81 | * Do a signal return; undo the signal stack. | ||
82 | */ | ||
83 | |||
84 | struct rt_sigframe | ||
85 | { | ||
86 | char *pretcode; | ||
87 | struct ucontext uc; | ||
88 | struct siginfo info; | ||
89 | }; | ||
90 | |||
91 | static int | ||
92 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) | ||
93 | { | ||
94 | unsigned int err = 0; | ||
95 | |||
96 | /* Always make any pending restarted system calls return -EINTR */ | ||
97 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
98 | |||
99 | #define COPY(x) err |= __get_user(regs->x, &sc->x) | ||
100 | |||
101 | COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); | ||
102 | COPY(rdx); COPY(rcx); COPY(rip); | ||
103 | COPY(r8); | ||
104 | COPY(r9); | ||
105 | COPY(r10); | ||
106 | COPY(r11); | ||
107 | COPY(r12); | ||
108 | COPY(r13); | ||
109 | COPY(r14); | ||
110 | COPY(r15); | ||
111 | |||
112 | { | ||
113 | unsigned int tmpflags; | ||
114 | err |= __get_user(tmpflags, &sc->eflags); | ||
115 | regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); | ||
116 | regs->orig_rax = -1; /* disable syscall checks */ | ||
117 | } | ||
118 | |||
119 | { | ||
120 | struct _fpstate __user * buf; | ||
121 | err |= __get_user(buf, &sc->fpstate); | ||
122 | |||
123 | if (buf) { | ||
124 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
125 | goto badframe; | ||
126 | err |= restore_i387(buf); | ||
127 | } else { | ||
128 | struct task_struct *me = current; | ||
129 | if (used_math()) { | ||
130 | clear_fpu(me); | ||
131 | clear_used_math(); | ||
132 | } | ||
133 | } | ||
134 | } | ||
135 | |||
136 | err |= __get_user(*prax, &sc->rax); | ||
137 | return err; | ||
138 | |||
139 | badframe: | ||
140 | return 1; | ||
141 | } | ||
142 | |||
143 | asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | ||
144 | { | ||
145 | struct rt_sigframe __user *frame; | ||
146 | sigset_t set; | ||
147 | unsigned long eax; | ||
148 | |||
149 | frame = (struct rt_sigframe __user *)(regs->rsp - 8); | ||
150 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { | ||
151 | goto badframe; | ||
152 | } | ||
153 | if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { | ||
154 | goto badframe; | ||
155 | } | ||
156 | |||
157 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
158 | spin_lock_irq(¤t->sighand->siglock); | ||
159 | current->blocked = set; | ||
160 | recalc_sigpending(); | ||
161 | spin_unlock_irq(¤t->sighand->siglock); | ||
162 | |||
163 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) | ||
164 | goto badframe; | ||
165 | |||
166 | #ifdef DEBUG_SIG | ||
167 | printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); | ||
168 | #endif | ||
169 | |||
170 | if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) | ||
171 | goto badframe; | ||
172 | |||
173 | return eax; | ||
174 | |||
175 | badframe: | ||
176 | signal_fault(regs,frame,"sigreturn"); | ||
177 | return 0; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Set up a signal frame. | ||
182 | */ | ||
183 | |||
184 | static inline int | ||
185 | setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) | ||
186 | { | ||
187 | int err = 0; | ||
188 | unsigned long eflags; | ||
189 | |||
190 | err |= __put_user(0, &sc->gs); | ||
191 | err |= __put_user(0, &sc->fs); | ||
192 | |||
193 | err |= __put_user(regs->rdi, &sc->rdi); | ||
194 | err |= __put_user(regs->rsi, &sc->rsi); | ||
195 | err |= __put_user(regs->rbp, &sc->rbp); | ||
196 | err |= __put_user(regs->rsp, &sc->rsp); | ||
197 | err |= __put_user(regs->rbx, &sc->rbx); | ||
198 | err |= __put_user(regs->rdx, &sc->rdx); | ||
199 | err |= __put_user(regs->rcx, &sc->rcx); | ||
200 | err |= __put_user(regs->rax, &sc->rax); | ||
201 | err |= __put_user(regs->r8, &sc->r8); | ||
202 | err |= __put_user(regs->r9, &sc->r9); | ||
203 | err |= __put_user(regs->r10, &sc->r10); | ||
204 | err |= __put_user(regs->r11, &sc->r11); | ||
205 | err |= __put_user(regs->r12, &sc->r12); | ||
206 | err |= __put_user(regs->r13, &sc->r13); | ||
207 | err |= __put_user(regs->r14, &sc->r14); | ||
208 | err |= __put_user(regs->r15, &sc->r15); | ||
209 | err |= __put_user(me->thread.trap_no, &sc->trapno); | ||
210 | err |= __put_user(me->thread.error_code, &sc->err); | ||
211 | err |= __put_user(regs->rip, &sc->rip); | ||
212 | eflags = regs->eflags; | ||
213 | if (current->ptrace & PT_PTRACED) { | ||
214 | eflags &= ~TF_MASK; | ||
215 | } | ||
216 | err |= __put_user(eflags, &sc->eflags); | ||
217 | err |= __put_user(mask, &sc->oldmask); | ||
218 | err |= __put_user(me->thread.cr2, &sc->cr2); | ||
219 | |||
220 | return err; | ||
221 | } | ||
222 | |||
223 | /* | ||
224 | * Determine which stack to use.. | ||
225 | */ | ||
226 | |||
227 | static void __user * | ||
228 | get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) | ||
229 | { | ||
230 | unsigned long rsp; | ||
231 | |||
232 | /* Default to using normal stack - redzone */ | ||
233 | rsp = regs->rsp - 128; | ||
234 | |||
235 | /* This is the X/Open sanctioned signal stack switching. */ | ||
236 | /* RED-PEN: redzone on that stack? */ | ||
237 | if (ka->sa.sa_flags & SA_ONSTACK) { | ||
238 | if (sas_ss_flags(rsp) == 0) | ||
239 | rsp = current->sas_ss_sp + current->sas_ss_size; | ||
240 | } | ||
241 | |||
242 | return (void __user *)round_down(rsp - size, 16); | ||
243 | } | ||
244 | |||
245 | static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
246 | sigset_t *set, struct pt_regs * regs) | ||
247 | { | ||
248 | struct rt_sigframe __user *frame; | ||
249 | struct _fpstate __user *fp = NULL; | ||
250 | int err = 0; | ||
251 | struct task_struct *me = current; | ||
252 | |||
253 | if (used_math()) { | ||
254 | fp = get_stack(ka, regs, sizeof(struct _fpstate)); | ||
255 | frame = (void __user *)round_down( | ||
256 | (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; | ||
257 | |||
258 | if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) | ||
259 | goto give_sigsegv; | ||
260 | |||
261 | if (save_i387(fp) < 0) | ||
262 | err |= -1; | ||
263 | } else | ||
264 | frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; | ||
265 | |||
266 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
267 | goto give_sigsegv; | ||
268 | |||
269 | if (ka->sa.sa_flags & SA_SIGINFO) { | ||
270 | err |= copy_siginfo_to_user(&frame->info, info); | ||
271 | if (err) | ||
272 | goto give_sigsegv; | ||
273 | } | ||
274 | |||
275 | /* Create the ucontext. */ | ||
276 | err |= __put_user(0, &frame->uc.uc_flags); | ||
277 | err |= __put_user(0, &frame->uc.uc_link); | ||
278 | err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | ||
279 | err |= __put_user(sas_ss_flags(regs->rsp), | ||
280 | &frame->uc.uc_stack.ss_flags); | ||
281 | err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
282 | err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); | ||
283 | err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); | ||
284 | if (sizeof(*set) == 16) { | ||
285 | __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); | ||
286 | __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); | ||
287 | } else | ||
288 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | ||
289 | |||
290 | /* Set up to return from userspace. If provided, use a stub | ||
291 | already in userspace. */ | ||
292 | /* x86-64 should always use SA_RESTORER. */ | ||
293 | if (ka->sa.sa_flags & SA_RESTORER) { | ||
294 | err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); | ||
295 | } else { | ||
296 | /* could use a vstub here */ | ||
297 | goto give_sigsegv; | ||
298 | } | ||
299 | |||
300 | if (err) | ||
301 | goto give_sigsegv; | ||
302 | |||
303 | #ifdef DEBUG_SIG | ||
304 | printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); | ||
305 | #endif | ||
306 | |||
307 | /* Set up registers for signal handler */ | ||
308 | { | ||
309 | struct exec_domain *ed = current_thread_info()->exec_domain; | ||
310 | if (unlikely(ed && ed->signal_invmap && sig < 32)) | ||
311 | sig = ed->signal_invmap[sig]; | ||
312 | } | ||
313 | regs->rdi = sig; | ||
314 | /* In case the signal handler was declared without prototypes */ | ||
315 | regs->rax = 0; | ||
316 | |||
317 | /* This also works for non SA_SIGINFO handlers because they expect the | ||
318 | next argument after the signal number on the stack. */ | ||
319 | regs->rsi = (unsigned long)&frame->info; | ||
320 | regs->rdx = (unsigned long)&frame->uc; | ||
321 | regs->rip = (unsigned long) ka->sa.sa_handler; | ||
322 | |||
323 | regs->rsp = (unsigned long)frame; | ||
324 | |||
325 | set_fs(USER_DS); | ||
326 | if (regs->eflags & TF_MASK) { | ||
327 | if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) { | ||
328 | ptrace_notify(SIGTRAP); | ||
329 | } else { | ||
330 | regs->eflags &= ~TF_MASK; | ||
331 | } | ||
332 | } | ||
333 | |||
334 | #ifdef DEBUG_SIG | ||
335 | printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", | ||
336 | current->comm, current->pid, frame, regs->rip, frame->pretcode); | ||
337 | #endif | ||
338 | |||
339 | return; | ||
340 | |||
341 | give_sigsegv: | ||
342 | force_sigsegv(sig, current); | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * OK, we're invoking a handler | ||
347 | */ | ||
348 | |||
349 | static void | ||
350 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | ||
351 | sigset_t *oldset, struct pt_regs *regs) | ||
352 | { | ||
353 | #ifdef DEBUG_SIG | ||
354 | printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", | ||
355 | current->pid, sig, | ||
356 | regs->rip, regs->rsp, regs); | ||
357 | #endif | ||
358 | |||
359 | /* Are we from a system call? */ | ||
360 | if ((long)regs->orig_rax >= 0) { | ||
361 | /* If so, check system call restarting.. */ | ||
362 | switch (regs->rax) { | ||
363 | case -ERESTART_RESTARTBLOCK: | ||
364 | case -ERESTARTNOHAND: | ||
365 | regs->rax = -EINTR; | ||
366 | break; | ||
367 | |||
368 | case -ERESTARTSYS: | ||
369 | if (!(ka->sa.sa_flags & SA_RESTART)) { | ||
370 | regs->rax = -EINTR; | ||
371 | break; | ||
372 | } | ||
373 | /* fallthrough */ | ||
374 | case -ERESTARTNOINTR: | ||
375 | regs->rax = regs->orig_rax; | ||
376 | regs->rip -= 2; | ||
377 | break; | ||
378 | } | ||
379 | } | ||
380 | |||
381 | #ifdef CONFIG_IA32_EMULATION | ||
382 | if (test_thread_flag(TIF_IA32)) { | ||
383 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
384 | ia32_setup_rt_frame(sig, ka, info, oldset, regs); | ||
385 | else | ||
386 | ia32_setup_frame(sig, ka, oldset, regs); | ||
387 | } else | ||
388 | #endif | ||
389 | setup_rt_frame(sig, ka, info, oldset, regs); | ||
390 | |||
391 | if (!(ka->sa.sa_flags & SA_NODEFER)) { | ||
392 | spin_lock_irq(¤t->sighand->siglock); | ||
393 | sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); | ||
394 | sigaddset(¤t->blocked,sig); | ||
395 | recalc_sigpending(); | ||
396 | spin_unlock_irq(¤t->sighand->siglock); | ||
397 | } | ||
398 | } | ||
399 | |||
400 | /* | ||
401 | * Note that 'init' is a special process: it doesn't get signals it doesn't | ||
402 | * want to handle. Thus you cannot kill init even with a SIGKILL even by | ||
403 | * mistake. | ||
404 | */ | ||
405 | int do_signal(struct pt_regs *regs, sigset_t *oldset) | ||
406 | { | ||
407 | struct k_sigaction ka; | ||
408 | siginfo_t info; | ||
409 | int signr; | ||
410 | |||
411 | /* | ||
412 | * We want the common case to go fast, which | ||
413 | * is why we may in certain cases get here from | ||
414 | * kernel mode. Just return without doing anything | ||
415 | * if so. | ||
416 | */ | ||
417 | if ((regs->cs & 3) != 3) | ||
418 | return 1; | ||
419 | |||
420 | if (try_to_freeze(0)) | ||
421 | goto no_signal; | ||
422 | |||
423 | if (!oldset) | ||
424 | oldset = ¤t->blocked; | ||
425 | |||
426 | signr = get_signal_to_deliver(&info, &ka, regs, NULL); | ||
427 | if (signr > 0) { | ||
428 | /* Reenable any watchpoints before delivering the | ||
429 | * signal to user space. The processor register will | ||
430 | * have been cleared if the watchpoint triggered | ||
431 | * inside the kernel. | ||
432 | */ | ||
433 | if (current->thread.debugreg7) | ||
434 | asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); | ||
435 | |||
436 | /* Whee! Actually deliver the signal. */ | ||
437 | handle_signal(signr, &info, &ka, oldset, regs); | ||
438 | return 1; | ||
439 | } | ||
440 | |||
441 | no_signal: | ||
442 | /* Did we come from a system call? */ | ||
443 | if ((long)regs->orig_rax >= 0) { | ||
444 | /* Restart the system call - no handlers present */ | ||
445 | long res = regs->rax; | ||
446 | if (res == -ERESTARTNOHAND || | ||
447 | res == -ERESTARTSYS || | ||
448 | res == -ERESTARTNOINTR) { | ||
449 | regs->rax = regs->orig_rax; | ||
450 | regs->rip -= 2; | ||
451 | } | ||
452 | if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { | ||
453 | regs->rax = __NR_restart_syscall; | ||
454 | regs->rip -= 2; | ||
455 | } | ||
456 | } | ||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) | ||
461 | { | ||
462 | #ifdef DEBUG_SIG | ||
463 | printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", | ||
464 | thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); | ||
465 | #endif | ||
466 | |||
467 | /* Pending single-step? */ | ||
468 | if (thread_info_flags & _TIF_SINGLESTEP) { | ||
469 | regs->eflags |= TF_MASK; | ||
470 | clear_thread_flag(TIF_SINGLESTEP); | ||
471 | } | ||
472 | |||
473 | /* deal with pending signal delivery */ | ||
474 | if (thread_info_flags & _TIF_SIGPENDING) | ||
475 | do_signal(regs,oldset); | ||
476 | } | ||
477 | |||
478 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | ||
479 | { | ||
480 | struct task_struct *me = current; | ||
481 | if (exception_trace) | ||
482 | printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", | ||
483 | me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); | ||
484 | |||
485 | force_sig(SIGSEGV, me); | ||
486 | } | ||
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c new file mode 100644 index 000000000000..e3cdbf9a88bd --- /dev/null +++ b/arch/x86_64/kernel/smp.c | |||
@@ -0,0 +1,415 @@ | |||
1 | /* | ||
2 | * Intel SMP support routines. | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * (c) 2002,2003 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * This code is released under the GNU General Public License version 2 or | ||
9 | * later. | ||
10 | */ | ||
11 | |||
12 | #include <linux/init.h> | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/irq.h> | ||
16 | #include <linux/delay.h> | ||
17 | #include <linux/spinlock.h> | ||
18 | #include <linux/smp_lock.h> | ||
19 | #include <linux/smp.h> | ||
20 | #include <linux/kernel_stat.h> | ||
21 | #include <linux/mc146818rtc.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | |||
24 | #include <asm/mtrr.h> | ||
25 | #include <asm/pgalloc.h> | ||
26 | #include <asm/tlbflush.h> | ||
27 | #include <asm/mach_apic.h> | ||
28 | #include <asm/mmu_context.h> | ||
29 | #include <asm/proto.h> | ||
30 | |||
31 | /* | ||
32 | * Smarter SMP flushing macros. | ||
33 | * c/o Linus Torvalds. | ||
34 | * | ||
35 | * These mean you can really definitely utterly forget about | ||
36 | * writing to user space from interrupts. (It's not allowed anyway). | ||
37 | * | ||
38 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | ||
39 | */ | ||
40 | |||
41 | static cpumask_t flush_cpumask; | ||
42 | static struct mm_struct * flush_mm; | ||
43 | static unsigned long flush_va; | ||
44 | static DEFINE_SPINLOCK(tlbstate_lock); | ||
45 | #define FLUSH_ALL -1ULL | ||
46 | |||
47 | /* | ||
48 | * We cannot call mmdrop() because we are in interrupt context, | ||
49 | * instead update mm->cpu_vm_mask. | ||
50 | */ | ||
51 | static inline void leave_mm (unsigned long cpu) | ||
52 | { | ||
53 | if (read_pda(mmu_state) == TLBSTATE_OK) | ||
54 | BUG(); | ||
55 | clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask); | ||
56 | load_cr3(swapper_pg_dir); | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * | ||
61 | * The flush IPI assumes that a thread switch happens in this order: | ||
62 | * [cpu0: the cpu that switches] | ||
63 | * 1) switch_mm() either 1a) or 1b) | ||
64 | * 1a) thread switch to a different mm | ||
65 | * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask); | ||
66 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
67 | * the other cpus, but smp_invalidate_interrupt ignores flush ipis | ||
68 | * for the wrong mm, and in the worst case we perform a superfluous | ||
69 | * tlb flush. | ||
70 | * 1a2) set cpu mmu_state to TLBSTATE_OK | ||
71 | * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | ||
72 | * was in lazy tlb mode. | ||
73 | * 1a3) update cpu active_mm | ||
74 | * Now cpu0 accepts tlb flushes for the new mm. | ||
75 | * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask); | ||
76 | * Now the other cpus will send tlb flush ipis. | ||
77 | * 1a4) change cr3. | ||
78 | * 1b) thread switch without mm change | ||
79 | * cpu active_mm is correct, cpu0 already handles | ||
80 | * flush ipis. | ||
81 | * 1b1) set cpu mmu_state to TLBSTATE_OK | ||
82 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
83 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
84 | * and test the bit. | ||
85 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
86 | * 2) switch %%esp, ie current | ||
87 | * | ||
88 | * The interrupt must handle 2 special cases: | ||
89 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
90 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
91 | * runs in kernel space, the cpu could load tlb entries for user space | ||
92 | * pages. | ||
93 | * | ||
94 | * The good news is that cpu mmu_state is local to each cpu, no | ||
95 | * write/read ordering problems. | ||
96 | */ | ||
97 | |||
98 | /* | ||
99 | * TLB flush IPI: | ||
100 | * | ||
101 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
102 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
103 | */ | ||
104 | |||
105 | asmlinkage void smp_invalidate_interrupt (void) | ||
106 | { | ||
107 | unsigned long cpu; | ||
108 | |||
109 | cpu = get_cpu(); | ||
110 | |||
111 | if (!cpu_isset(cpu, flush_cpumask)) | ||
112 | goto out; | ||
113 | /* | ||
114 | * This was a BUG() but until someone can quote me the | ||
115 | * line from the intel manual that guarantees an IPI to | ||
116 | * multiple CPUs is retried _only_ on the erroring CPUs | ||
117 | * it's staying as a return | ||
118 | * | ||
119 | * BUG(); | ||
120 | */ | ||
121 | |||
122 | if (flush_mm == read_pda(active_mm)) { | ||
123 | if (read_pda(mmu_state) == TLBSTATE_OK) { | ||
124 | if (flush_va == FLUSH_ALL) | ||
125 | local_flush_tlb(); | ||
126 | else | ||
127 | __flush_tlb_one(flush_va); | ||
128 | } else | ||
129 | leave_mm(cpu); | ||
130 | } | ||
131 | ack_APIC_irq(); | ||
132 | cpu_clear(cpu, flush_cpumask); | ||
133 | |||
134 | out: | ||
135 | put_cpu_no_resched(); | ||
136 | } | ||
137 | |||
138 | static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, | ||
139 | unsigned long va) | ||
140 | { | ||
141 | cpumask_t tmp; | ||
142 | /* | ||
143 | * A couple of (to be removed) sanity checks: | ||
144 | * | ||
145 | * - we do not send IPIs to not-yet booted CPUs. | ||
146 | * - current CPU must not be in mask | ||
147 | * - mask must exist :) | ||
148 | */ | ||
149 | BUG_ON(cpus_empty(cpumask)); | ||
150 | cpus_and(tmp, cpumask, cpu_online_map); | ||
151 | BUG_ON(!cpus_equal(tmp, cpumask)); | ||
152 | BUG_ON(cpu_isset(smp_processor_id(), cpumask)); | ||
153 | if (!mm) | ||
154 | BUG(); | ||
155 | |||
156 | /* | ||
157 | * I'm not happy about this global shared spinlock in the | ||
158 | * MM hot path, but we'll see how contended it is. | ||
159 | * Temporarily this turns IRQs off, so that lockups are | ||
160 | * detected by the NMI watchdog. | ||
161 | */ | ||
162 | spin_lock(&tlbstate_lock); | ||
163 | |||
164 | flush_mm = mm; | ||
165 | flush_va = va; | ||
166 | cpus_or(flush_cpumask, cpumask, flush_cpumask); | ||
167 | |||
168 | /* | ||
169 | * We have to send the IPI only to | ||
170 | * CPUs affected. | ||
171 | */ | ||
172 | send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); | ||
173 | |||
174 | while (!cpus_empty(flush_cpumask)) | ||
175 | mb(); /* nothing. lockup detection does not belong here */; | ||
176 | |||
177 | flush_mm = NULL; | ||
178 | flush_va = 0; | ||
179 | spin_unlock(&tlbstate_lock); | ||
180 | } | ||
181 | |||
182 | void flush_tlb_current_task(void) | ||
183 | { | ||
184 | struct mm_struct *mm = current->mm; | ||
185 | cpumask_t cpu_mask; | ||
186 | |||
187 | preempt_disable(); | ||
188 | cpu_mask = mm->cpu_vm_mask; | ||
189 | cpu_clear(smp_processor_id(), cpu_mask); | ||
190 | |||
191 | local_flush_tlb(); | ||
192 | if (!cpus_empty(cpu_mask)) | ||
193 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
194 | preempt_enable(); | ||
195 | } | ||
196 | |||
197 | void flush_tlb_mm (struct mm_struct * mm) | ||
198 | { | ||
199 | cpumask_t cpu_mask; | ||
200 | |||
201 | preempt_disable(); | ||
202 | cpu_mask = mm->cpu_vm_mask; | ||
203 | cpu_clear(smp_processor_id(), cpu_mask); | ||
204 | |||
205 | if (current->active_mm == mm) { | ||
206 | if (current->mm) | ||
207 | local_flush_tlb(); | ||
208 | else | ||
209 | leave_mm(smp_processor_id()); | ||
210 | } | ||
211 | if (!cpus_empty(cpu_mask)) | ||
212 | flush_tlb_others(cpu_mask, mm, FLUSH_ALL); | ||
213 | |||
214 | preempt_enable(); | ||
215 | } | ||
216 | |||
217 | void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) | ||
218 | { | ||
219 | struct mm_struct *mm = vma->vm_mm; | ||
220 | cpumask_t cpu_mask; | ||
221 | |||
222 | preempt_disable(); | ||
223 | cpu_mask = mm->cpu_vm_mask; | ||
224 | cpu_clear(smp_processor_id(), cpu_mask); | ||
225 | |||
226 | if (current->active_mm == mm) { | ||
227 | if(current->mm) | ||
228 | __flush_tlb_one(va); | ||
229 | else | ||
230 | leave_mm(smp_processor_id()); | ||
231 | } | ||
232 | |||
233 | if (!cpus_empty(cpu_mask)) | ||
234 | flush_tlb_others(cpu_mask, mm, va); | ||
235 | |||
236 | preempt_enable(); | ||
237 | } | ||
238 | |||
239 | static void do_flush_tlb_all(void* info) | ||
240 | { | ||
241 | unsigned long cpu = smp_processor_id(); | ||
242 | |||
243 | __flush_tlb_all(); | ||
244 | if (read_pda(mmu_state) == TLBSTATE_LAZY) | ||
245 | leave_mm(cpu); | ||
246 | } | ||
247 | |||
248 | void flush_tlb_all(void) | ||
249 | { | ||
250 | on_each_cpu(do_flush_tlb_all, NULL, 1, 1); | ||
251 | } | ||
252 | |||
253 | void smp_kdb_stop(void) | ||
254 | { | ||
255 | send_IPI_allbutself(KDB_VECTOR); | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * this function sends a 'reschedule' IPI to another CPU. | ||
260 | * it goes straight through and wastes no time serializing | ||
261 | * anything. Worst case is that we lose a reschedule ... | ||
262 | */ | ||
263 | |||
264 | void smp_send_reschedule(int cpu) | ||
265 | { | ||
266 | send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Structure and data for smp_call_function(). This is designed to minimise | ||
271 | * static memory requirements. It also looks cleaner. | ||
272 | */ | ||
273 | static DEFINE_SPINLOCK(call_lock); | ||
274 | |||
275 | struct call_data_struct { | ||
276 | void (*func) (void *info); | ||
277 | void *info; | ||
278 | atomic_t started; | ||
279 | atomic_t finished; | ||
280 | int wait; | ||
281 | }; | ||
282 | |||
283 | static struct call_data_struct * call_data; | ||
284 | |||
285 | /* | ||
286 | * this function sends a 'generic call function' IPI to all other CPUs | ||
287 | * in the system. | ||
288 | */ | ||
289 | static void __smp_call_function (void (*func) (void *info), void *info, | ||
290 | int nonatomic, int wait) | ||
291 | { | ||
292 | struct call_data_struct data; | ||
293 | int cpus = num_online_cpus()-1; | ||
294 | |||
295 | if (!cpus) | ||
296 | return; | ||
297 | |||
298 | data.func = func; | ||
299 | data.info = info; | ||
300 | atomic_set(&data.started, 0); | ||
301 | data.wait = wait; | ||
302 | if (wait) | ||
303 | atomic_set(&data.finished, 0); | ||
304 | |||
305 | call_data = &data; | ||
306 | wmb(); | ||
307 | /* Send a message to all other CPUs and wait for them to respond */ | ||
308 | send_IPI_allbutself(CALL_FUNCTION_VECTOR); | ||
309 | |||
310 | /* Wait for response */ | ||
311 | while (atomic_read(&data.started) != cpus) | ||
312 | cpu_relax(); | ||
313 | |||
314 | if (!wait) | ||
315 | return; | ||
316 | |||
317 | while (atomic_read(&data.finished) != cpus) | ||
318 | cpu_relax(); | ||
319 | } | ||
320 | |||
321 | /* | ||
322 | * smp_call_function - run a function on all other CPUs. | ||
323 | * @func: The function to run. This must be fast and non-blocking. | ||
324 | * @info: An arbitrary pointer to pass to the function. | ||
325 | * @nonatomic: currently unused. | ||
326 | * @wait: If true, wait (atomically) until function has completed on other | ||
327 | * CPUs. | ||
328 | * | ||
329 | * Returns 0 on success, else a negative status code. Does not return until | ||
330 | * remote CPUs are nearly ready to execute func, or have already executed it. | ||
331 | * | ||
332 | * You must not call this function with disabled interrupts or from a | ||
333 | * hardware interrupt handler or from a bottom half handler. | ||
334 | * Actually there are a few legal cases, like panic. | ||
335 | */ | ||
336 | int smp_call_function (void (*func) (void *info), void *info, int nonatomic, | ||
337 | int wait) | ||
338 | { | ||
339 | spin_lock(&call_lock); | ||
340 | __smp_call_function(func,info,nonatomic,wait); | ||
341 | spin_unlock(&call_lock); | ||
342 | return 0; | ||
343 | } | ||
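For illustration, a hypothetical caller of smp_call_function() that follows the rules in the comment above (a fast, non-blocking callback, not invoked with interrupts disabled or from interrupt context) could look like the sketch below; report_cpu() and greet_other_cpus() are invented names, not part of this file:

/* Hypothetical usage sketch only. */
static void report_cpu(void *info)
{
	printk(KERN_INFO "hello from CPU %d\n", smp_processor_id());
}

static void greet_other_cpus(void)
{
	smp_call_function(report_cpu, NULL, 0, 1);	/* run on all other CPUs, wait */
	report_cpu(NULL);				/* and on this one */
}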
344 | |||
345 | void smp_stop_cpu(void) | ||
346 | { | ||
347 | /* | ||
348 | * Remove this CPU: | ||
349 | */ | ||
350 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
351 | local_irq_disable(); | ||
352 | disable_local_APIC(); | ||
353 | local_irq_enable(); | ||
354 | } | ||
355 | |||
356 | static void smp_really_stop_cpu(void *dummy) | ||
357 | { | ||
358 | smp_stop_cpu(); | ||
359 | for (;;) | ||
360 | asm("hlt"); | ||
361 | } | ||
362 | |||
363 | void smp_send_stop(void) | ||
364 | { | ||
365 | int nolock = 0; | ||
366 | if (reboot_force) | ||
367 | return; | ||
368 | /* Don't deadlock on the call lock in panic */ | ||
369 | if (!spin_trylock(&call_lock)) { | ||
371 | /* ignore locking because we have panicked anyway */ | ||
371 | nolock = 1; | ||
372 | } | ||
373 | __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); | ||
374 | if (!nolock) | ||
375 | spin_unlock(&call_lock); | ||
376 | |||
377 | local_irq_disable(); | ||
378 | disable_local_APIC(); | ||
379 | local_irq_enable(); | ||
380 | } | ||
381 | |||
382 | /* | ||
383 | * Reschedule callback. Nothing to do, | ||
384 | * all the work is done automatically when | ||
385 | * we return from the interrupt. | ||
386 | */ | ||
387 | asmlinkage void smp_reschedule_interrupt(void) | ||
388 | { | ||
389 | ack_APIC_irq(); | ||
390 | } | ||
391 | |||
392 | asmlinkage void smp_call_function_interrupt(void) | ||
393 | { | ||
394 | void (*func) (void *info) = call_data->func; | ||
395 | void *info = call_data->info; | ||
396 | int wait = call_data->wait; | ||
397 | |||
398 | ack_APIC_irq(); | ||
399 | /* | ||
400 | * Notify initiating CPU that I've grabbed the data and am | ||
401 | * about to execute the function | ||
402 | */ | ||
403 | mb(); | ||
404 | atomic_inc(&call_data->started); | ||
405 | /* | ||
406 | * At this point the info structure may be out of scope unless wait==1 | ||
407 | */ | ||
408 | irq_enter(); | ||
409 | (*func)(info); | ||
410 | irq_exit(); | ||
411 | if (wait) { | ||
412 | mb(); | ||
413 | atomic_inc(&call_data->finished); | ||
414 | } | ||
415 | } | ||
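
The started/finished handshake between smp_call_function() and smp_call_function_interrupt() can be modelled in user space; a rough sketch with POSIX threads standing in for the other CPUs (all names here are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int started, finished;
static void (*call_func)(void *);
static void *call_info;

static void say_hello(void *info) { printf("hello from %s\n", (char *)info); }

/* plays the role of smp_call_function_interrupt() on a remote CPU */
static void *remote_cpu(void *arg)
{
	void (*func)(void *) = call_func;    /* grab the data ... */
	void *info = call_info;
	atomic_fetch_add(&started, 1);       /* ... and tell the initiator we have it */
	func(info);
	atomic_fetch_add(&finished, 1);      /* wait==1 path: report completion */
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];
	int i;

	call_func = say_hello;               /* stands in for "call_data = &data; wmb();" */
	call_info = "a remote cpu";
	for (i = 0; i < NCPUS; i++)          /* stands in for send_IPI_allbutself() */
		pthread_create(&t[i], NULL, remote_cpu, NULL);

	while (atomic_load(&started) != NCPUS)   /* wait for response */
		;
	while (atomic_load(&finished) != NCPUS)  /* wait for completion */
		;
	for (i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
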
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c new file mode 100644 index 000000000000..afd3600b919e --- /dev/null +++ b/arch/x86_64/kernel/smpboot.c | |||
@@ -0,0 +1,938 @@ | |||
1 | /* | ||
2 | * x86 SMP booting functions | ||
3 | * | ||
4 | * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> | ||
5 | * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
6 | * Copyright 2001 Andi Kleen, SuSE Labs. | ||
7 | * | ||
8 | * Much of the core SMP work is based on previous work by Thomas Radke, to | ||
9 | * whom a great many thanks are extended. | ||
10 | * | ||
11 | * Thanks to Intel for making available several different Pentium, | ||
12 | * Pentium Pro and Pentium-II/Xeon MP machines. | ||
13 | * Original development of Linux SMP code supported by Caldera. | ||
14 | * | ||
15 | * This code is released under the GNU General Public License version 2 or | ||
16 | * later. | ||
17 | * | ||
18 | * Fixes | ||
19 | * Felix Koop : NR_CPUS used properly | ||
20 | * Jose Renau : Handle single CPU case. | ||
21 | * Alan Cox : By repeated request 8) - Total BogoMIP report. | ||
22 | * Greg Wright : Fix for kernel stacks panic. | ||
23 | * Erich Boleyn : MP v1.4 and additional changes. | ||
24 | * Matthias Sattler : Changes for 2.1 kernel map. | ||
25 | * Michel Lespinasse : Changes for 2.1 kernel map. | ||
26 | * Michael Chastain : Change trampoline.S to gnu as. | ||
27 | * Alan Cox : Dumb bug: 'B' step PPro's are fine | ||
28 | * Ingo Molnar : Added APIC timers, based on code | ||
29 | * from Jose Renau | ||
30 | * Ingo Molnar : various cleanups and rewrites | ||
31 | * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. | ||
32 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs | ||
33 | * Andi Kleen : Changed for SMP boot into long mode. | ||
34 | * Rusty Russell : Hacked into shape for new "hotplug" boot process. | ||
35 | */ | ||
36 | |||
37 | #include <linux/config.h> | ||
38 | #include <linux/init.h> | ||
39 | |||
40 | #include <linux/mm.h> | ||
41 | #include <linux/kernel_stat.h> | ||
42 | #include <linux/smp_lock.h> | ||
43 | #include <linux/irq.h> | ||
44 | #include <linux/bootmem.h> | ||
45 | #include <linux/thread_info.h> | ||
46 | #include <linux/module.h> | ||
47 | |||
48 | #include <linux/delay.h> | ||
49 | #include <linux/mc146818rtc.h> | ||
50 | #include <asm/mtrr.h> | ||
51 | #include <asm/pgalloc.h> | ||
52 | #include <asm/desc.h> | ||
53 | #include <asm/kdebug.h> | ||
54 | #include <asm/tlbflush.h> | ||
55 | #include <asm/proto.h> | ||
56 | |||
57 | /* Number of siblings per CPU package */ | ||
58 | int smp_num_siblings = 1; | ||
59 | /* Package ID of each logical CPU */ | ||
60 | u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
61 | EXPORT_SYMBOL(phys_proc_id); | ||
62 | |||
63 | /* Bitmask of currently online CPUs */ | ||
64 | cpumask_t cpu_online_map; | ||
65 | |||
66 | cpumask_t cpu_callin_map; | ||
67 | cpumask_t cpu_callout_map; | ||
68 | static cpumask_t smp_commenced_mask; | ||
69 | |||
70 | /* Per CPU bogomips and other parameters */ | ||
71 | struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; | ||
72 | |||
73 | cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; | ||
74 | |||
75 | /* | ||
76 | * Trampoline 80x86 program as an array. | ||
77 | */ | ||
78 | |||
79 | extern unsigned char trampoline_data []; | ||
80 | extern unsigned char trampoline_end []; | ||
81 | |||
82 | /* | ||
83 | * Currently trivial. Write the real->protected mode | ||
84 | * bootstrap into the page concerned. The caller | ||
85 | * has made sure it's suitably aligned. | ||
86 | */ | ||
87 | |||
88 | static unsigned long __init setup_trampoline(void) | ||
89 | { | ||
90 | void *tramp = __va(SMP_TRAMPOLINE_BASE); | ||
91 | memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); | ||
92 | return virt_to_phys(tramp); | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * The bootstrap kernel entry code has set these up. Save them for | ||
97 | * a given CPU | ||
98 | */ | ||
99 | |||
100 | static void __init smp_store_cpu_info(int id) | ||
101 | { | ||
102 | struct cpuinfo_x86 *c = cpu_data + id; | ||
103 | |||
104 | *c = boot_cpu_data; | ||
105 | identify_cpu(c); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * TSC synchronization. | ||
110 | * | ||
111 | * We first check whether all CPUs have their TSC's synchronized, | ||
112 | * then we print a warning if not, and always resync. | ||
113 | */ | ||
114 | |||
115 | static atomic_t tsc_start_flag = ATOMIC_INIT(0); | ||
116 | static atomic_t tsc_count_start = ATOMIC_INIT(0); | ||
117 | static atomic_t tsc_count_stop = ATOMIC_INIT(0); | ||
118 | static unsigned long long tsc_values[NR_CPUS]; | ||
119 | |||
120 | #define NR_LOOPS 5 | ||
121 | |||
122 | extern unsigned int fast_gettimeoffset_quotient; | ||
123 | |||
124 | static void __init synchronize_tsc_bp (void) | ||
125 | { | ||
126 | int i; | ||
127 | unsigned long long t0; | ||
128 | unsigned long long sum, avg; | ||
129 | long long delta; | ||
130 | long one_usec; | ||
131 | int buggy = 0; | ||
132 | |||
133 | printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus()); | ||
134 | |||
135 | one_usec = cpu_khz; | ||
136 | |||
137 | atomic_set(&tsc_start_flag, 1); | ||
138 | wmb(); | ||
139 | |||
140 | /* | ||
141 | * We loop a few times to get a primed instruction cache, | ||
142 | * then the last pass is more or less synchronized and | ||
143 | * the BP and APs set their cycle counters to zero all at | ||
144 | * once. This reduces the chance of having random offsets | ||
145 | * between the processors, and guarantees that the maximum | ||
146 | * delay between the cycle counters is never bigger than | ||
147 | * the latency of information-passing (cachelines) between | ||
148 | * two CPUs. | ||
149 | */ | ||
150 | for (i = 0; i < NR_LOOPS; i++) { | ||
151 | /* | ||
152 | * all APs synchronize but they loop on '== num_cpus' | ||
153 | */ | ||
154 | while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb(); | ||
155 | atomic_set(&tsc_count_stop, 0); | ||
156 | wmb(); | ||
157 | /* | ||
158 | * this lets the APs save their current TSC: | ||
159 | */ | ||
160 | atomic_inc(&tsc_count_start); | ||
161 | |||
162 | sync_core(); | ||
163 | rdtscll(tsc_values[smp_processor_id()]); | ||
164 | /* | ||
165 | * We clear the TSC in the last loop: | ||
166 | */ | ||
167 | if (i == NR_LOOPS-1) | ||
168 | write_tsc(0, 0); | ||
169 | |||
170 | /* | ||
171 | * Wait for all APs to leave the synchronization point: | ||
172 | */ | ||
173 | while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb(); | ||
174 | atomic_set(&tsc_count_start, 0); | ||
175 | wmb(); | ||
176 | atomic_inc(&tsc_count_stop); | ||
177 | } | ||
178 | |||
179 | sum = 0; | ||
180 | for (i = 0; i < NR_CPUS; i++) { | ||
181 | if (cpu_isset(i, cpu_callout_map)) { | ||
182 | t0 = tsc_values[i]; | ||
183 | sum += t0; | ||
184 | } | ||
185 | } | ||
186 | avg = sum / num_booting_cpus(); | ||
187 | |||
188 | sum = 0; | ||
189 | for (i = 0; i < NR_CPUS; i++) { | ||
190 | if (!cpu_isset(i, cpu_callout_map)) | ||
191 | continue; | ||
192 | |||
193 | delta = tsc_values[i] - avg; | ||
194 | if (delta < 0) | ||
195 | delta = -delta; | ||
196 | /* | ||
197 | * We report bigger than 2 microseconds clock differences. | ||
198 | */ | ||
199 | if (delta > 2*one_usec) { | ||
200 | long realdelta; | ||
201 | if (!buggy) { | ||
202 | buggy = 1; | ||
203 | printk("\n"); | ||
204 | } | ||
205 | realdelta = delta / one_usec; | ||
206 | if (tsc_values[i] < avg) | ||
207 | realdelta = -realdelta; | ||
208 | |||
209 | printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", | ||
210 | i, realdelta); | ||
211 | } | ||
212 | |||
213 | sum += delta; | ||
214 | } | ||
215 | if (!buggy) | ||
216 | printk("passed.\n"); | ||
217 | } | ||
218 | |||
219 | static void __init synchronize_tsc_ap (void) | ||
220 | { | ||
221 | int i; | ||
222 | |||
223 | /* | ||
224 | * Not every cpu is online at the time | ||
225 | * this gets called, so we first wait for the BP to | ||
226 | * finish SMP initialization: | ||
227 | */ | ||
228 | while (!atomic_read(&tsc_start_flag)) mb(); | ||
229 | |||
230 | for (i = 0; i < NR_LOOPS; i++) { | ||
231 | atomic_inc(&tsc_count_start); | ||
232 | while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb(); | ||
233 | |||
234 | sync_core(); | ||
235 | rdtscll(tsc_values[smp_processor_id()]); | ||
236 | if (i == NR_LOOPS-1) | ||
237 | write_tsc(0, 0); | ||
238 | |||
239 | atomic_inc(&tsc_count_stop); | ||
240 | while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); | ||
241 | } | ||
242 | } | ||
243 | #undef NR_LOOPS | ||
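
The skew check in synchronize_tsc_bp() reduces to an average and a per-CPU delta; a stand-alone sketch with made-up TSC samples and an assumed 2.4 GHz clock:

#include <stdio.h>

int main(void)
{
	/* hypothetical TSC samples collected by four CPUs at "the same" instant */
	unsigned long long tsc[] = { 4800000000ULL, 4800002400ULL,
				     4799998800ULL, 4800009600ULL };
	int ncpus = sizeof(tsc) / sizeof(tsc[0]);
	long one_usec = 2400;            /* cycles per microsecond at 2.4 GHz */
	unsigned long long sum = 0, avg;
	int i, buggy = 0;

	for (i = 0; i < ncpus; i++)
		sum += tsc[i];
	avg = sum / ncpus;

	for (i = 0; i < ncpus; i++) {
		long long delta = (long long)tsc[i] - (long long)avg;
		if (delta < 0)
			delta = -delta;
		if (delta > 2 * one_usec) {          /* same 2 usec threshold as above */
			buggy = 1;
			printf("CPU#%d has %lld usecs TSC skew\n", i, delta / one_usec);
		}
	}
	if (!buggy)
		printf("passed.\n");
	return 0;
}
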
244 | |||
245 | static atomic_t init_deasserted; | ||
246 | |||
247 | static void __init smp_callin(void) | ||
248 | { | ||
249 | int cpuid, phys_id; | ||
250 | unsigned long timeout; | ||
251 | |||
252 | /* | ||
253 | * If woken up by an INIT in an 82489DX configuration | ||
254 | * we may get here before an INIT-deassert IPI reaches | ||
255 | * our local APIC. We have to wait for the IPI or we'll | ||
256 | * lock up on an APIC access. | ||
257 | */ | ||
258 | while (!atomic_read(&init_deasserted)); | ||
259 | |||
260 | /* | ||
261 | * (This works even if the APIC is not enabled.) | ||
262 | */ | ||
263 | phys_id = GET_APIC_ID(apic_read(APIC_ID)); | ||
264 | cpuid = smp_processor_id(); | ||
265 | if (cpu_isset(cpuid, cpu_callin_map)) { | ||
266 | panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", | ||
267 | phys_id, cpuid); | ||
268 | } | ||
269 | Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); | ||
270 | |||
271 | /* | ||
272 | * STARTUP IPIs are fragile beasts as they might sometimes | ||
273 | * trigger some glue motherboard logic. We keep the APIC bus | ||
274 | * completely silent for 1 second; this overestimates the time | ||
275 | * the boot CPU spends sending the up to 2 STARTUP IPIs | ||
276 | * by a factor of two, which should be enough. | ||
277 | */ | ||
278 | |||
279 | /* | ||
280 | * Waiting 2s total for startup (udelay is not yet working) | ||
281 | */ | ||
282 | timeout = jiffies + 2*HZ; | ||
283 | while (time_before(jiffies, timeout)) { | ||
284 | /* | ||
285 | * Has the boot CPU finished its STARTUP sequence? | ||
286 | */ | ||
287 | if (cpu_isset(cpuid, cpu_callout_map)) | ||
288 | break; | ||
289 | rep_nop(); | ||
290 | } | ||
291 | |||
292 | if (!time_before(jiffies, timeout)) { | ||
293 | panic("smp_callin: CPU%d started up but did not get a callout!\n", | ||
294 | cpuid); | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * the boot CPU has finished the init stage and is spinning | ||
299 | * on callin_map until we finish. We are free to set up this | ||
300 | * CPU, first the APIC. (this is probably redundant on most | ||
301 | * boards) | ||
302 | */ | ||
303 | |||
304 | Dprintk("CALLIN, before setup_local_APIC().\n"); | ||
305 | setup_local_APIC(); | ||
306 | |||
307 | local_irq_enable(); | ||
308 | |||
309 | /* | ||
310 | * Get our bogomips. | ||
311 | */ | ||
312 | calibrate_delay(); | ||
313 | Dprintk("Stack at about %p\n",&cpuid); | ||
314 | |||
315 | disable_APIC_timer(); | ||
316 | |||
317 | /* | ||
318 | * Save our processor parameters | ||
319 | */ | ||
320 | smp_store_cpu_info(cpuid); | ||
321 | |||
322 | local_irq_disable(); | ||
323 | |||
324 | /* | ||
325 | * Allow the master to continue. | ||
326 | */ | ||
327 | cpu_set(cpuid, cpu_callin_map); | ||
328 | |||
329 | /* | ||
330 | * Synchronize the TSC with the BP | ||
331 | */ | ||
332 | if (cpu_has_tsc) | ||
333 | synchronize_tsc_ap(); | ||
334 | } | ||
335 | |||
336 | static int cpucount; | ||
337 | |||
338 | /* | ||
339 | * Activate a secondary processor. | ||
340 | */ | ||
341 | void __init start_secondary(void) | ||
342 | { | ||
343 | /* | ||
344 | * Don't put anything before smp_callin(); SMP | ||
345 | * booting is so fragile that we want to limit the | ||
346 | * work done here to the bare minimum. | ||
347 | */ | ||
348 | cpu_init(); | ||
349 | smp_callin(); | ||
350 | |||
351 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ | ||
352 | barrier(); | ||
353 | |||
354 | Dprintk("cpu %d: waiting for commence\n", smp_processor_id()); | ||
355 | while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) | ||
356 | rep_nop(); | ||
357 | |||
358 | Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); | ||
359 | setup_secondary_APIC_clock(); | ||
360 | |||
361 | Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); | ||
362 | |||
363 | if (nmi_watchdog == NMI_IO_APIC) { | ||
364 | disable_8259A_irq(0); | ||
365 | enable_NMI_through_LVT0(NULL); | ||
366 | enable_8259A_irq(0); | ||
367 | } | ||
368 | |||
369 | |||
370 | enable_APIC_timer(); | ||
371 | |||
372 | /* | ||
373 | * low-memory mappings have been cleared, flush them from | ||
374 | * the local TLBs too. | ||
375 | */ | ||
376 | local_flush_tlb(); | ||
377 | |||
378 | Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id()); | ||
379 | cpu_set(smp_processor_id(), cpu_online_map); | ||
380 | wmb(); | ||
381 | |||
382 | cpu_idle(); | ||
383 | } | ||
384 | |||
385 | extern volatile unsigned long init_rsp; | ||
386 | extern void (*initial_code)(void); | ||
387 | |||
388 | #if APIC_DEBUG | ||
389 | static inline void inquire_remote_apic(int apicid) | ||
390 | { | ||
391 | unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; | ||
392 | char *names[] = { "ID", "VERSION", "SPIV" }; | ||
393 | int timeout, status; | ||
394 | |||
395 | printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); | ||
396 | |||
397 | for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { | ||
398 | printk("... APIC #%d %s: ", apicid, names[i]); | ||
399 | |||
400 | /* | ||
401 | * Wait for idle. | ||
402 | */ | ||
403 | apic_wait_icr_idle(); | ||
404 | |||
405 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); | ||
406 | apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); | ||
407 | |||
408 | timeout = 0; | ||
409 | do { | ||
410 | udelay(100); | ||
411 | status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; | ||
412 | } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); | ||
413 | |||
414 | switch (status) { | ||
415 | case APIC_ICR_RR_VALID: | ||
416 | status = apic_read(APIC_RRR); | ||
417 | printk("%08x\n", status); | ||
418 | break; | ||
419 | default: | ||
420 | printk("failed\n"); | ||
421 | } | ||
422 | } | ||
423 | } | ||
424 | #endif | ||
425 | |||
426 | static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) | ||
427 | { | ||
428 | unsigned long send_status = 0, accept_status = 0; | ||
429 | int maxlvt, timeout, num_starts, j; | ||
430 | |||
431 | Dprintk("Asserting INIT.\n"); | ||
432 | |||
433 | /* | ||
434 | * Turn INIT on target chip | ||
435 | */ | ||
436 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
437 | |||
438 | /* | ||
439 | * Send IPI | ||
440 | */ | ||
441 | apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | ||
442 | | APIC_DM_INIT); | ||
443 | |||
444 | Dprintk("Waiting for send to finish...\n"); | ||
445 | timeout = 0; | ||
446 | do { | ||
447 | Dprintk("+"); | ||
448 | udelay(100); | ||
449 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
450 | } while (send_status && (timeout++ < 1000)); | ||
451 | |||
452 | mdelay(10); | ||
453 | |||
454 | Dprintk("Deasserting INIT.\n"); | ||
455 | |||
456 | /* Target chip */ | ||
457 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
458 | |||
459 | /* Send IPI */ | ||
460 | apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); | ||
461 | |||
462 | Dprintk("Waiting for send to finish...\n"); | ||
463 | timeout = 0; | ||
464 | do { | ||
465 | Dprintk("+"); | ||
466 | udelay(100); | ||
467 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
468 | } while (send_status && (timeout++ < 1000)); | ||
469 | |||
470 | atomic_set(&init_deasserted, 1); | ||
471 | |||
472 | /* | ||
473 | * Should we send STARTUP IPIs ? | ||
474 | * | ||
475 | * Determine this based on the APIC version. | ||
476 | * If we don't have an integrated APIC, don't send the STARTUP IPIs. | ||
477 | */ | ||
478 | if (APIC_INTEGRATED(apic_version[phys_apicid])) | ||
479 | num_starts = 2; | ||
480 | else | ||
481 | num_starts = 0; | ||
482 | |||
483 | /* | ||
484 | * Run STARTUP IPI loop. | ||
485 | */ | ||
486 | Dprintk("#startup loops: %d.\n", num_starts); | ||
487 | |||
488 | maxlvt = get_maxlvt(); | ||
489 | |||
490 | for (j = 1; j <= num_starts; j++) { | ||
491 | Dprintk("Sending STARTUP #%d.\n",j); | ||
492 | apic_read_around(APIC_SPIV); | ||
493 | apic_write(APIC_ESR, 0); | ||
494 | apic_read(APIC_ESR); | ||
495 | Dprintk("After apic_write.\n"); | ||
496 | |||
497 | /* | ||
498 | * STARTUP IPI | ||
499 | */ | ||
500 | |||
501 | /* Target chip */ | ||
502 | apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
503 | |||
504 | /* Boot on the stack */ | ||
505 | /* Kick the second */ | ||
506 | apic_write_around(APIC_ICR, APIC_DM_STARTUP | ||
507 | | (start_rip >> 12)); | ||
508 | |||
509 | /* | ||
510 | * Give the other CPU some time to accept the IPI. | ||
511 | */ | ||
512 | udelay(300); | ||
513 | |||
514 | Dprintk("Startup point 1.\n"); | ||
515 | |||
516 | Dprintk("Waiting for send to finish...\n"); | ||
517 | timeout = 0; | ||
518 | do { | ||
519 | Dprintk("+"); | ||
520 | udelay(100); | ||
521 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
522 | } while (send_status && (timeout++ < 1000)); | ||
523 | |||
524 | /* | ||
525 | * Give the other CPU some time to accept the IPI. | ||
526 | */ | ||
527 | udelay(200); | ||
528 | /* | ||
529 | * Due to the Pentium erratum 3AP. | ||
530 | */ | ||
531 | if (maxlvt > 3) { | ||
532 | apic_read_around(APIC_SPIV); | ||
533 | apic_write(APIC_ESR, 0); | ||
534 | } | ||
535 | accept_status = (apic_read(APIC_ESR) & 0xEF); | ||
536 | if (send_status || accept_status) | ||
537 | break; | ||
538 | } | ||
539 | Dprintk("After Startup.\n"); | ||
540 | |||
541 | if (send_status) | ||
542 | printk(KERN_ERR "APIC never delivered???\n"); | ||
543 | if (accept_status) | ||
544 | printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); | ||
545 | |||
546 | return (send_status | accept_status); | ||
547 | } | ||
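
The `start_rip >> 12` used for the STARTUP IPI above encodes the trampoline's page number: the AP comes up in real mode at vector * 0x1000. A small sketch of that encoding (the trampoline address below is illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long start_rip = 0x6000;          /* hypothetical trampoline base */
	unsigned int vector = start_rip >> 12;     /* page number, here 0x06 */

	/* The AP starts at CS:IP = (vector << 8):0000, i.e. physical vector * 4096 */
	printf("STARTUP vector 0x%02x -> AP entry at 0x%lx\n",
	       vector, (unsigned long)vector << 12);
	return 0;
}
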
548 | |||
549 | static void __init do_boot_cpu (int apicid) | ||
550 | { | ||
551 | struct task_struct *idle; | ||
552 | unsigned long boot_error; | ||
553 | int timeout, cpu; | ||
554 | unsigned long start_rip; | ||
555 | |||
556 | cpu = ++cpucount; | ||
557 | /* | ||
558 | * We can't use kernel_thread since we must avoid | ||
559 | * rescheduling the child. | ||
560 | */ | ||
561 | idle = fork_idle(cpu); | ||
562 | if (IS_ERR(idle)) | ||
563 | panic("failed fork for CPU %d", cpu); | ||
564 | x86_cpu_to_apicid[cpu] = apicid; | ||
565 | |||
566 | cpu_pda[cpu].pcurrent = idle; | ||
567 | |||
568 | start_rip = setup_trampoline(); | ||
569 | |||
570 | init_rsp = idle->thread.rsp; | ||
571 | per_cpu(init_tss,cpu).rsp0 = init_rsp; | ||
572 | initial_code = start_secondary; | ||
573 | clear_ti_thread_flag(idle->thread_info, TIF_FORK); | ||
574 | |||
575 | printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, | ||
576 | start_rip, init_rsp); | ||
577 | |||
578 | /* | ||
579 | * This grunge runs the startup process for | ||
580 | * the targeted processor. | ||
581 | */ | ||
582 | |||
583 | atomic_set(&init_deasserted, 0); | ||
584 | |||
585 | Dprintk("Setting warm reset code and vector.\n"); | ||
586 | |||
587 | CMOS_WRITE(0xa, 0xf); | ||
588 | local_flush_tlb(); | ||
589 | Dprintk("1.\n"); | ||
590 | *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; | ||
591 | Dprintk("2.\n"); | ||
592 | *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; | ||
593 | Dprintk("3.\n"); | ||
594 | |||
595 | /* | ||
596 | * Be paranoid about clearing APIC errors. | ||
597 | */ | ||
598 | if (APIC_INTEGRATED(apic_version[apicid])) { | ||
599 | apic_read_around(APIC_SPIV); | ||
600 | apic_write(APIC_ESR, 0); | ||
601 | apic_read(APIC_ESR); | ||
602 | } | ||
603 | |||
604 | /* | ||
605 | * Status is now clean | ||
606 | */ | ||
607 | boot_error = 0; | ||
608 | |||
609 | /* | ||
610 | * Starting actual IPI sequence... | ||
611 | */ | ||
612 | boot_error = wakeup_secondary_via_INIT(apicid, start_rip); | ||
613 | |||
614 | if (!boot_error) { | ||
615 | /* | ||
616 | * allow APs to start initializing. | ||
617 | */ | ||
618 | Dprintk("Before Callout %d.\n", cpu); | ||
619 | cpu_set(cpu, cpu_callout_map); | ||
620 | Dprintk("After Callout %d.\n", cpu); | ||
621 | |||
622 | /* | ||
623 | * Wait 5s total for a response | ||
624 | */ | ||
625 | for (timeout = 0; timeout < 50000; timeout++) { | ||
626 | if (cpu_isset(cpu, cpu_callin_map)) | ||
627 | break; /* It has booted */ | ||
628 | udelay(100); | ||
629 | } | ||
630 | |||
631 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
632 | /* number CPUs logically, starting from 1 (BSP is 0) */ | ||
633 | Dprintk("OK.\n"); | ||
634 | print_cpu_info(&cpu_data[cpu]); | ||
635 | Dprintk("CPU has booted.\n"); | ||
636 | } else { | ||
637 | boot_error = 1; | ||
638 | if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) | ||
639 | == 0xA5) | ||
640 | /* trampoline started but...? */ | ||
641 | printk("Stuck ??\n"); | ||
642 | else | ||
643 | /* trampoline code not run */ | ||
644 | printk("Not responding.\n"); | ||
645 | #if APIC_DEBUG | ||
646 | inquire_remote_apic(apicid); | ||
647 | #endif | ||
648 | } | ||
649 | } | ||
650 | if (boot_error) { | ||
651 | cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ | ||
652 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
653 | cpucount--; | ||
654 | x86_cpu_to_apicid[cpu] = BAD_APICID; | ||
655 | x86_cpu_to_log_apicid[cpu] = BAD_APICID; | ||
656 | } | ||
657 | } | ||
658 | |||
659 | static void smp_tune_scheduling (void) | ||
660 | { | ||
661 | int cachesize; /* kB */ | ||
662 | unsigned long bandwidth = 1000; /* MB/s */ | ||
663 | /* | ||
664 | * Rough estimate for SMP scheduling: this is the number of | ||
665 | * cycles it takes for a fully memory-limited process to flush | ||
666 | * the SMP-local cache. | ||
667 | * | ||
668 | * (For a P5 this pretty much means we will choose another idle | ||
669 | * CPU almost always at wakeup time (this is due to the small | ||
670 | * L1 cache), on PIIs it's around 50-100 usecs, depending on | ||
671 | * the cache size) | ||
672 | */ | ||
673 | |||
674 | if (!cpu_khz) { | ||
675 | return; | ||
676 | } else { | ||
677 | cachesize = boot_cpu_data.x86_cache_size; | ||
678 | if (cachesize == -1) { | ||
679 | cachesize = 16; /* Pentiums, 2x8kB cache */ | ||
680 | bandwidth = 100; | ||
681 | } | ||
682 | } | ||
683 | } | ||
684 | |||
685 | /* | ||
686 | * Cycle through the processors sending APIC IPIs to boot each. | ||
687 | */ | ||
688 | |||
689 | static void __init smp_boot_cpus(unsigned int max_cpus) | ||
690 | { | ||
691 | unsigned apicid, cpu, bit, kicked; | ||
692 | |||
693 | nmi_watchdog_default(); | ||
694 | |||
695 | /* | ||
696 | * Setup boot CPU information | ||
697 | */ | ||
698 | smp_store_cpu_info(0); /* Final full version of the data */ | ||
699 | printk(KERN_INFO "CPU%d: ", 0); | ||
700 | print_cpu_info(&cpu_data[0]); | ||
701 | |||
702 | current_thread_info()->cpu = 0; | ||
703 | smp_tune_scheduling(); | ||
704 | |||
705 | if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { | ||
706 | printk("weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
707 | hard_smp_processor_id()); | ||
708 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
709 | } | ||
710 | |||
711 | /* | ||
712 | * If we couldn't find an SMP configuration at boot time, | ||
713 | * get out of here now! | ||
714 | */ | ||
715 | if (!smp_found_config) { | ||
716 | printk(KERN_NOTICE "SMP motherboard not detected.\n"); | ||
717 | io_apic_irqs = 0; | ||
718 | cpu_online_map = cpumask_of_cpu(0); | ||
719 | cpu_set(0, cpu_sibling_map[0]); | ||
720 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
721 | if (APIC_init_uniprocessor()) | ||
722 | printk(KERN_NOTICE "Local APIC not detected." | ||
723 | " Using dummy APIC emulation.\n"); | ||
724 | goto smp_done; | ||
725 | } | ||
726 | |||
727 | /* | ||
728 | * Should not be necessary because the MP table should list the boot | ||
729 | * CPU too, but we do it for the sake of robustness anyway. | ||
730 | */ | ||
731 | if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { | ||
732 | printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", | ||
733 | boot_cpu_id); | ||
734 | physid_set(hard_smp_processor_id(), phys_cpu_present_map); | ||
735 | } | ||
736 | |||
737 | /* | ||
738 | * If we couldn't find a local APIC, then get out of here now! | ||
739 | */ | ||
740 | if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { | ||
741 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | ||
742 | boot_cpu_id); | ||
743 | printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); | ||
744 | io_apic_irqs = 0; | ||
745 | cpu_online_map = cpumask_of_cpu(0); | ||
746 | cpu_set(0, cpu_sibling_map[0]); | ||
747 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
748 | disable_apic = 1; | ||
749 | goto smp_done; | ||
750 | } | ||
751 | |||
752 | verify_local_APIC(); | ||
753 | |||
754 | /* | ||
755 | * If SMP should be disabled, then really disable it! | ||
756 | */ | ||
757 | if (!max_cpus) { | ||
758 | smp_found_config = 0; | ||
759 | printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); | ||
760 | io_apic_irqs = 0; | ||
761 | cpu_online_map = cpumask_of_cpu(0); | ||
762 | cpu_set(0, cpu_sibling_map[0]); | ||
763 | phys_cpu_present_map = physid_mask_of_physid(0); | ||
764 | disable_apic = 1; | ||
765 | goto smp_done; | ||
766 | } | ||
767 | |||
768 | connect_bsp_APIC(); | ||
769 | setup_local_APIC(); | ||
770 | |||
771 | if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) | ||
772 | BUG(); | ||
773 | |||
774 | x86_cpu_to_apicid[0] = boot_cpu_id; | ||
775 | |||
776 | /* | ||
777 | * Now scan the CPU present map and fire up the other CPUs. | ||
778 | */ | ||
779 | Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); | ||
780 | |||
781 | kicked = 1; | ||
782 | for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { | ||
783 | apicid = cpu_present_to_apicid(bit); | ||
784 | /* | ||
785 | * Don't even attempt to start the boot CPU! | ||
786 | */ | ||
787 | if (apicid == boot_cpu_id || (apicid == BAD_APICID)) | ||
788 | continue; | ||
789 | |||
790 | if (!physid_isset(apicid, phys_cpu_present_map)) | ||
791 | continue; | ||
792 | if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) | ||
793 | continue; | ||
794 | |||
795 | do_boot_cpu(apicid); | ||
796 | ++kicked; | ||
797 | } | ||
798 | |||
799 | /* | ||
800 | * Cleanup possible dangling ends... | ||
801 | */ | ||
802 | { | ||
803 | /* | ||
804 | * Install writable page 0 entry to set BIOS data area. | ||
805 | */ | ||
806 | local_flush_tlb(); | ||
807 | |||
808 | /* | ||
809 | * Paranoid: Set warm reset code and vector here back | ||
810 | * to default values. | ||
811 | */ | ||
812 | CMOS_WRITE(0, 0xf); | ||
813 | |||
814 | *((volatile int *) phys_to_virt(0x467)) = 0; | ||
815 | } | ||
816 | |||
817 | /* | ||
818 | * Allow the user to impress friends. | ||
819 | */ | ||
820 | |||
821 | Dprintk("Before bogomips.\n"); | ||
822 | if (!cpucount) { | ||
823 | printk(KERN_INFO "Only one processor found.\n"); | ||
824 | } else { | ||
825 | unsigned long bogosum = 0; | ||
826 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
827 | if (cpu_isset(cpu, cpu_callout_map)) | ||
828 | bogosum += cpu_data[cpu].loops_per_jiffy; | ||
829 | printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", | ||
830 | cpucount+1, | ||
831 | bogosum/(500000/HZ), | ||
832 | (bogosum/(5000/HZ))%100); | ||
833 | Dprintk("Before bogocount - setting activated=1.\n"); | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Construct cpu_sibling_map[], so that we can tell the | ||
838 | * sibling CPU efficiently. | ||
839 | */ | ||
840 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
841 | cpus_clear(cpu_sibling_map[cpu]); | ||
842 | |||
843 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
844 | int siblings = 0; | ||
845 | int i; | ||
846 | if (!cpu_isset(cpu, cpu_callout_map)) | ||
847 | continue; | ||
848 | |||
849 | if (smp_num_siblings > 1) { | ||
850 | for (i = 0; i < NR_CPUS; i++) { | ||
851 | if (!cpu_isset(i, cpu_callout_map)) | ||
852 | continue; | ||
853 | if (phys_proc_id[cpu] == phys_proc_id[i]) { | ||
854 | siblings++; | ||
855 | cpu_set(i, cpu_sibling_map[cpu]); | ||
856 | } | ||
857 | } | ||
858 | } else { | ||
859 | siblings++; | ||
860 | cpu_set(cpu, cpu_sibling_map[cpu]); | ||
861 | } | ||
862 | |||
863 | if (siblings != smp_num_siblings) { | ||
864 | printk(KERN_WARNING | ||
865 | "WARNING: %d siblings found for CPU%d, should be %d\n", | ||
866 | siblings, cpu, smp_num_siblings); | ||
867 | smp_num_siblings = siblings; | ||
868 | } | ||
869 | } | ||
870 | |||
871 | Dprintk("Boot done.\n"); | ||
872 | |||
873 | /* | ||
874 | * Here we can be sure that there is an IO-APIC in the system. Let's | ||
875 | * go and set it up: | ||
876 | */ | ||
877 | if (!skip_ioapic_setup && nr_ioapics) | ||
878 | setup_IO_APIC(); | ||
879 | else | ||
880 | nr_ioapics = 0; | ||
881 | |||
882 | setup_boot_APIC_clock(); | ||
883 | |||
884 | /* | ||
885 | * Synchronize the TSC with the AP | ||
886 | */ | ||
887 | if (cpu_has_tsc && cpucount) | ||
888 | synchronize_tsc_bp(); | ||
889 | |||
890 | smp_done: | ||
891 | time_init_smp(); | ||
892 | } | ||
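
The sibling-map pass near the end of smp_boot_cpus() simply groups CPUs that report the same physical package id; the same grouping as a stand-alone sketch over a made-up topology:

#include <stdio.h>

#define NCPUS 4

int main(void)
{
	/* hypothetical package id per logical CPU: two packages, two siblings each */
	int phys_proc_id[NCPUS] = { 0, 0, 1, 1 };
	unsigned int sibling_map[NCPUS] = { 0 };
	int cpu, i;

	for (cpu = 0; cpu < NCPUS; cpu++)
		for (i = 0; i < NCPUS; i++)
			if (phys_proc_id[cpu] == phys_proc_id[i])
				sibling_map[cpu] |= 1u << i;   /* cpu_set(i, cpu_sibling_map[cpu]) */

	for (cpu = 0; cpu < NCPUS; cpu++)
		printf("cpu%d siblings: 0x%x\n", cpu, sibling_map[cpu]);
	return 0;
}
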
893 | |||
894 | /* These are wrappers to interface to the new boot process. Someone | ||
895 | who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ | ||
896 | void __init smp_prepare_cpus(unsigned int max_cpus) | ||
897 | { | ||
898 | smp_boot_cpus(max_cpus); | ||
899 | } | ||
900 | |||
901 | void __devinit smp_prepare_boot_cpu(void) | ||
902 | { | ||
903 | cpu_set(smp_processor_id(), cpu_online_map); | ||
904 | cpu_set(smp_processor_id(), cpu_callout_map); | ||
905 | } | ||
906 | |||
907 | int __devinit __cpu_up(unsigned int cpu) | ||
908 | { | ||
909 | /* This only works at boot for x86. See "rewrite" above. */ | ||
910 | if (cpu_isset(cpu, smp_commenced_mask)) { | ||
911 | local_irq_enable(); | ||
912 | return -ENOSYS; | ||
913 | } | ||
914 | |||
915 | /* In case one didn't come up */ | ||
916 | if (!cpu_isset(cpu, cpu_callin_map)) { | ||
917 | local_irq_enable(); | ||
918 | return -EIO; | ||
919 | } | ||
920 | local_irq_enable(); | ||
921 | |||
922 | /* Unleash the CPU! */ | ||
923 | Dprintk("waiting for cpu %d\n", cpu); | ||
924 | |||
925 | cpu_set(cpu, smp_commenced_mask); | ||
926 | while (!cpu_isset(cpu, cpu_online_map)) | ||
927 | mb(); | ||
928 | return 0; | ||
929 | } | ||
930 | |||
931 | void __init smp_cpus_done(unsigned int max_cpus) | ||
932 | { | ||
933 | #ifdef CONFIG_X86_IO_APIC | ||
934 | setup_ioapic_dest(); | ||
935 | #endif | ||
936 | zap_low_mappings(); | ||
937 | } | ||
938 | |||
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c new file mode 100644 index 000000000000..ebaa1e37d657 --- /dev/null +++ b/arch/x86_64/kernel/suspend.c | |||
@@ -0,0 +1,157 @@ | |||
1 | /* | ||
2 | * Suspend support specific for i386. | ||
3 | * | ||
4 | * Distribute under GPLv2 | ||
5 | * | ||
6 | * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> | ||
7 | * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> | ||
8 | */ | ||
9 | |||
10 | #include <linux/config.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/poll.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/sysrq.h> | ||
19 | #include <linux/proc_fs.h> | ||
20 | #include <linux/irq.h> | ||
21 | #include <linux/pm.h> | ||
22 | #include <linux/device.h> | ||
23 | #include <linux/suspend.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | #include <asm/acpi.h> | ||
26 | #include <asm/tlbflush.h> | ||
27 | #include <asm/io.h> | ||
28 | #include <asm/proto.h> | ||
29 | |||
30 | struct saved_context saved_context; | ||
31 | |||
32 | unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; | ||
33 | unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi; | ||
34 | unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11; | ||
35 | unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; | ||
36 | unsigned long saved_context_eflags; | ||
37 | |||
38 | void __save_processor_state(struct saved_context *ctxt) | ||
39 | { | ||
40 | kernel_fpu_begin(); | ||
41 | |||
42 | /* | ||
43 | * descriptor tables | ||
44 | */ | ||
45 | asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); | ||
46 | asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); | ||
47 | asm volatile ("sldt %0" : "=m" (ctxt->ldt)); | ||
48 | asm volatile ("str %0" : "=m" (ctxt->tr)); | ||
49 | |||
50 | /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ | ||
51 | /* EFER should be constant for a given kernel version, no need to handle it. */ | ||
52 | /* | ||
53 | * segment registers | ||
54 | */ | ||
55 | asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); | ||
56 | asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); | ||
57 | asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); | ||
58 | asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); | ||
59 | asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); | ||
60 | |||
61 | rdmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
62 | rdmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
63 | rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
64 | |||
65 | /* | ||
66 | * control registers | ||
67 | */ | ||
68 | asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); | ||
69 | asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); | ||
70 | asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); | ||
71 | asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); | ||
72 | } | ||
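
The "store the register straight into the context structure" idiom used above also works from user space for the unprivileged pieces (segment selectors, flags); a small x86-64-only sketch, not the kernel's saved_context layout:

#include <stdio.h>

int main(void)
{
	unsigned short cs, ds, ss;
	unsigned long flags;

	/* same store-into-memory idiom as __save_processor_state() */
	asm volatile ("movw %%cs, %0" : "=m" (cs));
	asm volatile ("movw %%ds, %0" : "=m" (ds));
	asm volatile ("movw %%ss, %0" : "=m" (ss));
	asm volatile ("pushfq ; popq %0" : "=r" (flags));

	printf("cs=%#hx ds=%#hx ss=%#hx rflags=%#lx\n", cs, ds, ss, flags);
	return 0;
}
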
73 | |||
74 | void save_processor_state(void) | ||
75 | { | ||
76 | __save_processor_state(&saved_context); | ||
77 | } | ||
78 | |||
79 | static void | ||
80 | do_fpu_end(void) | ||
81 | { | ||
82 | /* restore FPU regs if necessary */ | ||
83 | /* Do it out of line so that gcc does not move cr0 load to some stupid place */ | ||
84 | kernel_fpu_end(); | ||
85 | mxcsr_feature_mask_init(); | ||
86 | } | ||
87 | |||
88 | void __restore_processor_state(struct saved_context *ctxt) | ||
89 | { | ||
90 | /* | ||
91 | * control registers | ||
92 | */ | ||
93 | asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); | ||
94 | asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); | ||
95 | asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); | ||
96 | asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); | ||
97 | |||
98 | /* | ||
99 | * segment registers | ||
100 | */ | ||
101 | asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); | ||
102 | asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); | ||
103 | asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); | ||
104 | load_gs_index(ctxt->gs); | ||
105 | asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); | ||
106 | |||
107 | wrmsrl(MSR_FS_BASE, ctxt->fs_base); | ||
108 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); | ||
109 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | ||
110 | |||
111 | /* | ||
112 | * now restore the descriptor tables to their proper values | ||
113 | * ltr is done in fix_processor_context(). | ||
114 | */ | ||
115 | asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); | ||
116 | asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); | ||
117 | asm volatile ("lldt %0" :: "m" (ctxt->ldt)); | ||
118 | |||
119 | fix_processor_context(); | ||
120 | |||
121 | do_fpu_end(); | ||
122 | } | ||
123 | |||
124 | void restore_processor_state(void) | ||
125 | { | ||
126 | __restore_processor_state(&saved_context); | ||
127 | } | ||
128 | |||
129 | void fix_processor_context(void) | ||
130 | { | ||
131 | int cpu = smp_processor_id(); | ||
132 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
133 | |||
134 | set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... it is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */ | ||
135 | |||
136 | cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9; | ||
137 | |||
138 | syscall_init(); /* This sets MSR_*STAR and related */ | ||
139 | load_TR_desc(); /* This does ltr */ | ||
140 | load_LDT(¤t->active_mm->context); /* This does lldt */ | ||
141 | |||
142 | /* | ||
143 | * Now maybe reload the debug registers | ||
144 | */ | ||
145 | if (current->thread.debugreg7){ | ||
146 | loaddebug(¤t->thread, 0); | ||
147 | loaddebug(¤t->thread, 1); | ||
148 | loaddebug(¤t->thread, 2); | ||
149 | loaddebug(¤t->thread, 3); | ||
150 | /* no 4 and 5 */ | ||
151 | loaddebug(¤t->thread, 6); | ||
152 | loaddebug(¤t->thread, 7); | ||
153 | } | ||
154 | |||
155 | } | ||
156 | |||
157 | |||
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S new file mode 100644 index 000000000000..53f8e1659511 --- /dev/null +++ b/arch/x86_64/kernel/suspend_asm.S | |||
@@ -0,0 +1,104 @@ | |||
1 | /* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl> | ||
2 | * | ||
3 | * Distribute under GPLv2. | ||
4 | * | ||
5 | * swsusp_arch_resume may not use any stack, nor any variable that is | ||
6 | * not "NoSave" during copying pages: | ||
7 | * | ||
8 | * It's rewriting one kernel image with another. What is the stack in the "old" | ||
9 | * image could very well be a data page in the "new" image, and overwriting | ||
10 | * your own stack from under you is a bad idea. | ||
11 | */ | ||
12 | |||
13 | .text | ||
14 | #include <linux/linkage.h> | ||
15 | #include <asm/segment.h> | ||
16 | #include <asm/page.h> | ||
17 | #include <asm/offset.h> | ||
18 | |||
19 | ENTRY(swsusp_arch_suspend) | ||
20 | |||
21 | movq %rsp, saved_context_esp(%rip) | ||
22 | movq %rax, saved_context_eax(%rip) | ||
23 | movq %rbx, saved_context_ebx(%rip) | ||
24 | movq %rcx, saved_context_ecx(%rip) | ||
25 | movq %rdx, saved_context_edx(%rip) | ||
26 | movq %rbp, saved_context_ebp(%rip) | ||
27 | movq %rsi, saved_context_esi(%rip) | ||
28 | movq %rdi, saved_context_edi(%rip) | ||
29 | movq %r8, saved_context_r08(%rip) | ||
30 | movq %r9, saved_context_r09(%rip) | ||
31 | movq %r10, saved_context_r10(%rip) | ||
32 | movq %r11, saved_context_r11(%rip) | ||
33 | movq %r12, saved_context_r12(%rip) | ||
34 | movq %r13, saved_context_r13(%rip) | ||
35 | movq %r14, saved_context_r14(%rip) | ||
36 | movq %r15, saved_context_r15(%rip) | ||
37 | pushfq ; popq saved_context_eflags(%rip) | ||
38 | |||
39 | call swsusp_save | ||
40 | ret | ||
41 | |||
42 | ENTRY(swsusp_arch_resume) | ||
43 | /* set up cr3 */ | ||
44 | leaq init_level4_pgt(%rip),%rax | ||
45 | subq $__START_KERNEL_map,%rax | ||
46 | movq %rax,%cr3 | ||
47 | |||
48 | movq mmu_cr4_features(%rip), %rax | ||
49 | movq %rax, %rdx | ||
50 | andq $~(1<<7), %rdx # PGE | ||
51 | movq %rdx, %cr4; # turn off PGE | ||
52 | movq %cr3, %rcx; # flush TLB | ||
53 | movq %rcx, %cr3; | ||
54 | movq %rax, %cr4; # turn PGE back on | ||
55 | |||
56 | movq pagedir_nosave(%rip), %rdx | ||
57 | loop: | ||
58 | testq %rdx, %rdx | ||
59 | jz done | ||
60 | |||
61 | /* get addresses from the pbe and copy the page */ | ||
62 | movq pbe_address(%rdx), %rsi | ||
63 | movq pbe_orig_address(%rdx), %rdi | ||
64 | movq $512, %rcx | ||
65 | rep | ||
66 | movsq | ||
67 | |||
68 | /* progress to the next pbe */ | ||
69 | movq pbe_next(%rdx), %rdx | ||
70 | jmp loop | ||
71 | done: | ||
72 | /* Flush TLB, including "global" things (vmalloc) */ | ||
73 | movq mmu_cr4_features(%rip), %rax | ||
74 | movq %rax, %rdx | ||
75 | andq $~(1<<7), %rdx; # PGE | ||
76 | movq %rdx, %cr4; # turn off PGE | ||
77 | movq %cr3, %rcx; # flush TLB | ||
78 | movq %rcx, %cr3 | ||
79 | movq %rax, %cr4; # turn PGE back on | ||
80 | |||
81 | movl $24, %eax | ||
82 | movl %eax, %ds | ||
83 | |||
84 | movq saved_context_esp(%rip), %rsp | ||
85 | movq saved_context_ebp(%rip), %rbp | ||
86 | /* Don't restore %rax, it must be 0 anyway */ | ||
87 | movq saved_context_ebx(%rip), %rbx | ||
88 | movq saved_context_ecx(%rip), %rcx | ||
89 | movq saved_context_edx(%rip), %rdx | ||
90 | movq saved_context_esi(%rip), %rsi | ||
91 | movq saved_context_edi(%rip), %rdi | ||
92 | movq saved_context_r08(%rip), %r8 | ||
93 | movq saved_context_r09(%rip), %r9 | ||
94 | movq saved_context_r10(%rip), %r10 | ||
95 | movq saved_context_r11(%rip), %r11 | ||
96 | movq saved_context_r12(%rip), %r12 | ||
97 | movq saved_context_r13(%rip), %r13 | ||
98 | movq saved_context_r14(%rip), %r14 | ||
99 | movq saved_context_r15(%rip), %r15 | ||
100 | pushq saved_context_eflags(%rip) ; popfq | ||
101 | |||
102 | xorq %rax, %rax | ||
103 | |||
104 | ret | ||
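
The resume loop above walks a singly linked list of page-backup entries and copies one 4K page per entry (512 quadwords via rep movsq). Expressed in C it is roughly the following; the struct layout is illustrative, the real offsets come from asm/offset.h:

#include <string.h>

#define PAGE_SIZE 4096UL

/* illustrative stand-in for the swsusp page-backup entry */
struct pbe {
	unsigned long orig_address;   /* where the page lived before suspend */
	unsigned long address;        /* where the saved copy sits now */
	struct pbe *next;
};

static void copy_back_pages(struct pbe *pagedir_nosave)
{
	struct pbe *p;

	/* "movq $512, %rcx ; rep movsq" copies exactly one page per entry */
	for (p = pagedir_nosave; p; p = p->next)
		memcpy((void *)p->orig_address, (void *)p->address, PAGE_SIZE);
}
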
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c new file mode 100644 index 000000000000..477d8be57d64 --- /dev/null +++ b/arch/x86_64/kernel/sys_x86_64.c | |||
@@ -0,0 +1,173 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/sys_x86_64.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/errno.h> | ||
6 | #include <linux/sched.h> | ||
7 | #include <linux/syscalls.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/smp.h> | ||
10 | #include <linux/smp_lock.h> | ||
11 | #include <linux/sem.h> | ||
12 | #include <linux/msg.h> | ||
13 | #include <linux/shm.h> | ||
14 | #include <linux/stat.h> | ||
15 | #include <linux/mman.h> | ||
16 | #include <linux/file.h> | ||
17 | #include <linux/utsname.h> | ||
18 | #include <linux/personality.h> | ||
19 | |||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/ia32.h> | ||
22 | |||
23 | /* | ||
24 | * sys_pipe() is the normal C calling standard for creating | ||
25 | * a pipe. It's not the way Unix traditionally does this, though. | ||
26 | */ | ||
27 | asmlinkage long sys_pipe(int __user *fildes) | ||
28 | { | ||
29 | int fd[2]; | ||
30 | int error; | ||
31 | |||
32 | error = do_pipe(fd); | ||
33 | if (!error) { | ||
34 | if (copy_to_user(fildes, fd, 2*sizeof(int))) | ||
35 | error = -EFAULT; | ||
36 | } | ||
37 | return error; | ||
38 | } | ||
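
Seen from user space, the two descriptors copied out by sys_pipe() arrive in a plain int[2]; a minimal sketch:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd[2];
	char buf[16];

	if (pipe(fd) < 0) {          /* ends up in sys_pipe() above */
		perror("pipe");
		return 1;
	}
	write(fd[1], "ping", 4);     /* fd[1] is the write end */
	read(fd[0], buf, 4);         /* fd[0] is the read end */
	buf[4] = '\0';
	printf("got \"%s\" through the pipe\n", buf);
	close(fd[0]);
	close(fd[1]);
	return 0;
}
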
39 | |||
40 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, | ||
41 | unsigned long fd, unsigned long off) | ||
42 | { | ||
43 | long error; | ||
44 | struct file * file; | ||
45 | |||
46 | error = -EINVAL; | ||
47 | if (off & ~PAGE_MASK) | ||
48 | goto out; | ||
49 | |||
50 | error = -EBADF; | ||
51 | file = NULL; | ||
52 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
53 | if (!(flags & MAP_ANONYMOUS)) { | ||
54 | file = fget(fd); | ||
55 | if (!file) | ||
56 | goto out; | ||
57 | } | ||
58 | down_write(¤t->mm->mmap_sem); | ||
59 | error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); | ||
60 | up_write(¤t->mm->mmap_sem); | ||
61 | |||
62 | if (file) | ||
63 | fput(file); | ||
64 | out: | ||
65 | return error; | ||
66 | } | ||
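
The MAP_32BIT window handled by find_start_end() below can be observed from user space; a Linux/x86-64-specific sketch:

#define _GNU_SOURCE          /* for MAP_32BIT */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;

	void *lo = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
	void *hi = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* the MAP_32BIT mapping lands in the 0x40000000..0x80000000 window */
	printf("MAP_32BIT: %p\n", lo);
	printf("default  : %p\n", hi);
	return 0;
}
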
67 | |||
68 | static void find_start_end(unsigned long flags, unsigned long *begin, | ||
69 | unsigned long *end) | ||
70 | { | ||
71 | #ifdef CONFIG_IA32_EMULATION | ||
72 | if (test_thread_flag(TIF_IA32)) { | ||
73 | *begin = TASK_UNMAPPED_32; | ||
74 | *end = IA32_PAGE_OFFSET; | ||
75 | } else | ||
76 | #endif | ||
77 | if (flags & MAP_32BIT) { | ||
78 | /* This is usually used to map code in the small | ||
79 | model, so it needs to be in the first 31 bits. Limit | ||
80 | it to that. This means we need to move the | ||
81 | unmapped base down for this case. This can give | ||
82 | conflicts with the heap, but we assume that glibc | ||
83 | malloc knows how to fall back to mmap. Give it 1GB | ||
84 | of playground for now. -AK */ | ||
85 | *begin = 0x40000000; | ||
86 | *end = 0x80000000; | ||
87 | } else { | ||
88 | *begin = TASK_UNMAPPED_64; | ||
89 | *end = TASK_SIZE; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | unsigned long | ||
94 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | ||
95 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
96 | { | ||
97 | struct mm_struct *mm = current->mm; | ||
98 | struct vm_area_struct *vma; | ||
99 | unsigned long start_addr; | ||
100 | unsigned long begin, end; | ||
101 | |||
102 | find_start_end(flags, &begin, &end); | ||
103 | |||
104 | if (len > end) | ||
105 | return -ENOMEM; | ||
106 | |||
107 | if (addr) { | ||
108 | addr = PAGE_ALIGN(addr); | ||
109 | vma = find_vma(mm, addr); | ||
110 | if (end - len >= addr && | ||
111 | (!vma || addr + len <= vma->vm_start)) | ||
112 | return addr; | ||
113 | } | ||
114 | addr = mm->free_area_cache; | ||
115 | if (addr < begin) | ||
116 | addr = begin; | ||
117 | start_addr = addr; | ||
118 | |||
119 | full_search: | ||
120 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
121 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
122 | if (end - len < addr) { | ||
123 | /* | ||
124 | * Start a new search - just in case we missed | ||
125 | * some holes. | ||
126 | */ | ||
127 | if (start_addr != begin) { | ||
128 | start_addr = addr = begin; | ||
129 | goto full_search; | ||
130 | } | ||
131 | return -ENOMEM; | ||
132 | } | ||
133 | if (!vma || addr + len <= vma->vm_start) { | ||
134 | /* | ||
135 | * Remember the place where we stopped the search: | ||
136 | */ | ||
137 | mm->free_area_cache = addr + len; | ||
138 | return addr; | ||
139 | } | ||
140 | addr = vma->vm_end; | ||
141 | } | ||
142 | } | ||
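
The search above is a first-fit walk over the sorted VMA list, restarting once from `begin` if the cached hint missed a hole; the core first-fit idea over a plain array of busy ranges (data is illustrative):

#include <stdio.h>

struct range { unsigned long start, end; };   /* sorted, busy [start, end) areas */

static unsigned long first_fit(struct range *vma, int n, unsigned long begin,
			       unsigned long end, unsigned long len)
{
	unsigned long addr = begin;
	int i;

	for (i = 0; i < n && addr + len <= end; i++) {
		if (addr + len <= vma[i].start)
			return addr;              /* hole before this vma is big enough */
		if (addr < vma[i].end)
			addr = vma[i].end;        /* skip past it and keep looking */
	}
	return addr + len <= end ? addr : (unsigned long)-1;  /* -ENOMEM analogue */
}

int main(void)
{
	struct range busy[] = { { 0x1000, 0x3000 }, { 0x5000, 0x6000 } };

	/* first hole of 0x3000 bytes starts after the second mapping, at 0x6000 */
	printf("0x%lx\n", first_fit(busy, 2, 0x1000, 0x10000, 0x3000));
	return 0;
}
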
143 | |||
144 | asmlinkage long sys_uname(struct new_utsname __user * name) | ||
145 | { | ||
146 | int err; | ||
147 | down_read(&uts_sem); | ||
148 | err = copy_to_user(name, &system_utsname, sizeof (*name)); | ||
149 | up_read(&uts_sem); | ||
150 | if (personality(current->personality) == PER_LINUX32) | ||
151 | err |= copy_to_user(&name->machine, "i686", 5); | ||
152 | return err ? -EFAULT : 0; | ||
153 | } | ||
154 | |||
155 | asmlinkage long wrap_sys_shmat(int shmid, char __user *shmaddr, int shmflg) | ||
156 | { | ||
157 | unsigned long raddr; | ||
158 | return do_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr; | ||
159 | } | ||
160 | |||
161 | asmlinkage long sys_time64(long __user * tloc) | ||
162 | { | ||
163 | struct timeval now; | ||
164 | int i; | ||
165 | |||
166 | do_gettimeofday(&now); | ||
167 | i = now.tv_sec; | ||
168 | if (tloc) { | ||
169 | if (put_user(i,tloc)) | ||
170 | i = -EFAULT; | ||
171 | } | ||
172 | return i; | ||
173 | } | ||
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c new file mode 100644 index 000000000000..e263685f864c --- /dev/null +++ b/arch/x86_64/kernel/syscall.c | |||
@@ -0,0 +1,26 @@ | |||
1 | /* System call table for x86-64. */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <linux/sys.h> | ||
5 | #include <linux/cache.h> | ||
6 | #include <linux/config.h> | ||
7 | |||
8 | #define __NO_STUBS | ||
9 | |||
10 | #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; | ||
11 | #undef _ASM_X86_64_UNISTD_H_ | ||
12 | #include <asm-x86_64/unistd.h> | ||
13 | |||
14 | #undef __SYSCALL | ||
15 | #define __SYSCALL(nr, sym) [ nr ] = sym, | ||
16 | #undef _ASM_X86_64_UNISTD_H_ | ||
17 | |||
18 | typedef void (*sys_call_ptr_t)(void); | ||
19 | |||
20 | extern void sys_ni_syscall(void); | ||
21 | |||
22 | sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { | ||
23 | /* Smells like a compiler bug -- it doesn't work when the & below is removed. */ | ||
24 | [0 ... __NR_syscall_max] = &sys_ni_syscall, | ||
25 | #include <asm-x86_64/unistd.h> | ||
26 | }; | ||
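
The table relies on GNU C designated range initializers: every slot defaults to sys_ni_syscall, and the per-number entries generated from unistd.h then override it. The pattern in isolation (names are made up):

#include <stdio.h>

static void nosys(void)    { puts("nosys"); }
static void do_read(void)  { puts("read"); }
static void do_write(void) { puts("write"); }

#define NR_MAX 7

/* GNU extension: "[0 ... N] =" fills a range; later designators override it */
static void (*table[NR_MAX + 1])(void) = {
	[0 ... NR_MAX] = nosys,
	[0] = do_read,
	[1] = do_write,
};

int main(void)
{
	table[1]();   /* write */
	table[5]();   /* nosys */
	return 0;
}
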
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c new file mode 100644 index 000000000000..3bafe438fa75 --- /dev/null +++ b/arch/x86_64/kernel/time.c | |||
@@ -0,0 +1,1262 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/kernel/time.c | ||
3 | * | ||
4 | * "High Precision Event Timer" based timekeeping. | ||
5 | * | ||
6 | * Copyright (c) 1991,1992,1995 Linus Torvalds | ||
7 | * Copyright (c) 1994 Alan Modra | ||
8 | * Copyright (c) 1995 Markus Kuhn | ||
9 | * Copyright (c) 1996 Ingo Molnar | ||
10 | * Copyright (c) 1998 Andrea Arcangeli | ||
11 | * Copyright (c) 2002 Vojtech Pavlik | ||
12 | * Copyright (c) 2003 Andi Kleen | ||
13 | * RTC support code taken from arch/i386/kernel/timers/time_hpet.c | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/mc146818rtc.h> | ||
21 | #include <linux/irq.h> | ||
22 | #include <linux/time.h> | ||
23 | #include <linux/ioport.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/device.h> | ||
26 | #include <linux/sysdev.h> | ||
27 | #include <linux/bcd.h> | ||
28 | #include <linux/kallsyms.h> | ||
29 | #include <asm/8253pit.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | #include <asm/vsyscall.h> | ||
32 | #include <asm/timex.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm/hpet.h> | ||
35 | #include <asm/sections.h> | ||
36 | #include <linux/cpufreq.h> | ||
37 | #include <linux/hpet.h> | ||
38 | #ifdef CONFIG_X86_LOCAL_APIC | ||
39 | #include <asm/apic.h> | ||
40 | #endif | ||
41 | |||
42 | u64 jiffies_64 = INITIAL_JIFFIES; | ||
43 | |||
44 | EXPORT_SYMBOL(jiffies_64); | ||
45 | |||
46 | #ifdef CONFIG_CPU_FREQ | ||
47 | static void cpufreq_delayed_get(void); | ||
48 | #endif | ||
49 | extern void i8254_timer_resume(void); | ||
50 | extern int using_apic_timer; | ||
51 | |||
52 | DEFINE_SPINLOCK(rtc_lock); | ||
53 | DEFINE_SPINLOCK(i8253_lock); | ||
54 | |||
55 | static int nohpet __initdata = 0; | ||
56 | static int notsc __initdata = 0; | ||
57 | |||
58 | #undef HPET_HACK_ENABLE_DANGEROUS | ||
59 | |||
60 | unsigned int cpu_khz; /* TSC clocks / usec, not used here */ | ||
61 | static unsigned long hpet_period; /* fsecs / HPET clock */ | ||
62 | unsigned long hpet_tick; /* HPET clocks / interrupt */ | ||
63 | unsigned long vxtime_hz = PIT_TICK_RATE; | ||
64 | int report_lost_ticks; /* command line option */ | ||
65 | unsigned long long monotonic_base; | ||
66 | |||
67 | struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ | ||
68 | |||
69 | volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; | ||
70 | unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; | ||
71 | struct timespec __xtime __section_xtime; | ||
72 | struct timezone __sys_tz __section_sys_tz; | ||
73 | |||
74 | static inline void rdtscll_sync(unsigned long *tsc) | ||
75 | { | ||
76 | #ifdef CONFIG_SMP | ||
77 | sync_core(); | ||
78 | #endif | ||
79 | rdtscll(*tsc); | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * do_gettimeoffset() returns microseconds since last timer interrupt was | ||
84 | * triggered by hardware. A memory read of HPET is slower than a register read | ||
85 | * of TSC, but much more reliable. It's also synchronized to the timer | ||
86 | * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a | ||
87 | * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. | ||
88 | * This is not a problem, because jiffies hasn't updated either. They are bound | ||
89 | * together by xtime_lock. | ||
90 | */ | ||
91 | |||
92 | static inline unsigned int do_gettimeoffset_tsc(void) | ||
93 | { | ||
94 | unsigned long t; | ||
95 | unsigned long x; | ||
96 | rdtscll_sync(&t); | ||
97 | if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ | ||
98 | x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; | ||
99 | return x; | ||
100 | } | ||
101 | |||
102 | static inline unsigned int do_gettimeoffset_hpet(void) | ||
103 | { | ||
104 | return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; | ||
105 | } | ||
106 | |||
107 | unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; | ||
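
The `* quot >> 32` in do_gettimeoffset_tsc() is 32.32 fixed-point arithmetic: tsc_quot holds microseconds per TSC count scaled by 2^32 (it is set up later in this file, roughly as (1000 << 32) / cpu_khz). A worked sketch assuming a 2.4 GHz TSC and 64-bit longs:

#include <stdio.h>

int main(void)
{
	unsigned long cpu_khz = 2400000;                      /* assumed 2.4 GHz TSC */
	unsigned long tsc_quot = (1000UL << 32) / cpu_khz;    /* usecs/cycle in 32.32 */
	unsigned long delta_tsc = 1200000;                    /* cycles since last tick */

	/* same fixed-point multiply as do_gettimeoffset_tsc(); ~500 usecs expected */
	unsigned long usecs = (delta_tsc * tsc_quot) >> 32;

	printf("%lu cycles at %lu kHz = %lu usecs\n", delta_tsc, cpu_khz, usecs);
	return 0;
}
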
108 | |||
109 | /* | ||
110 | * This version of gettimeofday() has microsecond resolution and better than | ||
111 | * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 | ||
112 | * MHz) HPET timer. | ||
113 | */ | ||
114 | |||
115 | void do_gettimeofday(struct timeval *tv) | ||
116 | { | ||
117 | unsigned long seq, t; | ||
118 | unsigned int sec, usec; | ||
119 | |||
120 | do { | ||
121 | seq = read_seqbegin(&xtime_lock); | ||
122 | |||
123 | sec = xtime.tv_sec; | ||
124 | usec = xtime.tv_nsec / 1000; | ||
125 | |||
126 | /* i386 does some correction here to keep the clock | ||
127 | monotonic even when ntpd is fixing drift. | ||
128 | But it didn't work for me; the clock is non-monotonic | ||
129 | with ntp anyway. | ||
130 | I dropped all corrections for now until a real solution can | ||
131 | be found. Note when you fix it here you need to do the same | ||
132 | in arch/x86_64/kernel/vsyscall.c and export all needed | ||
133 | variables in vmlinux.lds. -AK */ | ||
134 | |||
135 | t = (jiffies - wall_jiffies) * (1000000L / HZ) + | ||
136 | do_gettimeoffset(); | ||
137 | usec += t; | ||
138 | |||
139 | } while (read_seqretry(&xtime_lock, seq)); | ||
140 | |||
141 | tv->tv_sec = sec + usec / 1000000; | ||
142 | tv->tv_usec = usec % 1000000; | ||
143 | } | ||
144 | |||
145 | EXPORT_SYMBOL(do_gettimeofday); | ||
146 | |||
147 | /* | ||
148 | * settimeofday() first undoes the correction that gettimeofday would do | ||
149 | * on the time, and then saves it. This is ugly, but has been like this for | ||
150 | * ages already. | ||
151 | */ | ||
152 | |||
153 | int do_settimeofday(struct timespec *tv) | ||
154 | { | ||
155 | time_t wtm_sec, sec = tv->tv_sec; | ||
156 | long wtm_nsec, nsec = tv->tv_nsec; | ||
157 | |||
158 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
159 | return -EINVAL; | ||
160 | |||
161 | write_seqlock_irq(&xtime_lock); | ||
162 | |||
163 | nsec -= do_gettimeoffset() * 1000 + | ||
164 | (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); | ||
165 | |||
166 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
167 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
168 | |||
169 | set_normalized_timespec(&xtime, sec, nsec); | ||
170 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
171 | |||
172 | time_adjust = 0; /* stop active adjtime() */ | ||
173 | time_status |= STA_UNSYNC; | ||
174 | time_maxerror = NTP_PHASE_LIMIT; | ||
175 | time_esterror = NTP_PHASE_LIMIT; | ||
176 | |||
177 | write_sequnlock_irq(&xtime_lock); | ||
178 | clock_was_set(); | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | EXPORT_SYMBOL(do_settimeofday); | ||
183 | |||
184 | unsigned long profile_pc(struct pt_regs *regs) | ||
185 | { | ||
186 | unsigned long pc = instruction_pointer(regs); | ||
187 | |||
188 | /* Assume the lock function has either no stack frame or only a single word. | ||
189 | This checks if the address on the stack looks like a kernel text address. | ||
190 | There is a small window for false hits, but in that case the tick | ||
191 | is just accounted to the spinlock function. | ||
192 | Better would be to write these functions in assembler again | ||
193 | and check exactly. */ | ||
194 | if (in_lock_functions(pc)) { | ||
195 | char *v = *(char **)regs->rsp; | ||
196 | if ((v >= _stext && v <= _etext) || | ||
197 | (v >= _sinittext && v <= _einittext) || | ||
198 | (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) | ||
199 | return (unsigned long)v; | ||
200 | return ((unsigned long *)regs->rsp)[1]; | ||
201 | } | ||
202 | return pc; | ||
203 | } | ||
204 | EXPORT_SYMBOL(profile_pc); | ||
205 | |||
206 | /* | ||
207 | * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 | ||
208 | * ms after the second nowtime has started, because when nowtime is written | ||
209 | * into the registers of the CMOS clock, it will jump to the next second | ||
210 | * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data | ||
211 | * sheet for details. | ||
212 | */ | ||
213 | |||
214 | static void set_rtc_mmss(unsigned long nowtime) | ||
215 | { | ||
216 | int real_seconds, real_minutes, cmos_minutes; | ||
217 | unsigned char control, freq_select; | ||
218 | |||
219 | /* | ||
220 | * IRQs are disabled when we're called from the timer interrupt, | ||
221 | * no need for spin_lock_irqsave() | ||
222 | */ | ||
223 | |||
224 | spin_lock(&rtc_lock); | ||
225 | |||
226 | /* | ||
227 | * Tell the clock it's being set and stop it. | ||
228 | */ | ||
229 | |||
230 | control = CMOS_READ(RTC_CONTROL); | ||
231 | CMOS_WRITE(control | RTC_SET, RTC_CONTROL); | ||
232 | |||
233 | freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
234 | CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); | ||
235 | |||
236 | cmos_minutes = CMOS_READ(RTC_MINUTES); | ||
237 | BCD_TO_BIN(cmos_minutes); | ||
238 | |||
239 | /* | ||
240 | * since we're only adjusting minutes and seconds, don't interfere with hour | ||
241 | * overflow. This avoids messing with unknown time zones but requires your RTC | ||
242 | * not to be off by more than 15 minutes. Since we're calling it only when | ||
243 | * our clock is externally synchronized using NTP, this shouldn't be a problem. | ||
244 | */ | ||
245 | |||
246 | real_seconds = nowtime % 60; | ||
247 | real_minutes = nowtime / 60; | ||
248 | if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) | ||
249 | real_minutes += 30; /* correct for half hour time zone */ | ||
250 | real_minutes %= 60; | ||
251 | |||
252 | #if 0 | ||
253 | /* AMD 8111 is a really bad time keeper and hits this regularly. | ||
254 | It probably was an attempt to avoid screwing up DST, but ignore | ||
255 | that for now. */ | ||
256 | if (abs(real_minutes - cmos_minutes) >= 30) { | ||
257 | printk(KERN_WARNING "time.c: can't update CMOS clock " | ||
258 | "from %d to %d\n", cmos_minutes, real_minutes); | ||
259 | } else | ||
260 | #endif | ||
261 | |||
262 | { | ||
263 | BIN_TO_BCD(real_seconds); | ||
264 | BIN_TO_BCD(real_minutes); | ||
265 | CMOS_WRITE(real_seconds, RTC_SECONDS); | ||
266 | CMOS_WRITE(real_minutes, RTC_MINUTES); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * The following flags have to be released exactly in this order, otherwise the | ||
271 | * DS12887 (popular MC146818A clone with integrated battery and quartz) will | ||
272 | * not reset the oscillator and will not update precisely 500 ms later. You | ||
273 | * won't find this mentioned in the Dallas Semiconductor data sheets, but who | ||
274 | * believes data sheets anyway ... -- Markus Kuhn | ||
275 | */ | ||
276 | |||
277 | CMOS_WRITE(control, RTC_CONTROL); | ||
278 | CMOS_WRITE(freq_select, RTC_FREQ_SELECT); | ||
279 | |||
280 | spin_unlock(&rtc_lock); | ||
281 | } | ||
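A quick worked example of the half-hour correction in set_rtc_mmss() above, with illustrative values only:

	/* The kernel wants to write minute 5, but the RTC runs in a zone
	 * offset by 30 minutes and currently shows minute 35. */
	int real_minutes = 5, cmos_minutes = 35;
	if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)  /* (30 + 15) / 30 = 1, odd */
		real_minutes += 30;                               /* 35 */
	real_minutes %= 60;                                       /* write 35, matching the RTC's offset */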
282 | |||
283 | |||
284 | /* monotonic_clock(): returns # of nanoseconds passed since time_init() | ||
285 | * Note: This function is required to return accurate | ||
286 | * time even in the absence of multiple timer ticks. | ||
287 | */ | ||
288 | unsigned long long monotonic_clock(void) | ||
289 | { | ||
290 | unsigned long seq; | ||
291 | u32 last_offset, this_offset, offset; | ||
292 | unsigned long long base; | ||
293 | |||
294 | if (vxtime.mode == VXTIME_HPET) { | ||
295 | do { | ||
296 | seq = read_seqbegin(&xtime_lock); | ||
297 | |||
298 | last_offset = vxtime.last; | ||
299 | base = monotonic_base; | ||
300 | this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; | ||
301 | |||
302 | } while (read_seqretry(&xtime_lock, seq)); | ||
303 | offset = (this_offset - last_offset); | ||
304 | offset *= (NSEC_PER_SEC/HZ) / hpet_tick; | ||
305 | return base + offset; | ||
306 | } else { | ||
307 | do { | ||
308 | seq = read_seqbegin(&xtime_lock); | ||
309 | |||
310 | last_offset = vxtime.last_tsc; | ||
311 | base = monotonic_base; | ||
312 | } while (read_seqretry(&xtime_lock, seq)); | ||
313 | sync_core(); | ||
314 | rdtscll(this_offset); | ||
315 | offset = (this_offset - last_offset)*1000/cpu_khz; | ||
316 | return base + offset; | ||
317 | } | ||
318 | |||
319 | |||
320 | } | ||
321 | EXPORT_SYMBOL(monotonic_clock); | ||
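To make the TSC branch of monotonic_clock() concrete, here is the same conversion with assumed numbers (a 2 GHz part, i.e. cpu_khz = 2000000):

	u32 last = 1000000, now = 5000000;      /* TSC samples, 4000000 cycles apart   */
	u32 khz  = 2000000;                     /* assumed CPU clock                   */
	u32 ns   = (now - last) * 1000 / khz;   /* 4000000 * 1000 / 2000000 = 2000 ns  */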
322 | |||
323 | static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) | ||
324 | { | ||
325 | static long lost_count; | ||
326 | static int warned; | ||
327 | |||
328 | if (report_lost_ticks) { | ||
329 | printk(KERN_WARNING "time.c: Lost %d timer " | ||
330 | "tick(s)! ", lost); | ||
331 | print_symbol("rip %s\n", regs->rip); | ||
332 | } | ||
333 | |||
334 | if (lost_count == 1000 && !warned) { | ||
335 | printk(KERN_WARNING | ||
336 | "warning: many lost ticks.\n" | ||
337 | KERN_WARNING "Your time source seems to be unstable or " | ||
338 | "some driver is hogging interrupts\n"); | ||
339 | print_symbol("rip %s\n", regs->rip); | ||
340 | if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { | ||
341 | printk(KERN_WARNING "Falling back to HPET\n"); | ||
342 | vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; | ||
343 | vxtime.mode = VXTIME_HPET; | ||
344 | do_gettimeoffset = do_gettimeoffset_hpet; | ||
345 | } | ||
346 | /* else should fall back to PIT, but code missing. */ | ||
347 | warned = 1; | ||
348 | } else | ||
349 | lost_count++; | ||
350 | |||
351 | #ifdef CONFIG_CPU_FREQ | ||
352 | /* In some cases the CPU can change frequency without us noticing | ||
353 | (like going into thermal throttle) | ||
354 | Give cpufreq a chance to catch up. */ | ||
355 | if ((lost_count+1) % 25 == 0) { | ||
356 | cpufreq_delayed_get(); | ||
357 | } | ||
358 | #endif | ||
359 | } | ||
360 | |||
361 | static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) | ||
362 | { | ||
363 | static unsigned long rtc_update = 0; | ||
364 | unsigned long tsc; | ||
365 | int delay, offset = 0, lost = 0; | ||
366 | |||
367 | /* | ||
368 | * Here we are in the timer irq handler. We have irqs locally disabled (so we | ||
369 | * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running | ||
370 | * on the other CPU, so we need a lock. We also need to lock the vsyscall | ||
371 | * variables, because both do_timer() and us change them -arca+vojtech | ||
372 | */ | ||
373 | |||
374 | write_seqlock(&xtime_lock); | ||
375 | |||
376 | if (vxtime.hpet_address) { | ||
377 | offset = hpet_readl(HPET_T0_CMP) - hpet_tick; | ||
378 | delay = hpet_readl(HPET_COUNTER) - offset; | ||
379 | } else { | ||
380 | spin_lock(&i8253_lock); | ||
381 | outb_p(0x00, 0x43); | ||
382 | delay = inb_p(0x40); | ||
383 | delay |= inb(0x40) << 8; | ||
384 | spin_unlock(&i8253_lock); | ||
385 | delay = LATCH - 1 - delay; | ||
386 | } | ||
387 | |||
388 | rdtscll_sync(&tsc); | ||
389 | |||
390 | if (vxtime.mode == VXTIME_HPET) { | ||
391 | if (offset - vxtime.last > hpet_tick) { | ||
392 | lost = (offset - vxtime.last) / hpet_tick - 1; | ||
393 | } | ||
394 | |||
395 | monotonic_base += | ||
396 | (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; | ||
397 | |||
398 | vxtime.last = offset; | ||
399 | } else { | ||
400 | offset = (((tsc - vxtime.last_tsc) * | ||
401 | vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); | ||
402 | |||
403 | if (offset < 0) | ||
404 | offset = 0; | ||
405 | |||
406 | if (offset > (USEC_PER_SEC / HZ)) { | ||
407 | lost = offset / (USEC_PER_SEC / HZ); | ||
408 | offset %= (USEC_PER_SEC / HZ); | ||
409 | } | ||
410 | |||
411 | monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; | ||
412 | |||
413 | vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; | ||
414 | |||
415 | if ((((tsc - vxtime.last_tsc) * | ||
416 | vxtime.tsc_quot) >> 32) < offset) | ||
417 | vxtime.last_tsc = tsc - | ||
418 | (((long) offset << 32) / vxtime.tsc_quot) - 1; | ||
419 | } | ||
420 | |||
421 | if (lost > 0) { | ||
422 | handle_lost_ticks(lost, regs); | ||
423 | jiffies += lost; | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * Do the timer stuff. | ||
428 | */ | ||
429 | |||
430 | do_timer(regs); | ||
431 | #ifndef CONFIG_SMP | ||
432 | update_process_times(user_mode(regs)); | ||
433 | #endif | ||
434 | |||
435 | /* | ||
436 | * In the SMP case we use the local APIC timer interrupt to do the profiling, | ||
437 | * except when we simulate SMP mode on a uniprocessor system, in that case we | ||
438 | * have to call the local interrupt handler. | ||
439 | */ | ||
440 | |||
441 | #ifndef CONFIG_X86_LOCAL_APIC | ||
442 | profile_tick(CPU_PROFILING, regs); | ||
443 | #else | ||
444 | if (!using_apic_timer) | ||
445 | smp_local_timer_interrupt(regs); | ||
446 | #endif | ||
447 | |||
448 | /* | ||
449 | * If we have an externally synchronized Linux clock, then update CMOS clock | ||
450 | * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy | ||
451 | * closest to exactly 500 ms before the next second. If the update fails, we | ||
452 | * don't care, as it'll be updated on the next turn, and the problem (time way | ||
453 | * off) isn't likely to go away much sooner anyway. | ||
454 | */ | ||
455 | |||
456 | if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update && | ||
457 | abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { | ||
458 | set_rtc_mmss(xtime.tv_sec); | ||
459 | rtc_update = xtime.tv_sec + 660; | ||
460 | } | ||
461 | |||
462 | write_sequnlock(&xtime_lock); | ||
463 | |||
464 | return IRQ_HANDLED; | ||
465 | } | ||
466 | |||
467 | static unsigned int cyc2ns_scale; | ||
468 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ | ||
469 | |||
470 | static inline void set_cyc2ns_scale(unsigned long cpu_mhz) | ||
471 | { | ||
472 | cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; | ||
473 | } | ||
474 | |||
475 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) | ||
476 | { | ||
477 | return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; | ||
478 | } | ||
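The fixed-point factor above avoids a divide on every sched_clock() call. A worked example, assuming a 2000 MHz CPU:

	unsigned int scale = (1000 << 10) / 2000;             /* 1024000 / 2000 = 512                      */
	unsigned long long ns = (2000000ULL * scale) >> 10;   /* 2000000 cycles -> 1000000 ns, i.e. 1 ms   */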
479 | |||
480 | unsigned long long sched_clock(void) | ||
481 | { | ||
482 | unsigned long a = 0; | ||
483 | |||
484 | #if 0 | ||
485 | /* Don't do a HPET read here. Using TSC always is much faster | ||
486 | and HPET may not be mapped yet when the scheduler first runs. | ||
487 | Disadvantage is a small drift between CPUs in some configurations, | ||
488 | but that should be tolerable. */ | ||
489 | if (__vxtime.mode == VXTIME_HPET) | ||
490 | return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; | ||
491 | #endif | ||
492 | |||
493 | /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, | ||
494 | which means it is not completely exact and may not be monotonic between | ||
495 | CPUs. But the errors should be too small to matter for scheduling | ||
496 | purposes. */ | ||
497 | |||
498 | rdtscll(a); | ||
499 | return cycles_2_ns(a); | ||
500 | } | ||
501 | |||
502 | unsigned long get_cmos_time(void) | ||
503 | { | ||
504 | unsigned int timeout, year, mon, day, hour, min, sec; | ||
505 | unsigned char last, this; | ||
506 | unsigned long flags; | ||
507 | |||
508 | /* | ||
509 | * The Linux interpretation of the CMOS clock register contents: When the | ||
510 | * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the | ||
511 | * second which has precisely just started. Waiting for this can take up to 1 | ||
512 | * second; we time out after approximately 2.4 seconds on a machine with a | ||
513 | * standard 8.3 MHz ISA bus. | ||
514 | */ | ||
515 | |||
516 | spin_lock_irqsave(&rtc_lock, flags); | ||
517 | |||
518 | timeout = 1000000; | ||
519 | last = this = 0; | ||
520 | |||
521 | while (timeout && last && !this) { | ||
522 | last = this; | ||
523 | this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; | ||
524 | timeout--; | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Here we are safe to assume the registers won't change for a whole second, so | ||
529 | * we just go ahead and read them. | ||
530 | */ | ||
531 | |||
532 | sec = CMOS_READ(RTC_SECONDS); | ||
533 | min = CMOS_READ(RTC_MINUTES); | ||
534 | hour = CMOS_READ(RTC_HOURS); | ||
535 | day = CMOS_READ(RTC_DAY_OF_MONTH); | ||
536 | mon = CMOS_READ(RTC_MONTH); | ||
537 | year = CMOS_READ(RTC_YEAR); | ||
538 | |||
539 | spin_unlock_irqrestore(&rtc_lock, flags); | ||
540 | |||
541 | /* | ||
542 | * We know that x86-64 always uses BCD format, no need to check the config | ||
543 | * register. | ||
544 | */ | ||
545 | |||
546 | BCD_TO_BIN(sec); | ||
547 | BCD_TO_BIN(min); | ||
548 | BCD_TO_BIN(hour); | ||
549 | BCD_TO_BIN(day); | ||
550 | BCD_TO_BIN(mon); | ||
551 | BCD_TO_BIN(year); | ||
552 | |||
553 | /* | ||
554 | * x86-64 systems have only existed since 2002. | ||
555 | * This will work up to Dec 31, 2100. | ||
556 | */ | ||
557 | year += 2000; | ||
558 | |||
559 | return mktime(year, mon, day, hour, min, sec); | ||
560 | } | ||
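For reference, the RTC registers hold BCD: each decimal digit lives in one nibble, so a raw value of 0x59 means 59. A minimal sketch of what the BCD_TO_BIN() conversions above boil down to (written as a helper here, not the kernel's macro):

	static unsigned int bcd_to_bin(unsigned int v)
	{
		return (v & 0x0f) + (v >> 4) * 10;    /* 0x59 -> 9 + 5 * 10 = 59 */
	}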
561 | |||
562 | #ifdef CONFIG_CPU_FREQ | ||
563 | |||
564 | /* Frequency scaling support. Adjust the TSC-based timer when the CPU frequency | ||
565 | changes. | ||
566 | |||
567 | RED-PEN: On SMP we assume all CPUs run with the same frequency. It's | ||
568 | not that important because current Opteron setups do not support | ||
569 | scaling on SMP anyway. | ||
570 | |||
571 | Should fix up last_tsc too. Currently gettimeofday in the | ||
572 | first tick after the change will be slightly wrong. */ | ||
573 | |||
574 | #include <linux/workqueue.h> | ||
575 | |||
576 | static unsigned int cpufreq_delayed_issched = 0; | ||
577 | static unsigned int cpufreq_init = 0; | ||
578 | static struct work_struct cpufreq_delayed_get_work; | ||
579 | |||
580 | static void handle_cpufreq_delayed_get(void *v) | ||
581 | { | ||
582 | unsigned int cpu; | ||
583 | for_each_online_cpu(cpu) { | ||
584 | cpufreq_get(cpu); | ||
585 | } | ||
586 | cpufreq_delayed_issched = 0; | ||
587 | } | ||
588 | |||
589 | /* if we notice lost ticks, schedule a call to cpufreq_get() as it tries | ||
590 | * to verify the CPU frequency the timing core thinks the CPU is running | ||
591 | * at is still correct. | ||
592 | */ | ||
593 | static void cpufreq_delayed_get(void) | ||
594 | { | ||
595 | static int warned; | ||
596 | if (cpufreq_init && !cpufreq_delayed_issched) { | ||
597 | cpufreq_delayed_issched = 1; | ||
598 | if (!warned) { | ||
599 | warned = 1; | ||
600 | printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); | ||
601 | } | ||
602 | schedule_work(&cpufreq_delayed_get_work); | ||
603 | } | ||
604 | } | ||
605 | |||
606 | static unsigned int ref_freq = 0; | ||
607 | static unsigned long loops_per_jiffy_ref = 0; | ||
608 | |||
609 | static unsigned long cpu_khz_ref = 0; | ||
610 | |||
611 | static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | ||
612 | void *data) | ||
613 | { | ||
614 | struct cpufreq_freqs *freq = data; | ||
615 | unsigned long *lpj, dummy; | ||
616 | |||
617 | lpj = &dummy; | ||
618 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
619 | #ifdef CONFIG_SMP | ||
620 | lpj = &cpu_data[freq->cpu].loops_per_jiffy; | ||
621 | #else | ||
622 | lpj = &boot_cpu_data.loops_per_jiffy; | ||
623 | #endif | ||
624 | |||
625 | |||
626 | |||
627 | if (!ref_freq) { | ||
628 | ref_freq = freq->old; | ||
629 | loops_per_jiffy_ref = *lpj; | ||
630 | cpu_khz_ref = cpu_khz; | ||
631 | } | ||
632 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | ||
633 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | ||
634 | (val == CPUFREQ_RESUMECHANGE)) { | ||
635 | *lpj = | ||
636 | cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); | ||
637 | |||
638 | cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); | ||
639 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
640 | vxtime.tsc_quot = (1000L << 32) / cpu_khz; | ||
641 | } | ||
642 | |||
643 | set_cyc2ns_scale(cpu_khz_ref / 1000); | ||
644 | |||
645 | return 0; | ||
646 | } | ||
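The rescaling above is plain proportional arithmetic: the values recorded at the first transition are scaled to the new frequency. A hedged sketch, assuming cpufreq_scale() behaves as old * new / ref:

	unsigned long khz_ref  = 2000000;                     /* recorded at ref_freq              */
	unsigned int  ref_freq = 2000000, new_freq = 1000000; /* kHz; CPU throttled to 1 GHz       */
	unsigned long new_khz  = khz_ref * new_freq / ref_freq;   /* 1000000                       */
	unsigned long new_quot = (1000UL << 32) / new_khz;    /* so (cycles * quot) >> 32 is usec  */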
647 | |||
648 | static struct notifier_block time_cpufreq_notifier_block = { | ||
649 | .notifier_call = time_cpufreq_notifier | ||
650 | }; | ||
651 | |||
652 | static int __init cpufreq_tsc(void) | ||
653 | { | ||
654 | INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); | ||
655 | if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, | ||
656 | CPUFREQ_TRANSITION_NOTIFIER)) | ||
657 | cpufreq_init = 1; | ||
658 | return 0; | ||
659 | } | ||
660 | |||
661 | core_initcall(cpufreq_tsc); | ||
662 | |||
663 | #endif | ||
664 | |||
665 | /* | ||
666 | * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing | ||
667 | * it to the HPET timer of known frequency. | ||
668 | */ | ||
669 | |||
670 | #define TICK_COUNT 100000000 | ||
671 | |||
672 | static unsigned int __init hpet_calibrate_tsc(void) | ||
673 | { | ||
674 | int tsc_start, hpet_start; | ||
675 | int tsc_now, hpet_now; | ||
676 | unsigned long flags; | ||
677 | |||
678 | local_irq_save(flags); | ||
679 | local_irq_disable(); | ||
680 | |||
681 | hpet_start = hpet_readl(HPET_COUNTER); | ||
682 | rdtscl(tsc_start); | ||
683 | |||
684 | do { | ||
685 | local_irq_disable(); | ||
686 | hpet_now = hpet_readl(HPET_COUNTER); | ||
687 | sync_core(); | ||
688 | rdtscl(tsc_now); | ||
689 | local_irq_restore(flags); | ||
690 | } while ((tsc_now - tsc_start) < TICK_COUNT && | ||
691 | (hpet_now - hpet_start) < TICK_COUNT); | ||
692 | |||
693 | return (tsc_now - tsc_start) * 1000000000L | ||
694 | / ((hpet_now - hpet_start) * hpet_period / 1000); | ||
695 | } | ||
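To unpack the return expression above: hpet_period is in femtoseconds per HPET count, so (hpet counts * period / 1000) is the elapsed time in picoseconds, and TSC cycles * 10^9 divided by that gives kHz. With assumed numbers (roughly a 14.318 MHz HPET and a 2 GHz TSC):

	long tsc_delta  = 100000000;       /* TSC cycles observed                 */
	long hpet_delta = 715909;          /* HPET counts over the same interval  */
	long period_fs  = 69841279;        /* femtoseconds per count (assumed)    */
	long khz = tsc_delta * 1000000000L / (hpet_delta * period_fs / 1000);  /* ~2000000 */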
696 | |||
697 | |||
698 | /* | ||
699 | * pit_calibrate_tsc() uses the speaker output (channel 2) of | ||
700 | * the PIT. This is better than using the timer interrupt output, | ||
701 | * because we can read the value of the speaker with just one inb(), | ||
702 | * where we need three i/o operations for the interrupt channel. | ||
703 | * We count how many ticks the TSC does in 50 ms. | ||
704 | */ | ||
705 | |||
706 | static unsigned int __init pit_calibrate_tsc(void) | ||
707 | { | ||
708 | unsigned long start, end; | ||
709 | unsigned long flags; | ||
710 | |||
711 | spin_lock_irqsave(&i8253_lock, flags); | ||
712 | |||
713 | outb((inb(0x61) & ~0x02) | 0x01, 0x61); | ||
714 | |||
715 | outb(0xb0, 0x43); | ||
716 | outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); | ||
717 | outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); | ||
718 | rdtscll(start); | ||
719 | sync_core(); | ||
720 | while ((inb(0x61) & 0x20) == 0); | ||
721 | sync_core(); | ||
722 | rdtscll(end); | ||
723 | |||
724 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
725 | |||
726 | return (end - start) / 50; | ||
727 | } | ||
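Because the gate above is 50 ms, the result is already in kHz: cycles per 50 ms divided by 50 is cycles per millisecond. For example, on an assumed 2 GHz part:

	unsigned long start = 0, end = 100000000;   /* 100M cycles elapsed in 50 ms */
	unsigned long khz = (end - start) / 50;     /* 2000000 kHz, i.e. 2 GHz      */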
728 | |||
729 | #ifdef CONFIG_HPET | ||
730 | static __init int late_hpet_init(void) | ||
731 | { | ||
732 | struct hpet_data hd; | ||
733 | unsigned int ntimer; | ||
734 | |||
735 | if (!vxtime.hpet_address) | ||
736 | return -1; | ||
737 | |||
738 | memset(&hd, 0, sizeof (hd)); | ||
739 | |||
740 | ntimer = hpet_readl(HPET_ID); | ||
741 | ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; | ||
742 | ntimer++; | ||
743 | |||
744 | /* | ||
745 | * Register with driver. | ||
746 | * Timer0 and Timer1 are used by the platform. | ||
747 | */ | ||
748 | hd.hd_phys_address = vxtime.hpet_address; | ||
749 | hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE); | ||
750 | hd.hd_nirqs = ntimer; | ||
751 | hd.hd_flags = HPET_DATA_PLATFORM; | ||
752 | hpet_reserve_timer(&hd, 0); | ||
753 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
754 | hpet_reserve_timer(&hd, 1); | ||
755 | #endif | ||
756 | hd.hd_irq[0] = HPET_LEGACY_8254; | ||
757 | hd.hd_irq[1] = HPET_LEGACY_RTC; | ||
758 | if (ntimer > 2) { | ||
759 | struct hpet *hpet; | ||
760 | struct hpet_timer *timer; | ||
761 | int i; | ||
762 | |||
763 | hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); | ||
764 | |||
765 | for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; | ||
766 | timer++, i++) | ||
767 | hd.hd_irq[i] = (timer->hpet_config & | ||
768 | Tn_INT_ROUTE_CNF_MASK) >> | ||
769 | Tn_INT_ROUTE_CNF_SHIFT; | ||
770 | |||
771 | } | ||
772 | |||
773 | hpet_alloc(&hd); | ||
774 | return 0; | ||
775 | } | ||
776 | fs_initcall(late_hpet_init); | ||
777 | #endif | ||
778 | |||
779 | static int hpet_timer_stop_set_go(unsigned long tick) | ||
780 | { | ||
781 | unsigned int cfg; | ||
782 | |||
783 | /* | ||
784 | * Stop the timers and reset the main counter. | ||
785 | */ | ||
786 | |||
787 | cfg = hpet_readl(HPET_CFG); | ||
788 | cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); | ||
789 | hpet_writel(cfg, HPET_CFG); | ||
790 | hpet_writel(0, HPET_COUNTER); | ||
791 | hpet_writel(0, HPET_COUNTER + 4); | ||
792 | |||
793 | /* | ||
794 | * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, | ||
795 | * and period also hpet_tick. | ||
796 | */ | ||
797 | |||
798 | hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | | ||
799 | HPET_TN_32BIT, HPET_T0_CFG); | ||
800 | hpet_writel(hpet_tick, HPET_T0_CMP); | ||
801 | hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ | ||
802 | |||
803 | /* | ||
804 | * Go! | ||
805 | */ | ||
806 | |||
807 | cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; | ||
808 | hpet_writel(cfg, HPET_CFG); | ||
809 | |||
810 | return 0; | ||
811 | } | ||
812 | |||
813 | static int hpet_init(void) | ||
814 | { | ||
815 | unsigned int id; | ||
816 | |||
817 | if (!vxtime.hpet_address) | ||
818 | return -1; | ||
819 | set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); | ||
820 | __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); | ||
821 | |||
822 | /* | ||
823 | * Read the period, compute tick and quotient. | ||
824 | */ | ||
825 | |||
826 | id = hpet_readl(HPET_ID); | ||
827 | |||
828 | if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || | ||
829 | !(id & HPET_ID_LEGSUP)) | ||
830 | return -1; | ||
831 | |||
832 | hpet_period = hpet_readl(HPET_PERIOD); | ||
833 | if (hpet_period < 100000 || hpet_period > 100000000) | ||
834 | return -1; | ||
835 | |||
836 | hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / | ||
837 | hpet_period; | ||
838 | |||
839 | return hpet_timer_stop_set_go(hpet_tick); | ||
840 | } | ||
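The hpet_tick computation above converts one kernel tick into HPET counts: 10^9 femtoseconds per microsecond times the microseconds per tick, divided (with rounding) by the HPET period in femtoseconds. With assumed numbers (HZ = 1000 and a 14.318 MHz HPET):

	unsigned long period_fs = 69841279;    /* femtoseconds per HPET count (assumed) */
	unsigned int tick = (1000000000L * (1000000 / 1000) + period_fs / 2) / period_fs;
	/* ~10^12 fs per kernel tick / ~69841279 fs per count ~= 14318 counts */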
841 | |||
842 | static int hpet_reenable(void) | ||
843 | { | ||
844 | return hpet_timer_stop_set_go(hpet_tick); | ||
845 | } | ||
846 | |||
847 | void __init pit_init(void) | ||
848 | { | ||
849 | unsigned long flags; | ||
850 | |||
851 | spin_lock_irqsave(&i8253_lock, flags); | ||
852 | outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */ | ||
853 | outb_p(LATCH & 0xff, 0x40); /* LSB */ | ||
854 | outb_p(LATCH >> 8, 0x40); /* MSB */ | ||
855 | spin_unlock_irqrestore(&i8253_lock, flags); | ||
856 | } | ||
857 | |||
858 | int __init time_setup(char *str) | ||
859 | { | ||
860 | report_lost_ticks = 1; | ||
861 | return 1; | ||
862 | } | ||
863 | |||
864 | static struct irqaction irq0 = { | ||
865 | timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL | ||
866 | }; | ||
867 | |||
868 | extern void __init config_acpi_tables(void); | ||
869 | |||
870 | void __init time_init(void) | ||
871 | { | ||
872 | char *timename; | ||
873 | |||
874 | #ifdef HPET_HACK_ENABLE_DANGEROUS | ||
875 | if (!vxtime.hpet_address) { | ||
876 | printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " | ||
877 | "manually!\n"); | ||
878 | outl(0x800038a0, 0xcf8); | ||
879 | outl(0xff000001, 0xcfc); | ||
880 | outl(0x800038a0, 0xcf8); | ||
881 | vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; | ||
882 | printk(KERN_WARNING "time.c: WARNING: Enabled HPET " | ||
883 | "at %#lx.\n", vxtime.hpet_address); | ||
884 | } | ||
885 | #endif | ||
886 | if (nohpet) | ||
887 | vxtime.hpet_address = 0; | ||
888 | |||
889 | xtime.tv_sec = get_cmos_time(); | ||
890 | xtime.tv_nsec = 0; | ||
891 | |||
892 | set_normalized_timespec(&wall_to_monotonic, | ||
893 | -xtime.tv_sec, -xtime.tv_nsec); | ||
894 | |||
895 | if (!hpet_init()) { | ||
896 | vxtime_hz = (1000000000000000L + hpet_period / 2) / | ||
897 | hpet_period; | ||
898 | cpu_khz = hpet_calibrate_tsc(); | ||
899 | timename = "HPET"; | ||
900 | } else { | ||
901 | pit_init(); | ||
902 | cpu_khz = pit_calibrate_tsc(); | ||
903 | timename = "PIT"; | ||
904 | } | ||
905 | |||
906 | printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n", | ||
907 | vxtime_hz / 1000000, vxtime_hz % 1000000, timename); | ||
908 | printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", | ||
909 | cpu_khz / 1000, cpu_khz % 1000); | ||
910 | vxtime.mode = VXTIME_TSC; | ||
911 | vxtime.quot = (1000000L << 32) / vxtime_hz; | ||
912 | vxtime.tsc_quot = (1000L << 32) / cpu_khz; | ||
913 | vxtime.hz = vxtime_hz; | ||
914 | rdtscll_sync(&vxtime.last_tsc); | ||
915 | setup_irq(0, &irq0); | ||
916 | |||
917 | set_cyc2ns_scale(cpu_khz / 1000); | ||
918 | } | ||
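The quotients set above are 32.32 fixed-point factors: multiplying a raw counter delta by the quotient and shifting right by 32 yields microseconds without a division in the interrupt path. A sketch of the tsc_quot case, assuming cpu_khz = 2000000:

	unsigned long tsc_quot = (1000UL << 32) / 2000000;   /* ~2147483, i.e. 0.0005 in 32.32 fixed point */
	unsigned long cycles   = 2000000;                    /* one millisecond of cycles at 2 GHz         */
	unsigned long usec     = (cycles * tsc_quot) >> 32;  /* ~1000 us (999 after integer truncation)    */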
919 | |||
920 | void __init time_init_smp(void) | ||
921 | { | ||
922 | char *timetype; | ||
923 | |||
924 | /* | ||
925 | * AMD systems with more than one CPU don't have fully synchronized | ||
926 | * TSCs. Always use HPET gettimeofday for these, although it is slower. | ||
927 | * Intel SMP systems usually have synchronized TSCs, so always use | ||
928 | * the TSC. | ||
929 | * | ||
930 | * Exceptions: | ||
931 | * IBM Summit2 checked by oem_force_hpet_timer(). | ||
932 | * AMD dual core may also not need HPET. Check me. | ||
933 | * | ||
934 | * Can be turned off with "notsc". | ||
935 | */ | ||
936 | if (num_online_cpus() > 1 && | ||
937 | boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
938 | notsc = 1; | ||
939 | /* Some systems will want to disable TSC and use HPET. */ | ||
940 | if (oem_force_hpet_timer()) | ||
941 | notsc = 1; | ||
942 | if (vxtime.hpet_address && notsc) { | ||
943 | timetype = "HPET"; | ||
944 | vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; | ||
945 | vxtime.mode = VXTIME_HPET; | ||
946 | do_gettimeoffset = do_gettimeoffset_hpet; | ||
947 | } else { | ||
948 | timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; | ||
949 | vxtime.mode = VXTIME_TSC; | ||
950 | } | ||
951 | |||
952 | printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype); | ||
953 | } | ||
954 | |||
955 | __setup("report_lost_ticks", time_setup); | ||
956 | |||
957 | static long clock_cmos_diff; | ||
958 | static unsigned long sleep_start; | ||
959 | |||
960 | static int timer_suspend(struct sys_device *dev, u32 state) | ||
961 | { | ||
962 | /* | ||
963 | * Estimate time zone so that set_time can update the clock | ||
964 | */ | ||
965 | long cmos_time = get_cmos_time(); | ||
966 | |||
967 | clock_cmos_diff = -cmos_time; | ||
968 | clock_cmos_diff += get_seconds(); | ||
969 | sleep_start = cmos_time; | ||
970 | return 0; | ||
971 | } | ||
972 | |||
973 | static int timer_resume(struct sys_device *dev) | ||
974 | { | ||
975 | unsigned long flags; | ||
976 | unsigned long sec; | ||
977 | unsigned long ctime = get_cmos_time(); | ||
978 | unsigned long sleep_length = (ctime - sleep_start) * HZ; | ||
979 | |||
980 | if (vxtime.hpet_address) | ||
981 | hpet_reenable(); | ||
982 | else | ||
983 | i8254_timer_resume(); | ||
984 | |||
985 | sec = ctime + clock_cmos_diff; | ||
986 | write_seqlock_irqsave(&xtime_lock,flags); | ||
987 | xtime.tv_sec = sec; | ||
988 | xtime.tv_nsec = 0; | ||
989 | write_sequnlock_irqrestore(&xtime_lock,flags); | ||
990 | jiffies += sleep_length; | ||
991 | wall_jiffies += sleep_length; | ||
992 | return 0; | ||
993 | } | ||
994 | |||
995 | static struct sysdev_class timer_sysclass = { | ||
996 | .resume = timer_resume, | ||
997 | .suspend = timer_suspend, | ||
998 | set_kset_name("timer"), | ||
999 | }; | ||
1000 | |||
1001 | |||
1002 | /* XXX this driverfs stuff should probably go elsewhere later -john */ | ||
1003 | static struct sys_device device_timer = { | ||
1004 | .id = 0, | ||
1005 | .cls = &timer_sysclass, | ||
1006 | }; | ||
1007 | |||
1008 | static int time_init_device(void) | ||
1009 | { | ||
1010 | int error = sysdev_class_register(&timer_sysclass); | ||
1011 | if (!error) | ||
1012 | error = sysdev_register(&device_timer); | ||
1013 | return error; | ||
1014 | } | ||
1015 | |||
1016 | device_initcall(time_init_device); | ||
1017 | |||
1018 | #ifdef CONFIG_HPET_EMULATE_RTC | ||
1019 | /* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET | ||
1020 | * is enabled, we support RTC interrupt functionality in software. | ||
1021 | * RTC has 3 kinds of interrupts: | ||
1022 | * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock | ||
1023 | * is updated | ||
1024 | * 2) Alarm Interrupt - generate an interrupt at a specific time of day | ||
1025 | * 3) Periodic Interrupt - generate periodic interrupt, with frequencies | ||
1026 | * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) | ||
1027 | * (1) and (2) above are implemented using polling at a frequency of | ||
1028 | * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt | ||
1029 | * overhead. (DEFAULT_RTC_INT_FREQ) | ||
1030 | * For (3), we use interrupts at 64Hz or user specified periodic | ||
1031 | * frequency, whichever is higher. | ||
1032 | */ | ||
1033 | #include <linux/rtc.h> | ||
1034 | |||
1035 | extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); | ||
1036 | |||
1037 | #define DEFAULT_RTC_INT_FREQ 64 | ||
1038 | #define RTC_NUM_INTS 1 | ||
1039 | |||
1040 | static unsigned long UIE_on; | ||
1041 | static unsigned long prev_update_sec; | ||
1042 | |||
1043 | static unsigned long AIE_on; | ||
1044 | static struct rtc_time alarm_time; | ||
1045 | |||
1046 | static unsigned long PIE_on; | ||
1047 | static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; | ||
1048 | static unsigned long PIE_count; | ||
1049 | |||
1050 | static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ | ||
1051 | |||
1052 | int is_hpet_enabled(void) | ||
1053 | { | ||
1054 | return vxtime.hpet_address != 0; | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Timer 1 for RTC, we do not use periodic interrupt feature, | ||
1059 | * even if HPET supports periodic interrupts on Timer 1. | ||
1060 | * The reason being, to set up a periodic interrupt in HPET, we need to | ||
1061 | * stop the main counter. And if we do that every time someone disables/enables | ||
1062 | * RTC, we will have an adverse effect on the main kernel timer running on Timer 0. | ||
1063 | * So, for the time being, simulate the periodic interrupt in software. | ||
1064 | * | ||
1065 | * hpet_rtc_timer_init() is called for the first time and during subsequent | ||
1066 | * interrupts, reinit happens through hpet_rtc_timer_reinit(). | ||
1067 | */ | ||
1068 | int hpet_rtc_timer_init(void) | ||
1069 | { | ||
1070 | unsigned int cfg, cnt; | ||
1071 | unsigned long flags; | ||
1072 | |||
1073 | if (!is_hpet_enabled()) | ||
1074 | return 0; | ||
1075 | /* | ||
1076 | * Set the counter 1 and enable the interrupts. | ||
1077 | */ | ||
1078 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
1079 | hpet_rtc_int_freq = PIE_freq; | ||
1080 | else | ||
1081 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
1082 | |||
1083 | local_irq_save(flags); | ||
1084 | cnt = hpet_readl(HPET_COUNTER); | ||
1085 | cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); | ||
1086 | hpet_writel(cnt, HPET_T1_CMP); | ||
1087 | local_irq_restore(flags); | ||
1088 | |||
1089 | cfg = hpet_readl(HPET_T1_CFG); | ||
1090 | cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; | ||
1091 | hpet_writel(cfg, HPET_T1_CFG); | ||
1092 | |||
1093 | return 1; | ||
1094 | } | ||
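The comparator arithmetic above schedules the next emulated RTC interrupt hpet_tick * HZ / freq counts in the future, i.e. 1/freq seconds. With assumed values (hpet_tick = 14318 at HZ = 1000, and the default 64 Hz rate):

	unsigned int cnt = 0;                          /* current HPET_COUNTER reading (assumed) */
	unsigned int next = cnt + (14318 * 1000) / 64; /* 223718 counts ~= 15.6 ms ahead         */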
1095 | |||
1096 | static void hpet_rtc_timer_reinit(void) | ||
1097 | { | ||
1098 | unsigned int cfg, cnt; | ||
1099 | |||
1100 | if (!(PIE_on | AIE_on | UIE_on)) | ||
1101 | return; | ||
1102 | |||
1103 | if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) | ||
1104 | hpet_rtc_int_freq = PIE_freq; | ||
1105 | else | ||
1106 | hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; | ||
1107 | |||
1108 | /* It is more accurate to use the comparator value than current count.*/ | ||
1109 | cnt = hpet_readl(HPET_T1_CMP); | ||
1110 | cnt += hpet_tick*HZ/hpet_rtc_int_freq; | ||
1111 | hpet_writel(cnt, HPET_T1_CMP); | ||
1112 | |||
1113 | cfg = hpet_readl(HPET_T1_CFG); | ||
1114 | cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; | ||
1115 | hpet_writel(cfg, HPET_T1_CFG); | ||
1116 | |||
1117 | return; | ||
1118 | } | ||
1119 | |||
1120 | /* | ||
1121 | * The functions below are called from rtc driver. | ||
1122 | * Return 0 if HPET is not being used. | ||
1123 | * Otherwise do the necessary changes and return 1. | ||
1124 | */ | ||
1125 | int hpet_mask_rtc_irq_bit(unsigned long bit_mask) | ||
1126 | { | ||
1127 | if (!is_hpet_enabled()) | ||
1128 | return 0; | ||
1129 | |||
1130 | if (bit_mask & RTC_UIE) | ||
1131 | UIE_on = 0; | ||
1132 | if (bit_mask & RTC_PIE) | ||
1133 | PIE_on = 0; | ||
1134 | if (bit_mask & RTC_AIE) | ||
1135 | AIE_on = 0; | ||
1136 | |||
1137 | return 1; | ||
1138 | } | ||
1139 | |||
1140 | int hpet_set_rtc_irq_bit(unsigned long bit_mask) | ||
1141 | { | ||
1142 | int timer_init_reqd = 0; | ||
1143 | |||
1144 | if (!is_hpet_enabled()) | ||
1145 | return 0; | ||
1146 | |||
1147 | if (!(PIE_on | AIE_on | UIE_on)) | ||
1148 | timer_init_reqd = 1; | ||
1149 | |||
1150 | if (bit_mask & RTC_UIE) { | ||
1151 | UIE_on = 1; | ||
1152 | } | ||
1153 | if (bit_mask & RTC_PIE) { | ||
1154 | PIE_on = 1; | ||
1155 | PIE_count = 0; | ||
1156 | } | ||
1157 | if (bit_mask & RTC_AIE) { | ||
1158 | AIE_on = 1; | ||
1159 | } | ||
1160 | |||
1161 | if (timer_init_reqd) | ||
1162 | hpet_rtc_timer_init(); | ||
1163 | |||
1164 | return 1; | ||
1165 | } | ||
1166 | |||
1167 | int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) | ||
1168 | { | ||
1169 | if (!is_hpet_enabled()) | ||
1170 | return 0; | ||
1171 | |||
1172 | alarm_time.tm_hour = hrs; | ||
1173 | alarm_time.tm_min = min; | ||
1174 | alarm_time.tm_sec = sec; | ||
1175 | |||
1176 | return 1; | ||
1177 | } | ||
1178 | |||
1179 | int hpet_set_periodic_freq(unsigned long freq) | ||
1180 | { | ||
1181 | if (!is_hpet_enabled()) | ||
1182 | return 0; | ||
1183 | |||
1184 | PIE_freq = freq; | ||
1185 | PIE_count = 0; | ||
1186 | |||
1187 | return 1; | ||
1188 | } | ||
1189 | |||
1190 | int hpet_rtc_dropped_irq(void) | ||
1191 | { | ||
1192 | if (!is_hpet_enabled()) | ||
1193 | return 0; | ||
1194 | |||
1195 | return 1; | ||
1196 | } | ||
1197 | |||
1198 | irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) | ||
1199 | { | ||
1200 | struct rtc_time curr_time; | ||
1201 | unsigned long rtc_int_flag = 0; | ||
1202 | int call_rtc_interrupt = 0; | ||
1203 | |||
1204 | hpet_rtc_timer_reinit(); | ||
1205 | |||
1206 | if (UIE_on | AIE_on) { | ||
1207 | rtc_get_rtc_time(&curr_time); | ||
1208 | } | ||
1209 | if (UIE_on) { | ||
1210 | if (curr_time.tm_sec != prev_update_sec) { | ||
1211 | /* Set update int info, call real rtc int routine */ | ||
1212 | call_rtc_interrupt = 1; | ||
1213 | rtc_int_flag = RTC_UF; | ||
1214 | prev_update_sec = curr_time.tm_sec; | ||
1215 | } | ||
1216 | } | ||
1217 | if (PIE_on) { | ||
1218 | PIE_count++; | ||
1219 | if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { | ||
1220 | /* Set periodic int info, call real rtc int routine */ | ||
1221 | call_rtc_interrupt = 1; | ||
1222 | rtc_int_flag |= RTC_PF; | ||
1223 | PIE_count = 0; | ||
1224 | } | ||
1225 | } | ||
1226 | if (AIE_on) { | ||
1227 | if ((curr_time.tm_sec == alarm_time.tm_sec) && | ||
1228 | (curr_time.tm_min == alarm_time.tm_min) && | ||
1229 | (curr_time.tm_hour == alarm_time.tm_hour)) { | ||
1230 | /* Set alarm int info, call real rtc int routine */ | ||
1231 | call_rtc_interrupt = 1; | ||
1232 | rtc_int_flag |= RTC_AF; | ||
1233 | } | ||
1234 | } | ||
1235 | if (call_rtc_interrupt) { | ||
1236 | rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); | ||
1237 | rtc_interrupt(rtc_int_flag, dev_id, regs); | ||
1238 | } | ||
1239 | return IRQ_HANDLED; | ||
1240 | } | ||
1241 | #endif | ||
1242 | |||
1243 | |||
1244 | |||
1245 | static int __init nohpet_setup(char *s) | ||
1246 | { | ||
1247 | nohpet = 1; | ||
1248 | return 0; | ||
1249 | } | ||
1250 | |||
1251 | __setup("nohpet", nohpet_setup); | ||
1252 | |||
1253 | |||
1254 | static int __init notsc_setup(char *s) | ||
1255 | { | ||
1256 | notsc = 1; | ||
1257 | return 0; | ||
1258 | } | ||
1259 | |||
1260 | __setup("notsc", notsc_setup); | ||
1261 | |||
1262 | |||
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S new file mode 100644 index 000000000000..6d9c9a8e7d0e --- /dev/null +++ b/arch/x86_64/kernel/trampoline.S | |||
@@ -0,0 +1,64 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Trampoline.S Derived from Setup.S by Linus Torvalds | ||
4 | * | ||
5 | * 4 Jan 1997 Michael Chastain: changed to gnu as. | ||
6 | * | ||
7 | * Entry: CS:IP point to the start of our code, we are | ||
8 | * in real mode with no stack, but the rest of the | ||
9 | * trampoline page to make our stack and everything else | ||
10 | * is a mystery. | ||
11 | * | ||
12 | * In fact we don't actually need a stack so we don't | ||
13 | * set one up. | ||
14 | * | ||
15 | * On entry to trampoline_data, the processor is in real mode | ||
16 | * with 16-bit addressing and 16-bit data. CS has some value | ||
17 | * and IP is zero. Thus, data addresses need to be absolute | ||
18 | * (no relocation) and are taken with regard to r_base. | ||
19 | * | ||
20 | * If you work on this file, check the object module with objdump | ||
21 | * --full-contents --reloc to make sure there are no relocation | ||
22 | * entries. For the GDT entry we do hand relocation in smpboot.c | ||
23 | * because of 64bit linker limitations. | ||
24 | */ | ||
25 | |||
26 | #include <linux/linkage.h> | ||
27 | #include <asm/segment.h> | ||
28 | #include <asm/page.h> | ||
29 | |||
30 | .data | ||
31 | |||
32 | .code16 | ||
33 | |||
34 | ENTRY(trampoline_data) | ||
35 | r_base = . | ||
36 | wbinvd | ||
37 | mov %cs, %ax # Code and data in the same place | ||
38 | mov %ax, %ds | ||
39 | |||
40 | cli # We should be safe anyway | ||
41 | |||
42 | movl $0xA5A5A5A5, trampoline_data - r_base | ||
43 | # write marker so the master knows we're running | ||
44 | |||
45 | lidt idt_48 - r_base # load idt with 0, 0 | ||
46 | lgdt gdt_48 - r_base # load gdt with whatever is appropriate | ||
47 | |||
48 | xor %ax, %ax | ||
49 | inc %ax # protected mode (PE) bit | ||
50 | lmsw %ax # into protected mode | ||
51 | # flush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S | ||
52 | ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) | ||
53 | |||
54 | # Careful these need to be in the same 64K segment as the above; | ||
55 | idt_48: | ||
56 | .word 0 # idt limit = 0 | ||
57 | .word 0, 0 # idt base = 0L | ||
58 | |||
59 | gdt_48: | ||
60 | .short __KERNEL32_CS + 7 # gdt limit | ||
61 | .long cpu_gdt_table-__START_KERNEL_map | ||
62 | |||
63 | .globl trampoline_end | ||
64 | trampoline_end: | ||
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c new file mode 100644 index 000000000000..d87ac6496cc2 --- /dev/null +++ b/arch/x86_64/kernel/traps.c | |||
@@ -0,0 +1,948 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/traps.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
6 | * | ||
7 | * Pentium III FXSR, SSE support | ||
8 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
9 | * | ||
10 | * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * 'Traps.c' handles hardware traps and faults after we have saved some | ||
15 | * state in 'entry.S'. | ||
16 | */ | ||
17 | #include <linux/config.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/errno.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/timer.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/spinlock.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/moduleparam.h> | ||
31 | |||
32 | #include <asm/system.h> | ||
33 | #include <asm/uaccess.h> | ||
34 | #include <asm/io.h> | ||
35 | #include <asm/atomic.h> | ||
36 | #include <asm/debugreg.h> | ||
37 | #include <asm/desc.h> | ||
38 | #include <asm/i387.h> | ||
39 | #include <asm/kdebug.h> | ||
40 | #include <asm/processor.h> | ||
41 | |||
42 | #include <asm/smp.h> | ||
43 | #include <asm/pgalloc.h> | ||
44 | #include <asm/pda.h> | ||
45 | #include <asm/proto.h> | ||
46 | #include <asm/nmi.h> | ||
47 | |||
48 | #include <linux/irq.h> | ||
49 | |||
50 | |||
51 | extern struct gate_struct idt_table[256]; | ||
52 | |||
53 | asmlinkage void divide_error(void); | ||
54 | asmlinkage void debug(void); | ||
55 | asmlinkage void nmi(void); | ||
56 | asmlinkage void int3(void); | ||
57 | asmlinkage void overflow(void); | ||
58 | asmlinkage void bounds(void); | ||
59 | asmlinkage void invalid_op(void); | ||
60 | asmlinkage void device_not_available(void); | ||
61 | asmlinkage void double_fault(void); | ||
62 | asmlinkage void coprocessor_segment_overrun(void); | ||
63 | asmlinkage void invalid_TSS(void); | ||
64 | asmlinkage void segment_not_present(void); | ||
65 | asmlinkage void stack_segment(void); | ||
66 | asmlinkage void general_protection(void); | ||
67 | asmlinkage void page_fault(void); | ||
68 | asmlinkage void coprocessor_error(void); | ||
69 | asmlinkage void simd_coprocessor_error(void); | ||
70 | asmlinkage void reserved(void); | ||
71 | asmlinkage void alignment_check(void); | ||
72 | asmlinkage void machine_check(void); | ||
73 | asmlinkage void spurious_interrupt_bug(void); | ||
74 | asmlinkage void call_debug(void); | ||
75 | |||
76 | struct notifier_block *die_chain; | ||
77 | static DEFINE_SPINLOCK(die_notifier_lock); | ||
78 | |||
79 | int register_die_notifier(struct notifier_block *nb) | ||
80 | { | ||
81 | int err = 0; | ||
82 | unsigned long flags; | ||
83 | spin_lock_irqsave(&die_notifier_lock, flags); | ||
84 | err = notifier_chain_register(&die_chain, nb); | ||
85 | spin_unlock_irqrestore(&die_notifier_lock, flags); | ||
86 | return err; | ||
87 | } | ||
88 | |||
89 | static inline void conditional_sti(struct pt_regs *regs) | ||
90 | { | ||
91 | if (regs->eflags & X86_EFLAGS_IF) | ||
92 | local_irq_enable(); | ||
93 | } | ||
94 | |||
95 | static int kstack_depth_to_print = 10; | ||
96 | |||
97 | #ifdef CONFIG_KALLSYMS | ||
98 | #include <linux/kallsyms.h> | ||
99 | int printk_address(unsigned long address) | ||
100 | { | ||
101 | unsigned long offset = 0, symsize; | ||
102 | const char *symname; | ||
103 | char *modname; | ||
104 | char *delim = ":"; | ||
105 | char namebuf[128]; | ||
106 | |||
107 | symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); | ||
108 | if (!symname) | ||
109 | return printk("[<%016lx>]", address); | ||
110 | if (!modname) | ||
111 | modname = delim = ""; | ||
112 | return printk("<%016lx>{%s%s%s%s%+ld}", | ||
113 | address,delim,modname,delim,symname,offset); | ||
114 | } | ||
115 | #else | ||
116 | int printk_address(unsigned long address) | ||
117 | { | ||
118 | return printk("[<%016lx>]", address); | ||
119 | } | ||
120 | #endif | ||
121 | |||
122 | unsigned long *in_exception_stack(int cpu, unsigned long stack) | ||
123 | { | ||
124 | int k; | ||
125 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
126 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | ||
127 | unsigned long start = tss->ist[k] - EXCEPTION_STKSZ; | ||
128 | |||
129 | if (stack >= start && stack < tss->ist[k]) | ||
130 | return (unsigned long *)tss->ist[k]; | ||
131 | } | ||
132 | return NULL; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * x86-64 can have up to three kernel stacks: | ||
137 | * process stack | ||
138 | * interrupt stack | ||
139 | * severe exception (double fault, nmi, stack fault) hardware stack | ||
140 | * Check and process them in order. | ||
141 | */ | ||
142 | |||
143 | void show_trace(unsigned long *stack) | ||
144 | { | ||
145 | unsigned long addr; | ||
146 | unsigned long *irqstack, *irqstack_end, *estack_end; | ||
147 | const int cpu = safe_smp_processor_id(); | ||
148 | int i; | ||
149 | |||
150 | printk("\nCall Trace:"); | ||
151 | i = 0; | ||
152 | |||
153 | estack_end = in_exception_stack(cpu, (unsigned long)stack); | ||
154 | if (estack_end) { | ||
155 | while (stack < estack_end) { | ||
156 | addr = *stack++; | ||
157 | if (__kernel_text_address(addr)) { | ||
158 | i += printk_address(addr); | ||
159 | i += printk(" "); | ||
160 | if (i > 50) { | ||
161 | printk("\n"); | ||
162 | i = 0; | ||
163 | } | ||
164 | } | ||
165 | } | ||
166 | i += printk(" <EOE> "); | ||
167 | i += 7; | ||
168 | stack = (unsigned long *) estack_end[-2]; | ||
169 | } | ||
170 | |||
171 | irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); | ||
172 | irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64); | ||
173 | |||
174 | if (stack >= irqstack && stack < irqstack_end) { | ||
175 | printk("<IRQ> "); | ||
176 | while (stack < irqstack_end) { | ||
177 | addr = *stack++; | ||
178 | /* | ||
179 | * If the address is either in the text segment of the | ||
180 | * kernel, or in the region which contains vmalloc'ed | ||
181 | * memory, it *may* be the address of a calling | ||
182 | * routine; if so, print it so that someone tracing | ||
183 | * down the cause of the crash will be able to figure | ||
184 | * out the call path that was taken. | ||
185 | */ | ||
186 | if (__kernel_text_address(addr)) { | ||
187 | i += printk_address(addr); | ||
188 | i += printk(" "); | ||
189 | if (i > 50) { | ||
190 | printk("\n "); | ||
191 | i = 0; | ||
192 | } | ||
193 | } | ||
194 | } | ||
195 | stack = (unsigned long *) (irqstack_end[-1]); | ||
196 | printk(" <EOI> "); | ||
197 | i += 7; | ||
198 | } | ||
199 | |||
200 | while (((long) stack & (THREAD_SIZE-1)) != 0) { | ||
201 | addr = *stack++; | ||
202 | if (__kernel_text_address(addr)) { | ||
203 | i += printk_address(addr); | ||
204 | i += printk(" "); | ||
205 | if (i > 50) { | ||
206 | printk("\n "); | ||
207 | i = 0; | ||
208 | } | ||
209 | } | ||
210 | } | ||
211 | printk("\n"); | ||
212 | } | ||
213 | |||
214 | void show_stack(struct task_struct *tsk, unsigned long * rsp) | ||
215 | { | ||
216 | unsigned long *stack; | ||
217 | int i; | ||
218 | const int cpu = safe_smp_processor_id(); | ||
219 | unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); | ||
220 | unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE); | ||
221 | |||
222 | // debugging aid: "show_stack(NULL, NULL);" prints the | ||
223 | // back trace for this cpu. | ||
224 | |||
225 | if (rsp == NULL) { | ||
226 | if (tsk) | ||
227 | rsp = (unsigned long *)tsk->thread.rsp; | ||
228 | else | ||
229 | rsp = (unsigned long *)&rsp; | ||
230 | } | ||
231 | |||
232 | stack = rsp; | ||
233 | for(i=0; i < kstack_depth_to_print; i++) { | ||
234 | if (stack >= irqstack && stack <= irqstack_end) { | ||
235 | if (stack == irqstack_end) { | ||
236 | stack = (unsigned long *) (irqstack_end[-1]); | ||
237 | printk(" <EOI> "); | ||
238 | } | ||
239 | } else { | ||
240 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
241 | break; | ||
242 | } | ||
243 | if (i && ((i % 4) == 0)) | ||
244 | printk("\n "); | ||
245 | printk("%016lx ", *stack++); | ||
246 | } | ||
247 | show_trace((unsigned long *)rsp); | ||
248 | } | ||
249 | |||
250 | /* | ||
251 | * The architecture-independent dump_stack generator | ||
252 | */ | ||
253 | void dump_stack(void) | ||
254 | { | ||
255 | unsigned long dummy; | ||
256 | show_trace(&dummy); | ||
257 | } | ||
258 | |||
259 | EXPORT_SYMBOL(dump_stack); | ||
260 | |||
261 | void show_registers(struct pt_regs *regs) | ||
262 | { | ||
263 | int i; | ||
264 | int in_kernel = (regs->cs & 3) == 0; | ||
265 | unsigned long rsp; | ||
266 | const int cpu = safe_smp_processor_id(); | ||
267 | struct task_struct *cur = cpu_pda[cpu].pcurrent; | ||
268 | |||
269 | rsp = regs->rsp; | ||
270 | |||
271 | printk("CPU %d ", cpu); | ||
272 | __show_regs(regs); | ||
273 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
274 | cur->comm, cur->pid, cur->thread_info, cur); | ||
275 | |||
276 | /* | ||
277 | * When in-kernel, we also print out the stack and code at the | ||
278 | * time of the fault.. | ||
279 | */ | ||
280 | if (in_kernel) { | ||
281 | |||
282 | printk("Stack: "); | ||
283 | show_stack(NULL, (unsigned long*)rsp); | ||
284 | |||
285 | printk("\nCode: "); | ||
286 | if(regs->rip < PAGE_OFFSET) | ||
287 | goto bad; | ||
288 | |||
289 | for(i=0;i<20;i++) | ||
290 | { | ||
291 | unsigned char c; | ||
292 | if(__get_user(c, &((unsigned char*)regs->rip)[i])) { | ||
293 | bad: | ||
294 | printk(" Bad RIP value."); | ||
295 | break; | ||
296 | } | ||
297 | printk("%02x ", c); | ||
298 | } | ||
299 | } | ||
300 | printk("\n"); | ||
301 | } | ||
302 | |||
303 | void handle_BUG(struct pt_regs *regs) | ||
304 | { | ||
305 | struct bug_frame f; | ||
306 | char tmp; | ||
307 | |||
308 | if (regs->cs & 3) | ||
309 | return; | ||
310 | if (__copy_from_user(&f, (struct bug_frame *) regs->rip, | ||
311 | sizeof(struct bug_frame))) | ||
312 | return; | ||
313 | if ((unsigned long)f.filename < __PAGE_OFFSET || | ||
314 | f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) | ||
315 | return; | ||
316 | if (__get_user(tmp, f.filename)) | ||
317 | f.filename = "unmapped filename"; | ||
318 | printk("----------- [cut here ] --------- [please bite here ] ---------\n"); | ||
319 | printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); | ||
320 | } | ||
321 | |||
322 | void out_of_line_bug(void) | ||
323 | { | ||
324 | BUG(); | ||
325 | } | ||
326 | |||
327 | static DEFINE_SPINLOCK(die_lock); | ||
328 | static int die_owner = -1; | ||
329 | |||
330 | void oops_begin(void) | ||
331 | { | ||
332 | int cpu = safe_smp_processor_id(); | ||
333 | /* racy, but better than risking deadlock. */ | ||
334 | local_irq_disable(); | ||
335 | if (!spin_trylock(&die_lock)) { | ||
336 | if (cpu == die_owner) | ||
337 | /* nested oops. should stop eventually */; | ||
338 | else | ||
339 | spin_lock(&die_lock); | ||
340 | } | ||
341 | die_owner = cpu; | ||
342 | console_verbose(); | ||
343 | bust_spinlocks(1); | ||
344 | } | ||
345 | |||
346 | void oops_end(void) | ||
347 | { | ||
348 | die_owner = -1; | ||
349 | bust_spinlocks(0); | ||
350 | spin_unlock(&die_lock); | ||
351 | if (panic_on_oops) | ||
352 | panic("Oops"); | ||
353 | } | ||
354 | |||
355 | void __die(const char * str, struct pt_regs * regs, long err) | ||
356 | { | ||
357 | static int die_counter; | ||
358 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); | ||
359 | #ifdef CONFIG_PREEMPT | ||
360 | printk("PREEMPT "); | ||
361 | #endif | ||
362 | #ifdef CONFIG_SMP | ||
363 | printk("SMP "); | ||
364 | #endif | ||
365 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
366 | printk("DEBUG_PAGEALLOC"); | ||
367 | #endif | ||
368 | printk("\n"); | ||
369 | notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); | ||
370 | show_registers(regs); | ||
371 | /* Executive summary in case the oops scrolled away */ | ||
372 | printk(KERN_ALERT "RIP "); | ||
373 | printk_address(regs->rip); | ||
374 | printk(" RSP <%016lx>\n", regs->rsp); | ||
375 | } | ||
376 | |||
377 | void die(const char * str, struct pt_regs * regs, long err) | ||
378 | { | ||
379 | oops_begin(); | ||
380 | handle_BUG(regs); | ||
381 | __die(str, regs, err); | ||
382 | oops_end(); | ||
383 | do_exit(SIGSEGV); | ||
384 | } | ||
385 | static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) | ||
386 | { | ||
387 | if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS)) | ||
388 | die(str, regs, err); | ||
389 | } | ||
390 | |||
391 | void die_nmi(char *str, struct pt_regs *regs) | ||
392 | { | ||
393 | oops_begin(); | ||
394 | /* | ||
395 | * We are in trouble anyway, let's at least try | ||
396 | * to get a message out. | ||
397 | */ | ||
398 | printk(str, safe_smp_processor_id()); | ||
399 | show_registers(regs); | ||
400 | if (panic_on_timeout || panic_on_oops) | ||
401 | panic("nmi watchdog"); | ||
402 | printk("console shuts up ...\n"); | ||
403 | oops_end(); | ||
404 | do_exit(SIGSEGV); | ||
405 | } | ||
406 | |||
407 | static void do_trap(int trapnr, int signr, char *str, | ||
408 | struct pt_regs * regs, long error_code, siginfo_t *info) | ||
409 | { | ||
410 | conditional_sti(regs); | ||
411 | |||
412 | #ifdef CONFIG_CHECKING | ||
413 | { | ||
414 | unsigned long gs; | ||
415 | struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); | ||
416 | rdmsrl(MSR_GS_BASE, gs); | ||
417 | if (gs != (unsigned long)pda) { | ||
418 | wrmsrl(MSR_GS_BASE, pda); | ||
419 | printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, | ||
420 | regs->rip); | ||
421 | } | ||
422 | } | ||
423 | #endif | ||
424 | |||
425 | if ((regs->cs & 3) != 0) { | ||
426 | struct task_struct *tsk = current; | ||
427 | |||
428 | if (exception_trace && unhandled_signal(tsk, signr)) | ||
429 | printk(KERN_INFO | ||
430 | "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", | ||
431 | tsk->comm, tsk->pid, str, | ||
432 | regs->rip,regs->rsp,error_code); | ||
433 | |||
434 | tsk->thread.error_code = error_code; | ||
435 | tsk->thread.trap_no = trapnr; | ||
436 | if (info) | ||
437 | force_sig_info(signr, info, tsk); | ||
438 | else | ||
439 | force_sig(signr, tsk); | ||
440 | return; | ||
441 | } | ||
442 | |||
443 | |||
444 | /* kernel trap */ | ||
445 | { | ||
446 | const struct exception_table_entry *fixup; | ||
447 | fixup = search_exception_tables(regs->rip); | ||
448 | if (fixup) { | ||
449 | regs->rip = fixup->fixup; | ||
450 | } else | ||
451 | die(str, regs, error_code); | ||
452 | return; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | #define DO_ERROR(trapnr, signr, str, name) \ | ||
457 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
458 | { \ | ||
459 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
460 | == NOTIFY_STOP) \ | ||
461 | return; \ | ||
462 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
463 | } | ||
464 | |||
465 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
466 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
467 | { \ | ||
468 | siginfo_t info; \ | ||
469 | info.si_signo = signr; \ | ||
470 | info.si_errno = 0; \ | ||
471 | info.si_code = sicode; \ | ||
472 | info.si_addr = (void __user *)siaddr; \ | ||
473 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
474 | == NOTIFY_STOP) \ | ||
475 | return; \ | ||
476 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
477 | } | ||
478 | |||
479 | DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) | ||
480 | DO_ERROR( 4, SIGSEGV, "overflow", overflow) | ||
481 | DO_ERROR( 5, SIGSEGV, "bounds", bounds) | ||
482 | DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip) | ||
483 | DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) | ||
484 | DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
485 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | ||
486 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | ||
487 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | ||
488 | DO_ERROR(18, SIGSEGV, "reserved", reserved) | ||
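For readers unfamiliar with the token pasting above, each generated handler is a thin wrapper; for instance, DO_ERROR(4, SIGSEGV, "overflow", overflow) expands to approximately:

	asmlinkage void do_overflow(struct pt_regs *regs, long error_code)
	{
		if (notify_die(DIE_TRAP, "overflow", regs, error_code, 4, SIGSEGV)
								== NOTIFY_STOP)
			return;
		do_trap(4, SIGSEGV, "overflow", regs, error_code, NULL);
	}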
489 | |||
490 | #define DO_ERROR_STACK(trapnr, signr, str, name) \ | ||
491 | asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \ | ||
492 | { \ | ||
493 | struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; \ | ||
494 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
495 | == NOTIFY_STOP) \ | ||
496 | return regs; \ | ||
497 | if (regs->cs & 3) { \ | ||
498 | memcpy(pr, regs, sizeof(struct pt_regs)); \ | ||
499 | regs = pr; \ | ||
500 | } \ | ||
501 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
502 | return regs; \ | ||
503 | } | ||
504 | |||
505 | DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment) | ||
506 | DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault) | ||
507 | |||
508 | asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) | ||
509 | { | ||
510 | conditional_sti(regs); | ||
511 | |||
512 | #ifdef CONFIG_CHECKING | ||
513 | { | ||
514 | unsigned long gs; | ||
515 | struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); | ||
516 | rdmsrl(MSR_GS_BASE, gs); | ||
517 | if (gs != (unsigned long)pda) { | ||
518 | wrmsrl(MSR_GS_BASE, pda); | ||
519 | oops_in_progress++; | ||
520 | printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); | ||
521 | oops_in_progress--; | ||
522 | } | ||
523 | } | ||
524 | #endif | ||
525 | |||
526 | if ((regs->cs & 3)!=0) { | ||
527 | struct task_struct *tsk = current; | ||
528 | |||
529 | if (exception_trace && unhandled_signal(tsk, SIGSEGV)) | ||
530 | printk(KERN_INFO | ||
531 | "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", | ||
532 | tsk->comm, tsk->pid, | ||
533 | regs->rip,regs->rsp,error_code); | ||
534 | |||
535 | tsk->thread.error_code = error_code; | ||
536 | tsk->thread.trap_no = 13; | ||
537 | force_sig(SIGSEGV, tsk); | ||
538 | return; | ||
539 | } | ||
540 | |||
541 | /* kernel gp */ | ||
542 | { | ||
543 | const struct exception_table_entry *fixup; | ||
544 | fixup = search_exception_tables(regs->rip); | ||
545 | if (fixup) { | ||
546 | regs->rip = fixup->fixup; | ||
547 | return; | ||
548 | } | ||
549 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
550 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | ||
551 | return; | ||
552 | die("general protection fault", regs, error_code); | ||
553 | } | ||
554 | } | ||
555 | |||
556 | static void mem_parity_error(unsigned char reason, struct pt_regs * regs) | ||
557 | { | ||
558 | printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); | ||
559 | printk("You probably have a hardware problem with your RAM chips\n"); | ||
560 | |||
561 | /* Clear and disable the memory parity error line. */ | ||
562 | reason = (reason & 0xf) | 4; | ||
563 | outb(reason, 0x61); | ||
564 | } | ||
565 | |||
566 | static void io_check_error(unsigned char reason, struct pt_regs * regs) | ||
567 | { | ||
568 | printk("NMI: IOCK error (debug interrupt?)\n"); | ||
569 | show_registers(regs); | ||
570 | |||
571 | /* Re-enable the IOCK line, wait for a few seconds */ | ||
572 | reason = (reason & 0xf) | 8; | ||
573 | outb(reason, 0x61); | ||
574 | mdelay(2000); | ||
575 | reason &= ~8; | ||
576 | outb(reason, 0x61); | ||
577 | } | ||
578 | |||
579 | static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | ||
580 | { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); | ||
581 | printk("Dazed and confused, but trying to continue\n"); | ||
582 | printk("Do you have a strange power saving mode enabled?\n"); | ||
583 | } | ||
584 | |||
585 | asmlinkage void default_do_nmi(struct pt_regs *regs) | ||
586 | { | ||
587 | unsigned char reason = 0; | ||
588 | |||
589 | /* Only the BSP gets external NMIs from the system. */ | ||
590 | if (!smp_processor_id()) | ||
591 | reason = get_nmi_reason(); | ||
592 | |||
593 | if (!(reason & 0xc0)) { | ||
594 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) | ||
595 | == NOTIFY_STOP) | ||
596 | return; | ||
597 | #ifdef CONFIG_X86_LOCAL_APIC | ||
598 | /* | ||
599 | * Ok, so this is none of the documented NMI sources, | ||
600 | * so it must be the NMI watchdog. | ||
601 | */ | ||
602 | if (nmi_watchdog > 0) { | ||
603 | nmi_watchdog_tick(regs,reason); | ||
604 | return; | ||
605 | } | ||
606 | #endif | ||
607 | unknown_nmi_error(reason, regs); | ||
608 | return; | ||
609 | } | ||
610 | if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) | ||
611 | return; | ||
612 | |||
613 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | ||
614 | |||
615 | if (reason & 0x80) | ||
616 | mem_parity_error(reason, regs); | ||
617 | if (reason & 0x40) | ||
618 | io_check_error(reason, regs); | ||
619 | } | ||
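default_do_nmi() above dispatches purely on bits 6 and 7 of the status byte read from system port 0x61. A compact restatement of that decode as a hypothetical helper (the function name is illustrative, not kernel API):

	/* bit 7 (0x80): memory parity/SERR error; bit 6 (0x40): I/O check (IOCHK);
	 * neither set: an undocumented source, e.g. the NMI watchdog. */
	static const char *nmi_reason_name(unsigned char reason)
	{
		if (reason & 0x80)
			return "memory parity error";
		if (reason & 0x40)
			return "I/O check error";
		return "unknown source (possibly NMI watchdog)";
	}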
620 | |||
621 | asmlinkage void do_int3(struct pt_regs * regs, long error_code) | ||
622 | { | ||
623 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { | ||
624 | return; | ||
625 | } | ||
626 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | ||
627 | return; | ||
628 | } | ||
629 | |||
630 | /* runs on IST stack. */ | ||
631 | asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) | ||
632 | { | ||
633 | struct pt_regs *pr; | ||
634 | unsigned long condition; | ||
635 | struct task_struct *tsk = current; | ||
636 | siginfo_t info; | ||
637 | |||
638 | pr = (struct pt_regs *)(current->thread.rsp0)-1; | ||
639 | if (regs->cs & 3) { | ||
640 | memcpy(pr, regs, sizeof(struct pt_regs)); | ||
641 | regs = pr; | ||
642 | } | ||
643 | |||
644 | #ifdef CONFIG_CHECKING | ||
645 | { | ||
646 | /* RED-PEN interaction with debugger - could destroy gs */ | ||
647 | unsigned long gs; | ||
648 | struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); | ||
649 | rdmsrl(MSR_GS_BASE, gs); | ||
650 | if (gs != (unsigned long)pda) { | ||
651 | wrmsrl(MSR_GS_BASE, pda); | ||
652 | printk("debug handler: wrong gs %lx expected %p\n", gs, pda); | ||
653 | } | ||
654 | } | ||
655 | #endif | ||
656 | |||
657 | asm("movq %%db6,%0" : "=r" (condition)); | ||
658 | |||
659 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | ||
660 | SIGTRAP) == NOTIFY_STOP) { | ||
661 | return regs; | ||
662 | } | ||
663 | conditional_sti(regs); | ||
664 | |||
665 | /* Mask out spurious debug traps due to lazy DR7 setting */ | ||
666 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | ||
667 | if (!tsk->thread.debugreg7) { | ||
668 | goto clear_dr7; | ||
669 | } | ||
670 | } | ||
671 | |||
672 | tsk->thread.debugreg6 = condition; | ||
673 | |||
674 | /* Mask out spurious TF errors due to lazy TF clearing */ | ||
675 | if ((condition & DR_STEP) && | ||
676 | (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition, | ||
677 | 1, SIGTRAP) != NOTIFY_STOP)) { | ||
678 | /* | ||
679 | * The TF error should be masked out only if the current | ||
680 | * process is not traced and if the TRAP flag has been set | ||
681 | * previously by a tracing process (condition detected by | ||
682 | * the PT_DTRACE flag); remember that the i386 TRAP flag | ||
683 | * can be modified by the process itself in user mode, | ||
684 | * allowing programs to debug themselves without the ptrace() | ||
685 | * interface. | ||
686 | */ | ||
687 | if ((regs->cs & 3) == 0) | ||
688 | goto clear_TF_reenable; | ||
689 | if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) | ||
690 | goto clear_TF; | ||
691 | } | ||
692 | |||
693 | /* Ok, finally something we can handle */ | ||
694 | tsk->thread.trap_no = 1; | ||
695 | tsk->thread.error_code = error_code; | ||
696 | info.si_signo = SIGTRAP; | ||
697 | info.si_errno = 0; | ||
698 | info.si_code = TRAP_BRKPT; | ||
699 | if ((regs->cs & 3) == 0) | ||
700 | goto clear_dr7; | ||
701 | |||
702 | info.si_addr = (void __user *)regs->rip; | ||
703 | force_sig_info(SIGTRAP, &info, tsk); | ||
704 | clear_dr7: | ||
705 | asm volatile("movq %0,%%db7"::"r"(0UL)); | ||
706 | notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP); | ||
707 | return regs; | ||
708 | |||
709 | clear_TF_reenable: | ||
710 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | ||
711 | |||
712 | clear_TF: | ||
713 | /* RED-PEN could cause spurious errors */ | ||
714 | if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP) | ||
715 | != NOTIFY_STOP) | ||
716 | regs->eflags &= ~TF_MASK; | ||
717 | return regs; | ||
718 | } | ||
719 | |||
720 | static int kernel_math_error(struct pt_regs *regs, char *str) | ||
721 | { | ||
722 | const struct exception_table_entry *fixup; | ||
723 | fixup = search_exception_tables(regs->rip); | ||
724 | if (fixup) { | ||
725 | regs->rip = fixup->fixup; | ||
726 | return 1; | ||
727 | } | ||
728 | notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE); | ||
729 | #if 0 | ||
730 | /* This should be a die, but warn only for now */ | ||
731 | die(str, regs, 0); | ||
732 | #else | ||
733 | printk(KERN_DEBUG "%s: %s at ", current->comm, str); | ||
734 | printk_address(regs->rip); | ||
735 | printk("\n"); | ||
736 | #endif | ||
737 | return 0; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Note that we play around with the 'TS' bit in an attempt to get | ||
742 | * the correct behaviour even in the presence of the asynchronous | ||
743 | * IRQ13 behaviour | ||
744 | */ | ||
745 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | ||
746 | { | ||
747 | void __user *rip = (void __user *)(regs->rip); | ||
748 | struct task_struct * task; | ||
749 | siginfo_t info; | ||
750 | unsigned short cwd, swd; | ||
751 | |||
752 | conditional_sti(regs); | ||
753 | if ((regs->cs & 3) == 0 && | ||
754 | kernel_math_error(regs, "kernel x87 math error")) | ||
755 | return; | ||
756 | |||
757 | /* | ||
758 | * Save the info for the exception handler and clear the error. | ||
759 | */ | ||
760 | task = current; | ||
761 | save_init_fpu(task); | ||
762 | task->thread.trap_no = 16; | ||
763 | task->thread.error_code = 0; | ||
764 | info.si_signo = SIGFPE; | ||
765 | info.si_errno = 0; | ||
766 | info.si_code = __SI_FAULT; | ||
767 | info.si_addr = rip; | ||
768 | /* | ||
769 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | ||
770 | * status. 0x3f is the exception bits in these regs, 0x200 is the | ||
771 | * C1 reg you need in case of a stack fault, 0x040 is the stack | ||
772 | * fault bit. We should only be taking one exception at a time, | ||
773 | * so if this combination doesn't produce any single exception, | ||
774 | * then we have a bad program that isn't synchronizing its FPU usage | ||
775 | * and it will suffer the consequences since we won't be able to | ||
776 | * fully reproduce the context of the exception | ||
777 | */ | ||
778 | cwd = get_fpu_cwd(task); | ||
779 | swd = get_fpu_swd(task); | ||
780 | switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { | ||
781 | case 0x000: | ||
782 | default: | ||
783 | break; | ||
784 | case 0x001: /* Invalid Op */ | ||
785 | case 0x041: /* Stack Fault */ | ||
786 | case 0x241: /* Stack Fault | Direction */ | ||
787 | info.si_code = FPE_FLTINV; | ||
788 | break; | ||
789 | case 0x002: /* Denormalize */ | ||
790 | case 0x010: /* Underflow */ | ||
791 | info.si_code = FPE_FLTUND; | ||
792 | break; | ||
793 | case 0x004: /* Zero Divide */ | ||
794 | info.si_code = FPE_FLTDIV; | ||
795 | break; | ||
796 | case 0x008: /* Overflow */ | ||
797 | info.si_code = FPE_FLTOVF; | ||
798 | break; | ||
799 | case 0x020: /* Precision */ | ||
800 | info.si_code = FPE_FLTRES; | ||
801 | break; | ||
802 | } | ||
803 | force_sig_info(SIGFPE, &info, task); | ||
804 | } | ||
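The (~cwd & swd) masking described in the comment above can be read as "status bits that are both raised and unmasked". A minimal sketch of the same decode as a standalone helper, with one worked example (values are illustrative):

	/* Example: cwd = 0x037b (everything masked except zero-divide),
	 * swd = 0x0004 (ZE set):
	 *   ((~cwd) & swd & 0x3f) | (swd & 0x240) = 0x004  ->  FPE_FLTDIV
	 */
	static int x87_si_code(unsigned short cwd, unsigned short swd)
	{
		switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
		case 0x001: case 0x041: case 0x241: return FPE_FLTINV;
		case 0x002: case 0x010:             return FPE_FLTUND;
		case 0x004:                         return FPE_FLTDIV;
		case 0x008:                         return FPE_FLTOVF;
		case 0x020:                         return FPE_FLTRES;
		default:                            return 0;
		}
	}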
805 | |||
806 | asmlinkage void bad_intr(void) | ||
807 | { | ||
808 | printk("bad interrupt"); | ||
809 | } | ||
810 | |||
811 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | ||
812 | { | ||
813 | void __user *rip = (void __user *)(regs->rip); | ||
814 | struct task_struct * task; | ||
815 | siginfo_t info; | ||
816 | unsigned short mxcsr; | ||
817 | |||
818 | conditional_sti(regs); | ||
819 | if ((regs->cs & 3) == 0 && | ||
820 | kernel_math_error(regs, "simd math error")) | ||
821 | return; | ||
822 | |||
823 | /* | ||
824 | * Save the info for the exception handler and clear the error. | ||
825 | */ | ||
826 | task = current; | ||
827 | save_init_fpu(task); | ||
828 | task->thread.trap_no = 19; | ||
829 | task->thread.error_code = 0; | ||
830 | info.si_signo = SIGFPE; | ||
831 | info.si_errno = 0; | ||
832 | info.si_code = __SI_FAULT; | ||
833 | info.si_addr = rip; | ||
834 | /* | ||
835 | * The SIMD FPU exceptions are handled a little differently, as there | ||
836 | * is only a single status/control register. Thus, to determine which | ||
837 | * unmasked exception was caught we must mask the exception mask bits | ||
838 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
839 | */ | ||
840 | mxcsr = get_fpu_mxcsr(task); | ||
841 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | ||
842 | case 0x000: | ||
843 | default: | ||
844 | break; | ||
845 | case 0x001: /* Invalid Op */ | ||
846 | info.si_code = FPE_FLTINV; | ||
847 | break; | ||
848 | case 0x002: /* Denormalize */ | ||
849 | case 0x010: /* Underflow */ | ||
850 | info.si_code = FPE_FLTUND; | ||
851 | break; | ||
852 | case 0x004: /* Zero Divide */ | ||
853 | info.si_code = FPE_FLTDIV; | ||
854 | break; | ||
855 | case 0x008: /* Overflow */ | ||
856 | info.si_code = FPE_FLTOVF; | ||
857 | break; | ||
858 | case 0x020: /* Precision */ | ||
859 | info.si_code = FPE_FLTRES; | ||
860 | break; | ||
861 | } | ||
862 | force_sig_info(SIGFPE, &info, task); | ||
863 | } | ||
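The SSE path uses the same idea, but in MXCSR the exception-mask bits sit at 0x1f80, seven bit positions above the status bits at 0x3f, hence the extra shift. A worked example with an illustrative register value:

	/* mxcsr = 0x1d84 (zero-divide unmasked, ZE status bit set):
	 *   masks  = (0x1d84 & 0x1f80) >> 7 = 0x3b
	 *   status =  0x1d84 & 0x3f         = 0x04
	 *   ~masks & status                 = 0x04  ->  FPE_FLTDIV
	 */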
864 | |||
865 | asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) | ||
866 | { | ||
867 | } | ||
868 | |||
869 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
870 | { | ||
871 | } | ||
872 | |||
873 | /* | ||
874 | * 'math_state_restore()' saves the current math information in the | ||
875 | * old math state array, and gets the new ones from the current task | ||
876 | * | ||
877 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | ||
878 | * Don't touch unless you *really* know how it works. | ||
879 | */ | ||
880 | asmlinkage void math_state_restore(void) | ||
881 | { | ||
882 | struct task_struct *me = current; | ||
883 | clts(); /* Allow maths ops (or we recurse) */ | ||
884 | |||
885 | if (!used_math()) | ||
886 | init_fpu(me); | ||
887 | restore_fpu_checking(&me->thread.i387.fxsave); | ||
888 | me->thread_info->status |= TS_USEDFPU; | ||
889 | } | ||
890 | |||
891 | void do_call_debug(struct pt_regs *regs) | ||
892 | { | ||
893 | notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT); | ||
894 | } | ||
895 | |||
896 | void __init trap_init(void) | ||
897 | { | ||
898 | set_intr_gate(0,&divide_error); | ||
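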
899 | set_intr_gate_ist(1,&debug,DEBUG_STACK); | ||
900 | set_intr_gate_ist(2,&nmi,NMI_STACK); | ||
901 | set_system_gate(3,&int3); | ||
902 | set_system_gate(4,&overflow); /* int4-5 can be called from all */ | ||
903 | set_system_gate(5,&bounds); | ||
904 | set_intr_gate(6,&invalid_op); | ||
905 | set_intr_gate(7,&device_not_available); | ||
906 | set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); | ||
907 | set_intr_gate(9,&coprocessor_segment_overrun); | ||
908 | set_intr_gate(10,&invalid_TSS); | ||
909 | set_intr_gate(11,&segment_not_present); | ||
910 | set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); | ||
911 | set_intr_gate(13,&general_protection); | ||
912 | set_intr_gate(14,&page_fault); | ||
913 | set_intr_gate(15,&spurious_interrupt_bug); | ||
914 | set_intr_gate(16,&coprocessor_error); | ||
915 | set_intr_gate(17,&alignment_check); | ||
916 | #ifdef CONFIG_X86_MCE | ||
917 | set_intr_gate_ist(18,&machine_check, MCE_STACK); | ||
918 | #endif | ||
919 | set_intr_gate(19,&simd_coprocessor_error); | ||
920 | |||
921 | #ifdef CONFIG_IA32_EMULATION | ||
922 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
923 | #endif | ||
924 | |||
925 | set_intr_gate(KDB_VECTOR, call_debug); | ||
926 | |||
927 | /* | ||
928 | * Should be a barrier for any external CPU state. | ||
929 | */ | ||
930 | cpu_init(); | ||
931 | } | ||
932 | |||
933 | |||
934 | /* Actual parsing is done early in setup.c. */ | ||
935 | static int __init oops_dummy(char *s) | ||
936 | { | ||
937 | panic_on_oops = 1; | ||
938 | return -1; | ||
939 | } | ||
940 | __setup("oops=", oops_dummy); | ||
941 | |||
942 | static int __init kstack_setup(char *s) | ||
943 | { | ||
944 | kstack_depth_to_print = simple_strtoul(s,NULL,0); | ||
945 | return 0; | ||
946 | } | ||
947 | __setup("kstack=", kstack_setup); | ||
948 | |||
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S new file mode 100644 index 000000000000..59ebd5beda87 --- /dev/null +++ b/arch/x86_64/kernel/vmlinux.lds.S | |||
@@ -0,0 +1,164 @@ | |||
1 | /* ld script to make x86-64 Linux kernel | ||
2 | * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; | ||
3 | */ | ||
4 | |||
5 | #include <asm-generic/vmlinux.lds.h> | ||
6 | #include <linux/config.h> | ||
7 | |||
8 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | ||
9 | OUTPUT_ARCH(i386:x86-64) | ||
10 | ENTRY(phys_startup_64) | ||
11 | jiffies_64 = jiffies; | ||
12 | SECTIONS | ||
13 | { | ||
14 | . = 0xffffffff80100000; | ||
15 | phys_startup_64 = startup_64 - LOAD_OFFSET; | ||
16 | _text = .; /* Text and read-only data */ | ||
17 | .text : { | ||
18 | *(.text) | ||
19 | SCHED_TEXT | ||
20 | LOCK_TEXT | ||
21 | *(.fixup) | ||
22 | *(.gnu.warning) | ||
23 | } = 0x9090 | ||
24 | .text.lock : { *(.text.lock) } /* out-of-line lock text */ | ||
25 | |||
26 | _etext = .; /* End of text section */ | ||
27 | |||
28 | . = ALIGN(16); /* Exception table */ | ||
29 | __start___ex_table = .; | ||
30 | __ex_table : { *(__ex_table) } | ||
31 | __stop___ex_table = .; | ||
32 | |||
33 | RODATA | ||
34 | |||
35 | .data : { /* Data */ | ||
36 | *(.data) | ||
37 | CONSTRUCTORS | ||
38 | } | ||
39 | |||
40 | _edata = .; /* End of data section */ | ||
41 | |||
42 | __bss_start = .; /* BSS */ | ||
43 | .bss : { | ||
44 | *(.bss.page_aligned) | ||
45 | *(.bss) | ||
46 | } | ||
47 | __bss_end = .; | ||
48 | |||
49 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
50 | .data.cacheline_aligned : { *(.data.cacheline_aligned) } | ||
51 | |||
52 | #define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) | ||
53 | #define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) | ||
54 | #define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) | ||
55 | |||
56 | .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } | ||
57 | __vsyscall_0 = LOADADDR(.vsyscall_0); | ||
58 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
59 | .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } | ||
60 | xtime_lock = LOADADDR(.xtime_lock); | ||
61 | .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } | ||
62 | vxtime = LOADADDR(.vxtime); | ||
63 | .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } | ||
64 | wall_jiffies = LOADADDR(.wall_jiffies); | ||
65 | .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } | ||
66 | sys_tz = LOADADDR(.sys_tz); | ||
67 | .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } | ||
68 | sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); | ||
69 | .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } | ||
70 | xtime = LOADADDR(.xtime); | ||
71 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
72 | .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } | ||
73 | jiffies = LOADADDR(.jiffies); | ||
74 | .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } | ||
75 | . = LOADADDR(.vsyscall_0) + 4096; | ||
76 | |||
77 | . = ALIGN(8192); /* init_task */ | ||
78 | .data.init_task : { *(.data.init_task) } | ||
79 | |||
80 | . = ALIGN(4096); | ||
81 | .data.page_aligned : { *(.data.page_aligned) } | ||
82 | |||
83 | . = ALIGN(4096); /* Init code and data */ | ||
84 | __init_begin = .; | ||
85 | .init.text : { | ||
86 | _sinittext = .; | ||
87 | *(.init.text) | ||
88 | _einittext = .; | ||
89 | } | ||
90 | __initdata_begin = .; | ||
91 | .init.data : { *(.init.data) } | ||
92 | __initdata_end = .; | ||
93 | . = ALIGN(16); | ||
94 | __setup_start = .; | ||
95 | .init.setup : { *(.init.setup) } | ||
96 | __setup_end = .; | ||
97 | __initcall_start = .; | ||
98 | .initcall.init : { | ||
99 | *(.initcall1.init) | ||
100 | *(.initcall2.init) | ||
101 | *(.initcall3.init) | ||
102 | *(.initcall4.init) | ||
103 | *(.initcall5.init) | ||
104 | *(.initcall6.init) | ||
105 | *(.initcall7.init) | ||
106 | } | ||
107 | __initcall_end = .; | ||
108 | __con_initcall_start = .; | ||
109 | .con_initcall.init : { *(.con_initcall.init) } | ||
110 | __con_initcall_end = .; | ||
111 | SECURITY_INIT | ||
112 | . = ALIGN(8); | ||
113 | __alt_instructions = .; | ||
114 | .altinstructions : { *(.altinstructions) } | ||
115 | __alt_instructions_end = .; | ||
116 | .altinstr_replacement : { *(.altinstr_replacement) } | ||
117 | /* .exit.text is discarded at runtime, not link time, to deal with references | ||
118 | from .altinstructions and .eh_frame */ | ||
119 | .exit.text : { *(.exit.text) } | ||
120 | .exit.data : { *(.exit.data) } | ||
121 | . = ALIGN(4096); | ||
122 | __initramfs_start = .; | ||
123 | .init.ramfs : { *(.init.ramfs) } | ||
124 | __initramfs_end = .; | ||
125 | . = ALIGN(32); | ||
126 | __per_cpu_start = .; | ||
127 | .data.percpu : { *(.data.percpu) } | ||
128 | __per_cpu_end = .; | ||
129 | . = ALIGN(4096); | ||
130 | __init_end = .; | ||
131 | |||
132 | . = ALIGN(4096); | ||
133 | __nosave_begin = .; | ||
134 | .data_nosave : { *(.data.nosave) } | ||
135 | . = ALIGN(4096); | ||
136 | __nosave_end = .; | ||
137 | |||
138 | _end = . ; | ||
139 | |||
140 | /* Sections to be discarded */ | ||
141 | /DISCARD/ : { | ||
142 | *(.exitcall.exit) | ||
143 | #ifndef CONFIG_DEBUG_INFO | ||
144 | *(.eh_frame) | ||
145 | #endif | ||
146 | } | ||
147 | |||
148 | /* DWARF 2 */ | ||
149 | .debug_info 0 : { *(.debug_info) } | ||
150 | .debug_abbrev 0 : { *(.debug_abbrev) } | ||
151 | .debug_line 0 : { *(.debug_line) } | ||
152 | .debug_frame 0 : { *(.debug_frame) } | ||
153 | .debug_str 0 : { *(.debug_str) } | ||
154 | .debug_loc 0 : { *(.debug_loc) } | ||
155 | .debug_macinfo 0 : { *(.debug_macinfo) } | ||
156 | /* SGI/MIPS DWARF 2 extensions */ | ||
157 | .debug_weaknames 0 : { *(.debug_weaknames) } | ||
158 | .debug_funcnames 0 : { *(.debug_funcnames) } | ||
159 | .debug_typenames 0 : { *(.debug_typenames) } | ||
160 | .debug_varnames 0 : { *(.debug_varnames) } | ||
161 | |||
162 | |||
163 | .comment 0 : { *(.comment) } | ||
164 | } | ||
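The AFTER/BINALIGN/CACHE_ALIGN macros above pack the vsyscall-visible variables back to back, each rounded up to a 16-byte (or cache-line) boundary in the load image. A small userspace sketch of that rounding arithmetic, with made-up addresses:

	#include <stdio.h>

	/* Same power-of-two round-up used by BINALIGN in the linker script. */
	#define BINALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))

	int main(void)
	{
		unsigned long addr = 0x1000f5UL;          /* hypothetical LOADADDR + SIZEOF */
		printf("%#lx\n", BINALIGN(addr, 16UL));   /* 0x100100 */
		printf("%#lx\n", BINALIGN(addr, 64UL));   /* 0x100100 */
		return 0;
	}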
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c new file mode 100644 index 000000000000..b4b8dc59663a --- /dev/null +++ b/arch/x86_64/kernel/vsyscall.c | |||
@@ -0,0 +1,225 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/kernel/vsyscall.c | ||
3 | * | ||
4 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE | ||
5 | * Copyright 2003 Andi Kleen, SuSE Labs. | ||
6 | * | ||
7 | * Thanks to hpa@transmeta.com for some useful hints. | ||
8 | * Special thanks to Ingo Molnar for his early experience with | ||
9 | * a different vsyscall implementation for Linux/IA32 and for the name. | ||
10 | * | ||
11 | * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located | ||
12 | * at virtual address -10Mbyte+1024bytes etc... There are at most 4 | ||
13 | * vsyscalls. One vsyscall can reserve more than 1 slot to avoid | ||
14 | * jumping out of line if necessary. We cannot add more with this | ||
15 | * mechanism because older kernels won't return -ENOSYS. | ||
16 | * If we want more than four we need a vDSO. | ||
17 | * | ||
18 | * Note: the concept clashes with user mode linux. If you use UML and | ||
19 | * want per guest time just set the kernel.vsyscall64 sysctl to 0. | ||
20 | */ | ||
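The layout described above puts each vsyscall in its own 1024-byte slot starting 10 MB below the top of the address space. A short illustration of that arithmetic (the macro names here are illustrative; the kernel derives the real ones from its fixmap/vsyscall headers):

	#define VSYSCALL_BASE      (-10UL * 1024 * 1024)   /* 0xffffffffff600000 */
	#define VSYSCALL_SLOT(nr)  (VSYSCALL_BASE + (nr) * 1024UL)
	/* slot 0: 0xffffffffff600000  vgettimeofday (defined later in this file)
	 * slot 1: 0xffffffffff600400  vtime
	 * slot 2: 0xffffffffff600800  venosys_0
	 * slot 3: 0xffffffffff600c00  venosys_1
	 */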
21 | |||
22 | #include <linux/time.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/timer.h> | ||
26 | #include <linux/seqlock.h> | ||
27 | #include <linux/jiffies.h> | ||
28 | #include <linux/sysctl.h> | ||
29 | |||
30 | #include <asm/vsyscall.h> | ||
31 | #include <asm/pgtable.h> | ||
32 | #include <asm/page.h> | ||
33 | #include <asm/fixmap.h> | ||
34 | #include <asm/errno.h> | ||
35 | #include <asm/io.h> | ||
36 | |||
37 | #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) | ||
38 | #define force_inline __attribute__((always_inline)) inline | ||
39 | |||
40 | int __sysctl_vsyscall __section_sysctl_vsyscall = 1; | ||
41 | seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; | ||
42 | |||
43 | #include <asm/unistd.h> | ||
44 | |||
45 | static force_inline void timeval_normalize(struct timeval * tv) | ||
46 | { | ||
47 | time_t __sec; | ||
48 | |||
49 | __sec = tv->tv_usec / 1000000; | ||
50 | if (__sec) { | ||
51 | tv->tv_usec %= 1000000; | ||
52 | tv->tv_sec += __sec; | ||
53 | } | ||
54 | } | ||
55 | |||
56 | static force_inline void do_vgettimeofday(struct timeval * tv) | ||
57 | { | ||
58 | long sequence, t; | ||
59 | unsigned long sec, usec; | ||
60 | |||
61 | do { | ||
62 | sequence = read_seqbegin(&__xtime_lock); | ||
63 | |||
64 | sec = __xtime.tv_sec; | ||
65 | usec = (__xtime.tv_nsec / 1000) + | ||
66 | (__jiffies - __wall_jiffies) * (1000000 / HZ); | ||
67 | |||
68 | if (__vxtime.mode == VXTIME_TSC) { | ||
69 | sync_core(); | ||
70 | rdtscll(t); | ||
71 | if (t < __vxtime.last_tsc) | ||
72 | t = __vxtime.last_tsc; | ||
73 | usec += ((t - __vxtime.last_tsc) * | ||
74 | __vxtime.tsc_quot) >> 32; | ||
75 | /* See comment in x86_64 do_gettimeofday. */ | ||
76 | } else { | ||
77 | usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - | ||
78 | __vxtime.last) * __vxtime.quot) >> 32; | ||
79 | } | ||
80 | } while (read_seqretry(&__xtime_lock, sequence)); | ||
81 | |||
82 | tv->tv_sec = sec + usec / 1000000; | ||
83 | tv->tv_usec = usec % 1000000; | ||
84 | } | ||
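The TSC branch above avoids a division on every call by scaling with a precomputed 32.32 fixed-point multiplier, tsc_quot. Roughly (the numbers below are made up):

	/* tsc_quot ~= (microseconds per TSC cycle) << 32, so
	 *     usec += ((t - last_tsc) * tsc_quot) >> 32;
	 * multiplies elapsed cycles by usec/cycle in fixed point.  On a
	 * hypothetical 2 GHz TSC, tsc_quot ~= (1UL << 32) / 2000, and 20000
	 * elapsed cycles scale to (20000 * tsc_quot) >> 32 ~= 10 microseconds.
	 */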
85 | |||
86 | /* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ | ||
87 | static force_inline void do_get_tz(struct timezone * tz) | ||
88 | { | ||
89 | *tz = __sys_tz; | ||
90 | } | ||
91 | |||
92 | static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) | ||
93 | { | ||
94 | int ret; | ||
95 | asm volatile("vsysc2: syscall" | ||
96 | : "=a" (ret) | ||
97 | : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); | ||
98 | return ret; | ||
99 | } | ||
100 | |||
101 | static force_inline long time_syscall(long *t) | ||
102 | { | ||
103 | long secs; | ||
104 | asm volatile("vsysc1: syscall" | ||
105 | : "=a" (secs) | ||
106 | : "0" (__NR_time),"D" (t) : __syscall_clobber); | ||
107 | return secs; | ||
108 | } | ||
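Both fallbacks above issue the real system call with inline assembly; __syscall_clobber (from the kernel's headers) names the registers the syscall instruction trashes. A self-contained userspace sketch of the same pattern, with the clobbers written out explicitly (the clobber list and syscall number are standard x86-64 ABI facts, not taken from this file):

	#include <sys/time.h>

	/* Raw gettimeofday via the syscall instruction; on x86-64 the kernel
	 * clobbers rcx and r11, and __NR_gettimeofday is 96. */
	static inline long raw_gettimeofday(struct timeval *tv, struct timezone *tz)
	{
		long ret;
		asm volatile("syscall"
			     : "=a" (ret)
			     : "0" (96L), "D" (tv), "S" (tz)
			     : "rcx", "r11", "memory");
		return ret;
	}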
109 | |||
110 | static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) | ||
111 | { | ||
112 | if (unlikely(!__sysctl_vsyscall)) | ||
113 | return gettimeofday(tv,tz); | ||
114 | if (tv) | ||
115 | do_vgettimeofday(tv); | ||
116 | if (tz) | ||
117 | do_get_tz(tz); | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | /* This will break when the xtime seconds get inaccurate, but that is | ||
122 | * unlikely */ | ||
123 | static time_t __vsyscall(1) vtime(time_t *t) | ||
124 | { | ||
125 | if (unlikely(!__sysctl_vsyscall)) | ||
126 | return time_syscall(t); | ||
127 | else if (t) | ||
128 | *t = __xtime.tv_sec; | ||
129 | return __xtime.tv_sec; | ||
130 | } | ||
131 | |||
132 | static long __vsyscall(2) venosys_0(void) | ||
133 | { | ||
134 | return -ENOSYS; | ||
135 | } | ||
136 | |||
137 | static long __vsyscall(3) venosys_1(void) | ||
138 | { | ||
139 | return -ENOSYS; | ||
140 | } | ||
141 | |||
142 | #ifdef CONFIG_SYSCTL | ||
143 | |||
144 | #define SYSCALL 0x050f | ||
145 | #define NOP2 0x9090 | ||
146 | |||
147 | /* | ||
148 | * NOP out syscall in vsyscall page when not needed. | ||
149 | */ | ||
150 | static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | ||
151 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
152 | { | ||
153 | extern u16 vsysc1, vsysc2; | ||
154 | u16 *map1, *map2; | ||
155 | int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | ||
156 | if (!write) | ||
157 | return ret; | ||
158 | /* gcc has some trouble with __va(__pa()), so just do it this | ||
159 | way. */ | ||
160 | map1 = ioremap(__pa_symbol(&vsysc1), 2); | ||
161 | if (!map1) | ||
162 | return -ENOMEM; | ||
163 | map2 = ioremap(__pa_symbol(&vsysc2), 2); | ||
164 | if (!map2) { | ||
165 | ret = -ENOMEM; | ||
166 | goto out; | ||
167 | } | ||
168 | if (!sysctl_vsyscall) { | ||
169 | *map1 = SYSCALL; | ||
170 | *map2 = SYSCALL; | ||
171 | } else { | ||
172 | *map1 = NOP2; | ||
173 | *map2 = NOP2; | ||
174 | } | ||
175 | iounmap(map2); | ||
176 | out: | ||
177 | iounmap(map1); | ||
178 | return ret; | ||
179 | } | ||
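The two constants used above are the little-endian encodings of the instructions being swapped at the vsysc1/vsysc2 labels: 0x050f stores the byte pair 0f 05 (the syscall instruction) and 0x9090 stores two one-byte NOPs, so a single 16-bit store replaces one 2-byte instruction with the other:

	unsigned short syscall_insn = 0x050f;   /* bytes in memory: 0f 05 (syscall)  */
	unsigned short two_nops     = 0x9090;   /* bytes in memory: 90 90 (nop; nop) */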
180 | |||
181 | static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, | ||
182 | void __user *oldval, size_t __user *oldlenp, | ||
183 | void __user *newval, size_t newlen, | ||
184 | void **context) | ||
185 | { | ||
186 | return -ENOSYS; | ||
187 | } | ||
188 | |||
189 | static ctl_table kernel_table2[] = { | ||
190 | { .ctl_name = 99, .procname = "vsyscall64", | ||
191 | .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, | ||
192 | .strategy = vsyscall_sysctl_nostrat, | ||
193 | .proc_handler = vsyscall_sysctl_change }, | ||
194 | { 0, } | ||
195 | }; | ||
196 | |||
197 | static ctl_table kernel_root_table2[] = { | ||
198 | { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, | ||
199 | .child = kernel_table2 }, | ||
200 | { 0 }, | ||
201 | }; | ||
202 | |||
203 | #endif | ||
204 | |||
205 | static void __init map_vsyscall(void) | ||
206 | { | ||
207 | extern char __vsyscall_0; | ||
208 | unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); | ||
209 | |||
210 | __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); | ||
211 | } | ||
212 | |||
213 | static int __init vsyscall_init(void) | ||
214 | { | ||
215 | BUG_ON(((unsigned long) &vgettimeofday != | ||
216 | VSYSCALL_ADDR(__NR_vgettimeofday))); | ||
217 | BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); | ||
218 | BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); | ||
219 | map_vsyscall(); | ||
220 | sysctl_vsyscall = 1; | ||
221 | register_sysctl_table(kernel_root_table2, 0); | ||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | __initcall(vsyscall_init); | ||
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c new file mode 100644 index 000000000000..88626e626886 --- /dev/null +++ b/arch/x86_64/kernel/x8664_ksyms.c | |||
@@ -0,0 +1,221 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/smp.h> | ||
4 | #include <linux/user.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/in6.h> | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/smp_lock.h> | ||
9 | #include <linux/pm.h> | ||
10 | #include <linux/pci.h> | ||
11 | #include <linux/apm_bios.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/syscalls.h> | ||
15 | #include <linux/tty.h> | ||
16 | #include <linux/ioctl32.h> | ||
17 | |||
18 | #include <asm/semaphore.h> | ||
19 | #include <asm/processor.h> | ||
20 | #include <asm/i387.h> | ||
21 | #include <asm/uaccess.h> | ||
22 | #include <asm/checksum.h> | ||
23 | #include <asm/io.h> | ||
24 | #include <asm/delay.h> | ||
25 | #include <asm/irq.h> | ||
26 | #include <asm/mmx.h> | ||
27 | #include <asm/desc.h> | ||
28 | #include <asm/pgtable.h> | ||
29 | #include <asm/pgalloc.h> | ||
30 | #include <asm/nmi.h> | ||
31 | #include <asm/kdebug.h> | ||
32 | #include <asm/unistd.h> | ||
33 | #include <asm/tlbflush.h> | ||
34 | #include <asm/kdebug.h> | ||
35 | |||
36 | extern spinlock_t rtc_lock; | ||
37 | |||
38 | #ifdef CONFIG_SMP | ||
39 | extern void __write_lock_failed(rwlock_t *rw); | ||
40 | extern void __read_lock_failed(rwlock_t *rw); | ||
41 | #endif | ||
42 | |||
43 | #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) | ||
44 | extern struct drive_info_struct drive_info; | ||
45 | EXPORT_SYMBOL(drive_info); | ||
46 | #endif | ||
47 | |||
48 | extern unsigned long get_cmos_time(void); | ||
49 | |||
50 | /* platform dependent support */ | ||
51 | EXPORT_SYMBOL(boot_cpu_data); | ||
52 | //EXPORT_SYMBOL(dump_fpu); | ||
53 | EXPORT_SYMBOL(__ioremap); | ||
54 | EXPORT_SYMBOL(ioremap_nocache); | ||
55 | EXPORT_SYMBOL(iounmap); | ||
56 | EXPORT_SYMBOL(enable_irq); | ||
57 | EXPORT_SYMBOL(disable_irq); | ||
58 | EXPORT_SYMBOL(disable_irq_nosync); | ||
59 | EXPORT_SYMBOL(probe_irq_mask); | ||
60 | EXPORT_SYMBOL(kernel_thread); | ||
61 | EXPORT_SYMBOL(pm_idle); | ||
62 | EXPORT_SYMBOL(pm_power_off); | ||
63 | EXPORT_SYMBOL(get_cmos_time); | ||
64 | |||
65 | EXPORT_SYMBOL(__down_failed); | ||
66 | EXPORT_SYMBOL(__down_failed_interruptible); | ||
67 | EXPORT_SYMBOL(__down_failed_trylock); | ||
68 | EXPORT_SYMBOL(__up_wakeup); | ||
69 | /* Networking helper routines. */ | ||
70 | EXPORT_SYMBOL(csum_partial_copy_nocheck); | ||
71 | EXPORT_SYMBOL(ip_compute_csum); | ||
72 | /* Delay loops */ | ||
73 | EXPORT_SYMBOL(__udelay); | ||
74 | EXPORT_SYMBOL(__ndelay); | ||
75 | EXPORT_SYMBOL(__delay); | ||
76 | EXPORT_SYMBOL(__const_udelay); | ||
77 | |||
78 | EXPORT_SYMBOL(__get_user_1); | ||
79 | EXPORT_SYMBOL(__get_user_2); | ||
80 | EXPORT_SYMBOL(__get_user_4); | ||
81 | EXPORT_SYMBOL(__get_user_8); | ||
82 | EXPORT_SYMBOL(__put_user_1); | ||
83 | EXPORT_SYMBOL(__put_user_2); | ||
84 | EXPORT_SYMBOL(__put_user_4); | ||
85 | EXPORT_SYMBOL(__put_user_8); | ||
86 | |||
87 | EXPORT_SYMBOL(strpbrk); | ||
88 | EXPORT_SYMBOL(strstr); | ||
89 | |||
90 | EXPORT_SYMBOL(strncpy_from_user); | ||
91 | EXPORT_SYMBOL(__strncpy_from_user); | ||
92 | EXPORT_SYMBOL(clear_user); | ||
93 | EXPORT_SYMBOL(__clear_user); | ||
94 | EXPORT_SYMBOL(copy_user_generic); | ||
95 | EXPORT_SYMBOL(copy_from_user); | ||
96 | EXPORT_SYMBOL(copy_to_user); | ||
97 | EXPORT_SYMBOL(copy_in_user); | ||
98 | EXPORT_SYMBOL(strnlen_user); | ||
99 | |||
100 | #ifdef CONFIG_PCI | ||
101 | EXPORT_SYMBOL(pci_alloc_consistent); | ||
102 | EXPORT_SYMBOL(pci_free_consistent); | ||
103 | #endif | ||
104 | |||
105 | #ifdef CONFIG_PCI | ||
106 | EXPORT_SYMBOL(pci_mem_start); | ||
107 | #endif | ||
108 | |||
109 | EXPORT_SYMBOL(copy_page); | ||
110 | EXPORT_SYMBOL(clear_page); | ||
111 | |||
112 | EXPORT_SYMBOL(cpu_pda); | ||
113 | #ifdef CONFIG_SMP | ||
114 | EXPORT_SYMBOL(cpu_data); | ||
115 | EXPORT_SYMBOL(cpu_online_map); | ||
116 | EXPORT_SYMBOL(__write_lock_failed); | ||
117 | EXPORT_SYMBOL(__read_lock_failed); | ||
118 | |||
119 | EXPORT_SYMBOL(synchronize_irq); | ||
120 | EXPORT_SYMBOL(smp_call_function); | ||
121 | EXPORT_SYMBOL(cpu_callout_map); | ||
122 | #endif | ||
123 | |||
124 | #ifdef CONFIG_VT | ||
125 | EXPORT_SYMBOL(screen_info); | ||
126 | #endif | ||
127 | |||
128 | EXPORT_SYMBOL(get_wchan); | ||
129 | |||
130 | EXPORT_SYMBOL(rtc_lock); | ||
131 | |||
132 | EXPORT_SYMBOL_GPL(set_nmi_callback); | ||
133 | EXPORT_SYMBOL_GPL(unset_nmi_callback); | ||
134 | |||
135 | /* Export string functions. We normally rely on gcc builtins for most of these, | ||
136 | but gcc sometimes decides not to inline them. */ | ||
137 | #undef memcpy | ||
138 | #undef memset | ||
139 | #undef memmove | ||
140 | #undef memchr | ||
141 | #undef strlen | ||
142 | #undef strcpy | ||
143 | #undef strncmp | ||
144 | #undef strncpy | ||
145 | #undef strchr | ||
146 | #undef strcmp | ||
147 | #undef strcpy | ||
148 | #undef strcat | ||
149 | #undef memcmp | ||
150 | |||
151 | extern void * memset(void *,int,__kernel_size_t); | ||
152 | extern size_t strlen(const char *); | ||
153 | extern void * memmove(void * dest,const void *src,size_t count); | ||
154 | extern char * strcpy(char * dest,const char *src); | ||
155 | extern int strcmp(const char * cs,const char * ct); | ||
156 | extern void *memchr(const void *s, int c, size_t n); | ||
157 | extern void * memcpy(void *,const void *,__kernel_size_t); | ||
158 | extern void * __memcpy(void *,const void *,__kernel_size_t); | ||
159 | extern char * strcat(char *, const char *); | ||
160 | extern int memcmp(const void * cs,const void * ct,size_t count); | ||
161 | |||
162 | EXPORT_SYMBOL(memset); | ||
163 | EXPORT_SYMBOL(strlen); | ||
164 | EXPORT_SYMBOL(memmove); | ||
165 | EXPORT_SYMBOL(strcpy); | ||
166 | EXPORT_SYMBOL(strncmp); | ||
167 | EXPORT_SYMBOL(strncpy); | ||
168 | EXPORT_SYMBOL(strchr); | ||
169 | EXPORT_SYMBOL(strcmp); | ||
170 | EXPORT_SYMBOL(strcat); | ||
171 | EXPORT_SYMBOL(strncat); | ||
172 | EXPORT_SYMBOL(memchr); | ||
173 | EXPORT_SYMBOL(strrchr); | ||
174 | EXPORT_SYMBOL(strnlen); | ||
175 | EXPORT_SYMBOL(memscan); | ||
176 | EXPORT_SYMBOL(memcpy); | ||
177 | EXPORT_SYMBOL(__memcpy); | ||
178 | EXPORT_SYMBOL(memcmp); | ||
179 | |||
180 | #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM | ||
181 | /* prototypes are wrong, these are assembly with custom calling conventions */ | ||
182 | extern void rwsem_down_read_failed_thunk(void); | ||
183 | extern void rwsem_wake_thunk(void); | ||
184 | extern void rwsem_downgrade_thunk(void); | ||
185 | extern void rwsem_down_write_failed_thunk(void); | ||
186 | EXPORT_SYMBOL(rwsem_down_read_failed_thunk); | ||
187 | EXPORT_SYMBOL(rwsem_wake_thunk); | ||
188 | EXPORT_SYMBOL(rwsem_downgrade_thunk); | ||
189 | EXPORT_SYMBOL(rwsem_down_write_failed_thunk); | ||
190 | #endif | ||
191 | |||
192 | EXPORT_SYMBOL(empty_zero_page); | ||
193 | |||
194 | #ifdef CONFIG_HAVE_DEC_LOCK | ||
195 | EXPORT_SYMBOL(_atomic_dec_and_lock); | ||
196 | #endif | ||
197 | |||
198 | EXPORT_SYMBOL(die_chain); | ||
199 | EXPORT_SYMBOL(register_die_notifier); | ||
200 | |||
201 | #ifdef CONFIG_SMP | ||
202 | EXPORT_SYMBOL(cpu_sibling_map); | ||
203 | EXPORT_SYMBOL(smp_num_siblings); | ||
204 | #endif | ||
205 | |||
206 | extern void do_softirq_thunk(void); | ||
207 | EXPORT_SYMBOL(do_softirq_thunk); | ||
208 | |||
209 | void out_of_line_bug(void); | ||
210 | EXPORT_SYMBOL(out_of_line_bug); | ||
211 | |||
212 | EXPORT_SYMBOL(init_level4_pgt); | ||
213 | |||
214 | extern unsigned long __supported_pte_mask; | ||
215 | EXPORT_SYMBOL(__supported_pte_mask); | ||
216 | |||
217 | #ifdef CONFIG_SMP | ||
218 | EXPORT_SYMBOL(flush_tlb_page); | ||
219 | #endif | ||
220 | |||
221 | EXPORT_SYMBOL(cpu_khz); | ||