Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--  arch/x86_64/kernel/Makefile  45
-rw-r--r--  arch/x86_64/kernel/acpi/Makefile  3
-rw-r--r--  arch/x86_64/kernel/acpi/sleep.c  132
-rw-r--r--  arch/x86_64/kernel/acpi/wakeup.S  527
-rw-r--r--  arch/x86_64/kernel/aperture.c  286
-rw-r--r--  arch/x86_64/kernel/apic.c  1088
-rw-r--r--  arch/x86_64/kernel/asm-offsets.c  69
-rw-r--r--  arch/x86_64/kernel/cpufreq/Kconfig  96
-rw-r--r--  arch/x86_64/kernel/cpufreq/Makefile  17
-rw-r--r--  arch/x86_64/kernel/e820.c  513
-rw-r--r--  arch/x86_64/kernel/early_printk.c  220
-rw-r--r--  arch/x86_64/kernel/entry.S  920
-rw-r--r--  arch/x86_64/kernel/genapic.c  89
-rw-r--r--  arch/x86_64/kernel/genapic_cluster.c  130
-rw-r--r--  arch/x86_64/kernel/genapic_flat.c  127
-rw-r--r--  arch/x86_64/kernel/head.S  396
-rw-r--r--  arch/x86_64/kernel/head64.c  117
-rw-r--r--  arch/x86_64/kernel/i387.c  155
-rw-r--r--  arch/x86_64/kernel/i8259.c  579
-rw-r--r--  arch/x86_64/kernel/init_task.c  49
-rw-r--r--  arch/x86_64/kernel/io_apic.c  1982
-rw-r--r--  arch/x86_64/kernel/ioport.c  117
-rw-r--r--  arch/x86_64/kernel/irq.c  108
-rw-r--r--  arch/x86_64/kernel/kprobes.c  631
-rw-r--r--  arch/x86_64/kernel/ldt.c  253
-rw-r--r--  arch/x86_64/kernel/mce.c  548
-rw-r--r--  arch/x86_64/kernel/mce_intel.c  99
-rw-r--r--  arch/x86_64/kernel/module.c  166
-rw-r--r--  arch/x86_64/kernel/mpparse.c  949
-rw-r--r--  arch/x86_64/kernel/msr.c  279
-rw-r--r--  arch/x86_64/kernel/nmi.c  488
-rw-r--r--  arch/x86_64/kernel/pci-dma.c  60
-rw-r--r--  arch/x86_64/kernel/pci-gart.c  980
-rw-r--r--  arch/x86_64/kernel/pci-nommu.c  94
-rw-r--r--  arch/x86_64/kernel/process.c  770
-rw-r--r--  arch/x86_64/kernel/ptrace.c  547
-rw-r--r--  arch/x86_64/kernel/reboot.c  163
-rw-r--r--  arch/x86_64/kernel/semaphore.c  180
-rw-r--r--  arch/x86_64/kernel/setup.c  1189
-rw-r--r--  arch/x86_64/kernel/setup64.c  292
-rw-r--r--  arch/x86_64/kernel/signal.c  486
-rw-r--r--  arch/x86_64/kernel/smp.c  415
-rw-r--r--  arch/x86_64/kernel/smpboot.c  938
-rw-r--r--  arch/x86_64/kernel/suspend.c  157
-rw-r--r--  arch/x86_64/kernel/suspend_asm.S  104
-rw-r--r--  arch/x86_64/kernel/sys_x86_64.c  173
-rw-r--r--  arch/x86_64/kernel/syscall.c  26
-rw-r--r--  arch/x86_64/kernel/time.c  1262
-rw-r--r--  arch/x86_64/kernel/trampoline.S  64
-rw-r--r--  arch/x86_64/kernel/traps.c  948
-rw-r--r--  arch/x86_64/kernel/vmlinux.lds.S  164
-rw-r--r--  arch/x86_64/kernel/vsyscall.c  225
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c  221
53 files changed, 20636 insertions, 0 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
new file mode 100644
index 000000000000..0a3318e08ab6
--- /dev/null
+++ b/arch/x86_64/kernel/Makefile
@@ -0,0 +1,45 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head.o head64.o init_task.o vmlinux.lds
6EXTRA_AFLAGS := -traditional
7obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
9 x8664_ksyms.o i387.o syscall.o vsyscall.o \
10 setup64.o bootflag.o e820.o reboot.o quirks.o
11
12obj-$(CONFIG_X86_MCE) += mce.o
13obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
14obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
15obj-$(CONFIG_ACPI_BOOT) += acpi/
16obj-$(CONFIG_X86_MSR) += msr.o
17obj-$(CONFIG_MICROCODE) += microcode.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
20obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
21obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
22 genapic.o genapic_cluster.o genapic_flat.o
23obj-$(CONFIG_PM) += suspend.o
24obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
25obj-$(CONFIG_CPU_FREQ) += cpufreq/
26obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
27obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
28obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
29obj-$(CONFIG_SWIOTLB) += swiotlb.o
30obj-$(CONFIG_KPROBES) += kprobes.o
31
32obj-$(CONFIG_MODULES) += module.o
33
34obj-y += topology.o
35obj-y += intel_cacheinfo.o
36
37CFLAGS_vsyscall.o := $(PROFILING) -g0
38
39bootflag-y += ../../i386/kernel/bootflag.o
40cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
41topology-y += ../../i386/mach-default/topology.o
42swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
43microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
44intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
45quirks-y += ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
new file mode 100644
index 000000000000..d2c2ee5f9a88
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_ACPI_BOOT) := boot.o
2boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o
3obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..7a275de6df22
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -0,0 +1,132 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/config.h>
30#include <linux/kernel.h>
31#include <linux/init.h>
32#include <linux/types.h>
33#include <linux/stddef.h>
34#include <linux/slab.h>
35#include <linux/pci.h>
36#include <linux/bootmem.h>
37#include <linux/irq.h>
38#include <linux/acpi.h>
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50
51/* --------------------------------------------------------------------------
52 Low-Level Sleep Support
53 -------------------------------------------------------------------------- */
54
55#ifdef CONFIG_ACPI_SLEEP
56
57/* address in low memory of the wakeup routine. */
58unsigned long acpi_wakeup_address = 0;
59unsigned long acpi_video_flags;
60extern char wakeup_start, wakeup_end;
61
62extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
63
64static pgd_t low_ptr;
65
66static void init_low_mapping(void)
67{
68 pgd_t *slot0 = pgd_offset(current->mm, 0UL);
69 low_ptr = *slot0;
70 set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
71 flush_tlb_all();
72}
73
74/**
75 * acpi_save_state_mem - save kernel state
76 *
77 * Create an identity mapped page table and copy the wakeup routine to
78 * low memory.
79 */
80int acpi_save_state_mem (void)
81{
82 init_low_mapping();
83
84 memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
85 acpi_copy_wakeup_routine(acpi_wakeup_address);
86
87 return 0;
88}
89
90/*
91 * acpi_restore_state
92 */
93void acpi_restore_state_mem (void)
94{
95 set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
96 flush_tlb_all();
97}
98
99/**
100 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
101 *
102 * We allocate a page in low memory for the wakeup
103 * routine for when we come back from a sleep state. The
104 * runtime allocator allows specification of <16M pages, but not
105 * <1M pages.
106 */
107void __init acpi_reserve_bootmem(void)
108{
109 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
110 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
111 printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
112}
113
114static int __init acpi_sleep_setup(char *str)
115{
116 while ((str != NULL) && (*str != '\0')) {
117 if (strncmp(str, "s3_bios", 7) == 0)
118 acpi_video_flags = 1;
119 if (strncmp(str, "s3_mode", 7) == 0)
120 acpi_video_flags |= 2;
121 str = strchr(str, ',');
122 if (str != NULL)
123 str += strspn(str, ", \t");
124 }
125 return 1;
126}
127
128__setup("acpi_sleep=", acpi_sleep_setup);
129
130#endif /*CONFIG_ACPI_SLEEP*/
131
132void acpi_pci_link_exit(void) {}
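For reference, a minimal user-space sketch of the acpi_sleep= option parsing done by acpi_sleep_setup() above. The helper name parse_acpi_sleep and the main() driver are made up for illustration, but the flag bits match the code: bit 0 selects the s3_bios video reposting path and bit 1 the s3_mode path used by the wakeup code.

#include <stdio.h>
#include <string.h>

static unsigned long parse_acpi_sleep(const char *str)
{
        unsigned long flags = 0;

        while (str != NULL && *str != '\0') {
                if (strncmp(str, "s3_bios", 7) == 0)
                        flags = 1;      /* lcall the video BIOS at 0xc000:3 on wakeup */
                if (strncmp(str, "s3_mode", 7) == 0)
                        flags |= 2;     /* restore the saved video mode on wakeup */
                str = strchr(str, ',');
                if (str != NULL)
                        str += strspn(str, ", \t");
        }
        return flags;
}

int main(void)
{
        /* booting with acpi_sleep=s3_bios,s3_mode gives flags == 3 */
        printf("%lu\n", parse_acpi_sleep("s3_bios,s3_mode"));
        return 0;
}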
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
new file mode 100644
index 000000000000..a4c630034cd4
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -0,0 +1,527 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/page.h>
5#include <asm/msr.h>
6
7# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
8#
9# wakeup_code runs in real mode, and at an unknown address (determined at run-time).
10# Therefore it must only use relative jumps/calls.
11#
12# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled
13#
14# If physical address of wakeup_code is 0x12345, BIOS should call us with
15# cs = 0x1234, eip = 0x05
16#
17
18
19ALIGN
20 .align 16
21ENTRY(wakeup_start)
22wakeup_code:
23 wakeup_code_start = .
24 .code16
25
26# Running in *copy* of this code, somewhere in low 1MB.
27
28 movb $0xa1, %al ; outb %al, $0x80
29 cli
30 cld
31 # setup data segment
32 movw %cs, %ax
33 movw %ax, %ds # Make ds:0 point to wakeup_start
34 movw %ax, %ss
35 mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
36
37 pushl $0 # Kill any dangerous flags
38 popfl
39
40 movl real_magic - wakeup_code, %eax
41 cmpl $0x12345678, %eax
42 jne bogus_real_magic
43
44 testl $1, video_flags - wakeup_code
45 jz 1f
46 lcall $0xc000,$3
47 movw %cs, %ax
48 movw %ax, %ds # Bios might have played with that
49 movw %ax, %ss
501:
51
52 testl $2, video_flags - wakeup_code
53 jz 1f
54 mov video_mode - wakeup_code, %ax
55 call mode_seta
561:
57
58 movw $0xb800, %ax
59 movw %ax,%fs
60 movw $0x0e00 + 'L', %fs:(0x10)
61
62 movb $0xa2, %al ; outb %al, $0x80
63
64 lidt %ds:idt_48a - wakeup_code
65 xorl %eax, %eax
66 movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
67 shll $4, %eax
68 addl $(gdta - wakeup_code), %eax
69 movl %eax, gdt_48a +2 - wakeup_code
70 lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
71 # appropriate
72
73 movl $1, %eax # protected mode (PE) bit
74 lmsw %ax # This is it!
75 jmp 1f
761:
77
78 .byte 0x66, 0xea # prefix + jmpi-opcode
79 .long wakeup_32 - __START_KERNEL_map
80 .word __KERNEL_CS
81
82 .code32
83wakeup_32:
84# Running in this code, but at low address; paging is not yet turned on.
85 movb $0xa5, %al ; outb %al, $0x80
86
87 /* Check if extended functions are implemented */
88 movl $0x80000000, %eax
89 cpuid
90 cmpl $0x80000000, %eax
91 jbe bogus_cpu
92 wbinvd
93 mov $0x80000001, %eax
94 cpuid
95 btl $29, %edx
96 jnc bogus_cpu
97 movl %edx,%edi
98
99 movw $__KERNEL_DS, %ax
100 movw %ax, %ds
101 movw %ax, %es
102 movw %ax, %fs
103 movw %ax, %gs
104
105 movw $__KERNEL_DS, %ax
106 movw %ax, %ss
107
108 mov $(wakeup_stack - __START_KERNEL_map), %esp
109 movl saved_magic - __START_KERNEL_map, %eax
110 cmpl $0x9abcdef0, %eax
111 jne bogus_32_magic
112
113 /*
114 * Prepare for entering 64bits mode
115 */
116
117 /* Enable PAE mode and PGE */
118 xorl %eax, %eax
119 btsl $5, %eax
120 btsl $7, %eax
121 movl %eax, %cr4
122
123 /* Setup early boot stage 4 level pagetables */
124 movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax
125 movl %eax, %cr3
126
127 /* Setup EFER (Extended Feature Enable Register) */
128 movl $MSR_EFER, %ecx
129 rdmsr
130 /* Fool rdmsr and reset %eax to avoid dependences */
131 xorl %eax, %eax
132 /* Enable Long Mode */
133 btsl $_EFER_LME, %eax
134 /* Enable System Call */
135 btsl $_EFER_SCE, %eax
136
137 /* No Execute supported? */
138 btl $20,%edi
139 jnc 1f
140 btsl $_EFER_NX, %eax
1411:
142
143 /* Make changes effective */
144 wrmsr
145 wbinvd
146
147 xorl %eax, %eax
148 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
149 btsl $0, %eax /* Enable protected mode */
150 btsl $1, %eax /* Enable MP */
151 btsl $4, %eax /* Enable ET */
152 btsl $5, %eax /* Enable NE */
153 btsl $16, %eax /* Enable WP */
154 btsl $18, %eax /* Enable AM */
155
156 /* Make changes effective */
157 movl %eax, %cr0
158 /* At this point:
159 CR4.PAE must be 1
160 CS.L must be 0
161 CR3 must point to PML4
162 Next instruction must be a branch
163 This must be on identity-mapped page
164 */
165 jmp reach_compatibility_mode
166reach_compatibility_mode:
167 movw $0x0e00 + 'i', %ds:(0xb8012)
168 movb $0xa8, %al ; outb %al, $0x80;
169
170 /*
171 * At this point we're in long mode but in 32bit compatibility mode
172 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
173 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load
174 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
175 */
176
177 movw $0x0e00 + 'n', %ds:(0xb8014)
178 movb $0xa9, %al ; outb %al, $0x80
179
180 /* Load new GDT with the 64bit segment using 32bit descriptor */
181 movl $(pGDT32 - __START_KERNEL_map), %eax
182 lgdt (%eax)
183
184 movl $(wakeup_jumpvector - __START_KERNEL_map), %eax
185 /* Finally jump in 64bit mode */
186 ljmp *(%eax)
187
188wakeup_jumpvector:
189 .long wakeup_long64 - __START_KERNEL_map
190 .word __KERNEL_CS
191
192.code64
193
194 /* Hooray, we are in Long 64-bit mode (but still running in low memory) */
195wakeup_long64:
196 /*
197 * We must switch to a new descriptor in kernel space for the GDT
198 * because soon the kernel won't have access anymore to the userspace
199 * addresses where we're currently running on. We have to do that here
200 * because in 32bit we couldn't load a 64bit linear address.
201 */
202 lgdt cpu_gdt_descr - __START_KERNEL_map
203
204 movw $0x0e00 + 'u', %ds:(0xb8016)
205
206 nop
207 nop
208 movw $__KERNEL_DS, %ax
209 movw %ax, %ss
210 movw %ax, %ds
211 movw %ax, %es
212 movw %ax, %fs
213 movw %ax, %gs
214 movq saved_esp, %rsp
215
216 movw $0x0e00 + 'x', %ds:(0xb8018)
217 movq saved_ebx, %rbx
218 movq saved_edi, %rdi
219 movq saved_esi, %rsi
220 movq saved_ebp, %rbp
221
222 movw $0x0e00 + '!', %ds:(0xb801a)
223 movq saved_eip, %rax
224 jmp *%rax
225
226.code32
227
228 .align 64
229gdta:
230 .word 0, 0, 0, 0 # dummy
231
232 .word 0, 0, 0, 0 # unused
233
234 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
235 .word 0 # base address = 0
236 .word 0x9B00 # code read/exec. ??? Why do I need 0x9B00 (as opposed to 0x9A00) for this to work?
237 .word 0x00CF # granularity = 4096, 386
238 # (+5th nibble of limit)
239
240 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
241 .word 0 # base address = 0
242 .word 0x9200 # data read/write
243 .word 0x00CF # granularity = 4096, 386
244 # (+5th nibble of limit)
245# this is 64bit descriptor for code
246 .word 0xFFFF
247 .word 0
248 .word 0x9A00 # code read/exec
249 .word 0x00AF # as above, but it is long mode and with D=0
250
251idt_48a:
252 .word 0 # idt limit = 0
253 .word 0, 0 # idt base = 0L
254
255gdt_48a:
256 .word 0x8000 # gdt limit=2048,
257 # 256 GDT entries
258 .word 0, 0 # gdt base (filled in later)
259
260
261real_save_gdt: .word 0
262 .quad 0
263real_magic: .quad 0
264video_mode: .quad 0
265video_flags: .quad 0
266
267bogus_real_magic:
268 movb $0xba,%al ; outb %al,$0x80
269 jmp bogus_real_magic
270
271bogus_32_magic:
272 movb $0xb3,%al ; outb %al,$0x80
273 jmp bogus_32_magic
274
275bogus_31_magic:
276 movb $0xb1,%al ; outb %al,$0x80
277 jmp bogus_31_magic
278
279bogus_cpu:
280 movb $0xbc,%al ; outb %al,$0x80
281 jmp bogus_cpu
282
283
284/* This code uses an extended set of video mode numbers. These include:
285 * Aliases for standard modes
286 * NORMAL_VGA (-1)
287 * EXTENDED_VGA (-2)
288 * ASK_VGA (-3)
289 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
290 * of compatibility when extending the table. These are between 0x00 and 0xff.
291 */
292#define VIDEO_FIRST_MENU 0x0000
293
294/* Standard BIOS video modes (BIOS number + 0x0100) */
295#define VIDEO_FIRST_BIOS 0x0100
296
297/* VESA BIOS video modes (VESA number + 0x0200) */
298#define VIDEO_FIRST_VESA 0x0200
299
300/* Video7 special modes (BIOS number + 0x0900) */
301#define VIDEO_FIRST_V7 0x0900
302
303# Setting of user mode (AX=mode ID) => CF=success
304mode_seta:
305 movw %ax, %bx
306#if 0
307 cmpb $0xff, %ah
308 jz setalias
309
310 testb $VIDEO_RECALC>>8, %ah
311 jnz _setrec
312
313 cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
314 jnc setres
315
316 cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
317 jz setspc
318
319 cmpb $VIDEO_FIRST_V7>>8, %ah
320 jz setv7
321#endif
322
323 cmpb $VIDEO_FIRST_VESA>>8, %ah
324 jnc check_vesaa
325#if 0
326 orb %ah, %ah
327 jz setmenu
328#endif
329
330 decb %ah
331# jz setbios Add bios modes later
332
333setbada: clc
334 ret
335
336check_vesaa:
337 subb $VIDEO_FIRST_VESA>>8, %bh
338 orw $0x4000, %bx # Use linear frame buffer
339 movw $0x4f02, %ax # VESA BIOS mode set call
340 int $0x10
341 cmpw $0x004f, %ax # AL=4f if implemented
342 jnz _setbada # AH=0 if OK
343
344 stc
345 ret
346
347_setbada: jmp setbada
348
349 .code64
350bogus_magic:
351 movw $0x0e00 + 'B', %ds:(0xb8018)
352 jmp bogus_magic
353
354bogus_magic2:
355 movw $0x0e00 + '2', %ds:(0xb8018)
356 jmp bogus_magic2
357
358
359wakeup_stack_begin: # Stack grows down
360
361.org 0xff0
362wakeup_stack: # Just below end of page
363
364ENTRY(wakeup_end)
365
366##
367# acpi_copy_wakeup_routine
368#
369# Copy the above routine to low memory.
370#
371# Parameters:
372# %rdi: place to copy wakeup routine to
373#
374# Returned address is location of code in low memory (past data and stack)
375#
376ENTRY(acpi_copy_wakeup_routine)
377 pushq %rax
378 pushq %rcx
379 pushq %rdx
380
381 sgdt saved_gdt
382 sidt saved_idt
383 sldt saved_ldt
384 str saved_tss
385
386 movq %cr3, %rdx
387 movq %rdx, saved_cr3
388 movq %cr4, %rdx
389 movq %rdx, saved_cr4
390 movq %cr0, %rdx
391 movq %rdx, saved_cr0
392 sgdt real_save_gdt - wakeup_start (,%rdi)
393 movl $MSR_EFER, %ecx
394 rdmsr
395 movl %eax, saved_efer
396 movl %edx, saved_efer2
397
398 movl saved_video_mode, %edx
399 movl %edx, video_mode - wakeup_start (,%rdi)
400 movl acpi_video_flags, %edx
401 movl %edx, video_flags - wakeup_start (,%rdi)
402 movq $0x12345678, real_magic - wakeup_start (,%rdi)
403 movq $0x123456789abcdef0, %rdx
404 movq %rdx, saved_magic
405
406 movl saved_magic - __START_KERNEL_map, %eax
407 cmpl $0x9abcdef0, %eax
408 jne bogus_32_magic
409
410 # make sure %cr4 is set correctly (features, etc)
411 movl saved_cr4 - __START_KERNEL_map, %eax
412 movq %rax, %cr4
413
414 movl saved_cr0 - __START_KERNEL_map, %eax
415 movq %rax, %cr0
416 jmp 1f # Flush pipelines
4171:
418 # restore the regs we used
419 popq %rdx
420 popq %rcx
421 popq %rax
422ENTRY(do_suspend_lowlevel_s4bios)
423 ret
424
425 .align 2
426 .p2align 4,,15
427.globl do_suspend_lowlevel
428 .type do_suspend_lowlevel,@function
429do_suspend_lowlevel:
430.LFB5:
431 subq $8, %rsp
432 xorl %eax, %eax
433 call save_processor_state
434
435 movq %rsp, saved_context_esp(%rip)
436 movq %rax, saved_context_eax(%rip)
437 movq %rbx, saved_context_ebx(%rip)
438 movq %rcx, saved_context_ecx(%rip)
439 movq %rdx, saved_context_edx(%rip)
440 movq %rbp, saved_context_ebp(%rip)
441 movq %rsi, saved_context_esi(%rip)
442 movq %rdi, saved_context_edi(%rip)
443 movq %r8, saved_context_r08(%rip)
444 movq %r9, saved_context_r09(%rip)
445 movq %r10, saved_context_r10(%rip)
446 movq %r11, saved_context_r11(%rip)
447 movq %r12, saved_context_r12(%rip)
448 movq %r13, saved_context_r13(%rip)
449 movq %r14, saved_context_r14(%rip)
450 movq %r15, saved_context_r15(%rip)
451 pushfq ; popq saved_context_eflags(%rip)
452
453 movq $.L97, saved_eip(%rip)
454
455 movq %rsp,saved_esp
456 movq %rbp,saved_ebp
457 movq %rbx,saved_ebx
458 movq %rdi,saved_edi
459 movq %rsi,saved_esi
460
461 addq $8, %rsp
462 movl $3, %edi
463 xorl %eax, %eax
464 jmp acpi_enter_sleep_state
465.L97:
466 .p2align 4,,7
467.L99:
468 .align 4
469 movl $24, %eax
470 movw %ax, %ds
471 movq saved_context+58(%rip), %rax
472 movq %rax, %cr4
473 movq saved_context+50(%rip), %rax
474 movq %rax, %cr3
475 movq saved_context+42(%rip), %rax
476 movq %rax, %cr2
477 movq saved_context+34(%rip), %rax
478 movq %rax, %cr0
479 pushq saved_context_eflags(%rip) ; popfq
480 movq saved_context_esp(%rip), %rsp
481 movq saved_context_ebp(%rip), %rbp
482 movq saved_context_eax(%rip), %rax
483 movq saved_context_ebx(%rip), %rbx
484 movq saved_context_ecx(%rip), %rcx
485 movq saved_context_edx(%rip), %rdx
486 movq saved_context_esi(%rip), %rsi
487 movq saved_context_edi(%rip), %rdi
488 movq saved_context_r08(%rip), %r8
489 movq saved_context_r09(%rip), %r9
490 movq saved_context_r10(%rip), %r10
491 movq saved_context_r11(%rip), %r11
492 movq saved_context_r12(%rip), %r12
493 movq saved_context_r13(%rip), %r13
494 movq saved_context_r14(%rip), %r14
495 movq saved_context_r15(%rip), %r15
496
497 xorl %eax, %eax
498 addq $8, %rsp
499 jmp restore_processor_state
500.LFE5:
501.Lfe5:
502 .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
503
504.data
505ALIGN
506ENTRY(saved_ebp) .quad 0
507ENTRY(saved_esi) .quad 0
508ENTRY(saved_edi) .quad 0
509ENTRY(saved_ebx) .quad 0
510
511ENTRY(saved_eip) .quad 0
512ENTRY(saved_esp) .quad 0
513
514ENTRY(saved_magic) .quad 0
515
516ALIGN
517# saved registers
518saved_gdt: .quad 0,0
519saved_idt: .quad 0,0
520saved_ldt: .quad 0
521saved_tss: .quad 0
522
523saved_cr0: .quad 0
524saved_cr3: .quad 0
525saved_cr4: .quad 0
526saved_efer: .quad 0
527saved_efer2: .quad 0
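As a side note on the long-mode re-entry sequence in wakeup_32 above, these are the control-register and EFER bits that the btsl instructions toggle, collected as C constants. The macro names are ad-hoc shorthand, not taken from the patch; the values mirror the assembly and the architectural register layouts.

#define CR4_PAE  (1u << 5)    /* btsl $5  - physical address extension */
#define CR4_PGE  (1u << 7)    /* btsl $7  - global pages */

#define EFER_SCE (1u << 0)    /* _EFER_SCE - syscall/sysret enable */
#define EFER_LME (1u << 8)    /* _EFER_LME - long mode enable */
#define EFER_NX  (1u << 11)   /* _EFER_NX  - set only if CPUID 0x80000001 EDX bit 20 is set */

#define CR0_PE   (1u << 0)    /* protected mode */
#define CR0_MP   (1u << 1)    /* monitor coprocessor */
#define CR0_ET   (1u << 4)    /* extension type */
#define CR0_NE   (1u << 5)    /* numeric error */
#define CR0_WP   (1u << 16)   /* write protect */
#define CR0_AM   (1u << 18)   /* alignment mask */
#define CR0_PG   (1u << 31)   /* paging - activates long mode once EFER.LME is set */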
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
new file mode 100644
index 000000000000..4baa99fe1e5c
--- /dev/null
+++ b/arch/x86_64/kernel/aperture.c
@@ -0,0 +1,286 @@
1/*
2 * Firmware replacement code.
3 *
4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge.
6 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
12 */
13#include <linux/config.h>
14#include <linux/kernel.h>
15#include <linux/types.h>
16#include <linux/init.h>
17#include <linux/bootmem.h>
18#include <linux/mmzone.h>
19#include <linux/pci_ids.h>
20#include <linux/pci.h>
21#include <linux/bitops.h>
22#include <asm/e820.h>
23#include <asm/io.h>
24#include <asm/proto.h>
25#include <asm/pci-direct.h>
26
27int iommu_aperture;
28int iommu_aperture_disabled __initdata = 0;
29int iommu_aperture_allowed __initdata = 0;
30
31int fallback_aper_order __initdata = 1; /* 64MB */
32int fallback_aper_force __initdata = 0;
33
34int fix_aperture __initdata = 1;
35
36/* This code runs before the PCI subsystem is initialized, so just
37 access the northbridge directly. */
38
39#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
40
41static u32 __init allocate_aperture(void)
42{
43#ifdef CONFIG_DISCONTIGMEM
44 pg_data_t *nd0 = NODE_DATA(0);
45#else
46 pg_data_t *nd0 = &contig_page_data;
47#endif
48 u32 aper_size;
49 void *p;
50
51 if (fallback_aper_order > 7)
52 fallback_aper_order = 7;
53 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
54
55 /*
56 * Aperture has to be naturally aligned. This means a 2GB aperture won't
57 * have much chance of finding a place in the lower 4GB of memory.
58 * Unfortunately we cannot move it up because that would make the
59 * IOMMU useless.
60 */
61 p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0);
62 if (!p || __pa(p)+aper_size > 0xffffffff) {
63 printk("Cannot allocate aperture memory hole (%p,%uK)\n",
64 p, aper_size>>10);
65 if (p)
66 free_bootmem_node(nd0, (unsigned long)p, aper_size);
67 return 0;
68 }
69 printk("Mapping aperture over %d KB of RAM @ %lx\n",
70 aper_size >> 10, __pa(p));
71 return (u32)__pa(p);
72}
73
74static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
75{
76 if (!aper_base)
77 return 0;
78 if (aper_size < 64*1024*1024) {
79 printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20);
80 return 0;
81 }
82 if (aper_base + aper_size >= 0xffffffff) {
83 printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
84 return 0;
85 }
86 if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
87 printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
88 return 0;
89 }
90 return 1;
91}
92
93/* Find a PCI capability */
94static __u32 __init find_cap(int num, int slot, int func, int cap)
95{
96 u8 pos;
97 int bytes;
98 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
99 return 0;
100 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
101 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
102 u8 id;
103 pos &= ~3;
104 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
105 if (id == 0xff)
106 break;
107 if (id == cap)
108 return pos;
109 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
110 }
111 return 0;
112}
113
114/* Read a standard AGPv3 bridge header */
115static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
116{
117 u32 apsize;
118 u32 apsizereg;
119 int nbits;
120 u32 aper_low, aper_hi;
121 u64 aper;
122
123 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
124 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
125 if (apsizereg == 0xffffffff) {
126 printk("APSIZE in AGP bridge unreadable\n");
127 return 0;
128 }
129
130 apsize = apsizereg & 0xfff;
131 /* Some BIOS use weird encodings not in the AGPv3 table. */
132 if (apsize & 0xff)
133 apsize |= 0xf00;
134 nbits = hweight16(apsize);
135 *order = 7 - nbits;
136 if ((int)*order < 0) /* < 32MB */
137 *order = 0;
138
139 aper_low = read_pci_config(num,slot,func, 0x10);
140 aper_hi = read_pci_config(num,slot,func,0x14);
141 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
142
143 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
144 aper, 32 << *order, apsizereg);
145
146 if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order))
147 return 0;
148 return (u32)aper;
149}
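/*
 * Worked example of the order arithmetic in read_agp() above, with an
 * invented APSIZE value: if the register reads back 0xf00, apsize & 0xff == 0
 * so no fixup is applied, hweight16(0xf00) == 4, *order = 7 - 4 = 3, and the
 * aperture ends up sized 32MB << 3 = 256MB.
 */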
150
151/* Look for an AGP bridge. Windows only expects the aperture in the
152 AGP bridge and some BIOSes forget to initialize the Northbridge too.
153 Work around this here.
154
155 Do a PCI bus scan by hand because we're running before the PCI
156 subsystem.
157
158 All K8 AGP bridges are AGPv3 compliant, so we can do this scan
159 generically. It's probably overkill to always scan all slots because
160 the AGP bridges should always be on their own bus in the HT hierarchy,
161 but do it here for future safety. */
162static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
163{
164 int num, slot, func;
165
166 /* Poor man's PCI discovery */
167 for (num = 0; num < 32; num++) {
168 for (slot = 0; slot < 32; slot++) {
169 for (func = 0; func < 8; func++) {
170 u32 class, cap;
171 u8 type;
172 class = read_pci_config(num,slot,func,
173 PCI_CLASS_REVISION);
174 if (class == 0xffffffff)
175 break;
176
177 switch (class >> 16) {
178 case PCI_CLASS_BRIDGE_HOST:
179 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
180 /* AGP bridge? */
181 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
182 if (!cap)
183 break;
184 *valid_agp = 1;
185 return read_agp(num,slot,func,cap,order);
186 }
187
188 /* No multi-function device? */
189 type = read_pci_config_byte(num,slot,func,
190 PCI_HEADER_TYPE);
191 if (!(type & 0x80))
192 break;
193 }
194 }
195 }
196 printk("No AGP bridge found\n");
197 return 0;
198}
199
200void __init iommu_hole_init(void)
201{
202 int fix, num;
203 u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
204 u64 aper_base, last_aper_base = 0;
205 int valid_agp = 0;
206
207 if (iommu_aperture_disabled || !fix_aperture)
208 return;
209
210 printk("Checking aperture...\n");
211
212 fix = 0;
213 for (num = 24; num < 32; num++) {
214 char name[30];
215 if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
216 continue;
217
218 iommu_aperture = 1;
219
220 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
221 aper_size = (32 * 1024 * 1024) << aper_order;
222 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
223 aper_base <<= 25;
224
225 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
226 aper_base, aper_size>>20);
227
228 sprintf(name, "northbridge cpu %d", num-24);
229
230 if (!aperture_valid(name, aper_base, aper_size)) {
231 fix = 1;
232 break;
233 }
234
235 if ((last_aper_order && aper_order != last_aper_order) ||
236 (last_aper_base && aper_base != last_aper_base)) {
237 fix = 1;
238 break;
239 }
240 last_aper_order = aper_order;
241 last_aper_base = aper_base;
242 }
243
244 if (!fix && !fallback_aper_force)
245 return;
246
247 if (!fallback_aper_force)
248 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
249
250 if (aper_alloc) {
251 /* Got the aperture from the AGP bridge */
252 } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) ||
253 force_iommu ||
254 valid_agp ||
255 fallback_aper_force) {
256 printk("Your BIOS doesn't leave an aperture memory hole\n");
257 printk("Please enable the IOMMU option in the BIOS setup\n");
258 printk("This costs you %d MB of RAM\n",
259 32 << fallback_aper_order);
260
261 aper_order = fallback_aper_order;
262 aper_alloc = allocate_aperture();
263 if (!aper_alloc) {
264 /* Could disable AGP and IOMMU here, but it's probably
265 not worth it. But the later users cannot deal with
266 bad apertures and turning on the aperture over memory
267 causes very strange problems, so it's better to
268 panic early. */
269 panic("Not enough memory for aperture");
270 }
271 } else {
272 return;
273 }
274
275 /* Fix up the north bridges */
276 for (num = 24; num < 32; num++) {
277 if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
278 continue;
279
280 /* Don't enable translation yet. That is done later.
281 Assume this BIOS didn't initialise the GART so
282 just overwrite all previous bits */
283 write_pci_config(0, num, 3, 0x90, aper_order<<1);
284 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
285 }
286}
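A short sketch of the northbridge aperture decoding that iommu_hole_init() performs above: config register 0x90 carries the order (size = 32MB << order) and register 0x94 carries address bits 39..25 of the base. The register values below are invented for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t reg90 = 0x0002;    /* hypothetical GART aperture control value */
        uint32_t reg94 = 0x0040;    /* hypothetical GART aperture base value */

        uint32_t order = (reg90 >> 1) & 7;
        uint32_t size  = (32u * 1024 * 1024) << order;      /* 64 MB */
        uint64_t base  = (uint64_t)(reg94 & 0x7fff) << 25;  /* 0x80000000 */

        printf("aperture @ %#llx size %u MB\n",
               (unsigned long long)base, size >> 20);
        return 0;
}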
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
new file mode 100644
index 000000000000..9826f03dc9a3
--- /dev/null
+++ b/arch/x86_64/kernel/apic.c
@@ -0,0 +1,1088 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/config.h>
18#include <linux/init.h>
19
20#include <linux/mm.h>
21#include <linux/irq.h>
22#include <linux/delay.h>
23#include <linux/bootmem.h>
24#include <linux/smp_lock.h>
25#include <linux/interrupt.h>
26#include <linux/mc146818rtc.h>
27#include <linux/kernel_stat.h>
28#include <linux/sysdev.h>
29
30#include <asm/atomic.h>
31#include <asm/smp.h>
32#include <asm/mtrr.h>
33#include <asm/mpspec.h>
34#include <asm/pgalloc.h>
35#include <asm/mach_apic.h>
36
37int apic_verbosity;
38
39int disable_apic_timer __initdata;
40
41/* Using APIC to generate smp_local_timer_interrupt? */
42int using_apic_timer = 0;
43
44static DEFINE_PER_CPU(int, prof_multiplier) = 1;
45static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
46static DEFINE_PER_CPU(int, prof_counter) = 1;
47
48static void apic_pm_activate(void);
49
50void enable_NMI_through_LVT0 (void * dummy)
51{
52 unsigned int v, ver;
53
54 ver = apic_read(APIC_LVR);
55 ver = GET_APIC_VERSION(ver);
56 v = APIC_DM_NMI; /* unmask and set to NMI */
57 apic_write_around(APIC_LVT0, v);
58}
59
60int get_maxlvt(void)
61{
62 unsigned int v, ver, maxlvt;
63
64 v = apic_read(APIC_LVR);
65 ver = GET_APIC_VERSION(v);
66 maxlvt = GET_APIC_MAXLVT(v);
67 return maxlvt;
68}
69
70void clear_local_APIC(void)
71{
72 int maxlvt;
73 unsigned int v;
74
75 maxlvt = get_maxlvt();
76
77 /*
78 * Masking an LVT entry on a P6 can trigger a local APIC error
79 * if the vector is zero. Mask LVTERR first to prevent this.
80 */
81 if (maxlvt >= 3) {
82 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
83 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
84 }
85 /*
86 * Careful: we have to set masks only first to deassert
87 * any level-triggered sources.
88 */
89 v = apic_read(APIC_LVTT);
90 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
91 v = apic_read(APIC_LVT0);
92 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
93 v = apic_read(APIC_LVT1);
94 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
95 if (maxlvt >= 4) {
96 v = apic_read(APIC_LVTPC);
97 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
98 }
99
100 /*
101 * Clean APIC state for other OSs:
102 */
103 apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
104 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
105 apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
106 if (maxlvt >= 3)
107 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
108 if (maxlvt >= 4)
109 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
110 v = GET_APIC_VERSION(apic_read(APIC_LVR));
111 if (APIC_INTEGRATED(v)) { /* !82489DX */
112 if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */
113 apic_write(APIC_ESR, 0);
114 apic_read(APIC_ESR);
115 }
116}
117
118void __init connect_bsp_APIC(void)
119{
120 if (pic_mode) {
121 /*
122 * Do not trust the local APIC being empty at bootup.
123 */
124 clear_local_APIC();
125 /*
126 * PIC mode, enable APIC mode in the IMCR, i.e.
127 * connect BSP's local APIC to INT and NMI lines.
128 */
129 apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
130 outb(0x70, 0x22);
131 outb(0x01, 0x23);
132 }
133}
134
135void disconnect_bsp_APIC(void)
136{
137 if (pic_mode) {
138 /*
139 * Put the board back into PIC mode (has an effect
140 * only on certain older boards). Note that APIC
141 * interrupts, including IPIs, won't work beyond
142 * this point! The only exception are INIT IPIs.
143 */
144 apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n");
145 outb(0x70, 0x22);
146 outb(0x00, 0x23);
147 }
148}
149
150void disable_local_APIC(void)
151{
152 unsigned int value;
153
154 clear_local_APIC();
155
156 /*
157 * Disable APIC (implies clearing of registers
158 * for 82489DX!).
159 */
160 value = apic_read(APIC_SPIV);
161 value &= ~APIC_SPIV_APIC_ENABLED;
162 apic_write_around(APIC_SPIV, value);
163}
164
165/*
166 * This is to verify that we're looking at a real local APIC.
167 * Check these against your board if the CPUs aren't getting
168 * started for no apparent reason.
169 */
170int __init verify_local_APIC(void)
171{
172 unsigned int reg0, reg1;
173
174 /*
175 * The version register is read-only in a real APIC.
176 */
177 reg0 = apic_read(APIC_LVR);
178 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
179 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
180 reg1 = apic_read(APIC_LVR);
181 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
182
183 /*
184 * The two version reads above should print the same
185 * numbers. If the second one is different, then we
186 * poke at a non-APIC.
187 */
188 if (reg1 != reg0)
189 return 0;
190
191 /*
192 * Check if the version looks reasonable.
193 */
194 reg1 = GET_APIC_VERSION(reg0);
195 if (reg1 == 0x00 || reg1 == 0xff)
196 return 0;
197 reg1 = get_maxlvt();
198 if (reg1 < 0x02 || reg1 == 0xff)
199 return 0;
200
201 /*
202 * The ID register is read/write in a real APIC.
203 */
204 reg0 = apic_read(APIC_ID);
205 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
206 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
207 reg1 = apic_read(APIC_ID);
208 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
209 apic_write(APIC_ID, reg0);
210 if (reg1 != (reg0 ^ APIC_ID_MASK))
211 return 0;
212
213 /*
214 * The next two are just to see if we have sane values.
215 * They're only really relevant if we're in Virtual Wire
216 * compatibility mode, but most boxes are anymore.
217 */
218 reg0 = apic_read(APIC_LVT0);
219 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
220 reg1 = apic_read(APIC_LVT1);
221 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
222
223 return 1;
224}
225
226void __init sync_Arb_IDs(void)
227{
228 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
229 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
230 if (ver >= 0x14) /* P4 or higher */
231 return;
232
233 /*
234 * Wait for idle.
235 */
236 apic_wait_icr_idle();
237
238 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
239 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
240 | APIC_DM_INIT);
241}
242
243extern void __error_in_apic_c (void);
244
245/*
246 * An initial setup of the virtual wire mode.
247 */
248void __init init_bsp_APIC(void)
249{
250 unsigned int value, ver;
251
252 /*
253 * Don't do the setup now if we have a SMP BIOS as the
254 * through-I/O-APIC virtual wire mode might be active.
255 */
256 if (smp_found_config || !cpu_has_apic)
257 return;
258
259 value = apic_read(APIC_LVR);
260 ver = GET_APIC_VERSION(value);
261
262 /*
263 * Do not trust the local APIC being empty at bootup.
264 */
265 clear_local_APIC();
266
267 /*
268 * Enable APIC.
269 */
270 value = apic_read(APIC_SPIV);
271 value &= ~APIC_VECTOR_MASK;
272 value |= APIC_SPIV_APIC_ENABLED;
273 value |= APIC_SPIV_FOCUS_DISABLED;
274 value |= SPURIOUS_APIC_VECTOR;
275 apic_write_around(APIC_SPIV, value);
276
277 /*
278 * Set up the virtual wire mode.
279 */
280 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
281 value = APIC_DM_NMI;
282 if (!APIC_INTEGRATED(ver)) /* 82489DX */
283 value |= APIC_LVT_LEVEL_TRIGGER;
284 apic_write_around(APIC_LVT1, value);
285}
286
287void __init setup_local_APIC (void)
288{
289 unsigned int value, ver, maxlvt;
290
291 /* Pound the ESR really hard over the head with a big hammer - mbligh */
292 if (esr_disable) {
293 apic_write(APIC_ESR, 0);
294 apic_write(APIC_ESR, 0);
295 apic_write(APIC_ESR, 0);
296 apic_write(APIC_ESR, 0);
297 }
298
299 value = apic_read(APIC_LVR);
300 ver = GET_APIC_VERSION(value);
301
302 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
303 __error_in_apic_c();
304
305 /*
306 * Double-check whether this APIC is really registered.
307 * This is meaningless in clustered apic mode, so we skip it.
308 */
309 if (!apic_id_registered())
310 BUG();
311
312 /*
313 * Intel recommends to set DFR, LDR and TPR before enabling
314 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
315 * document number 292116). So here it goes...
316 */
317 init_apic_ldr();
318
319 /*
320 * Set Task Priority to 'accept all'. We never change this
321 * later on.
322 */
323 value = apic_read(APIC_TASKPRI);
324 value &= ~APIC_TPRI_MASK;
325 apic_write_around(APIC_TASKPRI, value);
326
327 /*
328 * Now that we are all set up, enable the APIC
329 */
330 value = apic_read(APIC_SPIV);
331 value &= ~APIC_VECTOR_MASK;
332 /*
333 * Enable APIC
334 */
335 value |= APIC_SPIV_APIC_ENABLED;
336
337 /*
338 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
339 * certain networking cards. If high frequency interrupts are
340 * happening on a particular IOAPIC pin, plus the IOAPIC routing
341 * entry is masked/unmasked at a high rate as well then sooner or
342 * later IOAPIC line gets 'stuck', no more interrupts are received
343 * from the device. If focus CPU is disabled then the hang goes
344 * away, oh well :-(
345 *
346 * [ This bug can be reproduced easily with level-triggered
347 * PCI Ne2000 networking cards and PII/PIII processors, dual
348 * BX chipset. ]
349 */
350 /*
351 * Actually disabling the focus CPU check just makes the hang less
352 * frequent as it makes the interrupt distribution model more
353 * like LRU than MRU (the short-term load is more even across CPUs).
354 * See also the comment in end_level_ioapic_irq(). --macro
355 */
356#if 1
357 /* Enable focus processor (bit==0) */
358 value &= ~APIC_SPIV_FOCUS_DISABLED;
359#else
360 /* Disable focus processor (bit==1) */
361 value |= APIC_SPIV_FOCUS_DISABLED;
362#endif
363 /*
364 * Set spurious IRQ vector
365 */
366 value |= SPURIOUS_APIC_VECTOR;
367 apic_write_around(APIC_SPIV, value);
368
369 /*
370 * Set up LVT0, LVT1:
371 *
372 * set up through-local-APIC on the BP's LINT0. This is not
373 * strictly necessary in pure symmetric-IO mode, but sometimes
374 * we delegate interrupts to the 8259A.
375 */
376 /*
377 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
378 */
379 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
380 if (!smp_processor_id() && (pic_mode || !value)) {
381 value = APIC_DM_EXTINT;
382 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
383 } else {
384 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
385 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
386 }
387 apic_write_around(APIC_LVT0, value);
388
389 /*
390 * only the BP should see the LINT1 NMI signal, obviously.
391 */
392 if (!smp_processor_id())
393 value = APIC_DM_NMI;
394 else
395 value = APIC_DM_NMI | APIC_LVT_MASKED;
396 if (!APIC_INTEGRATED(ver)) /* 82489DX */
397 value |= APIC_LVT_LEVEL_TRIGGER;
398 apic_write_around(APIC_LVT1, value);
399
400 if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
401 unsigned oldvalue;
402 maxlvt = get_maxlvt();
403 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
404 apic_write(APIC_ESR, 0);
405 oldvalue = apic_read(APIC_ESR);
406 value = ERROR_APIC_VECTOR; // enables sending errors
407 apic_write_around(APIC_LVTERR, value);
408 /*
409 * spec says clear errors after enabling vector.
410 */
411 if (maxlvt > 3)
412 apic_write(APIC_ESR, 0);
413 value = apic_read(APIC_ESR);
414 if (value != oldvalue)
415 apic_printk(APIC_VERBOSE,
416 "ESR value after enabling vector: %08x, after %08x\n",
417 oldvalue, value);
418 } else {
419 if (esr_disable)
420 /*
421 * Something untraceable is creating bad interrupts on
422 * secondary quads ... for the moment, just leave the
423 * ESR disabled - we can't do anything useful with the
424 * errors anyway - mbligh
425 */
426 apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n");
427 else
428 apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n");
429 }
430
431 nmi_watchdog_default();
432 if (nmi_watchdog == NMI_LOCAL_APIC)
433 setup_apic_nmi_watchdog();
434 apic_pm_activate();
435}
436
437#ifdef CONFIG_PM
438
439static struct {
440 /* 'active' is true if the local APIC was enabled by us and
441 not the BIOS; this signifies that we are also responsible
442 for disabling it before entering apm/acpi suspend */
443 int active;
444 /* r/w apic fields */
445 unsigned int apic_id;
446 unsigned int apic_taskpri;
447 unsigned int apic_ldr;
448 unsigned int apic_dfr;
449 unsigned int apic_spiv;
450 unsigned int apic_lvtt;
451 unsigned int apic_lvtpc;
452 unsigned int apic_lvt0;
453 unsigned int apic_lvt1;
454 unsigned int apic_lvterr;
455 unsigned int apic_tmict;
456 unsigned int apic_tdcr;
457 unsigned int apic_thmr;
458} apic_pm_state;
459
460static int lapic_suspend(struct sys_device *dev, u32 state)
461{
462 unsigned long flags;
463
464 if (!apic_pm_state.active)
465 return 0;
466
467 apic_pm_state.apic_id = apic_read(APIC_ID);
468 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
469 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
470 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
471 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
472 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
473 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
474 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
475 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
476 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
477 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
478 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
479 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
480 local_save_flags(flags);
481 local_irq_disable();
482 disable_local_APIC();
483 local_irq_restore(flags);
484 return 0;
485}
486
487static int lapic_resume(struct sys_device *dev)
488{
489 unsigned int l, h;
490 unsigned long flags;
491
492 if (!apic_pm_state.active)
493 return 0;
494
495 /* XXX: Pavel needs this for S3 resume, but can't explain why */
496 set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
497
498 local_irq_save(flags);
499 rdmsr(MSR_IA32_APICBASE, l, h);
500 l &= ~MSR_IA32_APICBASE_BASE;
501 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
502 wrmsr(MSR_IA32_APICBASE, l, h);
503 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
504 apic_write(APIC_ID, apic_pm_state.apic_id);
505 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
506 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
507 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
508 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
509 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
510 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
511 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
512 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
513 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
514 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
515 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
516 apic_write(APIC_ESR, 0);
517 apic_read(APIC_ESR);
518 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
519 apic_write(APIC_ESR, 0);
520 apic_read(APIC_ESR);
521 local_irq_restore(flags);
522 return 0;
523}
524
525static struct sysdev_class lapic_sysclass = {
526 set_kset_name("lapic"),
527 .resume = lapic_resume,
528 .suspend = lapic_suspend,
529};
530
531static struct sys_device device_lapic = {
532 .id = 0,
533 .cls = &lapic_sysclass,
534};
535
536static void __init apic_pm_activate(void)
537{
538 apic_pm_state.active = 1;
539}
540
541static int __init init_lapic_sysfs(void)
542{
543 int error;
544 if (!cpu_has_apic)
545 return 0;
546 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
547 error = sysdev_class_register(&lapic_sysclass);
548 if (!error)
549 error = sysdev_register(&device_lapic);
550 return error;
551}
552device_initcall(init_lapic_sysfs);
553
554#else /* CONFIG_PM */
555
556static void apic_pm_activate(void) { }
557
558#endif /* CONFIG_PM */
559
560static int __init apic_set_verbosity(char *str)
561{
562 if (strcmp("debug", str) == 0)
563 apic_verbosity = APIC_DEBUG;
564 else if (strcmp("verbose", str) == 0)
565 apic_verbosity = APIC_VERBOSE;
566 else
567 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
568 " use apic=verbose or apic=debug", str);
569
570 return 0;
571}
572
573__setup("apic=", apic_set_verbosity);
574
575/*
576 * Detect and enable local APICs on non-SMP boards.
577 * Original code written by Keir Fraser.
578 * On AMD64 we trust the BIOS - if it says no APIC it is likely
579 * not correctly set up (usually the APIC timer won't work etc.)
580 */
581
582static int __init detect_init_APIC (void)
583{
584 if (!cpu_has_apic) {
585 printk(KERN_INFO "No local APIC present\n");
586 return -1;
587 }
588
589 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
590 boot_cpu_id = 0;
591 return 0;
592}
593
594void __init init_apic_mappings(void)
595{
596 unsigned long apic_phys;
597
598 /*
599 * If no local APIC can be found then set up a fake all
600 * zeroes page to simulate the local APIC and another
601 * one for the IO-APIC.
602 */
603 if (!smp_found_config && detect_init_APIC()) {
604 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
605 apic_phys = __pa(apic_phys);
606 } else
607 apic_phys = mp_lapic_addr;
608
609 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
610 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
611
612 /*
613 * Fetch the APIC ID of the BSP in case we have a
614 * default configuration (or the MP table is broken).
615 */
616 if (boot_cpu_id == -1U)
617 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
618
619#ifdef CONFIG_X86_IO_APIC
620 {
621 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
622 int i;
623
624 for (i = 0; i < nr_ioapics; i++) {
625 if (smp_found_config) {
626 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
627 } else {
628 ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
629 ioapic_phys = __pa(ioapic_phys);
630 }
631 set_fixmap_nocache(idx, ioapic_phys);
632 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
633 __fix_to_virt(idx), ioapic_phys);
634 idx++;
635 }
636 }
637#endif
638}
639
640/*
641 * This function sets up the local APIC timer, with a timeout of
642 * 'clocks' APIC bus clock. During calibration we actually call
643 * this function twice on the boot CPU, once with a bogus timeout
644 * value, second time for real. The other (noncalibrating) CPUs
645 * call this function only once, with the real, calibrated value.
646 *
647 * We do reads before writes even if unnecessary, to get around the
648 * P5 APIC double write bug.
649 */
650
651#define APIC_DIVISOR 16
652
653static void __setup_APIC_LVTT(unsigned int clocks)
654{
655 unsigned int lvtt_value, tmp_value, ver;
656
657 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
658 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
659 if (!APIC_INTEGRATED(ver))
660 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
661 apic_write_around(APIC_LVTT, lvtt_value);
662
663 /*
664 * Divide PICLK by 16
665 */
666 tmp_value = apic_read(APIC_TDCR);
667 apic_write_around(APIC_TDCR, (tmp_value
668 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
669 | APIC_TDR_DIV_16);
670
671 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
672}
673
674static void setup_APIC_timer(unsigned int clocks)
675{
676 unsigned long flags;
677
678 local_irq_save(flags);
679
680 /* For some reason this doesn't work on Simics, so fake it for now */
681 if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) {
682 __setup_APIC_LVTT(clocks);
683 return;
684 }
685
686 /* wait for irq slice */
687 if (vxtime.hpet_address) {
688 int trigger = hpet_readl(HPET_T0_CMP);
689 while (hpet_readl(HPET_COUNTER) >= trigger)
690 /* do nothing */ ;
691 while (hpet_readl(HPET_COUNTER) < trigger)
692 /* do nothing */ ;
693 } else {
694 int c1, c2;
695 outb_p(0x00, 0x43);
696 c2 = inb_p(0x40);
697 c2 |= inb_p(0x40) << 8;
698 do {
699 c1 = c2;
700 outb_p(0x00, 0x43);
701 c2 = inb_p(0x40);
702 c2 |= inb_p(0x40) << 8;
703 } while (c2 - c1 < 300);
704 }
705
706 __setup_APIC_LVTT(clocks);
707
708 local_irq_restore(flags);
709}
710
711/*
712 * In this function we calibrate APIC bus clocks to the external
713 * timer. Unfortunately we cannot use jiffies and the timer irq
714 * to calibrate, since some later bootup code depends on getting
715 * the first irq? Ugh.
716 *
717 * We want to do the calibration only once since we
718 * want to have the local timer irqs in sync. CPUs connected
719 * by the same APIC bus have the very same bus frequency.
720 * And we want to have irqs off anyways, no accidental
721 * APIC irq that way.
722 */
723
724#define TICK_COUNT 100000000
725
726static int __init calibrate_APIC_clock(void)
727{
728 int apic, apic_start, tsc, tsc_start;
729 int result;
730 /*
731 * Put whatever arbitrary (but long enough) timeout
732 * value into the APIC clock, we just want to get the
733 * counter running for calibration.
734 */
735 __setup_APIC_LVTT(1000000000);
736
737 apic_start = apic_read(APIC_TMCCT);
738 rdtscl(tsc_start);
739
740 do {
741 apic = apic_read(APIC_TMCCT);
742 rdtscl(tsc);
743 } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT);
744
745 result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start);
746
747 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
748 result / 1000 / 1000, result / 1000 % 1000);
749
750 return result * APIC_DIVISOR / HZ;
751}
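/*
 * Worked example of the calibration arithmetic above, with invented numbers:
 * take cpu_khz = 2000000 (a 2 GHz TSC) and suppose the loop observes
 * tsc - tsc_start = 100000000 cycles (50 ms) while the APIC timer count
 * drops by apic_start - apic = 5000000.  Then
 *
 *     result = 5000000 * 1000 * 2000000 / 100000000 = 100000000
 *
 * i.e. a 100 MHz APIC timer, and the value handed to setup_APIC_timer() is
 * result * APIC_DIVISOR / HZ = 100000000 * 16 / 1000 = 1600000 for HZ=1000.
 */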
752
753static unsigned int calibration_result;
754
755void __init setup_boot_APIC_clock (void)
756{
757 if (disable_apic_timer) {
758 printk(KERN_INFO "Disabling APIC timer\n");
759 return;
760 }
761
762 printk(KERN_INFO "Using local APIC timer interrupts.\n");
763 using_apic_timer = 1;
764
765 local_irq_disable();
766
767 calibration_result = calibrate_APIC_clock();
768 /*
769 * Now set up the timer for real.
770 */
771 setup_APIC_timer(calibration_result);
772
773 local_irq_enable();
774}
775
776void __init setup_secondary_APIC_clock(void)
777{
778 local_irq_disable(); /* FIXME: Do we need this? --RR */
779 setup_APIC_timer(calibration_result);
780 local_irq_enable();
781}
782
783void __init disable_APIC_timer(void)
784{
785 if (using_apic_timer) {
786 unsigned long v;
787
788 v = apic_read(APIC_LVTT);
789 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
790 }
791}
792
793void enable_APIC_timer(void)
794{
795 if (using_apic_timer) {
796 unsigned long v;
797
798 v = apic_read(APIC_LVTT);
799 apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
800 }
801}
802
803/*
804 * the frequency of the profiling timer can be changed
805 * by writing a multiplier value into /proc/profile.
806 */
807int setup_profiling_timer(unsigned int multiplier)
808{
809 int i;
810
811 /*
812 * Sanity check. [at least 500 APIC cycles should be
813 * between APIC interrupts as a rule of thumb, to avoid
814 * irqs flooding us]
815 */
816 if ( (!multiplier) || (calibration_result/multiplier < 500))
817 return -EINVAL;
818
819 /*
820 * Set the new multiplier for each CPU. CPUs don't start using the
821 * new values until the next timer interrupt in which they do process
822 * accounting. At that time they also adjust their APIC timers
823 * accordingly.
824 */
825 for (i = 0; i < NR_CPUS; ++i)
826 per_cpu(prof_multiplier, i) = multiplier;
827
828 return 0;
829}
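/*
 * Example of the sanity check above, reusing the invented calibration_result
 * of 1600000 from the calibration example: multipliers up to
 * 1600000 / 500 = 3200 are accepted, since they still leave at least 500 APIC
 * cycles between interrupts; 3201 yields 1600000 / 3201 = 499 and is rejected
 * with -EINVAL.
 */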
830
831#undef APIC_DIVISOR
832
833/*
834 * Local timer interrupt handler. It does both profiling and
835 * process statistics/rescheduling.
836 *
837 * We do profiling in every local tick, statistics/rescheduling
838 * happen only every 'profiling multiplier' ticks. The default
839 * multiplier is 1 and it can be changed by writing the new multiplier
840 * value into /proc/profile.
841 */
842
843void smp_local_timer_interrupt(struct pt_regs *regs)
844{
845 int cpu = smp_processor_id();
846
847 profile_tick(CPU_PROFILING, regs);
848 if (--per_cpu(prof_counter, cpu) <= 0) {
849 /*
850 * The multiplier may have changed since the last time we got
851 * to this point as a result of the user writing to
852 * /proc/profile. In this case we need to adjust the APIC
853 * timer accordingly.
854 *
855 * Interrupts are already masked off at this point.
856 */
857 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
858 if (per_cpu(prof_counter, cpu) !=
859 per_cpu(prof_old_multiplier, cpu)) {
860 __setup_APIC_LVTT(calibration_result/
861 per_cpu(prof_counter, cpu));
862 per_cpu(prof_old_multiplier, cpu) =
863 per_cpu(prof_counter, cpu);
864 }
865
866#ifdef CONFIG_SMP
867 update_process_times(user_mode(regs));
868#endif
869 }
870
871 /*
872 * We take the 'long' return path, and there every subsystem
873 * grabs the appropriate locks (kernel lock/ irq lock).
874 *
875 * we might want to decouple profiling from the 'long path',
876 * and do the profiling totally in assembly.
877 *
878 * Currently this isn't too much of an issue (performance wise),
879 * we can take more than 100K local irqs per second on a 100 MHz P5.
880 */
881}
882
883/*
884 * Local APIC timer interrupt. This is the most natural way for doing
885 * local interrupts, but local timer interrupts can be emulated by
886 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
887 *
888 * [ if a single-CPU system runs an SMP kernel then we call the local
889 * interrupt as well. Thus we cannot inline the local irq ... ]
890 */
891void smp_apic_timer_interrupt(struct pt_regs *regs)
892{
893 /*
894 * the NMI deadlock-detector uses this.
895 */
896 add_pda(apic_timer_irqs, 1);
897
898 /*
899 * NOTE! We'd better ACK the irq immediately,
900 * because timer handling can be slow.
901 */
902 ack_APIC_irq();
903 /*
904 * update_process_times() expects us to have done irq_enter().
905 * Besides, if we don't, timer interrupts ignore the global
906 * interrupt lock, which is the WrongThing (tm) to do.
907 */
908 irq_enter();
909 smp_local_timer_interrupt(regs);
910 irq_exit();
911}
912
913/*
914 * oem_force_hpet_timer -- force HPET mode for some boxes.
915 *
916 * Thus far, the major user of this is IBM's Summit2 series:
917 *
918 * Clustered boxes may have unsynced TSC problems if they are
919 * multi-chassis. Use available data to take a good guess.
920 * If in doubt, go HPET.
921 */
922__init int oem_force_hpet_timer(void)
923{
924 int i, clusters, zeros;
925 unsigned id;
926 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
927
928 bitmap_empty(clustermap, NUM_APIC_CLUSTERS);
929
930 for (i = 0; i < NR_CPUS; i++) {
931 id = bios_cpu_apicid[i];
932 if (id != BAD_APICID)
933 __set_bit(APIC_CLUSTERID(id), clustermap);
934 }
935
936 /* Problem: Partially populated chassis may not have CPUs in some of
937 * the APIC clusters they have been allocated. Only present CPUs have
938 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
939 * clusters are allocated sequentially, count zeros only if they are
940 * bounded by ones.
941 */
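	/*
	 * Worked example (illustration only): for a cluster bitmap of
	 * 1 1 0 1 0 0 the loop below computes clusters = 4 -- the single
	 * zero bounded by ones counts as a partially populated chassis,
	 * while the two trailing zeros are ignored.
	 */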
942 clusters = 0;
943 zeros = 0;
944 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
945 if (test_bit(i, clustermap)) {
946 clusters += 1 + zeros;
947 zeros = 0;
948 } else
949 ++zeros;
950 }
951
952 /*
953 * If clusters > 2, then this is likely multi-chassis. Return 1 for HPET.
954 * Else return 0 to use TSC.
955 * May have to revisit this when multi-core + hyperthreaded CPUs come
956 * out, but AFAIK this will work even for them.
957 */
958 return (clusters > 2);
959}
960
961/*
962 * This interrupt should _never_ happen with our APIC/SMP architecture
963 */
964asmlinkage void smp_spurious_interrupt(void)
965{
966 unsigned int v;
967 irq_enter();
968 /*
969 * Check if this really is a spurious interrupt and ACK it
970 * if it is a vectored one. Just in case...
971 * Spurious interrupts should not be ACKed.
972 */
973 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
974 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
975 ack_APIC_irq();
976
977#if 0
978 static unsigned long last_warning;
979 static unsigned long skipped;
980
981 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
982 if (time_before(last_warning+30*HZ,jiffies)) {
983 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
984 smp_processor_id(), skipped);
985 last_warning = jiffies;
986 skipped = 0;
987 } else {
988 skipped++;
989 }
990#endif
991 irq_exit();
992}
993
994/*
995 * This interrupt should never happen with our APIC/SMP architecture
996 */
997
998asmlinkage void smp_error_interrupt(void)
999{
1000 unsigned int v, v1;
1001
1002 irq_enter();
1003 /* First tickle the hardware, only then report what went on. -- REW */
1004 v = apic_read(APIC_ESR);
1005 apic_write(APIC_ESR, 0);
1006 v1 = apic_read(APIC_ESR);
1007 ack_APIC_irq();
1008 atomic_inc(&irq_err_count);
1009
1010 /* Here is what the APIC error bits mean:
1011 0: Send CS error
1012 1: Receive CS error
1013 2: Send accept error
1014 3: Receive accept error
1015 4: Reserved
1016 5: Send illegal vector
1017 6: Received illegal vector
1018 7: Illegal register address
1019 */
1020 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1021 smp_processor_id(), v, v1);
1022 irq_exit();
1023}
1024
1025int disable_apic;
1026
1027/*
1028 * This initializes the IO-APIC and APIC hardware if this is
1029 * a UP kernel.
1030 */
1031int __init APIC_init_uniprocessor (void)
1032{
1033 if (disable_apic) {
1034 printk(KERN_INFO "Apic disabled\n");
1035 return -1;
1036 }
1037 if (!cpu_has_apic) {
1038 disable_apic = 1;
1039 printk(KERN_INFO "Apic disabled by BIOS\n");
1040 return -1;
1041 }
1042
1043 verify_local_APIC();
1044
1045 connect_bsp_APIC();
1046
1047 phys_cpu_present_map = physid_mask_of_physid(0);
1048 apic_write_around(APIC_ID, boot_cpu_id);
1049
1050 setup_local_APIC();
1051
1052#ifdef CONFIG_X86_IO_APIC
1053 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1054 setup_IO_APIC();
1055 else
1056 nr_ioapics = 0;
1057#endif
1058 setup_boot_APIC_clock();
1059
1060 return 0;
1061}
1062
1063static __init int setup_disableapic(char *str)
1064{
1065 disable_apic = 1;
1066 return 0;
1067}
1068
1069static __init int setup_nolapic(char *str)
1070{
1071 disable_apic = 1;
1072 return 0;
1073}
1074
1075static __init int setup_noapictimer(char *str)
1076{
1077 disable_apic_timer = 1;
1078 return 0;
1079}
1080
1081/* dummy parsing: see setup.c */
1082
1083__setup("disableapic", setup_disableapic);
1084__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
1085
1086__setup("noapictimer", setup_noapictimer);
1087
1088/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
new file mode 100644
index 000000000000..35b4c3fcbb37
--- /dev/null
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -0,0 +1,69 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6
7#include <linux/sched.h>
8#include <linux/stddef.h>
9#include <linux/errno.h>
10#include <linux/hardirq.h>
11#include <linux/suspend.h>
12#include <asm/pda.h>
13#include <asm/processor.h>
14#include <asm/segment.h>
15#include <asm/thread_info.h>
16#include <asm/ia32.h>
17
18#define DEFINE(sym, val) \
19 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
20
21#define BLANK() asm volatile("\n->" : : )
22
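/*
 * How this works (hedged summary; the actual post-processing lives in the
 * Kbuild rules, not in this file): DEFINE() emits marker lines such as
 *
 *	->tsk_pid $104 offsetof(struct task_struct, pid)
 *
 * into the generated assembly, and the build system is assumed to turn every
 * "->" line into a plain constant of the form
 *
 *	#define tsk_pid 104
 *
 * which entry.S and friends can then use. The numeric value here is made up
 * for illustration.
 */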
23int main(void)
24{
25#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
26 ENTRY(state);
27 ENTRY(flags);
28 ENTRY(thread);
29 ENTRY(pid);
30 BLANK();
31#undef ENTRY
32#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
33 ENTRY(flags);
34 ENTRY(addr_limit);
35 ENTRY(preempt_count);
36 BLANK();
37#undef ENTRY
38#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
39 ENTRY(kernelstack);
40 ENTRY(oldrsp);
41 ENTRY(pcurrent);
42 ENTRY(irqrsp);
43 ENTRY(irqcount);
44 ENTRY(cpunumber);
45 ENTRY(irqstackptr);
46 BLANK();
47#undef ENTRY
48#ifdef CONFIG_IA32_EMULATION
49#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
50 ENTRY(eax);
51 ENTRY(ebx);
52 ENTRY(ecx);
53 ENTRY(edx);
54 ENTRY(esi);
55 ENTRY(edi);
56 ENTRY(ebp);
57 ENTRY(esp);
58 ENTRY(eip);
59 BLANK();
60#undef ENTRY
61 DEFINE(IA32_RT_SIGFRAME_sigcontext,
62 offsetof (struct rt_sigframe32, uc.uc_mcontext));
63 BLANK();
64#endif
65 DEFINE(pbe_address, offsetof(struct pbe, address));
66 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
67 DEFINE(pbe_next, offsetof(struct pbe, next));
68 return 0;
69}
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..81f1562e5393
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Kconfig
@@ -0,0 +1,96 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_POWERNOW_K8
14 tristate "AMD Opteron/Athlon64 PowerNow!"
15 select CPU_FREQ_TABLE
16 help
17 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
18
19 For details, take a look at <file:Documentation/cpu-freq/>.
20
21 If in doubt, say N.
22
23config X86_POWERNOW_K8_ACPI
24 bool
25 depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
26 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
27 default y
28
29config X86_SPEEDSTEP_CENTRINO
30 tristate "Intel Enhanced SpeedStep"
31 select CPU_FREQ_TABLE
32 depends on ACPI_PROCESSOR
33 help
34 This adds the CPUFreq driver for Enhanced SpeedStep enabled
35 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
36 or 64-bit enabled Intel Xeons.
37
38 For details, take a look at <file:Documentation/cpu-freq/>.
39
40 If in doubt, say N.
41
42config X86_SPEEDSTEP_CENTRINO_ACPI
43 bool
44 depends on X86_SPEEDSTEP_CENTRINO
45 default y
46
47config X86_ACPI_CPUFREQ
48 tristate "ACPI Processor P-States driver"
49 depends on ACPI_PROCESSOR
50 help
51 This driver adds a CPUFreq driver which utilizes the ACPI
52 Processor Performance States.
53
54 For details, take a look at <file:Documentation/cpu-freq/>.
55
56 If in doubt, say N.
57
58comment "shared options"
59
60config X86_ACPI_CPUFREQ_PROC_INTF
61 bool "/proc/acpi/processor/../performance interface (deprecated)"
62 depends on PROC_FS
63 depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI
64 help
65 This enables the deprecated /proc/acpi/processor/../performance
66 interface. While it is helpful for debugging, the generic,
67 cross-architecture cpufreq interfaces should be used.
68
69 If in doubt, say N.
70
71config X86_P4_CLOCKMOD
72 tristate "Intel Pentium 4 clock modulation"
73 depends on EMBEDDED
74 help
75 This adds the clock modulation driver for Intel Pentium 4 / XEON
76 processors. When enabled it will lower CPU temperature by skipping
77 clocks.
78
79 This driver should only be used in exceptional
80 circumstances when very low power is needed, because it causes severe
81 slowdowns and noticeable latencies. Normally SpeedStep should be used
82 instead.
83
84 For details, take a look at <file:Documentation/cpu-freq/>.
85
86 Unless you are absolutely sure say N.
87
88
89config X86_SPEEDSTEP_LIB
90 tristate
91 default X86_P4_CLOCKMOD
92
93endif
94
95endmenu
96
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile
new file mode 100644
index 000000000000..d8b593879224
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Makefile
@@ -0,0 +1,17 @@
1#
2# Reuse the i386 cpufreq drivers
3#
4
5SRCDIR := ../../../i386/kernel/cpu/cpufreq
6
7obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
8obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
9obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
10obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
11obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
12
13powernow-k8-objs := ${SRCDIR}/powernow-k8.o
14speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o
15acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o
16p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o
17speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o
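
# Note (hedged explanation of the kbuild pattern above): each <driver>-objs
# line points kbuild at the corresponding i386 source via $(SRCDIR), so e.g.
# powernow-k8.o is built from ../../../i386/kernel/cpu/cpufreq/powernow-k8.c
# rather than from a local copy. A new shared driver would follow the same
# two-line pattern (CONFIG_FOO / foo-driver are placeholder names):
#
#   obj-$(CONFIG_FOO)	+= foo-driver.o
#   foo-driver-objs	:= ${SRCDIR}/foo-driver.o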
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
new file mode 100644
index 000000000000..657003e461e6
--- /dev/null
+++ b/arch/x86_64/kernel/e820.c
@@ -0,0 +1,513 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
5 */
6#include <linux/config.h>
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/init.h>
10#include <linux/bootmem.h>
11#include <linux/ioport.h>
12#include <linux/string.h>
13#include <asm/page.h>
14#include <asm/e820.h>
15#include <asm/proto.h>
16#include <asm/bootsetup.h>
17
18extern char _end[];
19
20/*
21 * PFN of last memory page.
22 */
23unsigned long end_pfn;
24
25/*
26 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
27 * The direct mapping extends to end_pfn_map, so that we can directly access
28 * apertures, ACPI and other tables without having to play with fixmaps.
29 */
30unsigned long end_pfn_map;
31
32/*
33 * Last pfn which the user wants to use.
34 */
35unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
36
37extern struct resource code_resource, data_resource;
38
39/* Check for some hardcoded bad areas that early boot is not allowed to touch */
40static inline int bad_addr(unsigned long *addrp, unsigned long size)
41{
42 unsigned long addr = *addrp, last = addr + size;
43
44 /* various gunk below that needed for SMP startup */
45 if (addr < 0x8000) {
46 *addrp = 0x8000;
47 return 1;
48 }
49
50 /* direct mapping tables of the kernel */
51 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
52 *addrp = table_end << PAGE_SHIFT;
53 return 1;
54 }
55
56 /* initrd */
57#ifdef CONFIG_BLK_DEV_INITRD
58 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
59 addr < INITRD_START+INITRD_SIZE) {
60 *addrp = INITRD_START + INITRD_SIZE;
61 return 1;
62 }
63#endif
64 /* kernel code + 640k memory hole (later should not be needed, but
65 be paranoid for now) */
66 if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
67 *addrp = __pa_symbol(&_end);
68 return 1;
69 }
70 /* XXX ramdisk image here? */
71 return 0;
72}
73
74int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
75{
76 int i;
77 for (i = 0; i < e820.nr_map; i++) {
78 struct e820entry *ei = &e820.map[i];
79 if (type && ei->type != type)
80 continue;
81 if (ei->addr >= end || ei->addr + ei->size < start)
82 continue;
83 return 1;
84 }
85 return 0;
86}
87
88/*
89 * Find a free area in a specific range.
90 */
91unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
92{
93 int i;
94 for (i = 0; i < e820.nr_map; i++) {
95 struct e820entry *ei = &e820.map[i];
96 unsigned long addr = ei->addr, last;
97 if (ei->type != E820_RAM)
98 continue;
99 if (addr < start)
100 addr = start;
101 if (addr > ei->addr + ei->size)
102 continue;
103 while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
104 ;
105 last = addr + size;
106 if (last > ei->addr + ei->size)
107 continue;
108 if (last > end)
109 continue;
110 return addr;
111 }
112 return -1UL;
113}
114
115/*
116 * Free bootmem based on the e820 table for a node.
117 */
118void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
119{
120 int i;
121 for (i = 0; i < e820.nr_map; i++) {
122 struct e820entry *ei = &e820.map[i];
123 unsigned long last, addr;
124
125 if (ei->type != E820_RAM ||
126 ei->addr+ei->size <= start ||
127 ei->addr > end)
128 continue;
129
130 addr = round_up(ei->addr, PAGE_SIZE);
131 if (addr < start)
132 addr = start;
133
134 last = round_down(ei->addr + ei->size, PAGE_SIZE);
135 if (last >= end)
136 last = end;
137
138 if (last > addr && last-addr >= PAGE_SIZE)
139 free_bootmem_node(pgdat, addr, last-addr);
140 }
141}
142
143/*
144 * Find the highest page frame number we have available
145 */
146unsigned long __init e820_end_of_ram(void)
147{
148 int i;
149 unsigned long end_pfn = 0;
150
151 for (i = 0; i < e820.nr_map; i++) {
152 struct e820entry *ei = &e820.map[i];
153 unsigned long start, end;
154
155 start = round_up(ei->addr, PAGE_SIZE);
156 end = round_down(ei->addr + ei->size, PAGE_SIZE);
157 if (start >= end)
158 continue;
159 if (ei->type == E820_RAM) {
160 if (end > end_pfn<<PAGE_SHIFT)
161 end_pfn = end>>PAGE_SHIFT;
162 } else {
163 if (end > end_pfn_map<<PAGE_SHIFT)
164 end_pfn_map = end>>PAGE_SHIFT;
165 }
166 }
167
168 if (end_pfn > end_pfn_map)
169 end_pfn_map = end_pfn;
170 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
171 end_pfn_map = MAXMEM>>PAGE_SHIFT;
172 if (end_pfn > end_user_pfn)
173 end_pfn = end_user_pfn;
174 if (end_pfn > end_pfn_map)
175 end_pfn = end_pfn_map;
176
177 return end_pfn;
178}
179
180/*
181 * Mark e820 reserved areas as busy for the resource manager.
182 */
183void __init e820_reserve_resources(void)
184{
185 int i;
186 for (i = 0; i < e820.nr_map; i++) {
187 struct resource *res;
188 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
189 continue;
190 res = alloc_bootmem_low(sizeof(struct resource));
191 switch (e820.map[i].type) {
192 case E820_RAM: res->name = "System RAM"; break;
193 case E820_ACPI: res->name = "ACPI Tables"; break;
194 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
195 default: res->name = "reserved";
196 }
197 res->start = e820.map[i].addr;
198 res->end = res->start + e820.map[i].size - 1;
199 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
200 request_resource(&iomem_resource, res);
201 if (e820.map[i].type == E820_RAM) {
202 /*
203 * We don't know which RAM region contains kernel data,
204 * so we try it repeatedly and let the resource manager
205 * test it.
206 */
207 request_resource(res, &code_resource);
208 request_resource(res, &data_resource);
209 }
210 }
211}
212
213/*
214 * Add a memory region to the kernel e820 map.
215 */
216void __init add_memory_region(unsigned long start, unsigned long size, int type)
217{
218 int x = e820.nr_map;
219
220 if (x == E820MAX) {
221 printk(KERN_ERR "Oops! Too many entries in the memory map!\n");
222 return;
223 }
224
225 e820.map[x].addr = start;
226 e820.map[x].size = size;
227 e820.map[x].type = type;
228 e820.nr_map++;
229}
230
231void __init e820_print_map(char *who)
232{
233 int i;
234
235 for (i = 0; i < e820.nr_map; i++) {
236 printk(" %s: %016Lx - %016Lx ", who,
237 (unsigned long long) e820.map[i].addr,
238 (unsigned long long) (e820.map[i].addr + e820.map[i].size));
239 switch (e820.map[i].type) {
240 case E820_RAM: printk("(usable)\n");
241 break;
242 case E820_RESERVED:
243 printk("(reserved)\n");
244 break;
245 case E820_ACPI:
246 printk("(ACPI data)\n");
247 break;
248 case E820_NVS:
249 printk("(ACPI NVS)\n");
250 break;
251 default: printk("type %u\n", e820.map[i].type);
252 break;
253 }
254 }
255}
256
257/*
258 * Sanitize the BIOS e820 map.
259 *
260 * Some e820 responses include overlapping entries. The following
261 * replaces the original e820 map with a new one, removing overlaps.
262 *
263 */
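/*
 * Tiny worked example (illustration only): given two overlapping BIOS
 * entries, [0, 100) of type 1 (RAM) and [50, 150) of type 2 (reserved),
 * the change points are 0, 50, 100 and 150, and the higher type wins
 * wherever entries overlap, so the sanitized map comes out as
 *	[0, 50)    type 1 (usable)
 *	[50, 150)  type 2 (reserved)
 */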
264static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
265{
266 struct change_member {
267 struct e820entry *pbios; /* pointer to original bios entry */
268 unsigned long long addr; /* address for this change point */
269 };
270 static struct change_member change_point_list[2*E820MAX] __initdata;
271 static struct change_member *change_point[2*E820MAX] __initdata;
272 static struct e820entry *overlap_list[E820MAX] __initdata;
273 static struct e820entry new_bios[E820MAX] __initdata;
274 struct change_member *change_tmp;
275 unsigned long current_type, last_type;
276 unsigned long long last_addr;
277 int chgidx, still_changing;
278 int overlap_entries;
279 int new_bios_entry;
280 int old_nr, new_nr;
281 int i;
282
283 /*
284 Visually we're performing the following (1,2,3,4 = memory types)...
285
286 Sample memory map (w/overlaps):
287 ____22__________________
288 ______________________4_
289 ____1111________________
290 _44_____________________
291 11111111________________
292 ____________________33__
293 ___________44___________
294 __________33333_________
295 ______________22________
296 ___________________2222_
297 _________111111111______
298 _____________________11_
299 _________________4______
300
301 Sanitized equivalent (no overlap):
302 1_______________________
303 _44_____________________
304 ___1____________________
305 ____22__________________
306 ______11________________
307 _________1______________
308 __________3_____________
309 ___________44___________
310 _____________33_________
311 _______________2________
312 ________________1_______
313 _________________4______
314 ___________________2____
315 ____________________33__
316 ______________________4_
317 */
318
319 /* if there's only one memory region, don't bother */
320 if (*pnr_map < 2)
321 return -1;
322
323 old_nr = *pnr_map;
324
325 /* bail out if we find any unreasonable addresses in bios map */
326 for (i=0; i<old_nr; i++)
327 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
328 return -1;
329
330 /* create pointers for initial change-point information (for sorting) */
331 for (i=0; i < 2*old_nr; i++)
332 change_point[i] = &change_point_list[i];
333
334 /* record all known change-points (starting and ending addresses) */
335 chgidx = 0;
336 for (i=0; i < old_nr; i++) {
337 change_point[chgidx]->addr = biosmap[i].addr;
338 change_point[chgidx++]->pbios = &biosmap[i];
339 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
340 change_point[chgidx++]->pbios = &biosmap[i];
341 }
342
343 /* sort change-point list by memory addresses (low -> high) */
344 still_changing = 1;
345 while (still_changing) {
346 still_changing = 0;
347 for (i=1; i < 2*old_nr; i++) {
348 /* if <current_addr> > <last_addr>, swap */
349 /* or, if current=<start_addr> & last=<end_addr>, swap */
350 if ((change_point[i]->addr < change_point[i-1]->addr) ||
351 ((change_point[i]->addr == change_point[i-1]->addr) &&
352 (change_point[i]->addr == change_point[i]->pbios->addr) &&
353 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
354 )
355 {
356 change_tmp = change_point[i];
357 change_point[i] = change_point[i-1];
358 change_point[i-1] = change_tmp;
359 still_changing=1;
360 }
361 }
362 }
363
364 /* create a new bios memory map, removing overlaps */
365 overlap_entries=0; /* number of entries in the overlap table */
366 new_bios_entry=0; /* index for creating new bios map entries */
367 last_type = 0; /* start with undefined memory type */
368 last_addr = 0; /* start with 0 as last starting address */
369 /* loop through change-points, determining effect on the new bios map */
370 for (chgidx=0; chgidx < 2*old_nr; chgidx++)
371 {
372 /* keep track of all overlapping bios entries */
373 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
374 {
375 /* add map entry to overlap list (> 1 entry implies an overlap) */
376 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
377 }
378 else
379 {
380 /* remove entry from list (order independent, so swap with last) */
381 for (i=0; i<overlap_entries; i++)
382 {
383 if (overlap_list[i] == change_point[chgidx]->pbios)
384 overlap_list[i] = overlap_list[overlap_entries-1];
385 }
386 overlap_entries--;
387 }
388 /* if there are overlapping entries, decide which "type" to use */
389 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
390 current_type = 0;
391 for (i=0; i<overlap_entries; i++)
392 if (overlap_list[i]->type > current_type)
393 current_type = overlap_list[i]->type;
394 /* continue building up new bios map based on this information */
395 if (current_type != last_type) {
396 if (last_type != 0) {
397 new_bios[new_bios_entry].size =
398 change_point[chgidx]->addr - last_addr;
399 /* move forward only if the new size was non-zero */
400 if (new_bios[new_bios_entry].size != 0)
401 if (++new_bios_entry >= E820MAX)
402 break; /* no more space left for new bios entries */
403 }
404 if (current_type != 0) {
405 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
406 new_bios[new_bios_entry].type = current_type;
407 last_addr=change_point[chgidx]->addr;
408 }
409 last_type = current_type;
410 }
411 }
412 new_nr = new_bios_entry; /* retain count for new bios entries */
413
414 /* copy new bios mapping into original location */
415 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
416 *pnr_map = new_nr;
417
418 return 0;
419}
420
421/*
422 * Copy the BIOS e820 map into a safe place.
423 *
424 * Sanity-check it while we're at it..
425 *
426 * If we're lucky and live on a modern system, the setup code
427 * will have given us a memory map that we can use to properly
428 * set up memory. If we aren't, we'll fake a memory map.
429 *
430 * We check to see that the memory map contains at least 2 elements
431 * before we'll use it, because the detection code in setup.S may
432 * not be perfect and almost every PC known to man has two memory
433 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
434 * thinkpad 560x, for example, does not cooperate with the memory
435 * detection code.)
436 */
437static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
438{
439 /* Only one memory region (or negative)? Ignore it */
440 if (nr_map < 2)
441 return -1;
442
443 do {
444 unsigned long start = biosmap->addr;
445 unsigned long size = biosmap->size;
446 unsigned long end = start + size;
447 unsigned long type = biosmap->type;
448
449 /* Overflow in 64 bits? Ignore the memory map. */
450 if (start > end)
451 return -1;
452
453 /*
454 * Some BIOSes claim RAM in the 640k - 1M region.
455 * Not right. Fix it up.
456 *
457 * This should be removed on Hammer, which is not supposed to
458 * have non-e820-covered ISA mappings there, but I had some strange
459 * problems, so it stays for now. -AK
460 */
461 if (type == E820_RAM) {
462 if (start < 0x100000ULL && end > 0xA0000ULL) {
463 if (start < 0xA0000ULL)
464 add_memory_region(start, 0xA0000ULL-start, type);
465 if (end <= 0x100000ULL)
466 continue;
467 start = 0x100000ULL;
468 size = end - start;
469 }
470 }
471
472 add_memory_region(start, size, type);
473 } while (biosmap++,--nr_map);
474 return 0;
475}
476
477void __init setup_memory_region(void)
478{
479 char *who = "BIOS-e820";
480
481 /*
482 * Try to copy the BIOS-supplied E820-map.
483 *
484 * Otherwise fake a memory map; one section from 0k->640k,
485 * the next section from 1mb->appropriate_mem_k
486 */
487 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
488 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
489 unsigned long mem_size;
490
491 /* compare results from other methods and take the greater */
492 if (ALT_MEM_K < EXT_MEM_K) {
493 mem_size = EXT_MEM_K;
494 who = "BIOS-88";
495 } else {
496 mem_size = ALT_MEM_K;
497 who = "BIOS-e801";
498 }
499
500 e820.nr_map = 0;
501 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
502 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
503 }
504 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
505 e820_print_map(who);
506}
507
508void __init parse_memopt(char *p, char **from)
509{
510 end_user_pfn = memparse(p, from);
511 end_user_pfn >>= PAGE_SHIFT;
512}
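/*
 * Usage note (hedged): this is expected to be reached from setup.c for the
 * "mem=" boot option. memparse() accepts the usual K/M/G suffixes, so e.g.
 * "mem=512M" caps end_user_pfn at 512MB >> PAGE_SHIFT = 131072 4K pages.
 */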
513
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
new file mode 100644
index 000000000000..750bcd0655dc
--- /dev/null
+++ b/arch/x86_64/kernel/early_printk.c
@@ -0,0 +1,220 @@
1#include <linux/console.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/string.h>
5#include <asm/io.h>
6#include <asm/processor.h>
7
8/* Simple VGA output */
9
10#ifdef __i386__
11#define VGABASE (__ISA_IO_base + 0xb8000)
12#else
13#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
14#endif
15
16#define MAX_YPOS 25
17#define MAX_XPOS 80
18
19static int current_ypos = 1, current_xpos = 0;
20
21static void early_vga_write(struct console *con, const char *str, unsigned n)
22{
23 char c;
24 int i, k, j;
25
26 while ((c = *str++) != '\0' && n-- > 0) {
27 if (current_ypos >= MAX_YPOS) {
28 /* scroll 1 line up */
29 for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
30 for (i = 0; i < MAX_XPOS; i++) {
31 writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
32 VGABASE + 2*(MAX_XPOS*j + i));
33 }
34 }
35 for (i = 0; i < MAX_XPOS; i++)
36 writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
37 current_ypos = MAX_YPOS-1;
38 }
39 if (c == '\n') {
40 current_xpos = 0;
41 current_ypos++;
42 } else if (c != '\r') {
43 writew(((0x7 << 8) | (unsigned short) c),
44 VGABASE + 2*(MAX_XPOS*current_ypos +
45 current_xpos++));
46 if (current_xpos >= MAX_XPOS) {
47 current_xpos = 0;
48 current_ypos++;
49 }
50 }
51 }
52}
53
54static struct console early_vga_console = {
55 .name = "earlyvga",
56 .write = early_vga_write,
57 .flags = CON_PRINTBUFFER,
58 .index = -1,
59};
60
61/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
62
63int early_serial_base = 0x3f8; /* ttyS0 */
64
65#define XMTRDY 0x20
66
67#define DLAB 0x80
68
69#define TXR 0 /* Transmit register (WRITE) */
70#define RXR 0 /* Receive register (READ) */
71#define IER 1 /* Interrupt Enable */
72#define IIR 2 /* Interrupt ID */
73#define FCR 2 /* FIFO control */
74#define LCR 3 /* Line control */
75#define MCR 4 /* Modem control */
76#define LSR 5 /* Line Status */
77#define MSR 6 /* Modem Status */
78#define DLL 0 /* Divisor Latch Low */
79#define DLH 1 /* Divisor latch High */
80
81static int early_serial_putc(unsigned char ch)
82{
83 unsigned timeout = 0xffff;
84 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
85 cpu_relax();
86 outb(ch, early_serial_base + TXR);
87 return timeout ? 0 : -1;
88}
89
90static void early_serial_write(struct console *con, const char *s, unsigned n)
91{
92 while (*s && n-- > 0) {
93 early_serial_putc(*s);
94 if (*s == '\n')
95 early_serial_putc('\r');
96 s++;
97 }
98}
99
100#define DEFAULT_BAUD 9600
101
102static __init void early_serial_init(char *s)
103{
104 unsigned char c;
105 unsigned divisor;
106 unsigned baud = DEFAULT_BAUD;
107 char *e;
108
109 if (*s == ',')
110 ++s;
111
112 if (*s) {
113 unsigned port;
114 if (!strncmp(s,"0x",2)) {
115 early_serial_base = simple_strtoul(s, &e, 16);
116 } else {
117 static int bases[] = { 0x3f8, 0x2f8 };
118
119 if (!strncmp(s,"ttyS",4))
120 s += 4;
121 port = simple_strtoul(s, &e, 10);
122 if (port > 1 || s == e)
123 port = 0;
124 early_serial_base = bases[port];
125 }
126 s += strcspn(s, ",");
127 if (*s == ',')
128 s++;
129 }
130
131 outb(0x3, early_serial_base + LCR); /* 8n1 */
132 outb(0, early_serial_base + IER); /* no interrupt */
133 outb(0, early_serial_base + FCR); /* no fifo */
134 outb(0x3, early_serial_base + MCR); /* DTR + RTS */
135
136 if (*s) {
137 baud = simple_strtoul(s, &e, 0);
138 if (baud == 0 || s == e)
139 baud = DEFAULT_BAUD;
140 }
141
142 divisor = 115200 / baud;
143 c = inb(early_serial_base + LCR);
144 outb(c | DLAB, early_serial_base + LCR);
145 outb(divisor & 0xff, early_serial_base + DLL);
146 outb((divisor >> 8) & 0xff, early_serial_base + DLH);
147 outb(c & ~DLAB, early_serial_base + LCR);
148}
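/*
 * Divisor example (illustration only): with the 115200 baud base rate used
 * above, baud=9600 gives divisor 12 and baud=115200 gives divisor 1; the
 * divisor is then split across the DLL/DLH latch registers.
 */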
149
150static struct console early_serial_console = {
151 .name = "earlyser",
152 .write = early_serial_write,
153 .flags = CON_PRINTBUFFER,
154 .index = -1,
155};
156
157/* Direct interface for emergencies */
158struct console *early_console = &early_vga_console;
159static int early_console_initialized = 0;
160
161void early_printk(const char *fmt, ...)
162{
163 char buf[512];
164 int n;
165 va_list ap;
166
167 va_start(ap,fmt);
168 n = vscnprintf(buf,512,fmt,ap);
169 early_console->write(early_console,buf,n);
170 va_end(ap);
171}
172
173static int keep_early;
174
175int __init setup_early_printk(char *opt)
176{
177 char *space;
178 char buf[256];
179
180 if (early_console_initialized)
181 return -1;
182
183 opt = strchr(opt, '=') + 1;
184
185 strlcpy(buf,opt,sizeof(buf));
186 space = strchr(buf, ' ');
187 if (space)
188 *space = 0;
189
190 if (strstr(buf,"keep"))
191 keep_early = 1;
192
193 if (!strncmp(buf, "serial", 6)) {
194 early_serial_init(buf + 6);
195 early_console = &early_serial_console;
196 } else if (!strncmp(buf, "ttyS", 4)) {
197 early_serial_init(buf);
198 early_console = &early_serial_console;
199 } else if (!strncmp(buf, "vga", 3)) {
200 early_console = &early_vga_console;
201 }
202 early_console_initialized = 1;
203 register_console(early_console);
204 return 0;
205}
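/*
 * Example command lines (hedged sketch of what the parser above accepts,
 * not an exhaustive list):
 *
 *	earlyprintk=vga
 *	earlyprintk=serial,ttyS0,57600
 *	earlyprintk=ttyS1,9600
 *
 * If the option string also contains "keep", the early console is left
 * registered instead of being torn down by disable_early_printk().
 */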
206
207void __init disable_early_printk(void)
208{
209 if (!early_console_initialized || !early_console)
210 return;
211 if (!keep_early) {
212 printk("disabling early console\n");
213 unregister_console(early_console);
214 early_console_initialized = 0;
215 } else {
216 printk("keeping early console\n");
217 }
218}
219
220__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
new file mode 100644
index 000000000000..e126284db7a8
--- /dev/null
+++ b/arch/x86_64/kernel/entry.S
@@ -0,0 +1,920 @@
1/*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 *
8 * $Id$
9 */
10
11/*
12 * entry.S contains the system-call and fault low-level handling routines.
13 *
14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call.
16 *
17 * Normal syscalls and interrupts don't save a full stack frame; this is
18 * only done for syscall tracing, signals or fork/exec et al.
19 *
20 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all registers saved.
25 *
26 * TODO:
27 * - schedule it carefully for the final hardware.
28 */
29
30#define ASSEMBLY 1
31#include <linux/config.h>
32#include <linux/linkage.h>
33#include <asm/segment.h>
34#include <asm/smp.h>
35#include <asm/cache.h>
36#include <asm/errno.h>
37#include <asm/dwarf2.h>
38#include <asm/calling.h>
39#include <asm/offset.h>
40#include <asm/msr.h>
41#include <asm/unistd.h>
42#include <asm/thread_info.h>
43#include <asm/hw_irq.h>
44
45 .code64
46
47#ifdef CONFIG_PREEMPT
48#define preempt_stop cli
49#else
50#define preempt_stop
51#define retint_kernel retint_restore_args
52#endif
53
54/*
55 * C code is not supposed to know about undefined top of stack. Every time
56 * a C function with a pt_regs argument is called from the SYSCALL based
57 * fast path, FIXUP_TOP_OF_STACK is needed.
58 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
59 * manipulation.
60 */
61
62 /* %rsp:at FRAMEEND */
63 .macro FIXUP_TOP_OF_STACK tmp
64 movq %gs:pda_oldrsp,\tmp
65 movq \tmp,RSP(%rsp)
66 movq $__USER_DS,SS(%rsp)
67 movq $__USER_CS,CS(%rsp)
68 movq $-1,RCX(%rsp)
69 movq R11(%rsp),\tmp /* get eflags */
70 movq \tmp,EFLAGS(%rsp)
71 .endm
72
73 .macro RESTORE_TOP_OF_STACK tmp,offset=0
74 movq RSP-\offset(%rsp),\tmp
75 movq \tmp,%gs:pda_oldrsp
76 movq EFLAGS-\offset(%rsp),\tmp
77 movq \tmp,R11-\offset(%rsp)
78 .endm
79
80 .macro FAKE_STACK_FRAME child_rip
81 /* push in order ss, rsp, eflags, cs, rip */
82 xorq %rax, %rax
83 pushq %rax /* ss */
84 CFI_ADJUST_CFA_OFFSET 8
85 pushq %rax /* rsp */
86 CFI_ADJUST_CFA_OFFSET 8
87 CFI_OFFSET rip,0
88 pushq $(1<<9) /* eflags - interrupts on */
89 CFI_ADJUST_CFA_OFFSET 8
90 pushq $__KERNEL_CS /* cs */
91 CFI_ADJUST_CFA_OFFSET 8
92 pushq \child_rip /* rip */
93 CFI_ADJUST_CFA_OFFSET 8
94 CFI_OFFSET rip,0
95 pushq %rax /* orig rax */
96 CFI_ADJUST_CFA_OFFSET 8
97 .endm
98
99 .macro UNFAKE_STACK_FRAME
100 addq $8*6, %rsp
101 CFI_ADJUST_CFA_OFFSET -(6*8)
102 .endm
103
104 .macro CFI_DEFAULT_STACK
105 CFI_ADJUST_CFA_OFFSET (SS)
106 CFI_OFFSET r15,R15-SS
107 CFI_OFFSET r14,R14-SS
108 CFI_OFFSET r13,R13-SS
109 CFI_OFFSET r12,R12-SS
110 CFI_OFFSET rbp,RBP-SS
111 CFI_OFFSET rbx,RBX-SS
112 CFI_OFFSET r11,R11-SS
113 CFI_OFFSET r10,R10-SS
114 CFI_OFFSET r9,R9-SS
115 CFI_OFFSET r8,R8-SS
116 CFI_OFFSET rax,RAX-SS
117 CFI_OFFSET rcx,RCX-SS
118 CFI_OFFSET rdx,RDX-SS
119 CFI_OFFSET rsi,RSI-SS
120 CFI_OFFSET rdi,RDI-SS
121 CFI_OFFSET rsp,RSP-SS
122 CFI_OFFSET rip,RIP-SS
123 .endm
124/*
125 * A newly forked process directly context switches into this.
126 */
127/* rdi: prev */
128ENTRY(ret_from_fork)
129 CFI_STARTPROC
130 CFI_DEFAULT_STACK
131 call schedule_tail
132 GET_THREAD_INFO(%rcx)
133 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
134 jnz rff_trace
135rff_action:
136 RESTORE_REST
137 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
138 je int_ret_from_sys_call
139 testl $_TIF_IA32,threadinfo_flags(%rcx)
140 jnz int_ret_from_sys_call
141 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
142 jmp ret_from_sys_call
143rff_trace:
144 movq %rsp,%rdi
145 call syscall_trace_leave
146 GET_THREAD_INFO(%rcx)
147 jmp rff_action
148 CFI_ENDPROC
149
150/*
151 * System call entry. Up to 6 arguments in registers are supported.
152 *
153 * SYSCALL does not save anything on the stack and does not change the
154 * stack pointer.
155 */
156
157/*
158 * Register setup:
159 * rax system call number
160 * rdi arg0
161 * rcx return address for syscall/sysret, C arg3
162 * rsi arg1
163 * rdx arg2
164 * r10 arg3 (--> moved to rcx for C)
165 * r8 arg4
166 * r9 arg5
167 * r11 eflags for syscall/sysret, temporary for C
168 * r12-r15,rbp,rbx saved by C code, not touched.
169 *
170 * Interrupts are off on entry.
171 * Only called from user space.
172 *
173 * XXX if we had a free scratch register we could save the RSP into the stack frame
174 * and report it properly in ps. Unfortunately we don't have one.
175 */
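/*
 * Concrete example (illustration only, not referenced by the code below):
 * a user-space write(1, buf, count) arrives here with
 *	rax = __NR_write, rdi = 1, rsi = buf, rdx = count
 * and with rcx/r11 holding the return RIP/RFLAGS saved by the SYSCALL
 * instruction; the result is handed back to user space in rax by SYSRET.
 */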
176
177ENTRY(system_call)
178 CFI_STARTPROC
179 swapgs
180 movq %rsp,%gs:pda_oldrsp
181 movq %gs:pda_kernelstack,%rsp
182 sti
183 SAVE_ARGS 8,1
184 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
185 movq %rcx,RIP-ARGOFFSET(%rsp)
186 GET_THREAD_INFO(%rcx)
187 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
188 jnz tracesys
189 cmpq $__NR_syscall_max,%rax
190 ja badsys
191 movq %r10,%rcx
192 call *sys_call_table(,%rax,8) # XXX: rip relative
193 movq %rax,RAX-ARGOFFSET(%rsp)
194/*
195 * Syscall return path ending with SYSRET (fast path)
196 * Has incomplete stack frame and undefined top of stack.
197 */
198 .globl ret_from_sys_call
199ret_from_sys_call:
200 movl $_TIF_WORK_MASK,%edi
201 /* edi: flagmask */
202sysret_check:
203 GET_THREAD_INFO(%rcx)
204 cli
205 movl threadinfo_flags(%rcx),%edx
206 andl %edi,%edx
207 jnz sysret_careful
208 movq RIP-ARGOFFSET(%rsp),%rcx
209 RESTORE_ARGS 0,-ARG_SKIP,1
210 movq %gs:pda_oldrsp,%rsp
211 swapgs
212 sysretq
213
214 /* Handle reschedules */
215 /* edx: work, edi: workmask */
216sysret_careful:
217 bt $TIF_NEED_RESCHED,%edx
218 jnc sysret_signal
219 sti
220 pushq %rdi
221 call schedule
222 popq %rdi
223 jmp sysret_check
224
225 /* Handle a signal */
226sysret_signal:
227 sti
228 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
229 jz 1f
230
231 /* Really a signal */
232 /* edx: work flags (arg3) */
233 leaq do_notify_resume(%rip),%rax
234 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
235 xorl %esi,%esi # oldset -> arg2
236 call ptregscall_common
2371: movl $_TIF_NEED_RESCHED,%edi
238 jmp sysret_check
239
240 /* Do syscall tracing */
241tracesys:
242 SAVE_REST
243 movq $-ENOSYS,RAX(%rsp)
244 FIXUP_TOP_OF_STACK %rdi
245 movq %rsp,%rdi
246 call syscall_trace_enter
247 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
248 RESTORE_REST
249 cmpq $__NR_syscall_max,%rax
250 ja 1f
251 movq %r10,%rcx /* fixup for C */
252 call *sys_call_table(,%rax,8)
253 movq %rax,RAX-ARGOFFSET(%rsp)
2541: SAVE_REST
255 movq %rsp,%rdi
256 call syscall_trace_leave
257 RESTORE_TOP_OF_STACK %rbx
258 RESTORE_REST
259 jmp ret_from_sys_call
260
261badsys:
262 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
263 jmp ret_from_sys_call
264
265/*
266 * Syscall return path ending with IRET.
267 * Has correct top of stack, but partial stack frame.
268 */
269ENTRY(int_ret_from_sys_call)
270 cli
271 testl $3,CS-ARGOFFSET(%rsp)
272 je retint_restore_args
273 movl $_TIF_ALLWORK_MASK,%edi
274 /* edi: mask to check */
275int_with_check:
276 GET_THREAD_INFO(%rcx)
277 movl threadinfo_flags(%rcx),%edx
278 andl %edi,%edx
279 jnz int_careful
280 jmp retint_swapgs
281
282 /* Either reschedule or signal or syscall exit tracking needed. */
283 /* First do a reschedule test. */
284 /* edx: work, edi: workmask */
285int_careful:
286 bt $TIF_NEED_RESCHED,%edx
287 jnc int_very_careful
288 sti
289 pushq %rdi
290 call schedule
291 popq %rdi
292 jmp int_with_check
293
294 /* handle signals and tracing -- both require a full stack frame */
295int_very_careful:
296 sti
297 SAVE_REST
298 /* Check for syscall exit trace */
299 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
300 jz int_signal
301 pushq %rdi
302 leaq 8(%rsp),%rdi # &ptregs -> arg1
303 call syscall_trace_leave
304 popq %rdi
305 btr $TIF_SYSCALL_TRACE,%edi
306 btr $TIF_SYSCALL_AUDIT,%edi
307 btr $TIF_SINGLESTEP,%edi
308 jmp int_restore_rest
309
310int_signal:
311 testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
312 jz 1f
313 movq %rsp,%rdi # &ptregs -> arg1
314 xorl %esi,%esi # oldset -> arg2
315 call do_notify_resume
3161: movl $_TIF_NEED_RESCHED,%edi
317int_restore_rest:
318 RESTORE_REST
319 jmp int_with_check
320 CFI_ENDPROC
321
322/*
323 * Certain special system calls that need to save a complete full stack frame.
324 */
325
326 .macro PTREGSCALL label,func,arg
327 .globl \label
328\label:
329 leaq \func(%rip),%rax
330 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
331 jmp ptregscall_common
332 .endm
333
334 PTREGSCALL stub_clone, sys_clone, %r8
335 PTREGSCALL stub_fork, sys_fork, %rdi
336 PTREGSCALL stub_vfork, sys_vfork, %rdi
337 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
338 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
339 PTREGSCALL stub_iopl, sys_iopl, %rsi
340
341ENTRY(ptregscall_common)
342 CFI_STARTPROC
343 popq %r11
344 CFI_ADJUST_CFA_OFFSET -8
345 SAVE_REST
346 movq %r11, %r15
347 FIXUP_TOP_OF_STACK %r11
348 call *%rax
349 RESTORE_TOP_OF_STACK %r11
350 movq %r15, %r11
351 RESTORE_REST
352 pushq %r11
353 CFI_ADJUST_CFA_OFFSET 8
354 ret
355 CFI_ENDPROC
356
357ENTRY(stub_execve)
358 CFI_STARTPROC
359 popq %r11
360 CFI_ADJUST_CFA_OFFSET -8
361 SAVE_REST
362 movq %r11, %r15
363 FIXUP_TOP_OF_STACK %r11
364 call sys_execve
365 GET_THREAD_INFO(%rcx)
366 bt $TIF_IA32,threadinfo_flags(%rcx)
367 jc exec_32bit
368 RESTORE_TOP_OF_STACK %r11
369 movq %r15, %r11
370 RESTORE_REST
371 push %r11
372 ret
373
374exec_32bit:
375 CFI_ADJUST_CFA_OFFSET REST_SKIP
376 movq %rax,RAX(%rsp)
377 RESTORE_REST
378 jmp int_ret_from_sys_call
379 CFI_ENDPROC
380
381/*
382 * sigreturn is special because it needs to restore all registers on return.
383 * This cannot be done with SYSRET, so use the IRET return path instead.
384 */
385ENTRY(stub_rt_sigreturn)
386 CFI_STARTPROC
387 addq $8, %rsp
388 SAVE_REST
389 movq %rsp,%rdi
390 FIXUP_TOP_OF_STACK %r11
391 call sys_rt_sigreturn
392 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
393 RESTORE_REST
394 jmp int_ret_from_sys_call
395 CFI_ENDPROC
396
397/*
398 * Interrupt entry/exit.
399 *
401 * Interrupt entry points save only callee-clobbered registers in the fast path.
401 *
402 * Entry runs with interrupts off.
403 */
404
405/* 0(%rsp): interrupt number */
406 .macro interrupt func
407 CFI_STARTPROC simple
408 CFI_DEF_CFA rsp,(SS-RDI)
409 CFI_REL_OFFSET rsp,(RSP-ORIG_RAX)
410 CFI_REL_OFFSET rip,(RIP-ORIG_RAX)
411 cld
412#ifdef CONFIG_DEBUG_INFO
413 SAVE_ALL
414 movq %rsp,%rdi
415 /*
416 * Setup a stack frame pointer. This allows gdb to trace
417 * back to the original stack.
418 */
419 movq %rsp,%rbp
420 CFI_DEF_CFA_REGISTER rbp
421#else
422 SAVE_ARGS
423 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
424#endif
425 testl $3,CS(%rdi)
426 je 1f
427 swapgs
4281: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count
429 movq %gs:pda_irqstackptr,%rax
430 cmoveq %rax,%rsp
431 pushq %rdi # save old stack
432 call \func
433 .endm
434
435ENTRY(common_interrupt)
436 interrupt do_IRQ
437 /* 0(%rsp): oldrsp-ARGOFFSET */
438ret_from_intr:
439 popq %rdi
440 cli
441 subl $1,%gs:pda_irqcount
442#ifdef CONFIG_DEBUG_INFO
443 movq RBP(%rdi),%rbp
444#endif
445 leaq ARGOFFSET(%rdi),%rsp
446exit_intr:
447 GET_THREAD_INFO(%rcx)
448 testl $3,CS-ARGOFFSET(%rsp)
449 je retint_kernel
450
451 /* Interrupt came from user space */
452 /*
453 * Has a correct top of stack, but a partial stack frame
454 * %rcx: thread info. Interrupts off.
455 */
456retint_with_reschedule:
457 movl $_TIF_WORK_MASK,%edi
458retint_check:
459 movl threadinfo_flags(%rcx),%edx
460 andl %edi,%edx
461 jnz retint_careful
462retint_swapgs:
463 cli
464 swapgs
465retint_restore_args:
466 cli
467 RESTORE_ARGS 0,8,0
468iret_label:
469 iretq
470
471 .section __ex_table,"a"
472 .quad iret_label,bad_iret
473 .previous
474 .section .fixup,"ax"
475 /* force a signal here? this matches i386 behaviour */
476 /* running with kernel gs */
477bad_iret:
478 movq $-9999,%rdi /* better code? */
479 jmp do_exit
480 .previous
481
482 /* edi: workmask, edx: work */
483retint_careful:
484 bt $TIF_NEED_RESCHED,%edx
485 jnc retint_signal
486 sti
487 pushq %rdi
488 call schedule
489 popq %rdi
490 GET_THREAD_INFO(%rcx)
491 cli
492 jmp retint_check
493
494retint_signal:
495 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
496 jz retint_swapgs
497 sti
498 SAVE_REST
499 movq $-1,ORIG_RAX(%rsp)
500 xorq %rsi,%rsi # oldset
501 movq %rsp,%rdi # &pt_regs
502 call do_notify_resume
503 RESTORE_REST
504 cli
505 movl $_TIF_NEED_RESCHED,%edi
506 GET_THREAD_INFO(%rcx)
507 jmp retint_check
508
509#ifdef CONFIG_PREEMPT
510 /* Returning to kernel space. Check if we need preemption */
511 /* rcx: threadinfo. interrupts off. */
512 .p2align
513retint_kernel:
514 cmpl $0,threadinfo_preempt_count(%rcx)
515 jnz retint_restore_args
516 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
517 jnc retint_restore_args
518 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
519 jnc retint_restore_args
520 call preempt_schedule_irq
521 jmp exit_intr
522#endif
523 CFI_ENDPROC
524
525/*
526 * APIC interrupts.
527 */
528 .macro apicinterrupt num,func
529 pushq $\num-256
530 interrupt \func
531 jmp ret_from_intr
532 CFI_ENDPROC
533 .endm
534
535ENTRY(thermal_interrupt)
536 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
537
538#ifdef CONFIG_SMP
539ENTRY(reschedule_interrupt)
540 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
541
542ENTRY(invalidate_interrupt)
543 apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
544
545ENTRY(call_function_interrupt)
546 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
547#endif
548
549#ifdef CONFIG_X86_LOCAL_APIC
550ENTRY(apic_timer_interrupt)
551 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
552
553ENTRY(error_interrupt)
554 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
555
556ENTRY(spurious_interrupt)
557 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
558#endif
559
560/*
561 * Exception entry points.
562 */
563 .macro zeroentry sym
564 pushq $0 /* push error code/oldrax */
565 pushq %rax /* push real oldrax to the rdi slot */
566 leaq \sym(%rip),%rax
567 jmp error_entry
568 .endm
569
570 .macro errorentry sym
571 pushq %rax
572 leaq \sym(%rip),%rax
573 jmp error_entry
574 .endm
575
576 /* error code is on the stack already */
577 /* handle NMI like exceptions that can happen everywhere */
578 .macro paranoidentry sym
579 SAVE_ALL
580 cld
581 movl $1,%ebx
582 movl $MSR_GS_BASE,%ecx
583 rdmsr
584 testl %edx,%edx
585 js 1f
586 swapgs
587 xorl %ebx,%ebx
5881: movq %rsp,%rdi
589 movq ORIG_RAX(%rsp),%rsi
590 movq $-1,ORIG_RAX(%rsp)
591 call \sym
592 .endm
593
594/*
595 * Exception entry point. This expects an error code/orig_rax on the stack
596 * and the exception handler in %rax.
597 */
598ENTRY(error_entry)
599 CFI_STARTPROC simple
600 CFI_DEF_CFA rsp,(SS-RDI)
601 CFI_REL_OFFSET rsp,(RSP-RDI)
602 CFI_REL_OFFSET rip,(RIP-RDI)
603 /* rdi slot contains rax, oldrax contains error code */
604 cld
605 subq $14*8,%rsp
606 CFI_ADJUST_CFA_OFFSET (14*8)
607 movq %rsi,13*8(%rsp)
608 CFI_REL_OFFSET rsi,RSI
609 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
610 movq %rdx,12*8(%rsp)
611 CFI_REL_OFFSET rdx,RDX
612 movq %rcx,11*8(%rsp)
613 CFI_REL_OFFSET rcx,RCX
614 movq %rsi,10*8(%rsp) /* store rax */
615 CFI_REL_OFFSET rax,RAX
616 movq %r8, 9*8(%rsp)
617 CFI_REL_OFFSET r8,R8
618 movq %r9, 8*8(%rsp)
619 CFI_REL_OFFSET r9,R9
620 movq %r10,7*8(%rsp)
621 CFI_REL_OFFSET r10,R10
622 movq %r11,6*8(%rsp)
623 CFI_REL_OFFSET r11,R11
624 movq %rbx,5*8(%rsp)
625 CFI_REL_OFFSET rbx,RBX
626 movq %rbp,4*8(%rsp)
627 CFI_REL_OFFSET rbp,RBP
628 movq %r12,3*8(%rsp)
629 CFI_REL_OFFSET r12,R12
630 movq %r13,2*8(%rsp)
631 CFI_REL_OFFSET r13,R13
632 movq %r14,1*8(%rsp)
633 CFI_REL_OFFSET r14,R14
634 movq %r15,(%rsp)
635 CFI_REL_OFFSET r15,R15
636 xorl %ebx,%ebx
637 testl $3,CS(%rsp)
638 je error_kernelspace
639error_swapgs:
640 swapgs
641error_sti:
642 movq %rdi,RDI(%rsp)
643 movq %rsp,%rdi
644 movq ORIG_RAX(%rsp),%rsi /* get error code */
645 movq $-1,ORIG_RAX(%rsp)
646 call *%rax
647 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
648error_exit:
649 movl %ebx,%eax
650 RESTORE_REST
651 cli
652 GET_THREAD_INFO(%rcx)
653 testl %eax,%eax
654 jne retint_kernel
655 movl threadinfo_flags(%rcx),%edx
656 movl $_TIF_WORK_MASK,%edi
657 andl %edi,%edx
658 jnz retint_careful
659 swapgs
660 RESTORE_ARGS 0,8,0
661 iretq
662 CFI_ENDPROC
663
664error_kernelspace:
665 incl %ebx
666 /* There are two places in the kernel that can potentially fault with
667 usergs. Handle them here. The exception handlers after
668 iret run with kernel gs again, so don't set the user space flag.
669 B stepping K8s sometimes report a truncated RIP for IRET
670 exceptions returning to compat mode. Check for these here too. */
671 leaq iret_label(%rip),%rbp
672 cmpq %rbp,RIP(%rsp)
673 je error_swapgs
674 movl %ebp,%ebp /* zero extend */
675 cmpq %rbp,RIP(%rsp)
676 je error_swapgs
677 cmpq $gs_change,RIP(%rsp)
678 je error_swapgs
679 jmp error_sti
680
681 /* Reload gs selector with exception handling */
682 /* edi: new selector */
683ENTRY(load_gs_index)
684 pushf
685 cli
686 swapgs
687gs_change:
688 movl %edi,%gs
6892: mfence /* workaround */
690 swapgs
691 popf
692 ret
693
694 .section __ex_table,"a"
695 .align 8
696 .quad gs_change,bad_gs
697 .previous
698 .section .fixup,"ax"
699 /* running with kernelgs */
700bad_gs:
701 swapgs /* switch back to user gs */
702 xorl %eax,%eax
703 movl %eax,%gs
704 jmp 2b
705 .previous
706
707/*
708 * Create a kernel thread.
709 *
710 * C extern interface:
711 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
712 *
713 * asm input arguments:
714 * rdi: fn, rsi: arg, rdx: flags
715 */
716ENTRY(kernel_thread)
717 CFI_STARTPROC
718 FAKE_STACK_FRAME $child_rip
719 SAVE_ALL
720
721 # rdi: flags, rsi: usp, rdx: will be &pt_regs
722 movq %rdx,%rdi
723 orq kernel_thread_flags(%rip),%rdi
724 movq $-1, %rsi
725 movq %rsp, %rdx
726
727 xorl %r8d,%r8d
728 xorl %r9d,%r9d
729
730 # clone now
731 call do_fork
732 movq %rax,RAX(%rsp)
733 xorl %edi,%edi
734
735 /*
736 * It isn't worth checking for a reschedule here,
737 * so internally to the x86_64 port you can rely on kernel_thread()
738 * not rescheduling the child before returning; this avoids the need
739 * for hacks, for example to fork off the per-CPU idle tasks.
740 * [Hopefully no generic code relies on the reschedule -AK]
741 */
742 RESTORE_ALL
743 UNFAKE_STACK_FRAME
744 ret
745 CFI_ENDPROC
746
747
748child_rip:
749 /*
750 * Here we are in the child and the registers are set as they were
751 * at kernel_thread() invocation in the parent.
752 */
753 movq %rdi, %rax
754 movq %rsi, %rdi
755 call *%rax
756 # exit
757 xorq %rdi, %rdi
758 call do_exit
759
760/*
761 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
762 *
763 * C extern interface:
764 * extern long execve(char *name, char **argv, char **envp)
765 *
766 * asm input arguments:
767 * rdi: name, rsi: argv, rdx: envp
768 *
769 * We want to fall back into:
770 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
771 *
772 * do_sys_execve asm fallback arguments:
773 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
774 */
775ENTRY(execve)
776 CFI_STARTPROC
777 FAKE_STACK_FRAME $0
778 SAVE_ALL
779 call sys_execve
780 movq %rax, RAX(%rsp)
781 RESTORE_REST
782 testq %rax,%rax
783 je int_ret_from_sys_call
784 RESTORE_ARGS
785 UNFAKE_STACK_FRAME
786 ret
787 CFI_ENDPROC
788
789ENTRY(page_fault)
790 errorentry do_page_fault
791
792ENTRY(coprocessor_error)
793 zeroentry do_coprocessor_error
794
795ENTRY(simd_coprocessor_error)
796 zeroentry do_simd_coprocessor_error
797
798ENTRY(device_not_available)
799 zeroentry math_state_restore
800
801 /* runs on exception stack */
802ENTRY(debug)
803 CFI_STARTPROC
804 pushq $0
805 CFI_ADJUST_CFA_OFFSET 8
806 paranoidentry do_debug
807 /* switch back to process stack to restore the state ptrace touched */
808 movq %rax,%rsp
809 testl $3,CS(%rsp)
810 jnz paranoid_userspace
811 jmp paranoid_exit
812 CFI_ENDPROC
813
814 /* runs on exception stack */
815ENTRY(nmi)
816 CFI_STARTPROC
817 pushq $-1
818 CFI_ADJUST_CFA_OFFSET 8
819 paranoidentry do_nmi
820 /* ebx: no swapgs flag */
821paranoid_exit:
822 testl %ebx,%ebx /* swapgs needed? */
823 jnz paranoid_restore
824paranoid_swapgs:
825 cli
826 swapgs
827paranoid_restore:
828 RESTORE_ALL 8
829 iretq
830paranoid_userspace:
831 cli
832 GET_THREAD_INFO(%rcx)
833 movl threadinfo_flags(%rcx),%edx
834 testl $_TIF_NEED_RESCHED,%edx
835 jnz paranoid_resched
836 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
837 jnz paranoid_signal
838 jmp paranoid_swapgs
839paranoid_resched:
840 sti
841 call schedule
842 jmp paranoid_exit
843paranoid_signal:
844 sti
845 xorl %esi,%esi /* oldset */
846 movq %rsp,%rdi /* &pt_regs */
847 call do_notify_resume
848 jmp paranoid_exit
849 CFI_ENDPROC
850
851ENTRY(int3)
852 zeroentry do_int3
853
854ENTRY(overflow)
855 zeroentry do_overflow
856
857ENTRY(bounds)
858 zeroentry do_bounds
859
860ENTRY(invalid_op)
861 zeroentry do_invalid_op
862
863ENTRY(coprocessor_segment_overrun)
864 zeroentry do_coprocessor_segment_overrun
865
866ENTRY(reserved)
867 zeroentry do_reserved
868
869 /* runs on exception stack */
870ENTRY(double_fault)
871 CFI_STARTPROC
872 paranoidentry do_double_fault
873 movq %rax,%rsp
874 testl $3,CS(%rsp)
875 jnz paranoid_userspace
876 jmp paranoid_exit
877 CFI_ENDPROC
878
879ENTRY(invalid_TSS)
880 errorentry do_invalid_TSS
881
882ENTRY(segment_not_present)
883 errorentry do_segment_not_present
884
885 /* runs on exception stack */
886ENTRY(stack_segment)
887 CFI_STARTPROC
888 paranoidentry do_stack_segment
889 movq %rax,%rsp
890 testl $3,CS(%rsp)
891 jnz paranoid_userspace
892 jmp paranoid_exit
893 CFI_ENDPROC
894
895ENTRY(general_protection)
896 errorentry do_general_protection
897
898ENTRY(alignment_check)
899 errorentry do_alignment_check
900
901ENTRY(divide_error)
902 zeroentry do_divide_error
903
904ENTRY(spurious_interrupt_bug)
905 zeroentry do_spurious_interrupt_bug
906
907#ifdef CONFIG_X86_MCE
908 /* runs on exception stack */
909ENTRY(machine_check)
910 CFI_STARTPROC
911 pushq $0
912 CFI_ADJUST_CFA_OFFSET 8
913 paranoidentry do_machine_check
914 jmp paranoid_exit
915 CFI_ENDPROC
916#endif
917
918ENTRY(call_debug)
919 zeroentry do_call_debug
920
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
new file mode 100644
index 000000000000..d2c42fb99cb3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic.c
@@ -0,0 +1,89 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/config.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/module.h>
19
20#include <asm/smp.h>
21#include <asm/ipi.h>
22
23/* which logical CPU number maps to which CPU (physical APIC ID) */
24u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
25EXPORT_SYMBOL(x86_cpu_to_apicid);
26u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
27
28extern struct genapic apic_cluster;
29extern struct genapic apic_flat;
30
31struct genapic *genapic = &apic_flat;
32
33
34/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */
37void __init clustered_apic_check(void)
38{
39 long i;
40 u8 clusters, max_cluster;
41 u8 id;
42 u8 cluster_cnt[NUM_APIC_CLUSTERS];
43
44 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
45 /* AMD always uses flat mode right now */
46 genapic = &apic_flat;
47 goto print;
48 }
49
50 memset(cluster_cnt, 0, sizeof(cluster_cnt));
51
52 for (i = 0; i < NR_CPUS; i++) {
53 id = bios_cpu_apicid[i];
54 if (id != BAD_APICID)
55 cluster_cnt[APIC_CLUSTERID(id)]++;
56 }
57
58 clusters = 0;
59 max_cluster = 0;
60 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
61 if (cluster_cnt[i] > 0) {
62 ++clusters;
63 if (cluster_cnt[i] > max_cluster)
64 max_cluster = cluster_cnt[i];
65 }
66 }
67
68 /*
69 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then use flat mode;
70 * else, if max_cluster <= 4 and cluster_cnt[15] == 0, use clustered logical
71 * mode; else use physical mode.
72 * (We don't use lowest priority delivery + HW APIC IRQ steering, so we
73 * can ignore the clustered logical case and go straight to physical.)
74 */
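	/*
	 * Illustration (hedged): a single-node box whose 8 CPUs all report
	 * APIC IDs in cluster 0 ends up with clusters == 1 and stays in flat
	 * mode, while a multi-chassis box whose IDs span clusters 0-3 gets
	 * clusters == 4 and switches to the clustered driver.
	 */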
75 if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
76 genapic = &apic_flat;
77 else
78 genapic = &apic_cluster;
79
80print:
81 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
82}
83
84/* Same for both flat and clustered. */
85
86void send_IPI_self(int vector)
87{
88 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
89}
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
new file mode 100644
index 000000000000..9703da7202e3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -0,0 +1,130 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Clustered APIC subarch code. Up to 255 CPUs, physical delivery.
6 * (A more realistic maximum is around 230 CPUs.)
7 *
8 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
9 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
10 * James Cleverdon.
11 */
12#include <linux/config.h>
13#include <linux/threads.h>
14#include <linux/cpumask.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <asm/smp.h>
20#include <asm/ipi.h>
21
22
23/*
24 * Set up the logical destination ID.
25 *
26 * Intel recommends to set DFR, LDR and TPR before enabling
27 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
28 * document number 292116). So here it goes...
29 */
30static void cluster_init_apic_ldr(void)
31{
32 unsigned long val, id;
33 long i, count;
34 u8 lid;
35 u8 my_id = hard_smp_processor_id();
36 u8 my_cluster = APIC_CLUSTER(my_id);
37
38 /* Create logical APIC IDs by counting CPUs already in cluster. */
39 for (count = 0, i = NR_CPUS; --i >= 0; ) {
40 lid = x86_cpu_to_log_apicid[i];
41 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
42 ++count;
43 }
44 /*
45 * We only have a 4-bit-wide bitmap in cluster mode. There's no way
46 * to get above 60 CPUs and still give each one its own bit.
47 * But, we're using physical IRQ delivery, so we don't care.
48 * Use bit 3 for the 4th through Nth CPU in each cluster.
49 */
50 if (count >= XAPIC_DEST_CPUS_SHIFT)
51 count = 3;
52 id = my_cluster | (1UL << count);
53 x86_cpu_to_log_apicid[smp_processor_id()] = id;
54 apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
55 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
56 val |= SET_APIC_LOGICAL_ID(id);
57 apic_write_around(APIC_LDR, val);
58}
59
60/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
61
62static cpumask_t cluster_target_cpus(void)
63{
64 return cpumask_of_cpu(0);
65}
66
67static void cluster_send_IPI_mask(cpumask_t mask, int vector)
68{
69 send_IPI_mask_sequence(mask, vector);
70}
71
72static void cluster_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75 cpu_clear(smp_processor_id(), mask);
76
77 if (!cpus_empty(mask))
78 cluster_send_IPI_mask(mask, vector);
79}
80
81static void cluster_send_IPI_all(int vector)
82{
83 cluster_send_IPI_mask(cpu_online_map, vector);
84}
85
86static int cluster_apic_id_registered(void)
87{
88 return 1;
89}
90
91static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
92{
93 int cpu;
94
95 /*
96 * We're using fixed IRQ delivery, can only return one phys APIC ID.
97 * May as well be the first.
98 */
99 cpu = first_cpu(cpumask);
100 if ((unsigned)cpu < NR_CPUS)
101 return x86_cpu_to_apicid[cpu];
102 else
103 return BAD_APICID;
104}
105
106/* cpuid returns the value latched in the HW at reset, not the APIC ID
107 * register's value. For any box whose BIOS changes APIC IDs, like
108 * clustered APIC systems, we must use hard_smp_processor_id.
109 *
110 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
111 */
112static unsigned int phys_pkg_id(int index_msb)
113{
114 return hard_smp_processor_id() >> index_msb;
115}
116
117struct genapic apic_cluster = {
118 .name = "clustered",
119 .int_delivery_mode = dest_Fixed,
120 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
121 .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
122 .target_cpus = cluster_target_cpus,
123 .apic_id_registered = cluster_apic_id_registered,
124 .init_apic_ldr = cluster_init_apic_ldr,
125 .send_IPI_all = cluster_send_IPI_all,
126 .send_IPI_allbutself = cluster_send_IPI_allbutself,
127 .send_IPI_mask = cluster_send_IPI_mask,
128 .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
129 .phys_pkg_id = phys_pkg_id,
130};
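
The logical-ID assignment in cluster_init_apic_ldr() packs the cluster number into the high nibble and gives each of the first four CPUs of a cluster its own bit, with any further CPUs sharing bit 3. A minimal user-space sketch of that packing (make_logical_id() is a hypothetical helper, not kernel code):

#include <stdio.h>

#define APIC_CLUSTER(id)        ((id) & 0xF0)   /* high nibble = cluster */
#define XAPIC_DEST_CPUS_SHIFT   4               /* 4-bit per-cluster bitmap */

/* "count" is how many CPUs of the same cluster already picked a logical ID. */
static unsigned int make_logical_id(unsigned int phys_id, int count)
{
        if (count >= XAPIC_DEST_CPUS_SHIFT)
                count = 3;              /* 4th and later CPUs all share bit 3 */
        return APIC_CLUSTER(phys_id) | (1u << count);
}

int main(void)
{
        int i;

        /* Six CPUs with physical IDs 0x20..0x25, i.e. all in cluster 2. */
        for (i = 0; i < 6; i++)
                printf("phys %#04x -> logical %#04x\n",
                       0x20 + i, make_logical_id(0x20 + i, i));
        return 0;
}
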
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
new file mode 100644
index 000000000000..b4cbbad04226
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Flat APIC subarch code. Maximum 8 CPUs, logical delivery.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/config.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <asm/smp.h>
19#include <asm/ipi.h>
20
21
22static cpumask_t flat_target_cpus(void)
23{
24 return cpu_online_map;
25}
26
27/*
28 * Set up the logical destination ID.
29 *
30 * Intel recommends to set DFR, LDR and TPR before enabling
31 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
32 * document number 292116). So here it goes...
33 */
34static void flat_init_apic_ldr(void)
35{
36 unsigned long val;
37 unsigned long num, id;
38
39 num = smp_processor_id();
40 id = 1UL << num;
41 x86_cpu_to_log_apicid[num] = id;
42 apic_write_around(APIC_DFR, APIC_DFR_FLAT);
43 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
44 val |= SET_APIC_LOGICAL_ID(id);
45 apic_write_around(APIC_LDR, val);
46}
47
48static void flat_send_IPI_allbutself(int vector)
49{
50 /*
51 * if there are no other CPUs in the system then
52 * we get an APIC send error if we try to broadcast.
53 * thus we have to avoid sending IPIs in this case.
54 */
55 if (num_online_cpus() > 1)
56 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
57}
58
59static void flat_send_IPI_all(int vector)
60{
61 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
62}
63
64static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
65{
66 unsigned long mask = cpus_addr(cpumask)[0];
67 unsigned long cfg;
68 unsigned long flags;
69
70 local_save_flags(flags);
71 local_irq_disable();
72
73 /*
74 * Wait for idle.
75 */
76 apic_wait_icr_idle();
77
78 /*
79 * prepare target chip field
80 */
81 cfg = __prepare_ICR2(mask);
82 apic_write_around(APIC_ICR2, cfg);
83
84 /*
85 * program the ICR
86 */
87 cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL);
88
89 /*
90 * Send the IPI. The write to APIC_ICR fires this off.
91 */
92 apic_write_around(APIC_ICR, cfg);
93 local_irq_restore(flags);
94}
95
96static int flat_apic_id_registered(void)
97{
98 return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
99}
100
101static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
102{
103 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
104}
105
106static unsigned int phys_pkg_id(int index_msb)
107{
108 u32 ebx;
109
110 ebx = cpuid_ebx(1);
111 return ((ebx >> 24) & 0xFF) >> index_msb;
112}
113
114struct genapic apic_flat = {
115 .name = "flat",
116 .int_delivery_mode = dest_LowestPrio,
117 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
118 .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
119 .target_cpus = flat_target_cpus,
120 .apic_id_registered = flat_apic_id_registered,
121 .init_apic_ldr = flat_init_apic_ldr,
122 .send_IPI_all = flat_send_IPI_all,
123 .send_IPI_allbutself = flat_send_IPI_allbutself,
124 .send_IPI_mask = flat_send_IPI_mask,
125 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
126 .phys_pkg_id = phys_pkg_id,
127};
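
Because flat mode assigns CPU n the logical ID 1 << n, a cpumask's low byte already is the logical destination bitmap, which is all flat_cpu_mask_to_apicid() has to return. A hedged standalone sketch of that mapping (user-space, names are illustrative):

#include <stdio.h>

#define APIC_ALL_CPUS 0xFFu     /* 8 logical destination bits in flat mode */

/* With one bit per CPU (CPU n -> logical ID 1 << n), the destination bitmap
 * for a set of CPUs is just the low byte of the cpumask itself. */
static unsigned int mask_to_logical_dest(unsigned long cpumask)
{
        return cpumask & APIC_ALL_CPUS;
}

int main(void)
{
        unsigned long cpus_0_2_5 = (1UL << 0) | (1UL << 2) | (1UL << 5);

        printf("cpumask %#lx -> logical destination %#x\n",
               cpus_0_2_5, mask_to_logical_dest(cpus_0_2_5));
        return 0;
}
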
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
new file mode 100644
index 000000000000..b6d8725c1f61
--- /dev/null
+++ b/arch/x86_64/kernel/head.S
@@ -0,0 +1,396 @@
1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 *
9 * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
10 */
11
12
13#include <linux/linkage.h>
14#include <linux/threads.h>
15#include <asm/desc.h>
16#include <asm/segment.h>
17#include <asm/page.h>
18#include <asm/msr.h>
19#include <asm/cache.h>
20
21/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
22 * because we need identity-mapped pages on setup, so define __START_KERNEL to
23 * 0x100000 for this stage
24 *
25 */
26
27 .text
28 .code32
29 .globl startup_32
30/* %bx: 1 if coming from smp trampoline on secondary cpu */
31startup_32:
32
33 /*
34 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
35 * paging disabled and the point of this file is to switch to 64bit
36 * long mode with a kernel mapping for the kernel and to jump into the
37 * kernel virtual addresses.
38 * There is no stack until we set one up.
39 */
40
41 /* Initialize the %ds segment register */
42 movl $__KERNEL_DS,%eax
43 movl %eax,%ds
44
45 /* Load new GDT with the 64bit segments using 32bit descriptor */
46 lgdt pGDT32 - __START_KERNEL_map
47
48 /* If the CPU doesn't support CPUID this will double fault.
49 * Unfortunately it is hard to check for CPUID without a stack.
50 */
51
52 /* Check if extended functions are implemented */
53 movl $0x80000000, %eax
54 cpuid
55 cmpl $0x80000000, %eax
56 jbe no_long_mode
57 /* Check if long mode is implemented */
58 mov $0x80000001, %eax
59 cpuid
60 btl $29, %edx
61 jnc no_long_mode
62
63 /*
64 * Prepare for entering 64bit mode
65 */
66
67 /* Enable PAE mode */
68 xorl %eax, %eax
69 btsl $5, %eax
70 movl %eax, %cr4
71
72 /* Setup early boot stage 4 level pagetables */
73 movl $(init_level4_pgt - __START_KERNEL_map), %eax
74 movl %eax, %cr3
75
76 /* Setup EFER (Extended Feature Enable Register) */
77 movl $MSR_EFER, %ecx
78 rdmsr
79
80 /* Enable Long Mode */
81 btsl $_EFER_LME, %eax
82
83 /* Make changes effective */
84 wrmsr
85
86 xorl %eax, %eax
87 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
88 btsl $0, %eax /* Enable protected mode */
89 /* Make changes effective */
90 movl %eax, %cr0
91 /*
92 * At this point we're in long mode but in 32bit compatibility mode
93 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
94 * EFER.LMA = 1). Now we want to jump into 64bit mode; to do that we use
95 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
96 */
97 ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
98
99 .code64
100 .org 0x100
101 .globl startup_64
102startup_64:
103 /* We come here either from startup_32
104 * or directly from a 64bit bootloader.
105 * Since we may have come directly from a bootloader we
106 * reload the page tables here.
107 */
108
109 /* Enable PAE mode and PGE */
110 xorq %rax, %rax
111 btsq $5, %rax
112 btsq $7, %rax
113 movq %rax, %cr4
114
115 /* Setup early boot stage 4 level pagetables. */
116 movq $(init_level4_pgt - __START_KERNEL_map), %rax
117 movq %rax, %cr3
118
119 /* Check if nx is implemented */
120 movl $0x80000001, %eax
121 cpuid
122 movl %edx,%edi
123
124 /* Setup EFER (Extended Feature Enable Register) */
125 movl $MSR_EFER, %ecx
126 rdmsr
127
128 /* Enable System Call */
129 btsl $_EFER_SCE, %eax
130
131 /* No Execute supported? */
132 btl $20,%edi
133 jnc 1f
134 btsl $_EFER_NX, %eax
1351:
136 /* Make changes effective */
137 wrmsr
138
139 /* Setup cr0 */
140 xorq %rax, %rax
141 btsq $31, %rax /* Enable paging */
142 btsq $0, %rax /* Enable protected mode */
143 btsq $1, %rax /* Enable MP */
144 btsq $4, %rax /* Enable ET */
145 btsq $5, %rax /* Enable NE */
146 btsq $16, %rax /* Enable WP */
147 btsq $18, %rax /* Enable AM */
148 /* Make changes effective */
149 movq %rax, %cr0
150
151 /* Setup a boot time stack */
152 movq init_rsp(%rip),%rsp
153
154 /* zero EFLAGS after setting rsp */
155 pushq $0
156 popfq
157
158 /*
159 * We must switch to a new descriptor in kernel space for the GDT
160 * because soon the kernel won't have access anymore to the userspace
161 * addresses we're currently running on. We have to do that here
162 * because in 32bit we couldn't load a 64bit linear address.
163 */
164 lgdt cpu_gdt_descr
165
166 /*
167 * Setup up a dummy PDA. this is just for some early bootup code
168 * that does in_interrupt()
169 */
170 movl $MSR_GS_BASE,%ecx
171 movq $empty_zero_page,%rax
172 movq %rax,%rdx
173 shrq $32,%rdx
174 wrmsr
175
176 /* set up data segments. actually 0 would do too */
177 movl $__KERNEL_DS,%eax
178 movl %eax,%ds
179 movl %eax,%ss
180 movl %eax,%es
181
182 /* esi is pointer to real mode structure with interesting info.
183 pass it to C */
184 movl %esi, %edi
185
186 /* Finally jump to run C code and to be on the real kernel address.
187 * Since we are running on identity-mapped space we have to jump
188 * to the full 64bit address; this is only possible with an
189 * indirect jump.
190 */
191 movq initial_code(%rip),%rax
192 jmp *%rax
193
194 /* SMP bootup changes these two */
195 .globl initial_code
196initial_code:
197 .quad x86_64_start_kernel
198 .globl init_rsp
199init_rsp:
200 .quad init_thread_union+THREAD_SIZE-8
201
202ENTRY(early_idt_handler)
203 xorl %eax,%eax
204 movq 8(%rsp),%rsi # get rip
205 movq (%rsp),%rdx
206 movq %cr2,%rcx
207 leaq early_idt_msg(%rip),%rdi
208 call early_printk
2091: hlt
210 jmp 1b
211
212early_idt_msg:
213 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
214
215.code32
216ENTRY(no_long_mode)
217 /* This isn't an x86-64 CPU so hang */
2181:
219 jmp 1b
220
221.org 0xf00
222 .globl pGDT32
223pGDT32:
224 .word gdt_end-cpu_gdt_table
225 .long cpu_gdt_table-__START_KERNEL_map
226
227.org 0xf10
228ljumpvector:
229 .long startup_64-__START_KERNEL_map
230 .word __KERNEL_CS
231
232ENTRY(stext)
233ENTRY(_stext)
234
235 /*
236 * This default setting generates an ident mapping at address 0x100000
237 * and a mapping for the kernel that precisely maps virtual address
238 * 0xffffffff80000000 to physical address 0x000000. (always using
239 * 2Mbyte large pages provided by PAE mode)
240 */
241.org 0x1000
242ENTRY(init_level4_pgt)
243 .quad 0x0000000000102007 /* -> level3_ident_pgt */
244 .fill 255,8,0
245 .quad 0x000000000010a007
246 .fill 254,8,0
247 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
248 .quad 0x0000000000103007 /* -> level3_kernel_pgt */
249
250.org 0x2000
251ENTRY(level3_ident_pgt)
252 .quad 0x0000000000104007
253 .fill 511,8,0
254
255.org 0x3000
256ENTRY(level3_kernel_pgt)
257 .fill 510,8,0
258 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
259 .quad 0x0000000000105007 /* -> level2_kernel_pgt */
260 .fill 1,8,0
261
262.org 0x4000
263ENTRY(level2_ident_pgt)
264 /* 40MB for bootup. */
265 .quad 0x0000000000000283
266 .quad 0x0000000000200183
267 .quad 0x0000000000400183
268 .quad 0x0000000000600183
269 .quad 0x0000000000800183
270 .quad 0x0000000000A00183
271 .quad 0x0000000000C00183
272 .quad 0x0000000000E00183
273 .quad 0x0000000001000183
274 .quad 0x0000000001200183
275 .quad 0x0000000001400183
276 .quad 0x0000000001600183
277 .quad 0x0000000001800183
278 .quad 0x0000000001A00183
279 .quad 0x0000000001C00183
280 .quad 0x0000000001E00183
281 .quad 0x0000000002000183
282 .quad 0x0000000002200183
283 .quad 0x0000000002400183
284 .quad 0x0000000002600183
285 /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
286 .globl temp_boot_pmds
287temp_boot_pmds:
288 .fill 492,8,0
289
290.org 0x5000
291ENTRY(level2_kernel_pgt)
292 /* 40MB kernel mapping. The kernel code cannot be bigger than that.
293 When you change this, change KERNEL_TEXT_SIZE in page.h too. */
294 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
295 .quad 0x0000000000000183
296 .quad 0x0000000000200183
297 .quad 0x0000000000400183
298 .quad 0x0000000000600183
299 .quad 0x0000000000800183
300 .quad 0x0000000000A00183
301 .quad 0x0000000000C00183
302 .quad 0x0000000000E00183
303 .quad 0x0000000001000183
304 .quad 0x0000000001200183
305 .quad 0x0000000001400183
306 .quad 0x0000000001600183
307 .quad 0x0000000001800183
308 .quad 0x0000000001A00183
309 .quad 0x0000000001C00183
310 .quad 0x0000000001E00183
311 .quad 0x0000000002000183
312 .quad 0x0000000002200183
313 .quad 0x0000000002400183
314 .quad 0x0000000002600183
315 /* Module mapping starts here */
316 .fill 492,8,0
317
318.org 0x6000
319ENTRY(empty_zero_page)
320
321.org 0x7000
322ENTRY(empty_bad_page)
323
324.org 0x8000
325ENTRY(empty_bad_pte_table)
326
327.org 0x9000
328ENTRY(empty_bad_pmd_table)
329
330.org 0xa000
331ENTRY(level3_physmem_pgt)
332 .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
333
334 .org 0xb000
335#ifdef CONFIG_ACPI_SLEEP
336ENTRY(wakeup_level4_pgt)
337 .quad 0x0000000000102007 /* -> level3_ident_pgt */
338 .fill 255,8,0
339 .quad 0x000000000010a007
340 .fill 254,8,0
341 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
342 .quad 0x0000000000103007 /* -> level3_kernel_pgt */
343#endif
344
345 .data
346
347 .align 16
348 .globl cpu_gdt_descr
349cpu_gdt_descr:
350 .word gdt_end-cpu_gdt_table
351gdt:
352 .quad cpu_gdt_table
353#ifdef CONFIG_SMP
354 .rept NR_CPUS-1
355 .word 0
356 .quad 0
357 .endr
358#endif
359
360/* We need valid kernel segments for data and code in long mode too
361 * IRET will check the segment types kkeil 2000/10/28
362 * Also sysret mandates a special GDT layout
363 */
364
365.align L1_CACHE_BYTES
366
367/* The TLS descriptors are currently at a different place compared to i386.
368 Hopefully nobody expects them at a fixed place (Wine?) */
369
370ENTRY(cpu_gdt_table)
371 .quad 0x0000000000000000 /* NULL descriptor */
372 .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */
373 .quad 0x00af9a000000ffff /* __KERNEL_CS */
374 .quad 0x00cf92000000ffff /* __KERNEL_DS */
375 .quad 0x00cffa000000ffff /* __USER32_CS */
376 .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
377 .quad 0x00affa000000ffff /* __USER_CS */
378 .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
379 .quad 0,0 /* TSS */
380 .quad 0,0 /* LDT */
381 .quad 0,0,0 /* three TLS descriptors */
382 .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */
383 /* base must be patched for real base address. */
384gdt_end:
385 /* asm/segment.h:GDT_ENTRIES must match this */
386 /* This should be a multiple of the cache line size */
387 /* GDTs of other CPUs: */
388 .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table)
389
390 .align L1_CACHE_BYTES
391ENTRY(idt_table)
392 .rept 256
393 .quad 0
394 .quad 0
395 .endr
396
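
The .quad initializers above encode a physical address plus flag bits in one 64-bit page-table entry (e.g. 0x...007 for the upper-level links and 0x...183 for the global 2MB kernel mappings). The small reference decoder below (user-space, illustrative only; it only labels the common architectural bits) makes those constants easier to read:

#include <stdio.h>
#include <stdint.h>

/* x86 page-table entry flag bits used by the early .quad initializers above. */
#define PTE_P   (1ULL << 0)     /* present */
#define PTE_RW  (1ULL << 1)     /* writable */
#define PTE_US  (1ULL << 2)     /* user accessible */
#define PTE_PS  (1ULL << 7)     /* 2MB page (in a level2/PMD entry) */
#define PTE_G   (1ULL << 8)     /* global */

static void decode(uint64_t e)
{
        printf("%#018llx: phys %#llx flags:%s%s%s%s%s\n",
               (unsigned long long)e,
               (unsigned long long)(e & ~0xFFFULL),
               e & PTE_P  ? " P"  : "",
               e & PTE_RW ? " RW" : "",
               e & PTE_US ? " US" : "",
               e & PTE_PS ? " PS" : "",
               e & PTE_G  ? " G"  : "");
}

int main(void)
{
        decode(0x0000000000102007ULL);  /* level4 entry -> level3_ident_pgt */
        decode(0x0000000000000183ULL);  /* first 2MB kernel mapping */
        decode(0x0000000000200183ULL);  /* second 2MB kernel mapping */
        return 0;
}
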
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
new file mode 100644
index 000000000000..6cad46c98a23
--- /dev/null
+++ b/arch/x86_64/kernel/head64.c
@@ -0,0 +1,117 @@
1/*
2 * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 *
6 * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
7 */
8
9#include <linux/init.h>
10#include <linux/linkage.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/percpu.h>
15
16#include <asm/processor.h>
17#include <asm/proto.h>
18#include <asm/smp.h>
19#include <asm/bootsetup.h>
20#include <asm/setup.h>
21#include <asm/desc.h>
22
23/* Don't add a printk in there. printk relies on the PDA which is not initialized
24 yet. */
25static void __init clear_bss(void)
26{
27 extern char __bss_start[], __bss_end[];
28 memset(__bss_start, 0,
29 (unsigned long) __bss_end - (unsigned long) __bss_start);
30}
31
32extern char x86_boot_params[2048];
33
34#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
35#define OLD_CL_MAGIC_ADDR 0x90020
36#define OLD_CL_MAGIC 0xA33F
37#define OLD_CL_BASE_ADDR 0x90000
38#define OLD_CL_OFFSET 0x90022
39
40extern char saved_command_line[];
41
42static void __init copy_bootdata(char *real_mode_data)
43{
44 int new_data;
45 char * command_line;
46
47 memcpy(x86_boot_params, real_mode_data, 2048);
48 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
49 if (!new_data) {
50 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
51 printk("so old bootloader that it does not support commandline?!\n");
52 return;
53 }
54 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
55 printk("old bootloader convention, maybe loadlin?\n");
56 }
57 command_line = (char *) ((u64)(new_data));
58 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
59 printk("Bootdata ok (command line is %s)\n", saved_command_line);
60}
61
62static void __init setup_boot_cpu_data(void)
63{
64 unsigned int dummy, eax;
65
66 /* get vendor info */
67 cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
68 (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
69 (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
70 (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
71
72 /* get cpu type */
73 cpuid(1, &eax, &dummy, &dummy,
74 (unsigned int *) &boot_cpu_data.x86_capability);
75 boot_cpu_data.x86 = (eax >> 8) & 0xf;
76 boot_cpu_data.x86_model = (eax >> 4) & 0xf;
77 boot_cpu_data.x86_mask = eax & 0xf;
78}
79
80extern char _end[];
81
82void __init x86_64_start_kernel(char * real_mode_data)
83{
84 char *s;
85 int i;
86
87 for (i = 0; i < 256; i++)
88 set_intr_gate(i, early_idt_handler);
89 asm volatile("lidt %0" :: "m" (idt_descr));
90 clear_bss();
91 pda_init(0);
92 copy_bootdata(real_mode_data);
93#ifdef CONFIG_SMP
94 cpu_set(0, cpu_online_map);
95#endif
96 /* default console: */
97 if (!strstr(saved_command_line, "console="))
98 strcat(saved_command_line, " console=tty0");
99 s = strstr(saved_command_line, "earlyprintk=");
100 if (s != NULL)
101 setup_early_printk(s);
102#ifdef CONFIG_DISCONTIGMEM
103 s = strstr(saved_command_line, "numa=");
104 if (s != NULL)
105 numa_setup(s+5);
106#endif
107#ifdef CONFIG_X86_IO_APIC
108 if (strstr(saved_command_line, "disableapic"))
109 disable_apic = 1;
110#endif
111 /* You need early console to see that */
112 if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
113 panic("Kernel too big for kernel mapping\n");
114
115 setup_boot_cpu_data();
116 start_kernel();
117}
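
setup_boot_cpu_data() above splits the CPUID leaf-1 EAX signature into base family, model and stepping nibbles. A tiny worked example (the sample signature value is made up, and only the base fields are decoded):

#include <stdio.h>

static void decode_signature(unsigned int eax)
{
        unsigned int family   = (eax >> 8) & 0xF;
        unsigned int model    = (eax >> 4) & 0xF;
        unsigned int stepping = eax & 0xF;

        printf("eax=%#x -> family %u, model %u, stepping %u\n",
               eax, family, model, stepping);
}

int main(void)
{
        decode_signature(0x0F4A);       /* hypothetical signature value */
        return 0;
}
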
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
new file mode 100644
index 000000000000..ba139cac57ce
--- /dev/null
+++ b/arch/x86_64/kernel/i387.c
@@ -0,0 +1,155 @@
1/*
2 * linux/arch/x86_64/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * General FPU state handling cleanups
9 * Gareth Hughes <gareth@valinux.com>, May 2000
10 *
11 * x86-64 rework 2002 Andi Kleen.
12 * Does direct fxsave in and out of user space now for signal handlers.
13 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
14 * the 64bit user space sees a FXSAVE frame directly.
15 */
16
17#include <linux/config.h>
18#include <linux/sched.h>
19#include <linux/init.h>
20#include <asm/processor.h>
21#include <asm/i387.h>
22#include <asm/sigcontext.h>
23#include <asm/user.h>
24#include <asm/ptrace.h>
25#include <asm/uaccess.h>
26
27unsigned int mxcsr_feature_mask = 0xffffffff;
28
29void mxcsr_feature_mask_init(void)
30{
31 unsigned int mask;
32 clts();
33 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
34 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
35 mask = current->thread.i387.fxsave.mxcsr_mask;
36 if (mask == 0) mask = 0x0000ffbf;
37 mxcsr_feature_mask &= mask;
38 stts();
39}
40
41/*
42 * Called at bootup to set up the initial FPU state that is later cloned
43 * into all processes.
44 */
45void __init fpu_init(void)
46{
47 unsigned long oldcr0 = read_cr0();
48 extern void __bad_fxsave_alignment(void);
49
50 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
51 __bad_fxsave_alignment();
52 set_in_cr4(X86_CR4_OSFXSR);
53 set_in_cr4(X86_CR4_OSXMMEXCPT);
54
55 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
56
57 mxcsr_feature_mask_init();
58 /* clean state in init */
59 current_thread_info()->status = 0;
60 clear_used_math();
61}
62
63void init_fpu(struct task_struct *child)
64{
65 if (tsk_used_math(child)) {
66 if (child == current)
67 unlazy_fpu(child);
68 return;
69 }
70 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
71 child->thread.i387.fxsave.cwd = 0x37f;
72 child->thread.i387.fxsave.mxcsr = 0x1f80;
73 /* only the device not available exception or ptrace can call init_fpu */
74 set_stopped_child_used_math(child);
75}
76
77/*
78 * Signal frame handlers.
79 */
80
81int save_i387(struct _fpstate __user *buf)
82{
83 struct task_struct *tsk = current;
84 int err = 0;
85
86 {
87 extern void bad_user_i387_struct(void);
88 if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
89 bad_user_i387_struct();
90 }
91
92 if ((unsigned long)buf % 16)
93 printk("save_i387: bad fpstate %p\n",buf);
94
95 if (!used_math())
96 return 0;
97 clear_used_math(); /* trigger finit */
98 if (tsk->thread_info->status & TS_USEDFPU) {
99 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
100 if (err) return err;
101 stts();
102 } else {
103 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
104 sizeof(struct i387_fxsave_struct)))
105 return -1;
106 }
107 return 1;
108}
109
110/*
111 * ptrace request handlers.
112 */
113
114int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
115{
116 init_fpu(tsk);
117 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
118 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
119}
120
121int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
122{
123 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
124 sizeof(struct user_i387_struct)))
125 return -EFAULT;
126 return 0;
127}
128
129/*
130 * FPU state for core dumps.
131 */
132
133int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
134{
135 struct task_struct *tsk = current;
136
137 if (!used_math())
138 return 0;
139
140 unlazy_fpu(tsk);
141 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
142 return 1;
143}
144
145int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
146{
147 int fpvalid = !!tsk_used_math(tsk);
148
149 if (fpvalid) {
150 if (tsk == current)
151 unlazy_fpu(tsk);
152 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
153	}
154 return fpvalid;
155}
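
fpu_init() above guards the 16-byte alignment of the fxsave area with a call to an undefined function: if the compiler cannot prove the condition false and drop the call, the final link fails. A user-space sketch of the same trick (names are illustrative; like the kernel build, it assumes an optimizing compile such as -O2 so the dead call is removed):

#include <stdio.h>
#include <stddef.h>

extern void fxsave_area_is_misaligned(void);    /* deliberately never defined */

struct demo_thread {
        long pad[4];                             /* 32 bytes of other state */
        unsigned char fxsave_area[512] __attribute__((aligned(16)));
};

int main(void)
{
        /* offsetof() is a compile-time constant, so an optimizing build drops
         * this branch entirely when the offset is 16-byte aligned; otherwise
         * the reference to the undefined function breaks the link. */
        if (offsetof(struct demo_thread, fxsave_area) & 15)
                fxsave_area_is_misaligned();

        printf("fxsave area offset: %zu\n",
               offsetof(struct demo_thread, fxsave_area));
        return 0;
}
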
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
new file mode 100644
index 000000000000..d52701246010
--- /dev/null
+++ b/arch/x86_64/kernel/i8259.c
@@ -0,0 +1,579 @@
1#include <linux/linkage.h>
2#include <linux/config.h>
3#include <linux/errno.h>
4#include <linux/signal.h>
5#include <linux/sched.h>
6#include <linux/ioport.h>
7#include <linux/interrupt.h>
8#include <linux/timex.h>
9#include <linux/slab.h>
10#include <linux/random.h>
11#include <linux/smp_lock.h>
12#include <linux/init.h>
13#include <linux/kernel_stat.h>
14#include <linux/sysdev.h>
15#include <linux/bitops.h>
16
17#include <asm/acpi.h>
18#include <asm/atomic.h>
19#include <asm/system.h>
20#include <asm/io.h>
21#include <asm/irq.h>
22#include <asm/hw_irq.h>
23#include <asm/pgtable.h>
24#include <asm/delay.h>
25#include <asm/desc.h>
26#include <asm/apic.h>
27
28#include <linux/irq.h>
29
30/*
31 * Common place to define all x86 IRQ vectors
32 *
33 * This builds up the IRQ handler stubs using some ugly macros in irq.h
34 *
35 * These macros create the low-level assembly IRQ routines that save
36 * register context and call do_IRQ(). do_IRQ() then does all the
37 * operations that are needed to keep the AT (or SMP IOAPIC)
38 * interrupt-controller happy.
39 */
40
41#define BI(x,y) \
42 BUILD_IRQ(x##y)
43
44#define BUILD_16_IRQS(x) \
45 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
46 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
47 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
48 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
49
50#define BUILD_14_IRQS(x) \
51 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
52 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
53 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
54 BI(x,c) BI(x,d)
55
56/*
57 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
58 * (these are usually mapped to vectors 0x20-0x2f)
59 */
60BUILD_16_IRQS(0x0)
61
62#ifdef CONFIG_X86_LOCAL_APIC
63/*
64 * The IO-APIC gives us many more interrupt sources. Most of these
65 * are unused but an SMP system is supposed to have enough memory ...
66 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
67 * across the spectrum, so we really want to be prepared to get all
68 * of these. Plus, more powerful systems might have more than 64
69 * IO-APIC registers.
70 *
71 * (these are usually mapped into the 0x30-0xff vector range)
72 */
73 BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
74BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
75BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
76BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
77
78#ifdef CONFIG_PCI_MSI
79 BUILD_14_IRQS(0xe)
80#endif
81
82#endif
83
84#undef BUILD_16_IRQS
85#undef BUILD_14_IRQS
86#undef BI
87
88
89#define IRQ(x,y) \
90 IRQ##x##y##_interrupt
91
92#define IRQLIST_16(x) \
93 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
94 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
95 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
96 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
97
98#define IRQLIST_14(x) \
99 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
100 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
101 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
102 IRQ(x,c), IRQ(x,d)
103
104void (*interrupt[NR_IRQS])(void) = {
105 IRQLIST_16(0x0),
106
107#ifdef CONFIG_X86_IO_APIC
108 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
109 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
110 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
111 IRQLIST_16(0xc), IRQLIST_16(0xd)
112
113#ifdef CONFIG_PCI_MSI
114 , IRQLIST_14(0xe)
115#endif
116
117#endif
118};
119
120#undef IRQ
121#undef IRQLIST_16
122#undef IRQLIST_14
123
124/*
125 * This is the 'legacy' 8259A Programmable Interrupt Controller,
126 * present in the majority of PC/AT boxes.
127 * plus some generic x86 specific things, if generic specifics make
128 * any sense at all.
129 * This file should become arch/i386/kernel/irq.c when the old irq.c
130 * moves to arch-independent land.
131 */
132
133DEFINE_SPINLOCK(i8259A_lock);
134
135static void end_8259A_irq (unsigned int irq)
136{
137 if (irq > 256) {
138 char var;
139 printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info);
140
141 BUG();
142 }
143
144 if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
145 irq_desc[irq].action)
146 enable_8259A_irq(irq);
147}
148
149#define shutdown_8259A_irq disable_8259A_irq
150
151static void mask_and_ack_8259A(unsigned int);
152
153static unsigned int startup_8259A_irq(unsigned int irq)
154{
155 enable_8259A_irq(irq);
156 return 0; /* never anything pending */
157}
158
159static struct hw_interrupt_type i8259A_irq_type = {
160 "XT-PIC",
161 startup_8259A_irq,
162 shutdown_8259A_irq,
163 enable_8259A_irq,
164 disable_8259A_irq,
165 mask_and_ack_8259A,
166 end_8259A_irq,
167 NULL
168};
169
170/*
171 * 8259A PIC functions to handle ISA devices:
172 */
173
174/*
175 * This contains the irq mask for both 8259A irq controllers,
176 */
177static unsigned int cached_irq_mask = 0xffff;
178
179#define __byte(x,y) (((unsigned char *)&(y))[x])
180#define cached_21 (__byte(0,cached_irq_mask))
181#define cached_A1 (__byte(1,cached_irq_mask))
182
183/*
184 * Not all IRQs can be routed through the IO-APIC, e.g. on certain (older)
185 * boards the timer interrupt is not really connected to any IO-APIC pin,
186 * it's fed to the master 8259A's IR0 line only.
187 *
188 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
189 * This 'mixed mode' IRQ handling costs nothing because it's only used
190 * at IRQ setup time.
191 */
192unsigned long io_apic_irqs;
193
194void disable_8259A_irq(unsigned int irq)
195{
196 unsigned int mask = 1 << irq;
197 unsigned long flags;
198
199 spin_lock_irqsave(&i8259A_lock, flags);
200 cached_irq_mask |= mask;
201 if (irq & 8)
202 outb(cached_A1,0xA1);
203 else
204 outb(cached_21,0x21);
205 spin_unlock_irqrestore(&i8259A_lock, flags);
206}
207
208void enable_8259A_irq(unsigned int irq)
209{
210 unsigned int mask = ~(1 << irq);
211 unsigned long flags;
212
213 spin_lock_irqsave(&i8259A_lock, flags);
214 cached_irq_mask &= mask;
215 if (irq & 8)
216 outb(cached_A1,0xA1);
217 else
218 outb(cached_21,0x21);
219 spin_unlock_irqrestore(&i8259A_lock, flags);
220}
221
222int i8259A_irq_pending(unsigned int irq)
223{
224 unsigned int mask = 1<<irq;
225 unsigned long flags;
226 int ret;
227
228 spin_lock_irqsave(&i8259A_lock, flags);
229 if (irq < 8)
230 ret = inb(0x20) & mask;
231 else
232 ret = inb(0xA0) & (mask >> 8);
233 spin_unlock_irqrestore(&i8259A_lock, flags);
234
235 return ret;
236}
237
238void make_8259A_irq(unsigned int irq)
239{
240 disable_irq_nosync(irq);
241 io_apic_irqs &= ~(1<<irq);
242 irq_desc[irq].handler = &i8259A_irq_type;
243 enable_irq(irq);
244}
245
246/*
247 * This function assumes to be called rarely. Switching between
248 * 8259A registers is slow.
249 * This has to be protected by the irq controller spinlock
250 * before being called.
251 */
252static inline int i8259A_irq_real(unsigned int irq)
253{
254 int value;
255 int irqmask = 1<<irq;
256
257 if (irq < 8) {
258 outb(0x0B,0x20); /* ISR register */
259 value = inb(0x20) & irqmask;
260 outb(0x0A,0x20); /* back to the IRR register */
261 return value;
262 }
263 outb(0x0B,0xA0); /* ISR register */
264 value = inb(0xA0) & (irqmask >> 8);
265 outb(0x0A,0xA0); /* back to the IRR register */
266 return value;
267}
268
269/*
270 * Careful! The 8259A is a fragile beast, it pretty
271 * much _has_ to be done exactly like this (mask it
272 * first, _then_ send the EOI, and the order of EOI
273 * to the two 8259s is important!
274 */
275static void mask_and_ack_8259A(unsigned int irq)
276{
277 unsigned int irqmask = 1 << irq;
278 unsigned long flags;
279
280 spin_lock_irqsave(&i8259A_lock, flags);
281 /*
282 * Lightweight spurious IRQ detection. We do not want
283 * to overdo spurious IRQ handling - it's usually a sign
284 * of hardware problems, so we only do the checks we can
285 * do without slowing down good hardware unnecessarily.
286 *
287 * Note that IRQ7 and IRQ15 (the two spurious IRQs
288 * usually resulting from the 8259A-1|2 PICs) occur
289 * even if the IRQ is masked in the 8259A. Thus we
290 * can check spurious 8259A IRQs without doing the
291 * quite slow i8259A_irq_real() call for every IRQ.
292 * This does not cover 100% of spurious interrupts,
293 * but should be enough to warn the user that there
294 * is something bad going on ...
295 */
296 if (cached_irq_mask & irqmask)
297 goto spurious_8259A_irq;
298 cached_irq_mask |= irqmask;
299
300handle_real_irq:
301 if (irq & 8) {
302 inb(0xA1); /* DUMMY - (do we need this?) */
303 outb(cached_A1,0xA1);
304 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
305 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
306 } else {
307 inb(0x21); /* DUMMY - (do we need this?) */
308 outb(cached_21,0x21);
309 outb(0x60+irq,0x20); /* 'Specific EOI' to master */
310 }
311 spin_unlock_irqrestore(&i8259A_lock, flags);
312 return;
313
314spurious_8259A_irq:
315 /*
316 * this is the slow path - should happen rarely.
317 */
318 if (i8259A_irq_real(irq))
319 /*
320 * oops, the IRQ _is_ in service according to the
321 * 8259A - not spurious, go handle it.
322 */
323 goto handle_real_irq;
324
325 {
326 static int spurious_irq_mask;
327 /*
328 * At this point we can be sure the IRQ is spurious,
329 * let's ACK and report it. [once per IRQ]
330 */
331 if (!(spurious_irq_mask & irqmask)) {
332 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
333 spurious_irq_mask |= irqmask;
334 }
335 atomic_inc(&irq_err_count);
336 /*
337 * Theoretically we do not have to handle this IRQ,
338 * but in Linux this does not cause problems and is
339 * simpler for us.
340 */
341 goto handle_real_irq;
342 }
343}
344
345void init_8259A(int auto_eoi)
346{
347 unsigned long flags;
348
349 spin_lock_irqsave(&i8259A_lock, flags);
350
351 outb(0xff, 0x21); /* mask all of 8259A-1 */
352 outb(0xff, 0xA1); /* mask all of 8259A-2 */
353
354 /*
355 * outb_p - this has to work on a wide range of PC hardware.
356 */
357 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
358 outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
359 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
360 if (auto_eoi)
361 outb_p(0x03, 0x21); /* master does Auto EOI */
362 else
363 outb_p(0x01, 0x21); /* master expects normal EOI */
364
365 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
366 outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
367 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
368 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
369 is to be investigated) */
370
371 if (auto_eoi)
372 /*
373 * in AEOI mode we just have to mask the interrupt
374 * when acking.
375 */
376 i8259A_irq_type.ack = disable_8259A_irq;
377 else
378 i8259A_irq_type.ack = mask_and_ack_8259A;
379
380 udelay(100); /* wait for 8259A to initialize */
381
382 outb(cached_21, 0x21); /* restore master IRQ mask */
383 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
384
385 spin_unlock_irqrestore(&i8259A_lock, flags);
386}
387
388static char irq_trigger[2];
389/**
390 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
391 */
392static void restore_ELCR(char *trigger)
393{
394 outb(trigger[0], 0x4d0);
395 outb(trigger[1], 0x4d1);
396}
397
398static void save_ELCR(char *trigger)
399{
400 /* IRQ 0,1,2,8,13 are marked as reserved */
401 trigger[0] = inb(0x4d0) & 0xF8;
402 trigger[1] = inb(0x4d1) & 0xDE;
403}
404
405static int i8259A_resume(struct sys_device *dev)
406{
407 init_8259A(0);
408 restore_ELCR(irq_trigger);
409 return 0;
410}
411
412static int i8259A_suspend(struct sys_device *dev, u32 state)
413{
414 save_ELCR(irq_trigger);
415 return 0;
416}
417
418static struct sysdev_class i8259_sysdev_class = {
419 set_kset_name("i8259"),
420 .suspend = i8259A_suspend,
421 .resume = i8259A_resume,
422};
423
424static struct sys_device device_i8259A = {
425 .id = 0,
426 .cls = &i8259_sysdev_class,
427};
428
429static int __init i8259A_init_sysfs(void)
430{
431 int error = sysdev_class_register(&i8259_sysdev_class);
432 if (!error)
433 error = sysdev_register(&device_i8259A);
434 return error;
435}
436
437device_initcall(i8259A_init_sysfs);
438
439/*
440 * IRQ2 is cascade interrupt to second interrupt controller
441 */
442
443static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
444
445void __init init_ISA_irqs (void)
446{
447 int i;
448
449#ifdef CONFIG_X86_LOCAL_APIC
450 init_bsp_APIC();
451#endif
452 init_8259A(0);
453
454 for (i = 0; i < NR_IRQS; i++) {
455 irq_desc[i].status = IRQ_DISABLED;
456 irq_desc[i].action = NULL;
457 irq_desc[i].depth = 1;
458
459 if (i < 16) {
460 /*
461 * 16 old-style INTA-cycle interrupts:
462 */
463 irq_desc[i].handler = &i8259A_irq_type;
464 } else {
465 /*
466 * 'high' PCI IRQs filled in on demand
467 */
468 irq_desc[i].handler = &no_irq_type;
469 }
470 }
471}
472
473void apic_timer_interrupt(void);
474void spurious_interrupt(void);
475void error_interrupt(void);
476void reschedule_interrupt(void);
477void call_function_interrupt(void);
478void invalidate_interrupt(void);
479void thermal_interrupt(void);
480void i8254_timer_resume(void);
481
482static void setup_timer(void)
483{
484 outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
485 udelay(10);
486 outb_p(LATCH & 0xff , 0x40); /* LSB */
487 udelay(10);
488 outb(LATCH >> 8 , 0x40); /* MSB */
489}
490
491static int timer_resume(struct sys_device *dev)
492{
493 setup_timer();
494 return 0;
495}
496
497void i8254_timer_resume(void)
498{
499 setup_timer();
500}
501
502static struct sysdev_class timer_sysclass = {
503 set_kset_name("timer"),
504 .resume = timer_resume,
505};
506
507static struct sys_device device_timer = {
508 .id = 0,
509 .cls = &timer_sysclass,
510};
511
512static int __init init_timer_sysfs(void)
513{
514 int error = sysdev_class_register(&timer_sysclass);
515 if (!error)
516 error = sysdev_register(&device_timer);
517 return error;
518}
519
520device_initcall(init_timer_sysfs);
521
522void __init init_IRQ(void)
523{
524 int i;
525
526 init_ISA_irqs();
527 /*
528 * Cover the whole vector space, no vector can escape
529 * us. (some of these will be overridden and become
530 * 'special' SMP interrupts)
531 */
532 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
533 int vector = FIRST_EXTERNAL_VECTOR + i;
534 if (i >= NR_IRQS)
535 break;
536 if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) {
537 set_intr_gate(vector, interrupt[i]);
538 }
539 }
540
541#ifdef CONFIG_SMP
542 /*
543 * IRQ0 must be given a fixed assignment and initialized,
544 * because it's used before the IO-APIC is set up.
545 */
546 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
547
548 /*
549 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
550 * IPI, driven by wakeup.
551 */
552 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
553
554 /* IPI for invalidation */
555 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
556
557 /* IPI for generic function call */
558 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
559#endif
560 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
561
562#ifdef CONFIG_X86_LOCAL_APIC
563 /* self generated IPI for local APIC timer */
564 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
565
566 /* IPI vectors for APIC spurious and error interrupts */
567 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
568 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
569#endif
570
571 /*
572 * Set the clock to HZ Hz; we already have a valid
573 * vector now:
574 */
575 setup_timer();
576
577 if (!acpi_ioapic)
578 setup_irq(2, &irq2);
579}
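
The enable/disable paths above never read the PIC mask registers back; they keep a 16-bit software copy, cached_irq_mask, whose low byte shadows the master's mask register (port 0x21) and whose high byte shadows the slave's (port 0xA1). A standalone sketch of that bookkeeping (it prints the port writes instead of performing them; helper names are hypothetical):

#include <stdio.h>

static unsigned int cached_irq_mask = 0xffff;   /* all IRQs masked at boot */

/* Print the byte that real code would write to the PIC owning this IRQ. */
static void write_mask_for(unsigned int irq)
{
        if (irq & 8)
                printf("outb(0x%02x, 0xA1)  /* slave  */\n",
                       (cached_irq_mask >> 8) & 0xff);
        else
                printf("outb(0x%02x, 0x21)  /* master */\n",
                       cached_irq_mask & 0xff);
}

static void disable_irq_line(unsigned int irq)
{
        cached_irq_mask |= 1u << irq;           /* set bit = masked */
        write_mask_for(irq);
}

static void enable_irq_line(unsigned int irq)
{
        cached_irq_mask &= ~(1u << irq);        /* clear bit = unmasked */
        write_mask_for(irq);
}

int main(void)
{
        enable_irq_line(0);     /* timer on the master PIC */
        enable_irq_line(12);    /* PS/2 mouse on the slave PIC */
        disable_irq_line(12);
        return 0;
}
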
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
new file mode 100644
index 000000000000..c4dc91491157
--- /dev/null
+++ b/arch/x86_64/kernel/init_task.c
@@ -0,0 +1,49 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial task structure.
23 *
24 * We need to make sure that this is 8192-byte aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40/*
41 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
42 * no more per-task TSS's. The TSS size is kept cacheline-aligned
43 * so they are allowed to end up in the .data.cacheline_aligned
44 * section. Since TSS's are completely CPU-local, we want them
45 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
46 */
47DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp;
48
49#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
new file mode 100644
index 000000000000..adebe70c180e
--- /dev/null
+++ b/arch/x86_64/kernel/io_apic.c
@@ -0,0 +1,1982 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/irq.h>
25#include <linux/interrupt.h>
26#include <linux/init.h>
27#include <linux/delay.h>
28#include <linux/sched.h>
29#include <linux/config.h>
30#include <linux/smp_lock.h>
31#include <linux/mc146818rtc.h>
32#include <linux/acpi.h>
33#include <linux/sysdev.h>
34
35#include <asm/io.h>
36#include <asm/smp.h>
37#include <asm/desc.h>
38#include <asm/proto.h>
39#include <asm/mach_apic.h>
40
41#define __apicdebuginit __init
42
43int sis_apic_bug; /* not actually supported, dummy for compile */
44
45static DEFINE_SPINLOCK(ioapic_lock);
46
47/*
48 * # of IRQ routing registers
49 */
50int nr_ioapic_registers[MAX_IO_APICS];
51
52/*
53 * Rough estimation of how many shared IRQs there are, can
54 * be changed anytime.
55 */
56#define MAX_PLUS_SHARED_IRQS NR_IRQS
57#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
58
59/*
60 * This is performance-critical, we want to do it O(1)
61 *
62 * the indexing order of this array favors 1:1 mappings
63 * between pins and IRQs.
64 */
65
66static struct irq_pin_list {
67 short apic, pin, next;
68} irq_2_pin[PIN_MAP_SIZE];
69
70int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
71#ifdef CONFIG_PCI_MSI
72#define vector_to_irq(vector) \
73 (platform_legacy_irq(vector) ? vector : vector_irq[vector])
74#else
75#define vector_to_irq(vector) (vector)
76#endif
77
78/*
79 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
80 * shared ISA-space IRQs, so we have to support them. We are super
81 * fast in the common case, and fast for shared ISA-space IRQs.
82 */
83static void add_pin_to_irq(unsigned int irq, int apic, int pin)
84{
85 static int first_free_entry = NR_IRQS;
86 struct irq_pin_list *entry = irq_2_pin + irq;
87
88 while (entry->next)
89 entry = irq_2_pin + entry->next;
90
91 if (entry->pin != -1) {
92 entry->next = first_free_entry;
93 entry = irq_2_pin + entry->next;
94 if (++first_free_entry >= PIN_MAP_SIZE)
95 panic("io_apic.c: whoops");
96 }
97 entry->apic = apic;
98 entry->pin = pin;
99}
100
101#define __DO_ACTION(R, ACTION, FINAL) \
102 \
103{ \
104 int pin; \
105 struct irq_pin_list *entry = irq_2_pin + irq; \
106 \
107 for (;;) { \
108 unsigned int reg; \
109 pin = entry->pin; \
110 if (pin == -1) \
111 break; \
112 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
113 reg ACTION; \
114 io_apic_modify(entry->apic, reg); \
115 if (!entry->next) \
116 break; \
117 entry = irq_2_pin + entry->next; \
118 } \
119 FINAL; \
120}
121
122#define DO_ACTION(name,R,ACTION, FINAL) \
123 \
124 static void name##_IO_APIC_irq (unsigned int irq) \
125 __DO_ACTION(R, ACTION, FINAL)
126
127DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
128 /* mask = 1 */
129DO_ACTION( __unmask, 0, &= 0xfffeffff, )
130 /* mask = 0 */
131
132static void mask_IO_APIC_irq (unsigned int irq)
133{
134 unsigned long flags;
135
136 spin_lock_irqsave(&ioapic_lock, flags);
137 __mask_IO_APIC_irq(irq);
138 spin_unlock_irqrestore(&ioapic_lock, flags);
139}
140
141static void unmask_IO_APIC_irq (unsigned int irq)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&ioapic_lock, flags);
146 __unmask_IO_APIC_irq(irq);
147 spin_unlock_irqrestore(&ioapic_lock, flags);
148}
149
150static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
151{
152 struct IO_APIC_route_entry entry;
153 unsigned long flags;
154
155 /* Check delivery_mode to be sure we're not clearing an SMI pin */
156 spin_lock_irqsave(&ioapic_lock, flags);
157 *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
158 *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
159 spin_unlock_irqrestore(&ioapic_lock, flags);
160 if (entry.delivery_mode == dest_SMI)
161 return;
162 /*
163 * Disable it in the IO-APIC irq-routing table:
164 */
165 memset(&entry, 0, sizeof(entry));
166 entry.mask = 1;
167 spin_lock_irqsave(&ioapic_lock, flags);
168 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
169 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
170 spin_unlock_irqrestore(&ioapic_lock, flags);
171}
172
173static void clear_IO_APIC (void)
174{
175 int apic, pin;
176
177 for (apic = 0; apic < nr_ioapics; apic++)
178 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
179 clear_IO_APIC_pin(apic, pin);
180}
181
182/*
183 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
184 * specific CPU-side IRQs.
185 */
186
187#define MAX_PIRQS 8
188static int pirq_entries [MAX_PIRQS];
189static int pirqs_enabled;
190int skip_ioapic_setup;
191int ioapic_force;
192
193/* dummy parsing: see setup.c */
194
195static int __init disable_ioapic_setup(char *str)
196{
197 skip_ioapic_setup = 1;
198 return 1;
199}
200
201static int __init enable_ioapic_setup(char *str)
202{
203 ioapic_force = 1;
204 skip_ioapic_setup = 0;
205 return 1;
206}
207
208__setup("noapic", disable_ioapic_setup);
209__setup("apic", enable_ioapic_setup);
210
211#include <asm/pci-direct.h>
212#include <linux/pci_ids.h>
213#include <linux/pci.h>
214
215/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
216 off. Check for an Nvidia or VIA PCI bridge and turn it off.
217 Use pci direct infrastructure because this runs before the PCI subsystem.
218
219 Can be overridden with "apic".
220
221 And another hack to disable the IOMMU on VIA chipsets.
222
223 Kludge-O-Rama. */
224void __init check_ioapic(void)
225{
226 int num,slot,func;
227 if (ioapic_force)
228 return;
229
230 /* Poor man's PCI discovery */
231 for (num = 0; num < 32; num++) {
232 for (slot = 0; slot < 32; slot++) {
233 for (func = 0; func < 8; func++) {
234 u32 class;
235 u32 vendor;
236 u8 type;
237 class = read_pci_config(num,slot,func,
238 PCI_CLASS_REVISION);
239 if (class == 0xffffffff)
240 break;
241
242 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
243 continue;
244
245 vendor = read_pci_config(num, slot, func,
246 PCI_VENDOR_ID);
247 vendor &= 0xffff;
248 switch (vendor) {
249 case PCI_VENDOR_ID_VIA:
250#ifdef CONFIG_GART_IOMMU
251 if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) ||
252 force_iommu) &&
253 !iommu_aperture_allowed) {
254 printk(KERN_INFO
255 "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
256 iommu_aperture_disabled = 1;
257 }
258#endif
259 return;
260 case PCI_VENDOR_ID_NVIDIA:
261#ifdef CONFIG_ACPI
262 /* All timer overrides on Nvidia
263 seem to be wrong. Skip them. */
264 acpi_skip_timer_override = 1;
265 printk(KERN_INFO
266 "Nvidia board detected. Ignoring ACPI timer override.\n");
267#endif
268 /* RED-PEN skip them on mptables too? */
269 return;
270 }
271
272 /* No multi-function device? */
273 type = read_pci_config_byte(num,slot,func,
274 PCI_HEADER_TYPE);
275 if (!(type & 0x80))
276 break;
277 }
278 }
279 }
280}
281
282static int __init ioapic_pirq_setup(char *str)
283{
284 int i, max;
285 int ints[MAX_PIRQS+1];
286
287 get_options(str, ARRAY_SIZE(ints), ints);
288
289 for (i = 0; i < MAX_PIRQS; i++)
290 pirq_entries[i] = -1;
291
292 pirqs_enabled = 1;
293 apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
294 max = MAX_PIRQS;
295 if (ints[0] < MAX_PIRQS)
296 max = ints[0];
297
298 for (i = 0; i < max; i++) {
299 apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
300 /*
301 * PIRQs are mapped upside down, usually.
302 */
303 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
304 }
305 return 1;
306}
307
308__setup("pirq=", ioapic_pirq_setup);
309
310/*
311 * Find the IRQ entry number of a certain pin.
312 */
313static int find_irq_entry(int apic, int pin, int type)
314{
315 int i;
316
317 for (i = 0; i < mp_irq_entries; i++)
318 if (mp_irqs[i].mpc_irqtype == type &&
319 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
320 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
321 mp_irqs[i].mpc_dstirq == pin)
322 return i;
323
324 return -1;
325}
326
327/*
328 * Find the pin to which IRQ[irq] (ISA) is connected
329 */
330static int __init find_isa_irq_pin(int irq, int type)
331{
332 int i;
333
334 for (i = 0; i < mp_irq_entries; i++) {
335 int lbus = mp_irqs[i].mpc_srcbus;
336
337 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
338 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
339 mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
340 (mp_irqs[i].mpc_irqtype == type) &&
341 (mp_irqs[i].mpc_srcbusirq == irq))
342
343 return mp_irqs[i].mpc_dstirq;
344 }
345 return -1;
346}
347
348/*
349 * Find a specific PCI IRQ entry.
350 * Not an __init, possibly needed by modules
351 */
352static int pin_2_irq(int idx, int apic, int pin);
353
354int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
355{
356 int apic, i, best_guess = -1;
357
358 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
359 bus, slot, pin);
360 if (mp_bus_id_to_pci_bus[bus] == -1) {
361 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
362 return -1;
363 }
364 for (i = 0; i < mp_irq_entries; i++) {
365 int lbus = mp_irqs[i].mpc_srcbus;
366
367 for (apic = 0; apic < nr_ioapics; apic++)
368 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
369 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
370 break;
371
372 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
373 !mp_irqs[i].mpc_irqtype &&
374 (bus == lbus) &&
375 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
376 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
377
378 if (!(apic || IO_APIC_IRQ(irq)))
379 continue;
380
381 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
382 return irq;
383 /*
384 * Use the first all-but-pin matching entry as a
385 * best-guess fuzzy result for broken mptables.
386 */
387 if (best_guess < 0)
388 best_guess = irq;
389 }
390 }
391 return best_guess;
392}
393
394/*
395 * EISA Edge/Level control register, ELCR
396 */
397static int EISA_ELCR(unsigned int irq)
398{
399 if (irq < 16) {
400 unsigned int port = 0x4d0 + (irq >> 3);
401 return (inb(port) >> (irq & 7)) & 1;
402 }
403 apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
404 return 0;
405}
406
407/* EISA interrupts are always polarity zero and can be edge or level
408 * trigger depending on the ELCR value. If an interrupt is listed as
409 * EISA conforming in the MP table, that means its trigger type must
410 * be read in from the ELCR */
411
412#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
413#define default_EISA_polarity(idx) (0)
414
415/* ISA interrupts are always polarity zero edge triggered,
416 * when listed as conforming in the MP table. */
417
418#define default_ISA_trigger(idx) (0)
419#define default_ISA_polarity(idx) (0)
420
421/* PCI interrupts are always polarity one level triggered,
422 * when listed as conforming in the MP table. */
423
424#define default_PCI_trigger(idx) (1)
425#define default_PCI_polarity(idx) (1)
426
427/* MCA interrupts are always polarity zero level triggered,
428 * when listed as conforming in the MP table. */
429
430#define default_MCA_trigger(idx) (1)
431#define default_MCA_polarity(idx) (0)
432
433static int __init MPBIOS_polarity(int idx)
434{
435 int bus = mp_irqs[idx].mpc_srcbus;
436 int polarity;
437
438 /*
439 * Determine IRQ line polarity (high active or low active):
440 */
441 switch (mp_irqs[idx].mpc_irqflag & 3)
442 {
443 case 0: /* conforms, ie. bus-type dependent polarity */
444 {
445 switch (mp_bus_id_to_type[bus])
446 {
447 case MP_BUS_ISA: /* ISA pin */
448 {
449 polarity = default_ISA_polarity(idx);
450 break;
451 }
452 case MP_BUS_EISA: /* EISA pin */
453 {
454 polarity = default_EISA_polarity(idx);
455 break;
456 }
457 case MP_BUS_PCI: /* PCI pin */
458 {
459 polarity = default_PCI_polarity(idx);
460 break;
461 }
462 case MP_BUS_MCA: /* MCA pin */
463 {
464 polarity = default_MCA_polarity(idx);
465 break;
466 }
467 default:
468 {
469 printk(KERN_WARNING "broken BIOS!!\n");
470 polarity = 1;
471 break;
472 }
473 }
474 break;
475 }
476 case 1: /* high active */
477 {
478 polarity = 0;
479 break;
480 }
481 case 2: /* reserved */
482 {
483 printk(KERN_WARNING "broken BIOS!!\n");
484 polarity = 1;
485 break;
486 }
487 case 3: /* low active */
488 {
489 polarity = 1;
490 break;
491 }
492 default: /* invalid */
493 {
494 printk(KERN_WARNING "broken BIOS!!\n");
495 polarity = 1;
496 break;
497 }
498 }
499 return polarity;
500}
501
502static int MPBIOS_trigger(int idx)
503{
504 int bus = mp_irqs[idx].mpc_srcbus;
505 int trigger;
506
507 /*
508 * Determine IRQ trigger mode (edge or level sensitive):
509 */
510 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
511 {
512 case 0: /* conforms, ie. bus-type dependent */
513 {
514 switch (mp_bus_id_to_type[bus])
515 {
516 case MP_BUS_ISA: /* ISA pin */
517 {
518 trigger = default_ISA_trigger(idx);
519 break;
520 }
521 case MP_BUS_EISA: /* EISA pin */
522 {
523 trigger = default_EISA_trigger(idx);
524 break;
525 }
526 case MP_BUS_PCI: /* PCI pin */
527 {
528 trigger = default_PCI_trigger(idx);
529 break;
530 }
531 case MP_BUS_MCA: /* MCA pin */
532 {
533 trigger = default_MCA_trigger(idx);
534 break;
535 }
536 default:
537 {
538 printk(KERN_WARNING "broken BIOS!!\n");
539 trigger = 1;
540 break;
541 }
542 }
543 break;
544 }
545 case 1: /* edge */
546 {
547 trigger = 0;
548 break;
549 }
550 case 2: /* reserved */
551 {
552 printk(KERN_WARNING "broken BIOS!!\n");
553 trigger = 1;
554 break;
555 }
556 case 3: /* level */
557 {
558 trigger = 1;
559 break;
560 }
561 default: /* invalid */
562 {
563 printk(KERN_WARNING "broken BIOS!!\n");
564 trigger = 0;
565 break;
566 }
567 }
568 return trigger;
569}
570
571static inline int irq_polarity(int idx)
572{
573 return MPBIOS_polarity(idx);
574}
575
576static inline int irq_trigger(int idx)
577{
578 return MPBIOS_trigger(idx);
579}
580
581static int pin_2_irq(int idx, int apic, int pin)
582{
583 int irq, i;
584 int bus = mp_irqs[idx].mpc_srcbus;
585
586 /*
587 * Debugging check, we are in big trouble if this message pops up!
588 */
589 if (mp_irqs[idx].mpc_dstirq != pin)
590 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
591
592 switch (mp_bus_id_to_type[bus])
593 {
594 case MP_BUS_ISA: /* ISA pin */
595 case MP_BUS_EISA:
596 case MP_BUS_MCA:
597 {
598 irq = mp_irqs[idx].mpc_srcbusirq;
599 break;
600 }
601 case MP_BUS_PCI: /* PCI pin */
602 {
603 /*
604 * PCI IRQs are mapped in order
605 */
606 i = irq = 0;
607 while (i < apic)
608 irq += nr_ioapic_registers[i++];
609 irq += pin;
610 break;
611 }
612 default:
613 {
614 printk(KERN_ERR "unknown bus type %d.\n",bus);
615 irq = 0;
616 break;
617 }
618 }
619
620 /*
621 * PCI IRQ command line redirection. Yes, limits are hardcoded.
622 */
623 if ((pin >= 16) && (pin <= 23)) {
624 if (pirq_entries[pin-16] != -1) {
625 if (!pirq_entries[pin-16]) {
626 apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
627 } else {
628 irq = pirq_entries[pin-16];
629 apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
630 pin-16, irq);
631 }
632 }
633 }
634 return irq;
635}
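/*
 * Worked example (illustrative only): with two I/O APICs that each report 24
 * redirection entries, a PCI interrupt on apic 1, pin 3 maps to
 * irq = nr_ioapic_registers[0] + pin = 24 + 3 = 27, while ISA/EISA/MCA
 * sources keep their mpc_srcbusirq number unchanged.
 */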
636
637static inline int IO_APIC_irq_trigger(int irq)
638{
639 int apic, idx, pin;
640
641 for (apic = 0; apic < nr_ioapics; apic++) {
642 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
643 idx = find_irq_entry(apic,pin,mp_INT);
644 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
645 return irq_trigger(idx);
646 }
647 }
648 /*
649 * nonexistent IRQs are edge default
650 */
651 return 0;
652}
653
654/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
655u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
656
657int assign_irq_vector(int irq)
658{
659 static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
660
661 BUG_ON(irq >= NR_IRQ_VECTORS);
662 if (IO_APIC_VECTOR(irq) > 0)
663 return IO_APIC_VECTOR(irq);
664next:
665 current_vector += 8;
666 if (current_vector == IA32_SYSCALL_VECTOR)
667 goto next;
668
669 if (current_vector >= FIRST_SYSTEM_VECTOR) {
670 offset++;
671 if (!(offset%8))
672 return -ENOSPC;
673 current_vector = FIRST_DEVICE_VECTOR + offset;
674 }
675
676 vector_irq[current_vector] = irq;
677 if (irq != AUTO_ASSIGN)
678 IO_APIC_VECTOR(irq) = current_vector;
679
680 return current_vector;
681}
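/*
 * Minimal user-space sketch (not kernel code) of the allocation walk above,
 * assuming the usual values FIRST_DEVICE_VECTOR = 0x31, IA32_SYSCALL_VECTOR
 * = 0x80 and FIRST_SYSTEM_VECTOR = 0xef.  It prints the vectors handed out
 * in order, showing how stepping by 8 spreads consecutive requests across
 * different priority levels (vector >> 4) and how the int 0x80 gate is
 * skipped once the offset lines up with it.
 */
#include <stdio.h>

#define FIRST_DEVICE_VECTOR	0x31	/* assumed value */
#define IA32_SYSCALL_VECTOR	0x80	/* assumed value */
#define FIRST_SYSTEM_VECTOR	0xef	/* assumed value */

int main(void)
{
	int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
	int i;

	for (i = 0; i < 40; i++) {
next:
		current_vector += 8;
		if (current_vector == IA32_SYSCALL_VECTOR)
			goto next;
		if (current_vector >= FIRST_SYSTEM_VECTOR) {
			offset++;
			if (!(offset % 8))
				break;	/* the kernel returns -ENOSPC here */
			current_vector = FIRST_DEVICE_VECTOR + offset;
		}
		printf("request %2d -> vector 0x%02x (priority level %d)\n",
		       i, current_vector, current_vector >> 4);
	}
	return 0;
}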
682
683extern void (*interrupt[NR_IRQS])(void);
684static struct hw_interrupt_type ioapic_level_type;
685static struct hw_interrupt_type ioapic_edge_type;
686
687#define IOAPIC_AUTO -1
688#define IOAPIC_EDGE 0
689#define IOAPIC_LEVEL 1
690
691static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
692{
693 if (use_pci_vector() && !platform_legacy_irq(irq)) {
694 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
695 trigger == IOAPIC_LEVEL)
696 irq_desc[vector].handler = &ioapic_level_type;
697 else
698 irq_desc[vector].handler = &ioapic_edge_type;
699 set_intr_gate(vector, interrupt[vector]);
700 } else {
701 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
702 trigger == IOAPIC_LEVEL)
703 irq_desc[irq].handler = &ioapic_level_type;
704 else
705 irq_desc[irq].handler = &ioapic_edge_type;
706 set_intr_gate(vector, interrupt[irq]);
707 }
708}
709
710static void __init setup_IO_APIC_irqs(void)
711{
712 struct IO_APIC_route_entry entry;
713 int apic, pin, idx, irq, first_notcon = 1, vector;
714 unsigned long flags;
715
716 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
717
718 for (apic = 0; apic < nr_ioapics; apic++) {
719 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
720
721 /*
722 * add it to the IO-APIC irq-routing table:
723 */
724 memset(&entry,0,sizeof(entry));
725
726 entry.delivery_mode = INT_DELIVERY_MODE;
727 entry.dest_mode = INT_DEST_MODE;
728 entry.mask = 0; /* enable IRQ */
729 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
730
731 idx = find_irq_entry(apic,pin,mp_INT);
732 if (idx == -1) {
733 if (first_notcon) {
734 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
735 first_notcon = 0;
736 } else
737 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
738 continue;
739 }
740
741 entry.trigger = irq_trigger(idx);
742 entry.polarity = irq_polarity(idx);
743
744 if (irq_trigger(idx)) {
745 entry.trigger = 1;
746 entry.mask = 1;
747 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
748 }
749
750 irq = pin_2_irq(idx, apic, pin);
751 add_pin_to_irq(irq, apic, pin);
752
753 if (!apic && !IO_APIC_IRQ(irq))
754 continue;
755
756 if (IO_APIC_IRQ(irq)) {
757 vector = assign_irq_vector(irq);
758 entry.vector = vector;
759
760 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
761 if (!apic && (irq < 16))
762 disable_8259A_irq(irq);
763 }
764 spin_lock_irqsave(&ioapic_lock, flags);
765 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
766 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
767 spin_unlock_irqrestore(&ioapic_lock, flags);
768 }
769 }
770
771 if (!first_notcon)
772 apic_printk(APIC_VERBOSE," not connected.\n");
773}
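/*
 * Worked example (illustrative): each 64-bit redirection table entry is split
 * across two 32-bit I/O APIC registers, the low half at index 0x10 + 2*pin
 * and the high half at 0x11 + 2*pin, so pin 3 is programmed through registers
 * 0x16 and 0x17 as done above.
 */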
774
775/*
776 * Set up the 8259A-master output pin as broadcast to all
777 * CPUs.
778 */
779static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
780{
781 struct IO_APIC_route_entry entry;
782 unsigned long flags;
783
784 memset(&entry,0,sizeof(entry));
785
786 disable_8259A_irq(0);
787
788 /* mask LVT0 */
789 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
790
791 /*
792 * We use logical delivery to get the timer IRQ
793 * to the first CPU.
794 */
795 entry.dest_mode = INT_DEST_MODE;
796 entry.mask = 0; /* unmask IRQ now */
797 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
798 entry.delivery_mode = INT_DELIVERY_MODE;
799 entry.polarity = 0;
800 entry.trigger = 0;
801 entry.vector = vector;
802
803 /*
804 * The timer IRQ doesn't have to know that behind the
805 * scenes we have an 8259A-master in AEOI mode ...
806 */
807 irq_desc[0].handler = &ioapic_edge_type;
808
809 /*
810 * Add it to the IO-APIC irq-routing table:
811 */
812 spin_lock_irqsave(&ioapic_lock, flags);
813 io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
814 io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
815 spin_unlock_irqrestore(&ioapic_lock, flags);
816
817 enable_8259A_irq(0);
818}
819
820void __init UNEXPECTED_IO_APIC(void)
821{
822}
823
824void __apicdebuginit print_IO_APIC(void)
825{
826 int apic, i;
827 union IO_APIC_reg_00 reg_00;
828 union IO_APIC_reg_01 reg_01;
829 union IO_APIC_reg_02 reg_02;
830 unsigned long flags;
831
832 if (apic_verbosity == APIC_QUIET)
833 return;
834
835 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
836 for (i = 0; i < nr_ioapics; i++)
837 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
838 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
839
840 /*
841 * We are a bit conservative about what we expect. We have to
842 * know about every hardware change ASAP.
843 */
844 printk(KERN_INFO "testing the IO APIC.......................\n");
845
846 for (apic = 0; apic < nr_ioapics; apic++) {
847
848 spin_lock_irqsave(&ioapic_lock, flags);
849 reg_00.raw = io_apic_read(apic, 0);
850 reg_01.raw = io_apic_read(apic, 1);
851 if (reg_01.bits.version >= 0x10)
852 reg_02.raw = io_apic_read(apic, 2);
853 spin_unlock_irqrestore(&ioapic_lock, flags);
854
855 printk("\n");
856 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
857 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
858 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
859 if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
860 UNEXPECTED_IO_APIC();
861
862 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
863 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
864 if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
865 (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
866 (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
867 (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
868 (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
869 (reg_01.bits.entries != 0x2E) &&
870 (reg_01.bits.entries != 0x3F) &&
871 (reg_01.bits.entries != 0x03)
872 )
873 UNEXPECTED_IO_APIC();
874
875 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
876 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
877 if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
878 (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
879 (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
880 (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
881 (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
882 (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
883 )
884 UNEXPECTED_IO_APIC();
885 if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
886 UNEXPECTED_IO_APIC();
887
888 if (reg_01.bits.version >= 0x10) {
889 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
890 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
891 if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
892 UNEXPECTED_IO_APIC();
893 }
894
895 printk(KERN_DEBUG ".... IRQ redirection table:\n");
896
897 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
898 " Stat Dest Deli Vect: \n");
899
900 for (i = 0; i <= reg_01.bits.entries; i++) {
901 struct IO_APIC_route_entry entry;
902
903 spin_lock_irqsave(&ioapic_lock, flags);
904 *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
905 *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
906 spin_unlock_irqrestore(&ioapic_lock, flags);
907
908 printk(KERN_DEBUG " %02x %03X %02X ",
909 i,
910 entry.dest.logical.logical_dest,
911 entry.dest.physical.physical_dest
912 );
913
914 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
915 entry.mask,
916 entry.trigger,
917 entry.irr,
918 entry.polarity,
919 entry.delivery_status,
920 entry.dest_mode,
921 entry.delivery_mode,
922 entry.vector
923 );
924 }
925 }
926 if (use_pci_vector())
927 printk(KERN_INFO "Using vector-based indexing\n");
928 printk(KERN_DEBUG "IRQ to pin mappings:\n");
929 for (i = 0; i < NR_IRQS; i++) {
930 struct irq_pin_list *entry = irq_2_pin + i;
931 if (entry->pin < 0)
932 continue;
933 if (use_pci_vector() && !platform_legacy_irq(i))
934 printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
935 else
936 printk(KERN_DEBUG "IRQ%d ", i);
937 for (;;) {
938 printk("-> %d:%d", entry->apic, entry->pin);
939 if (!entry->next)
940 break;
941 entry = irq_2_pin + entry->next;
942 }
943 printk("\n");
944 }
945
946 printk(KERN_INFO ".................................... done.\n");
947
948 return;
949}
950
951#if 0
952
953static __apicdebuginit void print_APIC_bitfield (int base)
954{
955 unsigned int v;
956 int i, j;
957
958 if (apic_verbosity == APIC_QUIET)
959 return;
960
961 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
962 for (i = 0; i < 8; i++) {
963 v = apic_read(base + i*0x10);
964 for (j = 0; j < 32; j++) {
965 if (v & (1<<j))
966 printk("1");
967 else
968 printk("0");
969 }
970 printk("\n");
971 }
972}
973
974void __apicdebuginit print_local_APIC(void * dummy)
975{
976 unsigned int v, ver, maxlvt;
977
978 if (apic_verbosity == APIC_QUIET)
979 return;
980
981 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
982 smp_processor_id(), hard_smp_processor_id());
983 v = apic_read(APIC_ID);
984 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
985 v = apic_read(APIC_LVR);
986 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
987 ver = GET_APIC_VERSION(v);
988 maxlvt = get_maxlvt();
989
990 v = apic_read(APIC_TASKPRI);
991 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
992
993 if (APIC_INTEGRATED(ver)) { /* !82489DX */
994 v = apic_read(APIC_ARBPRI);
995 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
996 v & APIC_ARBPRI_MASK);
997 v = apic_read(APIC_PROCPRI);
998 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
999 }
1000
1001 v = apic_read(APIC_EOI);
1002 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1003 v = apic_read(APIC_RRR);
1004 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1005 v = apic_read(APIC_LDR);
1006 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1007 v = apic_read(APIC_DFR);
1008 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1009 v = apic_read(APIC_SPIV);
1010 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1011
1012 printk(KERN_DEBUG "... APIC ISR field:\n");
1013 print_APIC_bitfield(APIC_ISR);
1014 printk(KERN_DEBUG "... APIC TMR field:\n");
1015 print_APIC_bitfield(APIC_TMR);
1016 printk(KERN_DEBUG "... APIC IRR field:\n");
1017 print_APIC_bitfield(APIC_IRR);
1018
1019 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1020 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1021 apic_write(APIC_ESR, 0);
1022 v = apic_read(APIC_ESR);
1023 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1024 }
1025
1026 v = apic_read(APIC_ICR);
1027 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1028 v = apic_read(APIC_ICR2);
1029 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1030
1031 v = apic_read(APIC_LVTT);
1032 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1033
1034 if (maxlvt > 3) { /* PC is LVT#4. */
1035 v = apic_read(APIC_LVTPC);
1036 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1037 }
1038 v = apic_read(APIC_LVT0);
1039 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1040 v = apic_read(APIC_LVT1);
1041 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1042
1043 if (maxlvt > 2) { /* ERR is LVT#3. */
1044 v = apic_read(APIC_LVTERR);
1045 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1046 }
1047
1048 v = apic_read(APIC_TMICT);
1049 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1050 v = apic_read(APIC_TMCCT);
1051 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1052 v = apic_read(APIC_TDCR);
1053 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1054 printk("\n");
1055}
1056
1057void print_all_local_APICs (void)
1058{
1059 on_each_cpu(print_local_APIC, NULL, 1, 1);
1060}
1061
1062void __apicdebuginit print_PIC(void)
1063{
1064 extern spinlock_t i8259A_lock;
1065 unsigned int v;
1066 unsigned long flags;
1067
1068 if (apic_verbosity == APIC_QUIET)
1069 return;
1070
1071 printk(KERN_DEBUG "\nprinting PIC contents\n");
1072
1073 spin_lock_irqsave(&i8259A_lock, flags);
1074
1075 v = inb(0xa1) << 8 | inb(0x21);
1076 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1077
1078 v = inb(0xa0) << 8 | inb(0x20);
1079 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1080
1081 outb(0x0b,0xa0);
1082 outb(0x0b,0x20);
1083 v = inb(0xa0) << 8 | inb(0x20);
1084 outb(0x0a,0xa0);
1085 outb(0x0a,0x20);
1086
1087 spin_unlock_irqrestore(&i8259A_lock, flags);
1088
1089 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1090
1091 v = inb(0x4d1) << 8 | inb(0x4d0);
1092 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1093}
1094
1095#endif /* 0 */
1096
1097static void __init enable_IO_APIC(void)
1098{
1099 union IO_APIC_reg_01 reg_01;
1100 int i;
1101 unsigned long flags;
1102
1103 for (i = 0; i < PIN_MAP_SIZE; i++) {
1104 irq_2_pin[i].pin = -1;
1105 irq_2_pin[i].next = 0;
1106 }
1107 if (!pirqs_enabled)
1108 for (i = 0; i < MAX_PIRQS; i++)
1109 pirq_entries[i] = -1;
1110
1111 /*
1112 * The number of IO-APIC IRQ registers (== #pins):
1113 */
1114 for (i = 0; i < nr_ioapics; i++) {
1115 spin_lock_irqsave(&ioapic_lock, flags);
1116 reg_01.raw = io_apic_read(i, 1);
1117 spin_unlock_irqrestore(&ioapic_lock, flags);
1118 nr_ioapic_registers[i] = reg_01.bits.entries+1;
1119 }
1120
1121 /*
1122 * Do not trust the IO-APIC being empty at bootup
1123 */
1124 clear_IO_APIC();
1125}
1126
1127/*
1128 * Not an __init, needed by the reboot code
1129 */
1130void disable_IO_APIC(void)
1131{
1132 /*
1133 * Clear the IO-APIC before rebooting:
1134 */
1135 clear_IO_APIC();
1136
1137 disconnect_bsp_APIC();
1138}
1139
1140/*
1141 * function to set the IO-APIC physical IDs based on the
1142 * values stored in the MPC table.
1143 *
1144 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1145 */
1146
1147static void __init setup_ioapic_ids_from_mpc (void)
1148{
1149 union IO_APIC_reg_00 reg_00;
1150 int apic;
1151 int i;
1152 unsigned char old_id;
1153 unsigned long flags;
1154
1155 /*
1156 * Set the IOAPIC ID to the value stored in the MPC table.
1157 */
1158 for (apic = 0; apic < nr_ioapics; apic++) {
1159
1160 /* Read the register 0 value */
1161 spin_lock_irqsave(&ioapic_lock, flags);
1162 reg_00.raw = io_apic_read(apic, 0);
1163 spin_unlock_irqrestore(&ioapic_lock, flags);
1164
1165 old_id = mp_ioapics[apic].mpc_apicid;
1166
1167
1168 printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
1169
1170
1171 /*
1172 * We need to adjust the IRQ routing table
1173 * if the ID changed.
1174 */
1175 if (old_id != mp_ioapics[apic].mpc_apicid)
1176 for (i = 0; i < mp_irq_entries; i++)
1177 if (mp_irqs[i].mpc_dstapic == old_id)
1178 mp_irqs[i].mpc_dstapic
1179 = mp_ioapics[apic].mpc_apicid;
1180
1181 /*
1182 * Read the right value from the MPC table and
1183 * write it into the ID register.
1184 */
1185 apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
1186 mp_ioapics[apic].mpc_apicid);
1187
1188 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1189 spin_lock_irqsave(&ioapic_lock, flags);
1190 io_apic_write(apic, 0, reg_00.raw);
1191 spin_unlock_irqrestore(&ioapic_lock, flags);
1192
1193 /*
1194 * Sanity check
1195 */
1196 spin_lock_irqsave(&ioapic_lock, flags);
1197 reg_00.raw = io_apic_read(apic, 0);
1198 spin_unlock_irqrestore(&ioapic_lock, flags);
1199 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1200 printk("could not set ID!\n");
1201 else
1202 apic_printk(APIC_VERBOSE," ok.\n");
1203 }
1204}
1205
1206/*
1207 * There is a nasty bug in some older SMP boards, their mptable lies
1208 * about the timer IRQ. We do the following to work around the situation:
1209 *
1210 * - timer IRQ defaults to IO-APIC IRQ
1211 * - if this function detects that timer IRQs are defunct, then we fall
1212 * back to ISA timer IRQs
1213 */
1214static int __init timer_irq_works(void)
1215{
1216 unsigned long t1 = jiffies;
1217
1218 local_irq_enable();
1219 /* Let ten ticks pass... */
1220 mdelay((10 * 1000) / HZ);
1221
1222 /*
1223 * Expect a few ticks at least, to be sure some possible
1224 * glue logic does not lock up after the first one or two
1225 * ticks in a non-ExtINT mode. Also the local APIC
1226 * might have cached one ExtINT interrupt. Finally, at
1227 * least one tick may be lost due to delays.
1228 */
1229
1230 /* jiffies wrap? */
1231 if (jiffies - t1 > 4)
1232 return 1;
1233 return 0;
1234}
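/*
 * Worked example (illustrative): with HZ = 1000 the mdelay() above busy-waits
 * (10 * 1000) / 1000 = 10ms, i.e. roughly ten timer ticks; with HZ = 100 it
 * waits 100ms, again ten ticks' worth.  Requiring jiffies to have advanced by
 * more than 4 in that window tolerates a couple of lost or cached ticks while
 * still catching a completely dead timer line.
 */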
1235
1236/*
1237 * In the SMP+IOAPIC case it might happen that there are an unspecified
1238 * number of pending IRQ events unhandled. These cases are very rare,
1239 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1240 * better to do it this way, as we then do not have to be aware of
1241 * 'pending' interrupts in the IRQ path, except at this point.
1242 */
1243/*
1244 * Edge triggered needs to resend any interrupt
1245 * that was delayed but this is now handled in the device
1246 * independent code.
1247 */
1248
1249/*
1250 * Starting up an edge-triggered IO-APIC interrupt is
1251 * nasty - we need to make sure that we get the edge.
1252 * If it is already asserted for some reason, we need to
1253 * return 1 to indicate that it was pending.
1254 *
1255 * This is not complete - we should be able to fake
1256 * an edge even if it isn't on the 8259A...
1257 */
1258
1259static unsigned int startup_edge_ioapic_irq(unsigned int irq)
1260{
1261 int was_pending = 0;
1262 unsigned long flags;
1263
1264 spin_lock_irqsave(&ioapic_lock, flags);
1265 if (irq < 16) {
1266 disable_8259A_irq(irq);
1267 if (i8259A_irq_pending(irq))
1268 was_pending = 1;
1269 }
1270 __unmask_IO_APIC_irq(irq);
1271 spin_unlock_irqrestore(&ioapic_lock, flags);
1272
1273 return was_pending;
1274}
1275
1276/*
1277 * Once we have recorded IRQ_PENDING already, we can mask the
1278 * interrupt for real. This prevents IRQ storms from unhandled
1279 * devices.
1280 */
1281static void ack_edge_ioapic_irq(unsigned int irq)
1282{
1283 if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
1284 == (IRQ_PENDING | IRQ_DISABLED))
1285 mask_IO_APIC_irq(irq);
1286 ack_APIC_irq();
1287}
1288
1289/*
1290 * Level triggered interrupts can just be masked,
1291 * and shutting down and starting up the interrupt
1292 * is the same as enabling and disabling them -- except
1293 * that startup needs to return a "was pending" value.
1294 *
1295 * Level triggered interrupts are special because we
1296 * do not touch any IO-APIC register while handling
1297 * them. We ack the APIC in the end-IRQ handler, not
1298 * in the start-IRQ-handler. Protection against reentrance
1299 * from the same interrupt is still provided, both by the
1300 * generic IRQ layer and by the fact that an unacked local
1301 * APIC does not accept IRQs.
1302 */
1303static unsigned int startup_level_ioapic_irq (unsigned int irq)
1304{
1305 unmask_IO_APIC_irq(irq);
1306
1307 return 0; /* don't check for pending */
1308}
1309
1310static void end_level_ioapic_irq (unsigned int irq)
1311{
1312 ack_APIC_irq();
1313}
1314
1315static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
1316{
1317 unsigned long flags;
1318 unsigned int dest;
1319
1320 dest = cpu_mask_to_apicid(mask);
1321
1322 /*
1323 * Only the high 8 bits are valid.
1324 */
1325 dest = SET_APIC_LOGICAL_ID(dest);
1326
1327 spin_lock_irqsave(&ioapic_lock, flags);
1328 __DO_ACTION(1, = dest, )
1329 spin_unlock_irqrestore(&ioapic_lock, flags);
1330}
1331
1332#ifdef CONFIG_PCI_MSI
1333static unsigned int startup_edge_ioapic_vector(unsigned int vector)
1334{
1335 int irq = vector_to_irq(vector);
1336
1337 return startup_edge_ioapic_irq(irq);
1338}
1339
1340static void ack_edge_ioapic_vector(unsigned int vector)
1341{
1342 int irq = vector_to_irq(vector);
1343
1344 ack_edge_ioapic_irq(irq);
1345}
1346
1347static unsigned int startup_level_ioapic_vector (unsigned int vector)
1348{
1349 int irq = vector_to_irq(vector);
1350
1351 return startup_level_ioapic_irq (irq);
1352}
1353
1354static void end_level_ioapic_vector (unsigned int vector)
1355{
1356 int irq = vector_to_irq(vector);
1357
1358 end_level_ioapic_irq(irq);
1359}
1360
1361static void mask_IO_APIC_vector (unsigned int vector)
1362{
1363 int irq = vector_to_irq(vector);
1364
1365 mask_IO_APIC_irq(irq);
1366}
1367
1368static void unmask_IO_APIC_vector (unsigned int vector)
1369{
1370 int irq = vector_to_irq(vector);
1371
1372 unmask_IO_APIC_irq(irq);
1373}
1374
1375static void set_ioapic_affinity_vector (unsigned int vector,
1376 cpumask_t cpu_mask)
1377{
1378 int irq = vector_to_irq(vector);
1379
1380 set_ioapic_affinity_irq(irq, cpu_mask);
1381}
1382#endif
1383
1384/*
1385 * Level and edge triggered IO-APIC interrupts need different handling,
1386 * so we use two separate IRQ descriptors. Edge triggered IRQs can be
1387 * handled with the level-triggered descriptor, but that one has slightly
1388 * more overhead. Level-triggered interrupts cannot be handled with the
1389 * edge-triggered handler, without risking IRQ storms and other ugly
1390 * races.
1391 */
1392
1393static struct hw_interrupt_type ioapic_edge_type = {
1394 .typename = "IO-APIC-edge",
1395 .startup = startup_edge_ioapic,
1396 .shutdown = shutdown_edge_ioapic,
1397 .enable = enable_edge_ioapic,
1398 .disable = disable_edge_ioapic,
1399 .ack = ack_edge_ioapic,
1400 .end = end_edge_ioapic,
1401 .set_affinity = set_ioapic_affinity,
1402};
1403
1404static struct hw_interrupt_type ioapic_level_type = {
1405 .typename = "IO-APIC-level",
1406 .startup = startup_level_ioapic,
1407 .shutdown = shutdown_level_ioapic,
1408 .enable = enable_level_ioapic,
1409 .disable = disable_level_ioapic,
1410 .ack = mask_and_ack_level_ioapic,
1411 .end = end_level_ioapic,
1412 .set_affinity = set_ioapic_affinity,
1413};
1414
1415static inline void init_IO_APIC_traps(void)
1416{
1417 int irq;
1418
1419 /*
1420 * NOTE! The local APIC isn't very good at handling
1421 * multiple interrupts at the same interrupt level.
1422 * As the interrupt level is determined by taking the
1423 * vector number and shifting that right by 4, we
1424 * want to spread these out a bit so that they don't
1425 * all fall in the same interrupt level.
1426 *
1427 * Also, we've got to be careful not to trash gate
1428 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1429 */
1430 for (irq = 0; irq < NR_IRQS ; irq++) {
1431 int tmp = irq;
1432 if (use_pci_vector()) {
1433 if (!platform_legacy_irq(tmp))
1434 if ((tmp = vector_to_irq(tmp)) == -1)
1435 continue;
1436 }
1437 if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
1438 /*
1439 * Hmm.. We don't have an entry for this,
1440 * so default to an old-fashioned 8259
1441 * interrupt if we can..
1442 */
1443 if (irq < 16)
1444 make_8259A_irq(irq);
1445 else
1446 /* Strange. Oh, well.. */
1447 irq_desc[irq].handler = &no_irq_type;
1448 }
1449 }
1450}
1451
1452static void enable_lapic_irq (unsigned int irq)
1453{
1454 unsigned long v;
1455
1456 v = apic_read(APIC_LVT0);
1457 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
1458}
1459
1460static void disable_lapic_irq (unsigned int irq)
1461{
1462 unsigned long v;
1463
1464 v = apic_read(APIC_LVT0);
1465 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
1466}
1467
1468static void ack_lapic_irq (unsigned int irq)
1469{
1470 ack_APIC_irq();
1471}
1472
1473static void end_lapic_irq (unsigned int i) { /* nothing */ }
1474
1475static struct hw_interrupt_type lapic_irq_type = {
1476 .typename = "local-APIC-edge",
1477 .startup = NULL, /* startup_irq() not used for IRQ0 */
1478 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1479 .enable = enable_lapic_irq,
1480 .disable = disable_lapic_irq,
1481 .ack = ack_lapic_irq,
1482 .end = end_lapic_irq,
1483};
1484
1485static void setup_nmi (void)
1486{
1487 /*
1488 * Dirty trick to enable the NMI watchdog ...
1489 * We put the 8259A master into AEOI mode and
1490 * unmask LVT0 as NMI on all local APICs.
1491 *
1492 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1493 * is from Maciej W. Rozycki - so we do not have to EOI from
1494 * the NMI handler or the timer interrupt.
1495 */
1496 printk(KERN_INFO "activating NMI Watchdog ...");
1497
1498 enable_NMI_through_LVT0(NULL);
1499
1500 printk(" done.\n");
1501}
1502
1503/*
1504 * This looks a bit hackish but it's about the only way of sending
1505 * a few INTA cycles to 8259As and any associated glue logic. ICR does
1506 * not support the ExtINT mode, unfortunately. We need to send these
1507 * cycles as some i82489DX-based boards have glue logic that keeps the
1508 * 8259A interrupt line asserted until INTA. --macro
1509 */
1510static inline void unlock_ExtINT_logic(void)
1511{
1512 int pin, i;
1513 struct IO_APIC_route_entry entry0, entry1;
1514 unsigned char save_control, save_freq_select;
1515 unsigned long flags;
1516
1517 pin = find_isa_irq_pin(8, mp_INT);
1518 if (pin == -1)
1519 return;
1520
1521 spin_lock_irqsave(&ioapic_lock, flags);
1522 *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
1523 *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
1524 spin_unlock_irqrestore(&ioapic_lock, flags);
1525 clear_IO_APIC_pin(0, pin);
1526
1527 memset(&entry1, 0, sizeof(entry1));
1528
1529 entry1.dest_mode = 0; /* physical delivery */
1530 entry1.mask = 0; /* unmask IRQ now */
1531 entry1.dest.physical.physical_dest = hard_smp_processor_id();
1532 entry1.delivery_mode = dest_ExtINT;
1533 entry1.polarity = entry0.polarity;
1534 entry1.trigger = 0;
1535 entry1.vector = 0;
1536
1537 spin_lock_irqsave(&ioapic_lock, flags);
1538 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
1539 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
1540 spin_unlock_irqrestore(&ioapic_lock, flags);
1541
1542 save_control = CMOS_READ(RTC_CONTROL);
1543 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
1544 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
1545 RTC_FREQ_SELECT);
1546 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
1547
1548 i = 100;
1549 while (i-- > 0) {
1550 mdelay(10);
1551 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
1552 i -= 10;
1553 }
1554
1555 CMOS_WRITE(save_control, RTC_CONTROL);
1556 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
1557 clear_IO_APIC_pin(0, pin);
1558
1559 spin_lock_irqsave(&ioapic_lock, flags);
1560 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
1561 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
1562 spin_unlock_irqrestore(&ioapic_lock, flags);
1563}
1564
1565/*
1566 * This code may look a bit paranoid, but it's supposed to cooperate with
1567 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1568 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1569 * fanatically on his truly buggy board.
1570 */
1571static inline void check_timer(void)
1572{
1573 int pin1, pin2;
1574 int vector;
1575
1576 /*
1577 * get/set the timer IRQ vector:
1578 */
1579 disable_8259A_irq(0);
1580 vector = assign_irq_vector(0);
1581 set_intr_gate(vector, interrupt[0]);
1582
1583 /*
1584 * Subtle, code in do_timer_interrupt() expects an AEOI
1585 * mode for the 8259A whenever interrupts are routed
1586 * through I/O APICs. Also IRQ0 has to be enabled in
1587 * the 8259A which implies the virtual wire has to be
1588 * disabled in the local APIC.
1589 */
1590 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1591 init_8259A(1);
1592 enable_8259A_irq(0);
1593
1594 pin1 = find_isa_irq_pin(0, mp_INT);
1595 pin2 = find_isa_irq_pin(0, mp_ExtINT);
1596
1597 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
1598
1599 if (pin1 != -1) {
1600 /*
1601 * Ok, does IRQ0 through the IOAPIC work?
1602 */
1603 unmask_IO_APIC_irq(0);
1604 if (timer_irq_works()) {
1605 nmi_watchdog_default();
1606 if (nmi_watchdog == NMI_IO_APIC) {
1607 disable_8259A_irq(0);
1608 setup_nmi();
1609 enable_8259A_irq(0);
1610 check_nmi_watchdog();
1611 }
1612 return;
1613 }
1614 clear_IO_APIC_pin(0, pin1);
1615 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
1616 }
1617
1618 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
1619 if (pin2 != -1) {
1620 apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2);
1621 /*
1622 * legacy devices should be connected to IO APIC #0
1623 */
1624 setup_ExtINT_IRQ0_pin(pin2, vector);
1625 if (timer_irq_works()) {
1626 printk("works.\n");
1627 nmi_watchdog_default();
1628 if (nmi_watchdog == NMI_IO_APIC) {
1629 setup_nmi();
1630 check_nmi_watchdog();
1631 }
1632 return;
1633 }
1634 /*
1635 * Cleanup, just in case ...
1636 */
1637 clear_IO_APIC_pin(0, pin2);
1638 }
1639 printk(" failed.\n");
1640
1641 if (nmi_watchdog) {
1642 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
1643 nmi_watchdog = 0;
1644 }
1645
1646 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
1647
1648 disable_8259A_irq(0);
1649 irq_desc[0].handler = &lapic_irq_type;
1650 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
1651 enable_8259A_irq(0);
1652
1653 if (timer_irq_works()) {
1654 apic_printk(APIC_QUIET, " works.\n");
1655 return;
1656 }
1657 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
1658 apic_printk(APIC_VERBOSE," failed.\n");
1659
1660 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
1661
1662 init_8259A(0);
1663 make_8259A_irq(0);
1664 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
1665
1666 unlock_ExtINT_logic();
1667
1668 if (timer_irq_works()) {
1669 apic_printk(APIC_VERBOSE," works.\n");
1670 return;
1671 }
1672 apic_printk(APIC_VERBOSE," failed :(.\n");
1673 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
1674}
1675
1676/*
1677 *
1678 * IRQs that are handled by the PIC in the MPS IOAPIC case.
1679 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
1680 * Linux doesn't really care, as it's not actually used
1681 * for any interrupt handling anyway.
1682 */
1683#define PIC_IRQS (1<<2)
1684
1685void __init setup_IO_APIC(void)
1686{
1687 enable_IO_APIC();
1688
1689 if (acpi_ioapic)
1690 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1691 else
1692 io_apic_irqs = ~PIC_IRQS;
1693
1694 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1695
1696 /*
1697 * Set up the IO-APIC IRQ routing table.
1698 */
1699 if (!acpi_ioapic)
1700 setup_ioapic_ids_from_mpc();
1701 sync_Arb_IDs();
1702 setup_IO_APIC_irqs();
1703 init_IO_APIC_traps();
1704 check_timer();
1705 if (!acpi_ioapic)
1706 print_IO_APIC();
1707}
1708
1709struct sysfs_ioapic_data {
1710 struct sys_device dev;
1711 struct IO_APIC_route_entry entry[0];
1712};
1713static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
1714
1715static int ioapic_suspend(struct sys_device *dev, u32 state)
1716{
1717 struct IO_APIC_route_entry *entry;
1718 struct sysfs_ioapic_data *data;
1719 unsigned long flags;
1720 int i;
1721
1722 data = container_of(dev, struct sysfs_ioapic_data, dev);
1723 entry = data->entry;
1724 spin_lock_irqsave(&ioapic_lock, flags);
1725 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
1726 *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
1727 *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
1728 }
1729 spin_unlock_irqrestore(&ioapic_lock, flags);
1730
1731 return 0;
1732}
1733
1734static int ioapic_resume(struct sys_device *dev)
1735{
1736 struct IO_APIC_route_entry *entry;
1737 struct sysfs_ioapic_data *data;
1738 unsigned long flags;
1739 union IO_APIC_reg_00 reg_00;
1740 int i;
1741
1742 data = container_of(dev, struct sysfs_ioapic_data, dev);
1743 entry = data->entry;
1744
1745 spin_lock_irqsave(&ioapic_lock, flags);
1746 reg_00.raw = io_apic_read(dev->id, 0);
1747 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
1748 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
1749 io_apic_write(dev->id, 0, reg_00.raw);
1750 }
1751 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
1752 io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
1753 io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
1754 }
1755 spin_unlock_irqrestore(&ioapic_lock, flags);
1756
1757 return 0;
1758}
1759
1760static struct sysdev_class ioapic_sysdev_class = {
1761 set_kset_name("ioapic"),
1762 .suspend = ioapic_suspend,
1763 .resume = ioapic_resume,
1764};
1765
1766static int __init ioapic_init_sysfs(void)
1767{
1768 struct sys_device * dev;
1769 int i, size, error = 0;
1770
1771 error = sysdev_class_register(&ioapic_sysdev_class);
1772 if (error)
1773 return error;
1774
1775 for (i = 0; i < nr_ioapics; i++ ) {
1776 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
1777 * sizeof(struct IO_APIC_route_entry);
1778 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
1779 if (!mp_ioapic_data[i]) {
1780 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1781 continue;
1782 }
1783 memset(mp_ioapic_data[i], 0, size);
1784 dev = &mp_ioapic_data[i]->dev;
1785 dev->id = i;
1786 dev->cls = &ioapic_sysdev_class;
1787 error = sysdev_register(dev);
1788 if (error) {
1789 kfree(mp_ioapic_data[i]);
1790 mp_ioapic_data[i] = NULL;
1791 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1792 continue;
1793 }
1794 }
1795
1796 return 0;
1797}
1798
1799device_initcall(ioapic_init_sysfs);
1800
1801/* --------------------------------------------------------------------------
1802 ACPI-based IOAPIC Configuration
1803 -------------------------------------------------------------------------- */
1804
1805#ifdef CONFIG_ACPI_BOOT
1806
1807#define IO_APIC_MAX_ID 0xFE
1808
1809int __init io_apic_get_unique_id (int ioapic, int apic_id)
1810{
1811 union IO_APIC_reg_00 reg_00;
1812 static physid_mask_t apic_id_map;
1813 unsigned long flags;
1814 int i = 0;
1815
1816 /*
1817 * The P4 platform supports up to 256 APIC IDs on two separate APIC
1818 * buses (one for LAPICs, one for IOAPICs), where predecessors only
1819 * supports up to 16 on one shared APIC bus.
1820 * support up to 16 on one shared APIC bus.
1821 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
1822 * advantage of new APIC bus architecture.
1823 */
1824
1825 if (physids_empty(apic_id_map))
1826 apic_id_map = phys_cpu_present_map;
1827
1828 spin_lock_irqsave(&ioapic_lock, flags);
1829 reg_00.raw = io_apic_read(ioapic, 0);
1830 spin_unlock_irqrestore(&ioapic_lock, flags);
1831
1832 if (apic_id >= IO_APIC_MAX_ID) {
1833 apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
1834 "%d\n", ioapic, apic_id, reg_00.bits.ID);
1835 apic_id = reg_00.bits.ID;
1836 }
1837
1838 /*
1839 * Every APIC in a system must have a unique ID or we get lots of nice
1840 * 'stuck on smp_invalidate_needed IPI wait' messages.
1841 */
1842 if (physid_isset(apic_id, apic_id_map)) {
1843
1844 for (i = 0; i < IO_APIC_MAX_ID; i++) {
1845 if (!physid_isset(i, apic_id_map))
1846 break;
1847 }
1848
1849 if (i == IO_APIC_MAX_ID)
1850 panic("Max apic_id exceeded!\n");
1851
1852 apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
1853 "trying %d\n", ioapic, apic_id, i);
1854
1855 apic_id = i;
1856 }
1857
1858 physid_set(apic_id, apic_id_map);
1859
1860 if (reg_00.bits.ID != apic_id) {
1861 reg_00.bits.ID = apic_id;
1862
1863 spin_lock_irqsave(&ioapic_lock, flags);
1864 io_apic_write(ioapic, 0, reg_00.raw);
1865 reg_00.raw = io_apic_read(ioapic, 0);
1866 spin_unlock_irqrestore(&ioapic_lock, flags);
1867
1868 /* Sanity check */
1869 if (reg_00.bits.ID != apic_id)
1870 panic("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
1871 }
1872
1873 apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
1874
1875 return apic_id;
1876}
1877
1878
1879int __init io_apic_get_version (int ioapic)
1880{
1881 union IO_APIC_reg_01 reg_01;
1882 unsigned long flags;
1883
1884 spin_lock_irqsave(&ioapic_lock, flags);
1885 reg_01.raw = io_apic_read(ioapic, 1);
1886 spin_unlock_irqrestore(&ioapic_lock, flags);
1887
1888 return reg_01.bits.version;
1889}
1890
1891
1892int __init io_apic_get_redir_entries (int ioapic)
1893{
1894 union IO_APIC_reg_01 reg_01;
1895 unsigned long flags;
1896
1897 spin_lock_irqsave(&ioapic_lock, flags);
1898 reg_01.raw = io_apic_read(ioapic, 1);
1899 spin_unlock_irqrestore(&ioapic_lock, flags);
1900
1901 return reg_01.bits.entries;
1902}
1903
1904
1905int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
1906{
1907 struct IO_APIC_route_entry entry;
1908 unsigned long flags;
1909
1910 if (!IO_APIC_IRQ(irq)) {
1911 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
1912 ioapic);
1913 return -EINVAL;
1914 }
1915
1916 /*
1917 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
1918 * Note that we mask (disable) IRQs now -- these get enabled when the
1919 * corresponding device driver registers for this IRQ.
1920 */
1921
1922 memset(&entry,0,sizeof(entry));
1923
1924 entry.delivery_mode = INT_DELIVERY_MODE;
1925 entry.dest_mode = INT_DEST_MODE;
1926 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1927 entry.trigger = edge_level;
1928 entry.polarity = active_high_low;
1929 entry.mask = 1; /* Disabled (masked) */
1930
1931 /*
1932 * IRQs < 16 are already in the irq_2_pin[] map
1933 */
1934 if (irq >= 16)
1935 add_pin_to_irq(irq, ioapic, pin);
1936
1937 entry.vector = assign_irq_vector(irq);
1938
1939 apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
1940 "IRQ %d Mode:%i Active:%i)\n", ioapic,
1941 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
1942 edge_level, active_high_low);
1943
1944 ioapic_register_intr(irq, entry.vector, edge_level);
1945
1946 if (!ioapic && (irq < 16))
1947 disable_8259A_irq(irq);
1948
1949 spin_lock_irqsave(&ioapic_lock, flags);
1950 io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
1951 io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
1952 spin_unlock_irqrestore(&ioapic_lock, flags);
1953
1954 return 0;
1955}
1956
1957#endif /*CONFIG_ACPI_BOOT*/
1958
1959
1960/*
1961 * This function is currently only a helper for the SMP boot process, where we
1962 * need to reprogram the ioredtbls to cater for the CPUs which have come online,
1963 * so the mask in all cases should simply be TARGET_CPUS.
1964 */
1965void __init setup_ioapic_dest(void)
1966{
1967 int pin, ioapic, irq, irq_entry;
1968
1969 if (skip_ioapic_setup == 1)
1970 return;
1971
1972 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
1973 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
1974 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
1975 if (irq_entry == -1)
1976 continue;
1977 irq = pin_2_irq(irq_entry, ioapic, pin);
1978 set_ioapic_affinity_irq(irq, TARGET_CPUS);
1979 }
1980
1981 }
1982}
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
new file mode 100644
index 000000000000..cde0e868efe3
--- /dev/null
+++ b/arch/x86_64/kernel/ioport.c
@@ -0,0 +1,117 @@
1/*
2 * linux/arch/x86_64/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/types.h>
12#include <linux/ioport.h>
13#include <linux/smp.h>
14#include <linux/smp_lock.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
21{
22 int i;
23 if (new_value)
24 for (i = base; i < base + extent; i++)
25 __set_bit(i, bitmap);
26 else
27 for (i = base; i < base + extent; i++)
28 clear_bit(i, bitmap);
29}
30
31/*
32 * this changes the io permissions bitmap in the current task.
33 */
34asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
35{
36 unsigned int i, max_long, bytes, bytes_updated;
37 struct thread_struct * t = &current->thread;
38 struct tss_struct * tss;
39 unsigned long *bitmap;
40
41 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
42 return -EINVAL;
43 if (turn_on && !capable(CAP_SYS_RAWIO))
44 return -EPERM;
45
46 /*
47 * If it's the first ioperm() call in this thread's lifetime, set the
48 * IO bitmap up. ioperm() is much less timing critical than clone(),
49 * which is why we delay this operation until now:
50 */
51 if (!t->io_bitmap_ptr) {
52 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
53 if (!bitmap)
54 return -ENOMEM;
55
56 memset(bitmap, 0xff, IO_BITMAP_BYTES);
57 t->io_bitmap_ptr = bitmap;
58 }
59
60 /*
61 * do it in the per-thread copy and in the TSS ...
62 *
63 * Disable preemption via get_cpu() - we must not switch away
64 * because the ->io_bitmap_max value must match the bitmap
65 * contents:
66 */
67 tss = &per_cpu(init_tss, get_cpu());
68
69 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
70
71 /*
72 * Search for a (possibly new) maximum. This is simple and stupid,
73 * to keep it obviously correct:
74 */
75 max_long = 0;
76 for (i = 0; i < IO_BITMAP_LONGS; i++)
77 if (t->io_bitmap_ptr[i] != ~0UL)
78 max_long = i;
79
80 bytes = (max_long + 1) * sizeof(long);
81 bytes_updated = max(bytes, t->io_bitmap_max);
82
83 t->io_bitmap_max = bytes;
84
85 /* Update the TSS: */
86 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
87
88 put_cpu();
89
90 return 0;
91}
92
93/*
94 * sys_iopl has to be used when you want to access the IO ports
95 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
96 * you'd need 8kB of bitmaps/process, which is a bit excessive.
97 *
98 * Here we just change the eflags value on the stack: we allow
99 * only the super-user to do it. This depends on the stack-layout
100 * on system-call entry - see also fork() and the signal handling
101 * code.
102 */
103
104asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
105{
106 unsigned int old = (regs->eflags >> 12) & 3;
107
108 if (level > 3)
109 return -EINVAL;
110 /* Trying to gain more privileges? */
111 if (level > old) {
112 if (!capable(CAP_SYS_RAWIO))
113 return -EPERM;
114 }
115 regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
116 return 0;
117}
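/*
 * Minimal user-space usage sketch (illustrative, not part of this file):
 * a privileged process grants itself access to the RTC index/data ports with
 * ioperm(), or could instead raise IOPL with iopl(3) for unrestricted port
 * access.  Note that in the TSS I/O bitmap a cleared bit means "access
 * allowed", which is why sys_ioperm() above passes !turn_on to set_bitmap().
 */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	/* allow ports 0x70 and 0x71 only (requires CAP_SYS_RAWIO) */
	if (ioperm(0x70, 2, 1) < 0) {
		perror("ioperm");
		return 1;
	}
	outb(0x0a, 0x70);			/* select RTC register A */
	printf("RTC reg A = 0x%02x\n", inb(0x71));
	return 0;
}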
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
new file mode 100644
index 000000000000..62b112e4deb4
--- /dev/null
+++ b/arch/x86_64/kernel/irq.c
@@ -0,0 +1,108 @@
1/*
2 * linux/arch/x86_64/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86_64-specific interrupt
7 * entry and irq statistics code. All the remaining irq logic is
8 * done by the generic kernel/irq/ code and in the
9 * x86_64-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/kernel_stat.h>
14#include <linux/interrupt.h>
15#include <linux/seq_file.h>
16#include <linux/module.h>
17#include <asm/uaccess.h>
18#include <asm/io_apic.h>
19
20atomic_t irq_err_count;
21#ifdef CONFIG_X86_IO_APIC
22#ifdef APIC_MISMATCH_DEBUG
23atomic_t irq_mis_count;
24#endif
25#endif
26
27/*
28 * Generic, controller-independent functions:
29 */
30
31int show_interrupts(struct seq_file *p, void *v)
32{
33 int i = *(loff_t *) v, j;
34 struct irqaction * action;
35 unsigned long flags;
36
37 if (i == 0) {
38 seq_printf(p, " ");
39 for (j=0; j<NR_CPUS; j++)
40 if (cpu_online(j))
41 seq_printf(p, "CPU%d ",j);
42 seq_putc(p, '\n');
43 }
44
45 if (i < NR_IRQS) {
46 spin_lock_irqsave(&irq_desc[i].lock, flags);
47 action = irq_desc[i].action;
48 if (!action)
49 goto skip;
50 seq_printf(p, "%3d: ",i);
51#ifndef CONFIG_SMP
52 seq_printf(p, "%10u ", kstat_irqs(i));
53#else
54 for (j=0; j<NR_CPUS; j++)
55 if (cpu_online(j))
56 seq_printf(p, "%10u ",
57 kstat_cpu(j).irqs[i]);
58#endif
59 seq_printf(p, " %14s", irq_desc[i].handler->typename);
60
61 seq_printf(p, " %s", action->name);
62 for (action=action->next; action; action = action->next)
63 seq_printf(p, ", %s", action->name);
64 seq_putc(p, '\n');
65skip:
66 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
67 } else if (i == NR_IRQS) {
68 seq_printf(p, "NMI: ");
69 for (j = 0; j < NR_CPUS; j++)
70 if (cpu_online(j))
71 seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
72 seq_putc(p, '\n');
73#ifdef CONFIG_X86_LOCAL_APIC
74 seq_printf(p, "LOC: ");
75 for (j = 0; j < NR_CPUS; j++)
76 if (cpu_online(j))
77 seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
78 seq_putc(p, '\n');
79#endif
80 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
81#ifdef CONFIG_X86_IO_APIC
82#ifdef APIC_MISMATCH_DEBUG
83 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
84#endif
85#endif
86 }
87 return 0;
88}
89
90/*
91 * do_IRQ handles all normal device IRQs (the special
92 * SMP cross-CPU interrupts have their own specific
93 * handlers).
94 */
95asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
96{
97 /* high bits used in ret_from_ code */
98 unsigned irq = regs->orig_rax & 0xff;
99
100 irq_enter();
101 BUG_ON(irq > 256);
102
103 __do_IRQ(irq, regs);
104 irq_exit();
105
106 return 1;
107}
108
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
new file mode 100644
index 000000000000..4f2a852299b6
--- /dev/null
+++ b/arch/x86_64/kernel/kprobes.c
@@ -0,0 +1,631 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/x86_64/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> adapted for x86_64
28 * 2005-Mar Roland McGrath <roland@redhat.com>
29 * Fixed to handle %rip-relative addressing mode correctly.
30 */
31
32#include <linux/config.h>
33#include <linux/kprobes.h>
34#include <linux/ptrace.h>
35#include <linux/spinlock.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/preempt.h>
39#include <linux/moduleloader.h>
40
41#include <asm/pgtable.h>
42#include <asm/kdebug.h>
43
44static DECLARE_MUTEX(kprobe_mutex);
45
46/* kprobe_status settings */
47#define KPROBE_HIT_ACTIVE 0x00000001
48#define KPROBE_HIT_SS 0x00000002
49
50static struct kprobe *current_kprobe;
51static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
52static struct pt_regs jprobe_saved_regs;
53static long *jprobe_saved_rsp;
54static kprobe_opcode_t *get_insn_slot(void);
55static void free_insn_slot(kprobe_opcode_t *slot);
56void jprobe_return_end(void);
57
58/* copy of the kernel stack at the probe fire time */
59static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
60
61/*
62 * returns non-zero if opcode modifies the interrupt flag.
63 */
64static inline int is_IF_modifier(kprobe_opcode_t *insn)
65{
66 switch (*insn) {
67 case 0xfa: /* cli */
68 case 0xfb: /* sti */
69 case 0xcf: /* iret/iretd */
70 case 0x9d: /* popf/popfd */
71 return 1;
72 }
73
74 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
75 return 1;
76 return 0;
77}
78
79int arch_prepare_kprobe(struct kprobe *p)
80{
81 /* insn: must be on special executable page on x86_64. */
82 up(&kprobe_mutex);
83 p->ainsn.insn = get_insn_slot();
84 down(&kprobe_mutex);
85 if (!p->ainsn.insn) {
86 return -ENOMEM;
87 }
88 return 0;
89}
90
91/*
92 * Determine if the instruction uses the %rip-relative addressing mode.
93 * If it does, return the address of the 32-bit displacement word.
94 * If not, return null.
95 */
96static inline s32 *is_riprel(u8 *insn)
97{
98#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
99 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
100 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
101 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
102 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
103 << (row % 64))
104 static const u64 onebyte_has_modrm[256 / 64] = {
105 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
106 /* ------------------------------- */
107 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
108 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
109 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
110 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
111 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
112 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
113 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
114 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
115 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
116 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
117 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
118 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
119 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
120 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
121 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
122 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
123 /* ------------------------------- */
124 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
125 };
126 static const u64 twobyte_has_modrm[256 / 64] = {
127 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
128 /* ------------------------------- */
129 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
130 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
131 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
132 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
133 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
134 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
135 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
136 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
137 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
138 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
139 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
140 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
141 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
142 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
143 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
144 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
145 /* ------------------------------- */
146 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
147 };
148#undef W
149 int need_modrm;
150
151 /* Skip legacy instruction prefixes. */
152 while (1) {
153 switch (*insn) {
154 case 0x66:
155 case 0x67:
156 case 0x2e:
157 case 0x3e:
158 case 0x26:
159 case 0x64:
160 case 0x65:
161 case 0x36:
162 case 0xf0:
163 case 0xf3:
164 case 0xf2:
165 ++insn;
166 continue;
167 }
168 break;
169 }
170
171 /* Skip REX instruction prefix. */
172 if ((*insn & 0xf0) == 0x40)
173 ++insn;
174
175 if (*insn == 0x0f) { /* Two-byte opcode. */
176 ++insn;
177 need_modrm = test_bit(*insn, twobyte_has_modrm);
178 } else { /* One-byte opcode. */
179 need_modrm = test_bit(*insn, onebyte_has_modrm);
180 }
181
182 if (need_modrm) {
183 u8 modrm = *++insn;
184 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
185 /* Displacement follows ModRM byte. */
186 return (s32 *) ++insn;
187 }
188 }
189
190 /* No %rip-relative addressing mode here. */
191 return NULL;
192}
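/*
 * Worked example (illustrative): the instruction "mov 0x1234(%rip),%rax"
 * encodes as 48 8b 05 34 12 00 00.  is_riprel() skips the REX prefix 0x48,
 * finds the one-byte opcode 0x8b (which takes a ModRM byte per the table
 * above), and sees ModRM = 0x05, i.e. mod = 00, r/m = 101 -- the
 * %rip+disp32 form in 64-bit mode -- so it returns a pointer to the 32-bit
 * displacement 0x00001234.
 */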
193
194void arch_copy_kprobe(struct kprobe *p)
195{
196 s32 *ripdisp;
197 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
198 ripdisp = is_riprel(p->ainsn.insn);
199 if (ripdisp) {
200 /*
201 * The copied instruction uses the %rip-relative
202 * addressing mode. Adjust the displacement for the
203 * difference between the original location of this
204 * instruction and the location of the copy that will
205 * actually be run. The tricky bit here is making sure
206 * that the sign extension happens correctly in this
207 * calculation, since we need a signed 32-bit result to
208 * be sign-extended to 64 bits when it's added to the
209 * %rip value and yield the same 64-bit result that the
210 * sign-extension of the original signed 32-bit
211 * displacement would have given.
212 */
213 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
214 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
215 *ripdisp = disp;
216 }
217}
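/*
 * Worked example with made-up addresses (illustrative only): suppose the
 * 7-byte instruction "mov 0x1234(%rip),%rax" is probed at
 * p->addr = 0xffffffff8010a000 and its copy lands at
 * p->ainsn.insn = 0xffffffff8800c000.  The adjusted displacement is
 * 0xffffffff8010a000 + 0x1234 - 0xffffffff8800c000 = -0x07f00dcc, so the
 * copy, executed out of line, still references the same target byte.  The
 * BUG_ON() catches the case where the instruction slot is too far away for
 * the adjusted displacement to fit in a signed 32-bit value.
 */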
218
219void arch_remove_kprobe(struct kprobe *p)
220{
221 up(&kprobe_mutex);
222 free_insn_slot(p->ainsn.insn);
223 down(&kprobe_mutex);
224}
225
226static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
227{
228 *p->addr = p->opcode;
229 regs->rip = (unsigned long)p->addr;
230}
231
232static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
233{
234 regs->eflags |= TF_MASK;
235 regs->eflags &= ~IF_MASK;
236 /*single step inline if the instruction is an int3*/
237 if (p->opcode == BREAKPOINT_INSTRUCTION)
238 regs->rip = (unsigned long)p->addr;
239 else
240 regs->rip = (unsigned long)p->ainsn.insn;
241}
242
243/*
244 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
245 * remain disabled throughout this function.
246 */
247int kprobe_handler(struct pt_regs *regs)
248{
249 struct kprobe *p;
250 int ret = 0;
251 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
252
253 /* We're in an interrupt, but this is clear and BUG()-safe. */
254 preempt_disable();
255
256 /* Check we're not actually recursing */
257 if (kprobe_running()) {
258 /* We *are* holding lock here, so this is safe.
259 Disarm the probe we just hit, and ignore it. */
260 p = get_kprobe(addr);
261 if (p) {
262 if (kprobe_status == KPROBE_HIT_SS) {
263 regs->eflags &= ~TF_MASK;
264 regs->eflags |= kprobe_saved_rflags;
265 unlock_kprobes();
266 goto no_kprobe;
267 }
268 disarm_kprobe(p, regs);
269 ret = 1;
270 } else {
271 p = current_kprobe;
272 if (p->break_handler && p->break_handler(p, regs)) {
273 goto ss_probe;
274 }
275 }
276		/* If it's not ours, it can't be a delete race (we hold the lock). */
277 goto no_kprobe;
278 }
279
280 lock_kprobes();
281 p = get_kprobe(addr);
282 if (!p) {
283 unlock_kprobes();
284 if (*addr != BREAKPOINT_INSTRUCTION) {
285 /*
286 * The breakpoint instruction was removed right
287 * after we hit it. Another cpu has removed
288 * either a probepoint or a debugger breakpoint
289 * at this address. In either case, no further
290 * handling of this interrupt is appropriate.
291 */
292 ret = 1;
293 }
294 /* Not one of ours: let kernel handle it */
295 goto no_kprobe;
296 }
297
298 kprobe_status = KPROBE_HIT_ACTIVE;
299 current_kprobe = p;
300 kprobe_saved_rflags = kprobe_old_rflags
301 = (regs->eflags & (TF_MASK | IF_MASK));
302 if (is_IF_modifier(p->ainsn.insn))
303 kprobe_saved_rflags &= ~IF_MASK;
304
305 if (p->pre_handler && p->pre_handler(p, regs))
306 /* handler has already set things up, so skip ss setup */
307 return 1;
308
309ss_probe:
310 prepare_singlestep(p, regs);
311 kprobe_status = KPROBE_HIT_SS;
312 return 1;
313
314no_kprobe:
315 preempt_enable_no_resched();
316 return ret;
317}
318
319/*
320 * Called after single-stepping. p->addr is the address of the
321 * instruction whose first byte has been replaced by the "int 3"
322 * instruction. To avoid the SMP problems that can occur when we
323 * temporarily put back the original opcode to single-step, we
324 * single-stepped a copy of the instruction. The address of this
325 * copy is p->ainsn.insn.
326 *
327 * This function prepares to return from the post-single-step
328 * interrupt. We have to fix up the stack as follows:
329 *
330 * 0) Except in the case of absolute or indirect jump or call instructions,
331 * the new rip is relative to the copied instruction. We need to make
332 * it relative to the original instruction.
333 *
334 * 1) If the single-stepped instruction was pushfl, then the TF and IF
335 * flags are set in the just-pushed eflags, and may need to be cleared.
336 *
337 * 2) If the single-stepped instruction was a call, the return address
338 * that is atop the stack is the address following the copied instruction.
339 * We need to make it the address following the original instruction.
340 */
341static void resume_execution(struct kprobe *p, struct pt_regs *regs)
342{
343 unsigned long *tos = (unsigned long *)regs->rsp;
344 unsigned long next_rip = 0;
345 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
346 unsigned long orig_rip = (unsigned long)p->addr;
347 kprobe_opcode_t *insn = p->ainsn.insn;
348
349 /*skip the REX prefix*/
350 if (*insn >= 0x40 && *insn <= 0x4f)
351 insn++;
352
353 switch (*insn) {
354 case 0x9c: /* pushfl */
355 *tos &= ~(TF_MASK | IF_MASK);
356 *tos |= kprobe_old_rflags;
357 break;
358 case 0xe8: /* call relative - Fix return addr */
359 *tos = orig_rip + (*tos - copy_rip);
360 break;
361 case 0xff:
362 if ((*insn & 0x30) == 0x10) {
363 /* call absolute, indirect */
364 /* Fix return addr; rip is correct. */
365 next_rip = regs->rip;
366 *tos = orig_rip + (*tos - copy_rip);
367 } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */
368 ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */
369 /* rip is correct. */
370 next_rip = regs->rip;
371 }
372 break;
373 case 0xea: /* jmp absolute -- rip is correct */
374 next_rip = regs->rip;
375 break;
376 default:
377 break;
378 }
379
380 regs->eflags &= ~TF_MASK;
381 if (next_rip) {
382 regs->rip = next_rip;
383 } else {
384 regs->rip = orig_rip + (regs->rip - copy_rip);
385 }
386}
387
388/*
389 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
390 * remain disabled throughout this function. And we hold the kprobe lock.
391 */
392int post_kprobe_handler(struct pt_regs *regs)
393{
394 if (!kprobe_running())
395 return 0;
396
397 if (current_kprobe->post_handler)
398 current_kprobe->post_handler(current_kprobe, regs, 0);
399
400 resume_execution(current_kprobe, regs);
401 regs->eflags |= kprobe_saved_rflags;
402
403 unlock_kprobes();
404 preempt_enable_no_resched();
405
406 /*
407 * if somebody else is singlestepping across a probe point, eflags
408 * will have TF set, in which case, continue the remaining processing
409 * of do_debug, as if this is not a probe hit.
410 */
411 if (regs->eflags & TF_MASK)
412 return 0;
413
414 return 1;
415}
416
417/* Interrupts disabled, kprobe_lock held. */
418int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
419{
420 if (current_kprobe->fault_handler
421 && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
422 return 1;
423
424 if (kprobe_status & KPROBE_HIT_SS) {
425 resume_execution(current_kprobe, regs);
426 regs->eflags |= kprobe_old_rflags;
427
428 unlock_kprobes();
429 preempt_enable_no_resched();
430 }
431 return 0;
432}
433
434/*
435 * Wrapper routine for handling exceptions.
436 */
437int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
438 void *data)
439{
440 struct die_args *args = (struct die_args *)data;
441 switch (val) {
442 case DIE_INT3:
443 if (kprobe_handler(args->regs))
444 return NOTIFY_STOP;
445 break;
446 case DIE_DEBUG:
447 if (post_kprobe_handler(args->regs))
448 return NOTIFY_STOP;
449 break;
450 case DIE_GPF:
451 if (kprobe_running() &&
452 kprobe_fault_handler(args->regs, args->trapnr))
453 return NOTIFY_STOP;
454 break;
455 case DIE_PAGE_FAULT:
456 if (kprobe_running() &&
457 kprobe_fault_handler(args->regs, args->trapnr))
458 return NOTIFY_STOP;
459 break;
460 default:
461 break;
462 }
463 return NOTIFY_DONE;
464}
465
466int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
467{
468 struct jprobe *jp = container_of(p, struct jprobe, kp);
469 unsigned long addr;
470
471 jprobe_saved_regs = *regs;
472 jprobe_saved_rsp = (long *) regs->rsp;
473 addr = (unsigned long)jprobe_saved_rsp;
474 /*
475 * As Linus pointed out, gcc assumes that the callee
476 * owns the argument space and could overwrite it, e.g.
477 * tailcall optimization. So, to be absolutely safe
478 * we also save and restore enough stack bytes to cover
479 * the argument area.
480 */
481 memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
482 regs->eflags &= ~IF_MASK;
483 regs->rip = (unsigned long)(jp->entry);
484 return 1;
485}
486
487void jprobe_return(void)
488{
489 preempt_enable_no_resched();
490 asm volatile (" xchg %%rbx,%%rsp \n"
491 " int3 \n"
492 " .globl jprobe_return_end \n"
493 " jprobe_return_end: \n"
494 " nop \n"::"b"
495 (jprobe_saved_rsp):"memory");
496}
497
498int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
499{
500 u8 *addr = (u8 *) (regs->rip - 1);
501 unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
502 struct jprobe *jp = container_of(p, struct jprobe, kp);
503
504 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
505 if ((long *)regs->rsp != jprobe_saved_rsp) {
506 struct pt_regs *saved_regs =
507 container_of(jprobe_saved_rsp, struct pt_regs, rsp);
508 printk("current rsp %p does not match saved rsp %p\n",
509 (long *)regs->rsp, jprobe_saved_rsp);
510 printk("Saved registers for jprobe %p\n", jp);
511 show_registers(saved_regs);
512 printk("Current registers\n");
513 show_registers(regs);
514 BUG();
515 }
516 *regs = jprobe_saved_regs;
517 memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
518 MIN_STACK_SIZE(stack_addr));
519 return 1;
520 }
521 return 0;
522}
523
524/*
525 * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
526 * By default on x86_64, pages we get from kmalloc or vmalloc are not
527 * executable. Single-stepping an instruction on such a page yields an
528 * oops. So instead of storing the instruction copies in their respective
529 * kprobe objects, we allocate a page, map it executable, and store all the
530 * instruction copies there. (We can allocate additional pages if somebody
531 * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE
532 * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
533 * bytes.
534 */
535#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
536struct kprobe_insn_page {
537 struct hlist_node hlist;
538 kprobe_opcode_t *insns; /* page of instruction slots */
539 char slot_used[INSNS_PER_PAGE];
540 int nused;
541};
542
543static struct hlist_head kprobe_insn_pages;
544
545/**
546 * get_insn_slot() - Find a slot on an executable page for an instruction.
547 * We allocate an executable page if there's no room on existing ones.
548 */
549static kprobe_opcode_t *get_insn_slot(void)
550{
551 struct kprobe_insn_page *kip;
552 struct hlist_node *pos;
553
554 hlist_for_each(pos, &kprobe_insn_pages) {
555 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
556 if (kip->nused < INSNS_PER_PAGE) {
557 int i;
558 for (i = 0; i < INSNS_PER_PAGE; i++) {
559 if (!kip->slot_used[i]) {
560 kip->slot_used[i] = 1;
561 kip->nused++;
562 return kip->insns + (i*MAX_INSN_SIZE);
563 }
564 }
565 /* Surprise! No unused slots. Fix kip->nused. */
566 kip->nused = INSNS_PER_PAGE;
567 }
568 }
569
570 /* All out of space. Need to allocate a new page. Use slot 0.*/
571 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
572 if (!kip) {
573 return NULL;
574 }
575
576 /*
577 * For the %rip-relative displacement fixups to be doable, we
578 * need our instruction copy to be within +/- 2GB of any data it
579 * might access via %rip. That is, within 2GB of where the
580 * kernel image and loaded module images reside. So we allocate
581 * a page in the module loading area.
582 */
583 kip->insns = module_alloc(PAGE_SIZE);
584 if (!kip->insns) {
585 kfree(kip);
586 return NULL;
587 }
588 INIT_HLIST_NODE(&kip->hlist);
589 hlist_add_head(&kip->hlist, &kprobe_insn_pages);
590 memset(kip->slot_used, 0, INSNS_PER_PAGE);
591 kip->slot_used[0] = 1;
592 kip->nused = 1;
593 return kip->insns;
594}
595
596/**
597 * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
598 */
599static void free_insn_slot(kprobe_opcode_t *slot)
600{
601 struct kprobe_insn_page *kip;
602 struct hlist_node *pos;
603
604 hlist_for_each(pos, &kprobe_insn_pages) {
605 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
606 if (kip->insns <= slot
607 && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
608 int i = (slot - kip->insns) / MAX_INSN_SIZE;
609 kip->slot_used[i] = 0;
610 kip->nused--;
611 if (kip->nused == 0) {
612 /*
613 * Page is no longer in use. Free it unless
614 * it's the last one. We keep the last one
615 * so as not to have to set it up again the
616 * next time somebody inserts a probe.
617 */
618 hlist_del(&kip->hlist);
619 if (hlist_empty(&kprobe_insn_pages)) {
620 INIT_HLIST_NODE(&kip->hlist);
621 hlist_add_head(&kip->hlist,
622 &kprobe_insn_pages);
623 } else {
624 module_free(NULL, kip->insns);
625 kfree(kip);
626 }
627 }
628 return;
629 }
630 }
631}
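
The displacement fix-up that arch_copy_kprobe() performs above is easiest to see in isolation. Below is a minimal userspace sketch of the same arithmetic; the two addresses and the disp32 value are made-up illustrations rather than values from this patch, and the assert() stands in for the BUG_ON() sanity check.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
	/* Hypothetical addresses: where the probed instruction lives and
	 * where its copy was placed in the module-area instruction slot. */
	uint64_t orig_insn = 0xffffffff80200000ULL;
	uint64_t copy_insn = 0xffffffff88000000ULL;
	int32_t  orig_disp = 0x1234;	/* disp32 embedded in the instruction */

	/* Same arithmetic as arch_copy_kprobe(): rebase the displacement so
	 * the copy still refers to the original absolute %rip target. */
	int64_t new_disp = (int64_t)(orig_insn + (int64_t)orig_disp - copy_insn);

	/* The kernel refuses the probe if the rebased value no longer
	 * fits in a signed 32-bit displacement. */
	assert((int64_t)(int32_t)new_disp == new_disp);

	printf("adjusted disp32 = %d\n", (int32_t)new_disp);
	return 0;
}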
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c
new file mode 100644
index 000000000000..d7e5d0cf4285
--- /dev/null
+++ b/arch/x86_64/kernel/ldt.c
@@ -0,0 +1,253 @@
1/*
2 * linux/arch/x86_64/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 * Copyright (C) 2002 Andi Kleen
7 *
8 * This handles calls from both 32bit and 64bit mode.
9 */
10
11#include <linux/errno.h>
12#include <linux/sched.h>
13#include <linux/string.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/smp_lock.h>
17#include <linux/vmalloc.h>
18#include <linux/slab.h>
19
20#include <asm/uaccess.h>
21#include <asm/system.h>
22#include <asm/ldt.h>
23#include <asm/desc.h>
24#include <asm/proto.h>
25
26#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
27static void flush_ldt(void *null)
28{
29 if (current->active_mm)
30 load_LDT(&current->active_mm->context);
31}
32#endif
33
34static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
35{
36 void *oldldt;
37 void *newldt;
38 unsigned oldsize;
39
40 if (mincount <= (unsigned)pc->size)
41 return 0;
42 oldsize = pc->size;
43 mincount = (mincount+511)&(~511);
44 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
45 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
46 else
47 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
48
49 if (!newldt)
50 return -ENOMEM;
51
52 if (oldsize)
53 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
54 oldldt = pc->ldt;
55 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
56 wmb();
57 pc->ldt = newldt;
58 wmb();
59 pc->size = mincount;
60 wmb();
61 if (reload) {
62#ifdef CONFIG_SMP
63 cpumask_t mask;
64
65 preempt_disable();
66 mask = cpumask_of_cpu(smp_processor_id());
67 load_LDT(pc);
68 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
69 smp_call_function(flush_ldt, NULL, 1, 1);
70 preempt_enable();
71#else
72 load_LDT(pc);
73#endif
74 }
75 if (oldsize) {
76 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
77 vfree(oldldt);
78 else
79 kfree(oldldt);
80 }
81 return 0;
82}
83
84static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
85{
86 int err = alloc_ldt(new, old->size, 0);
87 if (err < 0)
88 return err;
89 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
90 return 0;
91}
92
93/*
94 * we do not have to muck with descriptors here, that is
95 * done in switch_mm() as needed.
96 */
97int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
98{
99 struct mm_struct * old_mm;
100 int retval = 0;
101
102 init_MUTEX(&mm->context.sem);
103 mm->context.size = 0;
104 old_mm = current->mm;
105 if (old_mm && old_mm->context.size > 0) {
106 down(&old_mm->context.sem);
107 retval = copy_ldt(&mm->context, &old_mm->context);
108 up(&old_mm->context.sem);
109 }
110 return retval;
111}
112
113/*
114 *
115 * Don't touch the LDT register - we're already in the next thread.
116 */
117void destroy_context(struct mm_struct *mm)
118{
119 if (mm->context.size) {
120 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
121 vfree(mm->context.ldt);
122 else
123 kfree(mm->context.ldt);
124 mm->context.size = 0;
125 }
126}
127
128static int read_ldt(void __user * ptr, unsigned long bytecount)
129{
130 int err;
131 unsigned long size;
132 struct mm_struct * mm = current->mm;
133
134 if (!mm->context.size)
135 return 0;
136 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
137 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
138
139 down(&mm->context.sem);
140 size = mm->context.size*LDT_ENTRY_SIZE;
141 if (size > bytecount)
142 size = bytecount;
143
144 err = 0;
145 if (copy_to_user(ptr, mm->context.ldt, size))
146 err = -EFAULT;
147 up(&mm->context.sem);
148 if (err < 0)
149 goto error_return;
150 if (size != bytecount) {
151 /* zero-fill the rest */
152 if (clear_user(ptr+size, bytecount-size) != 0) {
153 err = -EFAULT;
154 goto error_return;
155 }
156 }
157 return bytecount;
158error_return:
159 return err;
160}
161
162static int read_default_ldt(void __user * ptr, unsigned long bytecount)
163{
164 /* Arbitrary number */
165 /* x86-64 default LDT is all zeros */
166 if (bytecount > 128)
167 bytecount = 128;
168 if (clear_user(ptr, bytecount))
169 return -EFAULT;
170 return bytecount;
171}
172
173static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
174{
175 struct task_struct *me = current;
176 struct mm_struct * mm = me->mm;
177 __u32 entry_1, entry_2, *lp;
178 int error;
179 struct user_desc ldt_info;
180
181 error = -EINVAL;
182
183 if (bytecount != sizeof(ldt_info))
184 goto out;
185 error = -EFAULT;
186 if (copy_from_user(&ldt_info, ptr, bytecount))
187 goto out;
188
189 error = -EINVAL;
190 if (ldt_info.entry_number >= LDT_ENTRIES)
191 goto out;
192 if (ldt_info.contents == 3) {
193 if (oldmode)
194 goto out;
195 if (ldt_info.seg_not_present == 0)
196 goto out;
197 }
198
199 down(&mm->context.sem);
200 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
201 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
202 if (error < 0)
203 goto out_unlock;
204 }
205
206 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
207
208 /* Allow LDTs to be cleared by the user. */
209 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
210 if (oldmode || LDT_empty(&ldt_info)) {
211 entry_1 = 0;
212 entry_2 = 0;
213 goto install;
214 }
215 }
216
217 entry_1 = LDT_entry_a(&ldt_info);
218 entry_2 = LDT_entry_b(&ldt_info);
219 if (oldmode)
220 entry_2 &= ~(1 << 20);
221
222 /* Install the new entry ... */
223install:
224 *lp = entry_1;
225 *(lp+1) = entry_2;
226 error = 0;
227
228out_unlock:
229 up(&mm->context.sem);
230out:
231 return error;
232}
233
234asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
235{
236 int ret = -ENOSYS;
237
238 switch (func) {
239 case 0:
240 ret = read_ldt(ptr, bytecount);
241 break;
242 case 1:
243 ret = write_ldt(ptr, bytecount, 1);
244 break;
245 case 2:
246 ret = read_default_ldt(ptr, bytecount);
247 break;
248 case 0x11:
249 ret = write_ldt(ptr, bytecount, 0);
250 break;
251 }
252 return ret;
253}
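
The read_ldt()/write_ldt() entry points above are reached from userspace through the modify_ldt(2) system call. A small sketch, assuming glibc's syscall() wrapper and the definitions exported in <asm/ldt.h>, installs one flat 32-bit data descriptor via func 0x11 (the non-legacy write path, oldmode=0) and then reads the raw descriptor table back with func 0:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>

int main(void)
{
	struct user_desc desc;
	unsigned char buf[LDT_ENTRIES * LDT_ENTRY_SIZE];
	long n;

	/* Flat 4GB, 32-bit data segment in LDT slot 0. */
	memset(&desc, 0, sizeof(desc));
	desc.entry_number   = 0;
	desc.base_addr      = 0;
	desc.limit          = 0xfffff;
	desc.seg_32bit      = 1;
	desc.limit_in_pages = 1;
	desc.useable        = 1;

	if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0) {
		perror("modify_ldt(write)");
		return 1;
	}

	/* read_ldt() copies the live table and zero-fills the rest. */
	n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));
	printf("read_ldt returned %ld bytes\n", n);
	return 0;
}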
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
new file mode 100644
index 000000000000..86f9fd85016a
--- /dev/null
+++ b/arch/x86_64/kernel/mce.c
@@ -0,0 +1,548 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
18#include <asm/processor.h>
19#include <asm/msr.h>
20#include <asm/mce.h>
21#include <asm/kdebug.h>
22#include <asm/uaccess.h>
23
24#define MISC_MCELOG_MINOR 227
25#define NR_BANKS 5
26
27static int mce_dont_init;
28
29/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
30 3: never panic or exit (for testing only) */
31static int tolerant = 1;
32static int banks;
33static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
34static unsigned long console_logged;
35static int notify_user;
36
37/*
38 * Lockless MCE logging infrastructure.
39 * This avoids deadlocks on printk locks without having to break locks. Also
40 * separate MCEs from kernel messages to avoid bogus bug reports.
41 */
42
43struct mce_log mcelog = {
44 MCE_LOG_SIGNATURE,
45 MCE_LOG_LEN,
46};
47
48void mce_log(struct mce *mce)
49{
50 unsigned next, entry;
51 mce->finished = 0;
52 smp_wmb();
53 for (;;) {
54 entry = rcu_dereference(mcelog.next);
55 /* When the buffer fills up discard new entries. Assume
56 that the earlier errors are the more interesting. */
57 if (entry >= MCE_LOG_LEN) {
58 set_bit(MCE_OVERFLOW, &mcelog.flags);
59 return;
60 }
61 /* Old left over entry. Skip. */
62 if (mcelog.entry[entry].finished)
63 continue;
64 smp_rmb();
65 next = entry + 1;
66 if (cmpxchg(&mcelog.next, entry, next) == entry)
67 break;
68 }
69 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
70 smp_wmb();
71 mcelog.entry[entry].finished = 1;
72 smp_wmb();
73
74 if (!test_and_set_bit(0, &console_logged))
75 notify_user = 1;
76}
77
78static void print_mce(struct mce *m)
79{
80 printk(KERN_EMERG "\n"
81 KERN_EMERG
82 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
83 m->cpu, m->mcgstatus, m->bank, m->status);
84 if (m->rip) {
85 printk(KERN_EMERG
86 "RIP%s %02x:<%016Lx> ",
87 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
88 m->cs, m->rip);
89 if (m->cs == __KERNEL_CS)
90 print_symbol("{%s}", m->rip);
91 printk("\n");
92 }
93 printk(KERN_EMERG "TSC %Lx ", m->tsc);
94 if (m->addr)
95 printk("ADDR %Lx ", m->addr);
96 if (m->misc)
97 printk("MISC %Lx ", m->misc);
98 printk("\n");
99}
100
101static void mce_panic(char *msg, struct mce *backup, unsigned long start)
102{
103 int i;
104 oops_begin();
105 for (i = 0; i < MCE_LOG_LEN; i++) {
106 unsigned long tsc = mcelog.entry[i].tsc;
107 if (time_before(tsc, start))
108 continue;
109 print_mce(&mcelog.entry[i]);
110 if (backup && mcelog.entry[i].tsc == backup->tsc)
111 backup = NULL;
112 }
113 if (backup)
114 print_mce(backup);
115 if (tolerant >= 3)
116 printk("Fake panic: %s\n", msg);
117 else
118 panic(msg);
119}
120
121static int mce_available(struct cpuinfo_x86 *c)
122{
123 return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
124 test_bit(X86_FEATURE_MCA, &c->x86_capability);
125}
126
127/*
128 * The actual machine check handler
129 */
130
131void do_machine_check(struct pt_regs * regs, long error_code)
132{
133 struct mce m, panicm;
134 int nowayout = (tolerant < 1);
135 int kill_it = 0;
136 u64 mcestart = 0;
137 int i;
138 int panicm_found = 0;
139
140 if (regs)
141 notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
142 if (!banks)
143 return;
144
145 memset(&m, 0, sizeof(struct mce));
146 m.cpu = hard_smp_processor_id();
147 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
148 if (!(m.mcgstatus & MCG_STATUS_RIPV))
149 kill_it = 1;
150
151 rdtscll(mcestart);
152 barrier();
153
154 for (i = 0; i < banks; i++) {
155 if (!bank[i])
156 continue;
157
158 m.misc = 0;
159 m.addr = 0;
160 m.bank = i;
161 m.tsc = 0;
162
163 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
164 if ((m.status & MCI_STATUS_VAL) == 0)
165 continue;
166
167 if (m.status & MCI_STATUS_EN) {
168 /* In theory _OVER could be a nowayout too, but
169			   assume any overflowed errors were not fatal. */
170 nowayout |= !!(m.status & MCI_STATUS_PCC);
171 kill_it |= !!(m.status & MCI_STATUS_UC);
172 }
173
174 if (m.status & MCI_STATUS_MISCV)
175 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
176 if (m.status & MCI_STATUS_ADDRV)
177 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
178
179 if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) {
180 m.rip = regs->rip;
181 m.cs = regs->cs;
182 } else {
183 m.rip = 0;
184 m.cs = 0;
185 }
186
187 if (error_code != -1)
188 rdtscll(m.tsc);
189 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
190 mce_log(&m);
191
192 /* Did this bank cause the exception? */
193 /* Assume that the bank with uncorrectable errors did it,
194 and that there is only a single one. */
195 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
196 panicm = m;
197 panicm_found = 1;
198 }
199
200 tainted |= TAINT_MACHINE_CHECK;
201 }
202
203 /* Never do anything final in the polling timer */
204 if (!regs)
205 goto out;
206
207 /* If we didn't find an uncorrectable error, pick
208 the last one (shouldn't happen, just being safe). */
209 if (!panicm_found)
210 panicm = m;
211 if (nowayout)
212 mce_panic("Machine check", &panicm, mcestart);
213 if (kill_it) {
214 int user_space = 0;
215
216 if (m.mcgstatus & MCG_STATUS_RIPV)
217 user_space = panicm.rip && (panicm.cs & 3);
218
219 /* When the machine was in user space and the CPU didn't get
220 confused it's normally not necessary to panic, unless you
221 are paranoid (tolerant == 0)
222
223 RED-PEN could be more tolerant for MCEs in idle,
224 but most likely they occur at boot anyways, where
225 it is best to just halt the machine. */
226 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
227 (unsigned)current->pid <= 1)
228 mce_panic("Uncorrected machine check", &panicm, mcestart);
229
230		/* do_exit takes an awful lot of locks and has a
231		   slight risk of deadlocking. If you don't want that,
232		   don't set tolerant >= 2 */
233 if (tolerant < 3)
234 do_exit(SIGBUS);
235 }
236
237 out:
238 /* Last thing done in the machine check exception to clear state. */
239 wrmsrl(MSR_IA32_MCG_STATUS, 0);
240}
241
242/*
243 * Periodic polling timer for "silent" machine check errors.
244 */
245
246static int check_interval = 5 * 60; /* 5 minutes */
247static void mcheck_timer(void *data);
248static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
249
250static void mcheck_check_cpu(void *info)
251{
252 if (mce_available(&current_cpu_data))
253 do_machine_check(NULL, 0);
254}
255
256static void mcheck_timer(void *data)
257{
258 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
259 schedule_delayed_work(&mcheck_work, check_interval * HZ);
260
261 /*
262 * It's ok to read stale data here for notify_user and
263 * console_logged as we'll simply get the updated versions
264 * on the next mcheck_timer execution and atomic operations
265 * on console_logged act as synchronization for notify_user
266 * writes.
267 */
268 if (notify_user && console_logged) {
269 notify_user = 0;
270 clear_bit(0, &console_logged);
271 printk(KERN_INFO "Machine check events logged\n");
272 }
273}
274
275
276static __init int periodic_mcheck_init(void)
277{
278 if (check_interval)
279 schedule_delayed_work(&mcheck_work, check_interval*HZ);
280 return 0;
281}
282__initcall(periodic_mcheck_init);
283
284
285/*
286 * Initialize Machine Checks for a CPU.
287 */
288static void mce_init(void *dummy)
289{
290 u64 cap;
291 int i;
292
293 rdmsrl(MSR_IA32_MCG_CAP, cap);
294 banks = cap & 0xff;
295 if (banks > NR_BANKS) {
296 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
297 banks = NR_BANKS;
298 }
299
300 /* Log the machine checks left over from the previous reset.
301 This also clears all registers */
302 do_machine_check(NULL, -1);
303
304 set_in_cr4(X86_CR4_MCE);
305
306 if (cap & MCG_CTL_P)
307 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
308
309 for (i = 0; i < banks; i++) {
310 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
311 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
312 }
313}
314
315/* Add per CPU specific workarounds here */
316static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
317{
318 /* This should be disabled by the BIOS, but isn't always */
319 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
320 /* disable GART TBL walk error reporting, which trips off
321 incorrectly with the IOMMU & 3ware & Cerberus. */
322 clear_bit(10, &bank[4]);
323 }
324}
325
326static void __init mce_cpu_features(struct cpuinfo_x86 *c)
327{
328 switch (c->x86_vendor) {
329 case X86_VENDOR_INTEL:
330 mce_intel_feature_init(c);
331 break;
332 default:
333 break;
334 }
335}
336
337/*
338 * Called for each booted CPU to set up machine checks.
339 * Must be called with preempt off.
340 */
341void __init mcheck_init(struct cpuinfo_x86 *c)
342{
343 static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
344
345 mce_cpu_quirks(c);
346
347 if (mce_dont_init ||
348 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
349 !mce_available(c))
350 return;
351
352 mce_init(NULL);
353 mce_cpu_features(c);
354}
355
356/*
357 * Character device to read and clear the MCE log.
358 */
359
360static void collect_tscs(void *data)
361{
362 unsigned long *cpu_tsc = (unsigned long *)data;
363 rdtscll(cpu_tsc[smp_processor_id()]);
364}
365
366static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
367{
368 unsigned long cpu_tsc[NR_CPUS];
369 static DECLARE_MUTEX(mce_read_sem);
370 unsigned next;
371 char __user *buf = ubuf;
372 int i, err;
373
374 down(&mce_read_sem);
375 next = rcu_dereference(mcelog.next);
376
377 /* Only supports full reads right now */
378 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
379 up(&mce_read_sem);
380 return -EINVAL;
381 }
382
383 err = 0;
384 for (i = 0; i < next; i++) {
385 if (!mcelog.entry[i].finished)
386 continue;
387 smp_rmb();
388 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
389 buf += sizeof(struct mce);
390 }
391
392 memset(mcelog.entry, 0, next * sizeof(struct mce));
393 mcelog.next = 0;
394
395 synchronize_kernel();
396
397 /* Collect entries that were still getting written before the synchronize. */
398
399 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
400 for (i = next; i < MCE_LOG_LEN; i++) {
401 if (mcelog.entry[i].finished &&
402 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
403 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
404 smp_rmb();
405 buf += sizeof(struct mce);
406 memset(&mcelog.entry[i], 0, sizeof(struct mce));
407 }
408 }
409 up(&mce_read_sem);
410 return err ? -EFAULT : buf - ubuf;
411}
412
413static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
414{
415 int __user *p = (int __user *)arg;
416 if (!capable(CAP_SYS_ADMIN))
417 return -EPERM;
418 switch (cmd) {
419 case MCE_GET_RECORD_LEN:
420 return put_user(sizeof(struct mce), p);
421 case MCE_GET_LOG_LEN:
422 return put_user(MCE_LOG_LEN, p);
423 case MCE_GETCLEAR_FLAGS: {
424 unsigned flags;
425 do {
426 flags = mcelog.flags;
427 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
428 return put_user(flags, p);
429 }
430 default:
431 return -ENOTTY;
432 }
433}
434
435static struct file_operations mce_chrdev_ops = {
436 .read = mce_read,
437 .ioctl = mce_ioctl,
438};
439
440static struct miscdevice mce_log_device = {
441 MISC_MCELOG_MINOR,
442 "mcelog",
443 &mce_chrdev_ops,
444};
445
446/*
447 * Old style boot options parsing. Only for compatibility.
448 */
449
450static int __init mcheck_disable(char *str)
451{
452 mce_dont_init = 1;
453 return 0;
454}
455
456/* mce=off disables machine check. Note you can reenable it later
457 using sysfs */
458static int __init mcheck_enable(char *str)
459{
460 if (!strcmp(str, "off"))
461 mce_dont_init = 1;
462 else
463 printk("mce= argument %s ignored. Please use /sys", str);
464 return 0;
465}
466
467__setup("nomce", mcheck_disable);
468__setup("mce", mcheck_enable);
469
470/*
471 * Sysfs support
472 */
473
474/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
475static int mce_resume(struct sys_device *dev)
476{
477 on_each_cpu(mce_init, NULL, 1, 1);
478 return 0;
479}
480
481/* Reinit MCEs after user configuration changes */
482static void mce_restart(void)
483{
484 if (check_interval)
485 cancel_delayed_work(&mcheck_work);
486 /* Timer race is harmless here */
487 on_each_cpu(mce_init, NULL, 1, 1);
488 if (check_interval)
489 schedule_delayed_work(&mcheck_work, check_interval*HZ);
490}
491
492static struct sysdev_class mce_sysclass = {
493 .resume = mce_resume,
494 set_kset_name("machinecheck"),
495};
496
497static struct sys_device device_mce = {
498 .id = 0,
499 .cls = &mce_sysclass,
500};
501
502/* Why are there no generic functions for this? */
503#define ACCESSOR(name, var, start) \
504 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
505 return sprintf(buf, "%lx\n", (unsigned long)var); \
506 } \
507 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
508 char *end; \
509 unsigned long new = simple_strtoul(buf, &end, 0); \
510 if (end == buf) return -EINVAL; \
511 var = new; \
512 start; \
513 return end-buf; \
514 } \
515 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
516
517ACCESSOR(bank0ctl,bank[0],mce_restart())
518ACCESSOR(bank1ctl,bank[1],mce_restart())
519ACCESSOR(bank2ctl,bank[2],mce_restart())
520ACCESSOR(bank3ctl,bank[3],mce_restart())
521ACCESSOR(bank4ctl,bank[4],mce_restart())
522ACCESSOR(tolerant,tolerant,)
523ACCESSOR(check_interval,check_interval,mce_restart())
524
525static __init int mce_init_device(void)
526{
527 int err;
528 if (!mce_available(&boot_cpu_data))
529 return -EIO;
530 err = sysdev_class_register(&mce_sysclass);
531 if (!err)
532 err = sysdev_register(&device_mce);
533 if (!err) {
534 /* could create per CPU objects, but it is not worth it. */
535 sysdev_create_file(&device_mce, &attr_bank0ctl);
536 sysdev_create_file(&device_mce, &attr_bank1ctl);
537 sysdev_create_file(&device_mce, &attr_bank2ctl);
538 sysdev_create_file(&device_mce, &attr_bank3ctl);
539 sysdev_create_file(&device_mce, &attr_bank4ctl);
540 sysdev_create_file(&device_mce, &attr_tolerant);
541 sysdev_create_file(&device_mce, &attr_check_interval);
542 }
543
544 misc_register(&mce_log_device);
545 return err;
546
547}
548device_initcall(mce_init_device);
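
The slot reservation in mce_log() above is the core of the lockless logging scheme: writers race on mcelog.next with cmpxchg and simply drop entries once the ring is full. The stand-alone userspace sketch below shows only that reservation step, using GCC's __atomic builtins in place of the kernel's cmpxchg and barriers (an assumption made purely for illustration):

#include <stdio.h>

#define LOG_LEN 32

struct rec { int finished; int payload; };

static struct rec ring[LOG_LEN];
static unsigned   next_slot;

static int reserve_slot(void)
{
	unsigned entry, next;

	for (;;) {
		entry = __atomic_load_n(&next_slot, __ATOMIC_ACQUIRE);
		if (entry >= LOG_LEN)
			return -1;		/* buffer full: drop the record */
		if (ring[entry].finished)
			continue;		/* stale leftover slot, retry */
		next = entry + 1;
		if (__atomic_compare_exchange_n(&next_slot, &entry, next, 0,
						__ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE))
			return (int)entry;	/* we own this slot now */
	}
}

int main(void)
{
	int slot = reserve_slot();

	if (slot >= 0) {
		ring[slot].payload = 42;
		/* Publish the record only after it is fully written. */
		__atomic_store_n(&ring[slot].finished, 1, __ATOMIC_RELEASE);
	}
	printf("reserved slot %d\n", slot);
	return 0;
}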
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
new file mode 100644
index 000000000000..4db9a640069f
--- /dev/null
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -0,0 +1,99 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 */
5
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/percpu.h>
9#include <asm/processor.h>
10#include <asm/msr.h>
11#include <asm/mce.h>
12#include <asm/hw_irq.h>
13
14static DEFINE_PER_CPU(unsigned long, next_check);
15
16asmlinkage void smp_thermal_interrupt(void)
17{
18 struct mce m;
19
20 ack_APIC_irq();
21
22 irq_enter();
23 if (time_before(jiffies, __get_cpu_var(next_check)))
24 goto done;
25
26 __get_cpu_var(next_check) = jiffies + HZ*300;
27 memset(&m, 0, sizeof(m));
28 m.cpu = smp_processor_id();
29 m.bank = MCE_THERMAL_BANK;
30 rdtscll(m.tsc);
31 rdmsrl(MSR_IA32_THERM_STATUS, m.status);
32 if (m.status & 0x1) {
33 printk(KERN_EMERG
34 "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
35 add_taint(TAINT_MACHINE_CHECK);
36 } else {
37 printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
38 }
39
40 mce_log(&m);
41done:
42 irq_exit();
43}
44
45static void __init intel_init_thermal(struct cpuinfo_x86 *c)
46{
47 u32 l, h;
48 int tm2 = 0;
49 unsigned int cpu = smp_processor_id();
50
51 if (!cpu_has(c, X86_FEATURE_ACPI))
52 return;
53
54 if (!cpu_has(c, X86_FEATURE_ACC))
55 return;
56
57 /* first check if TM1 is already enabled by the BIOS, in which
58 * case there might be some SMM goo which handles it, so we can't even
59 * put a handler since it might be delivered via SMI already.
60 */
61 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
62 h = apic_read(APIC_LVTTHMR);
63 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
64 printk(KERN_DEBUG
65 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
66 return;
67 }
68
69 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
70 tm2 = 1;
71
72 if (h & APIC_VECTOR_MASK) {
73 printk(KERN_DEBUG
74 "CPU%d: Thermal LVT vector (%#x) already "
75 "installed\n", cpu, (h & APIC_VECTOR_MASK));
76 return;
77 }
78
79 h = THERMAL_APIC_VECTOR;
80 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
81 apic_write_around(APIC_LVTTHMR, h);
82
83 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
84 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
85
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
88
89 l = apic_read(APIC_LVTTHMR);
90 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
91 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
92 cpu, tm2 ? "TM2" : "TM1");
93 return;
94}
95
96void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
97{
98 intel_init_thermal(c);
99}
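
smp_thermal_interrupt() above reads IA32_THERM_STATUS with rdmsrl() and keys off bit 0. Outside the kernel the same register can be inspected through the msr character device; this sketch assumes a /dev/cpu/0/msr node is present and that 0x19c is the correct MSR number on the running CPU:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t status;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* The msr driver uses the file offset as the MSR number. */
	if (pread(fd, &status, sizeof(status), 0x19c) != sizeof(status)) {
		perror("read IA32_THERM_STATUS");
		close(fd);
		return 1;
	}
	close(fd);

	printf("thermal status %#llx, throttled: %s\n",
	       (unsigned long long)status, (status & 1) ? "yes" : "no");
	return 0;
}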
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
new file mode 100644
index 000000000000..c2ffea8845ed
--- /dev/null
+++ b/arch/x86_64/kernel/module.c
@@ -0,0 +1,166 @@
1/* Kernel module help for x86-64
2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/
19#include <linux/moduleloader.h>
20#include <linux/elf.h>
21#include <linux/vmalloc.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26
27#include <asm/system.h>
28#include <asm/page.h>
29#include <asm/pgtable.h>
30
31#define DEBUGP(fmt...)
32
33void module_free(struct module *mod, void *module_region)
34{
35 vfree(module_region);
36}
37
38void *module_alloc(unsigned long size)
39{
40 struct vm_struct *area;
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL;
47
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
49 if (!area)
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
53}
54
55/* We don't need anything special. */
56int module_frob_arch_sections(Elf_Ehdr *hdr,
57 Elf_Shdr *sechdrs,
58 char *secstrings,
59 struct module *mod)
60{
61 return 0;
62}
63
64int apply_relocate_add(Elf64_Shdr *sechdrs,
65 const char *strtab,
66 unsigned int symindex,
67 unsigned int relsec,
68 struct module *me)
69{
70 unsigned int i;
71 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
72 Elf64_Sym *sym;
73 void *loc;
74 u64 val;
75
76 DEBUGP("Applying relocate section %u to %u\n", relsec,
77 sechdrs[relsec].sh_info);
78 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
79 /* This is where to make the change */
80 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
81 + rel[i].r_offset;
82
83 /* This is the symbol it is referring to. Note that all
84 undefined symbols have been resolved. */
85 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
86 + ELF64_R_SYM(rel[i].r_info);
87
88 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
89 (int)ELF64_R_TYPE(rel[i].r_info),
90 sym->st_value, rel[i].r_addend, (u64)loc);
91
92 val = sym->st_value + rel[i].r_addend;
93
94 switch (ELF64_R_TYPE(rel[i].r_info)) {
95 case R_X86_64_NONE:
96 break;
97 case R_X86_64_64:
98 *(u64 *)loc = val;
99 break;
100 case R_X86_64_32:
101 *(u32 *)loc = val;
102 if (val != *(u32 *)loc)
103 goto overflow;
104 break;
105 case R_X86_64_32S:
106 *(s32 *)loc = val;
107 if ((s64)val != *(s32 *)loc)
108 goto overflow;
109 break;
110 case R_X86_64_PC32:
111 val -= (u64)loc;
112 *(u32 *)loc = val;
113#if 0
114 if ((s64)val != *(s32 *)loc)
115 goto overflow;
116#endif
117 break;
118 default:
119 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
120 me->name, ELF64_R_TYPE(rel[i].r_info));
121 return -ENOEXEC;
122 }
123 }
124 return 0;
125
126overflow:
127 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
128 (int)ELF64_R_TYPE(rel[i].r_info), val);
129 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
130 me->name);
131 return -ENOEXEC;
132}
133
134int apply_relocate(Elf_Shdr *sechdrs,
135 const char *strtab,
136 unsigned int symindex,
137 unsigned int relsec,
138 struct module *me)
139{
140 printk("non add relocation not supported\n");
141 return -ENOSYS;
142}
143
144extern void apply_alternatives(void *start, void *end);
145
146int module_finalize(const Elf_Ehdr *hdr,
147 const Elf_Shdr *sechdrs,
148 struct module *me)
149{
150 const Elf_Shdr *s;
151 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
152
153 /* look for .altinstructions to patch */
154 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
155 void *seg;
156 if (strcmp(".altinstructions", secstrings + s->sh_name))
157 continue;
158 seg = (void *)s->sh_addr;
159 apply_alternatives(seg, seg + s->sh_size);
160 }
161 return 0;
162}
163
164void module_arch_cleanup(struct module *mod)
165{
166}
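
apply_relocate_add() above boils down to computing value = S + A for each relocation and storing it in the width the relocation type demands, subtracting the place P first in the PC-relative case. A small worked example of the R_X86_64_PC32 arithmetic, with made-up addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sym_value = 0xffffffff8810a000ULL;	/* S: resolved symbol value   */
	int64_t  addend    = -4;			/* A: rel[i].r_addend         */
	uint64_t place     = 0xffffffff88000100ULL;	/* P: address being patched   */

	/* R_X86_64_PC32: value = S + A - P, stored as a 32-bit quantity. */
	uint64_t val    = sym_value + addend - place;
	int32_t  stored = (int32_t)val;			/* what ends up in *(u32 *)loc */

	printf("pc-relative value = %d (fits in 32 bits: %s)\n", stored,
	       ((int64_t)stored == (int64_t)val) ? "yes" : "no");
	return 0;
}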
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
new file mode 100644
index 000000000000..7ec031c6ca10
--- /dev/null
+++ b/arch/x86_64/kernel/mpparse.c
@@ -0,0 +1,949 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/irq.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/config.h>
21#include <linux/bootmem.h>
22#include <linux/smp_lock.h>
23#include <linux/kernel_stat.h>
24#include <linux/mc146818rtc.h>
25#include <linux/acpi.h>
26
27#include <asm/smp.h>
28#include <asm/mtrr.h>
29#include <asm/mpspec.h>
30#include <asm/pgalloc.h>
31#include <asm/io_apic.h>
32#include <asm/proto.h>
33
34/* Have we found an MP table */
35int smp_found_config;
36unsigned int __initdata maxcpus = NR_CPUS;
37
38int acpi_found_madt;
39
40/*
41 * Various Linux-internal data structures created from the
42 * MP-table.
43 */
44int apic_version [MAX_APICS];
45unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
46int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
47cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL };
48
49static int mp_current_pci_id = 0;
50/* I/O APIC entries */
51struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
52
53/* # of MP IRQ source entries */
54struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
55
56/* MP IRQ source entries */
57int mp_irq_entries;
58
59int nr_ioapics;
60int pic_mode;
61unsigned long mp_lapic_addr = 0;
62
63
64
65/* Processor that is doing the boot up */
66unsigned int boot_cpu_id = -1U;
67/* Internal processor count */
68static unsigned int num_processors = 0;
69
70/* Bitmask of physically existing CPUs */
71physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
72
73/* ACPI MADT entry parsing functions */
74#ifdef CONFIG_ACPI_BOOT
75extern struct acpi_boot_flags acpi_boot;
76#ifdef CONFIG_X86_LOCAL_APIC
77extern int acpi_parse_lapic (acpi_table_entry_header *header);
78extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
79extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
80#endif /*CONFIG_X86_LOCAL_APIC*/
81#ifdef CONFIG_X86_IO_APIC
82extern int acpi_parse_ioapic (acpi_table_entry_header *header);
83#endif /*CONFIG_X86_IO_APIC*/
84#endif /*CONFIG_ACPI_BOOT*/
85
86u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
87
88
89/*
90 * Intel MP BIOS table parsing routines:
91 */
92
93/*
94 * Checksum an MP configuration block.
95 */
96
97static int __init mpf_checksum(unsigned char *mp, int len)
98{
99 int sum = 0;
100
101 while (len--)
102 sum += *mp++;
103
104 return sum & 0xFF;
105}
106
107static void __init MP_processor_info (struct mpc_config_processor *m)
108{
109 int ver;
110
111 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return;
113
114 printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
115 m->mpc_apicid,
116 (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
117 (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
118 m->mpc_apicver);
119
120 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
121 Dprintk(" Bootup CPU\n");
122 boot_cpu_id = m->mpc_apicid;
123 }
124 if (num_processors >= NR_CPUS) {
125 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
126 " Processor ignored.\n", NR_CPUS);
127 return;
128 }
129 if (num_processors >= maxcpus) {
130 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
131 " Processor ignored.\n", maxcpus);
132 return;
133 }
134
135 num_processors++;
136
137 if (m->mpc_apicid > MAX_APICS) {
138 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
139 m->mpc_apicid, MAX_APICS);
140 return;
141 }
142 ver = m->mpc_apicver;
143
144 physid_set(m->mpc_apicid, phys_cpu_present_map);
145 /*
146 * Validate version
147 */
148 if (ver == 0x0) {
149 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
150 ver = 0x10;
151 }
152 apic_version[m->mpc_apicid] = ver;
153 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
154}
155
156static void __init MP_bus_info (struct mpc_config_bus *m)
157{
158 char str[7];
159
160 memcpy(str, m->mpc_bustype, 6);
161 str[6] = 0;
162 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
163
164 if (strncmp(str, "ISA", 3) == 0) {
165 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
166 } else if (strncmp(str, "EISA", 4) == 0) {
167 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
168 } else if (strncmp(str, "PCI", 3) == 0) {
169 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
170 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
171 mp_current_pci_id++;
172 } else if (strncmp(str, "MCA", 3) == 0) {
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
174 } else {
175 printk(KERN_ERR "Unknown bustype %s\n", str);
176 }
177}
178
179static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
180{
181 if (!(m->mpc_flags & MPC_APIC_USABLE))
182 return;
183
184 printk("I/O APIC #%d Version %d at 0x%X.\n",
185 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
186 if (nr_ioapics >= MAX_IO_APICS) {
187 printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
188 MAX_IO_APICS, nr_ioapics);
189 panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
190 }
191 if (!m->mpc_apicaddr) {
192 printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
193 " found in MP table, skipping!\n");
194 return;
195 }
196 mp_ioapics[nr_ioapics] = *m;
197 nr_ioapics++;
198}
199
200static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
201{
202 mp_irqs [mp_irq_entries] = *m;
203 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
204 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
205 m->mpc_irqtype, m->mpc_irqflag & 3,
206 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
207 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
208 if (++mp_irq_entries == MAX_IRQ_SOURCES)
209 panic("Max # of irq sources exceeded!!\n");
210}
211
212static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
213{
214 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
215 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
216 m->mpc_irqtype, m->mpc_irqflag & 3,
217 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
218 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
219 /*
220 * Well it seems all SMP boards in existence
221 * use ExtINT/LVT1 == LINT0 and
222 * NMI/LVT2 == LINT1 - the following check
223 * will show us if this assumption is false.
224 * Until then we do not have to add baggage.
225 */
226 if ((m->mpc_irqtype == mp_ExtINT) &&
227 (m->mpc_destapiclint != 0))
228 BUG();
229 if ((m->mpc_irqtype == mp_NMI) &&
230 (m->mpc_destapiclint != 1))
231 BUG();
232}
233
234/*
235 * Read/parse the MPC
236 */
237
238static int __init smp_read_mpc(struct mp_config_table *mpc)
239{
240 char str[16];
241 int count=sizeof(*mpc);
242 unsigned char *mpt=((unsigned char *)mpc)+count;
243
244 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
245 printk("SMP mptable: bad signature [%c%c%c%c]!\n",
246 mpc->mpc_signature[0],
247 mpc->mpc_signature[1],
248 mpc->mpc_signature[2],
249 mpc->mpc_signature[3]);
250 return 0;
251 }
252 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
253 printk("SMP mptable: checksum error!\n");
254 return 0;
255 }
256 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
257 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
258 mpc->mpc_spec);
259 return 0;
260 }
261 if (!mpc->mpc_lapic) {
262 printk(KERN_ERR "SMP mptable: null local APIC address!\n");
263 return 0;
264 }
265 memcpy(str,mpc->mpc_oem,8);
266 str[8]=0;
267 printk(KERN_INFO "OEM ID: %s ",str);
268
269 memcpy(str,mpc->mpc_productid,12);
270 str[12]=0;
271 printk(KERN_INFO "Product ID: %s ",str);
272
273 printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic);
274
275 /* save the local APIC address, it might be non-default */
276 if (!acpi_lapic)
277 mp_lapic_addr = mpc->mpc_lapic;
278
279 /*
280 * Now process the configuration blocks.
281 */
282 while (count < mpc->mpc_length) {
283 switch(*mpt) {
284 case MP_PROCESSOR:
285 {
286 struct mpc_config_processor *m=
287 (struct mpc_config_processor *)mpt;
288 if (!acpi_lapic)
289 MP_processor_info(m);
290 mpt += sizeof(*m);
291 count += sizeof(*m);
292 break;
293 }
294 case MP_BUS:
295 {
296 struct mpc_config_bus *m=
297 (struct mpc_config_bus *)mpt;
298 MP_bus_info(m);
299 mpt += sizeof(*m);
300 count += sizeof(*m);
301 break;
302 }
303 case MP_IOAPIC:
304 {
305 struct mpc_config_ioapic *m=
306 (struct mpc_config_ioapic *)mpt;
307 MP_ioapic_info(m);
308 mpt+=sizeof(*m);
309 count+=sizeof(*m);
310 break;
311 }
312 case MP_INTSRC:
313 {
314 struct mpc_config_intsrc *m=
315 (struct mpc_config_intsrc *)mpt;
316
317 MP_intsrc_info(m);
318 mpt+=sizeof(*m);
319 count+=sizeof(*m);
320 break;
321 }
322 case MP_LINTSRC:
323 {
324 struct mpc_config_lintsrc *m=
325 (struct mpc_config_lintsrc *)mpt;
326 MP_lintsrc_info(m);
327 mpt+=sizeof(*m);
328 count+=sizeof(*m);
329 break;
330 }
331 }
332 }
333 clustered_apic_check();
334 if (!num_processors)
335 printk(KERN_ERR "SMP mptable: no processors registered!\n");
336 return num_processors;
337}
338
339static int __init ELCR_trigger(unsigned int irq)
340{
341 unsigned int port;
342
343 port = 0x4d0 + (irq >> 3);
344 return (inb(port) >> (irq & 7)) & 1;
345}
346
347static void __init construct_default_ioirq_mptable(int mpc_default_type)
348{
349 struct mpc_config_intsrc intsrc;
350 int i;
351 int ELCR_fallback = 0;
352
353 intsrc.mpc_type = MP_INTSRC;
354 intsrc.mpc_irqflag = 0; /* conforming */
355 intsrc.mpc_srcbus = 0;
356 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
357
358 intsrc.mpc_irqtype = mp_INT;
359
360 /*
361 * If true, we have an ISA/PCI system with no IRQ entries
362 * in the MP table. To prevent the PCI interrupts from being set up
363 * incorrectly, we try to use the ELCR. The sanity check to see if
364 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
365 * never be level sensitive, so we simply see if the ELCR agrees.
366 * If it does, we assume it's valid.
367 */
368 if (mpc_default_type == 5) {
369 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
370
371 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
372 printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
373 else {
374 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
375 ELCR_fallback = 1;
376 }
377 }
378
379 for (i = 0; i < 16; i++) {
380 switch (mpc_default_type) {
381 case 2:
382 if (i == 0 || i == 13)
383 continue; /* IRQ0 & IRQ13 not connected */
384 /* fall through */
385 default:
386 if (i == 2)
387 continue; /* IRQ2 is never connected */
388 }
389
390 if (ELCR_fallback) {
391 /*
392 * If the ELCR indicates a level-sensitive interrupt, we
393 * copy that information over to the MP table in the
394 * irqflag field (level sensitive, active high polarity).
395 */
396 if (ELCR_trigger(i))
397 intsrc.mpc_irqflag = 13;
398 else
399 intsrc.mpc_irqflag = 0;
400 }
401
402 intsrc.mpc_srcbusirq = i;
403 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
404 MP_intsrc_info(&intsrc);
405 }
406
407 intsrc.mpc_irqtype = mp_ExtINT;
408 intsrc.mpc_srcbusirq = 0;
409 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
410 MP_intsrc_info(&intsrc);
411}
412
413static inline void __init construct_default_ISA_mptable(int mpc_default_type)
414{
415 struct mpc_config_processor processor;
416 struct mpc_config_bus bus;
417 struct mpc_config_ioapic ioapic;
418 struct mpc_config_lintsrc lintsrc;
419 int linttypes[2] = { mp_ExtINT, mp_NMI };
420 int i;
421
422 /*
423 * local APIC has default address
424 */
425 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
426
427 /*
428 * 2 CPUs, numbered 0 & 1.
429 */
430 processor.mpc_type = MP_PROCESSOR;
431 /* Either an integrated APIC or a discrete 82489DX. */
432 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
433 processor.mpc_cpuflag = CPU_ENABLED;
434 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
435 (boot_cpu_data.x86_model << 4) |
436 boot_cpu_data.x86_mask;
437 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
438 processor.mpc_reserved[0] = 0;
439 processor.mpc_reserved[1] = 0;
440 for (i = 0; i < 2; i++) {
441 processor.mpc_apicid = i;
442 MP_processor_info(&processor);
443 }
444
445 bus.mpc_type = MP_BUS;
446 bus.mpc_busid = 0;
447 switch (mpc_default_type) {
448 default:
449 printk(KERN_ERR "???\nUnknown standard configuration %d\n",
450 mpc_default_type);
451 /* fall through */
452 case 1:
453 case 5:
454 memcpy(bus.mpc_bustype, "ISA ", 6);
455 break;
456 case 2:
457 case 6:
458 case 3:
459 memcpy(bus.mpc_bustype, "EISA ", 6);
460 break;
461 case 4:
462 case 7:
463 memcpy(bus.mpc_bustype, "MCA ", 6);
464 }
465 MP_bus_info(&bus);
466 if (mpc_default_type > 4) {
467 bus.mpc_busid = 1;
468 memcpy(bus.mpc_bustype, "PCI ", 6);
469 MP_bus_info(&bus);
470 }
471
472 ioapic.mpc_type = MP_IOAPIC;
473 ioapic.mpc_apicid = 2;
474 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
475 ioapic.mpc_flags = MPC_APIC_USABLE;
476 ioapic.mpc_apicaddr = 0xFEC00000;
477 MP_ioapic_info(&ioapic);
478
479 /*
480 * We set up most of the low 16 IO-APIC pins according to MPS rules.
481 */
482 construct_default_ioirq_mptable(mpc_default_type);
483
484 lintsrc.mpc_type = MP_LINTSRC;
485 lintsrc.mpc_irqflag = 0; /* conforming */
486 lintsrc.mpc_srcbusid = 0;
487 lintsrc.mpc_srcbusirq = 0;
488 lintsrc.mpc_destapic = MP_APIC_ALL;
489 for (i = 0; i < 2; i++) {
490 lintsrc.mpc_irqtype = linttypes[i];
491 lintsrc.mpc_destapiclint = i;
492 MP_lintsrc_info(&lintsrc);
493 }
494}
495
496static struct intel_mp_floating *mpf_found;
497
498/*
499 * Scan the memory blocks for an SMP configuration block.
500 */
501void __init get_smp_config (void)
502{
503 struct intel_mp_floating *mpf = mpf_found;
504
505 /*
506 * ACPI may be used to obtain the entire SMP configuration or just to
507 * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that
508 * ACPI supports both logical (e.g. Hyper-Threading) and physical
509 * processors, where MPS only supports physical.
510 */
511 if (acpi_lapic && acpi_ioapic) {
512 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
513 return;
514 }
515 else if (acpi_lapic)
516 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
517
518 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
519 if (mpf->mpf_feature2 & (1<<7)) {
520 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
521 pic_mode = 1;
522 } else {
523 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
524 pic_mode = 0;
525 }
526
527 /*
528 * Now see if we need to read further.
529 */
530 if (mpf->mpf_feature1 != 0) {
531
532 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
533 construct_default_ISA_mptable(mpf->mpf_feature1);
534
535 } else if (mpf->mpf_physptr) {
536
537 /*
538 * Read the physical hardware table. Anything here will
539 * override the defaults.
540 */
541 if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) {
542 smp_found_config = 0;
543 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
544 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
545 return;
546 }
547 /*
548 * If there are no explicit MP IRQ entries, then we are
549 * broken. We set up most of the low 16 IO-APIC pins to
550 * ISA defaults and hope it will work.
551 */
552 if (!mp_irq_entries) {
553 struct mpc_config_bus bus;
554
555 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
556
557 bus.mpc_type = MP_BUS;
558 bus.mpc_busid = 0;
559 memcpy(bus.mpc_bustype, "ISA ", 6);
560 MP_bus_info(&bus);
561
562 construct_default_ioirq_mptable(0);
563 }
564
565 } else
566 BUG();
567
568 printk(KERN_INFO "Processors: %d\n", num_processors);
569 /*
570 * Only use the first configuration found.
571 */
572}
573
574static int __init smp_scan_config (unsigned long base, unsigned long length)
575{
576 extern void __bad_mpf_size(void);
577 unsigned int *bp = phys_to_virt(base);
578 struct intel_mp_floating *mpf;
579
580 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
581 if (sizeof(*mpf) != 16)
582 __bad_mpf_size();
583
584 while (length > 0) {
585 mpf = (struct intel_mp_floating *)bp;
586 if ((*bp == SMP_MAGIC_IDENT) &&
587 (mpf->mpf_length == 1) &&
588 !mpf_checksum((unsigned char *)bp, 16) &&
589 ((mpf->mpf_specification == 1)
590 || (mpf->mpf_specification == 4)) ) {
591
592 smp_found_config = 1;
593 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
594 if (mpf->mpf_physptr)
595 reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
596 mpf_found = mpf;
597 return 1;
598 }
599 bp += 4;
600 length -= 16;
601 }
602 return 0;
603}
604
605void __init find_intel_smp (void)
606{
607 unsigned int address;
608
609 /*
610 * FIXME: Linux assumes you have 640K of base ram..
611 * this continues the error...
612 *
613 * 1) Scan the bottom 1K for a signature
614 * 2) Scan the top 1K of base RAM
615 * 3) Scan the 64K of bios
616 */
617 if (smp_scan_config(0x0,0x400) ||
618 smp_scan_config(639*0x400,0x400) ||
619 smp_scan_config(0xF0000,0x10000))
620 return;
621 /*
622 * If it is an SMP machine we should know now, unless the
623 * configuration is in an EISA/MCA bus machine with an
624 * extended bios data area.
625 *
626 * there is a real-mode segmented pointer pointing to the
627 * 4K EBDA area at 0x40E, calculate and scan it here.
628 *
629 * NOTE! There are Linux loaders that will corrupt the EBDA
630 * area, and as such this kind of SMP config may be less
631 * trustworthy, simply because the SMP table may have been
632 * stomped on during early boot. These loaders are buggy and
633 * should be fixed.
634 */
635
636 address = *(unsigned short *)phys_to_virt(0x40E);
637 address <<= 4;
638 if (smp_scan_config(address, 0x1000))
639 return;
640
641 /* If we have come this far, we did not find an MP table */
642 printk(KERN_INFO "No mptable found.\n");
643}
644
645/*
646 * - Intel MP Configuration Table
647 */
648void __init find_smp_config (void)
649{
650#ifdef CONFIG_X86_LOCAL_APIC
651 find_intel_smp();
652#endif
653}
654
655
656/* --------------------------------------------------------------------------
657 ACPI-based MP Configuration
658 -------------------------------------------------------------------------- */
659
660#ifdef CONFIG_ACPI_BOOT
661
662void __init mp_register_lapic_address (
663 u64 address)
664{
665 mp_lapic_addr = (unsigned long) address;
666
667 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
668
669 if (boot_cpu_id == -1U)
670 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
671
672 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
673}
674
675
676void __init mp_register_lapic (
677 u8 id,
678 u8 enabled)
679{
680 struct mpc_config_processor processor;
681 int boot_cpu = 0;
682
683 if (id >= MAX_APICS) {
684 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
685 id, MAX_APICS);
686 return;
687 }
688
689 if (id == boot_cpu_physical_apicid)
690 boot_cpu = 1;
691
692 processor.mpc_type = MP_PROCESSOR;
693 processor.mpc_apicid = id;
694 processor.mpc_apicver = 0x10; /* TBD: lapic version */
695 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
696 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
697 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
698 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
699 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
700 processor.mpc_reserved[0] = 0;
701 processor.mpc_reserved[1] = 0;
702
703 MP_processor_info(&processor);
704}
705
706#ifdef CONFIG_X86_IO_APIC
707
708#define MP_ISA_BUS 0
709#define MP_MAX_IOAPIC_PIN 127
710
711static struct mp_ioapic_routing {
712 int apic_id;
713 int gsi_start;
714 int gsi_end;
715 u32 pin_programmed[4];
716} mp_ioapic_routing[MAX_IO_APICS];
717
718
719static int mp_find_ioapic (
720 int gsi)
721{
722 int i = 0;
723
724 /* Find the IOAPIC that manages this GSI. */
725 for (i = 0; i < nr_ioapics; i++) {
726 if ((gsi >= mp_ioapic_routing[i].gsi_start)
727 && (gsi <= mp_ioapic_routing[i].gsi_end))
728 return i;
729 }
730
731 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
732
733 return -1;
734}
735
736
737void __init mp_register_ioapic (
738 u8 id,
739 u32 address,
740 u32 gsi_base)
741{
742 int idx = 0;
743
744 if (nr_ioapics >= MAX_IO_APICS) {
745 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
746 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
747 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
748 }
749 if (!address) {
750 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
751 " found in MADT table, skipping!\n");
752 return;
753 }
754
755 idx = nr_ioapics++;
756
757 mp_ioapics[idx].mpc_type = MP_IOAPIC;
758 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
759 mp_ioapics[idx].mpc_apicaddr = address;
760
761 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
762 mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
763 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
764
765 /*
766 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
767 * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
768 */
769 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
770 mp_ioapic_routing[idx].gsi_start = gsi_base;
771 mp_ioapic_routing[idx].gsi_end = gsi_base +
772 io_apic_get_redir_entries(idx);
773
774 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
775 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
776 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
777 mp_ioapic_routing[idx].gsi_start,
778 mp_ioapic_routing[idx].gsi_end);
779
780 return;
781}
782
783
784void __init mp_override_legacy_irq (
785 u8 bus_irq,
786 u8 polarity,
787 u8 trigger,
788 u32 gsi)
789{
790 struct mpc_config_intsrc intsrc;
791 int ioapic = -1;
792 int pin = -1;
793
794 /*
795 * Convert 'gsi' to 'ioapic.pin'.
796 */
797 ioapic = mp_find_ioapic(gsi);
798 if (ioapic < 0)
799 return;
800 pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
801
802 /*
803 * TBD: This check is for faulty timer entries, where the override
804 * erroneously sets the trigger to level, resulting in a HUGE
805 * increase of timer interrupts!
806 */
807 if ((bus_irq == 0) && (trigger == 3))
808 trigger = 1;
809
810 intsrc.mpc_type = MP_INTSRC;
811 intsrc.mpc_irqtype = mp_INT;
812 intsrc.mpc_irqflag = (trigger << 2) | polarity;
813 intsrc.mpc_srcbus = MP_ISA_BUS;
814 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
815 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
816 intsrc.mpc_dstirq = pin; /* INTIN# */
817
818 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
819 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
820 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
821 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
822
823 mp_irqs[mp_irq_entries] = intsrc;
824 if (++mp_irq_entries == MAX_IRQ_SOURCES)
825 panic("Max # of irq sources exceeded!\n");
826
827 return;
828}
829
830
831void __init mp_config_acpi_legacy_irqs (void)
832{
833 struct mpc_config_intsrc intsrc;
834 int i = 0;
835 int ioapic = -1;
836
837 /*
838	 * Fabricate the legacy ISA bus (MP_ISA_BUS).
839 */
840 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
841 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
842
843 /*
844 * Locate the IOAPIC that manages the ISA IRQs (0-15).
845 */
846 ioapic = mp_find_ioapic(0);
847 if (ioapic < 0)
848 return;
849
850 intsrc.mpc_type = MP_INTSRC;
851 intsrc.mpc_irqflag = 0; /* Conforming */
852 intsrc.mpc_srcbus = MP_ISA_BUS;
853 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
854
855 /*
856	 * Use the default configuration for IRQs 0-15, unless
857	 * overridden by (MADT) interrupt source override entries.
858 */
859 for (i = 0; i < 16; i++) {
860 int idx;
861
862 for (idx = 0; idx < mp_irq_entries; idx++) {
863 struct mpc_config_intsrc *irq = mp_irqs + idx;
864
865 /* Do we already have a mapping for this ISA IRQ? */
866 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
867 break;
868
869			/* Do we already have a mapping for this IOAPIC pin? */
870 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
871 (irq->mpc_dstirq == i))
872 break;
873 }
874
875 if (idx != mp_irq_entries) {
876 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
877 continue; /* IRQ already used */
878 }
879
880 intsrc.mpc_irqtype = mp_INT;
881 intsrc.mpc_srcbusirq = i; /* Identity mapped */
882 intsrc.mpc_dstirq = i;
883
884 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
885 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
886 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
887 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
888 intsrc.mpc_dstirq);
889
890 mp_irqs[mp_irq_entries] = intsrc;
891 if (++mp_irq_entries == MAX_IRQ_SOURCES)
892 panic("Max # of irq sources exceeded!\n");
893 }
894
895 return;
896}
897
898int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
899{
900 int ioapic = -1;
901 int ioapic_pin = 0;
902 int idx, bit = 0;
903
904 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
905 return gsi;
906
907#ifdef CONFIG_ACPI_BUS
908 /* Don't set up the ACPI SCI because it's already set up */
909 if (acpi_fadt.sci_int == gsi)
910 return gsi;
911#endif
912
913 ioapic = mp_find_ioapic(gsi);
914 if (ioapic < 0) {
915 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
916 return gsi;
917 }
918
919 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
920
921 /*
922 * Avoid pin reprogramming. PRTs typically include entries
923 * with redundant pin->gsi mappings (but unique PCI devices);
924 * we only program the IOAPIC on the first.
925 */
926 bit = ioapic_pin % 32;
927 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
928 if (idx > 3) {
929 printk(KERN_ERR "Invalid reference to IOAPIC pin "
930 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
931 ioapic_pin);
932 return gsi;
933 }
934 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
935 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
936 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
937 return gsi;
938 }
939
940 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
941
942 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
943 edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
944 active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
945 return gsi;
946}
947
948#endif /*CONFIG_X86_IO_APIC*/
949#endif /*CONFIG_ACPI_BOOT*/
diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c
new file mode 100644
index 000000000000..598953ab0154
--- /dev/null
+++ b/arch/x86_64/kernel/msr.c
@@ -0,0 +1,279 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * msr.c
15 *
16 * x86 MSR access device
17 *
18 * This device is accessed by lseek() to the appropriate register number
19 * and then read/write in chunks of 8 bytes. A larger size means multiple
20 * reads or writes of the same register.
21 *
22 * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
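For illustration only (not part of this patch), a minimal user-space sketch of the interface described above; MSR 0x10 (the TSC) and CPU 0 are arbitrary choices, and the program needs whatever privileges the device node requires:

	/* Sketch: read one MSR through /dev/cpu/0/msr in 8-byte chunks. */
	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t val;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		if (fd < 0)
			return 1;
		/* lseek() selects the register number, each read() returns 8 bytes */
		if (lseek(fd, 0x10, SEEK_SET) < 0 || read(fd, &val, 8) != 8) {
			close(fd);
			return 1;
		}
		printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
		close(fd);
		return 0;
	}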
25
26#include <linux/module.h>
27#include <linux/config.h>
28
29#include <linux/types.h>
30#include <linux/errno.h>
31#include <linux/fcntl.h>
32#include <linux/init.h>
33#include <linux/poll.h>
34#include <linux/smp.h>
35#include <linux/smp_lock.h>
36#include <linux/major.h>
37#include <linux/fs.h>
38
39#include <asm/processor.h>
40#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h>
43
44/* Note: "err" is handled in a funny way below. Otherwise one version
45 of gcc or another breaks. */
46
47static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx)
48{
49 int err;
50
51 asm volatile ("1: wrmsr\n"
52 "2:\n"
53 ".section .fixup,\"ax\"\n"
54 "3: movl %4,%0\n"
55 " jmp 2b\n"
56 ".previous\n"
57 ".section __ex_table,\"a\"\n"
58 " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err)
59 :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0));
60
61 return err;
62}
63
64static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
65{
66 int err;
67
68 asm volatile ("1: rdmsr\n"
69 "2:\n"
70 ".section .fixup,\"ax\"\n"
71 "3: movl %4,%0\n"
72 " jmp 2b\n"
73 ".previous\n"
74 ".section __ex_table,\"a\"\n"
75 " .align 8\n"
76 " .quad 1b,3b\n"
77 ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx)
78 :"c"(reg), "i"(-EIO), "0"(0));
79
80 return err;
81}
82
83#ifdef CONFIG_SMP
84
85struct msr_command {
86 int cpu;
87 int err;
88 u32 reg;
89 u32 data[2];
90};
91
92static void msr_smp_wrmsr(void *cmd_block)
93{
94 struct msr_command *cmd = (struct msr_command *)cmd_block;
95
96 if (cmd->cpu == smp_processor_id())
97 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
98}
99
100static void msr_smp_rdmsr(void *cmd_block)
101{
102 struct msr_command *cmd = (struct msr_command *)cmd_block;
103
104 if (cmd->cpu == smp_processor_id())
105 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
106}
107
108static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
109{
110 struct msr_command cmd;
111 int ret;
112
113 preempt_disable();
114 if (cpu == smp_processor_id()) {
115 ret = wrmsr_eio(reg, eax, edx);
116 } else {
117 cmd.cpu = cpu;
118 cmd.reg = reg;
119 cmd.data[0] = eax;
120 cmd.data[1] = edx;
121
122 smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
123 ret = cmd.err;
124 }
125 preempt_enable();
126 return ret;
127}
128
129static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
130{
131 struct msr_command cmd;
132 int ret;
133
134 preempt_disable();
135 if (cpu == smp_processor_id()) {
136 ret = rdmsr_eio(reg, eax, edx);
137 } else {
138 cmd.cpu = cpu;
139 cmd.reg = reg;
140
141 smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
142
143 *eax = cmd.data[0];
144 *edx = cmd.data[1];
145
146 ret = cmd.err;
147 }
148 preempt_enable();
149 return ret;
150}
151
152#else /* ! CONFIG_SMP */
153
154static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
155{
156 return wrmsr_eio(reg, eax, edx);
157}
158
159static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx)
160{
161 return rdmsr_eio(reg, eax, edx);
162}
163
164#endif /* ! CONFIG_SMP */
165
166static loff_t msr_seek(struct file *file, loff_t offset, int orig)
167{
168 loff_t ret = -EINVAL;
169
170 lock_kernel();
171 switch (orig) {
172 case 0:
173 file->f_pos = offset;
174 ret = file->f_pos;
175 break;
176 case 1:
177 file->f_pos += offset;
178 ret = file->f_pos;
179 }
180 unlock_kernel();
181 return ret;
182}
183
184static ssize_t msr_read(struct file *file, char __user * buf,
185 size_t count, loff_t * ppos)
186{
187 u32 __user *tmp = (u32 __user *) buf;
188 u32 data[2];
189 size_t rv;
190 u32 reg = *ppos;
191 int cpu = iminor(file->f_dentry->d_inode);
192 int err;
193
194 if (count % 8)
195 return -EINVAL; /* Invalid chunk size */
196
197 for (rv = 0; count; count -= 8) {
198 err = do_rdmsr(cpu, reg, &data[0], &data[1]);
199 if (err)
200 return err;
201 if (copy_to_user(tmp, &data, 8))
202 return -EFAULT;
203 tmp += 2;
204 }
205
206 return ((char __user *)tmp) - buf;
207}
208
209static ssize_t msr_write(struct file *file, const char __user *buf,
210 size_t count, loff_t *ppos)
211{
212 const u32 __user *tmp = (const u32 __user *)buf;
213 u32 data[2];
214 size_t rv;
215 u32 reg = *ppos;
216 int cpu = iminor(file->f_dentry->d_inode);
217 int err;
218
219 if (count % 8)
220 return -EINVAL; /* Invalid chunk size */
221
222 for (rv = 0; count; count -= 8) {
223 if (copy_from_user(&data, tmp, 8))
224 return -EFAULT;
225 err = do_wrmsr(cpu, reg, data[0], data[1]);
226 if (err)
227 return err;
228 tmp += 2;
229 }
230
231 return ((char __user *)tmp) - buf;
232}
233
234static int msr_open(struct inode *inode, struct file *file)
235{
236 unsigned int cpu = iminor(file->f_dentry->d_inode);
237 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
238
239 if (cpu >= NR_CPUS || !cpu_online(cpu))
240 return -ENXIO; /* No such CPU */
241 if (!cpu_has(c, X86_FEATURE_MSR))
242 return -EIO; /* MSR not supported */
243
244 return 0;
245}
246
247/*
248 * File operations we support
249 */
250static struct file_operations msr_fops = {
251 .owner = THIS_MODULE,
252 .llseek = msr_seek,
253 .read = msr_read,
254 .write = msr_write,
255 .open = msr_open,
256};
257
258static int __init msr_init(void)
259{
260 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
261 printk(KERN_ERR "msr: unable to get major %d for msr\n",
262 MSR_MAJOR);
263 return -EBUSY;
264 }
265
266 return 0;
267}
268
269static void __exit msr_exit(void)
270{
271 unregister_chrdev(MSR_MAJOR, "cpu/msr");
272}
273
274module_init(msr_init);
275module_exit(msr_exit)
276
277MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
278MODULE_DESCRIPTION("x86 generic MSR driver");
279MODULE_LICENSE("GPL");
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
new file mode 100644
index 000000000000..d9867de6a626
--- /dev/null
+++ b/arch/x86_64/kernel/nmi.c
@@ -0,0 +1,488 @@
1/*
2 * linux/arch/x86_64/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Pavel Machek and
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/irq.h>
18#include <linux/delay.h>
19#include <linux/bootmem.h>
20#include <linux/smp_lock.h>
21#include <linux/interrupt.h>
22#include <linux/mc146818rtc.h>
23#include <linux/kernel_stat.h>
24#include <linux/module.h>
25#include <linux/sysdev.h>
26#include <linux/nmi.h>
27#include <linux/sysctl.h>
28
29#include <asm/smp.h>
30#include <asm/mtrr.h>
31#include <asm/mpspec.h>
32#include <asm/nmi.h>
33#include <asm/msr.h>
34#include <asm/proto.h>
35#include <asm/kdebug.h>
36
37/*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
39 * - it may be reserved by some other driver, or not
40 * - when not reserved by some other driver, it may be used for
41 * the NMI watchdog, or not
42 *
43 * This is maintained separately from nmi_active because the NMI
44 * watchdog may also be driven from the I/O APIC timer.
45 */
46static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
47static unsigned int lapic_nmi_owner;
48#define LAPIC_NMI_WATCHDOG (1<<0)
49#define LAPIC_NMI_RESERVED (1<<1)
50
51/* nmi_active:
52 * +1: the lapic NMI watchdog is active, but can be disabled
53 * 0: the lapic NMI watchdog has not been set up, and cannot
54 * be enabled
55 * -1: the lapic NMI watchdog is disabled, but can be enabled
56 */
57int nmi_active; /* oprofile uses this */
58int panic_on_timeout;
59
60unsigned int nmi_watchdog = NMI_DEFAULT;
61static unsigned int nmi_hz = HZ;
62unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
63
64/* Note that these events don't tick when the CPU idles. This means
65 the frequency varies with CPU load. */
66
67#define K7_EVNTSEL_ENABLE (1 << 22)
68#define K7_EVNTSEL_INT (1 << 20)
69#define K7_EVNTSEL_OS (1 << 17)
70#define K7_EVNTSEL_USR (1 << 16)
71#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
72#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
73
74#define P6_EVNTSEL0_ENABLE (1 << 22)
75#define P6_EVNTSEL_INT (1 << 20)
76#define P6_EVNTSEL_OS (1 << 17)
77#define P6_EVNTSEL_USR (1 << 16)
78#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
79#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
80
81/* Runs after command-line parsing and cpu_init(), but before all other checks */
82void __init nmi_watchdog_default(void)
83{
84 if (nmi_watchdog != NMI_DEFAULT)
85 return;
86
87 /* For some reason the IO APIC watchdog doesn't work on the AMD
88 8111 chipset. For now switch to local APIC mode using
89 perfctr0 there. On Intel CPUs we don't have code to handle
90 the perfctr and the IO-APIC seems to work, so use that. */
91
92 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
93 nmi_watchdog = NMI_LOCAL_APIC;
94 printk(KERN_INFO
95 "Using local APIC NMI watchdog using perfctr0\n");
96 } else {
97 printk(KERN_INFO "Using IO APIC NMI watchdog\n");
98 nmi_watchdog = NMI_IO_APIC;
99 }
100}
101
102/* Why is there no CPUID flag for this? */
103static __init int cpu_has_lapic(void)
104{
105 switch (boot_cpu_data.x86_vendor) {
106 case X86_VENDOR_INTEL:
107 case X86_VENDOR_AMD:
108 return boot_cpu_data.x86 >= 6;
109 /* .... add more cpus here or find a different way to figure this out. */
110 default:
111 return 0;
112 }
113}
114
115int __init check_nmi_watchdog (void)
116{
117 int counts[NR_CPUS];
118 int cpu;
119
120 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) {
121 nmi_watchdog = NMI_NONE;
122 return -1;
123 }
124
125 printk(KERN_INFO "testing NMI watchdog ... ");
126
127 for (cpu = 0; cpu < NR_CPUS; cpu++)
128 counts[cpu] = cpu_pda[cpu].__nmi_count;
129 local_irq_enable();
130 mdelay((10*1000)/nmi_hz); // wait 10 ticks
131
132 for (cpu = 0; cpu < NR_CPUS; cpu++) {
133#ifdef CONFIG_SMP
134 /* Check cpu_callin_map here because that is set
135 after the timer is started. */
136 if (!cpu_isset(cpu, cpu_callin_map))
137 continue;
138#endif
139 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
140 printk("CPU#%d: NMI appears to be stuck (%d)!\n",
141 cpu,
142 cpu_pda[cpu].__nmi_count);
143 nmi_active = 0;
144 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
145 return -1;
146 }
147 }
148 printk("OK.\n");
149
150 /* now that we know it works we can reduce NMI frequency to
151 something more reasonable; makes a difference in some configs */
152 if (nmi_watchdog == NMI_LOCAL_APIC)
153 nmi_hz = 1;
154
155 return 0;
156}
157
158int __init setup_nmi_watchdog(char *str)
159{
160 int nmi;
161
162 if (!strncmp(str,"panic",5)) {
163 panic_on_timeout = 1;
164 str = strchr(str, ',');
165 if (!str)
166 return 1;
167 ++str;
168 }
169
170 get_option(&str, &nmi);
171
172 if (nmi >= NMI_INVALID)
173 return 0;
174 nmi_watchdog = nmi;
175 return 1;
176}
177
178__setup("nmi_watchdog=", setup_nmi_watchdog);
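Usage sketch, assuming the usual <asm/nmi.h> encodings NMI_IO_APIC == 1 and NMI_LOCAL_APIC == 2 (they are not spelled out in this patch): booting with nmi_watchdog=2 selects the local APIC watchdog, and nmi_watchdog=panic,2 additionally sets panic_on_timeout so a detected lockup panics the machine, matching the parsing above.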
179
180static void disable_lapic_nmi_watchdog(void)
181{
182 if (nmi_active <= 0)
183 return;
184 switch (boot_cpu_data.x86_vendor) {
185 case X86_VENDOR_AMD:
186 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
187 break;
188 case X86_VENDOR_INTEL:
189 wrmsr(MSR_IA32_EVNTSEL0, 0, 0);
190 break;
191 }
192 nmi_active = -1;
193 /* tell do_nmi() and others that we're not active any more */
194 nmi_watchdog = 0;
195}
196
197static void enable_lapic_nmi_watchdog(void)
198{
199 if (nmi_active < 0) {
200 nmi_watchdog = NMI_LOCAL_APIC;
201 setup_apic_nmi_watchdog();
202 }
203}
204
205int reserve_lapic_nmi(void)
206{
207 unsigned int old_owner;
208
209 spin_lock(&lapic_nmi_owner_lock);
210 old_owner = lapic_nmi_owner;
211 lapic_nmi_owner |= LAPIC_NMI_RESERVED;
212 spin_unlock(&lapic_nmi_owner_lock);
213 if (old_owner & LAPIC_NMI_RESERVED)
214 return -EBUSY;
215 if (old_owner & LAPIC_NMI_WATCHDOG)
216 disable_lapic_nmi_watchdog();
217 return 0;
218}
219
220void release_lapic_nmi(void)
221{
222 unsigned int new_owner;
223
224 spin_lock(&lapic_nmi_owner_lock);
225 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
226 lapic_nmi_owner = new_owner;
227 spin_unlock(&lapic_nmi_owner_lock);
228 if (new_owner & LAPIC_NMI_WATCHDOG)
229 enable_lapic_nmi_watchdog();
230}
231
232void disable_timer_nmi_watchdog(void)
233{
234 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
235 return;
236
237 disable_irq(0);
238 unset_nmi_callback();
239 nmi_active = -1;
240 nmi_watchdog = NMI_NONE;
241}
242
243void enable_timer_nmi_watchdog(void)
244{
245 if (nmi_active < 0) {
246 nmi_watchdog = NMI_IO_APIC;
247 touch_nmi_watchdog();
248 nmi_active = 1;
249 enable_irq(0);
250 }
251}
252
253#ifdef CONFIG_PM
254
255static int nmi_pm_active; /* nmi_active before suspend */
256
257static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
258{
259 nmi_pm_active = nmi_active;
260 disable_lapic_nmi_watchdog();
261 return 0;
262}
263
264static int lapic_nmi_resume(struct sys_device *dev)
265{
266 if (nmi_pm_active > 0)
267 enable_lapic_nmi_watchdog();
268 return 0;
269}
270
271static struct sysdev_class nmi_sysclass = {
272 set_kset_name("lapic_nmi"),
273 .resume = lapic_nmi_resume,
274 .suspend = lapic_nmi_suspend,
275};
276
277static struct sys_device device_lapic_nmi = {
278 .id = 0,
279 .cls = &nmi_sysclass,
280};
281
282static int __init init_lapic_nmi_sysfs(void)
283{
284 int error;
285
286 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
287 return 0;
288
289 error = sysdev_class_register(&nmi_sysclass);
290 if (!error)
291 error = sysdev_register(&device_lapic_nmi);
292 return error;
293}
294/* must come after the local APIC's device_initcall() */
295late_initcall(init_lapic_nmi_sysfs);
296
297#endif /* CONFIG_PM */
298
299/*
300 * Activate the NMI watchdog via the local APIC.
301 * Original code written by Keith Owens.
302 */
303
304static void setup_k7_watchdog(void)
305{
306 int i;
307 unsigned int evntsel;
308
309 /* No check, so can start with slow frequency */
310 nmi_hz = 1;
311
312 /* XXX should check these in EFER */
313
314 nmi_perfctr_msr = MSR_K7_PERFCTR0;
315
316 for(i = 0; i < 4; ++i) {
317 /* Simulator may not support it */
318 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL))
319 return;
320 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
321 }
322
323 evntsel = K7_EVNTSEL_INT
324 | K7_EVNTSEL_OS
325 | K7_EVNTSEL_USR
326 | K7_NMI_EVENT;
327
328 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
329 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz);
330 apic_write(APIC_LVTPC, APIC_DM_NMI);
331 evntsel |= K7_EVNTSEL_ENABLE;
332 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
333}
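A worked example of the period arithmetic above (illustrative only; the 2.2 GHz figure is hypothetical): the counter is loaded with the negative of the number of events per watchdog period, so it counts up, overflows, and raises an NMI through APIC_LVTPC once per period.

	wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000) / nmi_hz);
	/* with cpu_khz == 2200000 and nmi_hz == 1 this writes -2200000000:
	 * the counter overflows after ~2.2e9 "processor is running" cycles,
	 * i.e. roughly one NMI per second on a 2.2 GHz CPU. */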
334
335void setup_apic_nmi_watchdog(void)
336{
337 switch (boot_cpu_data.x86_vendor) {
338 case X86_VENDOR_AMD:
339 if (boot_cpu_data.x86 < 6)
340 return;
341 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
342 return;
343 setup_k7_watchdog();
344 break;
345 default:
346 return;
347 }
348 lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
349 nmi_active = 1;
350}
351
352/*
353 * the best way to detect whether a CPU has a 'hard lockup' problem
354	 * is to check its local APIC timer IRQ counts. If they are not
355 * changing then that CPU has some problem.
356 *
357 * as these watchdog NMI IRQs are generated on every CPU, we only
358 * have to check the current processor.
359 *
360 * since NMIs don't listen to _any_ locks, we have to be extremely
361 * careful not to rely on unsafe variables. The printk might lock
362 * up though, so we have to break up any console locks first ...
363	 * [if more tty-related locks appear, break them up here too!]
365 */
366
367static unsigned int
368 last_irq_sums [NR_CPUS],
369 alert_counter [NR_CPUS];
370
371void touch_nmi_watchdog (void)
372{
373 int i;
374
375 /*
376 * Just reset the alert counters, (other CPUs might be
377 * spinning on locks we hold):
378 */
379 for (i = 0; i < NR_CPUS; i++)
380 alert_counter[i] = 0;
381}
382
383void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
384{
385 int sum, cpu;
386
387 cpu = safe_smp_processor_id();
388 sum = read_pda(apic_timer_irqs);
389 if (last_irq_sums[cpu] == sum) {
390 /*
391 * Ayiee, looks like this CPU is stuck ...
392 * wait a few IRQs (5 seconds) before doing the oops ...
393 */
394 alert_counter[cpu]++;
395 if (alert_counter[cpu] == 5*nmi_hz) {
396 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
397 == NOTIFY_STOP) {
398 alert_counter[cpu] = 0;
399 return;
400 }
401 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
402 }
403 } else {
404 last_irq_sums[cpu] = sum;
405 alert_counter[cpu] = 0;
406 }
407 if (nmi_perfctr_msr)
408 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
409}
410
411static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
412{
413 return 0;
414}
415
416static nmi_callback_t nmi_callback = dummy_nmi_callback;
417
418asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
419{
420 int cpu = safe_smp_processor_id();
421
422 nmi_enter();
423 add_pda(__nmi_count,1);
424 if (!nmi_callback(regs, cpu))
425 default_do_nmi(regs);
426 nmi_exit();
427}
428
429void set_nmi_callback(nmi_callback_t callback)
430{
431 nmi_callback = callback;
432}
433
434void unset_nmi_callback(void)
435{
436 nmi_callback = dummy_nmi_callback;
437}
438
439#ifdef CONFIG_SYSCTL
440
441static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
442{
443 unsigned char reason = get_nmi_reason();
444 char buf[64];
445
446 if (!(reason & 0xc0)) {
447 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
448 die_nmi(buf,regs);
449 }
450 return 0;
451}
452
453/*
454 * proc handler for /proc/sys/kernel/unknown_nmi_panic
455 */
456int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
457 void __user *buffer, size_t *length, loff_t *ppos)
458{
459 int old_state;
460
461 old_state = unknown_nmi_panic;
462 proc_dointvec(table, write, file, buffer, length, ppos);
463 if (!!old_state == !!unknown_nmi_panic)
464 return 0;
465
466 if (unknown_nmi_panic) {
467 if (reserve_lapic_nmi() < 0) {
468 unknown_nmi_panic = 0;
469 return -EBUSY;
470 } else {
471 set_nmi_callback(unknown_nmi_panic_callback);
472 }
473 } else {
474 release_lapic_nmi();
475 unset_nmi_callback();
476 }
477 return 0;
478}
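Usage sketch: since this is a plain integer sysctl run through proc_dointvec(), writing a non-zero value (for example, echo 1 > /proc/sys/kernel/unknown_nmi_panic) reserves the local APIC NMI and installs the panic callback, while writing 0 releases the NMI and restores the dummy callback, exactly as the branches above do.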
479
480#endif
481
482EXPORT_SYMBOL(nmi_active);
483EXPORT_SYMBOL(nmi_watchdog);
484EXPORT_SYMBOL(reserve_lapic_nmi);
485EXPORT_SYMBOL(release_lapic_nmi);
486EXPORT_SYMBOL(disable_timer_nmi_watchdog);
487EXPORT_SYMBOL(enable_timer_nmi_watchdog);
488EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
new file mode 100644
index 000000000000..cab471cf3edb
--- /dev/null
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -0,0 +1,60 @@
1/*
2 * Dynamic DMA mapping support.
3 */
4
5#include <linux/types.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/pci.h>
9#include <linux/module.h>
10#include <asm/io.h>
11
12/* Map a set of buffers described by scatterlist in streaming
13 * mode for DMA. This is the scatter-gather version of the
14 * pci_map_single interface. Here the scatter-gather list
15 * elements are each tagged with the appropriate dma address
16 * and length. They are obtained via sg_dma_{address,length}(SG).
17 *
18 * NOTE: An implementation may be able to use a smaller number of
19 * DMA address/length pairs than there are SG table elements.
20 * (for example via virtual mapping capabilities)
21 * The routine returns the number of addr/length pairs actually
22 * used, at most nents.
23 *
24 * Device ownership issues as mentioned above for pci_map_single are
25 * the same here.
26 */
27int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
28 int nents, int direction)
29{
30 int i;
31
32 BUG_ON(direction == DMA_NONE);
33 for (i = 0; i < nents; i++ ) {
34 struct scatterlist *s = &sg[i];
35 BUG_ON(!s->page);
36 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
37 s->dma_length = s->length;
38 }
39 return nents;
40}
41
42EXPORT_SYMBOL(dma_map_sg);
43
44/* Unmap a set of streaming mode DMA translations.
45 * Again, cpu read rules concerning calls here are the same as for
46 * pci_unmap_single() above.
47 */
48void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
49 int nents, int dir)
50{
51 int i;
52 for (i = 0; i < nents; i++) {
53 struct scatterlist *s = &sg[i];
54 BUG_ON(s->page == NULL);
55 BUG_ON(s->dma_address == 0);
56 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
57 }
58}
59
60EXPORT_SYMBOL(dma_unmap_sg);
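For context, a hypothetical caller of the two routines above (the function name, device pointer, and debug printk are placeholders, not part of this patch) follows the contract from the comment: program the device with only the first 'used' entries and read the bus addresses back via sg_dma_address()/sg_dma_len():

	/* Sketch of a driver using dma_map_sg()/dma_unmap_sg() as defined above. */
	#include <linux/kernel.h>
	#include <linux/errno.h>
	#include <linux/dma-mapping.h>

	static int example_map_and_program(struct device *dev,
					   struct scatterlist *sg, int nents)
	{
		int used, i;

		used = dma_map_sg(dev, sg, nents, DMA_TO_DEVICE);
		if (!used)
			return -ENOMEM;
		for (i = 0; i < used; i++) {
			dma_addr_t addr = sg_dma_address(&sg[i]);
			unsigned int len = sg_dma_len(&sg[i]);

			/* hand addr/len to the hardware here */
			printk(KERN_DEBUG "seg %d: %#llx+%u\n",
			       i, (unsigned long long)addr, len);
		}
		/* once the device has finished with the buffers: */
		dma_unmap_sg(dev, sg, nents, DMA_TO_DEVICE);
		return 0;
	}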
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
new file mode 100644
index 000000000000..57f35c68aa34
--- /dev/null
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -0,0 +1,980 @@
1/*
2 * Dynamic DMA mapping support for AMD Hammer.
3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
5 * This allows PCI devices that only support 32-bit addresses to be used on
6 * systems with more than 4GB of memory.
7 *
8 * See Documentation/DMA-mapping.txt for the interface specification.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12
13#include <linux/config.h>
14#include <linux/types.h>
15#include <linux/ctype.h>
16#include <linux/agp_backend.h>
17#include <linux/init.h>
18#include <linux/mm.h>
19#include <linux/string.h>
20#include <linux/spinlock.h>
21#include <linux/pci.h>
22#include <linux/module.h>
23#include <linux/topology.h>
24#include <linux/interrupt.h>
25#include <linux/bitops.h>
26#include <asm/atomic.h>
27#include <asm/io.h>
28#include <asm/mtrr.h>
29#include <asm/pgtable.h>
30#include <asm/proto.h>
31#include <asm/cacheflush.h>
32#include <asm/kdebug.h>
33
34dma_addr_t bad_dma_address;
35
36unsigned long iommu_bus_base; /* GART remapping area (physical) */
37static unsigned long iommu_size; /* size of remapping area bytes */
38static unsigned long iommu_pages; /* .. and in pages */
39
40u32 *iommu_gatt_base; /* Remapping table */
41
42int no_iommu;
43static int no_agp;
44#ifdef CONFIG_IOMMU_DEBUG
45int panic_on_overflow = 1;
46int force_iommu = 1;
47#else
48int panic_on_overflow = 0;
49int force_iommu = 0;
50#endif
51int iommu_merge = 1;
52int iommu_sac_force = 0;
53
54/* If this is disabled the IOMMU will use an optimized flushing strategy
55 of only flushing when a mapping is reused. With it set the GART is flushed
56 for every mapping. The problem is that the lazy flush seems to trigger
57 bugs with some popular PCI cards, in particular 3ware (but it has also
58 been seen with QLogic at least). */
59int iommu_fullflush = 1;
60
61/* This tells the BIO block layer to assume merging. Default to off
62 because we cannot guarantee merging later. */
63int iommu_bio_merge = 0;
64
65#define MAX_NB 8
66
67/* Allocation bitmap for the remapping area */
68static DEFINE_SPINLOCK(iommu_bitmap_lock);
69static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
70
71static u32 gart_unmapped_entry;
72
73#define GPTE_VALID 1
74#define GPTE_COHERENT 2
75#define GPTE_ENCODE(x) \
76 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
77#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
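A worked example of the GATT entry layout these macros define (the physical address 0x123456000 is arbitrary): address bits 12-31 stay in place, bits 32-39 are folded into entry bits 4-11, and bits 0-1 carry the valid/coherent flags.

	GPTE_ENCODE(0x123456000)
	    = (0x123456000 & 0xfffff000)	/* 0x23456000: bits 12-31     */
	    | ((0x123456000 >> 32) << 4)	/* 0x10: bits 32-39 into 4-11 */
	    | GPTE_VALID | GPTE_COHERENT	/* 0x3                        */
	    = 0x23456013
	GPTE_DECODE(0x23456013)
	    = (0x23456013 & 0xfffff000) | ((0x23456013 & 0xff0) << 28)
	    = 0x23456000 | 0x100000000
	    = 0x123456000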
78
79#define to_pages(addr,size) \
80 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
81
82#define for_all_nb(dev) \
83 dev = NULL; \
84 while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\
85 if (dev->bus->number == 0 && \
86 (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31))
87
88static struct pci_dev *northbridges[MAX_NB];
89static u32 northbridge_flush_word[MAX_NB];
90
91#define EMERGENCY_PAGES 32 /* = 128KB */
92
93#ifdef CONFIG_AGP
94#define AGPEXTERN extern
95#else
96#define AGPEXTERN
97#endif
98
99/* backdoor interface to AGP driver */
100AGPEXTERN int agp_memory_reserved;
101AGPEXTERN __u32 *agp_gatt_table;
102
103static unsigned long next_bit; /* protected by iommu_bitmap_lock */
104static int need_flush; /* global flush state. set for each gart wrap */
105static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
106 size_t size, int dir, int do_panic);
107
108/* Dummy device used for NULL arguments (normally ISA). A smaller DMA mask
109 would probably be better, but this is bug-to-bug compatible with i386. */
110static struct device fallback_dev = {
111 .bus_id = "fallback device",
112 .coherent_dma_mask = 0xffffffff,
113 .dma_mask = &fallback_dev.coherent_dma_mask,
114};
115
116static unsigned long alloc_iommu(int size)
117{
118 unsigned long offset, flags;
119
120 spin_lock_irqsave(&iommu_bitmap_lock, flags);
121 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
122 if (offset == -1) {
123 need_flush = 1;
124 offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size);
125 }
126 if (offset != -1) {
127 set_bit_string(iommu_gart_bitmap, offset, size);
128 next_bit = offset+size;
129 if (next_bit >= iommu_pages) {
130 next_bit = 0;
131 need_flush = 1;
132 }
133 }
134 if (iommu_fullflush)
135 need_flush = 1;
136 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
137 return offset;
138}
139
140static void free_iommu(unsigned long offset, int size)
141{
142 unsigned long flags;
143 if (size == 1) {
144 clear_bit(offset, iommu_gart_bitmap);
145 return;
146 }
147 spin_lock_irqsave(&iommu_bitmap_lock, flags);
148 __clear_bit_string(iommu_gart_bitmap, offset, size);
149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
150}
151
152/*
153 * Use global flush state to avoid races with multiple flushers.
154 */
155static void flush_gart(struct device *dev)
156{
157 unsigned long flags;
158 int flushed = 0;
159 int i, max;
160
161 spin_lock_irqsave(&iommu_bitmap_lock, flags);
162 if (need_flush) {
163 max = 0;
164 for (i = 0; i < MAX_NB; i++) {
165 if (!northbridges[i])
166 continue;
167 pci_write_config_dword(northbridges[i], 0x9c,
168 northbridge_flush_word[i] | 1);
169 flushed++;
170 max = i;
171 }
172 for (i = 0; i <= max; i++) {
173 u32 w;
174 if (!northbridges[i])
175 continue;
176 /* Make sure the hardware actually executed the flush. */
177 do {
178 pci_read_config_dword(northbridges[i], 0x9c, &w);
179 } while (w & 1);
180 }
181 if (!flushed)
182 printk("nothing to flush?\n");
183 need_flush = 0;
184 }
185 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
186}
187
188/* Allocate DMA memory on node near device */
189noinline
190static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
191{
192 struct page *page;
193 int node;
194 if (dev->bus == &pci_bus_type) {
195 cpumask_t mask;
196 mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
197 node = cpu_to_node(first_cpu(mask));
198 } else
199 node = numa_node_id();
200 page = alloc_pages_node(node, gfp, order);
201 return page ? page_address(page) : NULL;
202}
203
204/*
205 * Allocate memory for a coherent mapping.
206 */
207void *
208dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
209 unsigned gfp)
210{
211 void *memory;
212 unsigned long dma_mask = 0;
213 u64 bus;
214
215 if (!dev)
216 dev = &fallback_dev;
217 dma_mask = dev->coherent_dma_mask;
218 if (dma_mask == 0)
219 dma_mask = 0xffffffff;
220
221 /* Kludge to make it bug-to-bug compatible with i386. i386
222 uses the normal dma_mask for alloc_coherent. */
223 dma_mask &= *dev->dma_mask;
224
225 again:
226 memory = dma_alloc_pages(dev, gfp, get_order(size));
227 if (memory == NULL)
228 return NULL;
229
230 {
231 int high, mmu;
232 bus = virt_to_bus(memory);
233 high = (bus + size) >= dma_mask;
234 mmu = high;
235 if (force_iommu && !(gfp & GFP_DMA))
236 mmu = 1;
237 if (no_iommu || dma_mask < 0xffffffffUL) {
238 if (high) {
239 free_pages((unsigned long)memory,
240 get_order(size));
241
242 if (swiotlb) {
243 return
244 swiotlb_alloc_coherent(dev, size,
245 dma_handle,
246 gfp);
247 }
248
249 if (!(gfp & GFP_DMA)) {
250 gfp |= GFP_DMA;
251 goto again;
252 }
253 return NULL;
254 }
255 mmu = 0;
256 }
257 memset(memory, 0, size);
258 if (!mmu) {
259 *dma_handle = virt_to_bus(memory);
260 return memory;
261 }
262 }
263
264 *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0);
265 if (*dma_handle == bad_dma_address)
266 goto error;
267 flush_gart(dev);
268 return memory;
269
270error:
271 if (panic_on_overflow)
272 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size);
273 free_pages((unsigned long)memory, get_order(size));
274 return NULL;
275}
276
277/*
278 * Unmap coherent memory.
279 * The caller must ensure that the device has finished accessing the mapping.
280 */
281void dma_free_coherent(struct device *dev, size_t size,
282 void *vaddr, dma_addr_t bus)
283{
284 if (swiotlb) {
285 swiotlb_free_coherent(dev, size, vaddr, bus);
286 return;
287 }
288
289 dma_unmap_single(dev, bus, size, 0);
290 free_pages((unsigned long)vaddr, get_order(size));
291}
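For context, a hypothetical driver pairs these two calls roughly as follows (the function name, device pointer, and 4096-byte size are placeholders, not part of this patch):

	/* Sketch of a driver using dma_alloc_coherent()/dma_free_coherent() above. */
	#include <linux/errno.h>
	#include <linux/gfp.h>
	#include <linux/dma-mapping.h>

	static int example_coherent(struct device *dev)
	{
		dma_addr_t bus;
		void *cpu = dma_alloc_coherent(dev, 4096, &bus, GFP_KERNEL);

		if (!cpu)
			return -ENOMEM;
		/* 'cpu' is the kernel virtual address, 'bus' is what the device uses */
		/* ... point the hardware at 'bus', touch the buffer through 'cpu' ... */
		dma_free_coherent(dev, 4096, cpu, bus);
		return 0;
	}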
292
293#ifdef CONFIG_IOMMU_LEAK
294
295#define SET_LEAK(x) if (iommu_leak_tab) \
296 iommu_leak_tab[x] = __builtin_return_address(0);
297#define CLEAR_LEAK(x) if (iommu_leak_tab) \
298 iommu_leak_tab[x] = NULL;
299
300/* Debugging aid for drivers that don't free their IOMMU tables */
301static void **iommu_leak_tab;
302static int leak_trace;
303int iommu_leak_pages = 20;
304void dump_leak(void)
305{
306 int i;
307 static int dump;
308 if (dump || !iommu_leak_tab) return;
309 dump = 1;
310 show_stack(NULL,NULL);
311 /* Very crude. dump some from the end of the table too */
312 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
313 for (i = 0; i < iommu_leak_pages; i+=2) {
314 printk("%lu: ", iommu_pages-i);
315 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
316 printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
317 }
318 printk("\n");
319}
320#else
321#define SET_LEAK(x)
322#define CLEAR_LEAK(x)
323#endif
324
325static void iommu_full(struct device *dev, size_t size, int dir, int do_panic)
326{
327 /*
328 * Ran out of IOMMU space for this operation. This is very bad.
329 * Unfortunately the drivers cannot handle this operation properly.
330 * Return some non mapped prereserved space in the aperture and
331 * let the Northbridge deal with it. This will result in garbage
332 * in the IO operation. When the size exceeds the prereserved space
333 * memory corruption will occur or random memory will be DMAed
334 * out. Hopefully no network devices use single mappings that big.
335 */
336
337 printk(KERN_ERR
338 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
339 size, dev->bus_id);
340
341 if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) {
342 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
343 panic("PCI-DMA: Memory would be corrupted\n");
344 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
345 panic("PCI-DMA: Random memory would be DMAed\n");
346 }
347
348#ifdef CONFIG_IOMMU_LEAK
349 dump_leak();
350#endif
351}
352
353static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
354{
355 u64 mask = *dev->dma_mask;
356 int high = addr + size >= mask;
357 int mmu = high;
358 if (force_iommu)
359 mmu = 1;
360 if (no_iommu) {
361 if (high)
362 panic("PCI-DMA: high address but no IOMMU.\n");
363 mmu = 0;
364 }
365 return mmu;
366}
367
368static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
369{
370 u64 mask = *dev->dma_mask;
371 int high = addr + size >= mask;
372 int mmu = high;
373 if (no_iommu) {
374 if (high)
375 panic("PCI-DMA: high address but no IOMMU.\n");
376 mmu = 0;
377 }
378 return mmu;
379}
380
381/* Map a single contiguous physical area into the IOMMU.
382 * Caller needs to check if the iommu is needed and flush.
383 */
384static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
385 size_t size, int dir, int do_panic)
386{
387 unsigned long npages = to_pages(phys_mem, size);
388 unsigned long iommu_page = alloc_iommu(npages);
389 int i;
390 if (iommu_page == -1) {
391 if (!nonforced_iommu(dev, phys_mem, size))
392 return phys_mem;
393 if (panic_on_overflow)
394 panic("dma_map_area overflow %lu bytes\n", size);
395 iommu_full(dev, size, dir, do_panic);
396 return bad_dma_address;
397 }
398
399 for (i = 0; i < npages; i++) {
400 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
401 SET_LEAK(iommu_page + i);
402 phys_mem += PAGE_SIZE;
403 }
404 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
405}
406
407/* Map a single area into the IOMMU */
408dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir)
409{
410 unsigned long phys_mem, bus;
411
412 BUG_ON(dir == DMA_NONE);
413
414 if (swiotlb)
415 return swiotlb_map_single(dev,addr,size,dir);
416 if (!dev)
417 dev = &fallback_dev;
418
419 phys_mem = virt_to_phys(addr);
420 if (!need_iommu(dev, phys_mem, size))
421 return phys_mem;
422
423 bus = dma_map_area(dev, phys_mem, size, dir, 1);
424 flush_gart(dev);
425 return bus;
426}
427
428/* Fallback for dma_map_sg in case of overflow */
429static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
430 int nents, int dir)
431{
432 int i;
433
434#ifdef CONFIG_IOMMU_DEBUG
435 printk(KERN_DEBUG "dma_map_sg overflow\n");
436#endif
437
438 for (i = 0; i < nents; i++ ) {
439 struct scatterlist *s = &sg[i];
440 unsigned long addr = page_to_phys(s->page) + s->offset;
441 if (nonforced_iommu(dev, addr, s->length)) {
442 addr = dma_map_area(dev, addr, s->length, dir, 0);
443 if (addr == bad_dma_address) {
444 if (i > 0)
445 dma_unmap_sg(dev, sg, i, dir);
446 nents = 0;
447 sg[0].dma_length = 0;
448 break;
449 }
450 }
451 s->dma_address = addr;
452 s->dma_length = s->length;
453 }
454 flush_gart(dev);
455 return nents;
456}
457
458/* Map multiple scatterlist entries contiguously into the first. */
459static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
460 struct scatterlist *sout, unsigned long pages)
461{
462 unsigned long iommu_start = alloc_iommu(pages);
463 unsigned long iommu_page = iommu_start;
464 int i;
465
466 if (iommu_start == -1)
467 return -1;
468
469 for (i = start; i < stopat; i++) {
470 struct scatterlist *s = &sg[i];
471 unsigned long pages, addr;
472 unsigned long phys_addr = s->dma_address;
473
474 BUG_ON(i > start && s->offset);
475 if (i == start) {
476 *sout = *s;
477 sout->dma_address = iommu_bus_base;
478 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
479 sout->dma_length = s->length;
480 } else {
481 sout->dma_length += s->length;
482 }
483
484 addr = phys_addr;
485 pages = to_pages(s->offset, s->length);
486 while (pages--) {
487 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
488 SET_LEAK(iommu_page);
489 addr += PAGE_SIZE;
490 iommu_page++;
491 }
492 }
493 BUG_ON(iommu_page - iommu_start != pages);
494 return 0;
495}
496
497static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
498 struct scatterlist *sout,
499 unsigned long pages, int need)
500{
501 if (!need) {
502 BUG_ON(stopat - start != 1);
503 *sout = sg[start];
504 sout->dma_length = sg[start].length;
505 return 0;
506 }
507 return __dma_map_cont(sg, start, stopat, sout, pages);
508}
509
510/*
511 * DMA map all entries in a scatterlist.
512	 * Merge chunks that have page-aligned sizes into a contiguous mapping.
513 */
514int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
515{
516 int i;
517 int out;
518 int start;
519 unsigned long pages = 0;
520 int need = 0, nextneed;
521
522 BUG_ON(dir == DMA_NONE);
523 if (nents == 0)
524 return 0;
525
526 if (swiotlb)
527 return swiotlb_map_sg(dev,sg,nents,dir);
528 if (!dev)
529 dev = &fallback_dev;
530
531 out = 0;
532 start = 0;
533 for (i = 0; i < nents; i++) {
534 struct scatterlist *s = &sg[i];
535 dma_addr_t addr = page_to_phys(s->page) + s->offset;
536 s->dma_address = addr;
537 BUG_ON(s->length == 0);
538
539 nextneed = need_iommu(dev, addr, s->length);
540
541 /* Handle the previous not yet processed entries */
542 if (i > start) {
543 struct scatterlist *ps = &sg[i-1];
544 /* Can only merge when the last chunk ends on a page
545 boundary and the new one doesn't have an offset. */
546 if (!iommu_merge || !nextneed || !need || s->offset ||
547 (ps->offset + ps->length) % PAGE_SIZE) {
548 if (dma_map_cont(sg, start, i, sg+out, pages,
549 need) < 0)
550 goto error;
551 out++;
552 pages = 0;
553 start = i;
554 }
555 }
556
557 need = nextneed;
558 pages += to_pages(s->offset, s->length);
559 }
560 if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
561 goto error;
562 out++;
563 flush_gart(dev);
564 if (out < nents)
565 sg[out].dma_length = 0;
566 return out;
567
568error:
569 flush_gart(NULL);
570 dma_unmap_sg(dev, sg, nents, dir);
571 /* When it was forced try again unforced */
572 if (force_iommu)
573 return dma_map_sg_nonforce(dev, sg, nents, dir);
574 if (panic_on_overflow)
575 panic("dma_map_sg: overflow on %lu pages\n", pages);
576 iommu_full(dev, pages << PAGE_SHIFT, dir, 0);
577 for (i = 0; i < nents; i++)
578 sg[i].dma_address = bad_dma_address;
579 return 0;
580}
581
582/*
583 * Free a DMA mapping.
584 */
585void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
586 size_t size, int direction)
587{
588 unsigned long iommu_page;
589 int npages;
590 int i;
591
592 if (swiotlb) {
593 swiotlb_unmap_single(dev,dma_addr,size,direction);
594 return;
595 }
596
597 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
598 dma_addr >= iommu_bus_base + iommu_size)
599 return;
600 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
601 npages = to_pages(dma_addr, size);
602 for (i = 0; i < npages; i++) {
603 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
604 CLEAR_LEAK(iommu_page + i);
605 }
606 free_iommu(iommu_page, npages);
607}
608
609/*
610 * Wrapper for pci_unmap_single working with scatterlists.
611 */
612void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
613{
614 int i;
615 if (swiotlb) {
616 swiotlb_unmap_sg(dev,sg,nents,dir);
617 return;
618 }
619 for (i = 0; i < nents; i++) {
620 struct scatterlist *s = &sg[i];
621 if (!s->dma_length || !s->length)
622 break;
623 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
624 }
625}
626
627int dma_supported(struct device *dev, u64 mask)
628{
629 /* Copied from i386. Doesn't make much sense, because it will
630 only work for pci_alloc_coherent.
631 The caller just has to use GFP_DMA in this case. */
632 if (mask < 0x00ffffff)
633 return 0;
634
635 /* Tell the device to use SAC when IOMMU force is on.
636 This allows the driver to use cheaper accesses in some cases.
637
638 Problem with this is that if we overflow the IOMMU area
639 and return DAC as fallback address the device may not handle it correctly.
640
641 As a special case some controllers have a 39bit address mode
642 that is as efficient as 32bit (aic79xx). Don't force SAC for these.
643 Assume all masks <= 40 bits are of this type. Normally this doesn't
644 make any difference, but gives more gentle handling of IOMMU overflow. */
645 if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
646 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
647 return 0;
648 }
649
650 return 1;
651}
652
653int dma_get_cache_alignment(void)
654{
655 return boot_cpu_data.x86_clflush_size;
656}
657
658EXPORT_SYMBOL(dma_unmap_sg);
659EXPORT_SYMBOL(dma_map_sg);
660EXPORT_SYMBOL(dma_map_single);
661EXPORT_SYMBOL(dma_unmap_single);
662EXPORT_SYMBOL(dma_supported);
663EXPORT_SYMBOL(no_iommu);
664EXPORT_SYMBOL(force_iommu);
665EXPORT_SYMBOL(bad_dma_address);
666EXPORT_SYMBOL(iommu_bio_merge);
667EXPORT_SYMBOL(iommu_sac_force);
668EXPORT_SYMBOL(dma_get_cache_alignment);
669EXPORT_SYMBOL(dma_alloc_coherent);
670EXPORT_SYMBOL(dma_free_coherent);
671
672static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
673{
674 unsigned long a;
675 if (!iommu_size) {
676 iommu_size = aper_size;
677 if (!no_agp)
678 iommu_size /= 2;
679 }
680
681 a = aper + iommu_size;
682 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
683
684 if (iommu_size < 64*1024*1024)
685 printk(KERN_WARNING
686 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20);
687
688 return iommu_size;
689}
690
691static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
692{
693 unsigned aper_size = 0, aper_base_32;
694 u64 aper_base;
695 unsigned aper_order;
696
697 pci_read_config_dword(dev, 0x94, &aper_base_32);
698 pci_read_config_dword(dev, 0x90, &aper_order);
699 aper_order = (aper_order >> 1) & 7;
700
701 aper_base = aper_base_32 & 0x7fff;
702 aper_base <<= 25;
703
704 aper_size = (32 * 1024 * 1024) << aper_order;
705 if (aper_base + aper_size >= 0xffffffff || !aper_size)
706 aper_base = 0;
707
708 *size = aper_size;
709 return aper_base;
710}
711
712/*
713 * Private Northbridge GATT initialization in case we cannot use the
714 * AGP driver for some reason.
715 */
716static __init int init_k8_gatt(struct agp_kern_info *info)
717{
718 struct pci_dev *dev;
719 void *gatt;
720 unsigned aper_base, new_aper_base;
721 unsigned aper_size, gatt_size, new_aper_size;
722
723 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
724 aper_size = aper_base = info->aper_size = 0;
725 for_all_nb(dev) {
726 new_aper_base = read_aperture(dev, &new_aper_size);
727 if (!new_aper_base)
728 goto nommu;
729
730 if (!aper_base) {
731 aper_size = new_aper_size;
732 aper_base = new_aper_base;
733 }
734 if (aper_size != new_aper_size || aper_base != new_aper_base)
735 goto nommu;
736 }
737 if (!aper_base)
738 goto nommu;
739 info->aper_base = aper_base;
740 info->aper_size = aper_size>>20;
741
742 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
743 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
744 if (!gatt)
745 panic("Cannot allocate GATT table");
746 memset(gatt, 0, gatt_size);
747 agp_gatt_table = gatt;
748
749 for_all_nb(dev) {
750 u32 ctl;
751 u32 gatt_reg;
752
753 gatt_reg = __pa(gatt) >> 12;
754 gatt_reg <<= 4;
755 pci_write_config_dword(dev, 0x98, gatt_reg);
756 pci_read_config_dword(dev, 0x90, &ctl);
757
758 ctl |= 1;
759 ctl &= ~((1<<4) | (1<<5));
760
761 pci_write_config_dword(dev, 0x90, ctl);
762 }
763 flush_gart(NULL);
764
765 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
766 return 0;
767
768 nommu:
769 /* Should not happen anymore */
770 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
771 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.");
772 return -1;
773}
774
775extern int agp_amd64_init(void);
776
777static int __init pci_iommu_init(void)
778{
779 struct agp_kern_info info;
780 unsigned long aper_size;
781 unsigned long iommu_start;
782 struct pci_dev *dev;
783 unsigned long scratch;
784 long i;
785
786#ifndef CONFIG_AGP_AMD64
787 no_agp = 1;
788#else
789 /* Makefile puts PCI initialization via subsys_initcall first. */
790 /* Add other K8 AGP bridge drivers here */
791 no_agp = no_agp ||
792 (agp_amd64_init() < 0) ||
793 (agp_copy_info(agp_bridge, &info) < 0);
794#endif
795
796 if (swiotlb) {
797 no_iommu = 1;
798 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
799 return -1;
800 }
801
802 if (no_iommu ||
803 (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) ||
804 !iommu_aperture ||
805 (no_agp && init_k8_gatt(&info) < 0)) {
806 printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
807 no_iommu = 1;
808 return -1;
809 }
810
811 aper_size = info.aper_size * 1024 * 1024;
812 iommu_size = check_iommu_size(info.aper_base, aper_size);
813 iommu_pages = iommu_size >> PAGE_SHIFT;
814
815 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
816 get_order(iommu_pages/8));
817 if (!iommu_gart_bitmap)
818 panic("Cannot allocate iommu bitmap\n");
819 memset(iommu_gart_bitmap, 0, iommu_pages/8);
820
821#ifdef CONFIG_IOMMU_LEAK
822 if (leak_trace) {
823 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
824 get_order(iommu_pages*sizeof(void *)));
825 if (iommu_leak_tab)
826 memset(iommu_leak_tab, 0, iommu_pages * 8);
827 else
828 printk("PCI-DMA: Cannot allocate leak trace area\n");
829 }
830#endif
831
832 /*
833 * Out of IOMMU space handling.
834 * Reserve some invalid pages at the beginning of the GART.
835 */
836 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
837
838 agp_memory_reserved = iommu_size;
839 printk(KERN_INFO
840 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
841 iommu_size>>20);
842
843 iommu_start = aper_size - iommu_size;
844 iommu_bus_base = info.aper_base + iommu_start;
845 bad_dma_address = iommu_bus_base;
846 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
847
848 /*
849 * Unmap the IOMMU part of the GART. The alias of the page is
850 * always mapped with cache enabled and there is no full cache
851 * coherency across the GART remapping. The unmapping avoids
852 * automatic prefetches from the CPU allocating cache lines in
853 * there. All CPU accesses are done via the direct mapping to
854 * the backing memory. The GART address is only used by PCI
855 * devices.
856 */
857 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
858
859 /*
860	 * Try to work around a bug (thanks to BenH):
861	 * set unmapped entries to a scratch page instead of 0.
862	 * Any prefetches that hit unmapped entries won't get a bus abort
863	 * then.
864 */
865 scratch = get_zeroed_page(GFP_KERNEL);
866 if (!scratch)
867 panic("Cannot allocate iommu scratch page");
868 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
869 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
870 iommu_gatt_base[i] = gart_unmapped_entry;
871
872 for_all_nb(dev) {
873 u32 flag;
874 int cpu = PCI_SLOT(dev->devfn) - 24;
875 if (cpu >= MAX_NB)
876 continue;
877 northbridges[cpu] = dev;
878 pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
879 northbridge_flush_word[cpu] = flag;
880 }
881
882 flush_gart(NULL);
883
884 return 0;
885}
886
887/* Must execute after PCI subsystem */
888fs_initcall(pci_iommu_init);
889
890/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
891 [,forcesac][,fullflush][,nomerge][,biomerge]
892 size set size of iommu (in bytes)
893 noagp don't initialize the AGP driver and use full aperture.
894 off don't use the IOMMU
895 leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
896	memaper[=order] allocate its own aperture over RAM with size 32MB^order.
897 noforce don't force IOMMU usage. Default.
898 force Force IOMMU.
899 merge Do lazy merging. This may improve performance on some block devices.
900 Implies force (experimental)
901 biomerge Do merging at the BIO layer. This is more efficient than merge,
902	 but should only be done with very big IOMMUs. Implies merge,force.
903 nomerge Don't do SG merging.
904	forcesac	Force SAC mode for masks <40 bits (experimental)
905 fullflush Flush IOMMU on each allocation (default)
906 nofullflush Don't use IOMMU fullflush
907	allowed 	override iommu-off workarounds for specific chipsets.
908 soft Use software bounce buffering (default for Intel machines)
909 noaperture Don't touch the aperture for AGP.
910*/
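For example (illustrative combinations, not taken from this patch), booting with iommu=force,fullflush forces IOMMU use and flushes the GART on every mapping, while iommu=soft selects the SWIOTLB bounce-buffer path; options are comma-separated, as the parser below expects.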
911__init int iommu_setup(char *p)
912{
913 int arg;
914
915 while (*p) {
916 if (!strncmp(p,"noagp",5))
917 no_agp = 1;
918 if (!strncmp(p,"off",3))
919 no_iommu = 1;
920 if (!strncmp(p,"force",5)) {
921 force_iommu = 1;
922 iommu_aperture_allowed = 1;
923 }
924 if (!strncmp(p,"allowed",7))
925 iommu_aperture_allowed = 1;
926 if (!strncmp(p,"noforce",7)) {
927 iommu_merge = 0;
928 force_iommu = 0;
929 }
930 if (!strncmp(p, "memaper", 7)) {
931 fallback_aper_force = 1;
932 p += 7;
933 if (*p == '=') {
934 ++p;
935 if (get_option(&p, &arg))
936 fallback_aper_order = arg;
937 }
938 }
939 if (!strncmp(p, "biomerge",8)) {
940 iommu_bio_merge = 4096;
941 iommu_merge = 1;
942 force_iommu = 1;
943 }
944 if (!strncmp(p, "panic",5))
945 panic_on_overflow = 1;
946 if (!strncmp(p, "nopanic",7))
947 panic_on_overflow = 0;
948 if (!strncmp(p, "merge",5)) {
949 iommu_merge = 1;
950 force_iommu = 1;
951 }
952 if (!strncmp(p, "nomerge",7))
953 iommu_merge = 0;
954 if (!strncmp(p, "forcesac",8))
955 iommu_sac_force = 1;
956 if (!strncmp(p, "fullflush",9))
957 iommu_fullflush = 1;
958 if (!strncmp(p, "nofullflush",11))
959 iommu_fullflush = 0;
960 if (!strncmp(p, "soft",4))
961 swiotlb = 1;
962 if (!strncmp(p, "noaperture",10))
963 fix_aperture = 0;
964#ifdef CONFIG_IOMMU_LEAK
965 if (!strncmp(p,"leak",4)) {
966 leak_trace = 1;
967 p += 4;
968 if (*p == '=') ++p;
969 if (isdigit(*p) && get_option(&p, &arg))
970 iommu_leak_pages = arg;
971 } else
972#endif
973 if (isdigit(*p) && get_option(&p, &arg))
974 iommu_size = arg;
975 p += strcspn(p, ",");
976 if (*p == ',')
977 ++p;
978 }
979 return 1;
980}
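
The iommu= handling above walks a comma-separated option string with strncmp() and strcspn(). Below is a minimal user-space sketch of the same walk; the option names mirror the documented list, but the flag variables are only local stand-ins for the kernel globals and just a few options are modelled.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int force_iommu, iommu_merge, fallback_aper_order, iommu_size;

static void parse_iommu_opts(char *p)
{
        while (*p) {
                if (!strncmp(p, "force", 5))
                        force_iommu = 1;
                if (!strncmp(p, "merge", 5)) {
                        iommu_merge = 1;
                        force_iommu = 1;
                }
                if (!strncmp(p, "memaper", 7)) {
                        char *q = p + 7;
                        if (*q == '=')
                                fallback_aper_order = atoi(q + 1);
                }
                if (isdigit((unsigned char)*p))  /* a bare number sets the IOMMU size */
                        iommu_size = atoi(p);
                p += strcspn(p, ",");            /* skip to the next option, if any */
                if (*p == ',')
                        ++p;
        }
}

int main(void)
{
        char opts[] = "force,memaper=2,65536";

        parse_iommu_opts(opts);
        printf("force=%d merge=%d aper_order=%d size=%d\n",
               force_iommu, iommu_merge, fallback_aper_order, iommu_size);
        return 0;
}
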
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
new file mode 100644
index 000000000000..67d90b89af0b
--- /dev/null
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -0,0 +1,94 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <asm/proto.h>
8#include <asm/processor.h>
9
10int iommu_merge = 0;
11EXPORT_SYMBOL(iommu_merge);
12
13dma_addr_t bad_dma_address;
14EXPORT_SYMBOL(bad_dma_address);
15
16int iommu_bio_merge = 0;
17EXPORT_SYMBOL(iommu_bio_merge);
18
19int iommu_sac_force = 0;
20EXPORT_SYMBOL(iommu_sac_force);
21
22/*
23 * Dummy IO MMU functions
24 */
25
26void *dma_alloc_coherent(struct device *hwdev, size_t size,
27 dma_addr_t *dma_handle, unsigned gfp)
28{
29 void *ret;
30 u64 mask;
31 int order = get_order(size);
32
33 if (hwdev)
34 mask = hwdev->coherent_dma_mask & *hwdev->dma_mask;
35 else
36 mask = 0xffffffff;
37 for (;;) {
38 ret = (void *)__get_free_pages(gfp, order);
39 if (ret == NULL)
40 return NULL;
41 *dma_handle = virt_to_bus(ret);
42 if ((*dma_handle & ~mask) == 0)
43 break;
44 free_pages((unsigned long)ret, order);
45 if (gfp & GFP_DMA)
46 return NULL;
47 gfp |= GFP_DMA;
48 }
49
50 memset(ret, 0, size);
51 return ret;
52}
53EXPORT_SYMBOL(dma_alloc_coherent);
54
55void dma_free_coherent(struct device *hwdev, size_t size,
56 void *vaddr, dma_addr_t dma_handle)
57{
58 free_pages((unsigned long)vaddr, get_order(size));
59}
60EXPORT_SYMBOL(dma_free_coherent);
61
62int dma_supported(struct device *hwdev, u64 mask)
63{
64 /*
65 * we fall back to GFP_DMA when the mask isn't all 1s,
66 * so we can't guarantee allocations that must be
67 * within a tighter range than GFP_DMA..
68 * RED-PEN this won't work for pci_map_single. Caller has to
69 * use GFP_DMA in the first place.
70 */
71 if (mask < 0x00ffffff)
72 return 0;
73
74 return 1;
75}
76EXPORT_SYMBOL(dma_supported);
77
78int dma_get_cache_alignment(void)
79{
80 return boot_cpu_data.x86_clflush_size;
81}
82EXPORT_SYMBOL(dma_get_cache_alignment);
83
84static int __init check_ram(void)
85{
86 if (end_pfn >= 0xffffffff>>PAGE_SHIFT) {
87 printk(
88 KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n"
89 KERN_ERR "WARNING 32bit PCI may malfunction.\n");
90 }
91 return 0;
92}
93__initcall(check_ram);
94
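
dma_alloc_coherent() above retries the allocation with GFP_DMA whenever the returned bus address does not fit the device mask. A small stand-alone sketch of that mask test follows; the addresses and the 32-bit mask are made-up example values.

#include <stdint.h>
#include <stdio.h>

/* A bus address "fits" a device mask iff no bit above the mask is set. */
static int fits_mask(uint64_t bus_addr, uint64_t mask)
{
        return (bus_addr & ~mask) == 0;
}

int main(void)
{
        uint64_t mask32 = 0xffffffffULL;        /* default 32-bit coherent mask */

        /* below 4GB: usable as-is */
        printf("%d\n", fits_mask(0x00000000fee00000ULL, mask32));
        /* above 4GB: dma_alloc_coherent() would free it and retry with GFP_DMA */
        printf("%d\n", fits_mask(0x0000000123400000ULL, mask32));
        return 0;
}
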
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
new file mode 100644
index 000000000000..9922d2ba24a3
--- /dev/null
+++ b/arch/x86_64/kernel/process.c
@@ -0,0 +1,770 @@
1/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
11 *
12 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
13 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
21#include <linux/errno.h>
22#include <linux/sched.h>
23#include <linux/kernel.h>
24#include <linux/mm.h>
25#include <linux/elfcore.h>
26#include <linux/smp.h>
27#include <linux/slab.h>
28#include <linux/user.h>
29#include <linux/module.h>
30#include <linux/a.out.h>
31#include <linux/interrupt.h>
32#include <linux/delay.h>
33#include <linux/irq.h>
34#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
37
38#include <asm/uaccess.h>
39#include <asm/pgtable.h>
40#include <asm/system.h>
41#include <asm/io.h>
42#include <asm/processor.h>
43#include <asm/i387.h>
44#include <asm/mmu_context.h>
45#include <asm/pda.h>
46#include <asm/prctl.h>
47#include <asm/kdebug.h>
48#include <asm/desc.h>
49#include <asm/proto.h>
50#include <asm/ia32.h>
51
52asmlinkage extern void ret_from_fork(void);
53
54unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
55
56static atomic_t hlt_counter = ATOMIC_INIT(0);
57
58unsigned long boot_option_idle_override = 0;
59EXPORT_SYMBOL(boot_option_idle_override);
60
61/*
62 * Power management idle function, if any.
63 */
64void (*pm_idle)(void);
65static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
66
67void disable_hlt(void)
68{
69 atomic_inc(&hlt_counter);
70}
71
72EXPORT_SYMBOL(disable_hlt);
73
74void enable_hlt(void)
75{
76 atomic_dec(&hlt_counter);
77}
78
79EXPORT_SYMBOL(enable_hlt);
80
81/*
82 * We use this if we don't have any better
83 * idle routine..
84 */
85void default_idle(void)
86{
87 if (!atomic_read(&hlt_counter)) {
88 local_irq_disable();
89 if (!need_resched())
90 safe_halt();
91 else
92 local_irq_enable();
93 }
94}
95
96/*
97 * On SMP it's slightly faster (but much more power-consuming!)
98 * to poll the ->need_resched flag instead of waiting for the
99 * cross-CPU IPI to arrive. Use this option with caution.
100 */
101static void poll_idle (void)
102{
103 int oldval;
104
105 local_irq_enable();
106
107 /*
108 * Deal with another CPU just having chosen a thread to
109 * run here:
110 */
111 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
112
113 if (!oldval) {
114 set_thread_flag(TIF_POLLING_NRFLAG);
115 asm volatile(
116 "2:"
117 "testl %0,%1;"
118 "rep; nop;"
119 "je 2b;"
120 : :
121 "i" (_TIF_NEED_RESCHED),
122 "m" (current_thread_info()->flags));
123 } else {
124 set_need_resched();
125 }
126}
127
128void cpu_idle_wait(void)
129{
130 unsigned int cpu, this_cpu = get_cpu();
131 cpumask_t map;
132
133 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
134 put_cpu();
135
136 cpus_clear(map);
137 for_each_online_cpu(cpu) {
138 per_cpu(cpu_idle_state, cpu) = 1;
139 cpu_set(cpu, map);
140 }
141
142 __get_cpu_var(cpu_idle_state) = 0;
143
144 wmb();
145 do {
146 ssleep(1);
147 for_each_online_cpu(cpu) {
148 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
149 cpu_clear(cpu, map);
150 }
151 cpus_and(map, map, cpu_online_map);
152 } while (!cpus_empty(map));
153}
154EXPORT_SYMBOL_GPL(cpu_idle_wait);
155
156/*
157 * The idle thread. There's no useful work to be
158 * done, so just try to conserve power and have a
159 * low exit latency (ie sit in a loop waiting for
160 * somebody to say that they'd like to reschedule)
161 */
162void cpu_idle (void)
163{
164 /* endless idle loop with no priority at all */
165 while (1) {
166 while (!need_resched()) {
167 void (*idle)(void);
168
169 if (__get_cpu_var(cpu_idle_state))
170 __get_cpu_var(cpu_idle_state) = 0;
171
172 rmb();
173 idle = pm_idle;
174 if (!idle)
175 idle = default_idle;
176 idle();
177 }
178
179 schedule();
180 }
181}
182
183/*
184 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
185 * which can obviate the IPI used to trigger checking of need_resched.
186 * We execute MONITOR against need_resched and enter an optimized wait state
187 * through MWAIT. Whenever someone changes need_resched, we are woken
188 * up from MWAIT (without an IPI).
189 */
190static void mwait_idle(void)
191{
192 local_irq_enable();
193
194 if (!need_resched()) {
195 set_thread_flag(TIF_POLLING_NRFLAG);
196 do {
197 __monitor((void *)&current_thread_info()->flags, 0, 0);
198 if (need_resched())
199 break;
200 __mwait(0, 0);
201 } while (!need_resched());
202 clear_thread_flag(TIF_POLLING_NRFLAG);
203 }
204}
205
206void __init select_idle_routine(const struct cpuinfo_x86 *c)
207{
208 static int printed;
209 if (cpu_has(c, X86_FEATURE_MWAIT)) {
210 /*
211 * Skip, if setup has overridden idle.
212 * One CPU supports mwait => all CPUs support mwait
213 */
214 if (!pm_idle) {
215 if (!printed) {
216 printk("using mwait in idle threads.\n");
217 printed = 1;
218 }
219 pm_idle = mwait_idle;
220 }
221 }
222}
223
224static int __init idle_setup (char *str)
225{
226 if (!strncmp(str, "poll", 4)) {
227 printk("using polling idle threads.\n");
228 pm_idle = poll_idle;
229 }
230
231 boot_option_idle_override = 1;
232 return 1;
233}
234
235__setup("idle=", idle_setup);
236
237/* Prints also some state that isn't saved in the pt_regs */
238void __show_regs(struct pt_regs * regs)
239{
240 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
241 unsigned int fsindex,gsindex;
242 unsigned int ds,cs,es;
243
244 printk("\n");
245 print_modules();
246 printk("Pid: %d, comm: %.20s %s %s\n",
247 current->pid, current->comm, print_tainted(), system_utsname.release);
248 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
249 printk_address(regs->rip);
250 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
251 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
252 regs->rax, regs->rbx, regs->rcx);
253 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
254 regs->rdx, regs->rsi, regs->rdi);
255 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
256 regs->rbp, regs->r8, regs->r9);
257 printk("R10: %016lx R11: %016lx R12: %016lx\n",
258 regs->r10, regs->r11, regs->r12);
259 printk("R13: %016lx R14: %016lx R15: %016lx\n",
260 regs->r13, regs->r14, regs->r15);
261
262 asm("movl %%ds,%0" : "=r" (ds));
263 asm("movl %%cs,%0" : "=r" (cs));
264 asm("movl %%es,%0" : "=r" (es));
265 asm("movl %%fs,%0" : "=r" (fsindex));
266 asm("movl %%gs,%0" : "=r" (gsindex));
267
268 rdmsrl(MSR_FS_BASE, fs);
269 rdmsrl(MSR_GS_BASE, gs);
270 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
271
272 asm("movq %%cr0, %0": "=r" (cr0));
273 asm("movq %%cr2, %0": "=r" (cr2));
274 asm("movq %%cr3, %0": "=r" (cr3));
275 asm("movq %%cr4, %0": "=r" (cr4));
276
277 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
278 fs,fsindex,gs,gsindex,shadowgs);
279 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
280 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
281}
282
283void show_regs(struct pt_regs *regs)
284{
285 __show_regs(regs);
286 show_trace(&regs->rsp);
287}
288
289/*
290 * Free current thread data structures etc..
291 */
292void exit_thread(void)
293{
294 struct task_struct *me = current;
295 struct thread_struct *t = &me->thread;
296 if (me->thread.io_bitmap_ptr) {
297 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
298
299 kfree(t->io_bitmap_ptr);
300 t->io_bitmap_ptr = NULL;
301 /*
302 * Careful, clear this in the TSS too:
303 */
304 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
305 t->io_bitmap_max = 0;
306 put_cpu();
307 }
308}
309
310void flush_thread(void)
311{
312 struct task_struct *tsk = current;
313 struct thread_info *t = current_thread_info();
314
315 if (t->flags & _TIF_ABI_PENDING)
316 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
317
318 tsk->thread.debugreg0 = 0;
319 tsk->thread.debugreg1 = 0;
320 tsk->thread.debugreg2 = 0;
321 tsk->thread.debugreg3 = 0;
322 tsk->thread.debugreg6 = 0;
323 tsk->thread.debugreg7 = 0;
324 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
325 /*
326 * Forget coprocessor state..
327 */
328 clear_fpu(tsk);
329 clear_used_math();
330}
331
332void release_thread(struct task_struct *dead_task)
333{
334 if (dead_task->mm) {
335 if (dead_task->mm->context.size) {
336 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
337 dead_task->comm,
338 dead_task->mm->context.ldt,
339 dead_task->mm->context.size);
340 BUG();
341 }
342 }
343}
344
345static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
346{
347 struct user_desc ud = {
348 .base_addr = addr,
349 .limit = 0xfffff,
350 .seg_32bit = 1,
351 .limit_in_pages = 1,
352 .useable = 1,
353 };
354 struct n_desc_struct *desc = (void *)t->thread.tls_array;
355 desc += tls;
356 desc->a = LDT_entry_a(&ud);
357 desc->b = LDT_entry_b(&ud);
358}
359
360static inline u32 read_32bit_tls(struct task_struct *t, int tls)
361{
362 struct desc_struct *desc = (void *)t->thread.tls_array;
363 desc += tls;
364 return desc->base0 |
365 (((u32)desc->base1) << 16) |
366 (((u32)desc->base2) << 24);
367}
368
369/*
370 * This gets called before we allocate a new thread and copy
371 * the current task into it.
372 */
373void prepare_to_copy(struct task_struct *tsk)
374{
375 unlazy_fpu(tsk);
376}
377
378int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
379 unsigned long unused,
380 struct task_struct * p, struct pt_regs * regs)
381{
382 int err;
383 struct pt_regs * childregs;
384 struct task_struct *me = current;
385
386 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
387
388 *childregs = *regs;
389
390 childregs->rax = 0;
391 childregs->rsp = rsp;
392 if (rsp == ~0UL) {
393 childregs->rsp = (unsigned long)childregs;
394 }
395
396 p->thread.rsp = (unsigned long) childregs;
397 p->thread.rsp0 = (unsigned long) (childregs+1);
398 p->thread.userrsp = me->thread.userrsp;
399
400 set_ti_thread_flag(p->thread_info, TIF_FORK);
401
402 p->thread.fs = me->thread.fs;
403 p->thread.gs = me->thread.gs;
404
405 asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
406 asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
407 asm("movl %%es,%0" : "=m" (p->thread.es));
408 asm("movl %%ds,%0" : "=m" (p->thread.ds));
409
410 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
411 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
412 if (!p->thread.io_bitmap_ptr) {
413 p->thread.io_bitmap_max = 0;
414 return -ENOMEM;
415 }
416 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
417 }
418
419 /*
420 * Set a new TLS for the child thread?
421 */
422 if (clone_flags & CLONE_SETTLS) {
423#ifdef CONFIG_IA32_EMULATION
424 if (test_thread_flag(TIF_IA32))
425 err = ia32_child_tls(p, childregs);
426 else
427#endif
428 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
429 if (err)
430 goto out;
431 }
432 err = 0;
433out:
434 if (err && p->thread.io_bitmap_ptr) {
435 kfree(p->thread.io_bitmap_ptr);
436 p->thread.io_bitmap_max = 0;
437 }
438 return err;
439}
440
441/*
442 * This special macro can be used to load a debugging register
443 */
444#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
445
446/*
447 * switch_to(x,y) should switch tasks from x to y.
448 *
449 * This could still be optimized:
450 * - fold all the options into a flag word and test it with a single test.
451 * - could test fs/gs bitsliced
452 */
453struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
454{
455 struct thread_struct *prev = &prev_p->thread,
456 *next = &next_p->thread;
457 int cpu = smp_processor_id();
458 struct tss_struct *tss = &per_cpu(init_tss, cpu);
459
460 unlazy_fpu(prev_p);
461
462 /*
463 * Reload esp0, LDT and the page table pointer:
464 */
465 tss->rsp0 = next->rsp0;
466
467 /*
468 * Switch DS and ES.
469 * This won't pick up thread selector changes, but I guess that is ok.
470 */
471 asm volatile("movl %%es,%0" : "=m" (prev->es));
472 if (unlikely(next->es | prev->es))
473 loadsegment(es, next->es);
474
475 asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
476 if (unlikely(next->ds | prev->ds))
477 loadsegment(ds, next->ds);
478
479 load_TLS(next, cpu);
480
481 /*
482 * Switch FS and GS.
483 */
484 {
485 unsigned fsindex;
486 asm volatile("movl %%fs,%0" : "=r" (fsindex));
487 /* A segment register != 0 always requires a reload.
488 Also reload when it has changed.
489 When the previous process used a 64bit base, always reload
490 to avoid an information leak. */
491 if (unlikely(fsindex | next->fsindex | prev->fs)) {
492 loadsegment(fs, next->fsindex);
493 /* check if the user used a selector != 0;
494 * if so, clear the 64bit base, since the overloaded base
495 * is always mapped to the null selector
496 */
497 if (fsindex)
498 prev->fs = 0;
499 }
500 /* when next process has a 64bit base use it */
501 if (next->fs)
502 wrmsrl(MSR_FS_BASE, next->fs);
503 prev->fsindex = fsindex;
504 }
505 {
506 unsigned gsindex;
507 asm volatile("movl %%gs,%0" : "=r" (gsindex));
508 if (unlikely(gsindex | next->gsindex | prev->gs)) {
509 load_gs_index(next->gsindex);
510 if (gsindex)
511 prev->gs = 0;
512 }
513 if (next->gs)
514 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
515 prev->gsindex = gsindex;
516 }
517
518 /*
519 * Switch the PDA context.
520 */
521 prev->userrsp = read_pda(oldrsp);
522 write_pda(oldrsp, next->userrsp);
523 write_pda(pcurrent, next_p);
524 write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
525
526 /*
527 * Now maybe reload the debug registers
528 */
529 if (unlikely(next->debugreg7)) {
530 loaddebug(next, 0);
531 loaddebug(next, 1);
532 loaddebug(next, 2);
533 loaddebug(next, 3);
534 /* no 4 and 5 */
535 loaddebug(next, 6);
536 loaddebug(next, 7);
537 }
538
539
540 /*
541 * Handle the IO bitmap
542 */
543 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
544 if (next->io_bitmap_ptr)
545 /*
546 * Copy the relevant range of the IO bitmap.
547 * Normally this is 128 bytes or less:
548 */
549 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
550 max(prev->io_bitmap_max, next->io_bitmap_max));
551 else {
552 /*
553 * Clear any possible leftover bits:
554 */
555 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
556 }
557 }
558
559 return prev_p;
560}
561
562/*
563 * sys_execve() executes a new program.
564 */
565asmlinkage
566long sys_execve(char __user *name, char __user * __user *argv,
567 char __user * __user *envp, struct pt_regs regs)
568{
569 long error;
570 char * filename;
571
572 filename = getname(name);
573 error = PTR_ERR(filename);
574 if (IS_ERR(filename))
575 return error;
576 error = do_execve(filename, argv, envp, &regs);
577 if (error == 0) {
578 task_lock(current);
579 current->ptrace &= ~PT_DTRACE;
580 task_unlock(current);
581 }
582 putname(filename);
583 return error;
584}
585
586void set_personality_64bit(void)
587{
588 /* inherit personality from parent */
589
590 /* Make sure to be in 64bit mode */
591 clear_thread_flag(TIF_IA32);
592
593 /* TBD: overwrites user setup. Should have two bits.
594 But 64bit processes have always behaved this way,
595 so it's not too bad. The main problem is just that
596 32bit children are affected again. */
597 current->personality &= ~READ_IMPLIES_EXEC;
598}
599
600asmlinkage long sys_fork(struct pt_regs *regs)
601{
602 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
603}
604
605asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
606{
607 if (!newsp)
608 newsp = regs->rsp;
609 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
610}
611
612/*
613 * This is trivial, and on the face of it looks like it
614 * could equally well be done in user mode.
615 *
616 * Not so, for quite unobvious reasons - register pressure.
617 * In user mode vfork() cannot have a stack frame, and if
618 * done by calling the "clone()" system call directly, you
619 * do not have enough call-clobbered registers to hold all
620 * the information you need.
621 */
622asmlinkage long sys_vfork(struct pt_regs *regs)
623{
624 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
625 NULL, NULL);
626}
627
628unsigned long get_wchan(struct task_struct *p)
629{
630 unsigned long stack;
631 u64 fp,rip;
632 int count = 0;
633
634 if (!p || p == current || p->state==TASK_RUNNING)
635 return 0;
636 stack = (unsigned long)p->thread_info;
637 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
638 return 0;
639 fp = *(u64 *)(p->thread.rsp);
640 do {
641 if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
642 return 0;
643 rip = *(u64 *)(fp+8);
644 if (!in_sched_functions(rip))
645 return rip;
646 fp = *(u64 *)fp;
647 } while (count++ < 16);
648 return 0;
649}
650
651long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
652{
653 int ret = 0;
654 int doit = task == current;
655 int cpu;
656
657 switch (code) {
658 case ARCH_SET_GS:
659 if (addr >= TASK_SIZE)
660 return -EPERM;
661 cpu = get_cpu();
662 /* handle small bases via the GDT because that's faster to
663 switch. */
664 if (addr <= 0xffffffff) {
665 set_32bit_tls(task, GS_TLS, addr);
666 if (doit) {
667 load_TLS(&task->thread, cpu);
668 load_gs_index(GS_TLS_SEL);
669 }
670 task->thread.gsindex = GS_TLS_SEL;
671 task->thread.gs = 0;
672 } else {
673 task->thread.gsindex = 0;
674 task->thread.gs = addr;
675 if (doit) {
676 load_gs_index(0);
677 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
678 }
679 }
680 put_cpu();
681 break;
682 case ARCH_SET_FS:
683 /* Not strictly needed for fs, but do it for symmetry
684 with gs */
685 if (addr >= TASK_SIZE)
686 return -EPERM;
687 cpu = get_cpu();
688 /* handle small bases via the GDT because that's faster to
689 switch. */
690 if (addr <= 0xffffffff) {
691 set_32bit_tls(task, FS_TLS, addr);
692 if (doit) {
693 load_TLS(&task->thread, cpu);
694 asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
695 }
696 task->thread.fsindex = FS_TLS_SEL;
697 task->thread.fs = 0;
698 } else {
699 task->thread.fsindex = 0;
700 task->thread.fs = addr;
701 if (doit) {
702 /* set the selector to 0 to not confuse
703 __switch_to */
704 asm volatile("movl %0,%%fs" :: "r" (0));
705 ret = checking_wrmsrl(MSR_FS_BASE, addr);
706 }
707 }
708 put_cpu();
709 break;
710 case ARCH_GET_FS: {
711 unsigned long base;
712 if (task->thread.fsindex == FS_TLS_SEL)
713 base = read_32bit_tls(task, FS_TLS);
714 else if (doit) {
715 rdmsrl(MSR_FS_BASE, base);
716 } else
717 base = task->thread.fs;
718 ret = put_user(base, (unsigned long __user *)addr);
719 break;
720 }
721 case ARCH_GET_GS: {
722 unsigned long base;
723 if (task->thread.gsindex == GS_TLS_SEL)
724 base = read_32bit_tls(task, GS_TLS);
725 else if (doit) {
726 rdmsrl(MSR_KERNEL_GS_BASE, base);
727 } else
728 base = task->thread.gs;
729 ret = put_user(base, (unsigned long __user *)addr);
730 break;
731 }
732
733 default:
734 ret = -EINVAL;
735 break;
736 }
737
738 return ret;
739}
740
741long sys_arch_prctl(int code, unsigned long addr)
742{
743 return do_arch_prctl(current, code, addr);
744}
745
746/*
747 * Capture the user space registers if the task is not running (in user space)
748 */
749int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
750{
751 struct pt_regs *pp, ptregs;
752
753 pp = (struct pt_regs *)(tsk->thread.rsp0);
754 --pp;
755
756 ptregs = *pp;
757 ptregs.cs &= 0xffff;
758 ptregs.ss &= 0xffff;
759
760 elf_core_copy_regs(regs, &ptregs);
761
762 return 1;
763}
764
765unsigned long arch_align_stack(unsigned long sp)
766{
767 if (randomize_va_space)
768 sp -= get_random_int() % 8192;
769 return sp & ~0xf;
770}
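
do_arch_prctl() above is reachable from user space through the arch_prctl system call. The following is a hedged user-space sketch, assuming the ARCH_SET_GS/ARCH_GET_GS constants from <asm/prctl.h> and the SYS_arch_prctl number from <sys/syscall.h>; it uses the GS base (which user space normally leaves free) rather than FS, since glibc keeps thread-local storage in FS.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

int main(void)
{
        unsigned long dummy[4];         /* any user address below TASK_SIZE works */
        unsigned long base = 0;

        /* takes the "large base" path above when &dummy is above 4GB */
        if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)dummy) < 0) {
                perror("arch_prctl(ARCH_SET_GS)");
                return 1;
        }
        /* reads the saved base back (MSR_KERNEL_GS_BASE path for current) */
        if (syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base) < 0) {
                perror("arch_prctl(ARCH_GET_GS)");
                return 1;
        }
        printf("GS base set to %p, read back %#lx\n", (void *)dummy, base);
        return 0;
}
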
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
new file mode 100644
index 000000000000..0b7b101debdf
--- /dev/null
+++ b/arch/x86_64/kernel/ptrace.c
@@ -0,0 +1,547 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * x86-64 port 2000-2002 Andi Kleen
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/smp_lock.h>
15#include <linux/errno.h>
16#include <linux/ptrace.h>
17#include <linux/user.h>
18#include <linux/security.h>
19#include <linux/audit.h>
20#include <linux/seccomp.h>
21
22#include <asm/uaccess.h>
23#include <asm/pgtable.h>
24#include <asm/system.h>
25#include <asm/processor.h>
26#include <asm/i387.h>
27#include <asm/debugreg.h>
28#include <asm/ldt.h>
29#include <asm/desc.h>
30#include <asm/proto.h>
31#include <asm/ia32.h>
32
33/*
34 * does not yet catch signals sent when the child dies
35 * in exit.c or in signal.c.
36 */
37
38/* determines which flags the user has access to. */
39/* 1 = access 0 = no access */
40#define FLAG_MASK 0x44dd5UL
41
42/* sets the trap flag. */
43#define TRAP_FLAG 0x100UL
44
45/*
46 * eflags and offset of eflags on child stack..
47 */
48#define EFLAGS offsetof(struct pt_regs, eflags)
49#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
50
51/*
52 * this routine will get a word off of the process's privileged stack.
53 * the offset is how far from the base addr as stored in the TSS.
54 * this routine assumes that all the privileged stacks are in our
55 * data space.
56 */
57static inline unsigned long get_stack_long(struct task_struct *task, int offset)
58{
59 unsigned char *stack;
60
61 stack = (unsigned char *)task->thread.rsp0;
62 stack += offset;
63 return (*((unsigned long *)stack));
64}
65
66/*
67 * this routine will put a word on the process's privileged stack.
68 * the offset is how far from the base addr as stored in the TSS.
69 * this routine assumes that all the privileged stacks are in our
70 * data space.
71 */
72static inline long put_stack_long(struct task_struct *task, int offset,
73 unsigned long data)
74{
75 unsigned char * stack;
76
77 stack = (unsigned char *) task->thread.rsp0;
78 stack += offset;
79 *(unsigned long *) stack = data;
80 return 0;
81}
82
83/*
84 * Called by kernel/ptrace.c when detaching..
85 *
86 * Make sure the single step bit is not set.
87 */
88void ptrace_disable(struct task_struct *child)
89{
90 long tmp;
91
92 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
93 tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
94 put_stack_long(child, EFL_OFFSET, tmp);
95}
96
97static int putreg(struct task_struct *child,
98 unsigned long regno, unsigned long value)
99{
100 unsigned long tmp;
101
102 /* Some code in the 64bit emulation may not be 64bit clean.
103 Don't take any chances. */
104 if (test_tsk_thread_flag(child, TIF_IA32))
105 value &= 0xffffffff;
106 switch (regno) {
107 case offsetof(struct user_regs_struct,fs):
108 if (value && (value & 3) != 3)
109 return -EIO;
110 child->thread.fsindex = value & 0xffff;
111 return 0;
112 case offsetof(struct user_regs_struct,gs):
113 if (value && (value & 3) != 3)
114 return -EIO;
115 child->thread.gsindex = value & 0xffff;
116 return 0;
117 case offsetof(struct user_regs_struct,ds):
118 if (value && (value & 3) != 3)
119 return -EIO;
120 child->thread.ds = value & 0xffff;
121 return 0;
122 case offsetof(struct user_regs_struct,es):
123 if (value && (value & 3) != 3)
124 return -EIO;
125 child->thread.es = value & 0xffff;
126 return 0;
127 case offsetof(struct user_regs_struct,ss):
128 if ((value & 3) != 3)
129 return -EIO;
130 value &= 0xffff;
131 return 0;
132 case offsetof(struct user_regs_struct,fs_base):
133 if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
134 return -EIO;
135 child->thread.fs = value;
136 return 0;
137 case offsetof(struct user_regs_struct,gs_base):
138 if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
139 return -EIO;
140 child->thread.gs = value;
141 return 0;
142 case offsetof(struct user_regs_struct, eflags):
143 value &= FLAG_MASK;
144 tmp = get_stack_long(child, EFL_OFFSET);
145 tmp &= ~FLAG_MASK;
146 value |= tmp;
147 break;
148 case offsetof(struct user_regs_struct,cs):
149 if ((value & 3) != 3)
150 return -EIO;
151 value &= 0xffff;
152 break;
153 }
154 put_stack_long(child, regno - sizeof(struct pt_regs), value);
155 return 0;
156}
157
158static unsigned long getreg(struct task_struct *child, unsigned long regno)
159{
160 unsigned long val;
161 switch (regno) {
162 case offsetof(struct user_regs_struct, fs):
163 return child->thread.fsindex;
164 case offsetof(struct user_regs_struct, gs):
165 return child->thread.gsindex;
166 case offsetof(struct user_regs_struct, ds):
167 return child->thread.ds;
168 case offsetof(struct user_regs_struct, es):
169 return child->thread.es;
170 case offsetof(struct user_regs_struct, fs_base):
171 return child->thread.fs;
172 case offsetof(struct user_regs_struct, gs_base):
173 return child->thread.gs;
174 default:
175 regno = regno - sizeof(struct pt_regs);
176 val = get_stack_long(child, regno);
177 if (test_tsk_thread_flag(child, TIF_IA32))
178 val &= 0xffffffff;
179 return val;
180 }
181
182}
183
184asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
185{
186 struct task_struct *child;
187 long i, ret;
188 unsigned ui;
189
190 /* This lock_kernel fixes a subtle race with suid exec */
191 lock_kernel();
192 ret = -EPERM;
193 if (request == PTRACE_TRACEME) {
194 /* are we already being traced? */
195 if (current->ptrace & PT_PTRACED)
196 goto out;
197 ret = security_ptrace(current->parent, current);
198 if (ret)
199 goto out;
200 /* set the ptrace bit in the process flags. */
201 current->ptrace |= PT_PTRACED;
202 ret = 0;
203 goto out;
204 }
205 ret = -ESRCH;
206 read_lock(&tasklist_lock);
207 child = find_task_by_pid(pid);
208 if (child)
209 get_task_struct(child);
210 read_unlock(&tasklist_lock);
211 if (!child)
212 goto out;
213
214 ret = -EPERM;
215 if (pid == 1) /* you may not mess with init */
216 goto out_tsk;
217
218 if (request == PTRACE_ATTACH) {
219 ret = ptrace_attach(child);
220 goto out_tsk;
221 }
222 ret = ptrace_check_attach(child, request == PTRACE_KILL);
223 if (ret < 0)
224 goto out_tsk;
225
226 switch (request) {
227 /* when I and D space are separate, these will need to be fixed. */
228 case PTRACE_PEEKTEXT: /* read word at location addr. */
229 case PTRACE_PEEKDATA: {
230 unsigned long tmp;
231 int copied;
232
233 copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
234 ret = -EIO;
235 if (copied != sizeof(tmp))
236 break;
237 ret = put_user(tmp,(unsigned long __user *) data);
238 break;
239 }
240
241 /* read the word at location addr in the USER area. */
242 case PTRACE_PEEKUSR: {
243 unsigned long tmp;
244
245 ret = -EIO;
246 if ((addr & 7) ||
247 addr > sizeof(struct user) - 7)
248 break;
249
250 switch (addr) {
251 case 0 ... sizeof(struct user_regs_struct):
252 tmp = getreg(child, addr);
253 break;
254 case offsetof(struct user, u_debugreg[0]):
255 tmp = child->thread.debugreg0;
256 break;
257 case offsetof(struct user, u_debugreg[1]):
258 tmp = child->thread.debugreg1;
259 break;
260 case offsetof(struct user, u_debugreg[2]):
261 tmp = child->thread.debugreg2;
262 break;
263 case offsetof(struct user, u_debugreg[3]):
264 tmp = child->thread.debugreg3;
265 break;
266 case offsetof(struct user, u_debugreg[6]):
267 tmp = child->thread.debugreg6;
268 break;
269 case offsetof(struct user, u_debugreg[7]):
270 tmp = child->thread.debugreg7;
271 break;
272 default:
273 tmp = 0;
274 break;
275 }
276 ret = put_user(tmp,(unsigned long __user *) data);
277 break;
278 }
279
280 /* when I and D space are separate, this will have to be fixed. */
281 case PTRACE_POKETEXT: /* write the word at location addr. */
282 case PTRACE_POKEDATA:
283 ret = 0;
284 if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
285 break;
286 ret = -EIO;
287 break;
288
289 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
290 ret = -EIO;
291 if ((addr & 7) ||
292 addr > sizeof(struct user) - 7)
293 break;
294
295 switch (addr) {
296 case 0 ... sizeof(struct user_regs_struct):
297 ret = putreg(child, addr, data);
298 break;
299 /* Disallow setting a breakpoint in the vsyscall area */
300 case offsetof(struct user, u_debugreg[0]):
301 if (data >= TASK_SIZE-7) break;
302 child->thread.debugreg0 = data;
303 ret = 0;
304 break;
305 case offsetof(struct user, u_debugreg[1]):
306 if (data >= TASK_SIZE-7) break;
307 child->thread.debugreg1 = data;
308 ret = 0;
309 break;
310 case offsetof(struct user, u_debugreg[2]):
311 if (data >= TASK_SIZE-7) break;
312 child->thread.debugreg2 = data;
313 ret = 0;
314 break;
315 case offsetof(struct user, u_debugreg[3]):
316 if (data >= TASK_SIZE-7) break;
317 child->thread.debugreg3 = data;
318 ret = 0;
319 break;
320 case offsetof(struct user, u_debugreg[6]):
321 if (data >> 32)
322 break;
323 child->thread.debugreg6 = data;
324 ret = 0;
325 break;
326 case offsetof(struct user, u_debugreg[7]):
327 /* See arch/i386/kernel/ptrace.c for an explanation of
328 * this awkward check.*/
329 data &= ~DR_CONTROL_RESERVED;
330 for(i=0; i<4; i++)
331 if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
332 break;
333 if (i == 4) {
334 child->thread.debugreg7 = data;
335 ret = 0;
336 }
337 break;
338 }
339 break;
340 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
341 case PTRACE_CONT: { /* restart after signal. */
342 long tmp;
343
344 ret = -EIO;
345 if ((unsigned long) data > _NSIG)
346 break;
347 if (request == PTRACE_SYSCALL)
348 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
349 else
350 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
351 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
352 child->exit_code = data;
353 /* make sure the single step bit is not set. */
354 tmp = get_stack_long(child, EFL_OFFSET);
355 tmp &= ~TRAP_FLAG;
356 put_stack_long(child, EFL_OFFSET,tmp);
357 wake_up_process(child);
358 ret = 0;
359 break;
360 }
361
362#ifdef CONFIG_IA32_EMULATION
363 /* This only makes sense with 32bit programs. Allow a
364 64bit debugger to fully examine them too. Better
365 not to use it against 64bit processes; use
366 PTRACE_ARCH_PRCTL instead. */
367 case PTRACE_SET_THREAD_AREA: {
368 struct user_desc __user *p;
369 int old;
370 p = (struct user_desc __user *)data;
371 get_user(old, &p->entry_number);
372 put_user(addr, &p->entry_number);
373 ret = do_set_thread_area(&child->thread, p);
374 put_user(old, &p->entry_number);
375 break;
376 case PTRACE_GET_THREAD_AREA:
377 p = (struct user_desc __user *)data;
378 get_user(old, &p->entry_number);
379 put_user(addr, &p->entry_number);
380 ret = do_get_thread_area(&child->thread, p);
381 put_user(old, &p->entry_number);
382 break;
383 }
384#endif
385 /* normal 64bit interface to access TLS data.
386 Works just like arch_prctl, except that the arguments
387 are reversed. */
388 case PTRACE_ARCH_PRCTL:
389 ret = do_arch_prctl(child, data, addr);
390 break;
391
392/*
393 * make the child exit. Best I can do is send it a sigkill.
394 * perhaps it should be put in the status that it wants to
395 * exit.
396 */
397 case PTRACE_KILL: {
398 long tmp;
399
400 ret = 0;
401 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
402 break;
403 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
404 child->exit_code = SIGKILL;
405 /* make sure the single step bit is not set. */
406 tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
407 put_stack_long(child, EFL_OFFSET, tmp);
408 wake_up_process(child);
409 break;
410 }
411
412 case PTRACE_SINGLESTEP: { /* set the trap flag. */
413 long tmp;
414
415 ret = -EIO;
416 if ((unsigned long) data > _NSIG)
417 break;
418 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
419 if ((child->ptrace & PT_DTRACE) == 0) {
420 /* Spurious delayed TF traps may occur */
421 child->ptrace |= PT_DTRACE;
422 }
423 tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG;
424 put_stack_long(child, EFL_OFFSET, tmp);
425 set_tsk_thread_flag(child, TIF_SINGLESTEP);
426 child->exit_code = data;
427 /* give it a chance to run. */
428 wake_up_process(child);
429 ret = 0;
430 break;
431 }
432
433 case PTRACE_DETACH:
434 /* detach a process that was attached. */
435 ret = ptrace_detach(child, data);
436 break;
437
438 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
439 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
440 sizeof(struct user_regs_struct))) {
441 ret = -EIO;
442 break;
443 }
444 ret = 0;
445 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
446 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
447 data += sizeof(long);
448 }
449 break;
450 }
451
452 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
453 unsigned long tmp;
454 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
455 sizeof(struct user_regs_struct))) {
456 ret = -EIO;
457 break;
458 }
459 ret = 0;
460 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
461 ret |= __get_user(tmp, (unsigned long __user *) data);
462 putreg(child, ui, tmp);
463 data += sizeof(long);
464 }
465 break;
466 }
467
468 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
469 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
470 sizeof(struct user_i387_struct))) {
471 ret = -EIO;
472 break;
473 }
474 ret = get_fpregs((struct user_i387_struct __user *)data, child);
475 break;
476 }
477
478 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
479 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
480 sizeof(struct user_i387_struct))) {
481 ret = -EIO;
482 break;
483 }
484 set_stopped_child_used_math(child);
485 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
486 break;
487 }
488
489 default:
490 ret = ptrace_request(child, request, addr, data);
491 break;
492 }
493out_tsk:
494 put_task_struct(child);
495out:
496 unlock_kernel();
497 return ret;
498}
499
500static void syscall_trace(struct pt_regs *regs)
501{
502
503#if 0
504 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
505 current->comm,
506 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
507 current_thread_info()->flags, current->ptrace);
508#endif
509
510 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
511 ? 0x80 : 0));
512 /*
513 * this isn't the same as continuing with a signal, but it will do
514 * for normal use. strace only continues with a signal if the
515 * stopping signal is not SIGTRAP. -brl
516 */
517 if (current->exit_code) {
518 send_sig(current->exit_code, current, 1);
519 current->exit_code = 0;
520 }
521}
522
523asmlinkage void syscall_trace_enter(struct pt_regs *regs)
524{
525 /* do the secure computing check first */
526 secure_computing(regs->orig_rax);
527
528 if (unlikely(current->audit_context))
529 audit_syscall_entry(current, regs->orig_rax,
530 regs->rdi, regs->rsi,
531 regs->rdx, regs->r10);
532
533 if (test_thread_flag(TIF_SYSCALL_TRACE)
534 && (current->ptrace & PT_PTRACED))
535 syscall_trace(regs);
536}
537
538asmlinkage void syscall_trace_leave(struct pt_regs *regs)
539{
540 if (unlikely(current->audit_context))
541 audit_syscall_exit(current, regs->rax);
542
543 if ((test_thread_flag(TIF_SYSCALL_TRACE)
544 || test_thread_flag(TIF_SINGLESTEP))
545 && (current->ptrace & PT_PTRACED))
546 syscall_trace(regs);
547}
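
The PTRACE_GETREGS case above copies the register block that getreg() exposes out to a tracer. Below is a short user-space sketch of that request; it relies only on the standard glibc ptrace() wrapper and the x86-64 struct user_regs_struct field names (rip, rsp), and keeps error handling minimal.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {                 /* child: ask to be traced, then stop */
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);
                _exit(0);
        }

        waitpid(pid, NULL, 0);          /* wait for the child to stop */

        struct user_regs_struct regs;
        if (ptrace(PTRACE_GETREGS, pid, NULL, &regs) == 0)
                printf("child rip=%#llx rsp=%#llx\n",
                       (unsigned long long)regs.rip,
                       (unsigned long long)regs.rsp);

        ptrace(PTRACE_DETACH, pid, NULL, NULL); /* let the child run to exit */
        waitpid(pid, NULL, 0);
        return 0;
}
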
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
new file mode 100644
index 000000000000..be4b36f762cf
--- /dev/null
+++ b/arch/x86_64/kernel/reboot.c
@@ -0,0 +1,163 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <asm/io.h>
10#include <asm/kdebug.h>
11#include <asm/delay.h>
12#include <asm/hw_irq.h>
13#include <asm/system.h>
14#include <asm/pgtable.h>
15#include <asm/tlbflush.h>
16#include <asm/apic.h>
17
18/*
19 * Power off function, if any
20 */
21void (*pm_power_off)(void);
22
23static long no_idt[3];
24static enum {
25 BOOT_TRIPLE = 't',
26 BOOT_KBD = 'k'
27} reboot_type = BOOT_KBD;
28static int reboot_mode = 0;
29int reboot_force;
30
31/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] [, f[orce]]
32 warm Don't set the cold reboot flag
33 cold Set the cold reboot flag
34 triple Force a triple fault (init)
35 kbd Use the keyboard controller for a cold reset (default)
36 force Avoid anything that could hang.
37 */
38static int __init reboot_setup(char *str)
39{
40 for (;;) {
41 switch (*str) {
42 case 'w':
43 reboot_mode = 0x1234;
44 break;
45
46 case 'c':
47 reboot_mode = 0;
48 break;
49
50 case 't':
51 case 'b':
52 case 'k':
53 reboot_type = *str;
54 break;
55 case 'f':
56 reboot_force = 1;
57 break;
58 }
59 if((str = strchr(str,',')) != NULL)
60 str++;
61 else
62 break;
63 }
64 return 1;
65}
66
67__setup("reboot=", reboot_setup);
68
69#ifdef CONFIG_SMP
70static void smp_halt(void)
71{
72 int cpuid = safe_smp_processor_id();
73 static int first_entry = 1;
74
75 if (reboot_force)
76 return;
77
78 if (first_entry) {
79 first_entry = 0;
80 smp_call_function((void *)machine_restart, NULL, 1, 0);
81 }
82
83 smp_stop_cpu();
84
85 /* AP calling this. Just halt */
86 if (cpuid != boot_cpu_id) {
87 for (;;)
88 asm("hlt");
89 }
90
91 /* Wait for all other CPUs to have run smp_stop_cpu */
92 while (!cpus_empty(cpu_online_map))
93 rep_nop();
94}
95#endif
96
97static inline void kb_wait(void)
98{
99 int i;
100
101 for (i=0; i<0x10000; i++)
102 if ((inb_p(0x64) & 0x02) == 0)
103 break;
104}
105
106void machine_restart(char * __unused)
107{
108 int i;
109
110 printk("machine restart\n");
111
112#ifdef CONFIG_SMP
113 smp_halt();
114#endif
115
116 if (!reboot_force) {
117 local_irq_disable();
118#ifndef CONFIG_SMP
119 disable_local_APIC();
120#endif
121 disable_IO_APIC();
122 local_irq_enable();
123 }
124
125 /* Tell the BIOS if we want cold or warm reboot */
126 *((unsigned short *)__va(0x472)) = reboot_mode;
127
128 for (;;) {
129 /* Could also try the reset bit in the Hammer NB */
130 switch (reboot_type) {
131 case BOOT_KBD:
132 for (i=0; i<100; i++) {
133 kb_wait();
134 udelay(50);
135 outb(0xfe,0x64); /* pulse reset low */
136 udelay(50);
137 }
138
139 case BOOT_TRIPLE:
140 __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
141 __asm__ __volatile__("int3");
142
143 reboot_type = BOOT_KBD;
144 break;
145 }
146 }
147}
148
149EXPORT_SYMBOL(machine_restart);
150
151void machine_halt(void)
152{
153}
154
155EXPORT_SYMBOL(machine_halt);
156
157void machine_power_off(void)
158{
159 if (pm_power_off)
160 pm_power_off();
161}
162
163EXPORT_SYMBOL(machine_power_off);
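
reboot_setup() above keys on the first letter of each comma-separated token. The stand-alone sketch below exercises the same walk outside the kernel; the reboot_mode/reboot_type/reboot_force variables are only local stand-ins for the kernel globals.

#include <stdio.h>
#include <string.h>

static int reboot_mode, reboot_force;
static char reboot_type = 'k';          /* BOOT_KBD is the default */

static void parse_reboot_opts(char *str)
{
        for (;;) {
                switch (*str) {
                case 'w': reboot_mode = 0x1234; break;  /* warm */
                case 'c': reboot_mode = 0;      break;  /* cold */
                case 't':
                case 'k': reboot_type = *str;   break;  /* triple / kbd */
                case 'f': reboot_force = 1;     break;
                }
                str = strchr(str, ',');         /* advance to the next token */
                if (!str)
                        break;
                str++;
        }
}

int main(void)
{
        char opts[] = "triple,warm,force";

        parse_reboot_opts(opts);
        printf("type=%c mode=%#x force=%d\n", reboot_type, reboot_mode, reboot_force);
        return 0;
}
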
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
new file mode 100644
index 000000000000..48f7c18172b9
--- /dev/null
+++ b/arch/x86_64/kernel/semaphore.c
@@ -0,0 +1,180 @@
1/*
2 * x86_64 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15#include <linux/config.h>
16#include <linux/sched.h>
17#include <linux/init.h>
18#include <asm/errno.h>
19
20#include <asm/semaphore.h>
21
22/*
23 * Semaphores are implemented using a two-way counter:
24 * The "count" variable is decremented for each process
25 * that tries to acquire the semaphore, while the "sleeping"
26 * variable is a count of such acquires.
27 *
28 * Notably, the inline "up()" and "down()" functions can
29 * efficiently test if they need to do any extra work (up
30 * needs to do something only if count was negative before
31 * the increment operation).
32 *
33 * "sleeping" and the contention routine ordering is protected
34 * by the spinlock in the semaphore's waitqueue head.
35 *
36 * Note that these functions are only called when there is
37 * contention on the lock, and as such all this is the
38 * "non-critical" part of the whole semaphore business. The
39 * critical part is the inline stuff in <asm/semaphore.h>
40 * where we want to avoid any extra jumps and calls.
41 */
42
43/*
44 * Logic:
45 * - only on a boundary condition do we need to care. When we go
46 * from a negative count to a non-negative, we wake people up.
47 * - when we go from a non-negative count to a negative do we
48 * (a) synchronize with the "sleeper" count and (b) make sure
49 * that we're on the wakeup list before we synchronize so that
50 * we cannot lose wakeup events.
51 */
52
53void __up(struct semaphore *sem)
54{
55 wake_up(&sem->wait);
56}
57
58void __sched __down(struct semaphore * sem)
59{
60 struct task_struct *tsk = current;
61 DECLARE_WAITQUEUE(wait, tsk);
62 unsigned long flags;
63
64 tsk->state = TASK_UNINTERRUPTIBLE;
65 spin_lock_irqsave(&sem->wait.lock, flags);
66 add_wait_queue_exclusive_locked(&sem->wait, &wait);
67
68 sem->sleepers++;
69 for (;;) {
70 int sleepers = sem->sleepers;
71
72 /*
73 * Add "everybody else" into it. They aren't
74 * playing, because we own the spinlock in
75 * the wait_queue_head.
76 */
77 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
78 sem->sleepers = 0;
79 break;
80 }
81 sem->sleepers = 1; /* us - see -1 above */
82 spin_unlock_irqrestore(&sem->wait.lock, flags);
83
84 schedule();
85
86 spin_lock_irqsave(&sem->wait.lock, flags);
87 tsk->state = TASK_UNINTERRUPTIBLE;
88 }
89 remove_wait_queue_locked(&sem->wait, &wait);
90 wake_up_locked(&sem->wait);
91 spin_unlock_irqrestore(&sem->wait.lock, flags);
92 tsk->state = TASK_RUNNING;
93}
94
95int __sched __down_interruptible(struct semaphore * sem)
96{
97 int retval = 0;
98 struct task_struct *tsk = current;
99 DECLARE_WAITQUEUE(wait, tsk);
100 unsigned long flags;
101
102 tsk->state = TASK_INTERRUPTIBLE;
103 spin_lock_irqsave(&sem->wait.lock, flags);
104 add_wait_queue_exclusive_locked(&sem->wait, &wait);
105
106 sem->sleepers++;
107 for (;;) {
108 int sleepers = sem->sleepers;
109
110 /*
111 * With signals pending, this turns into
112 * the trylock failure case - we won't be
113 * sleeping, and we can't get the lock as
114 * it has contention. Just correct the count
115 * and exit.
116 */
117 if (signal_pending(current)) {
118 retval = -EINTR;
119 sem->sleepers = 0;
120 atomic_add(sleepers, &sem->count);
121 break;
122 }
123
124 /*
125 * Add "everybody else" into it. They aren't
126 * playing, because we own the spinlock in
127 * wait_queue_head. The "-1" is because we're
128 * still hoping to get the semaphore.
129 */
130 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
131 sem->sleepers = 0;
132 break;
133 }
134 sem->sleepers = 1; /* us - see -1 above */
135 spin_unlock_irqrestore(&sem->wait.lock, flags);
136
137 schedule();
138
139 spin_lock_irqsave(&sem->wait.lock, flags);
140 tsk->state = TASK_INTERRUPTIBLE;
141 }
142 remove_wait_queue_locked(&sem->wait, &wait);
143 wake_up_locked(&sem->wait);
144 spin_unlock_irqrestore(&sem->wait.lock, flags);
145
146 tsk->state = TASK_RUNNING;
147 return retval;
148}
149
150/*
151 * Trylock failed - make sure we correct for
152 * having decremented the count.
153 *
154 * We could have done the trylock with a
155 * single "cmpxchg" without failure cases,
156 * but then it wouldn't work on a 386.
157 */
158int __down_trylock(struct semaphore * sem)
159{
160 int sleepers;
161 unsigned long flags;
162
163 spin_lock_irqsave(&sem->wait.lock, flags);
164 sleepers = sem->sleepers + 1;
165 sem->sleepers = 0;
166
167 /*
168 * Add "everybody else" and us into it. They aren't
169 * playing, because we own the spinlock in the
170 * wait_queue_head.
171 */
172 if (!atomic_add_negative(sleepers, &sem->count)) {
173 wake_up_locked(&sem->wait);
174 }
175
176 spin_unlock_irqrestore(&sem->wait.lock, flags);
177 return 1;
178}
179
180
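
The count/sleepers bookkeeping described in the comments above is easiest to see on the boundary case of one holder and one contender. The single-threaded model below replays that case with plain integers; add_negative() stands in for atomic_add_negative() and there is no real blocking or wait queue.

#include <stdio.h>

static int count = 1;           /* free, as after sema_init(&sem, 1) */
static int sleepers;

static int add_negative(int i)  /* models atomic_add_negative() */
{
        count += i;
        return count < 0;
}

int main(void)
{
        count--;                /* A: down() fast path, count 1 -> 0, acquired */
        printf("A holds, count=%d\n", count);

        count--;                /* B: down(), count 0 -> -1, enters __down() */
        sleepers++;             /* B: sleepers 0 -> 1 */
        if (!add_negative(sleepers - 1))        /* add 0: count stays -1 */
                sleepers = 0;
        else {
                sleepers = 1;
                printf("B sleeps, count=%d sleepers=%d\n", count, sleepers);
        }

        count++;                /* A: up(), count -1 -> 0; was negative, so __up() wakes B */
        printf("A releases, count=%d, wakes B\n", count);

        if (!add_negative(sleepers - 1)) {      /* B retries: add 0, count 0 is not negative */
                sleepers = 0;
                printf("B holds, count=%d\n", count);
        }
        return 0;
}
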
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
new file mode 100644
index 000000000000..e50fc41de6b9
--- /dev/null
+++ b/arch/x86_64/kernel/setup.c
@@ -0,0 +1,1189 @@
1/*
2 * linux/arch/x86-64/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Nov 2001 Dave Jones <davej@suse.de>
7 * Forked from i386 setup code.
8 *
9 * $Id$
10 */
11
12/*
13 * This file handles the architecture-dependent parts of initialization
14 */
15
16#include <linux/errno.h>
17#include <linux/sched.h>
18#include <linux/kernel.h>
19#include <linux/mm.h>
20#include <linux/stddef.h>
21#include <linux/unistd.h>
22#include <linux/ptrace.h>
23#include <linux/slab.h>
24#include <linux/user.h>
25#include <linux/a.out.h>
26#include <linux/tty.h>
27#include <linux/ioport.h>
28#include <linux/delay.h>
29#include <linux/config.h>
30#include <linux/init.h>
31#include <linux/initrd.h>
32#include <linux/highmem.h>
33#include <linux/bootmem.h>
34#include <linux/module.h>
35#include <asm/processor.h>
36#include <linux/console.h>
37#include <linux/seq_file.h>
38#include <linux/root_dev.h>
39#include <linux/pci.h>
40#include <linux/acpi.h>
41#include <linux/kallsyms.h>
42#include <linux/edd.h>
43#include <asm/mtrr.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46#include <asm/io.h>
47#include <asm/smp.h>
48#include <asm/msr.h>
49#include <asm/desc.h>
50#include <video/edid.h>
51#include <asm/e820.h>
52#include <asm/dma.h>
53#include <asm/mpspec.h>
54#include <asm/mmu_context.h>
55#include <asm/bootsetup.h>
56#include <asm/proto.h>
57#include <asm/setup.h>
58#include <asm/mach_apic.h>
59#include <asm/numa.h>
60
61/*
62 * Machine setup..
63 */
64
65struct cpuinfo_x86 boot_cpu_data;
66
67unsigned long mmu_cr4_features;
68
69int acpi_disabled;
70EXPORT_SYMBOL(acpi_disabled);
71#ifdef CONFIG_ACPI_BOOT
72extern int __initdata acpi_ht;
73extern acpi_interrupt_flags acpi_sci_flags;
74int __initdata acpi_force = 0;
75#endif
76
77int acpi_numa __initdata;
78
79/* For PCI or other memory-mapped resources */
80unsigned long pci_mem_start = 0x10000000;
81
82/* Boot loader ID as an integer, for the benefit of proc_dointvec */
83int bootloader_type;
84
85unsigned long saved_video_mode;
86
87#ifdef CONFIG_SWIOTLB
88int swiotlb;
89EXPORT_SYMBOL(swiotlb);
90#endif
91
92/*
93 * Setup options
94 */
95struct drive_info_struct { char dummy[32]; } drive_info;
96struct screen_info screen_info;
97struct sys_desc_table_struct {
98 unsigned short length;
99 unsigned char table[0];
100};
101
102struct edid_info edid_info;
103struct e820map e820;
104
105extern int root_mountflags;
106extern char _text, _etext, _edata, _end;
107
108char command_line[COMMAND_LINE_SIZE];
109
110struct resource standard_io_resources[] = {
111 { .name = "dma1", .start = 0x00, .end = 0x1f,
112 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
113 { .name = "pic1", .start = 0x20, .end = 0x21,
114 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
115 { .name = "timer0", .start = 0x40, .end = 0x43,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
117 { .name = "timer1", .start = 0x50, .end = 0x53,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
119 { .name = "keyboard", .start = 0x60, .end = 0x6f,
120 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
121 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
122 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
123 { .name = "pic2", .start = 0xa0, .end = 0xa1,
124 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
125 { .name = "dma2", .start = 0xc0, .end = 0xdf,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
127 { .name = "fpu", .start = 0xf0, .end = 0xff,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
129};
130
131#define STANDARD_IO_RESOURCES \
132 (sizeof standard_io_resources / sizeof standard_io_resources[0])
133
134#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
135
136struct resource data_resource = {
137 .name = "Kernel data",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_RAM,
141};
142struct resource code_resource = {
143 .name = "Kernel code",
144 .start = 0,
145 .end = 0,
146 .flags = IORESOURCE_RAM,
147};
148
149#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
150
151static struct resource system_rom_resource = {
152 .name = "System ROM",
153 .start = 0xf0000,
154 .end = 0xfffff,
155 .flags = IORESOURCE_ROM,
156};
157
158static struct resource extension_rom_resource = {
159 .name = "Extension ROM",
160 .start = 0xe0000,
161 .end = 0xeffff,
162 .flags = IORESOURCE_ROM,
163};
164
165static struct resource adapter_rom_resources[] = {
166 { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
167 .flags = IORESOURCE_ROM },
168 { .name = "Adapter ROM", .start = 0, .end = 0,
169 .flags = IORESOURCE_ROM },
170 { .name = "Adapter ROM", .start = 0, .end = 0,
171 .flags = IORESOURCE_ROM },
172 { .name = "Adapter ROM", .start = 0, .end = 0,
173 .flags = IORESOURCE_ROM },
174 { .name = "Adapter ROM", .start = 0, .end = 0,
175 .flags = IORESOURCE_ROM },
176 { .name = "Adapter ROM", .start = 0, .end = 0,
177 .flags = IORESOURCE_ROM }
178};
179
180#define ADAPTER_ROM_RESOURCES \
181 (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
182
183static struct resource video_rom_resource = {
184 .name = "Video ROM",
185 .start = 0xc0000,
186 .end = 0xc7fff,
187 .flags = IORESOURCE_ROM,
188};
189
190static struct resource video_ram_resource = {
191 .name = "Video RAM area",
192 .start = 0xa0000,
193 .end = 0xbffff,
194 .flags = IORESOURCE_RAM,
195};
196
197#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
198
199static int __init romchecksum(unsigned char *rom, unsigned long length)
200{
201 unsigned char *p, sum = 0;
202
203 for (p = rom; p < rom + length; p++)
204 sum += *p;
205 return sum == 0;
206}
207
208static void __init probe_roms(void)
209{
210 unsigned long start, length, upper;
211 unsigned char *rom;
212 int i;
213
214 /* video rom */
215 upper = adapter_rom_resources[0].start;
216 for (start = video_rom_resource.start; start < upper; start += 2048) {
217 rom = isa_bus_to_virt(start);
218 if (!romsignature(rom))
219 continue;
220
221 video_rom_resource.start = start;
222
223 /* 0 < length <= 0x7f * 512, historically */
224 length = rom[2] * 512;
225
226 /* if checksum okay, trust length byte */
227 if (length && romchecksum(rom, length))
228 video_rom_resource.end = start + length - 1;
229
230 request_resource(&iomem_resource, &video_rom_resource);
231 break;
232 }
233
234 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
235 if (start < upper)
236 start = upper;
237
238 /* system rom */
239 request_resource(&iomem_resource, &system_rom_resource);
240 upper = system_rom_resource.start;
241
242 /* check for extension rom (ignore length byte!) */
243 rom = isa_bus_to_virt(extension_rom_resource.start);
244 if (romsignature(rom)) {
245 length = extension_rom_resource.end - extension_rom_resource.start + 1;
246 if (romchecksum(rom, length)) {
247 request_resource(&iomem_resource, &extension_rom_resource);
248 upper = extension_rom_resource.start;
249 }
250 }
251
252 /* check for adapter roms on 2k boundaries */
253 for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
254 rom = isa_bus_to_virt(start);
255 if (!romsignature(rom))
256 continue;
257
258 /* 0 < length <= 0x7f * 512, historically */
259 length = rom[2] * 512;
260
261 /* but accept any length that fits if checksum okay */
262 if (!length || start + length > upper || !romchecksum(rom, length))
263 continue;
264
265 adapter_rom_resources[i].start = start;
266 adapter_rom_resources[i].end = start + length - 1;
267 request_resource(&iomem_resource, &adapter_rom_resources[i]);
268
269 start = adapter_rom_resources[i++].end & ~2047UL;
270 }
271}
272
273static __init void parse_cmdline_early (char ** cmdline_p)
274{
275 char c = ' ', *to = command_line, *from = COMMAND_LINE;
276 int len = 0;
277
278 /* Save unparsed command line copy for /proc/cmdline */
279 memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE);
280 saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
281
282 for (;;) {
283 if (c != ' ')
284 goto next_char;
285
286#ifdef CONFIG_SMP
287 /*
288 * If the BIOS enumerates physical processors before logical,
289 * maxcpus=N at enumeration-time can be used to disable HT.
290 */
291 else if (!memcmp(from, "maxcpus=", 8)) {
292 extern unsigned int maxcpus;
293
294 maxcpus = simple_strtoul(from + 8, NULL, 0);
295 }
296#endif
297#ifdef CONFIG_ACPI_BOOT
298 /* "acpi=off" disables both ACPI table parsing and interpreter init */
299 if (!memcmp(from, "acpi=off", 8))
300 disable_acpi();
301
302 if (!memcmp(from, "acpi=force", 10)) {
303 /* add later when we do DMI horrors: */
304 acpi_force = 1;
305 acpi_disabled = 0;
306 }
307
308 /* acpi=ht just means: do ACPI MADT parsing
309 at bootup, but don't enable the full ACPI interpreter */
310 if (!memcmp(from, "acpi=ht", 7)) {
311 if (!acpi_force)
312 disable_acpi();
313 acpi_ht = 1;
314 }
315 else if (!memcmp(from, "pci=noacpi", 10))
316 acpi_disable_pci();
317 else if (!memcmp(from, "acpi=noirq", 10))
318 acpi_noirq_set();
319
320 else if (!memcmp(from, "acpi_sci=edge", 13))
321 acpi_sci_flags.trigger = 1;
322 else if (!memcmp(from, "acpi_sci=level", 14))
323 acpi_sci_flags.trigger = 3;
324 else if (!memcmp(from, "acpi_sci=high", 13))
325 acpi_sci_flags.polarity = 1;
326 else if (!memcmp(from, "acpi_sci=low", 12))
327 acpi_sci_flags.polarity = 3;
328
329 /* acpi=strict disables out-of-spec workarounds */
330 else if (!memcmp(from, "acpi=strict", 11)) {
331 acpi_strict = 1;
332 }
333#endif
334
335 if (!memcmp(from, "nolapic", 7) ||
336 !memcmp(from, "disableapic", 11))
337 disable_apic = 1;
338
339 if (!memcmp(from, "noapic", 6))
340 skip_ioapic_setup = 1;
341
342 if (!memcmp(from, "apic", 4)) {
343 skip_ioapic_setup = 0;
344 ioapic_force = 1;
345 }
346
347 if (!memcmp(from, "mem=", 4))
348 parse_memopt(from+4, &from);
349
350#ifdef CONFIG_DISCONTIGMEM
351 if (!memcmp(from, "numa=", 5))
352 numa_setup(from+5);
353#endif
354
355#ifdef CONFIG_GART_IOMMU
356 if (!memcmp(from,"iommu=",6)) {
357 iommu_setup(from+6);
358 }
359#endif
360
361 if (!memcmp(from,"oops=panic", 10))
362 panic_on_oops = 1;
363
364 if (!memcmp(from, "noexec=", 7))
365 nonx_setup(from + 7);
366
367 next_char:
368 c = *(from++);
369 if (!c)
370 break;
371 if (COMMAND_LINE_SIZE <= ++len)
372 break;
373 *(to++) = c;
374 }
375 *to = '\0';
376 *cmdline_p = command_line;
377}
378
379#ifndef CONFIG_DISCONTIGMEM
380static void __init contig_initmem_init(void)
381{
382 unsigned long bootmap_size, bootmap;
383 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
384 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
385 if (bootmap == -1L)
386 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
387 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
388 e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT);
389 reserve_bootmem(bootmap, bootmap_size);
390}
391#endif
392
393/* Use inline assembly to define this because the nops are defined
394 as inline assembly strings in the include files and we cannot
395 easily get at them as C data. */
396asm("\t.data\nk8nops: "
397 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
398 K8_NOP7 K8_NOP8);
399
400extern unsigned char k8nops[];
401static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
402 NULL,
403 k8nops,
404 k8nops + 1,
405 k8nops + 1 + 2,
406 k8nops + 1 + 2 + 3,
407 k8nops + 1 + 2 + 3 + 4,
408 k8nops + 1 + 2 + 3 + 4 + 5,
409 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
410 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
411};
412
413/* Replace instructions with better alternatives for this CPU type.
414
415 This runs before SMP is initialized to avoid SMP problems with
416 self-modifying code. This implies that asymmetric systems where
417 APs have fewer capabilities than the boot processor are not handled.
418 In that case, boot with "noreplacement". */
419void apply_alternatives(void *start, void *end)
420{
421 struct alt_instr *a;
422 int diff, i, k;
423 for (a = start; (void *)a < end; a++) {
424 if (!boot_cpu_has(a->cpuid))
425 continue;
426
427 BUG_ON(a->replacementlen > a->instrlen);
428 __inline_memcpy(a->instr, a->replacement, a->replacementlen);
429 diff = a->instrlen - a->replacementlen;
430
431 /* Pad the rest with nops */
432 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
433 k = diff;
434 if (k > ASM_NOP_MAX)
435 k = ASM_NOP_MAX;
436 __inline_memcpy(a->instr + i, k8_nops[k], k);
437 }
438 }
439}
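The k8_nops[] table above is just an index into one long byte string: k8_nops[k] points at the start of the k-byte nop sequence, which is why the offsets are the running sums 1, 1+2, 1+2+3, and so on. The padding loop then chops any leftover gap into chunks of at most ASM_NOP_MAX bytes. A standalone sketch of that loop, with plain filler bytes standing in for the real K8 nop opcodes from the headers:

#include <stdio.h>
#include <string.h>

#define ASM_NOP_MAX 8

int main(void)
{
	unsigned char instr[16];
	int replacementlen = 5;                 /* illustrative replacement length */
	int instrlen = 16;                      /* illustrative original length    */
	int diff = instrlen - replacementlen;   /* 11 bytes of padding needed      */
	int i, k;

	memset(instr, 0x90, sizeof(instr));
	/* Same shape as the loop in apply_alternatives(): one multi-byte nop per chunk. */
	for (i = replacementlen; diff > 0; diff -= k, i += k) {
		k = diff;
		if (k > ASM_NOP_MAX)
			k = ASM_NOP_MAX;
		memset(instr + i, k, k);        /* stand-in for copying k8_nops[k] */
		printf("pad %d bytes with one %d-byte nop\n", k, k);
	}
	return 0;
}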
440
441static int no_replacement __initdata = 0;
442
443void __init alternative_instructions(void)
444{
445 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
446 if (no_replacement)
447 return;
448 apply_alternatives(__alt_instructions, __alt_instructions_end);
449}
450
451static int __init noreplacement_setup(char *s)
452{
453 no_replacement = 1;
454 return 0;
455}
456
457__setup("noreplacement", noreplacement_setup);
458
459#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
460struct edd edd;
461#ifdef CONFIG_EDD_MODULE
462EXPORT_SYMBOL(edd);
463#endif
464/**
465 * copy_edd() - Copy the BIOS EDD information
466 * from boot_params into a safe place.
467 *
468 */
469static inline void copy_edd(void)
470{
471 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
472 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
473 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
474 edd.edd_info_nr = EDD_NR;
475}
476#else
477static inline void copy_edd(void)
478{
479}
480#endif
481
482#define EBDA_ADDR_POINTER 0x40E
483static void __init reserve_ebda_region(void)
484{
485 unsigned int addr;
486 /*
487 * There is a real-mode segmented pointer pointing to the
488 * 4K EBDA area at 0x40E.
489 */
490 addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
491 addr <<= 4;
492 if (addr)
493 reserve_bootmem_generic(addr, PAGE_SIZE);
494}
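The conversion above is the usual real-mode rule: the word at physical 0x40E holds a segment value, and segment << 4 is the physical base of the EBDA. A tiny sketch with an illustrative segment value (0x9fc0 is typical of a BIOS that puts the EBDA just under 640K; it is not read from this code):

#include <stdio.h>

int main(void)
{
	unsigned int segment = 0x9fc0;      /* hypothetical word read from phys 0x40E */
	unsigned int addr = segment << 4;   /* real-mode segment -> physical address  */

	printf("EBDA base: 0x%x\n", addr);  /* 0x9fc00, i.e. just below 640K */
	return 0;
}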
495
496void __init setup_arch(char **cmdline_p)
497{
498 unsigned long low_mem_size;
499 unsigned long kernel_end;
500
501 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
502 drive_info = DRIVE_INFO;
503 screen_info = SCREEN_INFO;
504 edid_info = EDID_INFO;
505 saved_video_mode = SAVED_VIDEO_MODE;
506 bootloader_type = LOADER_TYPE;
507
508#ifdef CONFIG_BLK_DEV_RAM
509 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
510 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
511 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
512#endif
513 setup_memory_region();
514 copy_edd();
515
516 if (!MOUNT_ROOT_RDONLY)
517 root_mountflags &= ~MS_RDONLY;
518 init_mm.start_code = (unsigned long) &_text;
519 init_mm.end_code = (unsigned long) &_etext;
520 init_mm.end_data = (unsigned long) &_edata;
521 init_mm.brk = (unsigned long) &_end;
522
523 code_resource.start = virt_to_phys(&_text);
524 code_resource.end = virt_to_phys(&_etext)-1;
525 data_resource.start = virt_to_phys(&_etext);
526 data_resource.end = virt_to_phys(&_edata)-1;
527
528 parse_cmdline_early(cmdline_p);
529
530 early_identify_cpu(&boot_cpu_data);
531
532 /*
533 * partially used pages are not usable - thus
534 * we are rounding upwards:
535 */
536 end_pfn = e820_end_of_ram();
537
538 check_efer();
539
540 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
541
542#ifdef CONFIG_ACPI_BOOT
543 /*
544 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
545 * Call this early for SRAT node setup.
546 */
547 acpi_boot_table_init();
548#endif
549
550#ifdef CONFIG_ACPI_NUMA
551 /*
552 * Parse SRAT to discover nodes.
553 */
554 acpi_numa_init();
555#endif
556
557#ifdef CONFIG_DISCONTIGMEM
558 numa_initmem_init(0, end_pfn);
559#else
560 contig_initmem_init();
561#endif
562
563 /* Reserve direct mapping */
564 reserve_bootmem_generic(table_start << PAGE_SHIFT,
565 (table_end - table_start) << PAGE_SHIFT);
566
567 /* reserve kernel */
568 kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
569 reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
570
571 /*
572 * reserve physical page 0 - it's a special BIOS page on many boxes,
573 * enabling clean reboots, SMP operation, laptop functions.
574 */
575 reserve_bootmem_generic(0, PAGE_SIZE);
576
577 /* reserve ebda region */
578 reserve_ebda_region();
579
580#ifdef CONFIG_SMP
581 /*
582 * But first pinch a few for the stack/trampoline stuff
583 * FIXME: Don't need the extra page at 4K, but need to fix
584 * trampoline before removing it. (see the GDT stuff)
585 */
586 reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
587
588 /* Reserve SMP trampoline */
589 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
590#endif
591
592#ifdef CONFIG_ACPI_SLEEP
593 /*
594 * Reserve low memory region for sleep support.
595 */
596 acpi_reserve_bootmem();
597#endif
598#ifdef CONFIG_X86_LOCAL_APIC
599 /*
600 * Find and reserve possible boot-time SMP configuration:
601 */
602 find_smp_config();
603#endif
604#ifdef CONFIG_BLK_DEV_INITRD
605 if (LOADER_TYPE && INITRD_START) {
606 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
607 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
608 initrd_start =
609 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
610 initrd_end = initrd_start+INITRD_SIZE;
611 }
612 else {
613 printk(KERN_ERR "initrd extends beyond end of memory "
614 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
615 (unsigned long)(INITRD_START + INITRD_SIZE),
616 (unsigned long)(end_pfn << PAGE_SHIFT));
617 initrd_start = 0;
618 }
619 }
620#endif
621 paging_init();
622
623 check_ioapic();
624
625#ifdef CONFIG_ACPI_BOOT
626 /*
627 * Read APIC and some other early information from ACPI tables.
628 */
629 acpi_boot_init();
630#endif
631
632#ifdef CONFIG_X86_LOCAL_APIC
633 /*
634 * get boot-time SMP configuration:
635 */
636 if (smp_found_config)
637 get_smp_config();
638 init_apic_mappings();
639#endif
640
641 /*
642 * Request address space for all standard RAM and ROM resources
643 * and also for regions reported as reserved by the e820.
644 */
645 probe_roms();
646 e820_reserve_resources();
647
648 request_resource(&iomem_resource, &video_ram_resource);
649
650 {
651 unsigned i;
652 /* request I/O space for devices used on all i[345]86 PCs */
653 for (i = 0; i < STANDARD_IO_RESOURCES; i++)
654 request_resource(&ioport_resource, &standard_io_resources[i]);
655 }
656
657 /* Will likely break when you have unassigned resources with more
658 than 4GB memory and bridges that don't support more than 4GB.
659 Doing it properly would require using pci_alloc_consistent
660 in this case. */
661 low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff;
662 if (low_mem_size > pci_mem_start)
663 pci_mem_start = low_mem_size;
664
665#ifdef CONFIG_GART_IOMMU
666 iommu_hole_init();
667#endif
668
669#ifdef CONFIG_VT
670#if defined(CONFIG_VGA_CONSOLE)
671 conswitchp = &vga_con;
672#elif defined(CONFIG_DUMMY_CONSOLE)
673 conswitchp = &dummy_con;
674#endif
675#endif
676}
677
678static int __init get_model_name(struct cpuinfo_x86 *c)
679{
680 unsigned int *v;
681
682 if (c->x86_cpuid_level < 0x80000004)
683 return 0;
684
685 v = (unsigned int *) c->x86_model_id;
686 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
687 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
688 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
689 c->x86_model_id[48] = 0;
690 return 1;
691}
692
693
694static void __init display_cacheinfo(struct cpuinfo_x86 *c)
695{
696 unsigned int n, dummy, eax, ebx, ecx, edx;
697
698 n = c->x86_cpuid_level;
699
700 if (n >= 0x80000005) {
701 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
702 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
703 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
704 c->x86_cache_size=(ecx>>24)+(edx>>24);
705 /* On K8 L1 TLB is inclusive, so don't count it */
706 c->x86_tlbsize = 0;
707 }
708
709 if (n >= 0x80000006) {
710 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
711 ecx = cpuid_ecx(0x80000006);
712 c->x86_cache_size = ecx >> 16;
713 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
714
715 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
716 c->x86_cache_size, ecx & 0xFF);
717 }
718
719 if (n >= 0x80000007)
720 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
721 if (n >= 0x80000008) {
722 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
723 c->x86_virt_bits = (eax >> 8) & 0xff;
724 c->x86_phys_bits = eax & 0xff;
725 }
726}
727
728
729static int __init init_amd(struct cpuinfo_x86 *c)
730{
731 int r;
732 int level;
733#ifdef CONFIG_NUMA
734 int cpu;
735#endif
736
737 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
738 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */
739 clear_bit(0*32+31, &c->x86_capability);
740
741 /* C-stepping K8? */
742 level = cpuid_eax(1);
743 if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
744 set_bit(X86_FEATURE_K8_C, &c->x86_capability);
745
746 r = get_model_name(c);
747 if (!r) {
748 switch (c->x86) {
749 case 15:
750 /* Should distinguish Models here, but this is only
751 a fallback anyway. */
752 strcpy(c->x86_model_id, "Hammer");
753 break;
754 }
755 }
756 display_cacheinfo(c);
757
758 if (c->x86_cpuid_level >= 0x80000008) {
759 c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
760 if (c->x86_num_cores & (c->x86_num_cores - 1))
761 c->x86_num_cores = 1;
762
763#ifdef CONFIG_NUMA
764 /* On a dual core setup the lower bits of apic id
765 distinguish the cores. Fix up the CPU<->node mappings
766 here based on that.
767 Assumes number of cores is a power of two.
768 When using SRAT use mapping from SRAT. */
769 cpu = c->x86_apicid;
770 if (acpi_numa <= 0 && c->x86_num_cores > 1) {
771 cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1);
772 if (!node_online(cpu_to_node[cpu]))
773 cpu_to_node[cpu] = first_node(node_online_map);
774 }
775 printk(KERN_INFO "CPU %d(%d) -> Node %d\n",
776 cpu, c->x86_num_cores, cpu_to_node[cpu]);
777#endif
778 }
779
780 return r;
781}
782
783static void __init detect_ht(struct cpuinfo_x86 *c)
784{
785#ifdef CONFIG_SMP
786 u32 eax, ebx, ecx, edx;
787 int index_lsb, index_msb, tmp;
788 int cpu = smp_processor_id();
789
790 if (!cpu_has(c, X86_FEATURE_HT))
791 return;
792
793 cpuid(1, &eax, &ebx, &ecx, &edx);
794 smp_num_siblings = (ebx & 0xff0000) >> 16;
795
796 if (smp_num_siblings == 1) {
797 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
798 } else if (smp_num_siblings > 1) {
799 index_lsb = 0;
800 index_msb = 31;
801 /*
802 * At this point we only support two siblings per
803 * processor package.
804 */
805 if (smp_num_siblings > NR_CPUS) {
806 printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
807 smp_num_siblings = 1;
808 return;
809 }
810 tmp = smp_num_siblings;
811 while ((tmp & 1) == 0) {
812 tmp >>=1 ;
813 index_lsb++;
814 }
815 tmp = smp_num_siblings;
816 while ((tmp & 0x80000000 ) == 0) {
817 tmp <<=1 ;
818 index_msb--;
819 }
820 if (index_lsb != index_msb )
821 index_msb++;
822 phys_proc_id[cpu] = phys_pkg_id(index_msb);
823
824 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
825 phys_proc_id[cpu]);
826 }
827#endif
828}
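The lsb/msb scan above computes how many low APIC-id bits the siblings occupy, rounding up when the sibling count is not a power of two; that bit count is what phys_pkg_id() uses to strip the sibling bits off and recover the package id. A small sketch of just that arithmetic (the helper name is made up):

#include <stdio.h>

/* Mirror of the bit arithmetic in detect_ht(): how many low APIC-id bits
 * does a given sibling count occupy? */
static int sibling_index_bits(unsigned int siblings)
{
	int index_lsb = 0, index_msb = 31;
	unsigned int tmp;

	tmp = siblings;
	while ((tmp & 1) == 0) {
		tmp >>= 1;
		index_lsb++;
	}
	tmp = siblings;
	while ((tmp & 0x80000000u) == 0) {
		tmp <<= 1;
		index_msb--;
	}
	if (index_lsb != index_msb)
		index_msb++;
	return index_msb;
}

int main(void)
{
	printf("2 siblings -> %d bit(s)\n", sibling_index_bits(2));  /* 1 */
	printf("3 siblings -> %d bit(s)\n", sibling_index_bits(3));  /* 2 */
	printf("4 siblings -> %d bit(s)\n", sibling_index_bits(4));  /* 2 */
	return 0;
}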
829
830static void __init sched_cmp_hack(struct cpuinfo_x86 *c)
831{
832#ifdef CONFIG_SMP
833 /* AMD dual core looks like HT but isn't really. Hide it from the
834 scheduler. This works around problems with the domain scheduler.
835 It also probably gives slightly better scheduling and disables
836 SMT nice, which is harmful on dual core.
837 TBD: tune the domain scheduler for dual core. */
838 if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY))
839 smp_num_siblings = 1;
840#endif
841}
842
843static void __init init_intel(struct cpuinfo_x86 *c)
844{
845 /* Cache sizes */
846 unsigned n;
847
848 init_intel_cacheinfo(c);
849 n = c->x86_cpuid_level;
850 if (n >= 0x80000008) {
851 unsigned eax = cpuid_eax(0x80000008);
852 c->x86_virt_bits = (eax >> 8) & 0xff;
853 c->x86_phys_bits = eax & 0xff;
854 }
855
856 if (c->x86 == 15)
857 c->x86_cache_alignment = c->x86_clflush_size * 2;
858}
859
860void __init get_cpu_vendor(struct cpuinfo_x86 *c)
861{
862 char *v = c->x86_vendor_id;
863
864 if (!strcmp(v, "AuthenticAMD"))
865 c->x86_vendor = X86_VENDOR_AMD;
866 else if (!strcmp(v, "GenuineIntel"))
867 c->x86_vendor = X86_VENDOR_INTEL;
868 else
869 c->x86_vendor = X86_VENDOR_UNKNOWN;
870}
871
872struct cpu_model_info {
873 int vendor;
874 int family;
875 char *model_names[16];
876};
877
878/* Do some early cpuid on the boot CPU to get some parameters that are
879 needed before check_bugs. Everything advanced is in identify_cpu
880 below. */
881void __init early_identify_cpu(struct cpuinfo_x86 *c)
882{
883 u32 tfms;
884
885 c->loops_per_jiffy = loops_per_jiffy;
886 c->x86_cache_size = -1;
887 c->x86_vendor = X86_VENDOR_UNKNOWN;
888 c->x86_model = c->x86_mask = 0; /* So far unknown... */
889 c->x86_vendor_id[0] = '\0'; /* Unset */
890 c->x86_model_id[0] = '\0'; /* Unset */
891 c->x86_clflush_size = 64;
892 c->x86_cache_alignment = c->x86_clflush_size;
893 c->x86_num_cores = 1;
894 c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data;
895 c->x86_cpuid_level = 0;
896 memset(&c->x86_capability, 0, sizeof c->x86_capability);
897
898 /* Get vendor name */
899 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
900 (unsigned int *)&c->x86_vendor_id[0],
901 (unsigned int *)&c->x86_vendor_id[8],
902 (unsigned int *)&c->x86_vendor_id[4]);
903
904 get_cpu_vendor(c);
905
906 /* Initialize the standard set of capabilities */
907 /* Note that the vendor-specific code below might override */
908
909 /* Intel-defined flags: level 0x00000001 */
910 if (c->cpuid_level >= 0x00000001) {
911 __u32 misc;
912 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
913 &c->x86_capability[0]);
914 c->x86 = (tfms >> 8) & 0xf;
915 c->x86_model = (tfms >> 4) & 0xf;
916 c->x86_mask = tfms & 0xf;
917 if (c->x86 == 0xf) {
918 c->x86 += (tfms >> 20) & 0xff;
919 c->x86_model += ((tfms >> 16) & 0xF) << 4;
920 }
921 if (c->x86_capability[0] & (1<<19))
922 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
923 c->x86_apicid = misc >> 24;
924 } else {
925 /* Have CPUID level 0 only - unheard of */
926 c->x86 = 4;
927 }
928}
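The decode above follows the CPUID leaf 1 layout: base family in bits 11:8, base model in bits 7:4, stepping in bits 3:0, with the extended family and model fields folded in only when the base family is 0xf. A worked example with an illustrative eax value of 0x00020f71 (not tied to any particular part):

#include <stdio.h>

int main(void)
{
	unsigned int tfms = 0x00020f71;  /* illustrative CPUID leaf 1 eax value */
	unsigned int family, model, stepping;

	family   = (tfms >> 8) & 0xf;
	model    = (tfms >> 4) & 0xf;
	stepping = tfms & 0xf;
	if (family == 0xf) {
		family += (tfms >> 20) & 0xff;          /* extended family */
		model  += ((tfms >> 16) & 0xf) << 4;    /* extended model  */
	}
	/* prints: family 15, model 0x27, stepping 1 */
	printf("family %u, model 0x%x, stepping %u\n", family, model, stepping);
	return 0;
}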
929
930/*
931 * This does the hard work of actually picking apart the CPU stuff...
932 */
933void __init identify_cpu(struct cpuinfo_x86 *c)
934{
935 int i;
936 u32 xlvl;
937
938 early_identify_cpu(c);
939
940 /* AMD-defined flags: level 0x80000001 */
941 xlvl = cpuid_eax(0x80000000);
942 c->x86_cpuid_level = xlvl;
943 if ((xlvl & 0xffff0000) == 0x80000000) {
944 if (xlvl >= 0x80000001) {
945 c->x86_capability[1] = cpuid_edx(0x80000001);
946 c->x86_capability[5] = cpuid_ecx(0x80000001);
947 }
948 if (xlvl >= 0x80000004)
949 get_model_name(c); /* Default name */
950 }
951
952 /* Transmeta-defined flags: level 0x80860001 */
953 xlvl = cpuid_eax(0x80860000);
954 if ((xlvl & 0xffff0000) == 0x80860000) {
955 /* Don't set x86_cpuid_level here for now, to avoid confusion. */
956 if (xlvl >= 0x80860001)
957 c->x86_capability[2] = cpuid_edx(0x80860001);
958 }
959
960 /*
961 * Vendor-specific initialization. In this section we
962 * canonicalize the feature flags, meaning that if a CPU has
963 * features which CPUID doesn't tell us about, if CPUID claims
964 * incorrect flags, or if there are other bugs, we handle
965 * them here.
966 *
967 * At the end of this section, c->x86_capability better
968 * indicate the features this CPU genuinely supports!
969 */
970 switch (c->x86_vendor) {
971 case X86_VENDOR_AMD:
972 init_amd(c);
973 break;
974
975 case X86_VENDOR_INTEL:
976 init_intel(c);
977 break;
978
979 case X86_VENDOR_UNKNOWN:
980 default:
981 display_cacheinfo(c);
982 break;
983 }
984
985 select_idle_routine(c);
986 detect_ht(c);
987 sched_cmp_hack(c);
988
989 /*
990 * On SMP, boot_cpu_data holds the common feature set between
991 * all CPUs; so make sure that we indicate which features are
992 * common between the CPUs. The first time this routine gets
993 * executed, c == &boot_cpu_data.
994 */
995 if (c != &boot_cpu_data) {
996 /* AND the already accumulated flags with these */
997 for (i = 0 ; i < NCAPINTS ; i++)
998 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
999 }
1000
1001#ifdef CONFIG_X86_MCE
1002 mcheck_init(c);
1003#endif
1004#ifdef CONFIG_NUMA
1005 if (c != &boot_cpu_data)
1006 numa_add_cpu(c - cpu_data);
1007#endif
1008}
1009
1010
1011void __init print_cpu_info(struct cpuinfo_x86 *c)
1012{
1013 if (c->x86_model_id[0])
1014 printk("%s", c->x86_model_id);
1015
1016 if (c->x86_mask || c->cpuid_level >= 0)
1017 printk(" stepping %02x\n", c->x86_mask);
1018 else
1019 printk("\n");
1020}
1021
1022/*
1023 * Get CPU information for use by the procfs.
1024 */
1025
1026static int show_cpuinfo(struct seq_file *m, void *v)
1027{
1028 struct cpuinfo_x86 *c = v;
1029
1030 /*
1031 * These flag bits must match the definitions in <asm/cpufeature.h>.
1032 * NULL means this bit is undefined or reserved; either way it doesn't
1033 * have meaning as far as Linux is concerned. Note that it's important
1034 * to realize there is a difference between this table and CPUID -- if
1035 * applications want to get the raw CPUID data, they should access
1036 * /dev/cpu/<cpu_nr>/cpuid instead.
1037 */
1038 static char *x86_cap_flags[] = {
1039 /* Intel-defined */
1040 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
1041 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
1042 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
1043 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
1044
1045 /* AMD-defined */
1046 "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1047 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
1048 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
1049 NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow",
1050
1051 /* Transmeta-defined */
1052 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
1053 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1054 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1055 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1056
1057 /* Other (Linux-defined) */
1058 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL,
1059 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1060 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1061 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1062
1063 /* Intel-defined (#2) */
1064 "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
1065 "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
1066 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1067 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1068
1069 /* AMD-defined (#2) */
1070 "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL,
1071 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1072 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1073 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
1074 };
1075 static char *x86_power_flags[] = {
1076 "ts", /* temperature sensor */
1077 "fid", /* frequency id control */
1078 "vid", /* voltage id control */
1079 "ttp", /* thermal trip */
1080 "tm",
1081 "stc"
1082 };
1083
1084
1085#ifdef CONFIG_SMP
1086 if (!cpu_online(c-cpu_data))
1087 return 0;
1088#endif
1089
1090 seq_printf(m,"processor\t: %u\n"
1091 "vendor_id\t: %s\n"
1092 "cpu family\t: %d\n"
1093 "model\t\t: %d\n"
1094 "model name\t: %s\n",
1095 (unsigned)(c-cpu_data),
1096 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1097 c->x86,
1098 (int)c->x86_model,
1099 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1100
1101 if (c->x86_mask || c->cpuid_level >= 0)
1102 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1103 else
1104 seq_printf(m, "stepping\t: unknown\n");
1105
1106 if (cpu_has(c,X86_FEATURE_TSC)) {
1107 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1108 cpu_khz / 1000, (cpu_khz % 1000));
1109 }
1110
1111 /* Cache size */
1112 if (c->x86_cache_size >= 0)
1113 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1114
1115#ifdef CONFIG_SMP
1116 seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]);
1117 seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings);
1118#endif
1119
1120 seq_printf(m,
1121 "fpu\t\t: yes\n"
1122 "fpu_exception\t: yes\n"
1123 "cpuid level\t: %d\n"
1124 "wp\t\t: yes\n"
1125 "flags\t\t:",
1126 c->cpuid_level);
1127
1128 {
1129 int i;
1130 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
1131 if ( test_bit(i, &c->x86_capability) &&
1132 x86_cap_flags[i] != NULL )
1133 seq_printf(m, " %s", x86_cap_flags[i]);
1134 }
1135
1136 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1137 c->loops_per_jiffy/(500000/HZ),
1138 (c->loops_per_jiffy/(5000/HZ)) % 100);
1139
1140 if (c->x86_tlbsize > 0)
1141 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1142 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1143 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1144
1145 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1146 c->x86_phys_bits, c->x86_virt_bits);
1147
1148 seq_printf(m, "power management:");
1149 {
1150 unsigned i;
1151 for (i = 0; i < 32; i++)
1152 if (c->x86_power & (1 << i)) {
1153 if (i < ARRAY_SIZE(x86_power_flags))
1154 seq_printf(m, " %s", x86_power_flags[i]);
1155 else
1156 seq_printf(m, " [%d]", i);
1157 }
1158 }
1159 seq_printf(m, "\n");
1160
1161 if (c->x86_num_cores > 1)
1162 seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
1163
1164 seq_printf(m, "\n\n");
1165
1166 return 0;
1167}
1168
1169static void *c_start(struct seq_file *m, loff_t *pos)
1170{
1171 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
1172}
1173
1174static void *c_next(struct seq_file *m, void *v, loff_t *pos)
1175{
1176 ++*pos;
1177 return c_start(m, pos);
1178}
1179
1180static void c_stop(struct seq_file *m, void *v)
1181{
1182}
1183
1184struct seq_operations cpuinfo_op = {
1185 .start =c_start,
1186 .next = c_next,
1187 .stop = c_stop,
1188 .show = show_cpuinfo,
1189};
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
new file mode 100644
index 000000000000..b5305b04bc40
--- /dev/null
+++ b/arch/x86_64/kernel/setup64.c
@@ -0,0 +1,292 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
7 */
8#include <linux/config.h>
9#include <linux/init.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/bootmem.h>
14#include <linux/bitops.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/mtrr.h>
25#include <asm/proto.h>
26#include <asm/mman.h>
27#include <asm/numa.h>
28
29char x86_boot_params[2048] __initdata = {0,};
30
31cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
32
33struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
34
35extern struct task_struct init_task;
36
37extern unsigned char __per_cpu_start[], __per_cpu_end[];
38
39extern struct desc_ptr cpu_gdt_descr[];
40struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
41
42char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
43
44unsigned long __supported_pte_mask = ~0UL;
45static int do_not_nx __initdata = 0;
46
47/* noexec=on|off
48Control non-executable mappings for 64-bit processes.
49
50on	Enable (default)
51off	Disable
52*/
53int __init nonx_setup(char *str)
54{
55 if (!strncmp(str, "on", 2)) {
56 __supported_pte_mask |= _PAGE_NX;
57 do_not_nx = 0;
58 } else if (!strncmp(str, "off", 3)) {
59 do_not_nx = 1;
60 __supported_pte_mask &= ~_PAGE_NX;
61 }
62 return 0;
63}
64__setup("noexec=", nonx_setup); /* parsed early actually */
65
66int force_personality32 = READ_IMPLIES_EXEC;
67
68/* noexec32=on|off
69Control the non-executable heap for 32-bit processes.
70To control the stack too, use noexec=off.
71
72on	PROT_READ does not imply PROT_EXEC for 32-bit processes
73off	PROT_READ implies PROT_EXEC (default)
74*/
75static int __init nonx32_setup(char *str)
76{
77 if (!strcmp(str, "on"))
78 force_personality32 &= ~READ_IMPLIES_EXEC;
79 else if (!strcmp(str, "off"))
80 force_personality32 |= READ_IMPLIES_EXEC;
81 return 0;
82}
83__setup("noexec32=", nonx32_setup);
84
85/*
86 * Great future plan:
87 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
88 * Always point %gs to its beginning
89 */
90void __init setup_per_cpu_areas(void)
91{
92 int i;
93 unsigned long size;
94
95 /* Copy section for each CPU (we discard the original) */
96 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
97#ifdef CONFIG_MODULES
98 if (size < PERCPU_ENOUGH_ROOM)
99 size = PERCPU_ENOUGH_ROOM;
100#endif
101
102 for (i = 0; i < NR_CPUS; i++) {
103 unsigned char *ptr;
104
105 if (!NODE_DATA(cpu_to_node(i))) {
106 printk("cpu %d has no node, num_online_nodes %d\n",
107 i, num_online_nodes());
108 ptr = alloc_bootmem(size);
109 } else {
110 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
111 }
112 if (!ptr)
113 panic("Cannot allocate cpu data for CPU %d\n", i);
114 cpu_pda[i].data_offset = ptr - __per_cpu_start;
115 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
116 }
117}
118
119void pda_init(int cpu)
120{
121 struct x8664_pda *pda = &cpu_pda[cpu];
122
123 /* Set up data that may be needed in __get_free_pages early */
124 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
125 wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
126
127 pda->me = pda;
128 pda->cpunumber = cpu;
129 pda->irqcount = -1;
130 pda->kernelstack =
131 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
132 pda->active_mm = &init_mm;
133 pda->mmu_state = 0;
134
135 if (cpu == 0) {
136 /* others are initialized in smpboot.c */
137 pda->pcurrent = &init_task;
138 pda->irqstackptr = boot_cpu_stack;
139 } else {
140 pda->irqstackptr = (char *)
141 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
142 if (!pda->irqstackptr)
143 panic("cannot allocate irqstack for cpu %d", cpu);
144 }
145
146 asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
147
148 pda->irqstackptr += IRQSTACKSIZE-64;
149}
150
151char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ]
152__attribute__((section(".bss.page_aligned")));
153
154/* May not be marked __init: used by software suspend */
155void syscall_init(void)
156{
157 /*
158 * LSTAR and STAR live in a somewhat strange symbiosis.
159 * They both write to the same internal register. STAR allows setting CS/DS,
160 * but only a 32-bit target. LSTAR sets the 64-bit rip.
161 */
162 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
163 wrmsrl(MSR_LSTAR, system_call);
164
165#ifdef CONFIG_IA32_EMULATION
166 syscall32_cpu_init ();
167#endif
168
169 /* Flags to clear on syscall */
170 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
171}
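The STAR write above packs two selectors into one MSR: bits 47:32 get __KERNEL_CS (used on SYSCALL entry) and bits 63:48 get __USER32_CS (the base SYSRET works from); the CPU derives the matching SS selectors from these at fixed offsets. A sketch of the packing and unpacking, with made-up selector values standing in for the real GDT entries:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t kernel_cs = 0x10;   /* stand-in for __KERNEL_CS */
	uint64_t user32_cs = 0x23;   /* stand-in for __USER32_CS */
	uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

	printf("STAR           = 0x%016llx\n", (unsigned long long)star);
	printf("syscall CS     = 0x%llx\n", (unsigned long long)((star >> 32) & 0xffff));
	printf("sysret CS base = 0x%llx\n", (unsigned long long)((star >> 48) & 0xffff));
	return 0;
}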
172
173void __init check_efer(void)
174{
175 unsigned long efer;
176
177 rdmsrl(MSR_EFER, efer);
178 if (!(efer & EFER_NX) || do_not_nx) {
179 __supported_pte_mask &= ~_PAGE_NX;
180 }
181}
182
183/*
184 * cpu_init() initializes state that is per-CPU. Some data is already
185 * initialized (naturally) in the bootstrap process, such as the GDT
186 * and IDT. We reload them nevertheless, this function acts as a
187 * 'CPU state barrier', nothing should get across.
188 * A lot of state is already set up in PDA init.
189 */
190void __init cpu_init (void)
191{
192#ifdef CONFIG_SMP
193 int cpu = stack_smp_processor_id();
194#else
195 int cpu = smp_processor_id();
196#endif
197 struct tss_struct *t = &per_cpu(init_tss, cpu);
198 unsigned long v;
199 char *estacks = NULL;
200 struct task_struct *me;
201 int i;
202
203 /* CPU 0 is initialised in head64.c */
204 if (cpu != 0) {
205 pda_init(cpu);
206 } else
207 estacks = boot_exception_stacks;
208
209 me = current;
210
211 if (cpu_test_and_set(cpu, cpu_initialized))
212 panic("CPU#%d already initialized!\n", cpu);
213
214 printk("Initializing CPU#%d\n", cpu);
215
216 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
217
218 /*
219 * Initialize the per-CPU GDT with the boot GDT,
220 * and set up the GDT descriptor:
221 */
222 if (cpu) {
223 memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE);
224 }
225
226 cpu_gdt_descr[cpu].size = GDT_SIZE;
227 cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu];
228 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
229 asm volatile("lidt %0" :: "m" (idt_descr));
230
231 memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8);
232
233 /*
234 * Delete NT
235 */
236
237 asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax");
238
239 syscall_init();
240
241 wrmsrl(MSR_FS_BASE, 0);
242 wrmsrl(MSR_KERNEL_GS_BASE, 0);
243 barrier();
244
245 check_efer();
246
247 /*
248 * set up and load the per-CPU TSS
249 */
250 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
251 if (cpu) {
252 estacks = (char *)__get_free_pages(GFP_ATOMIC,
253 EXCEPTION_STACK_ORDER);
254 if (!estacks)
255 panic("Cannot allocate exception stack %ld %d\n",
256 v, cpu);
257 }
258 estacks += EXCEPTION_STKSZ;
259 t->ist[v] = (unsigned long)estacks;
260 }
261
262 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
263 /*
264 * <= is required because the CPU will access up to
265 * 8 bits beyond the end of the IO permission bitmap.
266 */
267 for (i = 0; i <= IO_BITMAP_LONGS; i++)
268 t->io_bitmap[i] = ~0UL;
269
270 atomic_inc(&init_mm.mm_count);
271 me->active_mm = &init_mm;
272 if (me->mm)
273 BUG();
274 enter_lazy_tlb(&init_mm, me);
275
276 set_tss_desc(cpu, t);
277 load_TR_desc();
278 load_LDT(&init_mm.context);
279
280 /*
281 * Clear all 6 debug registers:
282 */
283
284 set_debug(0UL, 0);
285 set_debug(0UL, 1);
286 set_debug(0UL, 2);
287 set_debug(0UL, 3);
288 set_debug(0UL, 6);
289 set_debug(0UL, 7);
290
291 fpu_init();
292}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
new file mode 100644
index 000000000000..5ace32c6ac5c
--- /dev/null
+++ b/arch/x86_64/kernel/signal.c
@@ -0,0 +1,486 @@
1/*
2 * linux/arch/x86_64/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 *
7 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
8 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
9 * 2000-2002 x86-64 support by Andi Kleen
10 *
11 * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $
12 */
13
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/smp_lock.h>
18#include <linux/kernel.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/wait.h>
22#include <linux/ptrace.h>
23#include <linux/unistd.h>
24#include <linux/stddef.h>
25#include <linux/personality.h>
26#include <linux/compiler.h>
27#include <asm/ucontext.h>
28#include <asm/uaccess.h>
29#include <asm/i387.h>
30#include <asm/proto.h>
31
32/* #define DEBUG_SIG 1 */
33
34#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
35
36void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
37 sigset_t *set, struct pt_regs * regs);
38void ia32_setup_frame(int sig, struct k_sigaction *ka,
39 sigset_t *set, struct pt_regs * regs);
40
41asmlinkage long
42sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
43{
44 sigset_t saveset, newset;
45
46 /* XXX: Don't preclude handling different sized sigset_t's. */
47 if (sigsetsize != sizeof(sigset_t))
48 return -EINVAL;
49
50 if (copy_from_user(&newset, unewset, sizeof(newset)))
51 return -EFAULT;
52 sigdelsetmask(&newset, ~_BLOCKABLE);
53
54 spin_lock_irq(&current->sighand->siglock);
55 saveset = current->blocked;
56 current->blocked = newset;
57 recalc_sigpending();
58 spin_unlock_irq(&current->sighand->siglock);
59#ifdef DEBUG_SIG
60 printk("rt_sigsuspend saveset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
61 saveset, newset, regs, regs->rip);
62#endif
63 regs->rax = -EINTR;
64 while (1) {
65 current->state = TASK_INTERRUPTIBLE;
66 schedule();
67 if (do_signal(regs, &saveset))
68 return -EINTR;
69 }
70}
71
72asmlinkage long
73sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
74 struct pt_regs *regs)
75{
76 return do_sigaltstack(uss, uoss, regs->rsp);
77}
78
79
80/*
81 * Do a signal return; undo the signal stack.
82 */
83
84struct rt_sigframe
85{
86 char *pretcode;
87 struct ucontext uc;
88 struct siginfo info;
89};
90
91static int
92restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
93{
94 unsigned int err = 0;
95
96 /* Always make any pending restarted system calls return -EINTR */
97 current_thread_info()->restart_block.fn = do_no_restart_syscall;
98
99#define COPY(x) err |= __get_user(regs->x, &sc->x)
100
101 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
102 COPY(rdx); COPY(rcx); COPY(rip);
103 COPY(r8);
104 COPY(r9);
105 COPY(r10);
106 COPY(r11);
107 COPY(r12);
108 COPY(r13);
109 COPY(r14);
110 COPY(r15);
111
112 {
113 unsigned int tmpflags;
114 err |= __get_user(tmpflags, &sc->eflags);
115 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
116 regs->orig_rax = -1; /* disable syscall checks */
117 }
118
119 {
120 struct _fpstate __user * buf;
121 err |= __get_user(buf, &sc->fpstate);
122
123 if (buf) {
124 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
125 goto badframe;
126 err |= restore_i387(buf);
127 } else {
128 struct task_struct *me = current;
129 if (used_math()) {
130 clear_fpu(me);
131 clear_used_math();
132 }
133 }
134 }
135
136 err |= __get_user(*prax, &sc->rax);
137 return err;
138
139badframe:
140 return 1;
141}
142
143asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
144{
145 struct rt_sigframe __user *frame;
146 sigset_t set;
147 unsigned long eax;
148
149 frame = (struct rt_sigframe __user *)(regs->rsp - 8);
150 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
151 goto badframe;
152 }
153 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
154 goto badframe;
155 }
156
157 sigdelsetmask(&set, ~_BLOCKABLE);
158 spin_lock_irq(&current->sighand->siglock);
159 current->blocked = set;
160 recalc_sigpending();
161 spin_unlock_irq(&current->sighand->siglock);
162
163 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
164 goto badframe;
165
166#ifdef DEBUG_SIG
167 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n", current->pid, regs->rip, regs->rsp, frame, eax);
168#endif
169
170 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
171 goto badframe;
172
173 return eax;
174
175badframe:
176 signal_fault(regs,frame,"sigreturn");
177 return 0;
178}
179
180/*
181 * Set up a signal frame.
182 */
183
184static inline int
185setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
186{
187 int err = 0;
188 unsigned long eflags;
189
190 err |= __put_user(0, &sc->gs);
191 err |= __put_user(0, &sc->fs);
192
193 err |= __put_user(regs->rdi, &sc->rdi);
194 err |= __put_user(regs->rsi, &sc->rsi);
195 err |= __put_user(regs->rbp, &sc->rbp);
196 err |= __put_user(regs->rsp, &sc->rsp);
197 err |= __put_user(regs->rbx, &sc->rbx);
198 err |= __put_user(regs->rdx, &sc->rdx);
199 err |= __put_user(regs->rcx, &sc->rcx);
200 err |= __put_user(regs->rax, &sc->rax);
201 err |= __put_user(regs->r8, &sc->r8);
202 err |= __put_user(regs->r9, &sc->r9);
203 err |= __put_user(regs->r10, &sc->r10);
204 err |= __put_user(regs->r11, &sc->r11);
205 err |= __put_user(regs->r12, &sc->r12);
206 err |= __put_user(regs->r13, &sc->r13);
207 err |= __put_user(regs->r14, &sc->r14);
208 err |= __put_user(regs->r15, &sc->r15);
209 err |= __put_user(me->thread.trap_no, &sc->trapno);
210 err |= __put_user(me->thread.error_code, &sc->err);
211 err |= __put_user(regs->rip, &sc->rip);
212 eflags = regs->eflags;
213 if (current->ptrace & PT_PTRACED) {
214 eflags &= ~TF_MASK;
215 }
216 err |= __put_user(eflags, &sc->eflags);
217 err |= __put_user(mask, &sc->oldmask);
218 err |= __put_user(me->thread.cr2, &sc->cr2);
219
220 return err;
221}
222
223/*
224 * Determine which stack to use..
225 */
226
227static void __user *
228get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
229{
230 unsigned long rsp;
231
232 /* Default to using the normal stack, below the 128-byte red zone */
233 rsp = regs->rsp - 128;
234
235 /* This is the X/Open sanctioned signal stack switching. */
236 /* RED-PEN: redzone on that stack? */
237 if (ka->sa.sa_flags & SA_ONSTACK) {
238 if (sas_ss_flags(rsp) == 0)
239 rsp = current->sas_ss_sp + current->sas_ss_size;
240 }
241
242 return (void __user *)round_down(rsp - size, 16);
243}
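get_stack() first steps 128 bytes below the interrupted rsp to stay out of the red zone that user code may still be using, then rounds down to a 16-byte boundary; setup_rt_frame() later subtracts another 8 so that the frame starts where a return address would sit, giving the handler the stack alignment the ABI expects at function entry. A rough sketch of the address arithmetic with arbitrary example values (rsp and the frame size are illustrative):

#include <stdio.h>

#define ROUND_DOWN(x, a) ((x) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long rsp = 0x7fffffffd123;      /* illustrative interrupted user rsp */
	unsigned long size = 440;                /* pretend sizeof(struct rt_sigframe) */
	unsigned long base, frame;

	base  = rsp - 128;                       /* step below the red zone */
	frame = ROUND_DOWN(base - size, 16) - 8; /* 16-byte aligned, minus return-address slot */

	/* prints 0: after the pushed "return address" slot the stack is 16-byte aligned */
	printf("frame at 0x%lx, (frame + 8) %% 16 = %lu\n", frame, (frame + 8) % 16);
	return 0;
}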
244
245static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
246 sigset_t *set, struct pt_regs * regs)
247{
248 struct rt_sigframe __user *frame;
249 struct _fpstate __user *fp = NULL;
250 int err = 0;
251 struct task_struct *me = current;
252
253 if (used_math()) {
254 fp = get_stack(ka, regs, sizeof(struct _fpstate));
255 frame = (void __user *)round_down(
256 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
257
258 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
259 goto give_sigsegv;
260
261 if (save_i387(fp) < 0)
262 err |= -1;
263 } else
264 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
265
266 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
267 goto give_sigsegv;
268
269 if (ka->sa.sa_flags & SA_SIGINFO) {
270 err |= copy_siginfo_to_user(&frame->info, info);
271 if (err)
272 goto give_sigsegv;
273 }
274
275 /* Create the ucontext. */
276 err |= __put_user(0, &frame->uc.uc_flags);
277 err |= __put_user(0, &frame->uc.uc_link);
278 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
279 err |= __put_user(sas_ss_flags(regs->rsp),
280 &frame->uc.uc_stack.ss_flags);
281 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
282 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
283 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
284 if (sizeof(*set) == 16) {
285 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
286 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
287 } else
288 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
289
290 /* Set up to return from userspace. If provided, use a stub
291 already in userspace. */
292 /* x86-64 should always use SA_RESTORER. */
293 if (ka->sa.sa_flags & SA_RESTORER) {
294 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
295 } else {
296 /* could use a vstub here */
297 goto give_sigsegv;
298 }
299
300 if (err)
301 goto give_sigsegv;
302
303#ifdef DEBUG_SIG
304 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
305#endif
306
307 /* Set up registers for signal handler */
308 {
309 struct exec_domain *ed = current_thread_info()->exec_domain;
310 if (unlikely(ed && ed->signal_invmap && sig < 32))
311 sig = ed->signal_invmap[sig];
312 }
313 regs->rdi = sig;
314 /* In case the signal handler was declared without a prototype */
315 regs->rax = 0;
316
317 /* This also works for non SA_SIGINFO handlers because they expect the
318 next argument after the signal number on the stack. */
319 regs->rsi = (unsigned long)&frame->info;
320 regs->rdx = (unsigned long)&frame->uc;
321 regs->rip = (unsigned long) ka->sa.sa_handler;
322
323 regs->rsp = (unsigned long)frame;
324
325 set_fs(USER_DS);
326 if (regs->eflags & TF_MASK) {
327 if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) {
328 ptrace_notify(SIGTRAP);
329 } else {
330 regs->eflags &= ~TF_MASK;
331 }
332 }
333
334#ifdef DEBUG_SIG
335 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
336 current->comm, current->pid, frame, regs->rip, frame->pretcode);
337#endif
338
339 return;
340
341give_sigsegv:
342 force_sigsegv(sig, current);
343}
344
345/*
346 * OK, we're invoking a handler
347 */
348
349static void
350handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
351 sigset_t *oldset, struct pt_regs *regs)
352{
353#ifdef DEBUG_SIG
354 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
355 current->pid, sig,
356 regs->rip, regs->rsp, regs);
357#endif
358
359 /* Are we from a system call? */
360 if ((long)regs->orig_rax >= 0) {
361 /* If so, check system call restarting.. */
362 switch (regs->rax) {
363 case -ERESTART_RESTARTBLOCK:
364 case -ERESTARTNOHAND:
365 regs->rax = -EINTR;
366 break;
367
368 case -ERESTARTSYS:
369 if (!(ka->sa.sa_flags & SA_RESTART)) {
370 regs->rax = -EINTR;
371 break;
372 }
373 /* fallthrough */
374 case -ERESTARTNOINTR:
375 regs->rax = regs->orig_rax;
376 regs->rip -= 2;
377 break;
378 }
379 }
380
381#ifdef CONFIG_IA32_EMULATION
382 if (test_thread_flag(TIF_IA32)) {
383 if (ka->sa.sa_flags & SA_SIGINFO)
384 ia32_setup_rt_frame(sig, ka, info, oldset, regs);
385 else
386 ia32_setup_frame(sig, ka, oldset, regs);
387 } else
388#endif
389 setup_rt_frame(sig, ka, info, oldset, regs);
390
391 if (!(ka->sa.sa_flags & SA_NODEFER)) {
392 spin_lock_irq(&current->sighand->siglock);
393 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
394 sigaddset(&current->blocked,sig);
395 recalc_sigpending();
396 spin_unlock_irq(&current->sighand->siglock);
397 }
398}
399
400/*
401 * Note that 'init' is a special process: it doesn't get signals it doesn't
402 * want to handle. Thus you cannot kill init with a SIGKILL, even by
403 * mistake.
404 */
405int do_signal(struct pt_regs *regs, sigset_t *oldset)
406{
407 struct k_sigaction ka;
408 siginfo_t info;
409 int signr;
410
411 /*
412 * We want the common case to go fast, which
413 * is why we may in certain cases get here from
414 * kernel mode. Just return without doing anything
415 * if so.
416 */
417 if ((regs->cs & 3) != 3)
418 return 1;
419
420 if (try_to_freeze(0))
421 goto no_signal;
422
423 if (!oldset)
424 oldset = &current->blocked;
425
426 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
427 if (signr > 0) {
428 /* Reenable any watchpoints before delivering the
429 * signal to user space. The processor register will
430 * have been cleared if the watchpoint triggered
431 * inside the kernel.
432 */
433 if (current->thread.debugreg7)
434 asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7));
435
436 /* Whee! Actually deliver the signal. */
437 handle_signal(signr, &info, &ka, oldset, regs);
438 return 1;
439 }
440
441 no_signal:
442 /* Did we come from a system call? */
443 if ((long)regs->orig_rax >= 0) {
444 /* Restart the system call - no handlers present */
445 long res = regs->rax;
446 if (res == -ERESTARTNOHAND ||
447 res == -ERESTARTSYS ||
448 res == -ERESTARTNOINTR) {
449 regs->rax = regs->orig_rax;
450 regs->rip -= 2;
451 }
452 if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) {
453 regs->rax = __NR_restart_syscall;
454 regs->rip -= 2;
455 }
456 }
457 return 0;
458}
459
460void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags)
461{
462#ifdef DEBUG_SIG
463 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
464 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
465#endif
466
467 /* Pending single-step? */
468 if (thread_info_flags & _TIF_SINGLESTEP) {
469 regs->eflags |= TF_MASK;
470 clear_thread_flag(TIF_SINGLESTEP);
471 }
472
473 /* deal with pending signal delivery */
474 if (thread_info_flags & _TIF_SIGPENDING)
475 do_signal(regs,oldset);
476}
477
478void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
479{
480 struct task_struct *me = current;
481 if (exception_trace)
482 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
483 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
484
485 force_sig(SIGSEGV, me);
486}
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
new file mode 100644
index 000000000000..e3cdbf9a88bd
--- /dev/null
+++ b/arch/x86_64/kernel/smp.c
@@ -0,0 +1,415 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 *
8 * This code is released under the GNU General Public License version 2 or
9 * later.
10 */
11
12#include <linux/init.h>
13
14#include <linux/mm.h>
15#include <linux/irq.h>
16#include <linux/delay.h>
17#include <linux/spinlock.h>
18#include <linux/smp_lock.h>
19#include <linux/smp.h>
20#include <linux/kernel_stat.h>
21#include <linux/mc146818rtc.h>
22#include <linux/interrupt.h>
23
24#include <asm/mtrr.h>
25#include <asm/pgalloc.h>
26#include <asm/tlbflush.h>
27#include <asm/mach_apic.h>
28#include <asm/mmu_context.h>
29#include <asm/proto.h>
30
31/*
32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds.
34 *
35 * These mean you can really definitely utterly forget about
36 * writing to user space from interrupts. (It's not allowed anyway.)
37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 */
40
41static cpumask_t flush_cpumask;
42static struct mm_struct * flush_mm;
43static unsigned long flush_va;
44static DEFINE_SPINLOCK(tlbstate_lock);
45#define FLUSH_ALL -1ULL
46
47/*
48 * We cannot call mmdrop() because we are in interrupt context,
49 * instead update mm->cpu_vm_mask.
50 */
51static inline void leave_mm (unsigned long cpu)
52{
53 if (read_pda(mmu_state) == TLBSTATE_OK)
54 BUG();
55 clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
56 load_cr3(swapper_pg_dir);
57}
58
59/*
60 *
61 * The flush IPI assumes that a thread switch happens in this order:
62 * [cpu0: the cpu that switches]
63 * 1) switch_mm() either 1a) or 1b)
64 * 1a) thread switch to a different mm
65 * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
66 * Stop ipi delivery for the old mm. This is not synchronized with
67 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
68 * for the wrong mm, and in the worst case we perform a superfluous
69 * tlb flush.
70 * 1a2) set cpu mmu_state to TLBSTATE_OK
71 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
72 * was in lazy tlb mode.
73 * 1a3) update cpu active_mm
74 * Now cpu0 accepts tlb flushes for the new mm.
75 * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
76 * Now the other cpus will send tlb flush ipis.
77 * 1a5) change cr3.
78 * 1b) thread switch without mm change
79 * cpu active_mm is correct, cpu0 already handles
80 * flush ipis.
81 * 1b1) set cpu mmu_state to TLBSTATE_OK
82 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
83 * Atomically set the bit [other cpus will start sending flush ipis],
84 * and test the bit.
85 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
86 * 2) switch %%esp, i.e. current
87 *
88 * The interrupt must handle 2 special cases:
89 * - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
90 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
91 * runs in kernel space, the cpu could load tlb entries for user space
92 * pages.
93 *
94 * The good news is that cpu mmu_state is local to each cpu, no
95 * write/read ordering problems.
96 */
97
98/*
99 * TLB flush IPI:
100 *
101 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
102 * 2) Leave the mm if we are in the lazy tlb mode.
103 */
104
105asmlinkage void smp_invalidate_interrupt (void)
106{
107 unsigned long cpu;
108
109 cpu = get_cpu();
110
111 if (!cpu_isset(cpu, flush_cpumask))
112 goto out;
113 /*
114 * This was a BUG() but until someone can quote me the
115 * line from the intel manual that guarantees an IPI to
116 * multiple CPUs is retried _only_ on the erroring CPUs
117 * it's staying as a return.
118 *
119 * BUG();
120 */
121
122 if (flush_mm == read_pda(active_mm)) {
123 if (read_pda(mmu_state) == TLBSTATE_OK) {
124 if (flush_va == FLUSH_ALL)
125 local_flush_tlb();
126 else
127 __flush_tlb_one(flush_va);
128 } else
129 leave_mm(cpu);
130 }
131 ack_APIC_irq();
132 cpu_clear(cpu, flush_cpumask);
133
134out:
135 put_cpu_no_resched();
136}
137
138static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
139 unsigned long va)
140{
141 cpumask_t tmp;
142 /*
143 * A couple of (to be removed) sanity checks:
144 *
145 * - we do not send IPIs to not-yet booted CPUs.
146 * - current CPU must not be in mask
147 * - mask must exist :)
148 */
149 BUG_ON(cpus_empty(cpumask));
150 cpus_and(tmp, cpumask, cpu_online_map);
151 BUG_ON(!cpus_equal(tmp, cpumask));
152 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
153 if (!mm)
154 BUG();
155
156 /*
157 * I'm not happy about this global shared spinlock in the
158 * MM hot path, but we'll see how contended it is.
159 * Temporarily this turns IRQs off, so that lockups are
160 * detected by the NMI watchdog.
161 */
162 spin_lock(&tlbstate_lock);
163
164 flush_mm = mm;
165 flush_va = va;
166 cpus_or(flush_cpumask, cpumask, flush_cpumask);
167
168 /*
169 * We have to send the IPI only to
170 * CPUs affected.
171 */
172 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
173
174 while (!cpus_empty(flush_cpumask))
175 mb(); /* nothing; lockup detection does not belong here */
176
177 flush_mm = NULL;
178 flush_va = 0;
179 spin_unlock(&tlbstate_lock);
180}
181
182void flush_tlb_current_task(void)
183{
184 struct mm_struct *mm = current->mm;
185 cpumask_t cpu_mask;
186
187 preempt_disable();
188 cpu_mask = mm->cpu_vm_mask;
189 cpu_clear(smp_processor_id(), cpu_mask);
190
191 local_flush_tlb();
192 if (!cpus_empty(cpu_mask))
193 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
194 preempt_enable();
195}
196
197void flush_tlb_mm (struct mm_struct * mm)
198{
199 cpumask_t cpu_mask;
200
201 preempt_disable();
202 cpu_mask = mm->cpu_vm_mask;
203 cpu_clear(smp_processor_id(), cpu_mask);
204
205 if (current->active_mm == mm) {
206 if (current->mm)
207 local_flush_tlb();
208 else
209 leave_mm(smp_processor_id());
210 }
211 if (!cpus_empty(cpu_mask))
212 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
213
214 preempt_enable();
215}
216
217void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
218{
219 struct mm_struct *mm = vma->vm_mm;
220 cpumask_t cpu_mask;
221
222 preempt_disable();
223 cpu_mask = mm->cpu_vm_mask;
224 cpu_clear(smp_processor_id(), cpu_mask);
225
226 if (current->active_mm == mm) {
227 if(current->mm)
228 __flush_tlb_one(va);
229 else
230 leave_mm(smp_processor_id());
231 }
232
233 if (!cpus_empty(cpu_mask))
234 flush_tlb_others(cpu_mask, mm, va);
235
236 preempt_enable();
237}
238
239static void do_flush_tlb_all(void* info)
240{
241 unsigned long cpu = smp_processor_id();
242
243 __flush_tlb_all();
244 if (read_pda(mmu_state) == TLBSTATE_LAZY)
245 leave_mm(cpu);
246}
247
248void flush_tlb_all(void)
249{
250 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
251}
252
253void smp_kdb_stop(void)
254{
255 send_IPI_allbutself(KDB_VECTOR);
256}
257
258/*
259 * This function sends a 'reschedule' IPI to another CPU.
260 * It goes straight through and wastes no time serializing
261 * anything. Worst case is that we lose a reschedule ...
262 */
263
264void smp_send_reschedule(int cpu)
265{
266 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
267}
268
269/*
270 * Structure and data for smp_call_function(). This is designed to minimise
271 * static memory requirements. It also looks cleaner.
272 */
273static DEFINE_SPINLOCK(call_lock);
274
275struct call_data_struct {
276 void (*func) (void *info);
277 void *info;
278 atomic_t started;
279 atomic_t finished;
280 int wait;
281};
282
283static struct call_data_struct * call_data;
284
285/*
286 * This function sends a 'generic call function' IPI to all other CPUs
287 * in the system.
288 */
289static void __smp_call_function (void (*func) (void *info), void *info,
290 int nonatomic, int wait)
291{
292 struct call_data_struct data;
293 int cpus = num_online_cpus()-1;
294
295 if (!cpus)
296 return;
297
298 data.func = func;
299 data.info = info;
300 atomic_set(&data.started, 0);
301 data.wait = wait;
302 if (wait)
303 atomic_set(&data.finished, 0);
304
305 call_data = &data;
306 wmb();
307 /* Send a message to all other CPUs and wait for them to respond */
308 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
309
310 /* Wait for response */
311 while (atomic_read(&data.started) != cpus)
312 cpu_relax();
313
314 if (!wait)
315 return;
316
317 while (atomic_read(&data.finished) != cpus)
318 cpu_relax();
319}
320
321/*
322 * smp_call_function - run a function on all other CPUs.
323 * @func: The function to run. This must be fast and non-blocking.
324 * @info: An arbitrary pointer to pass to the function.
325 * @nonatomic: currently unused.
326 * @wait: If true, wait (atomically) until function has completed on other
327 * CPUs.
328 *
329 * Returns 0 on success, else a negative status code. Does not return until
330 * remote CPUs are nearly ready to execute func or have already executed it.
331 *
332 * You must not call this function with disabled interrupts or from a
333 * hardware interrupt handler or from a bottom half handler.
334 * Actually there are a few legal cases, like panic.
335 */
336int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
337 int wait)
338{
339 spin_lock(&call_lock);
340 __smp_call_function(func,info,nonatomic,wait);
341 spin_unlock(&call_lock);
342 return 0;
343}
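As a usage sketch of the interface above (the callback, its argument, and the surrounding function are hypothetical): pass a fast, non-blocking function, an info pointer, the unused nonatomic flag, and wait=1 if the caller needs every other CPU to have finished before it continues. The calling CPU is not included, so it has to do its own share locally.

/* Hypothetical callback: runs from the IPI handler on each remote CPU,
 * so it must be short, non-blocking and must not take sleeping locks. */
static void drain_foo_cache(void *info)
{
	int *generation = info;
	(void)generation;
	/* ... per-CPU work would go here ... */
}

static void flush_all_foo_caches(void)
{
	static int generation;

	generation++;
	/* Run on all other online CPUs and wait for completion. */
	smp_call_function(drain_foo_cache, &generation, 0, 1);
	/* smp_call_function() skips the current CPU; cover it here. */
	drain_foo_cache(&generation);
}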
344
345void smp_stop_cpu(void)
346{
347 /*
348 * Remove this CPU:
349 */
350 cpu_clear(smp_processor_id(), cpu_online_map);
351 local_irq_disable();
352 disable_local_APIC();
353 local_irq_enable();
354}
355
356static void smp_really_stop_cpu(void *dummy)
357{
358 smp_stop_cpu();
359 for (;;)
360 asm("hlt");
361}
362
363void smp_send_stop(void)
364{
365 int nolock = 0;
366 if (reboot_force)
367 return;
368 /* Don't deadlock on the call lock in panic */
369 if (!spin_trylock(&call_lock)) {
370 /* ignore locking because we have panicked anyway */
371 nolock = 1;
372 }
373 __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
374 if (!nolock)
375 spin_unlock(&call_lock);
376
377 local_irq_disable();
378 disable_local_APIC();
379 local_irq_enable();
380}
381
382/*
383 * Reschedule call back. Nothing to do,
384 * all the work is done automatically when
385 * we return from the interrupt.
386 */
387asmlinkage void smp_reschedule_interrupt(void)
388{
389 ack_APIC_irq();
390}
391
392asmlinkage void smp_call_function_interrupt(void)
393{
394 void (*func) (void *info) = call_data->func;
395 void *info = call_data->info;
396 int wait = call_data->wait;
397
398 ack_APIC_irq();
399 /*
400 * Notify initiating CPU that I've grabbed the data and am
401 * about to execute the function
402 */
403 mb();
404 atomic_inc(&call_data->started);
405 /*
406 * At this point the info structure may be out of scope unless wait==1
407 */
408 irq_enter();
409 (*func)(info);
410 irq_exit();
411 if (wait) {
412 mb();
413 atomic_inc(&call_data->finished);
414 }
415}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
new file mode 100644
index 000000000000..afd3600b919e
--- /dev/null
+++ b/arch/x86_64/kernel/smpboot.c
@@ -0,0 +1,938 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2 or
16 * later.
17 *
18 * Fixes
19 * Felix Koop : NR_CPUS used properly
20 * Jose Renau : Handle single CPU case.
21 * Alan Cox : By repeated request 8) - Total BogoMIP report.
22 * Greg Wright : Fix for kernel stacks panic.
23 * Erich Boleyn : MP v1.4 and additional changes.
24 * Matthias Sattler : Changes for 2.1 kernel map.
25 * Michel Lespinasse : Changes for 2.1 kernel map.
26 * Michael Chastain : Change trampoline.S to gnu as.
27 * Alan Cox : Dumb bug: 'B' step PPro's are fine
28 * Ingo Molnar : Added APIC timers, based on code
29 * from Jose Renau
30 * Ingo Molnar : various cleanups and rewrites
31 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
32 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
33 * Andi Kleen : Changed for SMP boot into long mode.
34 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
35 */
36
37#include <linux/config.h>
38#include <linux/init.h>
39
40#include <linux/mm.h>
41#include <linux/kernel_stat.h>
42#include <linux/smp_lock.h>
43#include <linux/irq.h>
44#include <linux/bootmem.h>
45#include <linux/thread_info.h>
46#include <linux/module.h>
47
48#include <linux/delay.h>
49#include <linux/mc146818rtc.h>
50#include <asm/mtrr.h>
51#include <asm/pgalloc.h>
52#include <asm/desc.h>
53#include <asm/kdebug.h>
54#include <asm/tlbflush.h>
55#include <asm/proto.h>
56
57/* Number of siblings per CPU package */
58int smp_num_siblings = 1;
59/* Package ID of each logical CPU */
60u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
61EXPORT_SYMBOL(phys_proc_id);
62
63/* Bitmask of currently online CPUs */
64cpumask_t cpu_online_map;
65
66cpumask_t cpu_callin_map;
67cpumask_t cpu_callout_map;
68static cpumask_t smp_commenced_mask;
69
70/* Per CPU bogomips and other parameters */
71struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
72
73cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
74
75/*
76 * Trampoline 80x86 program as an array.
77 */
78
79extern unsigned char trampoline_data [];
80extern unsigned char trampoline_end [];
81
82/*
83 * Currently trivial. Write the real->protected mode
84 * bootstrap into the page concerned. The caller
85 * has made sure it's suitably aligned.
86 */
87
88static unsigned long __init setup_trampoline(void)
89{
90 void *tramp = __va(SMP_TRAMPOLINE_BASE);
91 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
92 return virt_to_phys(tramp);
93}
94
95/*
96 * The bootstrap kernel entry code has set these up. Save them for
97 * a given CPU
98 */
99
100static void __init smp_store_cpu_info(int id)
101{
102 struct cpuinfo_x86 *c = cpu_data + id;
103
104 *c = boot_cpu_data;
105 identify_cpu(c);
106}
107
108/*
109 * TSC synchronization.
110 *
111 * We first check whether all CPUs have their TSCs synchronized,
112 * then we print a warning if not, and always resync.
113 */
114
115static atomic_t tsc_start_flag = ATOMIC_INIT(0);
116static atomic_t tsc_count_start = ATOMIC_INIT(0);
117static atomic_t tsc_count_stop = ATOMIC_INIT(0);
118static unsigned long long tsc_values[NR_CPUS];
119
120#define NR_LOOPS 5
121
122extern unsigned int fast_gettimeoffset_quotient;
123
124static void __init synchronize_tsc_bp (void)
125{
126 int i;
127 unsigned long long t0;
128 unsigned long long sum, avg;
129 long long delta;
130 long one_usec;
131 int buggy = 0;
132
133	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
134
135 one_usec = cpu_khz;
136
137 atomic_set(&tsc_start_flag, 1);
138 wmb();
139
140 /*
141 * We loop a few times to get a primed instruction cache,
142 * then the last pass is more or less synchronized and
143 * the BP and APs set their cycle counters to zero all at
144 * once. This reduces the chance of having random offsets
145 * between the processors, and guarantees that the maximum
146 * delay between the cycle counters is never bigger than
147 * the latency of information-passing (cachelines) between
148 * two CPUs.
149 */
150 for (i = 0; i < NR_LOOPS; i++) {
151 /*
152 * all APs synchronize but they loop on '== num_cpus'
153 */
154 while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb();
155 atomic_set(&tsc_count_stop, 0);
156 wmb();
157 /*
158 * this lets the APs save their current TSC:
159 */
160 atomic_inc(&tsc_count_start);
161
162 sync_core();
163 rdtscll(tsc_values[smp_processor_id()]);
164 /*
165 * We clear the TSC in the last loop:
166 */
167 if (i == NR_LOOPS-1)
168 write_tsc(0, 0);
169
170 /*
171 * Wait for all APs to leave the synchronization point:
172 */
173 while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb();
174 atomic_set(&tsc_count_start, 0);
175 wmb();
176 atomic_inc(&tsc_count_stop);
177 }
178
179 sum = 0;
180 for (i = 0; i < NR_CPUS; i++) {
181 if (cpu_isset(i, cpu_callout_map)) {
182 t0 = tsc_values[i];
183 sum += t0;
184 }
185 }
186 avg = sum / num_booting_cpus();
187
188 sum = 0;
189 for (i = 0; i < NR_CPUS; i++) {
190 if (!cpu_isset(i, cpu_callout_map))
191 continue;
192
193 delta = tsc_values[i] - avg;
194 if (delta < 0)
195 delta = -delta;
196 /*
197 * We report bigger than 2 microseconds clock differences.
198 */
199 if (delta > 2*one_usec) {
200 long realdelta;
201 if (!buggy) {
202 buggy = 1;
203 printk("\n");
204 }
205 realdelta = delta / one_usec;
206 if (tsc_values[i] < avg)
207 realdelta = -realdelta;
208
209 printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
210 i, realdelta);
211 }
212
213 sum += delta;
214 }
215 if (!buggy)
216 printk("passed.\n");
217}
218
219static void __init synchronize_tsc_ap (void)
220{
221 int i;
222
223 /*
224 * Not every cpu is online at the time
225 * this gets called, so we first wait for the BP to
226 * finish SMP initialization:
227 */
228 while (!atomic_read(&tsc_start_flag)) mb();
229
230 for (i = 0; i < NR_LOOPS; i++) {
231 atomic_inc(&tsc_count_start);
232 while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb();
233
234 sync_core();
235 rdtscll(tsc_values[smp_processor_id()]);
236 if (i == NR_LOOPS-1)
237 write_tsc(0, 0);
238
239 atomic_inc(&tsc_count_stop);
240 while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
241 }
242}
243#undef NR_LOOPS
244
245static atomic_t init_deasserted;
246
247static void __init smp_callin(void)
248{
249 int cpuid, phys_id;
250 unsigned long timeout;
251
252 /*
253 * If woken up by an INIT in an 82489DX configuration
254 * we may get here before an INIT-deassert IPI reaches
255 * our local APIC. We have to wait for the IPI or we'll
256 * lock up on an APIC access.
257 */
258 while (!atomic_read(&init_deasserted));
259
260 /*
261 * (This works even if the APIC is not enabled.)
262 */
263 phys_id = GET_APIC_ID(apic_read(APIC_ID));
264 cpuid = smp_processor_id();
265 if (cpu_isset(cpuid, cpu_callin_map)) {
266 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
267 phys_id, cpuid);
268 }
269 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
270
271 /*
272 * STARTUP IPIs are fragile beasts as they might sometimes
273 * trigger some glue motherboard logic. Complete APIC bus
274 * silence for 1 second; this overestimates the time the
275 * boot CPU spends sending the up to 2 STARTUP IPIs
276 * by a factor of two. This should be enough.
277 */
278
279 /*
280 * Waiting 2s total for startup (udelay is not yet working)
281 */
282 timeout = jiffies + 2*HZ;
283 while (time_before(jiffies, timeout)) {
284 /*
285 * Has the boot CPU finished its STARTUP sequence?
286 */
287 if (cpu_isset(cpuid, cpu_callout_map))
288 break;
289 rep_nop();
290 }
291
292 if (!time_before(jiffies, timeout)) {
293 panic("smp_callin: CPU%d started up but did not get a callout!\n",
294 cpuid);
295 }
296
297 /*
298 * the boot CPU has finished the init stage and is spinning
299 * on callin_map until we finish. We are free to set up this
300 * CPU, first the APIC. (this is probably redundant on most
301 * boards)
302 */
303
304 Dprintk("CALLIN, before setup_local_APIC().\n");
305 setup_local_APIC();
306
307 local_irq_enable();
308
309 /*
310 * Get our bogomips.
311 */
312 calibrate_delay();
313 Dprintk("Stack at about %p\n",&cpuid);
314
315 disable_APIC_timer();
316
317 /*
318 * Save our processor parameters
319 */
320 smp_store_cpu_info(cpuid);
321
322 local_irq_disable();
323
324 /*
325 * Allow the master to continue.
326 */
327 cpu_set(cpuid, cpu_callin_map);
328
329 /*
330 * Synchronize the TSC with the BP
331 */
332 if (cpu_has_tsc)
333 synchronize_tsc_ap();
334}
335
336static int cpucount;
337
338/*
339 * Activate a secondary processor.
340 */
341void __init start_secondary(void)
342{
343 /*
344 * Don't put anything before smp_callin(); SMP
345 * booting is so fragile that we want to limit the
346 * things done here to the bare minimum.
347 */
348 cpu_init();
349 smp_callin();
350
351	/* otherwise gcc will move up the smp_processor_id() call before cpu_init() */
352 barrier();
353
354 Dprintk("cpu %d: waiting for commence\n", smp_processor_id());
355 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
356 rep_nop();
357
358 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
359 setup_secondary_APIC_clock();
360
361 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
362
363 if (nmi_watchdog == NMI_IO_APIC) {
364 disable_8259A_irq(0);
365 enable_NMI_through_LVT0(NULL);
366 enable_8259A_irq(0);
367 }
368
369
370 enable_APIC_timer();
371
372 /*
373 * low-memory mappings have been cleared, flush them from
374 * the local TLBs too.
375 */
376 local_flush_tlb();
377
378	Dprintk("cpu %d: setting cpu_online_map\n", smp_processor_id());
379 cpu_set(smp_processor_id(), cpu_online_map);
380 wmb();
381
382 cpu_idle();
383}
384
385extern volatile unsigned long init_rsp;
386extern void (*initial_code)(void);
387
388#if APIC_DEBUG
389static inline void inquire_remote_apic(int apicid)
390{
391 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
392 char *names[] = { "ID", "VERSION", "SPIV" };
393 int timeout, status;
394
395 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
396
397 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
398 printk("... APIC #%d %s: ", apicid, names[i]);
399
400 /*
401 * Wait for idle.
402 */
403 apic_wait_icr_idle();
404
405 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
406 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
407
408 timeout = 0;
409 do {
410 udelay(100);
411 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
412 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
413
414 switch (status) {
415 case APIC_ICR_RR_VALID:
416 status = apic_read(APIC_RRR);
417 printk("%08x\n", status);
418 break;
419 default:
420 printk("failed\n");
421 }
422 }
423}
424#endif
425
426static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
427{
428 unsigned long send_status = 0, accept_status = 0;
429 int maxlvt, timeout, num_starts, j;
430
431 Dprintk("Asserting INIT.\n");
432
433 /*
434 * Turn INIT on target chip
435 */
436 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
437
438 /*
439 * Send IPI
440 */
441 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
442 | APIC_DM_INIT);
443
444 Dprintk("Waiting for send to finish...\n");
445 timeout = 0;
446 do {
447 Dprintk("+");
448 udelay(100);
449 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
450 } while (send_status && (timeout++ < 1000));
451
452 mdelay(10);
453
454 Dprintk("Deasserting INIT.\n");
455
456 /* Target chip */
457 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
458
459 /* Send IPI */
460 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
461
462 Dprintk("Waiting for send to finish...\n");
463 timeout = 0;
464 do {
465 Dprintk("+");
466 udelay(100);
467 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
468 } while (send_status && (timeout++ < 1000));
469
470 atomic_set(&init_deasserted, 1);
471
472 /*
473 * Should we send STARTUP IPIs ?
474 *
475 * Determine this based on the APIC version.
476 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
477 */
478 if (APIC_INTEGRATED(apic_version[phys_apicid]))
479 num_starts = 2;
480 else
481 num_starts = 0;
482
483 /*
484 * Run STARTUP IPI loop.
485 */
486 Dprintk("#startup loops: %d.\n", num_starts);
487
488 maxlvt = get_maxlvt();
489
490 for (j = 1; j <= num_starts; j++) {
491 Dprintk("Sending STARTUP #%d.\n",j);
492 apic_read_around(APIC_SPIV);
493 apic_write(APIC_ESR, 0);
494 apic_read(APIC_ESR);
495 Dprintk("After apic_write.\n");
496
497 /*
498 * STARTUP IPI
499 */
500
501 /* Target chip */
502 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
503
504 /* Boot on the stack */
505 /* Kick the second */
506 apic_write_around(APIC_ICR, APIC_DM_STARTUP
507 | (start_rip >> 12));
508
509 /*
510 * Give the other CPU some time to accept the IPI.
511 */
512 udelay(300);
513
514 Dprintk("Startup point 1.\n");
515
516 Dprintk("Waiting for send to finish...\n");
517 timeout = 0;
518 do {
519 Dprintk("+");
520 udelay(100);
521 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
522 } while (send_status && (timeout++ < 1000));
523
524 /*
525 * Give the other CPU some time to accept the IPI.
526 */
527 udelay(200);
528 /*
529 * Due to the Pentium erratum 3AP.
530 */
531 if (maxlvt > 3) {
532 apic_read_around(APIC_SPIV);
533 apic_write(APIC_ESR, 0);
534 }
535 accept_status = (apic_read(APIC_ESR) & 0xEF);
536 if (send_status || accept_status)
537 break;
538 }
539 Dprintk("After Startup.\n");
540
541 if (send_status)
542 printk(KERN_ERR "APIC never delivered???\n");
543 if (accept_status)
544 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
545
546 return (send_status | accept_status);
547}
548
549static void __init do_boot_cpu (int apicid)
550{
551 struct task_struct *idle;
552 unsigned long boot_error;
553 int timeout, cpu;
554 unsigned long start_rip;
555
556 cpu = ++cpucount;
557 /*
558 * We can't use kernel_thread since we must avoid
559 * rescheduling the child.
560 */
561 idle = fork_idle(cpu);
562 if (IS_ERR(idle))
563 panic("failed fork for CPU %d", cpu);
564 x86_cpu_to_apicid[cpu] = apicid;
565
566 cpu_pda[cpu].pcurrent = idle;
567
568 start_rip = setup_trampoline();
569
570 init_rsp = idle->thread.rsp;
571 per_cpu(init_tss,cpu).rsp0 = init_rsp;
572 initial_code = start_secondary;
573 clear_ti_thread_flag(idle->thread_info, TIF_FORK);
574
575 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
576 start_rip, init_rsp);
577
578 /*
579 * This grunge runs the startup process for
580 * the targeted processor.
581 */
582
583 atomic_set(&init_deasserted, 0);
584
585 Dprintk("Setting warm reset code and vector.\n");
586
587 CMOS_WRITE(0xa, 0xf);
588 local_flush_tlb();
589 Dprintk("1.\n");
590 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
591 Dprintk("2.\n");
592 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
593 Dprintk("3.\n");
594
595 /*
596 * Be paranoid about clearing APIC errors.
597 */
598 if (APIC_INTEGRATED(apic_version[apicid])) {
599 apic_read_around(APIC_SPIV);
600 apic_write(APIC_ESR, 0);
601 apic_read(APIC_ESR);
602 }
603
604 /*
605 * Status is now clean
606 */
607 boot_error = 0;
608
609 /*
610 * Starting actual IPI sequence...
611 */
612 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
613
614 if (!boot_error) {
615 /*
616 * allow APs to start initializing.
617 */
618 Dprintk("Before Callout %d.\n", cpu);
619 cpu_set(cpu, cpu_callout_map);
620 Dprintk("After Callout %d.\n", cpu);
621
622 /*
623 * Wait 5s total for a response
624 */
625 for (timeout = 0; timeout < 50000; timeout++) {
626 if (cpu_isset(cpu, cpu_callin_map))
627 break; /* It has booted */
628 udelay(100);
629 }
630
631 if (cpu_isset(cpu, cpu_callin_map)) {
632 /* number CPUs logically, starting from 1 (BSP is 0) */
633 Dprintk("OK.\n");
634 print_cpu_info(&cpu_data[cpu]);
635 Dprintk("CPU has booted.\n");
636 } else {
637 boot_error = 1;
638 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
639 == 0xA5)
640 /* trampoline started but...? */
641 printk("Stuck ??\n");
642 else
643 /* trampoline code not run */
644 printk("Not responding.\n");
645#if APIC_DEBUG
646 inquire_remote_apic(apicid);
647#endif
648 }
649 }
650 if (boot_error) {
651 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
652 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
653 cpucount--;
654 x86_cpu_to_apicid[cpu] = BAD_APICID;
655 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
656 }
657}
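
A standalone sketch (not kernel code) of the real-mode warm-reset vector
encoding used in do_boot_cpu() above: the word at 0x469 holds a segment and
the word at 0x467 an offset, and the BIOS resumes at segment*16 + offset,
which recovers the trampoline's physical address (the start_rip value below
is an assumed example):

	#include <stdio.h>

	int main(void)
	{
		unsigned long start_rip = 0x6000;	/* assumed trampoline base */
		unsigned short seg = start_rip >> 4;	/* written to 0x469 */
		unsigned short off = start_rip & 0xf;	/* written to 0x467 */
		unsigned long target = ((unsigned long)seg << 4) + off;

		printf("%#lx -> %04x:%04x -> %#lx\n", start_rip, seg, off, target);
		return 0;
	}
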
658
659static void smp_tune_scheduling (void)
660{
661 int cachesize; /* kB */
662 unsigned long bandwidth = 1000; /* MB/s */
663 /*
664 * Rough estimate for SMP scheduling: this is the number of
665 * cycles it takes for a fully memory-limited process to flush
666 * the SMP-local cache.
667 *
668 * (For a P5 this pretty much means we will almost always choose another
669 * idle CPU at wakeup time, due to the small L1 cache;
670 * on PIIs it's around 50-100 usecs, depending on
671 * the cache size)
672 */
673
674 if (!cpu_khz) {
675 return;
676 } else {
677 cachesize = boot_cpu_data.x86_cache_size;
678 if (cachesize == -1) {
679 cachesize = 16; /* Pentiums, 2x8kB cache */
680 bandwidth = 100;
681 }
682 }
683}
684
685/*
686 * Cycle through the processors sending APIC IPIs to boot each.
687 */
688
689static void __init smp_boot_cpus(unsigned int max_cpus)
690{
691 unsigned apicid, cpu, bit, kicked;
692
693 nmi_watchdog_default();
694
695 /*
696 * Setup boot CPU information
697 */
698 smp_store_cpu_info(0); /* Final full version of the data */
699 printk(KERN_INFO "CPU%d: ", 0);
700 print_cpu_info(&cpu_data[0]);
701
702 current_thread_info()->cpu = 0;
703 smp_tune_scheduling();
704
705 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
706 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
707 hard_smp_processor_id());
708 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
709 }
710
711 /*
712 * If we couldn't find an SMP configuration at boot time,
713 * get out of here now!
714 */
715 if (!smp_found_config) {
716 printk(KERN_NOTICE "SMP motherboard not detected.\n");
717 io_apic_irqs = 0;
718 cpu_online_map = cpumask_of_cpu(0);
719 cpu_set(0, cpu_sibling_map[0]);
720 phys_cpu_present_map = physid_mask_of_physid(0);
721 if (APIC_init_uniprocessor())
722 printk(KERN_NOTICE "Local APIC not detected."
723 " Using dummy APIC emulation.\n");
724 goto smp_done;
725 }
726
727 /*
728 * Should not be necessary because the MP table should list the boot
729 * CPU too, but we do it for the sake of robustness anyway.
730 */
731 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
732 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
733 boot_cpu_id);
734 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
735 }
736
737 /*
738 * If we couldn't find a local APIC, then get out of here now!
739 */
740 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
741 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
742 boot_cpu_id);
743 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
744 io_apic_irqs = 0;
745 cpu_online_map = cpumask_of_cpu(0);
746 cpu_set(0, cpu_sibling_map[0]);
747 phys_cpu_present_map = physid_mask_of_physid(0);
748 disable_apic = 1;
749 goto smp_done;
750 }
751
752 verify_local_APIC();
753
754 /*
755 * If SMP should be disabled, then really disable it!
756 */
757 if (!max_cpus) {
758 smp_found_config = 0;
759 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
760 io_apic_irqs = 0;
761 cpu_online_map = cpumask_of_cpu(0);
762 cpu_set(0, cpu_sibling_map[0]);
763 phys_cpu_present_map = physid_mask_of_physid(0);
764 disable_apic = 1;
765 goto smp_done;
766 }
767
768 connect_bsp_APIC();
769 setup_local_APIC();
770
771 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id)
772 BUG();
773
774 x86_cpu_to_apicid[0] = boot_cpu_id;
775
776 /*
777 * Now scan the CPU present map and fire up the other CPUs.
778 */
779 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
780
781 kicked = 1;
782 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
783 apicid = cpu_present_to_apicid(bit);
784 /*
785 * Don't even attempt to start the boot CPU!
786 */
787 if (apicid == boot_cpu_id || (apicid == BAD_APICID))
788 continue;
789
790 if (!physid_isset(apicid, phys_cpu_present_map))
791 continue;
792 if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
793 continue;
794
795 do_boot_cpu(apicid);
796 ++kicked;
797 }
798
799 /*
800 * Cleanup possible dangling ends...
801 */
802 {
803 /*
804 * Install writable page 0 entry to set BIOS data area.
805 */
806 local_flush_tlb();
807
808 /*
809 * Paranoid: Set warm reset code and vector here back
810 * to default values.
811 */
812 CMOS_WRITE(0, 0xf);
813
814 *((volatile int *) phys_to_virt(0x467)) = 0;
815 }
816
817 /*
818 * Allow the user to impress friends.
819 */
820
821 Dprintk("Before bogomips.\n");
822 if (!cpucount) {
823 printk(KERN_INFO "Only one processor found.\n");
824 } else {
825 unsigned long bogosum = 0;
826 for (cpu = 0; cpu < NR_CPUS; cpu++)
827 if (cpu_isset(cpu, cpu_callout_map))
828 bogosum += cpu_data[cpu].loops_per_jiffy;
829 printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
830 cpucount+1,
831 bogosum/(500000/HZ),
832 (bogosum/(5000/HZ))%100);
833 Dprintk("Before bogocount - setting activated=1.\n");
834 }
835
836 /*
837 * Construct cpu_sibling_map[], so that we can identify
838 * sibling CPUs efficiently.
839 */
840 for (cpu = 0; cpu < NR_CPUS; cpu++)
841 cpus_clear(cpu_sibling_map[cpu]);
842
843 for (cpu = 0; cpu < NR_CPUS; cpu++) {
844 int siblings = 0;
845 int i;
846 if (!cpu_isset(cpu, cpu_callout_map))
847 continue;
848
849 if (smp_num_siblings > 1) {
850 for (i = 0; i < NR_CPUS; i++) {
851 if (!cpu_isset(i, cpu_callout_map))
852 continue;
853 if (phys_proc_id[cpu] == phys_proc_id[i]) {
854 siblings++;
855 cpu_set(i, cpu_sibling_map[cpu]);
856 }
857 }
858 } else {
859 siblings++;
860 cpu_set(cpu, cpu_sibling_map[cpu]);
861 }
862
863 if (siblings != smp_num_siblings) {
864 printk(KERN_WARNING
865 "WARNING: %d siblings found for CPU%d, should be %d\n",
866 siblings, cpu, smp_num_siblings);
867 smp_num_siblings = siblings;
868 }
869 }
870
871 Dprintk("Boot done.\n");
872
873 /*
874 * Here we can be sure that there is an IO-APIC in the system. Let's
875 * go and set it up:
876 */
877 if (!skip_ioapic_setup && nr_ioapics)
878 setup_IO_APIC();
879 else
880 nr_ioapics = 0;
881
882 setup_boot_APIC_clock();
883
884 /*
885 * Synchronize the TSC with the AP
886 */
887 if (cpu_has_tsc && cpucount)
888 synchronize_tsc_bp();
889
890 smp_done:
891 time_init_smp();
892}
893
894/* These are wrappers to interface to the new boot process. Someone
895 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
896void __init smp_prepare_cpus(unsigned int max_cpus)
897{
898 smp_boot_cpus(max_cpus);
899}
900
901void __devinit smp_prepare_boot_cpu(void)
902{
903 cpu_set(smp_processor_id(), cpu_online_map);
904 cpu_set(smp_processor_id(), cpu_callout_map);
905}
906
907int __devinit __cpu_up(unsigned int cpu)
908{
909 /* This only works at boot for x86. See "rewrite" above. */
910 if (cpu_isset(cpu, smp_commenced_mask)) {
911 local_irq_enable();
912 return -ENOSYS;
913 }
914
915 /* In case one didn't come up */
916 if (!cpu_isset(cpu, cpu_callin_map)) {
917 local_irq_enable();
918 return -EIO;
919 }
920 local_irq_enable();
921
922 /* Unleash the CPU! */
923 Dprintk("waiting for cpu %d\n", cpu);
924
925 cpu_set(cpu, smp_commenced_mask);
926 while (!cpu_isset(cpu, cpu_online_map))
927 mb();
928 return 0;
929}
930
931void __init smp_cpus_done(unsigned int max_cpus)
932{
933#ifdef CONFIG_X86_IO_APIC
934 setup_ioapic_dest();
935#endif
936 zap_low_mappings();
937}
938
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
new file mode 100644
index 000000000000..ebaa1e37d657
--- /dev/null
+++ b/arch/x86_64/kernel/suspend.c
@@ -0,0 +1,157 @@
1/*
2 * Suspend support specific for x86-64.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/config.h>
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/spinlock.h>
16#include <linux/poll.h>
17#include <linux/delay.h>
18#include <linux/sysrq.h>
19#include <linux/proc_fs.h>
20#include <linux/irq.h>
21#include <linux/pm.h>
22#include <linux/device.h>
23#include <linux/suspend.h>
24#include <asm/uaccess.h>
25#include <asm/acpi.h>
26#include <asm/tlbflush.h>
27#include <asm/io.h>
28#include <asm/proto.h>
29
30struct saved_context saved_context;
31
32unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
33unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
34unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
35unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
36unsigned long saved_context_eflags;
37
38void __save_processor_state(struct saved_context *ctxt)
39{
40 kernel_fpu_begin();
41
42 /*
43 * descriptor tables
44 */
45 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
46 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
47 asm volatile ("sldt %0" : "=m" (ctxt->ldt));
48 asm volatile ("str %0" : "=m" (ctxt->tr));
49
50 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
51 /* EFER should be constant for kernel version, no need to handle it. */
52 /*
53 * segment registers
54 */
55 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
56 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
57 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
58 asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
59 asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
60
61 rdmsrl(MSR_FS_BASE, ctxt->fs_base);
62 rdmsrl(MSR_GS_BASE, ctxt->gs_base);
63 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
64
65 /*
66 * control registers
67 */
68 asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0));
69 asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
70 asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
71 asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
72}
73
74void save_processor_state(void)
75{
76 __save_processor_state(&saved_context);
77}
78
79static void
80do_fpu_end(void)
81{
82 /* restore FPU regs if necessary */
83 /* Do it out of line so that gcc does not move cr0 load to some stupid place */
84 kernel_fpu_end();
85 mxcsr_feature_mask_init();
86}
87
88void __restore_processor_state(struct saved_context *ctxt)
89{
90 /*
91 * control registers
92 */
93 asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
94 asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
95 asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
96 asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
97
98 /*
99 * segment registers
100 */
101 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
102 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
103 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
104 load_gs_index(ctxt->gs);
105 asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
106
107 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
108 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
109 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
110
111 /*
112 * now restore the descriptor tables to their proper values
113 * ltr is done in fix_processor_context().
114 */
115 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
116 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
117 asm volatile ("lldt %0" :: "m" (ctxt->ldt));
118
119 fix_processor_context();
120
121 do_fpu_end();
122}
123
124void restore_processor_state(void)
125{
126 __restore_processor_state(&saved_context);
127}
128
129void fix_processor_context(void)
130{
131 int cpu = smp_processor_id();
132 struct tss_struct *t = &per_cpu(init_tss, cpu);
133
134	set_tss_desc(cpu,t);	/* This just modifies memory; it should not be necessary. But... it is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
135
136 cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9;
137
138 syscall_init(); /* This sets MSR_*STAR and related */
139 load_TR_desc(); /* This does ltr */
140 load_LDT(&current->active_mm->context); /* This does lldt */
141
142 /*
143 * Now maybe reload the debug registers
144 */
145 if (current->thread.debugreg7){
146 loaddebug(&current->thread, 0);
147 loaddebug(&current->thread, 1);
148 loaddebug(&current->thread, 2);
149 loaddebug(&current->thread, 3);
150 /* no 4 and 5 */
151 loaddebug(&current->thread, 6);
152 loaddebug(&current->thread, 7);
153 }
154
155}
156
157
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
new file mode 100644
index 000000000000..53f8e1659511
--- /dev/null
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -0,0 +1,104 @@
1/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
2 *
3 * Distribute under GPLv2.
4 *
5 * swsusp_arch_resume may not use any stack, nor any variable that is
6 * not "NoSave" during copying pages:
7 *
8 * It's rewriting one kernel image with another. What is the stack in the "old"
9 * image could very well be a data page in the "new" image, and overwriting
10 * your own stack from under you is a bad idea.
11 */
12
13 .text
14#include <linux/linkage.h>
15#include <asm/segment.h>
16#include <asm/page.h>
17#include <asm/offset.h>
18
19ENTRY(swsusp_arch_suspend)
20
21 movq %rsp, saved_context_esp(%rip)
22 movq %rax, saved_context_eax(%rip)
23 movq %rbx, saved_context_ebx(%rip)
24 movq %rcx, saved_context_ecx(%rip)
25 movq %rdx, saved_context_edx(%rip)
26 movq %rbp, saved_context_ebp(%rip)
27 movq %rsi, saved_context_esi(%rip)
28 movq %rdi, saved_context_edi(%rip)
29 movq %r8, saved_context_r08(%rip)
30 movq %r9, saved_context_r09(%rip)
31 movq %r10, saved_context_r10(%rip)
32 movq %r11, saved_context_r11(%rip)
33 movq %r12, saved_context_r12(%rip)
34 movq %r13, saved_context_r13(%rip)
35 movq %r14, saved_context_r14(%rip)
36 movq %r15, saved_context_r15(%rip)
37 pushfq ; popq saved_context_eflags(%rip)
38
39 call swsusp_save
40 ret
41
42ENTRY(swsusp_arch_resume)
43 /* set up cr3 */
44 leaq init_level4_pgt(%rip),%rax
45 subq $__START_KERNEL_map,%rax
46 movq %rax,%cr3
47
48 movq mmu_cr4_features(%rip), %rax
49 movq %rax, %rdx
50 andq $~(1<<7), %rdx # PGE
51 movq %rdx, %cr4; # turn off PGE
52 movq %cr3, %rcx; # flush TLB
53 movq %rcx, %cr3;
54 movq %rax, %cr4; # turn PGE back on
55
56 movq pagedir_nosave(%rip), %rdx
57loop:
58 testq %rdx, %rdx
59 jz done
60
61 /* get addresses from the pbe and copy the page */
62 movq pbe_address(%rdx), %rsi
63 movq pbe_orig_address(%rdx), %rdi
64 movq $512, %rcx
65 rep
66 movsq
67
68 /* progress to the next pbe */
69 movq pbe_next(%rdx), %rdx
70 jmp loop
71done:
72 /* Flush TLB, including "global" things (vmalloc) */
73 movq mmu_cr4_features(%rip), %rax
74 movq %rax, %rdx
75 andq $~(1<<7), %rdx; # PGE
76 movq %rdx, %cr4; # turn off PGE
77 movq %cr3, %rcx; # flush TLB
78 movq %rcx, %cr3
79 movq %rax, %cr4; # turn PGE back on
80
81 movl $24, %eax
82 movl %eax, %ds
83
84 movq saved_context_esp(%rip), %rsp
85 movq saved_context_ebp(%rip), %rbp
86 /* Don't restore %rax, it must be 0 anyway */
87 movq saved_context_ebx(%rip), %rbx
88 movq saved_context_ecx(%rip), %rcx
89 movq saved_context_edx(%rip), %rdx
90 movq saved_context_esi(%rip), %rsi
91 movq saved_context_edi(%rip), %rdi
92 movq saved_context_r08(%rip), %r8
93 movq saved_context_r09(%rip), %r9
94 movq saved_context_r10(%rip), %r10
95 movq saved_context_r11(%rip), %r11
96 movq saved_context_r12(%rip), %r12
97 movq saved_context_r13(%rip), %r13
98 movq saved_context_r14(%rip), %r14
99 movq saved_context_r15(%rip), %r15
100 pushq saved_context_eflags(%rip) ; popfq
101
102 xorq %rax, %rax
103
104 ret
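
A rough, standalone C model (not kernel code) of the pbe list walk performed
by swsusp_arch_resume above: each list element names a saved copy of a page
and the address it must be restored to, and each copy moves 512 quadwords
(one 4 KB page). The struct layout here is an assumption based only on the
pbe_* offsets used in the assembly:

	#include <stdio.h>
	#include <string.h>

	struct pbe {
		void *address;		/* saved copy of the page   (pbe_address)      */
		void *orig_address;	/* restore destination      (pbe_orig_address) */
		struct pbe *next;	/* next list element        (pbe_next)         */
	};

	int main(void)
	{
		unsigned long saved[512], live[512];
		struct pbe last = { saved, live, NULL };
		struct pbe *p;

		memset(saved, 0xa5, sizeof(saved));
		memset(live, 0, sizeof(live));

		for (p = &last; p != NULL; p = p->next) {
			unsigned long *src = p->address;
			unsigned long *dst = p->orig_address;
			int i;

			for (i = 0; i < 512; i++)	/* 512 * 8 bytes = one page */
				dst[i] = src[i];
		}

		printf("first restored quadword: %#lx\n", live[0]);
		return 0;
	}
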
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
new file mode 100644
index 000000000000..477d8be57d64
--- /dev/null
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -0,0 +1,173 @@
1/*
2 * linux/arch/x86_64/kernel/sys_x86_64.c
3 */
4
5#include <linux/errno.h>
6#include <linux/sched.h>
7#include <linux/syscalls.h>
8#include <linux/mm.h>
9#include <linux/smp.h>
10#include <linux/smp_lock.h>
11#include <linux/sem.h>
12#include <linux/msg.h>
13#include <linux/shm.h>
14#include <linux/stat.h>
15#include <linux/mman.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/personality.h>
19
20#include <asm/uaccess.h>
21#include <asm/ia32.h>
22
23/*
24 * sys_pipe() is the normal C calling standard for creating
25 * a pipe. It's not the way Unix traditionally does this, though.
26 */
27asmlinkage long sys_pipe(int __user *fildes)
28{
29 int fd[2];
30 int error;
31
32 error = do_pipe(fd);
33 if (!error) {
34 if (copy_to_user(fildes, fd, 2*sizeof(int)))
35 error = -EFAULT;
36 }
37 return error;
38}
39
40asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
41 unsigned long fd, unsigned long off)
42{
43 long error;
44 struct file * file;
45
46 error = -EINVAL;
47 if (off & ~PAGE_MASK)
48 goto out;
49
50 error = -EBADF;
51 file = NULL;
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58 down_write(&current->mm->mmap_sem);
59 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
60 up_write(&current->mm->mmap_sem);
61
62 if (file)
63 fput(file);
64out:
65 return error;
66}
67
68static void find_start_end(unsigned long flags, unsigned long *begin,
69 unsigned long *end)
70{
71#ifdef CONFIG_IA32_EMULATION
72 if (test_thread_flag(TIF_IA32)) {
73 *begin = TASK_UNMAPPED_32;
74 *end = IA32_PAGE_OFFSET;
75 } else
76#endif
77 if (flags & MAP_32BIT) {
78		/* This is usually needed to map code in the small
79		   model, so it needs to be in the first 31 bits. Limit
80		   it to that. This means we need to move the
81		   unmapped base down for this case. This can
82		   conflict with the heap, but we assume that glibc
83 malloc knows how to fall back to mmap. Give it 1GB
84 of playground for now. -AK */
85 *begin = 0x40000000;
86 *end = 0x80000000;
87 } else {
88 *begin = TASK_UNMAPPED_64;
89 *end = TASK_SIZE;
90 }
91}
92
93unsigned long
94arch_get_unmapped_area(struct file *filp, unsigned long addr,
95 unsigned long len, unsigned long pgoff, unsigned long flags)
96{
97 struct mm_struct *mm = current->mm;
98 struct vm_area_struct *vma;
99 unsigned long start_addr;
100 unsigned long begin, end;
101
102 find_start_end(flags, &begin, &end);
103
104 if (len > end)
105 return -ENOMEM;
106
107 if (addr) {
108 addr = PAGE_ALIGN(addr);
109 vma = find_vma(mm, addr);
110 if (end - len >= addr &&
111 (!vma || addr + len <= vma->vm_start))
112 return addr;
113 }
114 addr = mm->free_area_cache;
115 if (addr < begin)
116 addr = begin;
117 start_addr = addr;
118
119full_search:
120 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
121 /* At this point: (!vma || addr < vma->vm_end). */
122 if (end - len < addr) {
123 /*
124 * Start a new search - just in case we missed
125 * some holes.
126 */
127 if (start_addr != begin) {
128 start_addr = addr = begin;
129 goto full_search;
130 }
131 return -ENOMEM;
132 }
133 if (!vma || addr + len <= vma->vm_start) {
134 /*
135 * Remember the place where we stopped the search:
136 */
137 mm->free_area_cache = addr + len;
138 return addr;
139 }
140 addr = vma->vm_end;
141 }
142}
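
A hypothetical userspace illustration of the MAP_32BIT policy implemented by
find_start_end()/arch_get_unmapped_area() above; under the policy above such a
mapping should land in the 1 GB window starting at 0x40000000 (the flags and
size here are just an example):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		printf("MAP_32BIT mapping at %p\n", p);	/* expected below 0x80000000 */
		munmap(p, 4096);
		return 0;
	}
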
143
144asmlinkage long sys_uname(struct new_utsname __user * name)
145{
146 int err;
147 down_read(&uts_sem);
148 err = copy_to_user(name, &system_utsname, sizeof (*name));
149 up_read(&uts_sem);
150 if (personality(current->personality) == PER_LINUX32)
151 err |= copy_to_user(&name->machine, "i686", 5);
152 return err ? -EFAULT : 0;
153}
154
155asmlinkage long wrap_sys_shmat(int shmid, char __user *shmaddr, int shmflg)
156{
157 unsigned long raddr;
158 return do_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr;
159}
160
161asmlinkage long sys_time64(long __user * tloc)
162{
163 struct timeval now;
164 int i;
165
166 do_gettimeofday(&now);
167 i = now.tv_sec;
168 if (tloc) {
169 if (put_user(i,tloc))
170 i = -EFAULT;
171 }
172 return i;
173}
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
new file mode 100644
index 000000000000..e263685f864c
--- /dev/null
+++ b/arch/x86_64/kernel/syscall.c
@@ -0,0 +1,26 @@
1/* System call table for x86-64. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <linux/config.h>
7
8#define __NO_STUBS
9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_
12#include <asm-x86_64/unistd.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [ nr ] = sym,
16#undef _ASM_X86_64_UNISTD_H_
17
18typedef void (*sys_call_ptr_t)(void);
19
20extern void sys_ni_syscall(void);
21
22sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = {
23	/* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm-x86_64/unistd.h>
26};
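
A standalone illustration (not kernel code) of the GNU designated-initializer
pattern used for sys_call_table above: fill the whole array with a default
handler via a range initializer, then let individual entries override it (all
names below are made up):

	#include <stdio.h>

	typedef void (*handler_t)(void);

	static void default_handler(void) { puts("ni"); }
	static void handler_zero(void)    { puts("zero"); }

	static handler_t table[16] = {
		[0 ... 15] = &default_handler,	/* GCC range initializer */
		[0]        = &handler_zero,	/* later entries win */
	};

	int main(void)
	{
		table[0]();	/* "zero" */
		table[5]();	/* "ni"   */
		return 0;
	}
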
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
new file mode 100644
index 000000000000..3bafe438fa75
--- /dev/null
+++ b/arch/x86_64/kernel/time.c
@@ -0,0 +1,1262 @@
1/*
2 * linux/arch/x86-64/kernel/time.c
3 *
4 * "High Precision Event Timer" based timekeeping.
5 *
6 * Copyright (c) 1991,1992,1995 Linus Torvalds
7 * Copyright (c) 1994 Alan Modra
8 * Copyright (c) 1995 Markus Kuhn
9 * Copyright (c) 1996 Ingo Molnar
10 * Copyright (c) 1998 Andrea Arcangeli
11 * Copyright (c) 2002 Vojtech Pavlik
12 * Copyright (c) 2003 Andi Kleen
13 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
14 */
15
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/mc146818rtc.h>
21#include <linux/irq.h>
22#include <linux/time.h>
23#include <linux/ioport.h>
24#include <linux/module.h>
25#include <linux/device.h>
26#include <linux/sysdev.h>
27#include <linux/bcd.h>
28#include <linux/kallsyms.h>
29#include <asm/8253pit.h>
30#include <asm/pgtable.h>
31#include <asm/vsyscall.h>
32#include <asm/timex.h>
33#include <asm/proto.h>
34#include <asm/hpet.h>
35#include <asm/sections.h>
36#include <linux/cpufreq.h>
37#include <linux/hpet.h>
38#ifdef CONFIG_X86_LOCAL_APIC
39#include <asm/apic.h>
40#endif
41
42u64 jiffies_64 = INITIAL_JIFFIES;
43
44EXPORT_SYMBOL(jiffies_64);
45
46#ifdef CONFIG_CPU_FREQ
47static void cpufreq_delayed_get(void);
48#endif
49extern void i8254_timer_resume(void);
50extern int using_apic_timer;
51
52DEFINE_SPINLOCK(rtc_lock);
53DEFINE_SPINLOCK(i8253_lock);
54
55static int nohpet __initdata = 0;
56static int notsc __initdata = 0;
57
58#undef HPET_HACK_ENABLE_DANGEROUS
59
60unsigned int cpu_khz; /* TSC clocks / usec, not used here */
61static unsigned long hpet_period; /* fsecs / HPET clock */
62unsigned long hpet_tick; /* HPET clocks / interrupt */
63unsigned long vxtime_hz = PIT_TICK_RATE;
64int report_lost_ticks; /* command line option */
65unsigned long long monotonic_base;
66
67struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
68
69volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
70unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
71struct timespec __xtime __section_xtime;
72struct timezone __sys_tz __section_sys_tz;
73
74static inline void rdtscll_sync(unsigned long *tsc)
75{
76#ifdef CONFIG_SMP
77 sync_core();
78#endif
79 rdtscll(*tsc);
80}
81
82/*
83 * do_gettimeoffset() returns microseconds since last timer interrupt was
84 * triggered by hardware. A memory read of HPET is slower than a register read
85 * of TSC, but much more reliable. It's also synchronized to the timer
86 * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a
87 * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
88 * This is not a problem, because jiffies hasn't updated either. They are bound
89 * together by xtime_lock.
90 */
91
92static inline unsigned int do_gettimeoffset_tsc(void)
93{
94 unsigned long t;
95 unsigned long x;
96 rdtscll_sync(&t);
97 if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
98 x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
99 return x;
100}
101
102static inline unsigned int do_gettimeoffset_hpet(void)
103{
104 return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
105}
106
107unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
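
A standalone sketch (not kernel code) of the 32.32 fixed-point scaling used by
do_gettimeoffset_tsc() above: tsc_quot holds microseconds per TSC cycle as a
32.32 fraction (see the later assignment of vxtime.tsc_quot), so
(cycles * tsc_quot) >> 32 yields microseconds since the last tick (the cpu_khz
value below is made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long cpu_khz = 2200000;			 /* assume 2.2 GHz */
		unsigned long long tsc_quot = (1000ULL << 32) / cpu_khz; /* usec/cycle, 32.32 */
		unsigned long long cycles = 1100000;			 /* TSC delta since last tick */
		unsigned long long usec = (cycles * tsc_quot) >> 32;

		printf("%llu cycles ~= %llu usec\n", cycles, usec);	 /* roughly 500 usec */
		return 0;
	}
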
108
109/*
110 * This version of gettimeofday() has microsecond resolution and better than
111 * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
112 * MHz) HPET timer.
113 */
114
115void do_gettimeofday(struct timeval *tv)
116{
117 unsigned long seq, t;
118 unsigned int sec, usec;
119
120 do {
121 seq = read_seqbegin(&xtime_lock);
122
123 sec = xtime.tv_sec;
124 usec = xtime.tv_nsec / 1000;
125
126 /* i386 does some correction here to keep the clock
127	   monotonic even when ntpd is fixing drift.
128	   But that didn't work for me; the clock is non-monotonic
129	   with ntp anyway.
130 I dropped all corrections now until a real solution can
131 be found. Note when you fix it here you need to do the same
132 in arch/x86_64/kernel/vsyscall.c and export all needed
133 variables in vmlinux.lds. -AK */
134
135 t = (jiffies - wall_jiffies) * (1000000L / HZ) +
136 do_gettimeoffset();
137 usec += t;
138
139 } while (read_seqretry(&xtime_lock, seq));
140
141 tv->tv_sec = sec + usec / 1000000;
142 tv->tv_usec = usec % 1000000;
143}
144
145EXPORT_SYMBOL(do_gettimeofday);
146
147/*
148 * settimeofday() first undoes the correction that gettimeofday would do
149 * on the time, and then saves it. This is ugly, but has been like this for
150 * ages already.
151 */
152
153int do_settimeofday(struct timespec *tv)
154{
155 time_t wtm_sec, sec = tv->tv_sec;
156 long wtm_nsec, nsec = tv->tv_nsec;
157
158 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
159 return -EINVAL;
160
161 write_seqlock_irq(&xtime_lock);
162
163 nsec -= do_gettimeoffset() * 1000 +
164 (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
165
166 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
167 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
168
169 set_normalized_timespec(&xtime, sec, nsec);
170 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
171
172 time_adjust = 0; /* stop active adjtime() */
173 time_status |= STA_UNSYNC;
174 time_maxerror = NTP_PHASE_LIMIT;
175 time_esterror = NTP_PHASE_LIMIT;
176
177 write_sequnlock_irq(&xtime_lock);
178 clock_was_set();
179 return 0;
180}
181
182EXPORT_SYMBOL(do_settimeofday);
183
184unsigned long profile_pc(struct pt_regs *regs)
185{
186 unsigned long pc = instruction_pointer(regs);
187
188 /* Assume the lock function has either no stack frame or only a single word.
189 This checks if the address on the stack looks like a kernel text address.
190 There is a small window for false hits, but in that case the tick
191 is just accounted to the spinlock function.
192 Better would be to write these functions in assembler again
193 and check exactly. */
194 if (in_lock_functions(pc)) {
195 char *v = *(char **)regs->rsp;
196 if ((v >= _stext && v <= _etext) ||
197 (v >= _sinittext && v <= _einittext) ||
198 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
199 return (unsigned long)v;
200 return ((unsigned long *)regs->rsp)[1];
201 }
202 return pc;
203}
204EXPORT_SYMBOL(profile_pc);
205
206/*
207 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
208 * ms after the second nowtime has started, because when nowtime is written
209 * into the registers of the CMOS clock, it will jump to the next second
210 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
211 * sheet for details.
212 */
213
214static void set_rtc_mmss(unsigned long nowtime)
215{
216 int real_seconds, real_minutes, cmos_minutes;
217 unsigned char control, freq_select;
218
219/*
220 * IRQs are disabled when we're called from the timer interrupt,
221 * no need for spin_lock_irqsave()
222 */
223
224 spin_lock(&rtc_lock);
225
226/*
227 * Tell the clock it's being set and stop it.
228 */
229
230 control = CMOS_READ(RTC_CONTROL);
231 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
232
233 freq_select = CMOS_READ(RTC_FREQ_SELECT);
234 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
235
236 cmos_minutes = CMOS_READ(RTC_MINUTES);
237 BCD_TO_BIN(cmos_minutes);
238
239/*
240 * since we're only adjusting minutes and seconds, don't interfere with hour
241 * overflow. This avoids messing with unknown time zones but requires your RTC
242 * not to be off by more than 15 minutes. Since we're calling it only when
243 * our clock is externally synchronized using NTP, this shouldn't be a problem.
244 */
245
246 real_seconds = nowtime % 60;
247 real_minutes = nowtime / 60;
248 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
249 real_minutes += 30; /* correct for half hour time zone */
250 real_minutes %= 60;
251
252#if 0
253 /* AMD 8111 is a really bad time keeper and hits this regularly.
254 It probably was an attempt to avoid screwing up DST, but ignore
255 that for now. */
256 if (abs(real_minutes - cmos_minutes) >= 30) {
257 printk(KERN_WARNING "time.c: can't update CMOS clock "
258 "from %d to %d\n", cmos_minutes, real_minutes);
259 } else
260#endif
261
262 {
263 BIN_TO_BCD(real_seconds);
264 BIN_TO_BCD(real_minutes);
265 CMOS_WRITE(real_seconds, RTC_SECONDS);
266 CMOS_WRITE(real_minutes, RTC_MINUTES);
267 }
268
269/*
270 * The following flags have to be released exactly in this order, otherwise the
271 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
272 * not reset the oscillator and will not update precisely 500 ms later. You
273 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
274 * believes data sheets anyway ... -- Markus Kuhn
275 */
276
277 CMOS_WRITE(control, RTC_CONTROL);
278 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
279
280 spin_unlock(&rtc_lock);
281}
282
283
284/* monotonic_clock(): returns # of nanoseconds passed since time_init()
285 * Note: This function is required to return accurate
286 * time even in the absence of multiple timer ticks.
287 */
288unsigned long long monotonic_clock(void)
289{
290 unsigned long seq;
291 u32 last_offset, this_offset, offset;
292 unsigned long long base;
293
294 if (vxtime.mode == VXTIME_HPET) {
295 do {
296 seq = read_seqbegin(&xtime_lock);
297
298 last_offset = vxtime.last;
299 base = monotonic_base;
300 this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
301
302 } while (read_seqretry(&xtime_lock, seq));
303 offset = (this_offset - last_offset);
304		offset = (this_offset - last_offset);
305		offset *= (NSEC_PER_SEC / HZ) / hpet_tick;
306		return base + offset;
307	} else {
307 do {
308 seq = read_seqbegin(&xtime_lock);
309
310 last_offset = vxtime.last_tsc;
311 base = monotonic_base;
312 } while (read_seqretry(&xtime_lock, seq));
313 sync_core();
314 rdtscll(this_offset);
315 offset = (this_offset - last_offset)*1000/cpu_khz;
316 return base + offset;
317 }
318
319
320}
321EXPORT_SYMBOL(monotonic_clock);
322
323static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
324{
325 static long lost_count;
326 static int warned;
327
328 if (report_lost_ticks) {
329 printk(KERN_WARNING "time.c: Lost %d timer "
330 "tick(s)! ", lost);
331 print_symbol("rip %s)\n", regs->rip);
332 }
333
334 if (lost_count == 1000 && !warned) {
335 printk(KERN_WARNING
336 "warning: many lost ticks.\n"
337		       KERN_WARNING "Your time source seems to be unstable or "
338				"some driver is hogging interrupts\n");
339 print_symbol("rip %s\n", regs->rip);
340 if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
341 printk(KERN_WARNING "Falling back to HPET\n");
342 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
343 vxtime.mode = VXTIME_HPET;
344 do_gettimeoffset = do_gettimeoffset_hpet;
345 }
346 /* else should fall back to PIT, but code missing. */
347 warned = 1;
348 } else
349 lost_count++;
350
351#ifdef CONFIG_CPU_FREQ
352 /* In some cases the CPU can change frequency without us noticing
353	   (like going into thermal throttle).
354	   Give cpufreq a chance to catch up. */
355 if ((lost_count+1) % 25 == 0) {
356 cpufreq_delayed_get();
357 }
358#endif
359}
360
361static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
362{
363 static unsigned long rtc_update = 0;
364 unsigned long tsc;
365 int delay, offset = 0, lost = 0;
366
367/*
368 * Here we are in the timer irq handler. We have irqs locally disabled (so we
369 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
370 * on the other CPU, so we need a lock. We also need to lock the vsyscall
371 * variables, because both do_timer() and us change them -arca+vojtech
372 */
373
374 write_seqlock(&xtime_lock);
375
376 if (vxtime.hpet_address) {
377 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
378 delay = hpet_readl(HPET_COUNTER) - offset;
379 } else {
380 spin_lock(&i8253_lock);
381 outb_p(0x00, 0x43);
382 delay = inb_p(0x40);
383 delay |= inb(0x40) << 8;
384 spin_unlock(&i8253_lock);
385 delay = LATCH - 1 - delay;
386 }
387
388 rdtscll_sync(&tsc);
389
390 if (vxtime.mode == VXTIME_HPET) {
391 if (offset - vxtime.last > hpet_tick) {
392 lost = (offset - vxtime.last) / hpet_tick - 1;
393 }
394
395 monotonic_base +=
396 (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
397
398 vxtime.last = offset;
399 } else {
400 offset = (((tsc - vxtime.last_tsc) *
401 vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
402
403 if (offset < 0)
404 offset = 0;
405
406 if (offset > (USEC_PER_SEC / HZ)) {
407 lost = offset / (USEC_PER_SEC / HZ);
408 offset %= (USEC_PER_SEC / HZ);
409 }
410
411		monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz;
412
413 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
414
415 if ((((tsc - vxtime.last_tsc) *
416 vxtime.tsc_quot) >> 32) < offset)
417 vxtime.last_tsc = tsc -
418 (((long) offset << 32) / vxtime.tsc_quot) - 1;
419 }
420
421 if (lost > 0) {
422 handle_lost_ticks(lost, regs);
423 jiffies += lost;
424 }
425
426/*
427 * Do the timer stuff.
428 */
429
430 do_timer(regs);
431#ifndef CONFIG_SMP
432 update_process_times(user_mode(regs));
433#endif
434
435/*
436 * In the SMP case we use the local APIC timer interrupt to do the profiling,
437 * except when we simulate SMP mode on a uniprocessor system, in that case we
438 * have to call the local interrupt handler.
439 */
440
441#ifndef CONFIG_X86_LOCAL_APIC
442 profile_tick(CPU_PROFILING, regs);
443#else
444 if (!using_apic_timer)
445 smp_local_timer_interrupt(regs);
446#endif
447
448/*
449 * If we have an externally synchronized Linux clock, then update CMOS clock
450 * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
451 * closest to exactly 500 ms before the next second. If the update fails, we
452 * don't care, as it'll be updated on the next turn, and the problem (time way
453 * off) isn't likely to go away much sooner anyway.
454 */
455
456 if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
457 abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
458 set_rtc_mmss(xtime.tv_sec);
459 rtc_update = xtime.tv_sec + 660;
460 }
461
462 write_sequnlock(&xtime_lock);
463
464 return IRQ_HANDLED;
465}
466
467static unsigned int cyc2ns_scale;
468#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
469
470static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
471{
472 cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
473}
474
475static inline unsigned long long cycles_2_ns(unsigned long long cyc)
476{
477 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
478}
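
A standalone sketch (not kernel code) of the 2^10 fixed-point conversion done
by set_cyc2ns_scale()/cycles_2_ns() above: the scale holds nanoseconds per
cycle multiplied by 1024, so (cycles * scale) >> 10 yields nanoseconds (the
cpu_mhz value below is made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned long cpu_mhz = 2200;			/* assume 2.2 GHz */
		unsigned long scale = (1000 << 10) / cpu_mhz;	/* ns per cycle * 2^10 */
		unsigned long long cycles = 2200000;		/* about one millisecond worth */
		unsigned long long ns = (cycles * scale) >> 10;

		printf("%llu cycles ~= %llu ns\n", cycles, ns);	/* just under 1,000,000 ns */
		return 0;
	}
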
479
480unsigned long long sched_clock(void)
481{
482 unsigned long a = 0;
483
484#if 0
485 /* Don't do a HPET read here. Using TSC always is much faster
486 and HPET may not be mapped yet when the scheduler first runs.
487 Disadvantage is a small drift between CPUs in some configurations,
488 but that should be tolerable. */
489 if (__vxtime.mode == VXTIME_HPET)
490 return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32;
491#endif
492
493 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
494	   which means it is not completely exact and may not be monotonic between
495 CPUs. But the errors should be too small to matter for scheduling
496 purposes. */
497
498 rdtscll(a);
499 return cycles_2_ns(a);
500}
501
502unsigned long get_cmos_time(void)
503{
504 unsigned int timeout, year, mon, day, hour, min, sec;
505 unsigned char last, this;
506 unsigned long flags;
507
508/*
509 * The Linux interpretation of the CMOS clock register contents: When the
510 * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the
511 * second which has precisely just started. Waiting for this can take up to 1
512 * second; we time out after approximately 2.4 seconds on a machine with a
513 * standard 8.3 MHz ISA bus.
514 */
515
516 spin_lock_irqsave(&rtc_lock, flags);
517
518 timeout = 1000000;
519 last = this = 0;
520
521 while (timeout && last && !this) {
522 last = this;
523 this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
524 timeout--;
525 }
526
527/*
528 * Here we are safe to assume the registers won't change for a whole second, so
529 * we just go ahead and read them.
530 */
531
532 sec = CMOS_READ(RTC_SECONDS);
533 min = CMOS_READ(RTC_MINUTES);
534 hour = CMOS_READ(RTC_HOURS);
535 day = CMOS_READ(RTC_DAY_OF_MONTH);
536 mon = CMOS_READ(RTC_MONTH);
537 year = CMOS_READ(RTC_YEAR);
538
539 spin_unlock_irqrestore(&rtc_lock, flags);
540
541/*
542 * We know that x86-64 always uses BCD format, no need to check the config
543 * register.
544 */
545
546 BCD_TO_BIN(sec);
547 BCD_TO_BIN(min);
548 BCD_TO_BIN(hour);
549 BCD_TO_BIN(day);
550 BCD_TO_BIN(mon);
551 BCD_TO_BIN(year);
552
553/*
554 * x86-64 systems have only existed since 2002.
555 * This will work up to Dec 31, 2100.
556 */
557 year += 2000;
558
559 return mktime(year, mon, day, hour, min, sec);
560}
561
562#ifdef CONFIG_CPU_FREQ
563
564/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
565 changes.
566
567 RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
568 not that important because current Opteron setups do not support
569   scaling on SMP anyway.
570
571 Should fix up last_tsc too. Currently gettimeofday in the
572 first tick after the change will be slightly wrong. */
573
574#include <linux/workqueue.h>
575
576static unsigned int cpufreq_delayed_issched = 0;
577static unsigned int cpufreq_init = 0;
578static struct work_struct cpufreq_delayed_get_work;
579
580static void handle_cpufreq_delayed_get(void *v)
581{
582 unsigned int cpu;
583 for_each_online_cpu(cpu) {
584 cpufreq_get(cpu);
585 }
586 cpufreq_delayed_issched = 0;
587}
588
589/* If we notice lost ticks, schedule a call to cpufreq_get(), which
590 * verifies that the CPU frequency the timing core thinks the CPU is
591 * running at is still correct.
592 */
593static void cpufreq_delayed_get(void)
594{
595 static int warned;
596 if (cpufreq_init && !cpufreq_delayed_issched) {
597 cpufreq_delayed_issched = 1;
598 if (!warned) {
599 warned = 1;
600 printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
601 }
602 schedule_work(&cpufreq_delayed_get_work);
603 }
604}
605
606static unsigned int ref_freq = 0;
607static unsigned long loops_per_jiffy_ref = 0;
608
609static unsigned long cpu_khz_ref = 0;
610
611static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
612 void *data)
613{
614 struct cpufreq_freqs *freq = data;
615 unsigned long *lpj, dummy;
616
617 lpj = &dummy;
618 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
619#ifdef CONFIG_SMP
620 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
621#else
622 lpj = &boot_cpu_data.loops_per_jiffy;
623#endif
624
625
626
627 if (!ref_freq) {
628 ref_freq = freq->old;
629 loops_per_jiffy_ref = *lpj;
630 cpu_khz_ref = cpu_khz;
631 }
632 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
633 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
634 (val == CPUFREQ_RESUMECHANGE)) {
635 *lpj =
636 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
637
638 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
639 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
640 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
641 }
642
643 set_cyc2ns_scale(cpu_khz_ref / 1000);
644
645 return 0;
646}
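
/*
 * Rough numbers for the rescaling above. cpufreq_scale(old, ref, new) from
 * <linux/cpufreq.h> returns approximately old * new / ref, so when a CPU
 * with ref_freq = 2000000 kHz drops to freq->new = 1000000 kHz,
 * loops_per_jiffy and cpu_khz are halved while
 * vxtime.tsc_quot = (1000 << 32) / cpu_khz doubles: each TSC tick is then
 * worth twice as much time in do_gettimeofday().
 */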
647
648static struct notifier_block time_cpufreq_notifier_block = {
649 .notifier_call = time_cpufreq_notifier
650};
651
652static int __init cpufreq_tsc(void)
653{
654 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
655 if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
656 CPUFREQ_TRANSITION_NOTIFIER))
657 cpufreq_init = 1;
658 return 0;
659}
660
661core_initcall(cpufreq_tsc);
662
663#endif
664
665/*
666 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
667 * it to the HPET timer of known frequency.
668 */
669
670#define TICK_COUNT 100000000
671
672static unsigned int __init hpet_calibrate_tsc(void)
673{
674 int tsc_start, hpet_start;
675 int tsc_now, hpet_now;
676 unsigned long flags;
677
678 local_irq_save(flags);
679 local_irq_disable();
680
681 hpet_start = hpet_readl(HPET_COUNTER);
682 rdtscl(tsc_start);
683
684 do {
685 local_irq_disable();
686 hpet_now = hpet_readl(HPET_COUNTER);
687 sync_core();
688 rdtscl(tsc_now);
689 local_irq_restore(flags);
690 } while ((tsc_now - tsc_start) < TICK_COUNT &&
691 (hpet_now - hpet_start) < TICK_COUNT);
692
693 return (tsc_now - tsc_start) * 1000000000L
694 / ((hpet_now - hpet_start) * hpet_period / 1000);
695}
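
/*
 * Units check for the return expression above, assuming hpet_period holds
 * the HPET counter period in femtoseconds (the COUNTER_CLK_PERIOD field):
 * (hpet ticks * hpet_period / 1000) is the elapsed time in picoseconds, and
 * 10^9 ps is one millisecond, so the result is TSC ticks per millisecond,
 * i.e. cpu_khz. Example: a 2 GHz TSC hits TICK_COUNT (10^8 cycles) after
 * about 50 ms, during which a 14.318 MHz HPET advances ~715900 ticks; the
 * expression then evaluates to roughly 2000000 kHz.
 */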
696
697
698/*
699 * pit_calibrate_tsc() uses the speaker output (channel 2) of
700 * the PIT. This is better than using the timer interrupt output,
701 * because we can read the value of the speaker with just one inb(),
702 * whereas we need three I/O operations for the interrupt channel.
703 * We count how many ticks the TSC does in 50 ms.
704 */
705
706static unsigned int __init pit_calibrate_tsc(void)
707{
708 unsigned long start, end;
709 unsigned long flags;
710
711 spin_lock_irqsave(&i8253_lock, flags);
712
713 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
714
715 outb(0xb0, 0x43);
716 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
717 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
718 rdtscll(start);
719 sync_core();
720 while ((inb(0x61) & 0x20) == 0);
721 sync_core();
722 rdtscll(end);
723
724 spin_unlock_irqrestore(&i8253_lock, flags);
725
726 return (end - start) / 50;
727}
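
/*
 * What the port writes above set up, assuming the standard 1193182 Hz PIT
 * input clock (PIT_TICK_RATE): channel 2 is loaded with
 * 1193182 / (1000 / 50) = 59659, a one-shot of 50 ms gated through the
 * speaker port. (end - start) is then the number of TSC cycles in 50 ms,
 * and dividing by 50 yields cycles per millisecond, i.e. cpu_khz
 * (a 2 GHz CPU gives roughly 100000000 / 50 = 2000000).
 */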
728
729#ifdef CONFIG_HPET
730static __init int late_hpet_init(void)
731{
732 struct hpet_data hd;
733 unsigned int ntimer;
734
735 if (!vxtime.hpet_address)
736 return -1;
737
738 memset(&hd, 0, sizeof (hd));
739
740 ntimer = hpet_readl(HPET_ID);
741 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
742 ntimer++;
743
744 /*
745 * Register with driver.
746	 * Timer 0 and Timer 1 are used by the platform.
747 */
748 hd.hd_phys_address = vxtime.hpet_address;
749 hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE);
750 hd.hd_nirqs = ntimer;
751 hd.hd_flags = HPET_DATA_PLATFORM;
752 hpet_reserve_timer(&hd, 0);
753#ifdef CONFIG_HPET_EMULATE_RTC
754 hpet_reserve_timer(&hd, 1);
755#endif
756 hd.hd_irq[0] = HPET_LEGACY_8254;
757 hd.hd_irq[1] = HPET_LEGACY_RTC;
758 if (ntimer > 2) {
759 struct hpet *hpet;
760 struct hpet_timer *timer;
761 int i;
762
763 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
764
765 for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
766 timer++, i++)
767 hd.hd_irq[i] = (timer->hpet_config &
768 Tn_INT_ROUTE_CNF_MASK) >>
769 Tn_INT_ROUTE_CNF_SHIFT;
770
771 }
772
773 hpet_alloc(&hd);
774 return 0;
775}
776fs_initcall(late_hpet_init);
777#endif
778
779static int hpet_timer_stop_set_go(unsigned long tick)
780{
781 unsigned int cfg;
782
783/*
784 * Stop the timers and reset the main counter.
785 */
786
787 cfg = hpet_readl(HPET_CFG);
788 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
789 hpet_writel(cfg, HPET_CFG);
790 hpet_writel(0, HPET_COUNTER);
791 hpet_writel(0, HPET_COUNTER + 4);
792
793/*
794 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
795 * and period also hpet_tick.
796 */
797
798 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
799 HPET_TN_32BIT, HPET_T0_CFG);
800 hpet_writel(hpet_tick, HPET_T0_CMP);
801 hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
802
803/*
804 * Go!
805 */
806
807 cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
808 hpet_writel(cfg, HPET_CFG);
809
810 return 0;
811}
812
813static int hpet_init(void)
814{
815 unsigned int id;
816
817 if (!vxtime.hpet_address)
818 return -1;
819 set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
820 __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
821
822/*
823 * Read the period, compute tick and quotient.
824 */
825
826 id = hpet_readl(HPET_ID);
827
828 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
829 !(id & HPET_ID_LEGSUP))
830 return -1;
831
832 hpet_period = hpet_readl(HPET_PERIOD);
833 if (hpet_period < 100000 || hpet_period > 100000000)
834 return -1;
835
836 hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
837 hpet_period;
838
839 return hpet_timer_stop_set_go(hpet_tick);
840}
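
/*
 * Worked example for the hpet_tick computation above, assuming HZ=1000 and
 * hpet_period in femtoseconds: one kernel tick is USEC_PER_SEC/HZ = 1000 us
 * = 10^12 fs, so for a 14.318 MHz HPET (hpet_period ~= 69841279 fs)
 * hpet_tick = (10^12 + hpet_period/2) / hpet_period ~= 14318 HPET ticks per
 * kernel tick. The "+ hpet_period/2" term just rounds to nearest.
 */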
841
842static int hpet_reenable(void)
843{
844 return hpet_timer_stop_set_go(hpet_tick);
845}
846
847void __init pit_init(void)
848{
849 unsigned long flags;
850
851 spin_lock_irqsave(&i8253_lock, flags);
852 outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */
853 outb_p(LATCH & 0xff, 0x40); /* LSB */
854 outb_p(LATCH >> 8, 0x40); /* MSB */
855 spin_unlock_irqrestore(&i8253_lock, flags);
856}
857
858int __init time_setup(char *str)
859{
860 report_lost_ticks = 1;
861 return 1;
862}
863
864static struct irqaction irq0 = {
865 timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
866};
867
868extern void __init config_acpi_tables(void);
869
870void __init time_init(void)
871{
872 char *timename;
873
874#ifdef HPET_HACK_ENABLE_DANGEROUS
875 if (!vxtime.hpet_address) {
876 printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
877 "manually!\n");
878 outl(0x800038a0, 0xcf8);
879 outl(0xff000001, 0xcfc);
880 outl(0x800038a0, 0xcf8);
881 vxtime.hpet_address = inl(0xcfc) & 0xfffffffe;
882 printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
883 "at %#lx.\n", vxtime.hpet_address);
884 }
885#endif
886 if (nohpet)
887 vxtime.hpet_address = 0;
888
889 xtime.tv_sec = get_cmos_time();
890 xtime.tv_nsec = 0;
891
892 set_normalized_timespec(&wall_to_monotonic,
893 -xtime.tv_sec, -xtime.tv_nsec);
894
895 if (!hpet_init()) {
896 vxtime_hz = (1000000000000000L + hpet_period / 2) /
897 hpet_period;
898 cpu_khz = hpet_calibrate_tsc();
899 timename = "HPET";
900 } else {
901 pit_init();
902 cpu_khz = pit_calibrate_tsc();
903 timename = "PIT";
904 }
905
906 printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
907 vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
908 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
909 cpu_khz / 1000, cpu_khz % 1000);
910 vxtime.mode = VXTIME_TSC;
911 vxtime.quot = (1000000L << 32) / vxtime_hz;
912 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
913 vxtime.hz = vxtime_hz;
914 rdtscll_sync(&vxtime.last_tsc);
915 setup_irq(0, &irq0);
916
917 set_cyc2ns_scale(cpu_khz / 1000);
918}
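
/*
 * The two quotients set above are 32.32 fixed-point "microseconds per tick"
 * values used by do_gettimeofday() and the vsyscall path: vxtime.quot
 * converts ticks of the interrupt source (vxtime_hz of them per second) and
 * vxtime.tsc_quot converts TSC cycles (cpu_khz of them per millisecond).
 * Example, assuming cpu_khz = 2000000: tsc_quot = (1000 << 32) / 2000000,
 * so a delta of 2000000 TSC cycles contributes
 * (2000000 * tsc_quot) >> 32 ~= 1000 microseconds, i.e. one millisecond.
 */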
919
920void __init time_init_smp(void)
921{
922 char *timetype;
923
924 /*
925 * AMD systems with more than one CPU don't have fully synchronized
926 * TSCs. Always use HPET gettimeofday for these, although it is slower.
927	 * Intel SMP systems usually have synchronized TSCs, so always use
928	 * the TSC.
929 *
930 * Exceptions:
931 * IBM Summit2 checked by oem_force_hpet_timer().
932 * AMD dual core may also not need HPET. Check me.
933 *
934 * Can be turned off with "notsc".
935 */
936 if (num_online_cpus() > 1 &&
937 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
938 notsc = 1;
939 /* Some systems will want to disable TSC and use HPET. */
940 if (oem_force_hpet_timer())
941 notsc = 1;
942 if (vxtime.hpet_address && notsc) {
943 timetype = "HPET";
944 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
945 vxtime.mode = VXTIME_HPET;
946 do_gettimeoffset = do_gettimeoffset_hpet;
947 } else {
948 timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
949 vxtime.mode = VXTIME_TSC;
950 }
951
952 printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
953}
954
955__setup("report_lost_ticks", time_setup);
956
957static long clock_cmos_diff;
958static unsigned long sleep_start;
959
960static int timer_suspend(struct sys_device *dev, u32 state)
961{
962 /*
963 * Estimate time zone so that set_time can update the clock
964 */
965 long cmos_time = get_cmos_time();
966
967 clock_cmos_diff = -cmos_time;
968 clock_cmos_diff += get_seconds();
969 sleep_start = cmos_time;
970 return 0;
971}
972
973static int timer_resume(struct sys_device *dev)
974{
975 unsigned long flags;
976 unsigned long sec;
977 unsigned long ctime = get_cmos_time();
978 unsigned long sleep_length = (ctime - sleep_start) * HZ;
979
980 if (vxtime.hpet_address)
981 hpet_reenable();
982 else
983 i8254_timer_resume();
984
985 sec = ctime + clock_cmos_diff;
986 write_seqlock_irqsave(&xtime_lock,flags);
987 xtime.tv_sec = sec;
988 xtime.tv_nsec = 0;
989 write_sequnlock_irqrestore(&xtime_lock,flags);
990 jiffies += sleep_length;
991 wall_jiffies += sleep_length;
992 return 0;
993}
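
/*
 * Numbers for the resume path above: clock_cmos_diff was set at suspend to
 * get_seconds() - cmos_time, so ctime + clock_cmos_diff restores the system
 * time plus however long we slept, and (ctime - sleep_start) * HZ is the
 * same sleep interval expressed in jiffies (e.g. 90 s asleep with HZ=1000
 * adds 90000 jiffies). Sub-second sleep time is lost, since the CMOS clock
 * only has one-second resolution.
 */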
994
995static struct sysdev_class timer_sysclass = {
996 .resume = timer_resume,
997 .suspend = timer_suspend,
998 set_kset_name("timer"),
999};
1000
1001
1002/* XXX this driverfs stuff should probably go elsewhere later -john */
1003static struct sys_device device_timer = {
1004 .id = 0,
1005 .cls = &timer_sysclass,
1006};
1007
1008static int time_init_device(void)
1009{
1010 int error = sysdev_class_register(&timer_sysclass);
1011 if (!error)
1012 error = sysdev_register(&device_timer);
1013 return error;
1014}
1015
1016device_initcall(time_init_device);
1017
1018#ifdef CONFIG_HPET_EMULATE_RTC
1019/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
1020 * is enabled, we support RTC interrupt functionality in software.
1021 * RTC has 3 kinds of interrupts:
1022 * 1) Update Interrupt - generate an interrupt every second, when the RTC clock
1023 * is updated
1024 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
1025 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
1026 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
1027 * (1) and (2) above are implemented using polling at a frequency of
1028 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
1029 * overhead. (DEFAULT_RTC_INT_FREQ)
1030 * For (3), we use interrupts at 64Hz or user specified periodic
1031 * frequency, whichever is higher.
1032 */
1033#include <linux/rtc.h>
1034
1035extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
1036
1037#define DEFAULT_RTC_INT_FREQ 64
1038#define RTC_NUM_INTS 1
1039
1040static unsigned long UIE_on;
1041static unsigned long prev_update_sec;
1042
1043static unsigned long AIE_on;
1044static struct rtc_time alarm_time;
1045
1046static unsigned long PIE_on;
1047static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
1048static unsigned long PIE_count;
1049
1050static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
1051
1052int is_hpet_enabled(void)
1053{
1054 return vxtime.hpet_address != 0;
1055}
1056
1057/*
1058 * Timer 1 is used for the RTC. We do not use the periodic interrupt feature,
1059 * even though HPET supports periodic interrupts on Timer 1.
1060 * The reason is that to set up a periodic interrupt in HPET, we need to
1061 * stop the main counter, and doing that every time someone disables/enables
1062 * the RTC would adversely affect the main kernel timer running on Timer 0.
1063 * So, for the time being, simulate the periodic interrupt in software.
1064 *
1065 * hpet_rtc_timer_init() is called the first time; during subsequent
1066 * interrupts, reinit happens through hpet_rtc_timer_reinit().
1067 */
1068int hpet_rtc_timer_init(void)
1069{
1070 unsigned int cfg, cnt;
1071 unsigned long flags;
1072
1073 if (!is_hpet_enabled())
1074 return 0;
1075 /*
1076 * Set the counter 1 and enable the interrupts.
1077 */
1078 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1079 hpet_rtc_int_freq = PIE_freq;
1080 else
1081 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1082
1083 local_irq_save(flags);
1084 cnt = hpet_readl(HPET_COUNTER);
1085 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
1086 hpet_writel(cnt, HPET_T1_CMP);
1087 local_irq_restore(flags);
1088
1089 cfg = hpet_readl(HPET_T1_CFG);
1090 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
1091 hpet_writel(cfg, HPET_T1_CFG);
1092
1093 return 1;
1094}
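
/*
 * Magnitude of the comparator step above: hpet_tick is HPET ticks per
 * kernel tick, so hpet_tick * HZ is HPET ticks per second, and dividing by
 * hpet_rtc_int_freq gives ticks per emulated RTC interrupt. With a
 * 14.318 MHz HPET and the default 64 Hz rate that is roughly
 * 14318180 / 64 ~= 223721 counter ticks, i.e. an interrupt every ~15.6 ms.
 */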
1095
1096static void hpet_rtc_timer_reinit(void)
1097{
1098 unsigned int cfg, cnt;
1099
1100 if (!(PIE_on | AIE_on | UIE_on))
1101 return;
1102
1103 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1104 hpet_rtc_int_freq = PIE_freq;
1105 else
1106 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1107
1108 /* It is more accurate to use the comparator value than current count.*/
1109 cnt = hpet_readl(HPET_T1_CMP);
1110 cnt += hpet_tick*HZ/hpet_rtc_int_freq;
1111 hpet_writel(cnt, HPET_T1_CMP);
1112
1113 cfg = hpet_readl(HPET_T1_CFG);
1114 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
1115 hpet_writel(cfg, HPET_T1_CFG);
1116
1117 return;
1118}
1119
1120/*
1121 * The functions below are called from rtc driver.
1122 * Return 0 if HPET is not being used.
1123 * Otherwise do the necessary changes and return 1.
1124 */
1125int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
1126{
1127 if (!is_hpet_enabled())
1128 return 0;
1129
1130 if (bit_mask & RTC_UIE)
1131 UIE_on = 0;
1132 if (bit_mask & RTC_PIE)
1133 PIE_on = 0;
1134 if (bit_mask & RTC_AIE)
1135 AIE_on = 0;
1136
1137 return 1;
1138}
1139
1140int hpet_set_rtc_irq_bit(unsigned long bit_mask)
1141{
1142 int timer_init_reqd = 0;
1143
1144 if (!is_hpet_enabled())
1145 return 0;
1146
1147 if (!(PIE_on | AIE_on | UIE_on))
1148 timer_init_reqd = 1;
1149
1150 if (bit_mask & RTC_UIE) {
1151 UIE_on = 1;
1152 }
1153 if (bit_mask & RTC_PIE) {
1154 PIE_on = 1;
1155 PIE_count = 0;
1156 }
1157 if (bit_mask & RTC_AIE) {
1158 AIE_on = 1;
1159 }
1160
1161 if (timer_init_reqd)
1162 hpet_rtc_timer_init();
1163
1164 return 1;
1165}
1166
1167int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
1168{
1169 if (!is_hpet_enabled())
1170 return 0;
1171
1172 alarm_time.tm_hour = hrs;
1173 alarm_time.tm_min = min;
1174 alarm_time.tm_sec = sec;
1175
1176 return 1;
1177}
1178
1179int hpet_set_periodic_freq(unsigned long freq)
1180{
1181 if (!is_hpet_enabled())
1182 return 0;
1183
1184 PIE_freq = freq;
1185 PIE_count = 0;
1186
1187 return 1;
1188}
1189
1190int hpet_rtc_dropped_irq(void)
1191{
1192 if (!is_hpet_enabled())
1193 return 0;
1194
1195 return 1;
1196}
1197
1198irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
1199{
1200 struct rtc_time curr_time;
1201 unsigned long rtc_int_flag = 0;
1202 int call_rtc_interrupt = 0;
1203
1204 hpet_rtc_timer_reinit();
1205
1206 if (UIE_on | AIE_on) {
1207 rtc_get_rtc_time(&curr_time);
1208 }
1209 if (UIE_on) {
1210 if (curr_time.tm_sec != prev_update_sec) {
1211 /* Set update int info, call real rtc int routine */
1212 call_rtc_interrupt = 1;
1213 rtc_int_flag = RTC_UF;
1214 prev_update_sec = curr_time.tm_sec;
1215 }
1216 }
1217 if (PIE_on) {
1218 PIE_count++;
1219 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
1220 /* Set periodic int info, call real rtc int routine */
1221 call_rtc_interrupt = 1;
1222 rtc_int_flag |= RTC_PF;
1223 PIE_count = 0;
1224 }
1225 }
1226 if (AIE_on) {
1227 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
1228 (curr_time.tm_min == alarm_time.tm_min) &&
1229 (curr_time.tm_hour == alarm_time.tm_hour)) {
1230 /* Set alarm int info, call real rtc int routine */
1231 call_rtc_interrupt = 1;
1232 rtc_int_flag |= RTC_AF;
1233 }
1234 }
1235 if (call_rtc_interrupt) {
1236 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
1237 rtc_interrupt(rtc_int_flag, dev_id, regs);
1238 }
1239 return IRQ_HANDLED;
1240}
1241#endif
1242
1243
1244
1245static int __init nohpet_setup(char *s)
1246{
1247 nohpet = 1;
1248 return 0;
1249}
1250
1251__setup("nohpet", nohpet_setup);
1252
1253
1254static int __init notsc_setup(char *s)
1255{
1256 notsc = 1;
1257 return 0;
1258}
1259
1260__setup("notsc", notsc_setup);
1261
1262
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
new file mode 100644
index 000000000000..6d9c9a8e7d0e
--- /dev/null
+++ b/arch/x86_64/kernel/trampoline.S
@@ -0,0 +1,64 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 *	Entry: CS:IP point to the start of our code; we are
8 *	in real mode with no stack, but with the rest of the
9 *	trampoline page available to make our stack. Everything
10 *	else is a mystery.
11 *
12 * In fact we don't actually need a stack so we don't
13 * set one up.
14 *
15 * On entry to trampoline_data, the processor is in real mode
16 * with 16-bit addressing and 16-bit data. CS has some value
17 * and IP is zero. Thus, data addresses need to be absolute
18 * (no relocation) and are taken with regard to r_base.
19 *
20 * If you work on this file, check the object module with objdump
21 * --full-contents --reloc to make sure there are no relocation
22 * entries. For the GDT entry we do hand relocation in smpboot.c
23 * because of 64bit linker limitations.
24 */
25
26#include <linux/linkage.h>
27#include <asm/segment.h>
28#include <asm/page.h>
29
30.data
31
32.code16
33
34ENTRY(trampoline_data)
35r_base = .
36 wbinvd
37 mov %cs, %ax # Code and data in the same place
38 mov %ax, %ds
39
40 cli # We should be safe anyway
41
42 movl $0xA5A5A5A5, trampoline_data - r_base
43					# write marker so the master knows we're running
44
45 lidt idt_48 - r_base # load idt with 0, 0
46 lgdt gdt_48 - r_base # load gdt with whatever is appropriate
47
48 xor %ax, %ax
49 inc %ax # protected mode (PE) bit
50 lmsw %ax # into protected mode
51	# flush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S
52 ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map)
53
54 # Careful these need to be in the same 64K segment as the above;
55idt_48:
56 .word 0 # idt limit = 0
57 .word 0, 0 # idt base = 0L
58
59gdt_48:
60 .short __KERNEL32_CS + 7 # gdt limit
61 .long cpu_gdt_table-__START_KERNEL_map
62
63.globl trampoline_end
64trampoline_end:
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
new file mode 100644
index 000000000000..d87ac6496cc2
--- /dev/null
+++ b/arch/x86_64/kernel/traps.c
@@ -0,0 +1,948 @@
1/*
2 * linux/arch/x86-64/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 *
10 * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
11 */
12
13/*
14 * 'Traps.c' handles hardware traps and faults after we have saved some
15 * state in 'entry.S'.
16 */
17#include <linux/config.h>
18#include <linux/sched.h>
19#include <linux/kernel.h>
20#include <linux/string.h>
21#include <linux/errno.h>
22#include <linux/ptrace.h>
23#include <linux/timer.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/spinlock.h>
28#include <linux/interrupt.h>
29#include <linux/module.h>
30#include <linux/moduleparam.h>
31
32#include <asm/system.h>
33#include <asm/uaccess.h>
34#include <asm/io.h>
35#include <asm/atomic.h>
36#include <asm/debugreg.h>
37#include <asm/desc.h>
38#include <asm/i387.h>
39#include <asm/kdebug.h>
40#include <asm/processor.h>
41
42#include <asm/smp.h>
43#include <asm/pgalloc.h>
44#include <asm/pda.h>
45#include <asm/proto.h>
46#include <asm/nmi.h>
47
48#include <linux/irq.h>
49
50
51extern struct gate_struct idt_table[256];
52
53asmlinkage void divide_error(void);
54asmlinkage void debug(void);
55asmlinkage void nmi(void);
56asmlinkage void int3(void);
57asmlinkage void overflow(void);
58asmlinkage void bounds(void);
59asmlinkage void invalid_op(void);
60asmlinkage void device_not_available(void);
61asmlinkage void double_fault(void);
62asmlinkage void coprocessor_segment_overrun(void);
63asmlinkage void invalid_TSS(void);
64asmlinkage void segment_not_present(void);
65asmlinkage void stack_segment(void);
66asmlinkage void general_protection(void);
67asmlinkage void page_fault(void);
68asmlinkage void coprocessor_error(void);
69asmlinkage void simd_coprocessor_error(void);
70asmlinkage void reserved(void);
71asmlinkage void alignment_check(void);
72asmlinkage void machine_check(void);
73asmlinkage void spurious_interrupt_bug(void);
74asmlinkage void call_debug(void);
75
76struct notifier_block *die_chain;
77static DEFINE_SPINLOCK(die_notifier_lock);
78
79int register_die_notifier(struct notifier_block *nb)
80{
81 int err = 0;
82 unsigned long flags;
83 spin_lock_irqsave(&die_notifier_lock, flags);
84 err = notifier_chain_register(&die_chain, nb);
85 spin_unlock_irqrestore(&die_notifier_lock, flags);
86 return err;
87}
88
89static inline void conditional_sti(struct pt_regs *regs)
90{
91 if (regs->eflags & X86_EFLAGS_IF)
92 local_irq_enable();
93}
94
95static int kstack_depth_to_print = 10;
96
97#ifdef CONFIG_KALLSYMS
98#include <linux/kallsyms.h>
99int printk_address(unsigned long address)
100{
101 unsigned long offset = 0, symsize;
102 const char *symname;
103 char *modname;
104 char *delim = ":";
105 char namebuf[128];
106
107 symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
108 if (!symname)
109 return printk("[<%016lx>]", address);
110 if (!modname)
111 modname = delim = "";
112 return printk("<%016lx>{%s%s%s%s%+ld}",
113 address,delim,modname,delim,symname,offset);
114}
115#else
116int printk_address(unsigned long address)
117{
118 return printk("[<%016lx>]", address);
119}
120#endif
121
122unsigned long *in_exception_stack(int cpu, unsigned long stack)
123{
124 int k;
125 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
126 struct tss_struct *tss = &per_cpu(init_tss, cpu);
127 unsigned long start = tss->ist[k] - EXCEPTION_STKSZ;
128
129 if (stack >= start && stack < tss->ist[k])
130 return (unsigned long *)tss->ist[k];
131 }
132 return NULL;
133}
134
135/*
136 * x86-64 can have up to three kernel stacks:
137 * process stack
138 * interrupt stack
139 * severe exception (double fault, nmi, stack fault) hardware stack
140 * Check and process them in order.
141 */
142
143void show_trace(unsigned long *stack)
144{
145 unsigned long addr;
146 unsigned long *irqstack, *irqstack_end, *estack_end;
147 const int cpu = safe_smp_processor_id();
148 int i;
149
150 printk("\nCall Trace:");
151 i = 0;
152
153 estack_end = in_exception_stack(cpu, (unsigned long)stack);
154 if (estack_end) {
155 while (stack < estack_end) {
156 addr = *stack++;
157 if (__kernel_text_address(addr)) {
158 i += printk_address(addr);
159 i += printk(" ");
160 if (i > 50) {
161 printk("\n");
162 i = 0;
163 }
164 }
165 }
166 i += printk(" <EOE> ");
167 i += 7;
168 stack = (unsigned long *) estack_end[-2];
169 }
170
171 irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
172 irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64);
173
174 if (stack >= irqstack && stack < irqstack_end) {
175 printk("<IRQ> ");
176 while (stack < irqstack_end) {
177 addr = *stack++;
178 /*
179 * If the address is either in the text segment of the
180 * kernel, or in the region which contains vmalloc'ed
181 * memory, it *may* be the address of a calling
182 * routine; if so, print it so that someone tracing
183 * down the cause of the crash will be able to figure
184 * out the call path that was taken.
185 */
186 if (__kernel_text_address(addr)) {
187 i += printk_address(addr);
188 i += printk(" ");
189 if (i > 50) {
190 printk("\n ");
191 i = 0;
192 }
193 }
194 }
195 stack = (unsigned long *) (irqstack_end[-1]);
196 printk(" <EOI> ");
197 i += 7;
198 }
199
200 while (((long) stack & (THREAD_SIZE-1)) != 0) {
201 addr = *stack++;
202 if (__kernel_text_address(addr)) {
203 i += printk_address(addr);
204 i += printk(" ");
205 if (i > 50) {
206 printk("\n ");
207 i = 0;
208 }
209 }
210 }
211 printk("\n");
212}
213
214void show_stack(struct task_struct *tsk, unsigned long * rsp)
215{
216 unsigned long *stack;
217 int i;
218 const int cpu = safe_smp_processor_id();
219 unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
220 unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE);
221
222 // debugging aid: "show_stack(NULL, NULL);" prints the
223 // back trace for this cpu.
224
225 if (rsp == NULL) {
226 if (tsk)
227 rsp = (unsigned long *)tsk->thread.rsp;
228 else
229 rsp = (unsigned long *)&rsp;
230 }
231
232 stack = rsp;
233 for(i=0; i < kstack_depth_to_print; i++) {
234 if (stack >= irqstack && stack <= irqstack_end) {
235 if (stack == irqstack_end) {
236 stack = (unsigned long *) (irqstack_end[-1]);
237 printk(" <EOI> ");
238 }
239 } else {
240 if (((long) stack & (THREAD_SIZE-1)) == 0)
241 break;
242 }
243 if (i && ((i % 4) == 0))
244 printk("\n ");
245 printk("%016lx ", *stack++);
246 }
247 show_trace((unsigned long *)rsp);
248}
249
250/*
251 * The architecture-independent dump_stack generator
252 */
253void dump_stack(void)
254{
255 unsigned long dummy;
256 show_trace(&dummy);
257}
258
259EXPORT_SYMBOL(dump_stack);
260
261void show_registers(struct pt_regs *regs)
262{
263 int i;
264 int in_kernel = (regs->cs & 3) == 0;
265 unsigned long rsp;
266 const int cpu = safe_smp_processor_id();
267 struct task_struct *cur = cpu_pda[cpu].pcurrent;
268
269 rsp = regs->rsp;
270
271 printk("CPU %d ", cpu);
272 __show_regs(regs);
273 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
274 cur->comm, cur->pid, cur->thread_info, cur);
275
276 /*
277 * When in-kernel, we also print out the stack and code at the
278 * time of the fault..
279 */
280 if (in_kernel) {
281
282 printk("Stack: ");
283 show_stack(NULL, (unsigned long*)rsp);
284
285 printk("\nCode: ");
286 if(regs->rip < PAGE_OFFSET)
287 goto bad;
288
289 for(i=0;i<20;i++)
290 {
291 unsigned char c;
292 if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
293bad:
294 printk(" Bad RIP value.");
295 break;
296 }
297 printk("%02x ", c);
298 }
299 }
300 printk("\n");
301}
302
303void handle_BUG(struct pt_regs *regs)
304{
305 struct bug_frame f;
306 char tmp;
307
308 if (regs->cs & 3)
309 return;
310 if (__copy_from_user(&f, (struct bug_frame *) regs->rip,
311 sizeof(struct bug_frame)))
312 return;
313 if ((unsigned long)f.filename < __PAGE_OFFSET ||
314 f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
315 return;
316 if (__get_user(tmp, f.filename))
317 f.filename = "unmapped filename";
318 printk("----------- [cut here ] --------- [please bite here ] ---------\n");
319 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line);
320}
321
322void out_of_line_bug(void)
323{
324 BUG();
325}
326
327static DEFINE_SPINLOCK(die_lock);
328static int die_owner = -1;
329
330void oops_begin(void)
331{
332 int cpu = safe_smp_processor_id();
333 /* racy, but better than risking deadlock. */
334 local_irq_disable();
335 if (!spin_trylock(&die_lock)) {
336 if (cpu == die_owner)
337 /* nested oops. should stop eventually */;
338 else
339 spin_lock(&die_lock);
340 }
341 die_owner = cpu;
342 console_verbose();
343 bust_spinlocks(1);
344}
345
346void oops_end(void)
347{
348 die_owner = -1;
349 bust_spinlocks(0);
350 spin_unlock(&die_lock);
351 if (panic_on_oops)
352 panic("Oops");
353}
354
355void __die(const char * str, struct pt_regs * regs, long err)
356{
357 static int die_counter;
358 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
359#ifdef CONFIG_PREEMPT
360 printk("PREEMPT ");
361#endif
362#ifdef CONFIG_SMP
363 printk("SMP ");
364#endif
365#ifdef CONFIG_DEBUG_PAGEALLOC
366 printk("DEBUG_PAGEALLOC");
367#endif
368 printk("\n");
369 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
370 show_registers(regs);
371 /* Executive summary in case the oops scrolled away */
372 printk(KERN_ALERT "RIP ");
373 printk_address(regs->rip);
374 printk(" RSP <%016lx>\n", regs->rsp);
375}
376
377void die(const char * str, struct pt_regs * regs, long err)
378{
379 oops_begin();
380 handle_BUG(regs);
381 __die(str, regs, err);
382 oops_end();
383 do_exit(SIGSEGV);
384}
385static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
386{
387 if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS))
388 die(str, regs, err);
389}
390
391void die_nmi(char *str, struct pt_regs *regs)
392{
393 oops_begin();
394 /*
395	 * We are in trouble anyway, let's at least try
396 * to get a message out.
397 */
398 printk(str, safe_smp_processor_id());
399 show_registers(regs);
400 if (panic_on_timeout || panic_on_oops)
401 panic("nmi watchdog");
402 printk("console shuts up ...\n");
403 oops_end();
404 do_exit(SIGSEGV);
405}
406
407static void do_trap(int trapnr, int signr, char *str,
408 struct pt_regs * regs, long error_code, siginfo_t *info)
409{
410 conditional_sti(regs);
411
412#ifdef CONFIG_CHECKING
413 {
414 unsigned long gs;
415 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
416 rdmsrl(MSR_GS_BASE, gs);
417 if (gs != (unsigned long)pda) {
418 wrmsrl(MSR_GS_BASE, pda);
419 printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
420 regs->rip);
421 }
422 }
423#endif
424
425 if ((regs->cs & 3) != 0) {
426 struct task_struct *tsk = current;
427
428 if (exception_trace && unhandled_signal(tsk, signr))
429 printk(KERN_INFO
430 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
431 tsk->comm, tsk->pid, str,
432 regs->rip,regs->rsp,error_code);
433
434 tsk->thread.error_code = error_code;
435 tsk->thread.trap_no = trapnr;
436 if (info)
437 force_sig_info(signr, info, tsk);
438 else
439 force_sig(signr, tsk);
440 return;
441 }
442
443
444 /* kernel trap */
445 {
446 const struct exception_table_entry *fixup;
447 fixup = search_exception_tables(regs->rip);
448 if (fixup) {
449 regs->rip = fixup->fixup;
450 } else
451 die(str, regs, error_code);
452 return;
453 }
454}
455
456#define DO_ERROR(trapnr, signr, str, name) \
457asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
458{ \
459 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
460 == NOTIFY_STOP) \
461 return; \
462 do_trap(trapnr, signr, str, regs, error_code, NULL); \
463}
464
465#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
466asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
467{ \
468 siginfo_t info; \
469 info.si_signo = signr; \
470 info.si_errno = 0; \
471 info.si_code = sicode; \
472 info.si_addr = (void __user *)siaddr; \
473 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
474 == NOTIFY_STOP) \
475 return; \
476 do_trap(trapnr, signr, str, regs, error_code, &info); \
477}
478
479DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
480DO_ERROR( 4, SIGSEGV, "overflow", overflow)
481DO_ERROR( 5, SIGSEGV, "bounds", bounds)
482DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip)
483DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
484DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
485DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
486DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
487DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
488DO_ERROR(18, SIGSEGV, "reserved", reserved)
489
490#define DO_ERROR_STACK(trapnr, signr, str, name) \
491asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \
492{ \
493 struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; \
494 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
495 == NOTIFY_STOP) \
496 return regs; \
497 if (regs->cs & 3) { \
498 memcpy(pr, regs, sizeof(struct pt_regs)); \
499 regs = pr; \
500 } \
501 do_trap(trapnr, signr, str, regs, error_code, NULL); \
502 return regs; \
503}
504
505DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment)
506DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault)
507
508asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
509{
510 conditional_sti(regs);
511
512#ifdef CONFIG_CHECKING
513 {
514 unsigned long gs;
515 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
516 rdmsrl(MSR_GS_BASE, gs);
517 if (gs != (unsigned long)pda) {
518 wrmsrl(MSR_GS_BASE, pda);
519 oops_in_progress++;
520 printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
521 oops_in_progress--;
522 }
523 }
524#endif
525
526 if ((regs->cs & 3)!=0) {
527 struct task_struct *tsk = current;
528
529 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
530 printk(KERN_INFO
531 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
532 tsk->comm, tsk->pid,
533 regs->rip,regs->rsp,error_code);
534
535 tsk->thread.error_code = error_code;
536 tsk->thread.trap_no = 13;
537 force_sig(SIGSEGV, tsk);
538 return;
539 }
540
541 /* kernel gp */
542 {
543 const struct exception_table_entry *fixup;
544 fixup = search_exception_tables(regs->rip);
545 if (fixup) {
546 regs->rip = fixup->fixup;
547 return;
548 }
549 if (notify_die(DIE_GPF, "general protection fault", regs,
550 error_code, 13, SIGSEGV) == NOTIFY_STOP)
551 return;
552 die("general protection fault", regs, error_code);
553 }
554}
555
556static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
557{
558 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
559 printk("You probably have a hardware problem with your RAM chips\n");
560
561 /* Clear and disable the memory parity error line. */
562 reason = (reason & 0xf) | 4;
563 outb(reason, 0x61);
564}
565
566static void io_check_error(unsigned char reason, struct pt_regs * regs)
567{
568 printk("NMI: IOCK error (debug interrupt?)\n");
569 show_registers(regs);
570
571 /* Re-enable the IOCK line, wait for a few seconds */
572 reason = (reason & 0xf) | 8;
573 outb(reason, 0x61);
574 mdelay(2000);
575 reason &= ~8;
576 outb(reason, 0x61);
577}
578
579static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
580{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
581 printk("Dazed and confused, but trying to continue\n");
582 printk("Do you have a strange power saving mode enabled?\n");
583}
584
585asmlinkage void default_do_nmi(struct pt_regs *regs)
586{
587 unsigned char reason = 0;
588
589 /* Only the BSP gets external NMIs from the system. */
590 if (!smp_processor_id())
591 reason = get_nmi_reason();
592
593 if (!(reason & 0xc0)) {
594 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
595 == NOTIFY_STOP)
596 return;
597#ifdef CONFIG_X86_LOCAL_APIC
598 /*
599 * Ok, so this is none of the documented NMI sources,
600 * so it must be the NMI watchdog.
601 */
602 if (nmi_watchdog > 0) {
603 nmi_watchdog_tick(regs,reason);
604 return;
605 }
606#endif
607 unknown_nmi_error(reason, regs);
608 return;
609 }
610 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
611 return;
612
613 /* AK: following checks seem to be broken on modern chipsets. FIXME */
614
615 if (reason & 0x80)
616 mem_parity_error(reason, regs);
617 if (reason & 0x40)
618 io_check_error(reason, regs);
619}
620
621asmlinkage void do_int3(struct pt_regs * regs, long error_code)
622{
623 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
624 return;
625 }
626 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
627 return;
628}
629
630/* runs on IST stack. */
631asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code)
632{
633 struct pt_regs *pr;
634 unsigned long condition;
635 struct task_struct *tsk = current;
636 siginfo_t info;
637
638 pr = (struct pt_regs *)(current->thread.rsp0)-1;
639 if (regs->cs & 3) {
640 memcpy(pr, regs, sizeof(struct pt_regs));
641 regs = pr;
642 }
643
644#ifdef CONFIG_CHECKING
645 {
646 /* RED-PEN interaction with debugger - could destroy gs */
647 unsigned long gs;
648 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
649 rdmsrl(MSR_GS_BASE, gs);
650 if (gs != (unsigned long)pda) {
651 wrmsrl(MSR_GS_BASE, pda);
652 printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
653 }
654 }
655#endif
656
657 asm("movq %%db6,%0" : "=r" (condition));
658
659 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
660 SIGTRAP) == NOTIFY_STOP) {
661 return regs;
662 }
663 conditional_sti(regs);
664
665 /* Mask out spurious debug traps due to lazy DR7 setting */
666 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
667 if (!tsk->thread.debugreg7) {
668 goto clear_dr7;
669 }
670 }
671
672 tsk->thread.debugreg6 = condition;
673
674 /* Mask out spurious TF errors due to lazy TF clearing */
675 if ((condition & DR_STEP) &&
676 (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition,
677 1, SIGTRAP) != NOTIFY_STOP)) {
678 /*
679 * The TF error should be masked out only if the current
680 * process is not traced and if the TRAP flag has been set
681 * previously by a tracing process (condition detected by
682 * the PT_DTRACE flag); remember that the i386 TRAP flag
683 * can be modified by the process itself in user mode,
684 * allowing programs to debug themselves without the ptrace()
685 * interface.
686 */
687 if ((regs->cs & 3) == 0)
688 goto clear_TF_reenable;
689 if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
690 goto clear_TF;
691 }
692
693 /* Ok, finally something we can handle */
694 tsk->thread.trap_no = 1;
695 tsk->thread.error_code = error_code;
696 info.si_signo = SIGTRAP;
697 info.si_errno = 0;
698 info.si_code = TRAP_BRKPT;
699 if ((regs->cs & 3) == 0)
700 goto clear_dr7;
701
702 info.si_addr = (void __user *)regs->rip;
703 force_sig_info(SIGTRAP, &info, tsk);
704clear_dr7:
705 asm volatile("movq %0,%%db7"::"r"(0UL));
706 notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP);
707 return regs;
708
709clear_TF_reenable:
710 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
711
712clear_TF:
713 /* RED-PEN could cause spurious errors */
714 if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP)
715 != NOTIFY_STOP)
716 regs->eflags &= ~TF_MASK;
717 return regs;
718}
719
720static int kernel_math_error(struct pt_regs *regs, char *str)
721{
722 const struct exception_table_entry *fixup;
723 fixup = search_exception_tables(regs->rip);
724 if (fixup) {
725 regs->rip = fixup->fixup;
726 return 1;
727 }
728 notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE);
729#if 0
730 /* This should be a die, but warn only for now */
731 die(str, regs, 0);
732#else
733 printk(KERN_DEBUG "%s: %s at ", current->comm, str);
734 printk_address(regs->rip);
735 printk("\n");
736#endif
737 return 0;
738}
739
740/*
741 * Note that we play around with the 'TS' bit in an attempt to get
742 * the correct behaviour even in the presence of the asynchronous
743 * IRQ13 behaviour
744 */
745asmlinkage void do_coprocessor_error(struct pt_regs *regs)
746{
747 void __user *rip = (void __user *)(regs->rip);
748 struct task_struct * task;
749 siginfo_t info;
750 unsigned short cwd, swd;
751
752 conditional_sti(regs);
753 if ((regs->cs & 3) == 0 &&
754 kernel_math_error(regs, "kernel x87 math error"))
755 return;
756
757 /*
758 * Save the info for the exception handler and clear the error.
759 */
760 task = current;
761 save_init_fpu(task);
762 task->thread.trap_no = 16;
763 task->thread.error_code = 0;
764 info.si_signo = SIGFPE;
765 info.si_errno = 0;
766 info.si_code = __SI_FAULT;
767 info.si_addr = rip;
768 /*
769 * (~cwd & swd) will mask out exceptions that are not set to unmasked
770 * status. 0x3f is the exception bits in these regs, 0x200 is the
771 * C1 reg you need in case of a stack fault, 0x040 is the stack
772 * fault bit. We should only be taking one exception at a time,
773 * so if this combination doesn't produce any single exception,
774 * then we have a bad program that isn't synchronizing its FPU usage
775 * and it will suffer the consequences since we won't be able to
776 * fully reproduce the context of the exception
777 */
778 cwd = get_fpu_cwd(task);
779 swd = get_fpu_swd(task);
780 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
781 case 0x000:
782 default:
783 break;
784 case 0x001: /* Invalid Op */
785 case 0x041: /* Stack Fault */
786 case 0x241: /* Stack Fault | Direction */
787 info.si_code = FPE_FLTINV;
788 break;
789 case 0x002: /* Denormalize */
790 case 0x010: /* Underflow */
791 info.si_code = FPE_FLTUND;
792 break;
793 case 0x004: /* Zero Divide */
794 info.si_code = FPE_FLTDIV;
795 break;
796 case 0x008: /* Overflow */
797 info.si_code = FPE_FLTOVF;
798 break;
799 case 0x020: /* Precision */
800 info.si_code = FPE_FLTRES;
801 break;
802 }
803 force_sig_info(SIGFPE, &info, task);
804}
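
/*
 * Example of the decoding above: an unmasked zero-divide leaves ZE (0x04)
 * set in the status word and clear in the control-word mask, so
 * ((~cwd) & swd & 0x3f) is 0x004 and the signal carries FPE_FLTDIV; a
 * stack fault sets 0x40 and 0x01 (plus C1, 0x200, for the direction),
 * giving 0x041 or 0x241 and FPE_FLTINV.
 */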
805
806asmlinkage void bad_intr(void)
807{
808 printk("bad interrupt");
809}
810
811asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
812{
813 void __user *rip = (void __user *)(regs->rip);
814 struct task_struct * task;
815 siginfo_t info;
816 unsigned short mxcsr;
817
818 conditional_sti(regs);
819 if ((regs->cs & 3) == 0 &&
820 kernel_math_error(regs, "simd math error"))
821 return;
822
823 /*
824 * Save the info for the exception handler and clear the error.
825 */
826 task = current;
827 save_init_fpu(task);
828 task->thread.trap_no = 19;
829 task->thread.error_code = 0;
830 info.si_signo = SIGFPE;
831 info.si_errno = 0;
832 info.si_code = __SI_FAULT;
833 info.si_addr = rip;
834 /*
835 * The SIMD FPU exceptions are handled a little differently, as there
836 * is only a single status/control register. Thus, to determine which
837 * unmasked exception was caught we must mask the exception mask bits
838 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
839 */
840 mxcsr = get_fpu_mxcsr(task);
841 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
842 case 0x000:
843 default:
844 break;
845 case 0x001: /* Invalid Op */
846 info.si_code = FPE_FLTINV;
847 break;
848 case 0x002: /* Denormalize */
849 case 0x010: /* Underflow */
850 info.si_code = FPE_FLTUND;
851 break;
852 case 0x004: /* Zero Divide */
853 info.si_code = FPE_FLTDIV;
854 break;
855 case 0x008: /* Overflow */
856 info.si_code = FPE_FLTOVF;
857 break;
858 case 0x020: /* Precision */
859 info.si_code = FPE_FLTRES;
860 break;
861 }
862 force_sig_info(SIGFPE, &info, task);
863}
864
865asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
866{
867}
868
869asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
870{
871}
872
873/*
874 * 'math_state_restore()' saves the current math information in the
875 * old math state array, and gets the new ones from the current task
876 *
877 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
878 * Don't touch unless you *really* know how it works.
879 */
880asmlinkage void math_state_restore(void)
881{
882 struct task_struct *me = current;
883 clts(); /* Allow maths ops (or we recurse) */
884
885 if (!used_math())
886 init_fpu(me);
887 restore_fpu_checking(&me->thread.i387.fxsave);
888 me->thread_info->status |= TS_USEDFPU;
889}
890
891void do_call_debug(struct pt_regs *regs)
892{
893 notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT);
894}
895
896void __init trap_init(void)
897{
898 set_intr_gate(0,&divide_error);
899 set_intr_gate_ist(1,&debug,DEBUG_STACK);
900 set_intr_gate_ist(2,&nmi,NMI_STACK);
901 set_system_gate(3,&int3);
902 set_system_gate(4,&overflow); /* int4-5 can be called from all */
903 set_system_gate(5,&bounds);
904 set_intr_gate(6,&invalid_op);
905 set_intr_gate(7,&device_not_available);
906 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
907 set_intr_gate(9,&coprocessor_segment_overrun);
908 set_intr_gate(10,&invalid_TSS);
909 set_intr_gate(11,&segment_not_present);
910 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
911 set_intr_gate(13,&general_protection);
912 set_intr_gate(14,&page_fault);
913 set_intr_gate(15,&spurious_interrupt_bug);
914 set_intr_gate(16,&coprocessor_error);
915 set_intr_gate(17,&alignment_check);
916#ifdef CONFIG_X86_MCE
917 set_intr_gate_ist(18,&machine_check, MCE_STACK);
918#endif
919 set_intr_gate(19,&simd_coprocessor_error);
920
921#ifdef CONFIG_IA32_EMULATION
922 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
923#endif
924
925 set_intr_gate(KDB_VECTOR, call_debug);
926
927 /*
928 * Should be a barrier for any external CPU state.
929 */
930 cpu_init();
931}
932
933
934/* Actual parsing is done early in setup.c. */
935static int __init oops_dummy(char *s)
936{
937 panic_on_oops = 1;
938 return -1;
939}
940__setup("oops=", oops_dummy);
941
942static int __init kstack_setup(char *s)
943{
944 kstack_depth_to_print = simple_strtoul(s,NULL,0);
945 return 0;
946}
947__setup("kstack=", kstack_setup);
948
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..59ebd5beda87
--- /dev/null
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -0,0 +1,164 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#include <asm-generic/vmlinux.lds.h>
6#include <linux/config.h>
7
8OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
9OUTPUT_ARCH(i386:x86-64)
10ENTRY(phys_startup_64)
11jiffies_64 = jiffies;
12SECTIONS
13{
14 . = 0xffffffff80100000;
15 phys_startup_64 = startup_64 - LOAD_OFFSET;
16 _text = .; /* Text and read-only data */
17 .text : {
18 *(.text)
19 SCHED_TEXT
20 LOCK_TEXT
21 *(.fixup)
22 *(.gnu.warning)
23 } = 0x9090
24 .text.lock : { *(.text.lock) } /* out-of-line lock text */
25
26 _etext = .; /* End of text section */
27
28 . = ALIGN(16); /* Exception table */
29 __start___ex_table = .;
30 __ex_table : { *(__ex_table) }
31 __stop___ex_table = .;
32
33 RODATA
34
35 .data : { /* Data */
36 *(.data)
37 CONSTRUCTORS
38 }
39
40 _edata = .; /* End of data section */
41
42 __bss_start = .; /* BSS */
43 .bss : {
44 *(.bss.page_aligned)
45 *(.bss)
46 }
47 __bss_end = .;
48
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
50 .data.cacheline_aligned : { *(.data.cacheline_aligned) }
51
52#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
53#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1))
54#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
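
/* Example of the helper macros above: BINALIGN(0x1003, 16) is
   (0x1003 + 15) & ~15 = 0x1010, so AFTER(x) is the first 16-byte-aligned
   address past section x, and CACHE_ALIGN() does the same with the
   configured L1 cache line size. */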
55
56 .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
57 __vsyscall_0 = LOADADDR(.vsyscall_0);
58 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
59 .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
60 xtime_lock = LOADADDR(.xtime_lock);
61 .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
62 vxtime = LOADADDR(.vxtime);
63 .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
64 wall_jiffies = LOADADDR(.wall_jiffies);
65 .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
66 sys_tz = LOADADDR(.sys_tz);
67 .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
68 sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
69 .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
70 xtime = LOADADDR(.xtime);
71 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
72 .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
73 jiffies = LOADADDR(.jiffies);
74 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
75 . = LOADADDR(.vsyscall_0) + 4096;
76
77 . = ALIGN(8192); /* init_task */
78 .data.init_task : { *(.data.init_task) }
79
80 . = ALIGN(4096);
81 .data.page_aligned : { *(.data.page_aligned) }
82
83 . = ALIGN(4096); /* Init code and data */
84 __init_begin = .;
85 .init.text : {
86 _sinittext = .;
87 *(.init.text)
88 _einittext = .;
89 }
90 __initdata_begin = .;
91 .init.data : { *(.init.data) }
92 __initdata_end = .;
93 . = ALIGN(16);
94 __setup_start = .;
95 .init.setup : { *(.init.setup) }
96 __setup_end = .;
97 __initcall_start = .;
98 .initcall.init : {
99 *(.initcall1.init)
100 *(.initcall2.init)
101 *(.initcall3.init)
102 *(.initcall4.init)
103 *(.initcall5.init)
104 *(.initcall6.init)
105 *(.initcall7.init)
106 }
107 __initcall_end = .;
108 __con_initcall_start = .;
109 .con_initcall.init : { *(.con_initcall.init) }
110 __con_initcall_end = .;
111 SECURITY_INIT
112 . = ALIGN(8);
113 __alt_instructions = .;
114 .altinstructions : { *(.altinstructions) }
115 __alt_instructions_end = .;
116 .altinstr_replacement : { *(.altinstr_replacement) }
117  /* .exit.text is discarded at runtime, not link time, to deal with references
118 from .altinstructions and .eh_frame */
119 .exit.text : { *(.exit.text) }
120 .exit.data : { *(.exit.data) }
121 . = ALIGN(4096);
122 __initramfs_start = .;
123 .init.ramfs : { *(.init.ramfs) }
124 __initramfs_end = .;
125 . = ALIGN(32);
126 __per_cpu_start = .;
127 .data.percpu : { *(.data.percpu) }
128 __per_cpu_end = .;
129 . = ALIGN(4096);
130 __init_end = .;
131
132 . = ALIGN(4096);
133 __nosave_begin = .;
134 .data_nosave : { *(.data.nosave) }
135 . = ALIGN(4096);
136 __nosave_end = .;
137
138 _end = . ;
139
140 /* Sections to be discarded */
141 /DISCARD/ : {
142 *(.exitcall.exit)
143#ifndef CONFIG_DEBUG_INFO
144 *(.eh_frame)
145#endif
146 }
147
148 /* DWARF 2 */
149 .debug_info 0 : { *(.debug_info) }
150 .debug_abbrev 0 : { *(.debug_abbrev) }
151 .debug_line 0 : { *(.debug_line) }
152 .debug_frame 0 : { *(.debug_frame) }
153 .debug_str 0 : { *(.debug_str) }
154 .debug_loc 0 : { *(.debug_loc) }
155 .debug_macinfo 0 : { *(.debug_macinfo) }
156 /* SGI/MIPS DWARF 2 extensions */
157 .debug_weaknames 0 : { *(.debug_weaknames) }
158 .debug_funcnames 0 : { *(.debug_funcnames) }
159 .debug_typenames 0 : { *(.debug_typenames) }
160 .debug_varnames 0 : { *(.debug_varnames) }
161
162
163 .comment 0 : { *(.comment) }
164}
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
new file mode 100644
index 000000000000..b4b8dc59663a
--- /dev/null
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -0,0 +1,225 @@
1/*
2 * linux/arch/x86_64/kernel/vsyscall.c
3 *
4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright 2003 Andi Kleen, SuSE Labs.
6 *
7 * Thanks to hpa@transmeta.com for some useful hints.
8 * Special thanks to Ingo Molnar for his early experience with
9 * a different vsyscall implementation for Linux/IA32 and for the name.
10 *
11 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
12 * at virtual address -10Mbyte+1024bytes etc... There are at max 4
13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
14 * jumping out of line if necessary. We cannot add more with this
15 * mechanism because older kernels won't return -ENOSYS.
16 * If we want more than four we need a vDSO.
17 *
18 * Note: the concept clashes with user mode linux. If you use UML and
19 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
20 */
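
/*
 * For reference, assuming VSYSCALL_ADDR(nr) expands to -10 Mbyte + nr*1024
 * as described above: -10 Mbyte in the 64-bit address space is
 * 0xffffffffff600000, so vgettimeofday (slot 0) lives there and vtime
 * (slot 1) at 0xffffffffff600400.
 */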
21
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/timer.h>
26#include <linux/seqlock.h>
27#include <linux/jiffies.h>
28#include <linux/sysctl.h>
29
30#include <asm/vsyscall.h>
31#include <asm/pgtable.h>
32#include <asm/page.h>
33#include <asm/fixmap.h>
34#include <asm/errno.h>
35#include <asm/io.h>
36
37#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
38#define force_inline __attribute__((always_inline)) inline
39
40int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
41seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
42
43#include <asm/unistd.h>
44
45static force_inline void timeval_normalize(struct timeval * tv)
46{
47 time_t __sec;
48
49 __sec = tv->tv_usec / 1000000;
50 if (__sec) {
51 tv->tv_usec %= 1000000;
52 tv->tv_sec += __sec;
53 }
54}
55
56static force_inline void do_vgettimeofday(struct timeval * tv)
57{
58 long sequence, t;
59 unsigned long sec, usec;
60
61 do {
62 sequence = read_seqbegin(&__xtime_lock);
63
64 sec = __xtime.tv_sec;
65 usec = (__xtime.tv_nsec / 1000) +
66 (__jiffies - __wall_jiffies) * (1000000 / HZ);
67
68 if (__vxtime.mode == VXTIME_TSC) {
69 sync_core();
70 rdtscll(t);
71 if (t < __vxtime.last_tsc)
72 t = __vxtime.last_tsc;
73 usec += ((t - __vxtime.last_tsc) *
74 __vxtime.tsc_quot) >> 32;
75 /* See comment in x86_64 do_gettimeofday. */
76 } else {
77 usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
78 __vxtime.last) * __vxtime.quot) >> 32;
79 }
80 } while (read_seqretry(&__xtime_lock, sequence));
81
82 tv->tv_sec = sec + usec / 1000000;
83 tv->tv_usec = usec % 1000000;
84}
85
86/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
87static force_inline void do_get_tz(struct timezone * tz)
88{
89 *tz = __sys_tz;
90}
91
92static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
93{
94 int ret;
95 asm volatile("vsysc2: syscall"
96 : "=a" (ret)
97 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
98 return ret;
99}
100
101static force_inline long time_syscall(long *t)
102{
103 long secs;
104 asm volatile("vsysc1: syscall"
105 : "=a" (secs)
106 : "0" (__NR_time),"D" (t) : __syscall_clobber);
107 return secs;
108}
109
110static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
111{
112 if (unlikely(!__sysctl_vsyscall))
113 return gettimeofday(tv,tz);
114 if (tv)
115 do_vgettimeofday(tv);
116 if (tz)
117 do_get_tz(tz);
118 return 0;
119}
120
121/* This will break when the xtime seconds get inaccurate, but that is
122 * unlikely */
123static time_t __vsyscall(1) vtime(time_t *t)
124{
125 if (unlikely(!__sysctl_vsyscall))
126 return time_syscall(t);
127 else if (t)
128 *t = __xtime.tv_sec;
129 return __xtime.tv_sec;
130}
131
132static long __vsyscall(2) venosys_0(void)
133{
134 return -ENOSYS;
135}
136
137static long __vsyscall(3) venosys_1(void)
138{
139 return -ENOSYS;
140}
141
142#ifdef CONFIG_SYSCTL
143
144#define SYSCALL 0x050f
145#define NOP2 0x9090
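
/*
 * Byte-level view of the two constants above: the 2-byte SYSCALL
 * instruction encodes as 0f 05, which read little-endian as a u16 is
 * 0x050f, while 0x9090 is simply two 1-byte NOPs (90 90). Writing one or
 * the other over the vsysc1/vsysc2 syscall instructions patches the real
 * syscall fallback in the vsyscall page on or off.
 */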
146
147/*
148 * NOP out syscall in vsyscall page when not needed.
149 */
150static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
151 void __user *buffer, size_t *lenp, loff_t *ppos)
152{
153 extern u16 vsysc1, vsysc2;
154 u16 *map1, *map2;
155 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
156 if (!write)
157 return ret;
158 /* gcc has some trouble with __va(__pa()), so just do it this
159 way. */
160 map1 = ioremap(__pa_symbol(&vsysc1), 2);
161 if (!map1)
162 return -ENOMEM;
163 map2 = ioremap(__pa_symbol(&vsysc2), 2);
164 if (!map2) {
165 ret = -ENOMEM;
166 goto out;
167 }
168 if (!sysctl_vsyscall) {
169 *map1 = SYSCALL;
170 *map2 = SYSCALL;
171 } else {
172 *map1 = NOP2;
173 *map2 = NOP2;
174 }
175 iounmap(map2);
176out:
177 iounmap(map1);
178 return ret;
179}
180
181static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
182 void __user *oldval, size_t __user *oldlenp,
183 void __user *newval, size_t newlen,
184 void **context)
185{
186 return -ENOSYS;
187}
188
189static ctl_table kernel_table2[] = {
190 { .ctl_name = 99, .procname = "vsyscall64",
191 .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
192 .strategy = vsyscall_sysctl_nostrat,
193 .proc_handler = vsyscall_sysctl_change },
194 { 0, }
195};
196
197static ctl_table kernel_root_table2[] = {
198 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
199 .child = kernel_table2 },
200 { 0 },
201};
202
203#endif
204
205static void __init map_vsyscall(void)
206{
207 extern char __vsyscall_0;
208 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
209
210 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
211}
212
213static int __init vsyscall_init(void)
214{
215 BUG_ON(((unsigned long) &vgettimeofday !=
216 VSYSCALL_ADDR(__NR_vgettimeofday)));
217 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
218 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
219 map_vsyscall();
220 sysctl_vsyscall = 1;
221 register_sysctl_table(kernel_root_table2, 0);
222 return 0;
223}
224
225__initcall(vsyscall_init);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
new file mode 100644
index 000000000000..88626e626886
--- /dev/null
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -0,0 +1,221 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <linux/smp.h>
4#include <linux/user.h>
5#include <linux/sched.h>
6#include <linux/in6.h>
7#include <linux/interrupt.h>
8#include <linux/smp_lock.h>
9#include <linux/pm.h>
10#include <linux/pci.h>
11#include <linux/apm_bios.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/syscalls.h>
15#include <linux/tty.h>
16#include <linux/ioctl32.h>
17
18#include <asm/semaphore.h>
19#include <asm/processor.h>
20#include <asm/i387.h>
21#include <asm/uaccess.h>
22#include <asm/checksum.h>
23#include <asm/io.h>
24#include <asm/delay.h>
25#include <asm/irq.h>
26#include <asm/mmx.h>
27#include <asm/desc.h>
28#include <asm/pgtable.h>
29#include <asm/pgalloc.h>
30#include <asm/nmi.h>
31#include <asm/kdebug.h>
32#include <asm/unistd.h>
33#include <asm/tlbflush.h>
35
36extern spinlock_t rtc_lock;
37
38#ifdef CONFIG_SMP
39extern void __write_lock_failed(rwlock_t *rw);
40extern void __read_lock_failed(rwlock_t *rw);
41#endif
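/*
 * __write_lock_failed/__read_lock_failed are the out-of-line slow paths
 * invoked from the inline rwlock fast-path assembly; they are exported so
 * that modules taking rwlocks can link against them.
 */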
42
43#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
44extern struct drive_info_struct drive_info;
45EXPORT_SYMBOL(drive_info);
46#endif
47
48extern unsigned long get_cmos_time(void);
49
50/* platform dependent support */
51EXPORT_SYMBOL(boot_cpu_data);
52//EXPORT_SYMBOL(dump_fpu);
53EXPORT_SYMBOL(__ioremap);
54EXPORT_SYMBOL(ioremap_nocache);
55EXPORT_SYMBOL(iounmap);
56EXPORT_SYMBOL(enable_irq);
57EXPORT_SYMBOL(disable_irq);
58EXPORT_SYMBOL(disable_irq_nosync);
59EXPORT_SYMBOL(probe_irq_mask);
60EXPORT_SYMBOL(kernel_thread);
61EXPORT_SYMBOL(pm_idle);
62EXPORT_SYMBOL(pm_power_off);
63EXPORT_SYMBOL(get_cmos_time);
64
65EXPORT_SYMBOL(__down_failed);
66EXPORT_SYMBOL(__down_failed_interruptible);
67EXPORT_SYMBOL(__down_failed_trylock);
68EXPORT_SYMBOL(__up_wakeup);
69/* Networking helper routines. */
70EXPORT_SYMBOL(csum_partial_copy_nocheck);
71EXPORT_SYMBOL(ip_compute_csum);
72/* Delay loops */
73EXPORT_SYMBOL(__udelay);
74EXPORT_SYMBOL(__ndelay);
75EXPORT_SYMBOL(__delay);
76EXPORT_SYMBOL(__const_udelay);
77
78EXPORT_SYMBOL(__get_user_1);
79EXPORT_SYMBOL(__get_user_2);
80EXPORT_SYMBOL(__get_user_4);
81EXPORT_SYMBOL(__get_user_8);
82EXPORT_SYMBOL(__put_user_1);
83EXPORT_SYMBOL(__put_user_2);
84EXPORT_SYMBOL(__put_user_4);
85EXPORT_SYMBOL(__put_user_8);
86
87EXPORT_SYMBOL(strpbrk);
88EXPORT_SYMBOL(strstr);
89
90EXPORT_SYMBOL(strncpy_from_user);
91EXPORT_SYMBOL(__strncpy_from_user);
92EXPORT_SYMBOL(clear_user);
93EXPORT_SYMBOL(__clear_user);
94EXPORT_SYMBOL(copy_user_generic);
95EXPORT_SYMBOL(copy_from_user);
96EXPORT_SYMBOL(copy_to_user);
97EXPORT_SYMBOL(copy_in_user);
98EXPORT_SYMBOL(strnlen_user);
99
100#ifdef CONFIG_PCI
101EXPORT_SYMBOL(pci_alloc_consistent);
102EXPORT_SYMBOL(pci_free_consistent);
103EXPORT_SYMBOL(pci_mem_start);
104#endif
108
109EXPORT_SYMBOL(copy_page);
110EXPORT_SYMBOL(clear_page);
111
112EXPORT_SYMBOL(cpu_pda);
113#ifdef CONFIG_SMP
114EXPORT_SYMBOL(cpu_data);
115EXPORT_SYMBOL(cpu_online_map);
116EXPORT_SYMBOL(__write_lock_failed);
117EXPORT_SYMBOL(__read_lock_failed);
118
119EXPORT_SYMBOL(synchronize_irq);
120EXPORT_SYMBOL(smp_call_function);
121EXPORT_SYMBOL(cpu_callout_map);
122#endif
123
124#ifdef CONFIG_VT
125EXPORT_SYMBOL(screen_info);
126#endif
127
128EXPORT_SYMBOL(get_wchan);
129
130EXPORT_SYMBOL(rtc_lock);
131
132EXPORT_SYMBOL_GPL(set_nmi_callback);
133EXPORT_SYMBOL_GPL(unset_nmi_callback);
134
135/* Export string functions. We normally rely on gcc builtins for most of these,
136   but gcc sometimes decides not to inline them. */
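/*
 * Illustrative note (hypothetical example, not from this file): gcc can emit
 * such calls even when the source never names them, e.g. a plain struct
 * assignment
 *
 *	struct foo a, b;
 *	a = b;		(may compile to an out-of-line memcpy() call)
 *
 * so these exports are needed even for modules that never call the string
 * functions directly.
 */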
137#undef memcpy
138#undef memset
139#undef memmove
140#undef memchr
141#undef strlen
142#undef strcpy
143#undef strncmp
144#undef strncpy
145#undef strchr
146#undef strcmp
148#undef strcat
149#undef memcmp
150
151extern void * memset(void *,int,__kernel_size_t);
152extern size_t strlen(const char *);
153extern void * memmove(void * dest,const void *src,size_t count);
154extern char * strcpy(char * dest,const char *src);
155extern int strcmp(const char * cs,const char * ct);
156extern void *memchr(const void *s, int c, size_t n);
157extern void * memcpy(void *,const void *,__kernel_size_t);
158extern void * __memcpy(void *,const void *,__kernel_size_t);
159extern char * strcat(char *, const char *);
160extern int memcmp(const void * cs,const void * ct,size_t count);
161
162EXPORT_SYMBOL(memset);
163EXPORT_SYMBOL(strlen);
164EXPORT_SYMBOL(memmove);
165EXPORT_SYMBOL(strcpy);
166EXPORT_SYMBOL(strncmp);
167EXPORT_SYMBOL(strncpy);
168EXPORT_SYMBOL(strchr);
169EXPORT_SYMBOL(strcmp);
170EXPORT_SYMBOL(strcat);
171EXPORT_SYMBOL(strncat);
172EXPORT_SYMBOL(memchr);
173EXPORT_SYMBOL(strrchr);
174EXPORT_SYMBOL(strnlen);
175EXPORT_SYMBOL(memscan);
176EXPORT_SYMBOL(memcpy);
177EXPORT_SYMBOL(__memcpy);
178EXPORT_SYMBOL(memcmp);
179
180#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
181/* The prototypes are deliberately wrong: these are assembly routines with custom calling conventions. */
182extern void rwsem_down_read_failed_thunk(void);
183extern void rwsem_wake_thunk(void);
184extern void rwsem_downgrade_thunk(void);
185extern void rwsem_down_write_failed_thunk(void);
186EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
187EXPORT_SYMBOL(rwsem_wake_thunk);
188EXPORT_SYMBOL(rwsem_downgrade_thunk);
189EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
190#endif
191
192EXPORT_SYMBOL(empty_zero_page);
193
194#ifdef CONFIG_HAVE_DEC_LOCK
195EXPORT_SYMBOL(_atomic_dec_and_lock);
196#endif
197
198EXPORT_SYMBOL(die_chain);
199EXPORT_SYMBOL(register_die_notifier);
200
201#ifdef CONFIG_SMP
202EXPORT_SYMBOL(cpu_sibling_map);
203EXPORT_SYMBOL(smp_num_siblings);
204#endif
205
206extern void do_softirq_thunk(void);
207EXPORT_SYMBOL(do_softirq_thunk);
208
209void out_of_line_bug(void);
210EXPORT_SYMBOL(out_of_line_bug);
211
212EXPORT_SYMBOL(init_level4_pgt);
213
214extern unsigned long __supported_pte_mask;
215EXPORT_SYMBOL(__supported_pte_mask);
216
217#ifdef CONFIG_SMP
218EXPORT_SYMBOL(flush_tlb_page);
219#endif
220
221EXPORT_SYMBOL(cpu_khz);