aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-07-21 13:34:25 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-07-21 13:34:25 -0400
commit72a73693aac5ae82850cedc69fa5d264ca977c13 (patch)
tree80ab4bad93a2204ff264c0b07b63449a91410585 /arch
parentb7e6f62fe259187f2578d00960ef1b0e6ff6afd5 (diff)
parent2e2dcc7631e331cf2e8396ce452e7f01e35f1182 (diff)
Merge branch 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (160 commits) x86: remove extra calling to get ext cpuid level x86: use setup_clear_cpu_cap() when disabling the lapic KVM: fix exception entry / build bug, on 64-bit x86: add unknown_nmi_panic kernel parameter x86, VisWS: turn into generic arch, eliminate leftover files x86: add ->pre_time_init to x86_quirks x86: extend and use x86_quirks to clean up NUMAQ code x86: introduce x86_quirks x86: improve debug printout: add target bootmem range in early_res_to_bootmem() Subject: devmem, x86: fix rename of CONFIG_NONPROMISC_DEVMEM x86: remove arch_get_ram_range x86: Add a debugfs interface to dump PAT memtype x86: Add a arch directory for x86 under debugfs x86: i386: reduce boot fixmap space i386/xen: add proper unwind annotations to xen_sysenter_target x86: reduce force_mwait visibility x86: reduce forbid_dac's visibility x86: fix two modpost warnings x86: check function status in EDD boot code x86_64: ia32_signal.c: remove signal number conversion ...
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Kconfig.cpu4
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/boot/edd.c5
-rw-r--r--arch/x86/boot/pm.c6
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/ia32/ia32_signal.c11
-rw-r--r--arch/x86/ia32/ia32entry.S18
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/sleep.c10
-rw-r--r--arch/x86/kernel/amd_iommu.c231
-rw-r--r--arch/x86/kernel/amd_iommu_init.c357
-rw-r--r--arch/x86/kernel/aperture_64.c1
-rw-r--r--arch/x86/kernel/apic_32.c175
-rw-r--r--arch/x86/kernel/apic_64.c26
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/bios_uv.c48
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/amd_64.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c23
-rw-r--r--arch/x86/kernel/cpu/common_64.c15
-rw-r--r--arch/x86/kernel/cpu/intel.c10
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c7
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c4
-rw-r--r--arch/x86/kernel/e820.c33
-rw-r--r--arch/x86/kernel/early-quirks.c5
-rw-r--r--arch/x86/kernel/entry_32.S24
-rw-r--r--arch/x86/kernel/entry_64.S120
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c23
-rw-r--r--arch/x86/kernel/head64.c11
-rw-r--r--arch/x86/kernel/head_64.S1
-rw-r--r--arch/x86/kernel/io_apic_32.c53
-rw-r--r--arch/x86/kernel/io_apic_64.c41
-rw-r--r--arch/x86/kernel/io_delay.c3
-rw-r--r--arch/x86/kernel/ipi.c6
-rw-r--r--arch/x86/kernel/irq_32.c7
-rw-r--r--arch/x86/kernel/kdebugfs.c8
-rw-r--r--arch/x86/kernel/kprobes.c1
-rw-r--r--arch/x86/kernel/module_64.c10
-rw-r--r--arch/x86/kernel/mpparse.c208
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/numaq_32.c197
-rw-r--r--arch/x86/kernel/paravirt.c29
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c17
-rw-r--r--arch/x86/kernel/pci-gart_64.c1
-rw-r--r--arch/x86/kernel/pci-nommu.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c2
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/process_64.c56
-rw-r--r--arch/x86/kernel/ptrace.c151
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/setup.c22
-rw-r--r--arch/x86/kernel/signal_32.c8
-rw-r--r--arch/x86/kernel/signal_64.c6
-rw-r--r--arch/x86/kernel/smpboot.c56
-rw-r--r--arch/x86/kernel/smpcommon_32.c1
-rw-r--r--arch/x86/kernel/step.c35
-rw-r--r--arch/x86/kernel/time_32.c1
-rw-r--r--arch/x86/kernel/traps_32.c118
-rw-r--r--arch/x86/kernel/traps_64.c48
-rw-r--r--arch/x86/kernel/visws_quirks.c42
-rw-r--r--arch/x86/kernel/vmi_32.c1
-rw-r--r--arch/x86/lguest/boot.c1
-rw-r--r--arch/x86/mach-default/setup.c34
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/init_32.c5
-rw-r--r--arch/x86/mm/init_64.c112
-rw-r--r--arch/x86/mm/memtest.c123
-rw-r--r--arch/x86/mm/pat.c94
-rw-r--r--arch/x86/pci/Makefile12
-rw-r--r--arch/x86/pci/legacy.c9
-rw-r--r--arch/x86/pci/numaq_32.c (renamed from arch/x86/pci/numa.c)4
-rw-r--r--arch/x86/pci/pci.h3
-rw-r--r--arch/x86/pci/visws.c23
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/vdso/vdso32.S13
-rw-r--r--arch/x86/vdso/vma.c11
-rw-r--r--arch/x86/xen/Kconfig14
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c697
-rw-r--r--arch/x86/xen/mmu.c316
-rw-r--r--arch/x86/xen/mmu.h29
-rw-r--r--arch/x86/xen/multicalls.c1
-rw-r--r--arch/x86/xen/setup.c79
-rw-r--r--arch/x86/xen/smp.c306
-rw-r--r--arch/x86/xen/suspend.c5
-rw-r--r--arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)0
-rw-r--r--arch/x86/xen/xen-asm_64.S271
-rw-r--r--arch/x86/xen/xen-head.S28
-rw-r--r--arch/x86/xen/xen-ops.h21
93 files changed, 3154 insertions, 1440 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96e0c2ebc388..03980cb04291 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -447,7 +447,6 @@ config PARAVIRT_DEBUG
447 447
448config MEMTEST 448config MEMTEST
449 bool "Memtest" 449 bool "Memtest"
450 depends on X86_64
451 help 450 help
452 This option adds a kernel parameter 'memtest', which allows memtest 451 This option adds a kernel parameter 'memtest', which allows memtest
453 to be set. 452 to be set.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index abff1b84ed5b..54b8c02c71e6 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
362 def_bool y 362 def_bool y
363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
364 364
365config X86_GOOD_APIC
366 def_bool y
367 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
368
369config X86_INTEL_USERCOPY 365config X86_INTEL_USERCOPY
370 def_bool y 366 def_bool y
371 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 367 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ae36bfa814e5..85a87d2ac0c0 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
5 5
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config NONPROMISC_DEVMEM 8config STRICT_DEVMEM
9 bool "Filter access to /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 help
11 If this option is left off, you allow userspace access to all 11 If this option is left on, you allow userspace (root) access to all
12 of memory, including kernel and userspace memory. Accidental 12 of memory, including kernel and userspace memory. Accidental
13 access to this is obviously disastrous, but specific access can 13 access to this is obviously disastrous, but specific access can
14 be used by people debugging the kernel. 14 be used by people debugging the kernel. Note that with PAT support
15 enabled, even in this case there are restrictions on /dev/mem
16 use due to the cache aliasing requirements.
15 17
16 If this option is switched on, the /dev/mem file only allows 18 If this option is switched on, the /dev/mem file only allows
17 userspace access to PCI space and the BIOS code and data regions. 19 userspace access to PCI space and the BIOS code and data regions.
@@ -287,7 +289,6 @@ config CPA_DEBUG
287 289
288config OPTIMIZE_INLINING 290config OPTIMIZE_INLINING
289 bool "Allow gcc to uninline functions marked 'inline'" 291 bool "Allow gcc to uninline functions marked 'inline'"
290 depends on BROKEN
291 help 292 help
292 This option determines if the kernel forces gcc to inline the functions 293 This option determines if the kernel forces gcc to inline the functions
293 developers have marked 'inline'. Doing so takes away freedom from gcc to 294 developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -298,5 +299,7 @@ config OPTIMIZE_INLINING
298 become the default in the future, until then this option is there to 299 become the default in the future, until then this option is there to
299 test gcc for this. 300 test gcc for this.
300 301
302 If unsure, say N.
303
301endmenu 304endmenu
302 305
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 03399d64013b..d93cbc6464d0 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -167,9 +167,8 @@ void query_edd(void)
167 * Scan the BIOS-supported hard disks and query EDD 167 * Scan the BIOS-supported hard disks and query EDD
168 * information... 168 * information...
169 */ 169 */
170 get_edd_info(devno, &ei); 170 if (!get_edd_info(devno, &ei)
171 171 && boot_params.eddbuf_entries < EDDMAXNR) {
172 if (boot_params.eddbuf_entries < EDDMAXNR) {
173 memcpy(edp, &ei, sizeof ei); 172 memcpy(edp, &ei, sizeof ei);
174 edp++; 173 edp++;
175 boot_params.eddbuf_entries++; 174 boot_params.eddbuf_entries++;
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 328956fdb59e..85a1cd8a8ff8 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
98/* 98/*
99 * Set up the GDT 99 * Set up the GDT
100 */ 100 */
101#define GDT_ENTRY(flags, base, limit) \
102 (((u64)(base & 0xff000000) << 32) | \
103 ((u64)flags << 40) | \
104 ((u64)(limit & 0x00ff0000) << 32) | \
105 ((u64)(base & 0x00ffffff) << 16) | \
106 ((u64)(limit & 0x0000ffff)))
107 101
108struct gdt_ptr { 102struct gdt_ptr {
109 u16 len; 103 u16 len;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 9bc34e2033ec..4d73f53287b6 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -2047,7 +2047,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2047# CONFIG_SAMPLES is not set 2047# CONFIG_SAMPLES is not set
2048# CONFIG_KGDB is not set 2048# CONFIG_KGDB is not set
2049CONFIG_HAVE_ARCH_KGDB=y 2049CONFIG_HAVE_ARCH_KGDB=y
2050# CONFIG_NONPROMISC_DEVMEM is not set 2050# CONFIG_STRICT_DEVMEM is not set
2051CONFIG_EARLY_PRINTK=y 2051CONFIG_EARLY_PRINTK=y
2052CONFIG_DEBUG_STACKOVERFLOW=y 2052CONFIG_DEBUG_STACKOVERFLOW=y
2053CONFIG_DEBUG_STACK_USAGE=y 2053CONFIG_DEBUG_STACK_USAGE=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ae5124e064d4..a40452429625 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -2012,7 +2012,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2012# CONFIG_SAMPLES is not set 2012# CONFIG_SAMPLES is not set
2013# CONFIG_KGDB is not set 2013# CONFIG_KGDB is not set
2014CONFIG_HAVE_ARCH_KGDB=y 2014CONFIG_HAVE_ARCH_KGDB=y
2015# CONFIG_NONPROMISC_DEVMEM is not set 2015# CONFIG_STRICT_DEVMEM is not set
2016CONFIG_EARLY_PRINTK=y 2016CONFIG_EARLY_PRINTK=y
2017CONFIG_DEBUG_STACKOVERFLOW=y 2017CONFIG_DEBUG_STACKOVERFLOW=y
2018CONFIG_DEBUG_STACK_USAGE=y 2018CONFIG_DEBUG_STACK_USAGE=y
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb3856a18c85..20af4c79579a 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
36 36
37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
38 38
39#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
40 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF)
43
39asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); 44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
40void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 45void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
41 46
@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
248 regs->ss |= 3; 253 regs->ss |= 3;
249 254
250 err |= __get_user(tmpflags, &sc->flags); 255 err |= __get_user(tmpflags, &sc->flags);
251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); 256 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
252 /* disable syscall checks */ 257 /* disable syscall checks */
253 regs->orig_ax = -1; 258 regs->orig_ax = -1;
254 259
@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
515 compat_sigset_t *set, struct pt_regs *regs) 520 compat_sigset_t *set, struct pt_regs *regs)
516{ 521{
517 struct rt_sigframe __user *frame; 522 struct rt_sigframe __user *frame;
518 struct exec_domain *ed = current_thread_info()->exec_domain;
519 void __user *restorer; 523 void __user *restorer;
520 int err = 0; 524 int err = 0;
521 525
@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
538 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 542 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
539 goto give_sigsegv; 543 goto give_sigsegv;
540 544
541 err |= __put_user((ed && ed->signal_invmap && sig < 32 545 err |= __put_user(sig, &frame->sig);
542 ? ed->signal_invmap[sig] : sig), &frame->sig);
543 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 546 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
544 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 547 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
545 err |= copy_siginfo_to_user32(&frame->info, info); 548 err |= copy_siginfo_to_user32(&frame->info, info);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e4..23d146ce676b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -37,6 +37,11 @@
37 movq %rax,R8(%rsp) 37 movq %rax,R8(%rsp)
38 .endm 38 .endm
39 39
40 /*
41 * Reload arg registers from stack in case ptrace changed them.
42 * We don't reload %eax because syscall_trace_enter() returned
43 * the value it wants us to use in the table lookup.
44 */
40 .macro LOAD_ARGS32 offset 45 .macro LOAD_ARGS32 offset
41 movl \offset(%rsp),%r11d 46 movl \offset(%rsp),%r11d
42 movl \offset+8(%rsp),%r10d 47 movl \offset+8(%rsp),%r10d
@@ -46,7 +51,6 @@
46 movl \offset+48(%rsp),%edx 51 movl \offset+48(%rsp),%edx
47 movl \offset+56(%rsp),%esi 52 movl \offset+56(%rsp),%esi
48 movl \offset+64(%rsp),%edi 53 movl \offset+64(%rsp),%edi
49 movl \offset+72(%rsp),%eax
50 .endm 54 .endm
51 55
52 .macro CFI_STARTPROC32 simple 56 .macro CFI_STARTPROC32 simple
@@ -137,13 +141,12 @@ ENTRY(ia32_sysenter_target)
137 .previous 141 .previous
138 GET_THREAD_INFO(%r10) 142 GET_THREAD_INFO(%r10)
139 orl $TS_COMPAT,TI_status(%r10) 143 orl $TS_COMPAT,TI_status(%r10)
140 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 144 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
141 TI_flags(%r10)
142 CFI_REMEMBER_STATE 145 CFI_REMEMBER_STATE
143 jnz sysenter_tracesys 146 jnz sysenter_tracesys
144sysenter_do_call:
145 cmpl $(IA32_NR_syscalls-1),%eax 147 cmpl $(IA32_NR_syscalls-1),%eax
146 ja ia32_badsys 148 ja ia32_badsys
149sysenter_do_call:
147 IA32_ARG_FIXUP 1 150 IA32_ARG_FIXUP 1
148 call *ia32_sys_call_table(,%rax,8) 151 call *ia32_sys_call_table(,%rax,8)
149 movq %rax,RAX-ARGOFFSET(%rsp) 152 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -242,8 +245,7 @@ ENTRY(ia32_cstar_target)
242 .previous 245 .previous
243 GET_THREAD_INFO(%r10) 246 GET_THREAD_INFO(%r10)
244 orl $TS_COMPAT,TI_status(%r10) 247 orl $TS_COMPAT,TI_status(%r10)
245 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 248 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
246 TI_flags(%r10)
247 CFI_REMEMBER_STATE 249 CFI_REMEMBER_STATE
248 jnz cstar_tracesys 250 jnz cstar_tracesys
249cstar_do_call: 251cstar_do_call:
@@ -321,6 +323,7 @@ ENTRY(ia32_syscall)
321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 323 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
322 /*CFI_REL_OFFSET cs,CS-RIP*/ 324 /*CFI_REL_OFFSET cs,CS-RIP*/
323 CFI_REL_OFFSET rip,RIP-RIP 325 CFI_REL_OFFSET rip,RIP-RIP
326 PARAVIRT_ADJUST_EXCEPTION_FRAME
324 SWAPGS 327 SWAPGS
325 /* 328 /*
326 * No need to follow this irqs on/off section: the syscall 329 * No need to follow this irqs on/off section: the syscall
@@ -336,8 +339,7 @@ ENTRY(ia32_syscall)
336 SAVE_ARGS 0,0,1 339 SAVE_ARGS 0,0,1
337 GET_THREAD_INFO(%r10) 340 GET_THREAD_INFO(%r10)
338 orl $TS_COMPAT,TI_status(%r10) 341 orl $TS_COMPAT,TI_status(%r10)
339 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 342 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
340 TI_flags(%r10)
341 jnz ia32_tracesys 343 jnz ia32_tracesys
342ia32_do_syscall: 344ia32_do_syscall:
343 cmpl $(IA32_NR_syscalls-1),%eax 345 cmpl $(IA32_NR_syscalls-1),%eax
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..3db651fc8ec5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE 9ifdef CONFIG_FTRACE
10# Do not profile debug utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg
13endif 14endif
14 15
15# 16#
@@ -102,6 +103,7 @@ obj-$(CONFIG_OLPC) += olpc.o
102# 64 bit specific files 103# 64 bit specific files
103ifeq ($(CONFIG_X86_64),y) 104ifeq ($(CONFIG_X86_64),y)
104 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 105 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
106 obj-y += bios_uv.o
105 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 107 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
106 obj-$(CONFIG_AUDIT) += audit_64.o 108 obj-$(CONFIG_AUDIT) += audit_64.o
107 109
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 868de3d5c39d..a3ddad18aaa3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h>
12 13
13#include "realmode/wakeup.h" 14#include "realmode/wakeup.h"
14#include "sleep.h" 15#include "sleep.h"
@@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
23static char temp_stack[10240]; 24static char temp_stack[10240];
24#endif 25#endif
25 26
26/* XXX: this macro should move to asm-x86/segment.h and be shared with the
27 boot code... */
28#define GDT_ENTRY(flags, base, limit) \
29 (((u64)(base & 0xff000000) << 32) | \
30 ((u64)flags << 40) | \
31 ((u64)(limit & 0x00ff0000) << 32) | \
32 ((u64)(base & 0x00ffffff) << 16) | \
33 ((u64)(limit & 0x0000ffff)))
34
35/** 27/**
36 * acpi_save_state_mem - save kernel state 28 * acpi_save_state_mem - save kernel state
37 * 29 *
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a0..c25210e6ac88 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,7 +23,7 @@
23#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 24#include <linux/iommu-helper.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/gart.h> 26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h> 27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 28#include <asm/amd_iommu.h>
29 29
@@ -32,21 +32,37 @@
32#define to_pages(addr, size) \ 32#define to_pages(addr, size) \
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) 33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34 34
35#define EXIT_LOOP_COUNT 10000000
36
35static DEFINE_RWLOCK(amd_iommu_devtable_lock); 37static DEFINE_RWLOCK(amd_iommu_devtable_lock);
36 38
37struct command { 39/*
40 * general struct to manage commands send to an IOMMU
41 */
42struct iommu_cmd {
38 u32 data[4]; 43 u32 data[4];
39}; 44};
40 45
41static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 46static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
42 struct unity_map_entry *e); 47 struct unity_map_entry *e);
43 48
49/* returns !0 if the IOMMU is caching non-present entries in its TLB */
44static int iommu_has_npcache(struct amd_iommu *iommu) 50static int iommu_has_npcache(struct amd_iommu *iommu)
45{ 51{
46 return iommu->cap & IOMMU_CAP_NPCACHE; 52 return iommu->cap & IOMMU_CAP_NPCACHE;
47} 53}
48 54
49static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 55/****************************************************************************
56 *
57 * IOMMU command queuing functions
58 *
59 ****************************************************************************/
60
61/*
62 * Writes the command to the IOMMUs command buffer and informs the
63 * hardware about the new command. Must be called with iommu->lock held.
64 */
65static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
50{ 66{
51 u32 tail, head; 67 u32 tail, head;
52 u8 *target; 68 u8 *target;
@@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
63 return 0; 79 return 0;
64} 80}
65 81
66static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 82/*
83 * General queuing function for commands. Takes iommu->lock and calls
84 * __iommu_queue_command().
85 */
86static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
67{ 87{
68 unsigned long flags; 88 unsigned long flags;
69 int ret; 89 int ret;
@@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
75 return ret; 95 return ret;
76} 96}
77 97
98/*
99 * This function is called whenever we need to ensure that the IOMMU has
100 * completed execution of all commands we sent. It sends a
101 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
102 * us about that by writing a value to a physical address we pass with
103 * the command.
104 */
78static int iommu_completion_wait(struct amd_iommu *iommu) 105static int iommu_completion_wait(struct amd_iommu *iommu)
79{ 106{
80 int ret; 107 int ret;
81 struct command cmd; 108 struct iommu_cmd cmd;
82 volatile u64 ready = 0; 109 volatile u64 ready = 0;
83 unsigned long ready_phys = virt_to_phys(&ready); 110 unsigned long ready_phys = virt_to_phys(&ready);
111 unsigned long i = 0;
84 112
85 memset(&cmd, 0, sizeof(cmd)); 113 memset(&cmd, 0, sizeof(cmd));
86 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 114 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
87 cmd.data[1] = HIGH_U32(ready_phys); 115 cmd.data[1] = upper_32_bits(ready_phys);
88 cmd.data[2] = 1; /* value written to 'ready' */ 116 cmd.data[2] = 1; /* value written to 'ready' */
89 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 117 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
90 118
@@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
95 if (ret) 123 if (ret)
96 return ret; 124 return ret;
97 125
98 while (!ready) 126 while (!ready && (i < EXIT_LOOP_COUNT)) {
127 ++i;
99 cpu_relax(); 128 cpu_relax();
129 }
130
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
100 133
101 return 0; 134 return 0;
102} 135}
103 136
137/*
138 * Command send function for invalidating a device table entry
139 */
104static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
105{ 141{
106 struct command cmd; 142 struct iommu_cmd cmd;
107 143
108 BUG_ON(iommu == NULL); 144 BUG_ON(iommu == NULL);
109 145
@@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
116 return iommu_queue_command(iommu, &cmd); 152 return iommu_queue_command(iommu, &cmd);
117} 153}
118 154
155/*
156 * Generic command send function for invalidaing TLB entries
157 */
119static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 158static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
120 u64 address, u16 domid, int pde, int s) 159 u64 address, u16 domid, int pde, int s)
121{ 160{
122 struct command cmd; 161 struct iommu_cmd cmd;
123 162
124 memset(&cmd, 0, sizeof(cmd)); 163 memset(&cmd, 0, sizeof(cmd));
125 address &= PAGE_MASK; 164 address &= PAGE_MASK;
126 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 165 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
127 cmd.data[1] |= domid; 166 cmd.data[1] |= domid;
128 cmd.data[2] = LOW_U32(address); 167 cmd.data[2] = LOW_U32(address);
129 cmd.data[3] = HIGH_U32(address); 168 cmd.data[3] = upper_32_bits(address);
130 if (s) 169 if (s) /* size bit - we flush more than one 4kb page */
131 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 170 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
132 if (pde) 171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
133 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
134 173
135 iommu->need_sync = 1; 174 iommu->need_sync = 1;
@@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
137 return iommu_queue_command(iommu, &cmd); 176 return iommu_queue_command(iommu, &cmd);
138} 177}
139 178
179/*
180 * TLB invalidation function which is called from the mapping functions.
181 * It invalidates a single PTE if the range to flush is within a single
182 * page. Otherwise it flushes the whole TLB of the IOMMU.
183 */
140static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 184static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
141 u64 address, size_t size) 185 u64 address, size_t size)
142{ 186{
@@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
159 return 0; 203 return 0;
160} 204}
161 205
206/****************************************************************************
207 *
208 * The functions below are used the create the page table mappings for
209 * unity mapped regions.
210 *
211 ****************************************************************************/
212
213/*
214 * Generic mapping functions. It maps a physical address into a DMA
215 * address space. It allocates the page table pages if necessary.
216 * In the future it can be extended to a generic mapping function
217 * supporting all features of AMD IOMMU page tables like level skipping
218 * and full 64 bit address spaces.
219 */
162static int iommu_map(struct protection_domain *dom, 220static int iommu_map(struct protection_domain *dom,
163 unsigned long bus_addr, 221 unsigned long bus_addr,
164 unsigned long phys_addr, 222 unsigned long phys_addr,
@@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
209 return 0; 267 return 0;
210} 268}
211 269
270/*
271 * This function checks if a specific unity mapping entry is needed for
272 * this specific IOMMU.
273 */
212static int iommu_for_unity_map(struct amd_iommu *iommu, 274static int iommu_for_unity_map(struct amd_iommu *iommu,
213 struct unity_map_entry *entry) 275 struct unity_map_entry *entry)
214{ 276{
@@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
223 return 0; 285 return 0;
224} 286}
225 287
288/*
289 * Init the unity mappings for a specific IOMMU in the system
290 *
291 * Basically iterates over all unity mapping entries and applies them to
292 * the default domain DMA of that IOMMU if necessary.
293 */
226static int iommu_init_unity_mappings(struct amd_iommu *iommu) 294static int iommu_init_unity_mappings(struct amd_iommu *iommu)
227{ 295{
228 struct unity_map_entry *entry; 296 struct unity_map_entry *entry;
@@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
239 return 0; 307 return 0;
240} 308}
241 309
310/*
311 * This function actually applies the mapping to the page table of the
312 * dma_ops domain.
313 */
242static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 314static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
243 struct unity_map_entry *e) 315 struct unity_map_entry *e)
244{ 316{
@@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
261 return 0; 333 return 0;
262} 334}
263 335
336/*
337 * Inits the unity mappings required for a specific device
338 */
264static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 339static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
265 u16 devid) 340 u16 devid)
266{ 341{
@@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
278 return 0; 353 return 0;
279} 354}
280 355
356/****************************************************************************
357 *
358 * The next functions belong to the address allocator for the dma_ops
359 * interface functions. They work like the allocators in the other IOMMU
360 * drivers. Its basically a bitmap which marks the allocated pages in
361 * the aperture. Maybe it could be enhanced in the future to a more
362 * efficient allocator.
363 *
364 ****************************************************************************/
281static unsigned long dma_mask_to_pages(unsigned long mask) 365static unsigned long dma_mask_to_pages(unsigned long mask)
282{ 366{
283 return (mask >> PAGE_SHIFT) + 367 return (mask >> PAGE_SHIFT) +
284 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); 368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
285} 369}
286 370
371/*
372 * The address allocator core function.
373 *
374 * called with domain->lock held
375 */
287static unsigned long dma_ops_alloc_addresses(struct device *dev, 376static unsigned long dma_ops_alloc_addresses(struct device *dev,
288 struct dma_ops_domain *dom, 377 struct dma_ops_domain *dom,
289 unsigned int pages) 378 unsigned int pages)
@@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
317 return address; 406 return address;
318} 407}
319 408
409/*
410 * The address free function.
411 *
412 * called with domain->lock held
413 */
320static void dma_ops_free_addresses(struct dma_ops_domain *dom, 414static void dma_ops_free_addresses(struct dma_ops_domain *dom,
321 unsigned long address, 415 unsigned long address,
322 unsigned int pages) 416 unsigned int pages)
@@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
325 iommu_area_free(dom->bitmap, address, pages); 419 iommu_area_free(dom->bitmap, address, pages);
326} 420}
327 421
422/****************************************************************************
423 *
424 * The next functions belong to the domain allocation. A domain is
425 * allocated for every IOMMU as the default domain. If device isolation
426 * is enabled, every device get its own domain. The most important thing
427 * about domains is the page table mapping the DMA address space they
428 * contain.
429 *
430 ****************************************************************************/
431
328static u16 domain_id_alloc(void) 432static u16 domain_id_alloc(void)
329{ 433{
330 unsigned long flags; 434 unsigned long flags;
@@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
342 return id; 446 return id;
343} 447}
344 448
449/*
450 * Used to reserve address ranges in the aperture (e.g. for exclusion
451 * ranges.
452 */
345static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 453static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
346 unsigned long start_page, 454 unsigned long start_page,
347 unsigned int pages) 455 unsigned int pages)
@@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
382 free_page((unsigned long)p1); 490 free_page((unsigned long)p1);
383} 491}
384 492
493/*
494 * Free a domain, only used if something went wrong in the
495 * allocation path and we need to free an already allocated page table
496 */
385static void dma_ops_domain_free(struct dma_ops_domain *dom) 497static void dma_ops_domain_free(struct dma_ops_domain *dom)
386{ 498{
387 if (!dom) 499 if (!dom)
@@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
396 kfree(dom); 508 kfree(dom);
397} 509}
398 510
511/*
512 * Allocates a new protection domain usable for the dma_ops functions.
513 * It also intializes the page table and the address allocator data
514 * structures required for the dma_ops interface
515 */
399static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 516static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
400 unsigned order) 517 unsigned order)
401{ 518{
@@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
436 dma_dom->bitmap[0] = 1; 553 dma_dom->bitmap[0] = 1;
437 dma_dom->next_bit = 0; 554 dma_dom->next_bit = 0;
438 555
556 /* Intialize the exclusion range if necessary */
439 if (iommu->exclusion_start && 557 if (iommu->exclusion_start &&
440 iommu->exclusion_start < dma_dom->aperture_size) { 558 iommu->exclusion_start < dma_dom->aperture_size) {
441 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 559 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
444 dma_ops_reserve_addresses(dma_dom, startpage, pages); 562 dma_ops_reserve_addresses(dma_dom, startpage, pages);
445 } 563 }
446 564
565 /*
566 * At the last step, build the page tables so we don't need to
567 * allocate page table pages in the dma_ops mapping/unmapping
568 * path.
569 */
447 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 570 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
448 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 571 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
449 GFP_KERNEL); 572 GFP_KERNEL);
@@ -472,6 +595,10 @@ free_dma_dom:
472 return NULL; 595 return NULL;
473} 596}
474 597
598/*
599 * Find out the protection domain structure for a given PCI device. This
600 * will give us the pointer to the page table root for example.
601 */
475static struct protection_domain *domain_for_device(u16 devid) 602static struct protection_domain *domain_for_device(u16 devid)
476{ 603{
477 struct protection_domain *dom; 604 struct protection_domain *dom;
@@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
484 return dom; 611 return dom;
485} 612}
486 613
614/*
615 * If a device is not yet associated with a domain, this function does
616 * assigns it visible for the hardware
617 */
487static void set_device_domain(struct amd_iommu *iommu, 618static void set_device_domain(struct amd_iommu *iommu,
488 struct protection_domain *domain, 619 struct protection_domain *domain,
489 u16 devid) 620 u16 devid)
@@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
508 iommu->need_sync = 1; 639 iommu->need_sync = 1;
509} 640}
510 641
642/*****************************************************************************
643 *
644 * The next functions belong to the dma_ops mapping/unmapping code.
645 *
646 *****************************************************************************/
647
648/*
649 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device.
652 * If the device is not yet associated with a domain this is also done
653 * in this function.
654 */
511static int get_device_resources(struct device *dev, 655static int get_device_resources(struct device *dev,
512 struct amd_iommu **iommu, 656 struct amd_iommu **iommu,
513 struct protection_domain **domain, 657 struct protection_domain **domain,
@@ -520,8 +664,9 @@ static int get_device_resources(struct device *dev,
520 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
521 665
522 pcidev = to_pci_dev(dev); 666 pcidev = to_pci_dev(dev);
523 _bdf = (pcidev->bus->number << 8) | pcidev->devfn; 667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
524 668
669 /* device not translated by any IOMMU in the system? */
525 if (_bdf >= amd_iommu_last_bdf) { 670 if (_bdf >= amd_iommu_last_bdf) {
526 *iommu = NULL; 671 *iommu = NULL;
527 *domain = NULL; 672 *domain = NULL;
@@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
547 return 1; 692 return 1;
548} 693}
549 694
695/*
696 * This is the generic map function. It maps one 4kb page at paddr to
697 * the given address in the DMA address space for the domain.
698 */
550static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 699static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
551 struct dma_ops_domain *dom, 700 struct dma_ops_domain *dom,
552 unsigned long address, 701 unsigned long address,
@@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
578 return (dma_addr_t)address; 727 return (dma_addr_t)address;
579} 728}
580 729
730/*
731 * The generic unmapping function for on page in the DMA address space.
732 */
581static void dma_ops_domain_unmap(struct amd_iommu *iommu, 733static void dma_ops_domain_unmap(struct amd_iommu *iommu,
582 struct dma_ops_domain *dom, 734 struct dma_ops_domain *dom,
583 unsigned long address) 735 unsigned long address)
@@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
597 *pte = 0ULL; 749 *pte = 0ULL;
598} 750}
599 751
752/*
753 * This function contains common code for mapping of a physically
754 * contiguous memory region into DMA address space. It is uses by all
755 * mapping functions provided by this IOMMU driver.
756 * Must be called with the domain lock held.
757 */
600static dma_addr_t __map_single(struct device *dev, 758static dma_addr_t __map_single(struct device *dev,
601 struct amd_iommu *iommu, 759 struct amd_iommu *iommu,
602 struct dma_ops_domain *dma_dom, 760 struct dma_ops_domain *dma_dom,
@@ -628,6 +786,10 @@ out:
628 return address; 786 return address;
629} 787}
630 788
789/*
790 * Does the reverse of the __map_single function. Must be called with
791 * the domain lock held too
792 */
631static void __unmap_single(struct amd_iommu *iommu, 793static void __unmap_single(struct amd_iommu *iommu,
632 struct dma_ops_domain *dma_dom, 794 struct dma_ops_domain *dma_dom,
633 dma_addr_t dma_addr, 795 dma_addr_t dma_addr,
@@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
652 dma_ops_free_addresses(dma_dom, dma_addr, pages); 814 dma_ops_free_addresses(dma_dom, dma_addr, pages);
653} 815}
654 816
817/*
818 * The exported map_single function for dma_ops.
819 */
655static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, 820static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
656 size_t size, int dir) 821 size_t size, int dir)
657{ 822{
@@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
664 get_device_resources(dev, &iommu, &domain, &devid); 829 get_device_resources(dev, &iommu, &domain, &devid);
665 830
666 if (iommu == NULL || domain == NULL) 831 if (iommu == NULL || domain == NULL)
832 /* device not handled by any AMD IOMMU */
667 return (dma_addr_t)paddr; 833 return (dma_addr_t)paddr;
668 834
669 spin_lock_irqsave(&domain->lock, flags); 835 spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +849,9 @@ out:
683 return addr; 849 return addr;
684} 850}
685 851
852/*
853 * The exported unmap_single function for dma_ops.
854 */
686static void unmap_single(struct device *dev, dma_addr_t dma_addr, 855static void unmap_single(struct device *dev, dma_addr_t dma_addr,
687 size_t size, int dir) 856 size_t size, int dir)
688{ 857{
@@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
692 u16 devid; 861 u16 devid;
693 862
694 if (!get_device_resources(dev, &iommu, &domain, &devid)) 863 if (!get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */
695 return; 865 return;
696 866
697 spin_lock_irqsave(&domain->lock, flags); 867 spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
706 spin_unlock_irqrestore(&domain->lock, flags); 876 spin_unlock_irqrestore(&domain->lock, flags);
707} 877}
708 878
879/*
880 * This is a special map_sg function which is used if we should map a
881 * device which is not handled by an AMD IOMMU in the system.
882 */
709static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, 883static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
710 int nelems, int dir) 884 int nelems, int dir)
711{ 885{
@@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
720 return nelems; 894 return nelems;
721} 895}
722 896
897/*
898 * The exported map_sg function for dma_ops (handles scatter-gather
899 * lists).
900 */
723static int map_sg(struct device *dev, struct scatterlist *sglist, 901static int map_sg(struct device *dev, struct scatterlist *sglist,
724 int nelems, int dir) 902 int nelems, int dir)
725{ 903{
@@ -775,6 +953,10 @@ unmap:
775 goto out; 953 goto out;
776} 954}
777 955
956/*
957 * The exported map_sg function for dma_ops (handles scatter-gather
958 * lists).
959 */
778static void unmap_sg(struct device *dev, struct scatterlist *sglist, 960static void unmap_sg(struct device *dev, struct scatterlist *sglist,
779 int nelems, int dir) 961 int nelems, int dir)
780{ 962{
@@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
804 spin_unlock_irqrestore(&domain->lock, flags); 986 spin_unlock_irqrestore(&domain->lock, flags);
805} 987}
806 988
989/*
990 * The exported alloc_coherent function for dma_ops.
991 */
807static void *alloc_coherent(struct device *dev, size_t size, 992static void *alloc_coherent(struct device *dev, size_t size,
808 dma_addr_t *dma_addr, gfp_t flag) 993 dma_addr_t *dma_addr, gfp_t flag)
809{ 994{
@@ -851,6 +1036,11 @@ out:
851 return virt_addr; 1036 return virt_addr;
852} 1037}
853 1038
1039/*
1040 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */
854static void free_coherent(struct device *dev, size_t size, 1044static void free_coherent(struct device *dev, size_t size,
855 void *virt_addr, dma_addr_t dma_addr) 1045 void *virt_addr, dma_addr_t dma_addr)
856{ 1046{
@@ -879,6 +1069,8 @@ free_mem:
879} 1069}
880 1070
881/* 1071/*
1072 * The function for pre-allocating protection domains.
1073 *
882 * If the driver core informs the DMA layer if a driver grabs a device 1074 * If the driver core informs the DMA layer if a driver grabs a device
883 * we don't need to preallocate the protection domains anymore. 1075 * we don't need to preallocate the protection domains anymore.
884 * For now we have to. 1076 * For now we have to.
@@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
921 .unmap_sg = unmap_sg, 1113 .unmap_sg = unmap_sg,
922}; 1114};
923 1115
1116/*
1117 * The function which clues the AMD IOMMU driver into dma_ops.
1118 */
924int __init amd_iommu_init_dma_ops(void) 1119int __init amd_iommu_init_dma_ops(void)
925{ 1120{
926 struct amd_iommu *iommu; 1121 struct amd_iommu *iommu;
927 int order = amd_iommu_aperture_order; 1122 int order = amd_iommu_aperture_order;
928 int ret; 1123 int ret;
929 1124
1125 /*
1126 * first allocate a default protection domain for every IOMMU we
1127 * found in the system. Devices not assigned to any other
1128 * protection domain will be assigned to the default one.
1129 */
930 list_for_each_entry(iommu, &amd_iommu_list, list) { 1130 list_for_each_entry(iommu, &amd_iommu_list, list) {
931 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1131 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
932 if (iommu->default_dom == NULL) 1132 if (iommu->default_dom == NULL)
@@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
936 goto free_domains; 1136 goto free_domains;
937 } 1137 }
938 1138
1139 /*
1140 * If device isolation is enabled, pre-allocate the protection
1141 * domains for each device.
1142 */
939 if (amd_iommu_isolate) 1143 if (amd_iommu_isolate)
940 prealloc_protection_domains(); 1144 prealloc_protection_domains();
941 1145
@@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
947 gart_iommu_aperture = 0; 1151 gart_iommu_aperture = 0;
948#endif 1152#endif
949 1153
1154 /* Make the driver finally visible to the drivers */
950 dma_ops = &amd_iommu_dma_ops; 1155 dma_ops = &amd_iommu_dma_ops;
951 1156
952 return 0; 1157 return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437d..c9d8ff2eb130 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -25,20 +25,13 @@
25#include <asm/pci-direct.h> 25#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 26#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 27#include <asm/amd_iommu.h>
28#include <asm/gart.h> 28#include <asm/iommu.h>
29 29
30/* 30/*
31 * definitions for the ACPI scanning code 31 * definitions for the ACPI scanning code
32 */ 32 */
33#define UPDATE_LAST_BDF(x) do {\
34 if ((x) > amd_iommu_last_bdf) \
35 amd_iommu_last_bdf = (x); \
36 } while (0);
37
38#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
39#define PCI_BUS(x) (((x) >> 8) & 0xff) 33#define PCI_BUS(x) (((x) >> 8) & 0xff)
40#define IVRS_HEADER_LENGTH 48 34#define IVRS_HEADER_LENGTH 48
41#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
42 35
43#define ACPI_IVHD_TYPE 0x10 36#define ACPI_IVHD_TYPE 0x10
44#define ACPI_IVMD_TYPE_ALL 0x20 37#define ACPI_IVMD_TYPE_ALL 0x20
@@ -71,6 +64,17 @@
71#define ACPI_DEVFLAG_LINT1 0x80 64#define ACPI_DEVFLAG_LINT1 0x80
72#define ACPI_DEVFLAG_ATSDIS 0x10000000 65#define ACPI_DEVFLAG_ATSDIS 0x10000000
73 66
67/*
68 * ACPI table definitions
69 *
70 * These data structures are laid over the table to parse the important values
71 * out of it.
72 */
73
74/*
75 * structure describing one IOMMU in the ACPI table. Typically followed by one
76 * or more ivhd_entrys.
77 */
74struct ivhd_header { 78struct ivhd_header {
75 u8 type; 79 u8 type;
76 u8 flags; 80 u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
83 u32 reserved; 87 u32 reserved;
84} __attribute__((packed)); 88} __attribute__((packed));
85 89
90/*
91 * A device entry describing which devices a specific IOMMU translates and
92 * which requestor ids they use.
93 */
86struct ivhd_entry { 94struct ivhd_entry {
87 u8 type; 95 u8 type;
88 u16 devid; 96 u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
90 u32 ext; 98 u32 ext;
91} __attribute__((packed)); 99} __attribute__((packed));
92 100
101/*
102 * An AMD IOMMU memory definition structure. It defines things like exclusion
103 * ranges for devices and regions that should be unity mapped.
104 */
93struct ivmd_header { 105struct ivmd_header {
94 u8 type; 106 u8 type;
95 u8 flags; 107 u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
103 115
104static int __initdata amd_iommu_detected; 116static int __initdata amd_iommu_detected;
105 117
106u16 amd_iommu_last_bdf; 118u16 amd_iommu_last_bdf; /* largest PCI device id we have
107struct list_head amd_iommu_unity_map; 119 to handle */
108unsigned amd_iommu_aperture_order = 26; 120LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
109int amd_iommu_isolate; 121 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */
124
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */
110 127
111struct list_head amd_iommu_list; 128/*
129 * Pointer to the device table which is shared by all AMD IOMMUs
130 * it is indexed by the PCI device id or the HT unit id and contains
131 * information about the domain the device belongs to as well as the
132 * page table root pointer.
133 */
112struct dev_table_entry *amd_iommu_dev_table; 134struct dev_table_entry *amd_iommu_dev_table;
135
136/*
137 * The alias table is a driver specific data structure which contains the
138 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
139 * More than one device can share the same requestor id.
140 */
113u16 *amd_iommu_alias_table; 141u16 *amd_iommu_alias_table;
142
143/*
144 * The rlookup table is used to find the IOMMU which is responsible
145 * for a specific device. It is also indexed by the PCI device id.
146 */
114struct amd_iommu **amd_iommu_rlookup_table; 147struct amd_iommu **amd_iommu_rlookup_table;
148
149/*
150 * The pd table (protection domain table) is used to find the protection domain
151 * data structure a device belongs to. Indexed with the PCI device id too.
152 */
115struct protection_domain **amd_iommu_pd_table; 153struct protection_domain **amd_iommu_pd_table;
154
155/*
156 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
157 * to know which ones are already in use.
158 */
116unsigned long *amd_iommu_pd_alloc_bitmap; 159unsigned long *amd_iommu_pd_alloc_bitmap;
117 160
118static u32 dev_table_size; 161static u32 dev_table_size; /* size of the device table */
119static u32 alias_table_size; 162static u32 alias_table_size; /* size of the alias table */
120static u32 rlookup_table_size; 163static u32 rlookup_table_size; /* size if the rlookup table */
121 164
165static inline void update_last_devid(u16 devid)
166{
167 if (devid > amd_iommu_last_bdf)
168 amd_iommu_last_bdf = devid;
169}
170
171static inline unsigned long tbl_size(int entry_size)
172{
173 unsigned shift = PAGE_SHIFT +
174 get_order(amd_iommu_last_bdf * entry_size);
175
176 return 1UL << shift;
177}
178
179/****************************************************************************
180 *
181 * AMD IOMMU MMIO register space handling functions
182 *
183 * These functions are used to program the IOMMU device registers in
184 * MMIO space required for that driver.
185 *
186 ****************************************************************************/
187
188/*
189 * This function set the exclusion range in the IOMMU. DMA accesses to the
190 * exclusion range are passed through untranslated
191 */
122static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 192static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
123{ 193{
124 u64 start = iommu->exclusion_start & PAGE_MASK; 194 u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
137 &entry, sizeof(entry)); 207 &entry, sizeof(entry));
138} 208}
139 209
210/* Programs the physical address of the device table into the IOMMU hardware */
140static void __init iommu_set_device_table(struct amd_iommu *iommu) 211static void __init iommu_set_device_table(struct amd_iommu *iommu)
141{ 212{
142 u32 entry; 213 u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
149 &entry, sizeof(entry)); 220 &entry, sizeof(entry));
150} 221}
151 222
223/* Generic functions to enable/disable certain features of the IOMMU. */
152static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 224static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
153{ 225{
154 u32 ctrl; 226 u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
167 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
168} 240}
169 241
242/* Function to enable the hardware */
170void __init iommu_enable(struct amd_iommu *iommu) 243void __init iommu_enable(struct amd_iommu *iommu)
171{ 244{
172 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
176 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
177} 250}
178 251
252/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one.
255 */
179static u8 * __init iommu_map_mmio_space(u64 address) 256static u8 * __init iommu_map_mmio_space(u64 address)
180{ 257{
181 u8 *ret; 258 u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
199 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); 276 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
200} 277}
201 278
279/****************************************************************************
280 *
281 * The functions below belong to the first pass of AMD IOMMU ACPI table
282 * parsing. In this pass we try to find out the highest device id this
283 * code has to handle. Upon this information the size of the shared data
284 * structures is determined later.
285 *
286 ****************************************************************************/
287
288/*
289 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU
291 */
202static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) 292static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
203{ 293{
204 u32 cap; 294 u32 cap;
205 295
206 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 296 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
207 UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); 297 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
208 298
209 return 0; 299 return 0;
210} 300}
211 301
302/*
303 * After reading the highest device id from the IOMMU PCI capability header
304 * this function looks if there is a higher device id defined in the ACPI table
305 */
212static int __init find_last_devid_from_ivhd(struct ivhd_header *h) 306static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
213{ 307{
214 u8 *p = (void *)h, *end = (void *)h; 308 u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
229 case IVHD_DEV_RANGE_END: 323 case IVHD_DEV_RANGE_END:
230 case IVHD_DEV_ALIAS: 324 case IVHD_DEV_ALIAS:
231 case IVHD_DEV_EXT_SELECT: 325 case IVHD_DEV_EXT_SELECT:
232 UPDATE_LAST_BDF(dev->devid); 326 /* all the above subfield types refer to device ids */
327 update_last_devid(dev->devid);
233 break; 328 break;
234 default: 329 default:
235 break; 330 break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
242 return 0; 337 return 0;
243} 338}
244 339
340/*
341 * Iterate over all IVHD entries in the ACPI table and find the highest device
342 * id which we need to handle. This is the first of three functions which parse
343 * the ACPI table. So we check the checksum here.
344 */
245static int __init find_last_devid_acpi(struct acpi_table_header *table) 345static int __init find_last_devid_acpi(struct acpi_table_header *table)
246{ 346{
247 int i; 347 int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
277 return 0; 377 return 0;
278} 378}
279 379
380/****************************************************************************
381 *
382 * The following functions belong the the code path which parses the ACPI table
383 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
384 * data structures, initialize the device/alias/rlookup table and also
385 * basically initialize the hardware.
386 *
387 ****************************************************************************/
388
389/*
390 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
391 * write commands to that buffer later and the IOMMU will execute them
392 * asynchronously
393 */
280static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) 394static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
281{ 395{
282 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, 396 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
283 get_order(CMD_BUFFER_SIZE)); 397 get_order(CMD_BUFFER_SIZE));
284 u64 entry = 0; 398 u64 entry;
285 399
286 if (cmd_buf == NULL) 400 if (cmd_buf == NULL)
287 return NULL; 401 return NULL;
288 402
289 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 403 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
290 404
291 memset(cmd_buf, 0, CMD_BUFFER_SIZE);
292
293 entry = (u64)virt_to_phys(cmd_buf); 405 entry = (u64)virt_to_phys(cmd_buf);
294 entry |= MMIO_CMD_SIZE_512; 406 entry |= MMIO_CMD_SIZE_512;
295 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 407 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
302 414
303static void __init free_command_buffer(struct amd_iommu *iommu) 415static void __init free_command_buffer(struct amd_iommu *iommu)
304{ 416{
305 if (iommu->cmd_buf) 417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
306 free_pages((unsigned long)iommu->cmd_buf,
307 get_order(CMD_BUFFER_SIZE));
308} 418}
309 419
420/* sets a specific bit in the device table entry. */
310static void set_dev_entry_bit(u16 devid, u8 bit) 421static void set_dev_entry_bit(u16 devid, u8 bit)
311{ 422{
312 int i = (bit >> 5) & 0x07; 423 int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
315 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 426 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
316} 427}
317 428
318static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) 429/* Writes the specific IOMMU for a device into the rlookup table */
430static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
431{
432 amd_iommu_rlookup_table[devid] = iommu;
433}
434
435/*
436 * This function takes the device specific flags read from the ACPI
437 * table and sets up the device table entry with that information
438 */
439static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
440 u16 devid, u32 flags, u32 ext_flags)
319{ 441{
320 if (flags & ACPI_DEVFLAG_INITPASS) 442 if (flags & ACPI_DEVFLAG_INITPASS)
321 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); 443 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
331 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); 453 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
332 if (flags & ACPI_DEVFLAG_LINT1) 454 if (flags & ACPI_DEVFLAG_LINT1)
333 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 455 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
334}
335 456
336static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 457 set_iommu_for_device(iommu, devid);
337{
338 amd_iommu_rlookup_table[devid] = iommu;
339} 458}
340 459
460/*
461 * Reads the device exclusion range from ACPI and initialize IOMMU with
462 * it
463 */
341static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) 464static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
342{ 465{
343 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 466 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
346 return; 469 return;
347 470
348 if (iommu) { 471 if (iommu) {
472 /*
473 * We only can configure exclusion ranges per IOMMU, not
474 * per device. But we can enable the exclusion range per
475 * device. This is done here
476 */
349 set_dev_entry_bit(m->devid, DEV_ENTRY_EX); 477 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
350 iommu->exclusion_start = m->range_start; 478 iommu->exclusion_start = m->range_start;
351 iommu->exclusion_length = m->range_length; 479 iommu->exclusion_length = m->range_length;
352 } 480 }
353} 481}
354 482
483/*
484 * This function reads some important data from the IOMMU PCI space and
485 * initializes the driver data structure with it. It reads the hardware
486 * capabilities and the first/last device entries
487 */
355static void __init init_iommu_from_pci(struct amd_iommu *iommu) 488static void __init init_iommu_from_pci(struct amd_iommu *iommu)
356{ 489{
357 int bus = PCI_BUS(iommu->devid); 490 int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
363 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
364 497
365 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
366 iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); 499 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
367 iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); 500 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range));
368} 503}
369 504
505/*
506 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
507 * initializes the hardware and our data structures with it.
508 */
370static void __init init_iommu_from_acpi(struct amd_iommu *iommu, 509static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
371 struct ivhd_header *h) 510 struct ivhd_header *h)
372{ 511{
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
374 u8 *end = p, flags = 0; 513 u8 *end = p, flags = 0;
375 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 514 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
376 u32 ext_flags = 0; 515 u32 ext_flags = 0;
377 bool alias = 0; 516 bool alias = false;
378 struct ivhd_entry *e; 517 struct ivhd_entry *e;
379 518
380 /* 519 /*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
414 case IVHD_DEV_ALL: 553 case IVHD_DEV_ALL:
415 for (dev_i = iommu->first_device; 554 for (dev_i = iommu->first_device;
416 dev_i <= iommu->last_device; ++dev_i) 555 dev_i <= iommu->last_device; ++dev_i)
417 set_dev_entry_from_acpi(dev_i, e->flags, 0); 556 set_dev_entry_from_acpi(iommu, dev_i,
557 e->flags, 0);
418 break; 558 break;
419 case IVHD_DEV_SELECT: 559 case IVHD_DEV_SELECT:
420 devid = e->devid; 560 devid = e->devid;
421 set_dev_entry_from_acpi(devid, e->flags, 0); 561 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
422 break; 562 break;
423 case IVHD_DEV_SELECT_RANGE_START: 563 case IVHD_DEV_SELECT_RANGE_START:
424 devid_start = e->devid; 564 devid_start = e->devid;
425 flags = e->flags; 565 flags = e->flags;
426 ext_flags = 0; 566 ext_flags = 0;
427 alias = 0; 567 alias = false;
428 break; 568 break;
429 case IVHD_DEV_ALIAS: 569 case IVHD_DEV_ALIAS:
430 devid = e->devid; 570 devid = e->devid;
431 devid_to = e->ext >> 8; 571 devid_to = e->ext >> 8;
432 set_dev_entry_from_acpi(devid, e->flags, 0); 572 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
433 amd_iommu_alias_table[devid] = devid_to; 573 amd_iommu_alias_table[devid] = devid_to;
434 break; 574 break;
435 case IVHD_DEV_ALIAS_RANGE: 575 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
437 flags = e->flags; 577 flags = e->flags;
438 devid_to = e->ext >> 8; 578 devid_to = e->ext >> 8;
439 ext_flags = 0; 579 ext_flags = 0;
440 alias = 1; 580 alias = true;
441 break; 581 break;
442 case IVHD_DEV_EXT_SELECT: 582 case IVHD_DEV_EXT_SELECT:
443 devid = e->devid; 583 devid = e->devid;
444 set_dev_entry_from_acpi(devid, e->flags, e->ext); 584 set_dev_entry_from_acpi(iommu, devid, e->flags,
585 e->ext);
445 break; 586 break;
446 case IVHD_DEV_EXT_SELECT_RANGE: 587 case IVHD_DEV_EXT_SELECT_RANGE:
447 devid_start = e->devid; 588 devid_start = e->devid;
448 flags = e->flags; 589 flags = e->flags;
449 ext_flags = e->ext; 590 ext_flags = e->ext;
450 alias = 0; 591 alias = false;
451 break; 592 break;
452 case IVHD_DEV_RANGE_END: 593 case IVHD_DEV_RANGE_END:
453 devid = e->devid; 594 devid = e->devid;
454 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 595 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
455 if (alias) 596 if (alias)
456 amd_iommu_alias_table[dev_i] = devid_to; 597 amd_iommu_alias_table[dev_i] = devid_to;
457 set_dev_entry_from_acpi( 598 set_dev_entry_from_acpi(iommu,
458 amd_iommu_alias_table[dev_i], 599 amd_iommu_alias_table[dev_i],
459 flags, ext_flags); 600 flags, ext_flags);
460 } 601 }
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
467 } 608 }
468} 609}
469 610
611/* Initializes the device->iommu mapping for the driver */
470static int __init init_iommu_devices(struct amd_iommu *iommu) 612static int __init init_iommu_devices(struct amd_iommu *iommu)
471{ 613{
472 u16 i; 614 u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
494 } 636 }
495} 637}
496 638
639/*
640 * This function clues the initialization function for one IOMMU
641 * together and also allocates the command buffer and programs the
642 * hardware. It does NOT enable the IOMMU. This is done afterwards.
643 */
497static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 644static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
498{ 645{
499 spin_lock_init(&iommu->lock); 646 spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
521 return 0; 668 return 0;
522} 669}
523 670
671/*
672 * Iterates over all IOMMU entries in the ACPI table, allocates the
673 * IOMMU structure and initializes it with init_iommu_one()
674 */
524static int __init init_iommu_all(struct acpi_table_header *table) 675static int __init init_iommu_all(struct acpi_table_header *table)
525{ 676{
526 u8 *p = (u8 *)table, *end = (u8 *)table; 677 u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
528 struct amd_iommu *iommu; 679 struct amd_iommu *iommu;
529 int ret; 680 int ret;
530 681
531 INIT_LIST_HEAD(&amd_iommu_list);
532
533 end += table->length; 682 end += table->length;
534 p += IVRS_HEADER_LENGTH; 683 p += IVRS_HEADER_LENGTH;
535 684
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
555 return 0; 704 return 0;
556} 705}
557 706
707/****************************************************************************
708 *
709 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges).
712 *
713 ****************************************************************************/
714
558static void __init free_unity_maps(void) 715static void __init free_unity_maps(void)
559{ 716{
560 struct unity_map_entry *entry, *next; 717 struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
565 } 722 }
566} 723}
567 724
725/* called when we find an exclusion range definition in ACPI */
568static int __init init_exclusion_range(struct ivmd_header *m) 726static int __init init_exclusion_range(struct ivmd_header *m)
569{ 727{
570 int i; 728 int i;
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
588 return 0; 746 return 0;
589} 747}
590 748
749/* called for unity map ACPI definition */
591static int __init init_unity_map_range(struct ivmd_header *m) 750static int __init init_unity_map_range(struct ivmd_header *m)
592{ 751{
593 struct unity_map_entry *e = 0; 752 struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
619 return 0; 778 return 0;
620} 779}
621 780
781/* iterates over all memory definitions we find in the ACPI table */
622static int __init init_memory_definitions(struct acpi_table_header *table) 782static int __init init_memory_definitions(struct acpi_table_header *table)
623{ 783{
624 u8 *p = (u8 *)table, *end = (u8 *)table; 784 u8 *p = (u8 *)table, *end = (u8 *)table;
625 struct ivmd_header *m; 785 struct ivmd_header *m;
626 786
627 INIT_LIST_HEAD(&amd_iommu_unity_map);
628
629 end += table->length; 787 end += table->length;
630 p += IVRS_HEADER_LENGTH; 788 p += IVRS_HEADER_LENGTH;
631 789
@@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
642 return 0; 800 return 0;
643} 801}
644 802
803/*
804 * This function finally enables all IOMMUs found in the system after
805 * they have been initialized
806 */
645static void __init enable_iommus(void) 807static void __init enable_iommus(void)
646{ 808{
647 struct amd_iommu *iommu; 809 struct amd_iommu *iommu;
@@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
678 .cls = &amd_iommu_sysdev_class, 840 .cls = &amd_iommu_sysdev_class,
679}; 841};
680 842
843/*
844 * This is the core init function for AMD IOMMU hardware in the system.
845 * This function is called from the generic x86 DMA layer initialization
846 * code.
847 *
848 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
849 * three times:
850 *
851 * 1 pass) Find the highest PCI device id the driver has to handle.
852 * Upon this information the size of the data structures is
853 * determined that needs to be allocated.
854 *
855 * 2 pass) Initialize the data structures just allocated with the
856 * information in the ACPI table about available AMD IOMMUs
857 * in the system. It also maps the PCI devices in the
858 * system to specific IOMMUs
859 *
860 * 3 pass) After the basic data structures are allocated and
861 * initialized we update them with information about memory
862 * remapping requirements parsed out of the ACPI table in
863 * this last pass.
864 *
865 * After that the hardware is initialized and ready to go. In the last
866 * step we do some Linux specific things like registering the driver in
867 * the dma_ops interface and initializing the suspend/resume support
868 * functions. Finally it prints some information about AMD IOMMUs and
869 * the driver state and enables the hardware.
870 */
681int __init amd_iommu_init(void) 871int __init amd_iommu_init(void)
682{ 872{
683 int i, ret = 0; 873 int i, ret = 0;
@@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
699 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 889 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
700 return -ENODEV; 890 return -ENODEV;
701 891
702 dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); 892 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
703 alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); 893 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
704 rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); 894 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
705 895
706 ret = -ENOMEM; 896 ret = -ENOMEM;
707 897
708 /* Device table - directly used by all IOMMUs */ 898 /* Device table - directly used by all IOMMUs */
709 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, 899 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
710 get_order(dev_table_size)); 900 get_order(dev_table_size));
711 if (amd_iommu_dev_table == NULL) 901 if (amd_iommu_dev_table == NULL)
712 goto out; 902 goto out;
@@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
730 * Protection Domain table - maps devices to protection domains 920 * Protection Domain table - maps devices to protection domains
731 * This table has the same size as the rlookup_table 921 * This table has the same size as the rlookup_table
732 */ 922 */
733 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, 923 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
734 get_order(rlookup_table_size)); 924 get_order(rlookup_table_size));
735 if (amd_iommu_pd_table == NULL) 925 if (amd_iommu_pd_table == NULL)
736 goto free; 926 goto free;
737 927
738 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, 928 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
929 GFP_KERNEL | __GFP_ZERO,
739 get_order(MAX_DOMAIN_ID/8)); 930 get_order(MAX_DOMAIN_ID/8));
740 if (amd_iommu_pd_alloc_bitmap == NULL) 931 if (amd_iommu_pd_alloc_bitmap == NULL)
741 goto free; 932 goto free;
742 933
743 /* 934 /*
744 * memory is allocated now; initialize the device table with all zeroes 935 * let all alias entries point to itself
745 * and let all alias entries point to itself
746 */ 936 */
747 memset(amd_iommu_dev_table, 0, dev_table_size);
748 for (i = 0; i < amd_iommu_last_bdf; ++i) 937 for (i = 0; i < amd_iommu_last_bdf; ++i)
749 amd_iommu_alias_table[i] = i; 938 amd_iommu_alias_table[i] = i;
750 939
751 memset(amd_iommu_pd_table, 0, rlookup_table_size);
752 memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
753
754 /* 940 /*
755 * never allocate domain 0 because its used as the non-allocated and 941 * never allocate domain 0 because its used as the non-allocated and
756 * error value placeholder 942 * error value placeholder
@@ -795,24 +981,19 @@ out:
795 return ret; 981 return ret;
796 982
797free: 983free:
798 if (amd_iommu_pd_alloc_bitmap) 984 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
799 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
800 985
801 if (amd_iommu_pd_table) 986 free_pages((unsigned long)amd_iommu_pd_table,
802 free_pages((unsigned long)amd_iommu_pd_table, 987 get_order(rlookup_table_size));
803 get_order(rlookup_table_size));
804 988
805 if (amd_iommu_rlookup_table) 989 free_pages((unsigned long)amd_iommu_rlookup_table,
806 free_pages((unsigned long)amd_iommu_rlookup_table, 990 get_order(rlookup_table_size));
807 get_order(rlookup_table_size));
808 991
809 if (amd_iommu_alias_table) 992 free_pages((unsigned long)amd_iommu_alias_table,
810 free_pages((unsigned long)amd_iommu_alias_table, 993 get_order(alias_table_size));
811 get_order(alias_table_size));
812 994
813 if (amd_iommu_dev_table) 995 free_pages((unsigned long)amd_iommu_dev_table,
814 free_pages((unsigned long)amd_iommu_dev_table, 996 get_order(dev_table_size));
815 get_order(dev_table_size));
816 997
817 free_iommu_all(); 998 free_iommu_all();
818 999
@@ -821,6 +1002,13 @@ free:
821 goto out; 1002 goto out;
822} 1003}
823 1004
1005/****************************************************************************
1006 *
1007 * Early detect code. This code runs at IOMMU detection time in the DMA
1008 * layer. It just looks if there is an IVRS ACPI table to detect AMD
1009 * IOMMUs
1010 *
1011 ****************************************************************************/
824static int __init early_amd_iommu_detect(struct acpi_table_header *table) 1012static int __init early_amd_iommu_detect(struct acpi_table_header *table)
825{ 1013{
826 return 0; 1014 return 0;
@@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
828 1016
829void __init amd_iommu_detect(void) 1017void __init amd_iommu_detect(void)
830{ 1018{
831 if (swiotlb || no_iommu || iommu_detected) 1019 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
832 return; 1020 return;
833 1021
834 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1022 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
841 } 1029 }
842} 1030}
843 1031
1032/****************************************************************************
1033 *
1034 * Parsing functions for the AMD IOMMU specific kernel command line
1035 * options.
1036 *
1037 ****************************************************************************/
1038
844static int __init parse_amd_iommu_options(char *str) 1039static int __init parse_amd_iommu_options(char *str)
845{ 1040{
846 for (; *str; ++str) { 1041 for (; *str; ++str) {
@@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
853 1048
854static int __init parse_amd_iommu_size_options(char *str) 1049static int __init parse_amd_iommu_size_options(char *str)
855{ 1050{
856 for (; *str; ++str) { 1051 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
857 if (strcmp(str, "32M") == 0) 1052
858 amd_iommu_aperture_order = 25; 1053 if ((order > 24) && (order < 31))
859 if (strcmp(str, "64M") == 0) 1054 amd_iommu_aperture_order = order;
860 amd_iommu_aperture_order = 26;
861 if (strcmp(str, "128M") == 0)
862 amd_iommu_aperture_order = 27;
863 if (strcmp(str, "256M") == 0)
864 amd_iommu_aperture_order = 28;
865 if (strcmp(str, "512M") == 0)
866 amd_iommu_aperture_order = 29;
867 if (strcmp(str, "1G") == 0)
868 amd_iommu_aperture_order = 30;
869 }
870 1055
871 return 1; 1056 return 1;
872} 1057}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a5..44e21826db11 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index a437d027f20b..d6c898358371 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
75/* 75/*
76 * Debug level, exported for io_apic.c 76 * Debug level, exported for io_apic.c
77 */ 77 */
78int apic_verbosity; 78unsigned int apic_verbosity;
79 79
80int pic_mode; 80int pic_mode;
81 81
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
177 /* Level triggered for 82489DX */ 177 /* Level triggered for 82489DX */
178 if (!lapic_is_integrated()) 178 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 179 v |= APIC_LVT_LEVEL_TRIGGER;
180 apic_write_around(APIC_LVT0, v); 180 apic_write(APIC_LVT0, v);
181} 181}
182 182
183/** 183/**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 212 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 213 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 214 * call this function only once, with the real, calibrated value.
215 *
216 * We do reads before writes even if unnecessary, to get around the
217 * P5 APIC double write bug.
218 */ 215 */
219static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
220{ 217{
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
229 if (!irqen) 226 if (!irqen)
230 lvtt_value |= APIC_LVT_MASKED; 227 lvtt_value |= APIC_LVT_MASKED;
231 228
232 apic_write_around(APIC_LVTT, lvtt_value); 229 apic_write(APIC_LVTT, lvtt_value);
233 230
234 /* 231 /*
235 * Divide PICLK by 16 232 * Divide PICLK by 16
236 */ 233 */
237 tmp_value = apic_read(APIC_TDCR); 234 tmp_value = apic_read(APIC_TDCR);
238 apic_write_around(APIC_TDCR, (tmp_value 235 apic_write(APIC_TDCR,
239 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
240 | APIC_TDR_DIV_16); 237 APIC_TDR_DIV_16);
241 238
242 if (!oneshot) 239 if (!oneshot)
243 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
244} 241}
245 242
246/* 243/*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
249static int lapic_next_event(unsigned long delta, 246static int lapic_next_event(unsigned long delta,
250 struct clock_event_device *evt) 247 struct clock_event_device *evt)
251{ 248{
252 apic_write_around(APIC_TMICT, delta); 249 apic_write(APIC_TMICT, delta);
253 return 0; 250 return 0;
254} 251}
255 252
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
278 case CLOCK_EVT_MODE_SHUTDOWN: 275 case CLOCK_EVT_MODE_SHUTDOWN:
279 v = apic_read(APIC_LVTT); 276 v = apic_read(APIC_LVTT);
280 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 277 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
281 apic_write_around(APIC_LVTT, v); 278 apic_write(APIC_LVTT, v);
282 break; 279 break;
283 case CLOCK_EVT_MODE_RESUME: 280 case CLOCK_EVT_MODE_RESUME:
284 /* Nothing to do here */ 281 /* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
372 } 369 }
373} 370}
374 371
375/* 372static int __init calibrate_APIC_clock(void)
376 * Setup the boot APIC
377 *
378 * Calibrate and verify the result.
379 */
380void __init setup_boot_APIC_clock(void)
381{ 373{
382 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 374 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
383 const long pm_100ms = PMTMR_TICKS_PER_SEC/10; 375 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
387 long delta, deltapm; 379 long delta, deltapm;
388 int pm_referenced = 0; 380 int pm_referenced = 0;
389 381
390 /*
391 * The local apic timer can be disabled via the kernel
392 * commandline or from the CPU detection code. Register the lapic
393 * timer as a dummy clock event source on SMP systems, so the
394 * broadcast mechanism is used. On UP systems simply ignore it.
395 */
396 if (local_apic_timer_disabled) {
397 /* No broadcast on UP ! */
398 if (num_possible_cpus() > 1) {
399 lapic_clockevent.mult = 1;
400 setup_APIC_timer();
401 }
402 return;
403 }
404
405 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
406 "calibrating APIC timer ...\n");
407
408 local_irq_disable(); 382 local_irq_disable();
409 383
410 /* Replace the global interrupt handler */ 384 /* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
489 calibration_result / (1000000 / HZ), 463 calibration_result / (1000000 / HZ),
490 calibration_result % (1000000 / HZ)); 464 calibration_result % (1000000 / HZ));
491 465
492 local_apic_timer_verify_ok = 1;
493
494 /* 466 /*
495 * Do a sanity check on the APIC calibration result 467 * Do a sanity check on the APIC calibration result
496 */ 468 */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
498 local_irq_enable(); 470 local_irq_enable();
499 printk(KERN_WARNING 471 printk(KERN_WARNING
500 "APIC frequency too slow, disabling apic timer\n"); 472 "APIC frequency too slow, disabling apic timer\n");
501 /* No broadcast on UP ! */ 473 return -1;
502 if (num_possible_cpus() > 1)
503 setup_APIC_timer();
504 return;
505 } 474 }
506 475
476 local_apic_timer_verify_ok = 1;
477
507 /* We trust the pm timer based calibration */ 478 /* We trust the pm timer based calibration */
508 if (!pm_referenced) { 479 if (!pm_referenced) {
509 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 480 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
543 if (!local_apic_timer_verify_ok) { 514 if (!local_apic_timer_verify_ok) {
544 printk(KERN_WARNING 515 printk(KERN_WARNING
545 "APIC timer disabled due to verification failure.\n"); 516 "APIC timer disabled due to verification failure.\n");
517 return -1;
518 }
519
520 return 0;
521}
522
523/*
524 * Setup the boot APIC
525 *
526 * Calibrate and verify the result.
527 */
528void __init setup_boot_APIC_clock(void)
529{
530 /*
531 * The local apic timer can be disabled via the kernel
532 * commandline or from the CPU detection code. Register the lapic
533 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it.
535 */
536 if (local_apic_timer_disabled) {
546 /* No broadcast on UP ! */ 537 /* No broadcast on UP ! */
547 if (num_possible_cpus() == 1) 538 if (num_possible_cpus() > 1) {
548 return; 539 lapic_clockevent.mult = 1;
549 } else { 540 setup_APIC_timer();
550 /* 541 }
551 * If nmi_watchdog is set to IO_APIC, we need the 542 return;
552 * PIT/HPET going. Otherwise register lapic as a dummy
553 * device.
554 */
555 if (nmi_watchdog != NMI_IO_APIC)
556 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
557 else
558 printk(KERN_WARNING "APIC timer registered as dummy,"
559 " due to nmi_watchdog=%d!\n", nmi_watchdog);
560 } 543 }
561 544
545 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
546 "calibrating APIC timer ...\n");
547
548 if (calibrate_APIC_clock()) {
549 /* No broadcast on UP ! */
550 if (num_possible_cpus() > 1)
551 setup_APIC_timer();
552 return;
553 }
554
555 /*
556 * If nmi_watchdog is set to IO_APIC, we need the
557 * PIT/HPET going. Otherwise register lapic as a dummy
558 * device.
559 */
560 if (nmi_watchdog != NMI_IO_APIC)
561 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
562 else
563 printk(KERN_WARNING "APIC timer registered as dummy,"
564 " due to nmi_watchdog=%d!\n", nmi_watchdog);
565
562 /* Setup the lapic or request the broadcast */ 566 /* Setup the lapic or request the broadcast */
563 setup_APIC_timer(); 567 setup_APIC_timer();
564} 568}
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
693 */ 697 */
694 if (maxlvt >= 3) { 698 if (maxlvt >= 3) {
695 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 699 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
696 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 700 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
697 } 701 }
698 /* 702 /*
699 * Careful: we have to set masks only first to deassert 703 * Careful: we have to set masks only first to deassert
700 * any level-triggered sources. 704 * any level-triggered sources.
701 */ 705 */
702 v = apic_read(APIC_LVTT); 706 v = apic_read(APIC_LVTT);
703 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 707 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
704 v = apic_read(APIC_LVT0); 708 v = apic_read(APIC_LVT0);
705 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 709 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
706 v = apic_read(APIC_LVT1); 710 v = apic_read(APIC_LVT1);
707 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 711 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
708 if (maxlvt >= 4) { 712 if (maxlvt >= 4) {
709 v = apic_read(APIC_LVTPC); 713 v = apic_read(APIC_LVTPC);
710 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 714 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
711 } 715 }
712 716
713 /* lets not touch this if we didn't frob it */ 717 /* lets not touch this if we didn't frob it */
714#ifdef CONFIG_X86_MCE_P4THERMAL 718#ifdef CONFIG_X86_MCE_P4THERMAL
715 if (maxlvt >= 5) { 719 if (maxlvt >= 5) {
716 v = apic_read(APIC_LVTTHMR); 720 v = apic_read(APIC_LVTTHMR);
717 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
718 } 722 }
719#endif 723#endif
720 /* 724 /*
721 * Clean APIC state for other OSs: 725 * Clean APIC state for other OSs:
722 */ 726 */
723 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 727 apic_write(APIC_LVTT, APIC_LVT_MASKED);
724 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 728 apic_write(APIC_LVT0, APIC_LVT_MASKED);
725 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 729 apic_write(APIC_LVT1, APIC_LVT_MASKED);
726 if (maxlvt >= 3) 730 if (maxlvt >= 3)
727 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 731 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
728 if (maxlvt >= 4) 732 if (maxlvt >= 4)
729 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 733 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
730 734
731#ifdef CONFIG_X86_MCE_P4THERMAL 735#ifdef CONFIG_X86_MCE_P4THERMAL
732 if (maxlvt >= 5) 736 if (maxlvt >= 5)
733 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); 737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
734#endif 738#endif
735 /* Integrated APIC (!82489DX) ? */ 739 /* Integrated APIC (!82489DX) ? */
736 if (lapic_is_integrated()) { 740 if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
756 */ 760 */
757 value = apic_read(APIC_SPIV); 761 value = apic_read(APIC_SPIV);
758 value &= ~APIC_SPIV_APIC_ENABLED; 762 value &= ~APIC_SPIV_APIC_ENABLED;
759 apic_write_around(APIC_SPIV, value); 763 apic_write(APIC_SPIV, value);
760 764
761 /* 765 /*
762 * When LAPIC was disabled by the BIOS and enabled by the kernel, 766 * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
865 apic_wait_icr_idle(); 869 apic_wait_icr_idle();
866 870
867 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
868 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 872 apic_write(APIC_ICR,
869 | APIC_DM_INIT); 873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
870} 874}
871 875
872/* 876/*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
902 else 906 else
903 value |= APIC_SPIV_FOCUS_DISABLED; 907 value |= APIC_SPIV_FOCUS_DISABLED;
904 value |= SPURIOUS_APIC_VECTOR; 908 value |= SPURIOUS_APIC_VECTOR;
905 apic_write_around(APIC_SPIV, value); 909 apic_write(APIC_SPIV, value);
906 910
907 /* 911 /*
908 * Set up the virtual wire mode. 912 * Set up the virtual wire mode.
909 */ 913 */
910 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 914 apic_write(APIC_LVT0, APIC_DM_EXTINT);
911 value = APIC_DM_NMI; 915 value = APIC_DM_NMI;
912 if (!lapic_is_integrated()) /* 82489DX */ 916 if (!lapic_is_integrated()) /* 82489DX */
913 value |= APIC_LVT_LEVEL_TRIGGER; 917 value |= APIC_LVT_LEVEL_TRIGGER;
914 apic_write_around(APIC_LVT1, value); 918 apic_write(APIC_LVT1, value);
915} 919}
916 920
917static void __cpuinit lapic_setup_esr(void) 921static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
926 930
927 /* enables sending errors */ 931 /* enables sending errors */
928 value = ERROR_APIC_VECTOR; 932 value = ERROR_APIC_VECTOR;
929 apic_write_around(APIC_LVTERR, value); 933 apic_write(APIC_LVTERR, value);
930 /* 934 /*
931 * spec says clear errors after enabling vector. 935 * spec says clear errors after enabling vector.
932 */ 936 */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
989 */ 993 */
990 value = apic_read(APIC_TASKPRI); 994 value = apic_read(APIC_TASKPRI);
991 value &= ~APIC_TPRI_MASK; 995 value &= ~APIC_TPRI_MASK;
992 apic_write_around(APIC_TASKPRI, value); 996 apic_write(APIC_TASKPRI, value);
993 997
994 /* 998 /*
995 * After a crash, we no longer service the interrupts and a pending 999 * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
1047 * Set spurious IRQ vector 1051 * Set spurious IRQ vector
1048 */ 1052 */
1049 value |= SPURIOUS_APIC_VECTOR; 1053 value |= SPURIOUS_APIC_VECTOR;
1050 apic_write_around(APIC_SPIV, value); 1054 apic_write(APIC_SPIV, value);
1051 1055
1052 /* 1056 /*
1053 * Set up LVT0, LVT1: 1057 * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
1069 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1073 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1070 smp_processor_id()); 1074 smp_processor_id());
1071 } 1075 }
1072 apic_write_around(APIC_LVT0, value); 1076 apic_write(APIC_LVT0, value);
1073 1077
1074 /* 1078 /*
1075 * only the BP should see the LINT1 NMI signal, obviously. 1079 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
1080 value = APIC_DM_NMI | APIC_LVT_MASKED; 1084 value = APIC_DM_NMI | APIC_LVT_MASKED;
1081 if (!integrated) /* 82489DX */ 1085 if (!integrated) /* 82489DX */
1082 value |= APIC_LVT_LEVEL_TRIGGER; 1086 value |= APIC_LVT_LEVEL_TRIGGER;
1083 apic_write_around(APIC_LVT1, value); 1087 apic_write(APIC_LVT1, value);
1084} 1088}
1085 1089
1086void __cpuinit end_local_APIC_setup(void) 1090void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
1091 /* Disable the local apic timer */ 1095 /* Disable the local apic timer */
1092 value = apic_read(APIC_LVTT); 1096 value = apic_read(APIC_LVTT);
1093 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1094 apic_write_around(APIC_LVTT, value); 1098 apic_write(APIC_LVTT, value);
1095 1099
1096 setup_apic_nmi_watchdog(NULL); 1100 setup_apic_nmi_watchdog(NULL);
1097 apic_pm_activate(); 1101 apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
1214 1218
1215int __init APIC_init_uniprocessor(void) 1219int __init APIC_init_uniprocessor(void)
1216{ 1220{
1217 if (disable_apic)
1218 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1219
1220 if (!smp_found_config && !cpu_has_apic) 1221 if (!smp_found_config && !cpu_has_apic)
1221 return -1; 1222 return -1;
1222 1223
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1419 value &= ~APIC_VECTOR_MASK; 1420 value &= ~APIC_VECTOR_MASK;
1420 value |= APIC_SPIV_APIC_ENABLED; 1421 value |= APIC_SPIV_APIC_ENABLED;
1421 value |= 0xf; 1422 value |= 0xf;
1422 apic_write_around(APIC_SPIV, value); 1423 apic_write(APIC_SPIV, value);
1423 1424
1424 if (!virt_wire_setup) { 1425 if (!virt_wire_setup) {
1425 /* 1426 /*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1432 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1433 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1434 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1435 apic_write_around(APIC_LVT0, value); 1436 apic_write(APIC_LVT0, value);
1436 } else { 1437 } else {
1437 /* Disable LVT0 */ 1438 /* Disable LVT0 */
1438 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1439 } 1440 }
1440 1441
1441 /* 1442 /*
@@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1449 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1450 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1451 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1452 apic_write_around(APIC_LVT1, value); 1453 apic_write(APIC_LVT1, value);
1453 } 1454 }
1454} 1455}
1455 1456
@@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
1700static int __init parse_nolapic(char *arg) 1701static int __init parse_nolapic(char *arg)
1701{ 1702{
1702 disable_apic = 1; 1703 disable_apic = 1;
1703 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1704 setup_clear_cpu_cap(X86_FEATURE_APIC);
1704 return 0; 1705 return 0;
1705} 1706}
1706early_param("nolapic", parse_nolapic); 1707early_param("nolapic", parse_nolapic);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1e3d32e27c14..7f1f030da7ee 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
54/* 54/*
55 * Debug level, exported for io_apic.c 55 * Debug level, exported for io_apic.c
56 */ 56 */
57int apic_verbosity; 57unsigned int apic_verbosity;
58 58
59/* Have we found an MP table */ 59/* Have we found an MP table */
60int smp_found_config; 60int smp_found_config;
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
314 314
315#define TICK_COUNT 100000000 315#define TICK_COUNT 100000000
316 316
317static void __init calibrate_APIC_clock(void) 317static int __init calibrate_APIC_clock(void)
318{ 318{
319 unsigned apic, apic_start; 319 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start; 320 unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
368 clockevent_delta2ns(0xF, &lapic_clockevent); 368 clockevent_delta2ns(0xF, &lapic_clockevent);
369 369
370 calibration_result = result / HZ; 370 calibration_result = result / HZ;
371
372 /*
373 * Do a sanity check on the APIC calibration result
374 */
375 if (calibration_result < (1000000 / HZ)) {
376 printk(KERN_WARNING
377 "APIC frequency too slow, disabling apic timer\n");
378 return -1;
379 }
380
381 return 0;
371} 382}
372 383
373/* 384/*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
394 } 405 }
395 406
396 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 407 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock(); 408 if (calibrate_APIC_clock()) {
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */ 409 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1) 410 if (num_possible_cpus() > 1)
407 setup_APIC_timer(); 411 setup_APIC_timer();
@@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
1337static __init int setup_disableapic(char *str) 1341static __init int setup_disableapic(char *str)
1338{ 1342{
1339 disable_apic = 1; 1343 disable_apic = 1;
1340 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1344 setup_clear_cpu_cap(X86_FEATURE_APIC);
1341 return 0; 1345 return 0;
1342} 1346}
1343early_param("disableapic", setup_disableapic); 1347early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..aa89387006fe 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..c639bd55391c
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <asm/uv/bios.h>
22
23const char *
24x86_bios_strerror(long status)
25{
26 const char *str;
27 switch (status) {
28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break;
33 }
34 return str;
35}
36
37long
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
39 unsigned long *drift_info)
40{
41 struct uv_bios_retval isrv;
42
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
44 *ticks_per_second = isrv.v0;
45 *drift_info = isrv.v1;
46 return isrv.status;
47}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..cae9cabc3031 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,8 +24,6 @@
24extern void vide(void); 24extern void vide(void);
25__asm__(".align 4\nvide: ret"); 25__asm__(".align 4\nvide: ret");
26 26
27int force_mwait __cpuinitdata;
28
29static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
30{ 28{
31 if (cpuid_eax(0x80000000) >= 0x80000007) { 29 if (cpuid_eax(0x80000000) >= 0x80000007) {
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d4..d1692b2a41ff 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8)) 116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
118} 120}
119 121
120static void __cpuinit init_amd(struct cpuinfo_x86 *c) 122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338f..c9b58a806e85 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -131,13 +131,7 @@ static void __init check_popad(void)
131 * (for due to lack of "invlpg" and working WP on a i386) 131 * (for due to lack of "invlpg" and working WP on a i386)
132 * - In order to run on anything without a TSC, we need to be 132 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486. 133 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine, 134 */
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used) as an
138 * otherwise config implies a properly working local APIC without
139 * the need to do extra reads from the APIC.
140*/
141 135
142static void __init check_config(void) 136static void __init check_config(void)
143{ 137{
@@ -151,21 +145,6 @@ static void __init check_config(void)
151 if (boot_cpu_data.x86 == 3) 145 if (boot_cpu_data.x86 == 3)
152 panic("Kernel requires i486+ for 'invlpg' and other features"); 146 panic("Kernel requires i486+ for 'invlpg' and other features");
153#endif 147#endif
154
155/*
156 * If we were told we had a good local APIC, check for buggy Pentia,
157 * i.e. all B steppings and the C2 stepping of P54C when using their
158 * integrated APIC (see 11AP erratum in "Pentium Processor
159 * Specification Update").
160 */
161#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
163 && cpu_has_apic
164 && boot_cpu_data.x86 == 5
165 && boot_cpu_data.x86_model == 2
166 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
167 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
168#endif
169} 148}
170 149
171 150
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb40..dd6e3f15017e 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -7,15 +7,13 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/kgdb.h> 8#include <linux/kgdb.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/string.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
13#include <linux/module.h>
14#include <linux/percpu.h> 12#include <linux/percpu.h>
15#include <asm/processor.h>
16#include <asm/i387.h> 13#include <asm/i387.h>
17#include <asm/msr.h> 14#include <asm/msr.h>
18#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
19#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
20#include <asm/mtrr.h> 18#include <asm/mtrr.h>
21#include <asm/mce.h> 19#include <asm/mce.h>
@@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
305 c->x86_capability[2] = cpuid_edx(0x80860001); 303 c->x86_capability[2] = cpuid_edx(0x80860001);
306 } 304 }
307 305
308 c->extended_cpuid_level = cpuid_eax(0x80000000);
309 if (c->extended_cpuid_level >= 0x80000007) 306 if (c->extended_cpuid_level >= 0x80000007)
310 c->x86_power = cpuid_edx(0x80000007); 307 c->x86_power = cpuid_edx(0x80000007);
311 308
@@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
316 c->x86_phys_bits = eax & 0xff; 313 c->x86_phys_bits = eax & 0xff;
317 } 314 }
318 315
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 316 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init) 317 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c); 318 cpu_devs[c->x86_vendor]->c_early_init(c);
325 319
326 validate_pat_support(c); 320 validate_pat_support(c);
327
328 /* early_param could clear that, but recall get it set again */
329 if (disable_apic)
330 clear_cpu_cap(c, X86_FEATURE_APIC);
331} 321}
332 322
333/* 323/*
@@ -517,8 +507,7 @@ void pda_init(int cpu)
517} 507}
518 508
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 509char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ] 510 DEBUG_STKSZ] __page_aligned_bss;
521__attribute__((section(".bss.page_aligned")));
522 511
523extern asmlinkage void ignore_sysret(void); 512extern asmlinkage void ignore_sysret(void);
524 513
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1da..b75f2569b8f8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
227 if (cpu_has_bts) 227 if (cpu_has_bts)
228 ds_init_intel(c); 228 ds_init_intel(c);
229 229
230 /*
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239
230#ifdef CONFIG_X86_NUMAQ 240#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable(); 241 numaq_tsc_disable();
232#endif 242#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e8..ff517f0b8cc4 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
780 } 780 }
781 kobject_put(per_cpu(cache_kobject, cpu)); 781 kobject_put(per_cpu(cache_kobject, cpu));
782 cpuid4_cache_sysfs_exit(cpu); 782 cpuid4_cache_sysfs_exit(cpu);
783 break; 783 return retval;
784 } 784 }
785 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 785 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
786 } 786 }
787 if (!retval) 787 cpu_set(cpu, cache_dev_map);
788 cpu_set(cpu, cache_dev_map);
789 788
790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 789 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
791 return retval; 790 return 0;
792} 791}
793 792
794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 793static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bde..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b380..9af89078f7bb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) 877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
878 count++; 878 count++;
879 879
880 printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); 880 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
881 count, start, end);
881 for (i = 0; i < count; i++) { 882 for (i = 0; i < count; i++) {
882 struct early_res *r = &early_res[i]; 883 struct early_res *r = &early_res[i];
883 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, 884 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
1298 } 1299 }
1299} 1300}
1300 1301
1301/*
1302 * Non-standard memory setup can be specified via this quirk:
1303 */
1304char * (*arch_memory_setup_quirk)(void);
1305
1306char *__init default_machine_specific_memory_setup(void) 1302char *__init default_machine_specific_memory_setup(void)
1307{ 1303{
1308 char *who = "BIOS-e820"; 1304 char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
1343 1339
1344char *__init __attribute__((weak)) machine_specific_memory_setup(void) 1340char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1345{ 1341{
1346 if (arch_memory_setup_quirk) { 1342 if (x86_quirks->arch_memory_setup) {
1347 char *who = arch_memory_setup_quirk(); 1343 char *who = x86_quirks->arch_memory_setup();
1348 1344
1349 if (who) 1345 if (who)
1350 return who; 1346 return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
1367 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1363 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1368 e820_print_map(who); 1364 e820_print_map(who);
1369} 1365}
1370
1371#ifdef CONFIG_X86_64
1372int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
1373{
1374 int i;
1375
1376 if (slot < 0 || slot >= e820.nr_map)
1377 return -1;
1378 for (i = slot; i < e820.nr_map; i++) {
1379 if (e820.map[i].type != E820_RAM)
1380 continue;
1381 break;
1382 }
1383 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
1384 return -1;
1385 *addr = e820.map[i].addr;
1386 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
1387 max_pfn << PAGE_SHIFT) - *addr;
1388 return i + 1;
1389}
1390#endif
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc872..4353cf5e6fac 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6bc07f0f1202..cdfd94cc6b14 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -332,7 +332,7 @@ sysenter_past_esp:
332 GET_THREAD_INFO(%ebp) 332 GET_THREAD_INFO(%ebp)
333 333
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 335 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
336 jnz syscall_trace_entry 336 jnz syscall_trace_entry
337 cmpl $(nr_syscalls), %eax 337 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys 338 jae syscall_badsys
@@ -370,7 +370,7 @@ ENTRY(system_call)
370 GET_THREAD_INFO(%ebp) 370 GET_THREAD_INFO(%ebp)
371 # system call tracing in operation / emulation 371 # system call tracing in operation / emulation
372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
373 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 373 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
374 jnz syscall_trace_entry 374 jnz syscall_trace_entry
375 cmpl $(nr_syscalls), %eax 375 cmpl $(nr_syscalls), %eax
376 jae syscall_badsys 376 jae syscall_badsys
@@ -383,10 +383,6 @@ syscall_exit:
383 # setting need_resched or sigpending 383 # setting need_resched or sigpending
384 # between sampling and the iret 384 # between sampling and the iret
385 TRACE_IRQS_OFF 385 TRACE_IRQS_OFF
386 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
387 jz no_singlestep
388 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
389no_singlestep:
390 movl TI_flags(%ebp), %ecx 386 movl TI_flags(%ebp), %ecx
391 testw $_TIF_ALLWORK_MASK, %cx # current->work 387 testw $_TIF_ALLWORK_MASK, %cx # current->work
392 jne syscall_exit_work 388 jne syscall_exit_work
@@ -514,12 +510,8 @@ END(work_pending)
514syscall_trace_entry: 510syscall_trace_entry:
515 movl $-ENOSYS,PT_EAX(%esp) 511 movl $-ENOSYS,PT_EAX(%esp)
516 movl %esp, %eax 512 movl %esp, %eax
517 xorl %edx,%edx 513 call syscall_trace_enter
518 call do_syscall_trace 514 /* What it returned is what we'll actually use. */
519 cmpl $0, %eax
520 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
521 # so must skip actual syscall
522 movl PT_ORIG_EAX(%esp), %eax
523 cmpl $(nr_syscalls), %eax 515 cmpl $(nr_syscalls), %eax
524 jnae syscall_call 516 jnae syscall_call
525 jmp syscall_exit 517 jmp syscall_exit
@@ -528,14 +520,13 @@ END(syscall_trace_entry)
528 # perform syscall exit tracing 520 # perform syscall exit tracing
529 ALIGN 521 ALIGN
530syscall_exit_work: 522syscall_exit_work:
531 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 523 testb $_TIF_WORK_SYSCALL_EXIT, %cl
532 jz work_pending 524 jz work_pending
533 TRACE_IRQS_ON 525 TRACE_IRQS_ON
534 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 526 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
535 # schedule() instead 527 # schedule() instead
536 movl %esp, %eax 528 movl %esp, %eax
537 movl $1, %edx 529 call syscall_trace_leave
538 call do_syscall_trace
539 jmp resume_userspace 530 jmp resume_userspace
540END(syscall_exit_work) 531END(syscall_exit_work)
541 CFI_ENDPROC 532 CFI_ENDPROC
@@ -1024,6 +1015,7 @@ ENDPROC(kernel_thread_helper)
1024ENTRY(xen_sysenter_target) 1015ENTRY(xen_sysenter_target)
1025 RING0_INT_FRAME 1016 RING0_INT_FRAME
1026 addl $5*4, %esp /* remove xen-provided frame */ 1017 addl $5*4, %esp /* remove xen-provided frame */
1018 CFI_ADJUST_CFA_OFFSET -5*4
1027 jmp sysenter_past_esp 1019 jmp sysenter_past_esp
1028 CFI_ENDPROC 1020 CFI_ENDPROC
1029 1021
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..8410e26f4183 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs)
349 movq %rcx,RIP-ARGOFFSET(%rsp) 349 movq %rcx,RIP-ARGOFFSET(%rsp)
350 CFI_REL_OFFSET rip,RIP-ARGOFFSET 350 CFI_REL_OFFSET rip,RIP-ARGOFFSET
351 GET_THREAD_INFO(%rcx) 351 GET_THREAD_INFO(%rcx)
352 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 352 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
353 TI_flags(%rcx)
354 jnz tracesys 353 jnz tracesys
355 cmpq $__NR_syscall_max,%rax 354 cmpq $__NR_syscall_max,%rax
356 ja badsys 355 ja badsys
@@ -430,7 +429,12 @@ tracesys:
430 FIXUP_TOP_OF_STACK %rdi 429 FIXUP_TOP_OF_STACK %rdi
431 movq %rsp,%rdi 430 movq %rsp,%rdi
432 call syscall_trace_enter 431 call syscall_trace_enter
433 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 432 /*
433 * Reload arg registers from stack in case ptrace changed them.
434 * We don't reload %rax because syscall_trace_enter() returned
435 * the value it wants us to use in the table lookup.
436 */
437 LOAD_ARGS ARGOFFSET, 1
434 RESTORE_REST 438 RESTORE_REST
435 cmpq $__NR_syscall_max,%rax 439 cmpq $__NR_syscall_max,%rax
436 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 440 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -483,7 +487,7 @@ int_very_careful:
483 ENABLE_INTERRUPTS(CLBR_NONE) 487 ENABLE_INTERRUPTS(CLBR_NONE)
484 SAVE_REST 488 SAVE_REST
485 /* Check for syscall exit trace */ 489 /* Check for syscall exit trace */
486 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 490 testl $_TIF_WORK_SYSCALL_EXIT,%edx
487 jz int_signal 491 jz int_signal
488 pushq %rdi 492 pushq %rdi
489 CFI_ADJUST_CFA_OFFSET 8 493 CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +495,7 @@ int_very_careful:
491 call syscall_trace_leave 495 call syscall_trace_leave
492 popq %rdi 496 popq %rdi
493 CFI_ADJUST_CFA_OFFSET -8 497 CFI_ADJUST_CFA_OFFSET -8
494 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 498 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
495 jmp int_restore_rest 499 jmp int_restore_rest
496 500
497int_signal: 501int_signal:
@@ -1189,6 +1193,7 @@ END(device_not_available)
1189 /* runs on exception stack */ 1193 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1194KPROBE_ENTRY(debug)
1191 INTR_FRAME 1195 INTR_FRAME
1196 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1197 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1198 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1199 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1203,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1203 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1204KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1205 INTR_FRAME
1206 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1207 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1208 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1209 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1217,7 @@ KPROBE_END(nmi)
1211 1217
1212KPROBE_ENTRY(int3) 1218KPROBE_ENTRY(int3)
1213 INTR_FRAME 1219 INTR_FRAME
1220 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1221 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1222 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1223 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1244,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1244 /* runs on exception stack */
1238ENTRY(double_fault) 1245ENTRY(double_fault)
1239 XCPT_FRAME 1246 XCPT_FRAME
1247 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1248 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1249 jmp paranoid_exit1
1242 CFI_ENDPROC 1250 CFI_ENDPROC
@@ -1253,6 +1261,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1261 /* runs on exception stack */
1254ENTRY(stack_segment) 1262ENTRY(stack_segment)
1255 XCPT_FRAME 1263 XCPT_FRAME
1264 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1265 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1266 jmp paranoid_exit1
1258 CFI_ENDPROC 1267 CFI_ENDPROC
@@ -1278,6 +1287,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1287 /* runs on exception stack */
1279ENTRY(machine_check) 1288ENTRY(machine_check)
1280 INTR_FRAME 1289 INTR_FRAME
1290 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1291 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1292 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1293 paranoidentry do_machine_check
@@ -1312,3 +1322,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1322 sysret
1313 CFI_ENDPROC 1323 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1324ENDPROC(ignore_sysret)
1325
1326#ifdef CONFIG_XEN
1327ENTRY(xen_hypervisor_callback)
1328 zeroentry xen_do_hypervisor_callback
1329END(xen_hypervisor_callback)
1330
1331/*
1332# A note on the "critical region" in our callback handler.
1333# We want to avoid stacking callback handlers due to events occurring
1334# during handling of the last event. To do this, we keep events disabled
1335# until we've done all processing. HOWEVER, we must enable events before
1336# popping the stack frame (can't be done atomically) and so it would still
1337# be possible to get enough handler activations to overflow the stack.
1338# Although unlikely, bugs of that kind are hard to track down, so we'd
1339# like to avoid the possibility.
1340# So, on entry to the handler we detect whether we interrupted an
1341# existing activation in its critical region -- if so, we pop the current
1342# activation and restart the handler using the previous one.
1343*/
1344ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1345 CFI_STARTPROC
1346/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1347 see the correct pointer to the pt_regs */
1348 movq %rdi, %rsp # we don't return, adjust the stack frame
1349 CFI_ENDPROC
1350 CFI_DEFAULT_STACK
135111: incl %gs:pda_irqcount
1352 movq %rsp,%rbp
1353 CFI_DEF_CFA_REGISTER rbp
1354 cmovzq %gs:pda_irqstackptr,%rsp
1355 pushq %rbp # backlink for old unwinder
1356 call xen_evtchn_do_upcall
1357 popq %rsp
1358 CFI_DEF_CFA_REGISTER rsp
1359 decl %gs:pda_irqcount
1360 jmp error_exit
1361 CFI_ENDPROC
1362END(do_hypervisor_callback)
1363
1364/*
1365# Hypervisor uses this for application faults while it executes.
1366# We get here for two reasons:
1367# 1. Fault while reloading DS, ES, FS or GS
1368# 2. Fault while executing IRET
1369# Category 1 we do not need to fix up as Xen has already reloaded all segment
1370# registers that could be reloaded and zeroed the others.
1371# Category 2 we fix up by killing the current process. We cannot use the
1372# normal Linux return path in this case because if we use the IRET hypercall
1373# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1374# We distinguish between categories by comparing each saved segment register
1375# with its current contents: any discrepancy means we in category 1.
1376*/
1377ENTRY(xen_failsafe_callback)
1378 framesz = (RIP-0x30) /* workaround buggy gas */
1379 _frame framesz
1380 CFI_REL_OFFSET rcx, 0
1381 CFI_REL_OFFSET r11, 8
1382 movw %ds,%cx
1383 cmpw %cx,0x10(%rsp)
1384 CFI_REMEMBER_STATE
1385 jne 1f
1386 movw %es,%cx
1387 cmpw %cx,0x18(%rsp)
1388 jne 1f
1389 movw %fs,%cx
1390 cmpw %cx,0x20(%rsp)
1391 jne 1f
1392 movw %gs,%cx
1393 cmpw %cx,0x28(%rsp)
1394 jne 1f
1395 /* All segments match their saved values => Category 2 (Bad IRET). */
1396 movq (%rsp),%rcx
1397 CFI_RESTORE rcx
1398 movq 8(%rsp),%r11
1399 CFI_RESTORE r11
1400 addq $0x30,%rsp
1401 CFI_ADJUST_CFA_OFFSET -0x30
1402 pushq $0
1403 CFI_ADJUST_CFA_OFFSET 8
1404 pushq %r11
1405 CFI_ADJUST_CFA_OFFSET 8
1406 pushq %rcx
1407 CFI_ADJUST_CFA_OFFSET 8
1408 jmp general_protection
1409 CFI_RESTORE_STATE
14101: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1411 movq (%rsp),%rcx
1412 CFI_RESTORE rcx
1413 movq 8(%rsp),%r11
1414 CFI_RESTORE r11
1415 addq $0x30,%rsp
1416 CFI_ADJUST_CFA_OFFSET -0x30
1417 pushq $0
1418 CFI_ADJUST_CFA_OFFSET 8
1419 SAVE_ALL
1420 jmp error_exit
1421 CFI_ENDPROC
1422END(xen_failsafe_callback)
1423
1424#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b06..3c3929340692 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -24,6 +24,7 @@
24#include <asm/pgtable.h> 24#include <asm/pgtable.h>
25#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
27 28
28DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
29EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
40short uv_possible_blades; 41short uv_possible_blades;
41EXPORT_SYMBOL_GPL(uv_possible_blades); 42EXPORT_SYMBOL_GPL(uv_possible_blades);
42 43
44unsigned long sn_rtc_cycles_per_second;
45EXPORT_SYMBOL(sn_rtc_cycles_per_second);
46
43/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 47/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
44 48
45static cpumask_t uv_target_cpus(void) 49static cpumask_t uv_target_cpus(void)
@@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
272 map_high("MMIOH", mmioh.s.base, shift, map_uc); 276 map_high("MMIOH", mmioh.s.base, shift, map_uc);
273} 277}
274 278
279static __init void uv_rtc_init(void)
280{
281 long status, ticks_per_sec, drift;
282
283 status =
284 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
285 &drift);
286 if (status != 0 || ticks_per_sec < 100000) {
287 printk(KERN_WARNING
288 "unable to determine platform RTC clock frequency, "
289 "guessing.\n");
290 /* BIOS gives wrong value for clock freq. so guess */
291 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
292 } else
293 sn_rtc_cycles_per_second = ticks_per_sec;
294}
295
275static __init void uv_system_init(void) 296static __init void uv_system_init(void)
276{ 297{
277 union uvh_si_addr_map_config_u m_n_config; 298 union uvh_si_addr_map_config_u m_n_config;
@@ -326,6 +347,8 @@ static __init void uv_system_init(void)
326 gnode_upper = (((unsigned long)node_id.s.node_id) & 347 gnode_upper = (((unsigned long)node_id.s.node_id) &
327 ~((1 << n_val) - 1)) << m_val; 348 ~((1 << n_val) - 1)) << m_val;
328 349
350 uv_rtc_init();
351
329 for_each_present_cpu(cpu) { 352 for_each_present_cpu(cpu) {
330 nid = cpu_to_node(cpu); 353 nid = cpu_to_node(cpu);
331 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 354 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..1b318e903bf6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
102 109
103 early_printk("Kernel alive\n"); 110 early_printk("Kernel alive\n");
104 111
105 _cpu_pda = __cpu_pda; 112 x86_64_init_pda();
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 early_printk("Kernel really alive\n");
110 115
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..db3280afe886 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 558abf4c796a..de9aa0e3a9c5 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
756 /* 756 /*
757 * Send the IPI. The write to APIC_ICR fires this off. 757 * Send the IPI. The write to APIC_ICR fires this off.
758 */ 758 */
759 apic_write_around(APIC_ICR, cfg); 759 apic_write(APIC_ICR, cfg);
760} 760}
761#endif /* !CONFIG_SMP */ 761#endif /* !CONFIG_SMP */
762 762
@@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
2030 unsigned long v; 2030 unsigned long v;
2031 2031
2032 v = apic_read(APIC_LVT0); 2032 v = apic_read(APIC_LVT0);
2033 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 2033 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2034} 2034}
2035 2035
2036static void unmask_lapic_irq(unsigned int irq) 2036static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
2038 unsigned long v; 2038 unsigned long v;
2039 2039
2040 v = apic_read(APIC_LVT0); 2040 v = apic_read(APIC_LVT0);
2041 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); 2041 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2042} 2042}
2043 2043
2044static struct irq_chip lapic_chip __read_mostly = { 2044static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
2168 * The AEOI mode will finish them in the 8259A 2168 * The AEOI mode will finish them in the 8259A
2169 * automatically. 2169 * automatically.
2170 */ 2170 */
2171 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2171 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2172 init_8259A(1); 2172 init_8259A(1);
2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); 2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2174 2174
@@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
2177 pin2 = ioapic_i8259.pin; 2177 pin2 = ioapic_i8259.pin;
2178 apic2 = ioapic_i8259.apic; 2178 apic2 = ioapic_i8259.apic;
2179 2179
2180 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2180 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2181 vector, apic1, pin1, apic2, pin2); 2181 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2182 vector, apic1, pin1, apic2, pin2);
2182 2183
2183 /* 2184 /*
2184 * Some BIOS writers are clueless and report the ExtINTA 2185 * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
2216 } 2217 }
2217 clear_IO_APIC_pin(apic1, pin1); 2218 clear_IO_APIC_pin(apic1, pin1);
2218 if (!no_pin1) 2219 if (!no_pin1)
2219 printk(KERN_ERR "..MP-BIOS bug: " 2220 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2220 "8254 timer not connected to IO-APIC\n"); 2221 "8254 timer not connected to IO-APIC\n");
2221 2222
2222 printk(KERN_INFO "...trying to set up timer (IRQ0) " 2223 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2223 "through the 8259A ... "); 2224 "(IRQ0) through the 8259A ...\n");
2224 printk("\n..... (found pin %d) ...", pin2); 2225 apic_printk(APIC_QUIET, KERN_INFO
2226 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2225 /* 2227 /*
2226 * legacy devices should be connected to IO APIC #0 2228 * legacy devices should be connected to IO APIC #0
2227 */ 2229 */
@@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
2230 unmask_IO_APIC_irq(0); 2232 unmask_IO_APIC_irq(0);
2231 enable_8259A_irq(0); 2233 enable_8259A_irq(0);
2232 if (timer_irq_works()) { 2234 if (timer_irq_works()) {
2233 printk("works.\n"); 2235 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2234 timer_through_8259 = 1; 2236 timer_through_8259 = 1;
2235 if (nmi_watchdog == NMI_IO_APIC) { 2237 if (nmi_watchdog == NMI_IO_APIC) {
2236 disable_8259A_irq(0); 2238 disable_8259A_irq(0);
@@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
2244 */ 2246 */
2245 disable_8259A_irq(0); 2247 disable_8259A_irq(0);
2246 clear_IO_APIC_pin(apic2, pin2); 2248 clear_IO_APIC_pin(apic2, pin2);
2247 printk(" failed.\n"); 2249 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2248 } 2250 }
2249 2251
2250 if (nmi_watchdog == NMI_IO_APIC) { 2252 if (nmi_watchdog == NMI_IO_APIC) {
2251 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2253 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2254 "through the IO-APIC - disabling NMI Watchdog!\n");
2252 nmi_watchdog = NMI_NONE; 2255 nmi_watchdog = NMI_NONE;
2253 } 2256 }
2254 timer_ack = 0; 2257 timer_ack = 0;
2255 2258
2256 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2259 apic_printk(APIC_QUIET, KERN_INFO
2260 "...trying to set up timer as Virtual Wire IRQ...\n");
2257 2261
2258 lapic_register_intr(0, vector); 2262 lapic_register_intr(0, vector);
2259 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2263 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2260 enable_8259A_irq(0); 2264 enable_8259A_irq(0);
2261 2265
2262 if (timer_irq_works()) { 2266 if (timer_irq_works()) {
2263 printk(" works.\n"); 2267 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2264 goto out; 2268 goto out;
2265 } 2269 }
2266 disable_8259A_irq(0); 2270 disable_8259A_irq(0);
2267 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); 2271 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2268 printk(" failed.\n"); 2272 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2269 2273
2270 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2274 apic_printk(APIC_QUIET, KERN_INFO
2275 "...trying to set up timer as ExtINT IRQ...\n");
2271 2276
2272 init_8259A(0); 2277 init_8259A(0);
2273 make_8259A_irq(0); 2278 make_8259A_irq(0);
2274 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 2279 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2275 2280
2276 unlock_ExtINT_logic(); 2281 unlock_ExtINT_logic();
2277 2282
2278 if (timer_irq_works()) { 2283 if (timer_irq_works()) {
2279 printk(" works.\n"); 2284 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2280 goto out; 2285 goto out;
2281 } 2286 }
2282 printk(" failed :(.\n"); 2287 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2283 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2288 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2284 "report. Then try booting with the 'noapic' option"); 2289 "report. Then try booting with the 'noapic' option.\n");
2285out: 2290out:
2286 local_irq_restore(flags); 2291 local_irq_restore(flags);
2287} 2292}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 6510cde36b35..64a46affd858 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -45,6 +45,7 @@
45#include <asm/proto.h> 45#include <asm/proto.h>
46#include <asm/acpi.h> 46#include <asm/acpi.h>
47#include <asm/dma.h> 47#include <asm/dma.h>
48#include <asm/i8259.h>
48#include <asm/nmi.h> 49#include <asm/nmi.h>
49#include <asm/msidef.h> 50#include <asm/msidef.h>
50#include <asm/hypertransport.h> 51#include <asm/hypertransport.h>
@@ -1696,8 +1697,9 @@ static inline void __init check_timer(void)
1696 pin2 = ioapic_i8259.pin; 1697 pin2 = ioapic_i8259.pin;
1697 apic2 = ioapic_i8259.apic; 1698 apic2 = ioapic_i8259.apic;
1698 1699
1699 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 1700 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1700 cfg->vector, apic1, pin1, apic2, pin2); 1701 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
1702 cfg->vector, apic1, pin1, apic2, pin2);
1701 1703
1702 /* 1704 /*
1703 * Some BIOS writers are clueless and report the ExtINTA 1705 * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1737,13 @@ static inline void __init check_timer(void)
1735 } 1737 }
1736 clear_IO_APIC_pin(apic1, pin1); 1738 clear_IO_APIC_pin(apic1, pin1);
1737 if (!no_pin1) 1739 if (!no_pin1)
1738 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " 1740 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1739 "8254 timer not connected to IO-APIC\n"); 1741 "8254 timer not connected to IO-APIC\n");
1740 1742
1741 apic_printk(APIC_VERBOSE,KERN_INFO 1743 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1742 "...trying to set up timer (IRQ0) " 1744 "(IRQ0) through the 8259A ...\n");
1743 "through the 8259A ... "); 1745 apic_printk(APIC_QUIET, KERN_INFO
1744 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 1746 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1745 apic2, pin2);
1746 /* 1747 /*
1747 * legacy devices should be connected to IO APIC #0 1748 * legacy devices should be connected to IO APIC #0
1748 */ 1749 */
@@ -1751,7 +1752,7 @@ static inline void __init check_timer(void)
1751 unmask_IO_APIC_irq(0); 1752 unmask_IO_APIC_irq(0);
1752 enable_8259A_irq(0); 1753 enable_8259A_irq(0);
1753 if (timer_irq_works()) { 1754 if (timer_irq_works()) {
1754 apic_printk(APIC_VERBOSE," works.\n"); 1755 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1755 timer_through_8259 = 1; 1756 timer_through_8259 = 1;
1756 if (nmi_watchdog == NMI_IO_APIC) { 1757 if (nmi_watchdog == NMI_IO_APIC) {
1757 disable_8259A_irq(0); 1758 disable_8259A_irq(0);
@@ -1765,29 +1766,32 @@ static inline void __init check_timer(void)
1765 */ 1766 */
1766 disable_8259A_irq(0); 1767 disable_8259A_irq(0);
1767 clear_IO_APIC_pin(apic2, pin2); 1768 clear_IO_APIC_pin(apic2, pin2);
1768 apic_printk(APIC_VERBOSE," failed.\n"); 1769 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1769 } 1770 }
1770 1771
1771 if (nmi_watchdog == NMI_IO_APIC) { 1772 if (nmi_watchdog == NMI_IO_APIC) {
1772 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 1773 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
1774 "through the IO-APIC - disabling NMI Watchdog!\n");
1773 nmi_watchdog = NMI_NONE; 1775 nmi_watchdog = NMI_NONE;
1774 } 1776 }
1775 1777
1776 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 1778 apic_printk(APIC_QUIET, KERN_INFO
1779 "...trying to set up timer as Virtual Wire IRQ...\n");
1777 1780
1778 lapic_register_intr(0); 1781 lapic_register_intr(0);
1779 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 1782 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1780 enable_8259A_irq(0); 1783 enable_8259A_irq(0);
1781 1784
1782 if (timer_irq_works()) { 1785 if (timer_irq_works()) {
1783 apic_printk(APIC_VERBOSE," works.\n"); 1786 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1784 goto out; 1787 goto out;
1785 } 1788 }
1786 disable_8259A_irq(0); 1789 disable_8259A_irq(0);
1787 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 1790 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1788 apic_printk(APIC_VERBOSE," failed.\n"); 1791 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1789 1792
1790 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 1793 apic_printk(APIC_QUIET, KERN_INFO
1794 "...trying to set up timer as ExtINT IRQ...\n");
1791 1795
1792 init_8259A(0); 1796 init_8259A(0);
1793 make_8259A_irq(0); 1797 make_8259A_irq(0);
@@ -1796,11 +1800,12 @@ static inline void __init check_timer(void)
1796 unlock_ExtINT_logic(); 1800 unlock_ExtINT_logic();
1797 1801
1798 if (timer_irq_works()) { 1802 if (timer_irq_works()) {
1799 apic_printk(APIC_VERBOSE," works.\n"); 1803 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1800 goto out; 1804 goto out;
1801 } 1805 }
1802 apic_printk(APIC_VERBOSE," failed :(.\n"); 1806 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1803 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 1807 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
1808 "report. Then try booting with the 'noapic' option.\n");
1804out: 1809out:
1805 local_irq_restore(flags); 1810 local_irq_restore(flags);
1806} 1811}
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..1c3a66a67f83 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -103,6 +103,9 @@ void __init io_delay_init(void)
103 103
104static int __init io_delay_param(char *s) 104static int __init io_delay_param(char *s)
105{ 105{
106 if (!s)
107 return -EINVAL;
108
106 if (!strcmp(s, "0x80")) 109 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 110 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 111 else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad9..3f7537b669d3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
70 /* 70 /*
71 * Send the IPI. The write to APIC_ICR fires this off. 71 * Send the IPI. The write to APIC_ICR fires this off.
72 */ 72 */
73 apic_write_around(APIC_ICR, cfg); 73 apic_write(APIC_ICR, cfg);
74} 74}
75 75
76void send_IPI_self(int vector) 76void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
98 * prepare target chip field 98 * prepare target chip field
99 */ 99 */
100 cfg = __prepare_ICR2(mask); 100 cfg = __prepare_ICR2(mask);
101 apic_write_around(APIC_ICR2, cfg); 101 apic_write(APIC_ICR2, cfg);
102 102
103 /* 103 /*
104 * program the ICR 104 * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
108 /* 108 /*
109 * Send the IPI. The write to APIC_ICR fires this off. 109 * Send the IPI. The write to APIC_ICR fires this off.
110 */ 110 */
111 apic_write_around(APIC_ICR, cfg); 111 apic_write(APIC_ICR, cfg);
112} 112}
113 113
114/* 114/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..1cf8c1fcc088 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 85
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 88
92static void call_on_stack(void *func, void *stack) 89static void call_on_stack(void *func, void *stack)
93{ 90{
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..f2d43bc75514 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
209{ 213{
210 int error = 0; 214 int error = 0;
211 215
216 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
217 if (!arch_debugfs_dir)
218 return -ENOMEM;
219
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 220#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 221 error = boot_params_kdebugfs_init();
214#endif 222#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..43c019f85f0d 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 860
861 resume_execution(cur, regs, kcb); 861 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 862 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 863
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 864 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 865 kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..0e867676b5a5 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 150 const Elf_Shdr *sechdrs,
151 struct module *me) 151 struct module *me)
152{ 152{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
154 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 155 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 156
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 157 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 161 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 162 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 163 locks= s;
164 if (!strcmp(".parainstructions", secstrings + s->sh_name))
165 para = s;
163 } 166 }
164 167
165 if (alt) { 168 if (alt) {
@@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 178 tseg, tseg + text->sh_size);
176 } 179 }
177 180
181 if (para) {
182 void *pseg = (void *)para->sh_addr;
183 apply_paravirt(pseg, pseg + para->sh_size);
184 }
185
178 return module_bug_finalize(hdr, sechdrs, me); 186 return module_bug_finalize(hdr, sechdrs, me);
179} 187}
180 188
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c6..6ae005ccaed8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
30#include <asm/setup.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
48 return sum & 0xFF; 49 return sum & 0xFF;
49} 50}
50 51
51#ifdef CONFIG_X86_NUMAQ
52int found_numaq;
53/*
54 * Have to match translation table entries to main table entries by counter
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68
69static int mpc_record;
70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
119#endif
120
121static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 52static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
122{ 53{
123 int apicid; 54 int apicid;
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
127 disabled_cpus++; 58 disabled_cpus++;
128 return; 59 return;
129 } 60 }
130#ifdef CONFIG_X86_NUMAQ 61
131 if (found_numaq) 62 if (x86_quirks->mpc_apic_id)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]); 63 apicid = x86_quirks->mpc_apic_id(m);
133 else 64 else
134 apicid = m->mpc_apicid; 65 apicid = m->mpc_apicid;
135#else 66
136 apicid = m->mpc_apicid;
137#endif
138 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
139 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
140 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
151 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
152 str[6] = 0; 81 str[6] = 0;
153 82
154#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
155 if (found_numaq) 84 x86_quirks->mpc_oem_bus_info(m, str);
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 85 else
157#else 86 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
159#endif
160 87
161#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
162 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
174#endif 101#endif
175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
176#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
177 if (found_numaq) 104 x86_quirks->mpc_oem_pci_bus(m);
178 mpc_oem_pci_bus(m, translation_table[mpc_record]); 105
179#endif
180 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
181#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
316 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
317} 243}
318 244
319#ifdef CONFIG_X86_NUMAQ
320static void __init MP_translation_info(struct mpc_config_translation *m)
321{
322 printk(KERN_INFO
323 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
324 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
325 m->trans_local);
326
327 if (mpc_record >= MAX_MPC_ENTRY)
328 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
329 else
330 translation_table[mpc_record] = m; /* stash this for later */
331 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
332 node_set_online(m->trans_quad);
333}
334
335/*
336 * Read/parse the MPC oem tables
337 */
338
339static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
340 unsigned short oemsize)
341{
342 int count = sizeof(*oemtable); /* the header size */
343 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
344
345 mpc_record = 0;
346 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
347 oemtable);
348 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
349 printk(KERN_WARNING
350 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
351 oemtable->oem_signature[0], oemtable->oem_signature[1],
352 oemtable->oem_signature[2], oemtable->oem_signature[3]);
353 return;
354 }
355 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
356 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
357 return;
358 }
359 while (count < oemtable->oem_length) {
360 switch (*oemptr) {
361 case MP_TRANSLATION:
362 {
363 struct mpc_config_translation *m =
364 (struct mpc_config_translation *)oemptr;
365 MP_translation_info(m);
366 oemptr += sizeof(*m);
367 count += sizeof(*m);
368 ++mpc_record;
369 break;
370 }
371 default:
372 {
373 printk(KERN_WARNING
374 "Unrecognised OEM table entry type! - %d\n",
375 (int)*oemptr);
376 return;
377 }
378 }
379 }
380}
381
382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
383 char *productid)
384{
385 if (strncmp(oem, "IBM NUMA", 8))
386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
390 if (mpc->mpc_oemptr)
391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
392 mpc->mpc_oemsize);
393}
394#endif /* CONFIG_X86_NUMAQ */
395
396/* 245/*
397 * Read/parse the MPC 246 * Read/parse the MPC
398 */ 247 */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
457 } else 306 } else
458 mps_oem_check(mpc, oem, str); 307 mps_oem_check(mpc, oem, str);
459#endif 308#endif
460
461 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
462 if (!acpi_lapic) 310 if (!acpi_lapic)
463 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
465 if (early) 313 if (early)
466 return 1; 314 return 1;
467 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
468 /* 321 /*
469 * Now process the configuration blocks. 322 * Now process the configuration blocks.
470 */ 323 */
471#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
472 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
473#endif 326
474 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
475 switch (*mpt) { 328 switch (*mpt) {
476 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
536 count = mpc->mpc_length; 389 count = mpc->mpc_length;
537 break; 390 break;
538 } 391 }
539#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
540 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
541#endif
542 } 394 }
543 395
544#ifdef CONFIG_X86_GENERICARCH 396#ifdef CONFIG_X86_GENERICARCH
@@ -726,20 +578,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 578static struct intel_mp_floating *mpf_found;
727 579
728/* 580/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
735 * Scan the memory blocks for an SMP configuration block. 581 * Scan the memory blocks for an SMP configuration block.
736 */ 582 */
737static void __init __get_smp_config(unsigned int early) 583static void __init __get_smp_config(unsigned int early)
738{ 584{
739 struct intel_mp_floating *mpf = mpf_found; 585 struct intel_mp_floating *mpf = mpf_found;
740 586
741 if (mach_get_smp_config_quirk) { 587 if (x86_quirks->mach_get_smp_config) {
742 if (mach_get_smp_config_quirk(early)) 588 if (x86_quirks->mach_get_smp_config(early))
743 return; 589 return;
744 } 590 }
745 if (acpi_lapic && early) 591 if (acpi_lapic && early)
@@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
899 return 0; 745 return 0;
900} 746}
901 747
902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve) 748static void __init __find_smp_config(unsigned int reserve)
905{ 749{
906 unsigned int address; 750 unsigned int address;
907 751
908 if (mach_find_smp_config_quirk) { 752 if (x86_quirks->mach_find_smp_config) {
909 if (mach_find_smp_config_quirk(reserve)) 753 if (x86_quirks->mach_find_smp_config(reserve))
910 return; 754 return;
911 } 755 }
912 /* 756 /*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad0..ac6d51222e7d 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
263 263
264static void __acpi_nmi_enable(void *__unused) 264static void __acpi_nmi_enable(void *__unused)
265{ 265{
266 apic_write_around(APIC_LVT0, APIC_DM_NMI); 266 apic_write(APIC_LVT0, APIC_DM_NMI);
267} 267}
268 268
269/* 269/*
@@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
277 277
278static void __acpi_nmi_disable(void *__unused) 278static void __acpi_nmi_disable(void *__unused)
279{ 279{
280 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); 280 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
281} 281}
282 282
283/* 283/*
@@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
448 448
449#ifdef CONFIG_SYSCTL 449#ifdef CONFIG_SYSCTL
450 450
451static int __init setup_unknown_nmi_panic(char *str)
452{
453 unknown_nmi_panic = 1;
454 return 1;
455}
456__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
457
451static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 458static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
452{ 459{
453 unsigned char reason = get_nmi_reason(); 460 unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9ac..b8c45610b20a 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h>
36 37
37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
38 39
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
71 } 72 }
72} 73}
73 74
75
76void __init numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL,
235 .arch_pre_intr_init = NULL,
236 .arch_memory_setup = NULL,
237 .arch_intr_init = NULL,
238 .arch_trap_init = NULL,
239 .mach_get_smp_config = NULL,
240 .mach_find_smp_config = NULL,
241 .mpc_record = &mpc_record,
242 .mpc_apic_id = mpc_apic_id,
243 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem,
246};
247
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
249 char *productid)
250{
251 if (strncmp(oem, "IBM NUMA", 8))
252 printk("Warning! Not a NUMA-Q system!\n");
253 else
254 found_numaq = 1;
255}
256
74static __init void early_check_numaq(void) 257static __init void early_check_numaq(void)
75{ 258{
76 /* 259 /*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
82 */ 265 */
83 if (smp_found_config) 266 if (smp_found_config)
84 early_get_smp_config(); 267 early_get_smp_config();
268
269 if (found_numaq)
270 x86_quirks = &numaq_x86_quirks;
85} 271}
86 272
87int __init get_memcfg_numaq(void) 273int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
92 smp_dump_qct(); 278 smp_dump_qct();
93 return 1; 279 return 1;
94} 280}
95
96void __init numaq_tsc_disable(void)
97{
98 if (!found_numaq)
99 return;
100
101 if (num_online_nodes() > 1) {
102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
103 setup_clear_cpu_cap(X86_FEATURE_TSC);
104 }
105}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..097d8a6797fa 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
123 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
124 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
125 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
126 }; 128 };
127 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
128} 130}
@@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
266 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
267} 269}
268 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
269struct pv_info pv_info = { 282struct pv_info pv_info = {
270 .name = "bare hardware", 283 .name = "bare hardware",
271 .paravirt_enabled = 0, 284 .paravirt_enabled = 0,
@@ -361,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = {
361struct pv_apic_ops pv_apic_ops = { 374struct pv_apic_ops pv_apic_ops = {
362#ifdef CONFIG_X86_LOCAL_APIC 375#ifdef CONFIG_X86_LOCAL_APIC
363 .apic_write = native_apic_write, 376 .apic_write = native_apic_write,
364 .apic_write_atomic = native_apic_write_atomic,
365 .apic_read = native_apic_read, 377 .apic_read = native_apic_read,
366 .setup_boot_clock = setup_boot_APIC_clock, 378 .setup_boot_clock = setup_boot_APIC_clock,
367 .setup_secondary_clock = setup_secondary_APIC_clock, 379 .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 385#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 386 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 387 .pagetable_setup_done = native_pagetable_setup_done,
388#else
389 .pagetable_setup_start = paravirt_nop,
390 .pagetable_setup_done = paravirt_nop,
376#endif 391#endif
377 392
378 .read_cr2 = native_read_cr2, 393 .read_cr2 = native_read_cr2,
@@ -446,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = {
446 .set_fixmap = native_set_fixmap, 461 .set_fixmap = native_set_fixmap,
447}; 462};
448 463
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
473};
474EXPORT_SYMBOL_GPL(pv_lock_ops);
475
449EXPORT_SYMBOL_GPL(pv_time_ops); 476EXPORT_SYMBOL_GPL(pv_time_ops);
450EXPORT_SYMBOL (pv_cpu_ops); 477EXPORT_SYMBOL (pv_cpu_ops);
451EXPORT_SYMBOL (pv_mmu_ops); 478EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df4..151f2d171f7c 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -36,7 +36,7 @@
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/scatterlist.h> 37#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 38#include <linux/iommu-helper.h>
39#include <asm/gart.h> 39#include <asm/iommu.h>
40#include <asm/calgary.h> 40#include <asm/calgary.h>
41#include <asm/tce.h> 41#include <asm/tce.h>
42#include <asm/pci-direct.h> 42#include <asm/pci-direct.h>
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8467ec2320f1..a4213c00dffc 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,12 +5,11 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
13EXPORT_SYMBOL(forbid_dac);
14 13
15const struct dma_mapping_ops *dma_ops; 14const struct dma_mapping_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
@@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
114 * The order of these functions is important for 113 * The order of these functions is important for
115 * fall-back/fail-over reasons 114 * fall-back/fail-over reasons
116 */ 115 */
117#ifdef CONFIG_GART_IOMMU
118 gart_iommu_hole_init(); 116 gart_iommu_hole_init();
119#endif
120 117
121#ifdef CONFIG_CALGARY_IOMMU
122 detect_calgary(); 118 detect_calgary();
123#endif
124 119
125 detect_intel_iommu(); 120 detect_intel_iommu();
126 121
127 amd_iommu_detect(); 122 amd_iommu_detect();
128 123
129#ifdef CONFIG_SWIOTLB
130 pci_swiotlb_init(); 124 pci_swiotlb_init();
131#endif
132} 125}
133#endif 126#endif
134 127
@@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
184 swiotlb = 1; 177 swiotlb = 1;
185#endif 178#endif
186 179
187#ifdef CONFIG_GART_IOMMU
188 gart_parse_options(p); 180 gart_parse_options(p);
189#endif
190 181
191#ifdef CONFIG_CALGARY_IOMMU 182#ifdef CONFIG_CALGARY_IOMMU
192 if (!strncmp(p, "calgary", 7)) 183 if (!strncmp(p, "calgary", 7))
@@ -500,17 +491,13 @@ EXPORT_SYMBOL(dma_free_coherent);
500 491
501static int __init pci_iommu_init(void) 492static int __init pci_iommu_init(void)
502{ 493{
503#ifdef CONFIG_CALGARY_IOMMU
504 calgary_iommu_init(); 494 calgary_iommu_init();
505#endif
506 495
507 intel_iommu_init(); 496 intel_iommu_init();
508 497
509 amd_iommu_init(); 498 amd_iommu_init();
510 499
511#ifdef CONFIG_GART_IOMMU
512 gart_iommu_init(); 500 gart_iommu_init();
513#endif
514 501
515 no_iommu_init(); 502 no_iommu_init();
516 return 0; 503 return 0;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d18..be60961f8695 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -32,6 +32,7 @@
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..792b9179eff3 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,7 +7,7 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04d..20df839b9c20 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f8..7fc4d5b0a6a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,6 +15,7 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
18 19
19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
20{ 21{
@@ -199,6 +200,7 @@ static void poll_idle(void)
199 * 200 *
200 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
201 */ 202 */
203static int __cpuinitdata force_mwait;
202 204
203#define MWAIT_INFO 0x05 205#define MWAIT_INFO 0x05
204#define MWAIT_ECX_EXTENDED_INFO 0x01 206#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
326 328
327static int __init idle_setup(char *str) 329static int __init idle_setup(char *str)
328{ 330{
331 if (!str)
332 return -EINVAL;
333
329 if (!strcmp(str, "poll")) { 334 if (!strcmp(str, "poll")) {
330 printk("using polling idle threads.\n"); 335 printk("using polling idle threads.\n");
331 pm_idle = poll_idle; 336 pm_idle = poll_idle;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..e8a8e1b99817 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 537struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 538__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 539{
540 struct thread_struct *prev = &prev_p->thread, 540 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 541 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 542 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 543 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 544 unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 586
587 /* 587 /*
588 * Switch FS and GS. 588 * Switch FS and GS.
589 *
590 * Segment register != 0 always requires a reload. Also
591 * reload when it has changed. When prev process used 64bit
592 * base always reload to avoid an information leak.
589 */ 593 */
590 { 594 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 595 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 596 /*
593 when prev process used 64bit base always reload 597 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 598 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 599 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 600 */
597 /* check if the user used a selector != 0 601 if (fsindex)
598 * if yes clear 64bit base, since overloaded base
599 * is always mapped to the Null selector
600 */
601 if (fsindex)
602 prev->fs = 0; 602 prev->fs = 0;
603 } 603 }
604 /* when next process has a 64bit base use it */ 604 /* when next process has a 64bit base use it */
605 if (next->fs) 605 if (next->fs)
606 wrmsrl(MSR_FS_BASE, next->fs); 606 wrmsrl(MSR_FS_BASE, next->fs);
607 prev->fsindex = fsindex; 607 prev->fsindex = fsindex;
608 608
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 610 load_gs_index(next->gsindex);
611 if (gsindex) 611 if (gsindex)
612 prev->gs = 0; 612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
618 617
619 /* Must be after DS reload */ 618 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 619 unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 write_pda(pcurrent, next_p); 626 write_pda(pcurrent, next_p);
628 627
629 write_pda(kernelstack, 628 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 629 (unsigned long)task_stack_page(next_p) +
630 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 632 write_pda(stack_canary, next_p->stack_canary);
633 /* 633 /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e1..e37dccce85db 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1357#endif
1358} 1358}
1359 1359
1360#ifdef CONFIG_X86_32
1361
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1361{
1364 struct siginfo info; 1362 struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1375 force_sig_info(SIGTRAP, &info, tsk);
1378} 1376}
1379 1377
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435
1436 /*
1437 * this isn't the same as continuing with a signal, but it will do
1438 * for normal use. strace only continues with a signal if the
1439 * stopping signal is not SIGTRAP. -brl
1440 */
1441 if (current->exit_code) {
1442 send_sig(current->exit_code, current, 1);
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460
1461static void syscall_trace(struct pt_regs *regs) 1378static void syscall_trace(struct pt_regs *regs)
1462{ 1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1463 1382
1464#if 0 1383#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", 1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
1481 } 1400 }
1482} 1401}
1483 1402
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs) 1403#ifdef CONFIG_X86_32
1404# define IS_IA32 1
1405#elif defined CONFIG_IA32_EMULATION
1406# define IS_IA32 test_thread_flag(TIF_IA32)
1407#else
1408# define IS_IA32 0
1409#endif
1410
1411/*
1412 * We must return the syscall number to actually look up in the table.
1413 * This can be -1L to skip running any syscall at all.
1414 */
1415asmregparm long syscall_trace_enter(struct pt_regs *regs)
1485{ 1416{
1417 long ret = 0;
1418
1419 /*
1420 * If we stepped into a sysenter/syscall insn, it trapped in
1421 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1422 * If user-mode had set TF itself, then it's still clear from
1423 * do_debug() and we need to set it again to restore the user
1424 * state. If we entered on the slow path, TF was already set.
1425 */
1426 if (test_thread_flag(TIF_SINGLESTEP))
1427 regs->flags |= X86_EFLAGS_TF;
1428
1486 /* do the secure computing check first */ 1429 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1430 secure_computing(regs->orig_ax);
1488 1431
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1433 ret = -1L;
1434
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
1491 syscall_trace(regs); 1436 syscall_trace(regs);
1492 1437
1493 if (unlikely(current->audit_context)) { 1438 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1439 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1440 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1441 regs->orig_ax,
1497 regs->bx, regs->cx, 1442 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1443 regs->dx, regs->si);
1499 } else { 1444#ifdef CONFIG_X86_64
1445 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1446 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1447 regs->orig_ax,
1502 regs->di, regs->si, 1448 regs->di, regs->si,
1503 regs->dx, regs->r10); 1449 regs->dx, regs->r10);
1504 } 1450#endif
1505 } 1451 }
1452
1453 return ret ?: regs->orig_ax;
1506} 1454}
1507 1455
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1456asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1457{
1510 if (unlikely(current->audit_context)) 1458 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1460
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1461 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP))
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs); 1462 syscall_trace(regs);
1517}
1518 1463
1519#endif /* CONFIG_X86_32 */ 1464 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of
1466 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1467 * We already reported this syscall instruction in
1468 * syscall_trace_enter(), so don't do any more now.
1469 */
1470 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1471 return;
1472
1473 /*
1474 * If we are single-stepping, synthesize a trap to follow the
1475 * system call instruction.
1476 */
1477 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED))
1479 send_sigtrap(current, regs, 0);
1480}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e151..9dcf39c02972 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 178 },
179 }, 179 },
180 { /* Handle problems with rebooting on Dell T5400's */
181 .callback = set_bios_reboot,
182 .ident = "Dell Precision T5400",
183 .matches = {
184 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
185 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
186 },
187 },
180 { /* Handle problems with rebooting on HP laptops */ 188 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 189 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 190 .ident = "HP Compaq Laptop",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..ec952aa5394a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/user.h> 58#include <linux/user.h>
59#include <linux/delay.h> 59#include <linux/delay.h>
60#include <linux/highmem.h>
61 60
62#include <linux/kallsyms.h> 61#include <linux/kallsyms.h>
63#include <linux/edd.h>
64#include <linux/iscsi_ibft.h>
65#include <linux/kexec.h>
66#include <linux/cpufreq.h> 62#include <linux/cpufreq.h>
67#include <linux/dma-mapping.h> 63#include <linux/dma-mapping.h>
68#include <linux/ctype.h> 64#include <linux/ctype.h>
@@ -96,7 +92,7 @@
96#include <asm/smp.h> 92#include <asm/smp.h>
97#include <asm/desc.h> 93#include <asm/desc.h>
98#include <asm/dma.h> 94#include <asm/dma.h>
99#include <asm/gart.h> 95#include <asm/iommu.h>
100#include <asm/mmu_context.h> 96#include <asm/mmu_context.h>
101#include <asm/proto.h> 97#include <asm/proto.h>
102 98
@@ -104,7 +100,6 @@
104#include <asm/paravirt.h> 100#include <asm/paravirt.h>
105 101
106#include <asm/percpu.h> 102#include <asm/percpu.h>
107#include <asm/sections.h>
108#include <asm/topology.h> 103#include <asm/topology.h>
109#include <asm/apicdef.h> 104#include <asm/apicdef.h>
110#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
579early_param("elfcorehdr", setup_elfcorehdr); 574early_param("elfcorehdr", setup_elfcorehdr);
580#endif 575#endif
581 576
577static struct x86_quirks default_x86_quirks __initdata;
578
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580
582/* 581/*
583 * Determine if we were loaded by an EFI loader. If so, then we have also been 582 * Determine if we were loaded by an EFI loader. If so, then we have also been
584 * passed the efi memmap, systab, etc., so we should use these data structures 583 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -824,7 +823,10 @@ void __init setup_arch(char **cmdline_p)
824 vmi_init(); 823 vmi_init();
825#endif 824#endif
826 825
826 paravirt_pagetable_setup_start(swapper_pg_dir);
827 paging_init(); 827 paging_init();
828 paravirt_pagetable_setup_done(swapper_pg_dir);
829 paravirt_post_allocator_init();
828 830
829#ifdef CONFIG_X86_64 831#ifdef CONFIG_X86_64
830 map_vsyscall(); 832 map_vsyscall();
@@ -854,14 +856,6 @@ void __init setup_arch(char **cmdline_p)
854 init_cpu_to_node(); 856 init_cpu_to_node();
855#endif 857#endif
856 858
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
865 init_apic_mappings(); 859 init_apic_mappings();
866 ioapic_init_mappings(); 860 ioapic_init_mappings();
867 861
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..07faaa5109cb 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 212
213badframe: 213badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 214 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 215 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 216 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 218 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs)
657void 657void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 659{
660 /* Pending single-step? */
661 if (thread_info_flags & _TIF_SINGLESTEP) {
662 regs->flags |= X86_EFLAGS_TF;
663 clear_thread_flag(TIF_SINGLESTEP);
664 }
665
666 /* deal with pending signal delivery */ 660 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 662 do_signal(regs);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..bf87684474f1 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs)
487void do_notify_resume(struct pt_regs *regs, void *unused, 487void do_notify_resume(struct pt_regs *regs, void *unused,
488 __u32 thread_info_flags) 488 __u32 thread_info_flags)
489{ 489{
490 /* Pending single-step? */
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE 490#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 491 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 492 if (thread_info_flags & _TIF_MCE_NOTIFY)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..27640196eb7c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 546 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 547 "a previous APIC delivery may have failed\n");
548 548
549 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 549 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
550 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 550 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 551
552 timeout = 0; 552 timeout = 0;
553 do { 553 do {
@@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 579 int maxlvt;
580 580
581 /* Target chip */ 581 /* Target chip */
582 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 582 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583 583
584 /* Boot on the stack */ 584 /* Boot on the stack */
585 /* Kick the second */ 585 /* Kick the second */
586 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 586 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
587 587
588 Dprintk("Waiting for send to finish...\n"); 588 Dprintk("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 589 send_status = safe_apic_wait_icr_idle();
@@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
592 * Give the other CPU some time to accept the IPI. 592 * Give the other CPU some time to accept the IPI.
593 */ 593 */
594 udelay(200); 594 udelay(200);
595 /*
596 * Due to the Pentium erratum 3AP.
597 */
598 maxlvt = lapic_get_maxlvt(); 595 maxlvt = lapic_get_maxlvt();
599 if (maxlvt > 3) { 596 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
600 apic_read_around(APIC_SPIV);
601 apic_write(APIC_ESR, 0); 597 apic_write(APIC_ESR, 0);
602 }
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 598 accept_status = (apic_read(APIC_ESR) & 0xEF);
604 Dprintk("NMI sent.\n"); 599 Dprintk("NMI sent.\n");
605 600
@@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
625 return send_status; 620 return send_status;
626 } 621 }
627 622
623 maxlvt = lapic_get_maxlvt();
624
628 /* 625 /*
629 * Be paranoid about clearing APIC errors. 626 * Be paranoid about clearing APIC errors.
630 */ 627 */
631 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 628 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
632 apic_read_around(APIC_SPIV); 629 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
633 apic_write(APIC_ESR, 0); 630 apic_write(APIC_ESR, 0);
634 apic_read(APIC_ESR); 631 apic_read(APIC_ESR);
635 } 632 }
636 633
@@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
639 /* 636 /*
640 * Turn INIT on target chip 637 * Turn INIT on target chip
641 */ 638 */
642 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 639 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643 640
644 /* 641 /*
645 * Send IPI 642 * Send IPI
646 */ 643 */
647 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 644 apic_write(APIC_ICR,
648 | APIC_DM_INIT); 645 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
649 646
650 Dprintk("Waiting for send to finish...\n"); 647 Dprintk("Waiting for send to finish...\n");
651 send_status = safe_apic_wait_icr_idle(); 648 send_status = safe_apic_wait_icr_idle();
@@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
655 Dprintk("Deasserting INIT.\n"); 652 Dprintk("Deasserting INIT.\n");
656 653
657 /* Target chip */ 654 /* Target chip */
658 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 655 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
659 656
660 /* Send IPI */ 657 /* Send IPI */
661 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 658 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
662 659
663 Dprintk("Waiting for send to finish...\n"); 660 Dprintk("Waiting for send to finish...\n");
664 send_status = safe_apic_wait_icr_idle(); 661 send_status = safe_apic_wait_icr_idle();
@@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
689 */ 686 */
690 Dprintk("#startup loops: %d.\n", num_starts); 687 Dprintk("#startup loops: %d.\n", num_starts);
691 688
692 maxlvt = lapic_get_maxlvt();
693
694 for (j = 1; j <= num_starts; j++) { 689 for (j = 1; j <= num_starts; j++) {
695 Dprintk("Sending STARTUP #%d.\n", j); 690 Dprintk("Sending STARTUP #%d.\n", j);
696 apic_read_around(APIC_SPIV); 691 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
697 apic_write(APIC_ESR, 0); 692 apic_write(APIC_ESR, 0);
698 apic_read(APIC_ESR); 693 apic_read(APIC_ESR);
699 Dprintk("After apic_write.\n"); 694 Dprintk("After apic_write.\n");
700 695
@@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
703 */ 698 */
704 699
705 /* Target chip */ 700 /* Target chip */
706 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 701 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707 702
708 /* Boot on the stack */ 703 /* Boot on the stack */
709 /* Kick the second */ 704 /* Kick the second */
710 apic_write_around(APIC_ICR, APIC_DM_STARTUP 705 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
711 | (start_eip >> 12));
712 706
713 /* 707 /*
714 * Give the other CPU some time to accept the IPI. 708 * Give the other CPU some time to accept the IPI.
@@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
724 * Give the other CPU some time to accept the IPI. 718 * Give the other CPU some time to accept the IPI.
725 */ 719 */
726 udelay(200); 720 udelay(200);
727 /* 721 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
728 * Due to the Pentium erratum 3AP.
729 */
730 if (maxlvt > 3) {
731 apic_read_around(APIC_SPIV);
732 apic_write(APIC_ESR, 0); 722 apic_write(APIC_ESR, 0);
733 }
734 accept_status = (apic_read(APIC_ESR) & 0xEF); 723 accept_status = (apic_read(APIC_ESR) & 0xEF);
735 if (send_status || accept_status) 724 if (send_status || accept_status)
736 break; 725 break;
@@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
768 * 757 *
769 * Must be called after the _cpu_pda pointer table is initialized. 758 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 759 */
771static int __cpuinit get_local_pda(int cpu) 760int __cpuinit get_local_pda(int cpu)
772{ 761{
773 struct x8664_pda *oldpda, *newpda; 762 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 763 unsigned long size = sizeof(struct x8664_pda);
@@ -1311,7 +1300,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1311 cpu_clear(cpu, cpu_callout_map); 1300 cpu_clear(cpu, cpu_callout_map);
1312 cpu_clear(cpu, cpu_callin_map); 1301 cpu_clear(cpu, cpu_callin_map);
1313 /* was set by cpu_init() */ 1302 /* was set by cpu_init() */
1314 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1303 cpu_clear(cpu, cpu_initialized);
1315 numa_remove_cpu(cpu); 1304 numa_remove_cpu(cpu);
1316} 1305}
1317 1306
@@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
1390{ 1379{
1391 extern unsigned int maxcpus; 1380 extern unsigned int maxcpus;
1392 1381
1393 maxcpus = simple_strtoul(arg, NULL, 0); 1382 if (arg)
1383 maxcpus = simple_strtoul(arg, NULL, 0);
1394 return 0; 1384 return 0;
1395} 1385}
1396early_param("maxcpus", parse_maxcpus); 1386early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b4..ffe3c664afc0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
129 */ 129 */
130void __init time_init(void) 130void __init time_init(void)
131{ 131{
132 pre_time_init_hook();
132 tsc_init(); 133 tsc_init();
133 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
134} 135}
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 8a768973c4f0..03df8e45e5a1 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -58,6 +58,7 @@
58#include <asm/nmi.h> 58#include <asm/nmi.h>
59#include <asm/smp.h> 59#include <asm/smp.h>
60#include <asm/io.h> 60#include <asm/io.h>
61#include <asm/traps.h>
61 62
62#include "mach_traps.h" 63#include "mach_traps.h"
63 64
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
77gate_desc idt_table[256] 78gate_desc idt_table[256]
78 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
79 80
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int panic_on_unrecovered_nmi; 81int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24; 82int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 83static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
256 237
257static void 238static void
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl) 240 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 241{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
262 printk("%s =======================\n", log_lvl); 243 printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
383 return ud2 == 0x0b0f; 364 return ud2 == 0x0b0f;
384} 365}
385 366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
386int __kprobes __die(const char *str, struct pt_regs *regs, long err) 415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
387{ 416{
388 unsigned short ss; 417 unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
423 */ 452 */
424void die(const char *str, struct pt_regs *regs, long err) 453void die(const char *str, struct pt_regs *regs, long err)
425{ 454{
426 static struct { 455 unsigned long flags = oops_begin();
427 raw_spinlock_t lock;
428 u32 lock_owner;
429 int lock_owner_depth;
430 } die = {
431 .lock = __RAW_SPIN_LOCK_UNLOCKED,
432 .lock_owner = -1,
433 .lock_owner_depth = 0
434 };
435 unsigned long flags;
436
437 oops_enter();
438
439 if (die.lock_owner != raw_smp_processor_id()) {
440 console_verbose();
441 raw_local_irq_save(flags);
442 __raw_spin_lock(&die.lock);
443 die.lock_owner = smp_processor_id();
444 die.lock_owner_depth = 0;
445 bust_spinlocks(1);
446 } else {
447 raw_local_irq_save(flags);
448 }
449 456
450 if (++die.lock_owner_depth < 3) { 457 if (die_nest_count < 3) {
451 report_bug(regs->ip, regs); 458 report_bug(regs->ip, regs);
452 459
453 if (__die(str, regs, err)) 460 if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
456 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
457 } 464 }
458 465
459 bust_spinlocks(0); 466 oops_end(flags, regs, SIGSEGV);
460 die.lock_owner = -1;
461 add_taint(TAINT_DIE);
462 __raw_spin_unlock(&die.lock);
463 raw_local_irq_restore(flags);
464
465 if (!regs)
466 return;
467
468 if (kexec_should_crash(current))
469 crash_kexec(regs);
470
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473
474 if (panic_on_oops)
475 panic("Fatal exception");
476
477 oops_exit();
478 do_exit(SIGSEGV);
479} 467}
480 468
481static inline void 469static inline void
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 2696a6837782..3f18d73f420c 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -51,30 +51,10 @@
51#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
53#include <asm/pda.h> 53#include <asm/pda.h>
54#include <asm/traps.h>
54 55
55#include <mach_traps.h> 56#include <mach_traps.h>
56 57
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void);
75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77
78int panic_on_unrecovered_nmi; 58int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12; 59int kstack_depth_to_print = 12;
80static unsigned int code_bytes = 64; 60static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
355 .address = print_trace_address, 335 .address = print_trace_address,
356}; 336};
357 337
358void show_trace(struct task_struct *task, struct pt_regs *regs, 338static void
359 unsigned long *stack, unsigned long bp) 339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl)
360{ 341{
361 printk("\nCall Trace:\n"); 342 printk("\nCall Trace:\n");
362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); 343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
363 printk("\n"); 344 printk("\n");
364} 345}
365 346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
366static void 353static void
367_show_stack(struct task_struct *task, struct pt_regs *regs, 354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
368 unsigned long *sp, unsigned long bp) 355 unsigned long *sp, unsigned long bp, char *log_lvl)
369{ 356{
370 unsigned long *stack; 357 unsigned long *stack;
371 int i; 358 int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
399 printk(" %016lx", *stack++); 386 printk(" %016lx", *stack++);
400 touch_nmi_watchdog(); 387 touch_nmi_watchdog();
401 } 388 }
402 show_trace(task, regs, sp, bp); 389 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
403} 390}
404 391
405void show_stack(struct task_struct *task, unsigned long *sp) 392void show_stack(struct task_struct *task, unsigned long *sp)
406{ 393{
407 _show_stack(task, NULL, sp, 0); 394 show_stack_log_lvl(task, NULL, sp, 0, "");
408} 395}
409 396
410/* 397/*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
454 u8 *ip; 441 u8 *ip;
455 442
456 printk("Stack: "); 443 printk("Stack: ");
457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); 444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, "");
458 printk("\n"); 446 printk("\n");
459 447
460 printk(KERN_EMERG "Code: "); 448 printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
518} 506}
519 507
520void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 508void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
521{ 509{
522 die_owner = -1; 510 die_owner = -1;
523 bust_spinlocks(0); 511 bust_spinlocks(0);
524 die_nest_count--; 512 die_nest_count--;
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1d..41e01b145c48 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -73,7 +73,7 @@ int is_visws_box(void)
73 return visws_board_type >= 0; 73 return visws_board_type >= 0;
74} 74}
75 75
76static int __init visws_time_init_quirk(void) 76static int __init visws_time_init(void)
77{ 77{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 78 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79 79
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
93 return 0; 93 return 0;
94} 94}
95 95
96static int __init visws_pre_intr_init_quirk(void) 96static int __init visws_pre_intr_init(void)
97{ 97{
98 init_VISWS_APIC_irqs(); 98 init_VISWS_APIC_irqs();
99 99
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
114 114
115long long mem_size __initdata = 0; 115long long mem_size __initdata = 0;
116 116
117static char * __init visws_memory_setup_quirk(void) 117static char * __init visws_memory_setup(void)
118{ 118{
119 long long gfx_mem_size = 8 * MB; 119 long long gfx_mem_size = 8 * MB;
120 120
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
176 outl(PIIX_SPECIAL_STOP, 0xCFC); 176 outl(PIIX_SPECIAL_STOP, 0xCFC);
177} 177}
178 178
179static int __init visws_get_smp_config_quirk(unsigned int early) 179static int __init visws_get_smp_config(unsigned int early)
180{ 180{
181 /* 181 /*
182 * Prevent MP-table parsing by the generic code: 182 * Prevent MP-table parsing by the generic code:
@@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
192 * No problem for Linux. 192 * No problem for Linux.
193 */ 193 */
194 194
195static void __init MP_processor_info (struct mpc_config_processor *m) 195static void __init MP_processor_info(struct mpc_config_processor *m)
196{ 196{
197 int ver, logical_apicid; 197 int ver, logical_apicid;
198 physid_mask_t apic_cpus; 198 physid_mask_t apic_cpus;
@@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
232 apic_version[m->mpc_apicid] = ver; 232 apic_version[m->mpc_apicid] = ver;
233} 233}
234 234
235int __init visws_find_smp_config_quirk(unsigned int reserve) 235static int __init visws_find_smp_config(unsigned int reserve)
236{ 236{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
258 return 1; 258 return 1;
259} 259}
260 260
261extern int visws_trap_init_quirk(void); 261static int visws_trap_init(void);
262
263static struct x86_quirks visws_x86_quirks __initdata = {
264 .arch_time_init = visws_time_init,
265 .arch_pre_intr_init = visws_pre_intr_init,
266 .arch_memory_setup = visws_memory_setup,
267 .arch_intr_init = NULL,
268 .arch_trap_init = visws_trap_init,
269 .mach_get_smp_config = visws_get_smp_config,
270 .mach_find_smp_config = visws_find_smp_config,
271};
262 272
263void __init visws_early_detect(void) 273void __init visws_early_detect(void)
264{ 274{
@@ -272,16 +282,10 @@ void __init visws_early_detect(void)
272 282
273 /* 283 /*
274 * Install special quirks for timer, interrupt and memory setup: 284 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps: 285 * Fall back to generic behavior for traps:
286 * Override generic MP-table parsing:
282 */ 287 */
283 arch_intr_init_quirk = NULL; 288 x86_quirks = &visws_x86_quirks;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285 289
286 /* 290 /*
287 * Install reboot quirks: 291 * Install reboot quirks:
@@ -294,12 +298,6 @@ void __init visws_early_detect(void)
294 */ 298 */
295 no_broadcast = 0; 299 no_broadcast = 0;
296 300
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC 301#ifdef CONFIG_X86_IO_APIC
304 /* 302 /*
305 * Turn off IO-APIC detection and initialization: 303 * Turn off IO-APIC detection and initialization:
@@ -426,7 +424,7 @@ static __init void cobalt_init(void)
426 co_apic_read(CO_APIC_ID)); 424 co_apic_read(CO_APIC_ID));
427} 425}
428 426
429int __init visws_trap_init_quirk(void) 427static int __init visws_trap_init(void)
430{ 428{
431 lithium_init(); 429 lithium_init();
432 cobalt_init(); 430 cobalt_init();
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7b..0a1b1a9d922d 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
906#ifdef CONFIG_X86_LOCAL_APIC 906#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 907 para_fill(pv_apic_ops.apic_read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 908 para_fill(pv_apic_ops.apic_write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 909#endif
911 910
912 /* 911 /*
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 50dad44fb542..0313a5eec412 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -991,7 +991,6 @@ __init void lguest_init(void)
991#ifdef CONFIG_X86_LOCAL_APIC 991#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 992 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 993 pv_apic_ops.apic_write = lguest_apic_write;
994 pv_apic_ops.apic_write_atomic = lguest_apic_write;
995 pv_apic_ops.apic_read = lguest_apic_read; 994 pv_apic_ops.apic_read = lguest_apic_read;
996#endif 995#endif
997 996
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 48278fa7d3de..3d317836be9e 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,14 +10,6 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13/*
14 * Any quirks to be performed to initialize timers/irqs/etc?
15 */
16int (*arch_time_init_quirk)(void);
17int (*arch_pre_intr_init_quirk)(void);
18int (*arch_intr_init_quirk)(void);
19int (*arch_trap_init_quirk)(void);
20
21#ifdef CONFIG_HOTPLUG_CPU 13#ifdef CONFIG_HOTPLUG_CPU
22#define DEFAULT_SEND_IPI (1) 14#define DEFAULT_SEND_IPI (1)
23#else 15#else
@@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
37 **/ 29 **/
38void __init pre_intr_init_hook(void) 30void __init pre_intr_init_hook(void)
39{ 31{
40 if (arch_pre_intr_init_quirk) { 32 if (x86_quirks->arch_pre_intr_init) {
41 if (arch_pre_intr_init_quirk()) 33 if (x86_quirks->arch_pre_intr_init())
42 return; 34 return;
43 } 35 }
44 init_ISA_irqs(); 36 init_ISA_irqs();
@@ -64,8 +56,8 @@ static struct irqaction irq2 = {
64 **/ 56 **/
65void __init intr_init_hook(void) 57void __init intr_init_hook(void)
66{ 58{
67 if (arch_intr_init_quirk) { 59 if (x86_quirks->arch_intr_init) {
68 if (arch_intr_init_quirk()) 60 if (x86_quirks->arch_intr_init())
69 return; 61 return;
70 } 62 }
71#ifdef CONFIG_X86_LOCAL_APIC 63#ifdef CONFIG_X86_LOCAL_APIC
@@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
97 **/ 89 **/
98void __init trap_init_hook(void) 90void __init trap_init_hook(void)
99{ 91{
100 if (arch_trap_init_quirk) { 92 if (x86_quirks->arch_trap_init) {
101 if (arch_trap_init_quirk()) 93 if (x86_quirks->arch_trap_init())
102 return; 94 return;
103 } 95 }
104} 96}
@@ -111,6 +103,16 @@ static struct irqaction irq0 = {
111}; 103};
112 104
113/** 105/**
106 * pre_time_init_hook - do any specific initialisations before.
107 *
108 **/
109void __init pre_time_init_hook(void)
110{
111 if (x86_quirks->arch_pre_time_init)
112 x86_quirks->arch_pre_time_init();
113}
114
115/**
114 * time_init_hook - do any specific initialisations for the system timer. 116 * time_init_hook - do any specific initialisations for the system timer.
115 * 117 *
116 * Description: 118 * Description:
@@ -119,13 +121,13 @@ static struct irqaction irq0 = {
119 **/ 121 **/
120void __init time_init_hook(void) 122void __init time_init_hook(void)
121{ 123{
122 if (arch_time_init_quirk) { 124 if (x86_quirks->arch_time_init) {
123 /* 125 /*
124 * A nonzero return code does not mean failure, it means 126 * A nonzero return code does not mean failure, it means
125 * that the architecture quirk does not want any 127 * that the architecture quirk does not want any
126 * generic (timer) setup to be performed after this: 128 * generic (timer) setup to be performed after this:
127 */ 129 */
128 if (arch_time_init_quirk()) 130 if (x86_quirks->arch_time_init())
129 return; 131 return;
130 } 132 }
131 133
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9873716e9f76..1fbb844c3d7a 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o
21endif 21endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23 23
24obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e64..d37f29376b0c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
844 reserve_early(table_start << PAGE_SHIFT, 844 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE"); 845 table_end << PAGE_SHIFT, "PGTABLE");
846 846
847 if (!after_init_bootmem)
848 early_memtest(start, end);
849
847 return end >> PAGE_SHIFT; 850 return end >> PAGE_SHIFT;
848} 851}
849 852
@@ -868,8 +871,6 @@ void __init paging_init(void)
868 */ 871 */
869 sparse_init(); 872 sparse_init();
870 zone_sizes_init(); 873 zone_sizes_init();
871
872 paravirt_post_allocator_init();
873} 874}
874 875
875/* 876/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 306049edd553..ec37121f6709 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -517,118 +517,6 @@ static void __init init_gbpages(void)
517 direct_gbpages = 0; 517 direct_gbpages = 0;
518} 518}
519 519
520#ifdef CONFIG_MEMTEST
521
522static void __init memtest(unsigned long start_phys, unsigned long size,
523 unsigned pattern)
524{
525 unsigned long i;
526 unsigned long *start;
527 unsigned long start_bad;
528 unsigned long last_bad;
529 unsigned long val;
530 unsigned long start_phys_aligned;
531 unsigned long count;
532 unsigned long incr;
533
534 switch (pattern) {
535 case 0:
536 val = 0UL;
537 break;
538 case 1:
539 val = -1UL;
540 break;
541 case 2:
542 val = 0x5555555555555555UL;
543 break;
544 case 3:
545 val = 0xaaaaaaaaaaaaaaaaUL;
546 break;
547 default:
548 return;
549 }
550
551 incr = sizeof(unsigned long);
552 start_phys_aligned = ALIGN(start_phys, incr);
553 count = (size - (start_phys_aligned - start_phys))/incr;
554 start = __va(start_phys_aligned);
555 start_bad = 0;
556 last_bad = 0;
557
558 for (i = 0; i < count; i++)
559 start[i] = val;
560 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
561 if (*start != val) {
562 if (start_phys_aligned == last_bad + incr) {
563 last_bad += incr;
564 } else {
565 if (start_bad) {
566 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
567 val, start_bad, last_bad + incr);
568 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
569 }
570 start_bad = last_bad = start_phys_aligned;
571 }
572 }
573 }
574 if (start_bad) {
575 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
576 val, start_bad, last_bad + incr);
577 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
578 }
579
580}
581
582/* default is disabled */
583static int memtest_pattern __initdata;
584
585static int __init parse_memtest(char *arg)
586{
587 if (arg)
588 memtest_pattern = simple_strtoul(arg, NULL, 0);
589 return 0;
590}
591
592early_param("memtest", parse_memtest);
593
594static void __init early_memtest(unsigned long start, unsigned long end)
595{
596 u64 t_start, t_size;
597 unsigned pattern;
598
599 if (!memtest_pattern)
600 return;
601
602 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
603 for (pattern = 0; pattern < memtest_pattern; pattern++) {
604 t_start = start;
605 t_size = 0;
606 while (t_start < end) {
607 t_start = find_e820_area_size(t_start, &t_size, 1);
608
609 /* done ? */
610 if (t_start >= end)
611 break;
612 if (t_start + t_size > end)
613 t_size = end - t_start;
614
615 printk(KERN_CONT "\n %016llx - %016llx pattern %d",
616 (unsigned long long)t_start,
617 (unsigned long long)t_start + t_size, pattern);
618
619 memtest(t_start, t_size, pattern);
620
621 t_start += t_size;
622 }
623 }
624 printk(KERN_CONT "\n");
625}
626#else
627static void __init early_memtest(unsigned long start, unsigned long end)
628{
629}
630#endif
631
632static unsigned long __init kernel_physical_mapping_init(unsigned long start, 520static unsigned long __init kernel_physical_mapping_init(unsigned long start,
633 unsigned long end, 521 unsigned long end,
634 unsigned long page_size_mask) 522 unsigned long page_size_mask)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index d4585077977a..2fe30916d4b6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,8 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
15 17
16#include <asm/msr.h> 18#include <asm/msr.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
@@ -373,8 +375,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
373 return vma_prot; 375 return vma_prot;
374} 376}
375 377
376#ifdef CONFIG_NONPROMISC_DEVMEM 378#ifdef CONFIG_STRICT_DEVMEM
377/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 379/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
378static inline int range_is_allowed(unsigned long pfn, unsigned long size) 380static inline int range_is_allowed(unsigned long pfn, unsigned long size)
379{ 381{
380 return 1; 382 return 1;
@@ -398,7 +400,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
398 } 400 }
399 return 1; 401 return 1;
400} 402}
401#endif /* CONFIG_NONPROMISC_DEVMEM */ 403#endif /* CONFIG_STRICT_DEVMEM */
402 404
403int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 405int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
404 unsigned long size, pgprot_t *vma_prot) 406 unsigned long size, pgprot_t *vma_prot)
@@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
489 491
490 free_memtype(addr, addr + size); 492 free_memtype(addr, addr + size);
491} 493}
494
495#if defined(CONFIG_DEBUG_FS)
496
497/* get Nth element of the linked list */
498static struct memtype *memtype_get_idx(loff_t pos)
499{
500 struct memtype *list_node, *print_entry;
501 int i = 1;
502
503 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
504 if (!print_entry)
505 return NULL;
506
507 spin_lock(&memtype_lock);
508 list_for_each_entry(list_node, &memtype_list, nd) {
509 if (pos == i) {
510 *print_entry = *list_node;
511 spin_unlock(&memtype_lock);
512 return print_entry;
513 }
514 ++i;
515 }
516 spin_unlock(&memtype_lock);
517 kfree(print_entry);
518 return NULL;
519}
520
521static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
522{
523 if (*pos == 0) {
524 ++*pos;
525 seq_printf(seq, "PAT memtype list:\n");
526 }
527
528 return memtype_get_idx(*pos);
529}
530
531static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
532{
533 ++*pos;
534 return memtype_get_idx(*pos);
535}
536
537static void memtype_seq_stop(struct seq_file *seq, void *v)
538{
539}
540
541static int memtype_seq_show(struct seq_file *seq, void *v)
542{
543 struct memtype *print_entry = (struct memtype *)v;
544
545 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
546 print_entry->start, print_entry->end);
547 kfree(print_entry);
548 return 0;
549}
550
551static struct seq_operations memtype_seq_ops = {
552 .start = memtype_seq_start,
553 .next = memtype_seq_next,
554 .stop = memtype_seq_stop,
555 .show = memtype_seq_show,
556};
557
558static int memtype_seq_open(struct inode *inode, struct file *file)
559{
560 return seq_open(file, &memtype_seq_ops);
561}
562
563static const struct file_operations memtype_fops = {
564 .open = memtype_seq_open,
565 .read = seq_read,
566 .llseek = seq_lseek,
567 .release = seq_release,
568};
569
570static int __init pat_memtype_list_init(void)
571{
572 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
573 NULL, &memtype_fops);
574 return 0;
575}
576
577late_initcall(pat_memtype_list_init);
578
579#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index e515e8db842a..d49202e740ea 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o 6obj-$(CONFIG_PCI_OLPC) += olpc.o
7 7
8pci-y := fixup.o 8obj-y += fixup.o
9pci-$(CONFIG_ACPI) += acpi.o 9obj-$(CONFIG_ACPI) += acpi.o
10pci-y += legacy.o irq.o 10obj-y += legacy.o irq.o
11 11
12pci-$(CONFIG_X86_VISWS) += visws.o 12obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14pci-$(CONFIG_X86_NUMAQ) += numa.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += $(pci-y) common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 132876cc6fca..ec9ce35e44d6 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
57 57
58int __init pci_subsys_init(void) 58int __init pci_subsys_init(void)
59{ 59{
60#ifdef CONFIG_X86_NUMAQ
61 pci_numaq_init();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 pci_acpi_init(); 64 pci_acpi_init();
62#endif 65#endif
66#ifdef CONFIG_X86_VISWS
67 pci_visws_init();
68#endif
63 pci_legacy_init(); 69 pci_legacy_init();
64 pcibios_irq_init(); 70 pcibios_irq_init();
65#ifdef CONFIG_X86_NUMAQ
66 pci_numa_init();
67#endif
68 pcibios_init(); 71 pcibios_init();
69 72
70 return 0; 73 return 0;
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c
index 8b5ca1966731..f4b16dc11dad 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numaq_32.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines 2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */ 3 */
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
@@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
151} 151}
152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
153 153
154int __init pci_numa_init(void) 154int __init pci_numaq_init(void)
155{ 155{
156 int quad; 156 int quad;
157 157
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 3e25deb821ac..15b9cf6be729 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
108/* some common used subsys_initcalls */ 108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void); 109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void); 110extern int __init pcibios_irq_init(void);
111extern int __init pci_numa_init(void); 111extern int __init pci_visws_init(void);
112extern int __init pci_numaq_init(void);
112extern int __init pcibios_init(void); 113extern int __init pcibios_init(void);
113 114
114/* pci-mmconfig.c */ 115/* pci-mmconfig.c */
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 1a7bed492bb1..42f4cb19faca 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
87} 87}
88 88
89static int __init pci_visws_init(void) 89int __init pci_visws_init(void)
90{ 90{
91 if (!is_visws_box())
92 return -1;
93
94 pcibios_enable_irq = &pci_visws_enable_irq;
95 pcibios_disable_irq = &pci_visws_disable_irq;
96
91 /* The VISWS supports configuration access type 1 only */ 97 /* The VISWS supports configuration access type 1 only */
92 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 98 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
93 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); 99 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
105 pcibios_resource_survey(); 111 pcibios_resource_survey();
106 return 0; 112 return 0;
107} 113}
108
109static __init int pci_subsys_init(void)
110{
111 if (!is_visws_box())
112 return -1;
113
114 pcibios_enable_irq = &pci_visws_enable_irq;
115 pcibios_disable_irq = &pci_visws_disable_irq;
116
117 pci_visws_init();
118 pcibios_init();
119
120 return 0;
121}
122subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a515..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
207 202
208/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
209void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
226#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
227 222
228#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
229 225
230void enable_sep_cpu(void) 226void enable_sep_cpu(void)
231{ 227{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
296 gate_vma_init(); 292 gate_vma_init();
297#endif 293#endif
298 294
299 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
300 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
301 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
302 } else { 298 } else if (vdso32_sysenter()){
303 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
304 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
305 } 304 }
306 305
307 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 19a6cfaf5db9..257ba4a10abf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
21extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
23 23
24struct page **vdso_pages; 24static struct page **vdso_pages;
25static unsigned vdso_size;
25 26
26static inline void *var_ref(void *p, char *name) 27static inline void *var_ref(void *p, char *name)
27{ 28{
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
38 int i; 39 int i;
39 char *vbase; 40 char *vbase;
40 41
42 vdso_size = npages << PAGE_SHIFT;
41 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
42 if (!vdso_pages) 44 if (!vdso_pages)
43 goto oom; 45 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
101 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
102 unsigned long addr; 104 unsigned long addr;
103 int ret; 105 int ret;
104 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
105 106
106 if (!vdso_enabled) 107 if (!vdso_enabled)
107 return 0; 108 return 0;
108 109
109 down_write(&mm->mmap_sem); 110 down_write(&mm->mmap_sem);
110 addr = vdso_addr(mm->start_stack, len); 111 addr = vdso_addr(mm->start_stack, vdso_size);
111 addr = get_unmapped_area(NULL, addr, len, 0, 0); 112 addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
112 if (IS_ERR_VALUE(addr)) { 113 if (IS_ERR_VALUE(addr)) {
113 ret = addr; 114 ret = addr;
114 goto up_fail; 115 goto up_fail;
115 } 116 }
116 117
117 ret = install_special_mapping(mm, addr, len, 118 ret = install_special_mapping(mm, addr, vdso_size,
118 VM_READ|VM_EXEC| 119 VM_READ|VM_EXEC|
119 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
120 VM_ALWAYSDUMP, 121 VM_ALWAYSDUMP,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc99580871..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d1649131..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o xen-asm.o grant-table.o suspend.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bb508456ef52..194bbd6e3241 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,12 +41,12 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
47#include <asm/reboot.h> 49#include <asm/reboot.h>
48#include <asm/pgalloc.h>
49 50
50#include "xen-ops.h" 51#include "xen-ops.h"
51#include "mmu.h" 52#include "mmu.h"
@@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58 59
59/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
60 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
61 * 74 *
62 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -167,10 +180,14 @@ void xen_vcpu_restore(void)
167 180
168static void __init xen_banner(void) 181static void __init xen_banner(void)
169{ 182{
183 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
184 struct xen_extraversion extra;
185 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
186
170 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 187 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
171 pv_info.name); 188 pv_info.name);
172 printk(KERN_INFO "Hypervisor signature: %s%s\n", 189 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
173 xen_start_info->magic, 190 version >> 16, version & 0xffff, extra.extraversion,
174 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 191 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
175} 192}
176 193
@@ -363,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
363 380
364static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 381static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
365{ 382{
366 xen_mc_batch();
367
368 load_TLS_descriptor(t, cpu, 0);
369 load_TLS_descriptor(t, cpu, 1);
370 load_TLS_descriptor(t, cpu, 2);
371
372 xen_mc_issue(PARAVIRT_LAZY_CPU);
373
374 /* 383 /*
375 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 384 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
376 * it means we're in a context switch, and %gs has just been 385 * it means we're in a context switch, and %gs has just been
@@ -379,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
379 * Either way, it has been saved, and the new value will get 388 * Either way, it has been saved, and the new value will get
380 * loaded properly. This will go away as soon as Xen has been 389 * loaded properly. This will go away as soon as Xen has been
381 * modified to not save/restore %gs for normal hypercalls. 390 * modified to not save/restore %gs for normal hypercalls.
391 *
392 * On x86_64, this hack is not used for %gs, because gs points
393 * to KERNEL_GS_BASE (and uses it for PDA references), so we
394 * must not zero %gs on x86_64
395 *
396 * For x86_64, we need to zero %fs, otherwise we may get an
397 * exception between the new %fs descriptor being loaded and
398 * %fs being effectively cleared at __switch_to().
382 */ 399 */
383 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 400 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
401#ifdef CONFIG_X86_32
384 loadsegment(gs, 0); 402 loadsegment(gs, 0);
403#else
404 loadsegment(fs, 0);
405#endif
406 }
407
408 xen_mc_batch();
409
410 load_TLS_descriptor(t, cpu, 0);
411 load_TLS_descriptor(t, cpu, 1);
412 load_TLS_descriptor(t, cpu, 2);
413
414 xen_mc_issue(PARAVIRT_LAZY_CPU);
415}
416
417#ifdef CONFIG_X86_64
418static void xen_load_gs_index(unsigned int idx)
419{
420 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
421 BUG();
385} 422}
423#endif
386 424
387static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
388 const void *ptr) 426 const void *ptr)
@@ -400,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
400 preempt_enable(); 438 preempt_enable();
401} 439}
402 440
403static int cvt_gate_to_trap(int vector, u32 low, u32 high, 441static int cvt_gate_to_trap(int vector, const gate_desc *val,
404 struct trap_info *info) 442 struct trap_info *info)
405{ 443{
406 u8 type, dpl; 444 if (val->type != 0xf && val->type != 0xe)
407
408 type = (high >> 8) & 0x1f;
409 dpl = (high >> 13) & 3;
410
411 if (type != 0xf && type != 0xe)
412 return 0; 445 return 0;
413 446
414 info->vector = vector; 447 info->vector = vector;
415 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 448 info->address = gate_offset(*val);
416 info->cs = low >> 16; 449 info->cs = gate_segment(*val);
417 info->flags = dpl; 450 info->flags = val->dpl;
418 /* interrupt gates clear IF */ 451 /* interrupt gates clear IF */
419 if (type == 0xe) 452 if (val->type == 0xe)
420 info->flags |= 4; 453 info->flags |= 4;
421 454
422 return 1; 455 return 1;
@@ -443,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
443 476
444 if (p >= start && (p + 8) <= end) { 477 if (p >= start && (p + 8) <= end) {
445 struct trap_info info[2]; 478 struct trap_info info[2];
446 u32 *desc = (u32 *)g;
447 479
448 info[1].address = 0; 480 info[1].address = 0;
449 481
450 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 482 if (cvt_gate_to_trap(entrynum, g, &info[0]))
451 if (HYPERVISOR_set_trap_table(info)) 483 if (HYPERVISOR_set_trap_table(info))
452 BUG(); 484 BUG();
453 } 485 }
@@ -460,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
460{ 492{
461 unsigned in, out, count; 493 unsigned in, out, count;
462 494
463 count = (desc->size+1) / 8; 495 count = (desc->size+1) / sizeof(gate_desc);
464 BUG_ON(count > 256); 496 BUG_ON(count > 256);
465 497
466 for (in = out = 0; in < count; in++) { 498 for (in = out = 0; in < count; in++) {
467 const u32 *entry = (u32 *)(desc->address + in * 8); 499 gate_desc *entry = (gate_desc*)(desc->address) + in;
468 500
469 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 501 if (cvt_gate_to_trap(in, entry, &traps[out]))
470 out++; 502 out++;
471 } 503 }
472 traps[out].address = 0; 504 traps[out].address = 0;
@@ -695,33 +727,89 @@ static void set_current_cr3(void *v)
695 x86_write_percpu(xen_current_cr3, (unsigned long)v); 727 x86_write_percpu(xen_current_cr3, (unsigned long)v);
696} 728}
697 729
698static void xen_write_cr3(unsigned long cr3) 730static void __xen_write_cr3(bool kernel, unsigned long cr3)
699{ 731{
700 struct mmuext_op *op; 732 struct mmuext_op *op;
701 struct multicall_space mcs; 733 struct multicall_space mcs;
702 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 734 unsigned long mfn;
703 735
704 BUG_ON(preemptible()); 736 if (cr3)
737 mfn = pfn_to_mfn(PFN_DOWN(cr3));
738 else
739 mfn = 0;
705 740
706 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 741 WARN_ON(mfn == 0 && kernel);
707 742
708 /* Update while interrupts are disabled, so its atomic with 743 mcs = __xen_mc_entry(sizeof(*op));
709 respect to ipis */
710 x86_write_percpu(xen_cr3, cr3);
711 744
712 op = mcs.args; 745 op = mcs.args;
713 op->cmd = MMUEXT_NEW_BASEPTR; 746 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
714 op->arg1.mfn = mfn; 747 op->arg1.mfn = mfn;
715 748
716 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 749 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
717 750
718 /* Update xen_update_cr3 once the batch has actually 751 if (kernel) {
719 been submitted. */ 752 x86_write_percpu(xen_cr3, cr3);
720 xen_mc_callback(set_current_cr3, (void *)cr3); 753
754 /* Update xen_current_cr3 once the batch has actually
755 been submitted. */
756 xen_mc_callback(set_current_cr3, (void *)cr3);
757 }
758}
759
760static void xen_write_cr3(unsigned long cr3)
761{
762 BUG_ON(preemptible());
763
764 xen_mc_batch(); /* disables interrupts */
765
766 /* Update while interrupts are disabled, so its atomic with
767 respect to ipis */
768 x86_write_percpu(xen_cr3, cr3);
769
770 __xen_write_cr3(true, cr3);
771
772#ifdef CONFIG_X86_64
773 {
774 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
775 if (user_pgd)
776 __xen_write_cr3(false, __pa(user_pgd));
777 else
778 __xen_write_cr3(false, 0);
779 }
780#endif
721 781
722 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 782 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
723} 783}
724 784
785static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
786{
787 int ret;
788
789 ret = 0;
790
791 switch(msr) {
792#ifdef CONFIG_X86_64
793 unsigned which;
794 u64 base;
795
796 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
797 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
798 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
799
800 set:
801 base = ((u64)high << 32) | low;
802 if (HYPERVISOR_set_segment_base(which, base) != 0)
803 ret = -EFAULT;
804 break;
805#endif
806 default:
807 ret = native_write_msr_safe(msr, low, high);
808 }
809
810 return ret;
811}
812
725/* Early in boot, while setting up the initial pagetable, assume 813/* Early in boot, while setting up the initial pagetable, assume
726 everything is pinned. */ 814 everything is pinned. */
727static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 815static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -778,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
778 xen_alloc_ptpage(mm, pfn, PT_PMD); 866 xen_alloc_ptpage(mm, pfn, PT_PMD);
779} 867}
780 868
869static int xen_pgd_alloc(struct mm_struct *mm)
870{
871 pgd_t *pgd = mm->pgd;
872 int ret = 0;
873
874 BUG_ON(PagePinned(virt_to_page(pgd)));
875
876#ifdef CONFIG_X86_64
877 {
878 struct page *page = virt_to_page(pgd);
879 pgd_t *user_pgd;
880
881 BUG_ON(page->private != 0);
882
883 ret = -ENOMEM;
884
885 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
886 page->private = (unsigned long)user_pgd;
887
888 if (user_pgd != NULL) {
889 user_pgd[pgd_index(VSYSCALL_START)] =
890 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
891 ret = 0;
892 }
893
894 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
895 }
896#endif
897
898 return ret;
899}
900
901static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
902{
903#ifdef CONFIG_X86_64
904 pgd_t *user_pgd = xen_get_user_pgd(pgd);
905
906 if (user_pgd)
907 free_page((unsigned long)user_pgd);
908#endif
909}
910
781/* This should never happen until we're OK to use struct page */ 911/* This should never happen until we're OK to use struct page */
782static void xen_release_ptpage(u32 pfn, unsigned level) 912static void xen_release_ptpage(u32 pfn, unsigned level)
783{ 913{
@@ -803,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
803 xen_release_ptpage(pfn, PT_PMD); 933 xen_release_ptpage(pfn, PT_PMD);
804} 934}
805 935
936#if PAGETABLE_LEVELS == 4
937static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
938{
939 xen_alloc_ptpage(mm, pfn, PT_PUD);
940}
941
942static void xen_release_pud(u32 pfn)
943{
944 xen_release_ptpage(pfn, PT_PUD);
945}
946#endif
947
806#ifdef CONFIG_HIGHPTE 948#ifdef CONFIG_HIGHPTE
807static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 949static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
808{ 950{
@@ -841,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
841 983
842static __init void xen_pagetable_setup_start(pgd_t *base) 984static __init void xen_pagetable_setup_start(pgd_t *base)
843{ 985{
844 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
845 int i;
846
847 /* special set_pte for pagetable initialization */
848 pv_mmu_ops.set_pte = xen_set_pte_init;
849
850 init_mm.pgd = base;
851 /*
852 * copy top-level of Xen-supplied pagetable into place. This
853 * is a stand-in while we copy the pmd pages.
854 */
855 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
856
857 /*
858 * For PAE, need to allocate new pmds, rather than
859 * share Xen's, since Xen doesn't like pmd's being
860 * shared between address spaces.
861 */
862 for (i = 0; i < PTRS_PER_PGD; i++) {
863 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
864 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
865
866 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
867 PAGE_SIZE);
868
869 make_lowmem_page_readonly(pmd);
870
871 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
872 } else
873 pgd_clear(&base[i]);
874 }
875
876 /* make sure zero_page is mapped RO so we can use it in pagetables */
877 make_lowmem_page_readonly(empty_zero_page);
878 make_lowmem_page_readonly(base);
879 /*
880 * Switch to new pagetable. This is done before
881 * pagetable_init has done anything so that the new pages
882 * added to the table can be prepared properly for Xen.
883 */
884 xen_write_cr3(__pa(base));
885
886 /* Unpin initial Xen pagetable */
887 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
888 PFN_DOWN(__pa(xen_start_info->pt_base)));
889} 986}
890 987
891void xen_setup_shared_info(void) 988void xen_setup_shared_info(void)
892{ 989{
893 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 990 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
894 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 991 set_fixmap(FIX_PARAVIRT_BOOTMAP,
895 992 xen_start_info->shared_info);
896 /* 993
897 * Create a mapping for the shared info page. 994 HYPERVISOR_shared_info =
898 * Should be set_fixmap(), but shared_info is a machine 995 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
899 * address with no corresponding pseudo-phys address.
900 */
901 set_pte_mfn(addr,
902 PFN_DOWN(xen_start_info->shared_info),
903 PAGE_KERNEL);
904
905 HYPERVISOR_shared_info = (struct shared_info *)addr;
906 } else 996 } else
907 HYPERVISOR_shared_info = 997 HYPERVISOR_shared_info =
908 (struct shared_info *)__va(xen_start_info->shared_info); 998 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1007,32 @@ void xen_setup_shared_info(void)
917 1007
918static __init void xen_pagetable_setup_done(pgd_t *base) 1008static __init void xen_pagetable_setup_done(pgd_t *base)
919{ 1009{
920 /* This will work as long as patching hasn't happened yet
921 (which it hasn't) */
922 pv_mmu_ops.alloc_pte = xen_alloc_pte;
923 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
924 pv_mmu_ops.release_pte = xen_release_pte;
925 pv_mmu_ops.release_pmd = xen_release_pmd;
926 pv_mmu_ops.set_pte = xen_set_pte;
927
928 xen_setup_shared_info(); 1010 xen_setup_shared_info();
929
930 /* Actually pin the pagetable down, but we can't set PG_pinned
931 yet because the page structures don't exist yet. */
932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
933} 1011}
934 1012
935static __init void xen_post_allocator_init(void) 1013static __init void xen_post_allocator_init(void)
936{ 1014{
1015 pv_mmu_ops.set_pte = xen_set_pte;
937 pv_mmu_ops.set_pmd = xen_set_pmd; 1016 pv_mmu_ops.set_pmd = xen_set_pmd;
938 pv_mmu_ops.set_pud = xen_set_pud; 1017 pv_mmu_ops.set_pud = xen_set_pud;
1018#if PAGETABLE_LEVELS == 4
1019 pv_mmu_ops.set_pgd = xen_set_pgd;
1020#endif
1021
1022 /* This will work as long as patching hasn't happened yet
1023 (which it hasn't) */
1024 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1025 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1026 pv_mmu_ops.release_pte = xen_release_pte;
1027 pv_mmu_ops.release_pmd = xen_release_pmd;
1028#if PAGETABLE_LEVELS == 4
1029 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1030 pv_mmu_ops.release_pud = xen_release_pud;
1031#endif
939 1032
1033#ifdef CONFIG_X86_64
1034 SetPagePinned(virt_to_page(level3_user_vsyscall));
1035#endif
940 xen_mark_init_mm_pinned(); 1036 xen_mark_init_mm_pinned();
941} 1037}
942 1038
@@ -950,6 +1046,7 @@ void xen_setup_vcpu_info_placement(void)
950 1046
951 /* xen_vcpu_setup managed to place the vcpu_info within the 1047 /* xen_vcpu_setup managed to place the vcpu_info within the
952 percpu area for all cpus, so make use of it */ 1048 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
953 if (have_vcpu_info_placement) { 1050 if (have_vcpu_info_placement) {
954 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1051 printk(KERN_INFO "Xen: using vcpu_info placement\n");
955 1052
@@ -959,6 +1056,7 @@ void xen_setup_vcpu_info_placement(void)
959 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1056 pv_irq_ops.irq_enable = xen_irq_enable_direct;
960 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
961 } 1058 }
1059#endif
962} 1060}
963 1061
964static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
979 goto patch_site 1077 goto patch_site
980 1078
981 switch (type) { 1079 switch (type) {
1080#ifdef CONFIG_X86_32
982 SITE(pv_irq_ops, irq_enable); 1081 SITE(pv_irq_ops, irq_enable);
983 SITE(pv_irq_ops, irq_disable); 1082 SITE(pv_irq_ops, irq_disable);
984 SITE(pv_irq_ops, save_fl); 1083 SITE(pv_irq_ops, save_fl);
985 SITE(pv_irq_ops, restore_fl); 1084 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
986#undef SITE 1086#undef SITE
987 1087
988 patch_site: 1088 patch_site:
@@ -1025,8 +1125,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1025#ifdef CONFIG_X86_F00F_BUG 1125#ifdef CONFIG_X86_F00F_BUG
1026 case FIX_F00F_IDT: 1126 case FIX_F00F_IDT:
1027#endif 1127#endif
1128#ifdef CONFIG_X86_32
1028 case FIX_WP_TEST: 1129 case FIX_WP_TEST:
1029 case FIX_VDSO: 1130 case FIX_VDSO:
1131# ifdef CONFIG_HIGHMEM
1132 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1133# endif
1134#else
1135 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1136#endif
1030#ifdef CONFIG_X86_LOCAL_APIC 1137#ifdef CONFIG_X86_LOCAL_APIC
1031 case FIX_APIC_BASE: /* maps dummy local APIC */ 1138 case FIX_APIC_BASE: /* maps dummy local APIC */
1032#endif 1139#endif
@@ -1039,6 +1146,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1039 } 1146 }
1040 1147
1041 __native_set_fixmap(idx, pte); 1148 __native_set_fixmap(idx, pte);
1149
1150#ifdef CONFIG_X86_64
1151 /* Replicate changes to map the vsyscall page into the user
1152 pagetable vsyscall mapping. */
1153 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1154 unsigned long vaddr = __fix_to_virt(idx);
1155 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1156 }
1157#endif
1042} 1158}
1043 1159
1044static const struct pv_info xen_info __initdata = { 1160static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1084 .wbinvd = native_wbinvd, 1200 .wbinvd = native_wbinvd,
1085 1201
1086 .read_msr = native_read_msr_safe, 1202 .read_msr = native_read_msr_safe,
1087 .write_msr = native_write_msr_safe, 1203 .write_msr = xen_write_msr_safe,
1088 .read_tsc = native_read_tsc, 1204 .read_tsc = native_read_tsc,
1089 .read_pmc = native_read_pmc, 1205 .read_pmc = native_read_pmc,
1090 1206
1091 .iret = xen_iret, 1207 .iret = xen_iret,
1092 .irq_enable_sysexit = xen_sysexit, 1208 .irq_enable_sysexit = xen_sysexit,
1209#ifdef CONFIG_X86_64
1210 .usergs_sysret32 = xen_sysret32,
1211 .usergs_sysret64 = xen_sysret64,
1212#endif
1093 1213
1094 .load_tr_desc = paravirt_nop, 1214 .load_tr_desc = paravirt_nop,
1095 .set_ldt = xen_set_ldt, 1215 .set_ldt = xen_set_ldt,
1096 .load_gdt = xen_load_gdt, 1216 .load_gdt = xen_load_gdt,
1097 .load_idt = xen_load_idt, 1217 .load_idt = xen_load_idt,
1098 .load_tls = xen_load_tls, 1218 .load_tls = xen_load_tls,
1219#ifdef CONFIG_X86_64
1220 .load_gs_index = xen_load_gs_index,
1221#endif
1099 1222
1100 .store_gdt = native_store_gdt, 1223 .store_gdt = native_store_gdt,
1101 .store_idt = native_store_idt, 1224 .store_idt = native_store_idt,
@@ -1109,14 +1232,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1109 .set_iopl_mask = xen_set_iopl_mask, 1232 .set_iopl_mask = xen_set_iopl_mask,
1110 .io_delay = xen_io_delay, 1233 .io_delay = xen_io_delay,
1111 1234
1235 /* Xen takes care of %gs when switching to usermode for us */
1236 .swapgs = paravirt_nop,
1237
1112 .lazy_mode = { 1238 .lazy_mode = {
1113 .enter = paravirt_enter_lazy_cpu, 1239 .enter = paravirt_enter_lazy_cpu,
1114 .leave = xen_leave_lazy, 1240 .leave = xen_leave_lazy,
1115 }, 1241 },
1116}; 1242};
1117 1243
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1118static const struct pv_irq_ops xen_irq_ops __initdata = { 1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1119 .init_IRQ = xen_init_IRQ, 1262 .init_IRQ = __xen_init_IRQ,
1120 .save_fl = xen_save_fl, 1263 .save_fl = xen_save_fl,
1121 .restore_fl = xen_restore_fl, 1264 .restore_fl = xen_restore_fl,
1122 .irq_disable = xen_irq_disable, 1265 .irq_disable = xen_irq_disable,
@@ -1124,14 +1267,13 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
1124 .safe_halt = xen_safe_halt, 1267 .safe_halt = xen_safe_halt,
1125 .halt = xen_halt, 1268 .halt = xen_halt,
1126#ifdef CONFIG_X86_64 1269#ifdef CONFIG_X86_64
1127 .adjust_exception_frame = paravirt_nop, 1270 .adjust_exception_frame = xen_adjust_exception_frame,
1128#endif 1271#endif
1129}; 1272};
1130 1273
1131static const struct pv_apic_ops xen_apic_ops __initdata = { 1274static const struct pv_apic_ops xen_apic_ops __initdata = {
1132#ifdef CONFIG_X86_LOCAL_APIC 1275#ifdef CONFIG_X86_LOCAL_APIC
1133 .apic_write = xen_apic_write, 1276 .apic_write = xen_apic_write,
1134 .apic_write_atomic = xen_apic_write,
1135 .apic_read = xen_apic_read, 1277 .apic_read = xen_apic_read,
1136 .setup_boot_clock = paravirt_nop, 1278 .setup_boot_clock = paravirt_nop,
1137 .setup_secondary_clock = paravirt_nop, 1279 .setup_secondary_clock = paravirt_nop,
@@ -1157,8 +1299,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1157 .pte_update = paravirt_nop, 1299 .pte_update = paravirt_nop,
1158 .pte_update_defer = paravirt_nop, 1300 .pte_update_defer = paravirt_nop,
1159 1301
1160 .pgd_alloc = __paravirt_pgd_alloc, 1302 .pgd_alloc = xen_pgd_alloc,
1161 .pgd_free = paravirt_nop, 1303 .pgd_free = xen_pgd_free,
1162 1304
1163 .alloc_pte = xen_alloc_pte_init, 1305 .alloc_pte = xen_alloc_pte_init,
1164 .release_pte = xen_release_pte_init, 1306 .release_pte = xen_release_pte_init,
@@ -1170,7 +1312,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1170 .kmap_atomic_pte = xen_kmap_atomic_pte, 1312 .kmap_atomic_pte = xen_kmap_atomic_pte,
1171#endif 1313#endif
1172 1314
1173 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1315#ifdef CONFIG_X86_64
1316 .set_pte = xen_set_pte,
1317#else
1318 .set_pte = xen_set_pte_init,
1319#endif
1174 .set_pte_at = xen_set_pte_at, 1320 .set_pte_at = xen_set_pte_at,
1175 .set_pmd = xen_set_pmd_hyper, 1321 .set_pmd = xen_set_pmd_hyper,
1176 1322
@@ -1184,15 +1330,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1184 .make_pte = xen_make_pte, 1330 .make_pte = xen_make_pte,
1185 .make_pgd = xen_make_pgd, 1331 .make_pgd = xen_make_pgd,
1186 1332
1333#ifdef CONFIG_X86_PAE
1187 .set_pte_atomic = xen_set_pte_atomic, 1334 .set_pte_atomic = xen_set_pte_atomic,
1188 .set_pte_present = xen_set_pte_at, 1335 .set_pte_present = xen_set_pte_at,
1189 .set_pud = xen_set_pud_hyper,
1190 .pte_clear = xen_pte_clear, 1336 .pte_clear = xen_pte_clear,
1191 .pmd_clear = xen_pmd_clear, 1337 .pmd_clear = xen_pmd_clear,
1338#endif /* CONFIG_X86_PAE */
1339 .set_pud = xen_set_pud_hyper,
1192 1340
1193 .make_pmd = xen_make_pmd, 1341 .make_pmd = xen_make_pmd,
1194 .pmd_val = xen_pmd_val, 1342 .pmd_val = xen_pmd_val,
1195 1343
1344#if PAGETABLE_LEVELS == 4
1345 .pud_val = xen_pud_val,
1346 .make_pud = xen_make_pud,
1347 .set_pgd = xen_set_pgd_hyper,
1348
1349 .alloc_pud = xen_alloc_pte_init,
1350 .release_pud = xen_release_pte_init,
1351#endif /* PAGETABLE_LEVELS == 4 */
1352
1196 .activate_mm = xen_activate_mm, 1353 .activate_mm = xen_activate_mm,
1197 .dup_mmap = xen_dup_mmap, 1354 .dup_mmap = xen_dup_mmap,
1198 .exit_mmap = xen_exit_mmap, 1355 .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1362,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1205 .set_fixmap = xen_set_fixmap, 1362 .set_fixmap = xen_set_fixmap,
1206}; 1363};
1207 1364
1208#ifdef CONFIG_SMP
1209static const struct smp_ops xen_smp_ops __initdata = {
1210 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1211 .smp_prepare_cpus = xen_smp_prepare_cpus,
1212 .cpu_up = xen_cpu_up,
1213 .smp_cpus_done = xen_smp_cpus_done,
1214
1215 .smp_send_stop = xen_smp_send_stop,
1216 .smp_send_reschedule = xen_smp_send_reschedule,
1217
1218 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1219 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1220};
1221#endif /* CONFIG_SMP */
1222
1223static void xen_reboot(int reason) 1365static void xen_reboot(int reason)
1224{ 1366{
1225 struct sched_shutdown r = { .reason = reason }; 1367 struct sched_shutdown r = { .reason = reason };
@@ -1264,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1264 1406
1265static void __init xen_reserve_top(void) 1407static void __init xen_reserve_top(void)
1266{ 1408{
1409#ifdef CONFIG_X86_32
1267 unsigned long top = HYPERVISOR_VIRT_START; 1410 unsigned long top = HYPERVISOR_VIRT_START;
1268 struct xen_platform_parameters pp; 1411 struct xen_platform_parameters pp;
1269 1412
@@ -1271,8 +1414,248 @@ static void __init xen_reserve_top(void)
1271 top = pp.virt_start; 1414 top = pp.virt_start;
1272 1415
1273 reserve_top_address(-top + 2 * PAGE_SIZE); 1416 reserve_top_address(-top + 2 * PAGE_SIZE);
1417#endif /* CONFIG_X86_32 */
1418}
1419
1420/*
1421 * Like __va(), but returns address in the kernel mapping (which is
1422 * all we have until the physical memory mapping has been set up.
1423 */
1424static void *__ka(phys_addr_t paddr)
1425{
1426#ifdef CONFIG_X86_64
1427 return (void *)(paddr + __START_KERNEL_map);
1428#else
1429 return __va(paddr);
1430#endif
1274} 1431}
1275 1432
1433/* Convert a machine address to physical address */
1434static unsigned long m2p(phys_addr_t maddr)
1435{
1436 phys_addr_t paddr;
1437
1438 maddr &= PTE_MASK;
1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1440
1441 return paddr;
1442}
1443
1444/* Convert a machine address to kernel virtual */
1445static void *m2v(phys_addr_t maddr)
1446{
1447 return __ka(m2p(maddr));
1448}
1449
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot)
1484{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot);
1487
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG();
1494}
1495
1496static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1497{
1498 unsigned pmdidx, pteidx;
1499 unsigned ident_pte;
1500 unsigned long pfn;
1501
1502 ident_pte = 0;
1503 pfn = 0;
1504 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1505 pte_t *pte_page;
1506
1507 /* Reuse or allocate a page of ptes */
1508 if (pmd_present(pmd[pmdidx]))
1509 pte_page = m2v(pmd[pmdidx].pmd);
1510 else {
1511 /* Check for free pte pages */
1512 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1513 break;
1514
1515 pte_page = &level1_ident_pgt[ident_pte];
1516 ident_pte += PTRS_PER_PTE;
1517
1518 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1519 }
1520
1521 /* Install mappings */
1522 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1523 pte_t pte;
1524
1525 if (pfn > max_pfn_mapped)
1526 max_pfn_mapped = pfn;
1527
1528 if (!pte_none(pte_page[pteidx]))
1529 continue;
1530
1531 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1532 pte_page[pteidx] = pte;
1533 }
1534 }
1535
1536 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1537 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1538
1539 set_page_prot(pmd, PAGE_KERNEL_RO);
1540}
1541
1542#ifdef CONFIG_X86_64
1543static void convert_pfn_mfn(void *v)
1544{
1545 pte_t *pte = v;
1546 int i;
1547
1548 /* All levels are converted the same way, so just treat them
1549 as ptes. */
1550 for(i = 0; i < PTRS_PER_PTE; i++)
1551 pte[i] = xen_make_pte(pte[i].pte);
1552}
1553
1554/*
1555 * Set up the inital kernel pagetable.
1556 *
1557 * We can construct this by grafting the Xen provided pagetable into
1558 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1559 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1560 * means that only the kernel has a physical mapping to start with -
1561 * but that's enough to get __va working. We need to fill in the rest
1562 * of the physical mapping once some sort of allocator has been set
1563 * up.
1564 */
1565static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1566{
1567 pud_t *l3;
1568 pmd_t *l2;
1569
1570 /* Zap identity mapping */
1571 init_level4_pgt[0] = __pgd(0);
1572
1573 /* Pre-constructed entries are in pfn, so convert to mfn */
1574 convert_pfn_mfn(init_level4_pgt);
1575 convert_pfn_mfn(level3_ident_pgt);
1576 convert_pfn_mfn(level3_kernel_pgt);
1577
1578 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1579 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1580
1581 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1582 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1583
1584 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1585 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1586 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1587
1588 /* Set up identity map */
1589 xen_map_identity_early(level2_ident_pgt, max_pfn);
1590
1591 /* Make pagetable pieces RO */
1592 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1593 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1595 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1596 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1597 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1598
1599 /* Pin down new L4 */
1600 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1601 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1602
1603 /* Unpin Xen-provided one */
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 /* Switch over */
1607 pgd = init_level4_pgt;
1608
1609 /*
1610 * At this stage there can be no user pgd, and no page
1611 * structure to attach it to, so make sure we just set kernel
1612 * pgd.
1613 */
1614 xen_mc_batch();
1615 __xen_write_cr3(true, __pa(pgd));
1616 xen_mc_issue(PARAVIRT_LAZY_CPU);
1617
1618 reserve_early(__pa(xen_start_info->pt_base),
1619 __pa(xen_start_info->pt_base +
1620 xen_start_info->nr_pt_frames * PAGE_SIZE),
1621 "XEN PAGETABLES");
1622
1623 return pgd;
1624}
1625#else /* !CONFIG_X86_64 */
1626static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1627
1628static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1629{
1630 pmd_t *kernel_pmd;
1631
1632 init_pg_tables_start = __pa(pgd);
1633 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1634 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1635
1636 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1637 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1638
1639 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1640
1641 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1642 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1643 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1644
1645 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1646 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1647 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1648
1649 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1650
1651 xen_write_cr3(__pa(swapper_pg_dir));
1652
1653 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1654
1655 return swapper_pg_dir;
1656}
1657#endif /* CONFIG_X86_64 */
1658
1276/* First C function to be called on Xen boot */ 1659/* First C function to be called on Xen boot */
1277asmlinkage void __init xen_start_kernel(void) 1660asmlinkage void __init xen_start_kernel(void)
1278{ 1661{
@@ -1301,53 +1684,56 @@ asmlinkage void __init xen_start_kernel(void)
1301 1684
1302 machine_ops = xen_machine_ops; 1685 machine_ops = xen_machine_ops;
1303 1686
1304#ifdef CONFIG_SMP 1687#ifdef CONFIG_X86_64
1305 smp_ops = xen_smp_ops; 1688 /* Disable until direct per-cpu data access. */
1689 have_vcpu_info_placement = 0;
1690 x86_64_init_pda();
1306#endif 1691#endif
1307 1692
1693 xen_smp_init();
1694
1308 /* Get mfn list */ 1695 /* Get mfn list */
1309 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1696 if (!xen_feature(XENFEAT_auto_translated_physmap))
1310 xen_build_dynamic_phys_to_machine(); 1697 xen_build_dynamic_phys_to_machine();
1311 1698
1312 pgd = (pgd_t *)xen_start_info->pt_base; 1699 pgd = (pgd_t *)xen_start_info->pt_base;
1313 1700
1314 init_pg_tables_start = __pa(pgd); 1701 /* Prevent unwanted bits from being set in PTEs. */
1315 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1702 __supported_pte_mask &= ~_PAGE_GLOBAL;
1316 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1703 if (!is_initial_xendomain())
1317 1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1318 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1319
1320 /* keep using Xen gdt for now; no urgent need to change it */
1321
1322 x86_write_percpu(xen_cr3, __pa(pgd));
1323 x86_write_percpu(xen_current_cr3, __pa(pgd));
1324 1705
1325 /* Don't do the full vcpu_info placement stuff until we have a 1706 /* Don't do the full vcpu_info placement stuff until we have a
1326 possible map and a non-dummy shared_info. */ 1707 possible map and a non-dummy shared_info. */
1327 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1708 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1328 1709
1710 xen_raw_console_write("mapping kernel into physical memory\n");
1711 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1712
1713 init_mm.pgd = pgd;
1714
1715 /* keep using Xen gdt for now; no urgent need to change it */
1716
1329 pv_info.kernel_rpl = 1; 1717 pv_info.kernel_rpl = 1;
1330 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1718 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1331 pv_info.kernel_rpl = 0; 1719 pv_info.kernel_rpl = 0;
1332 1720
1333 /* Prevent unwanted bits from being set in PTEs. */
1334 __supported_pte_mask &= ~_PAGE_GLOBAL;
1335 if (!is_initial_xendomain())
1336 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1337
1338 /* set the limit of our address space */ 1721 /* set the limit of our address space */
1339 xen_reserve_top(); 1722 xen_reserve_top();
1340 1723
1724#ifdef CONFIG_X86_32
1341 /* set up basic CPUID stuff */ 1725 /* set up basic CPUID stuff */
1342 cpu_detect(&new_cpu_data); 1726 cpu_detect(&new_cpu_data);
1343 new_cpu_data.hard_math = 1; 1727 new_cpu_data.hard_math = 1;
1344 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1728 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1729#endif
1345 1730
1346 /* Poke various useful things into boot_params */ 1731 /* Poke various useful things into boot_params */
1347 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1732 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1348 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1733 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1349 ? __pa(xen_start_info->mod_start) : 0; 1734 ? __pa(xen_start_info->mod_start) : 0;
1350 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1351 1737
1352 if (!is_initial_xendomain()) { 1738 if (!is_initial_xendomain()) {
1353 add_preferred_console("xenboot", 0, NULL); 1739 add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1741,21 @@ asmlinkage void __init xen_start_kernel(void)
1355 add_preferred_console("hvc", 0, NULL); 1741 add_preferred_console("hvc", 0, NULL);
1356 } 1742 }
1357 1743
1744 xen_raw_console_write("about to get started...\n");
1745
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1754
1358 /* Start the world */ 1755 /* Start the world */
1756#ifdef CONFIG_X86_32
1359 i386_start_kernel(); 1757 i386_start_kernel();
1758#else
1759 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1760#endif
1360} 1761}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa1..a44d56e38bd1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 70
62/* Placeholder for holes in the address space */ 71/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 74
67 /* Array of pointers to pages containing p2m entries */ 75 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 78
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 79/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 81
76static unsigned long p2m_top_mfn_list[ 82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 83 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 84
80static inline unsigned p2m_top_index(unsigned long pfn) 85static inline unsigned p2m_top_index(unsigned long pfn)
81{ 86{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 186 p2m_top[topidx][idx] = mfn;
182} 187}
183 188
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 190{
191 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 192 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
189 195
190 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
191 197
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 199}
194 200
195void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 262
257 xen_mc_batch(); 263 xen_mc_batch();
258 264
259 u.ptr = virt_to_machine(ptr).maddr; 265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 267 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 268 extend_mmu_update(&u);
262 269
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 290 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 292{
286 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 294}
316 295
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 397
419 xen_mc_batch(); 398 xen_mc_batch();
420 399
421 u.ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 402 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 403 extend_mmu_update(&u);
424 404
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
441 421
442void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 423{
424#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 426 smp_wmb();
446 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
447} 431}
448 432
433#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 435{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 437}
453 438
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 447{
463 set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
464} 449}
450#endif /* CONFIG_X86_PAE */
465 451
466pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
467{ 453{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
470} 456}
471 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
472/* 543/*
473 (Yet another) pagetable walker. This one is intended for pinning a 544 * (Yet another) pagetable walker. This one is intended for pinning a
474 pagetable. This means that it walks a pagetable and calls the 545 * pagetable. This means that it walks a pagetable and calls the
475 callback function on each page it finds making up the page table, 546 * callback function on each page it finds making up the page table,
476 at every level. It walks the entire pagetable, but it only bothers 547 * at every level. It walks the entire pagetable, but it only bothers
477 pinning pte pages which are below pte_limit. In the normal case 548 * pinning pte pages which are below limit. In the normal case this
478 this will be TASK_SIZE, but at boot we need to pin up to 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 550 * FIXADDR_TOP.
480 there, because then we start getting into Xen's ptes. 551 *
481*/ 552 * For 32-bit the important bit is that we don't pin beyond there,
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
483 unsigned long limit) 559 unsigned long limit)
484{ 560{
485 pgd_t *pgd = pgd_base;
486 int flush = 0; 561 int flush = 0;
487 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
489 565
490 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
491 569
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 571 return 0;
494 572
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole and so is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 596 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 597
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
500 600
501 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
502 continue; 602 continue;
503 603
504 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
505 605
506 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
508 608
509 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 610 pmd_t *pmd;
511 unsigned long pmd_limit;
512 611
513 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
514 613 pudidx > pudidx_limit)
515 if (pud_next < limit) 614 goto out;
516 pmd_limit = pud_next;
517 else
518 pmd_limit = limit;
519 615
520 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
521 continue; 617 continue;
522 618
523 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
524 620
525 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
527 623
528 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
530 if ((pmd_limit-1) < (addr-1)) { 626
531 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
532 break; 628 pudidx == pudidx_limit &&
533 } 629 pmdidx > pmdidx_limit)
630 goto out;
534 631
535 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
536 continue; 633 continue;
537 634
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
539 } 637 }
540 } 638 }
541 } 639 }
542 640out:
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
544 641
545 return flush; 642 return flush;
546} 643}
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
622{ 719{
623 xen_mc_batch(); 720 xen_mc_batch();
624 721
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 724 xen_mc_issue(0);
628 kmap_flush_unused(); 725 kmap_flush_unused();
629 xen_mc_batch(); 726 xen_mc_batch();
630 } 727 }
631 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 747 xen_mc_issue(0);
634} 748}
635 749
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 770 spin_unlock_irqrestore(&pgd_lock, flags);
657} 771}
658 772
659/* The init_mm pagetable is really pinned as soon as its created, but 773/*
660 that's before we have page structures to store the bits. So do all 774 * The init_mm pagetable is really pinned as soon as its created, but
661 the book-keeping now. */ 775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
662static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
663{ 779{
664 SetPagePinned(page); 780 SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
708 824
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 826
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
712 844
713 xen_mc_issue(0); 845 xen_mc_issue(0);
714} 846}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 859 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 860 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 861 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page)); 862 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 863 ClearPageSavePinned(page);
733 } 864 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
758{ 889{
759 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
760 898
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
763 901
764 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8bc..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 32void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 43void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 46
53void xen_pmd_clear(pmd_t *pmdp); 47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 55
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed1..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde3..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7f..e693812ac59a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
15 * This does not handle HOTPLUG_CPU yet. 15 * This does not handle HOTPLUG_CPU yet.
16 */ 16 */
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
18#include <linux/err.h> 19#include <linux/err.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20 21
@@ -35,6 +36,8 @@
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
38cpumask_t xen_cpu_initialized_map; 41cpumask_t xen_cpu_initialized_map;
39 42
40static DEFINE_PER_CPU(int, resched_irq); 43static DEFINE_PER_CPU(int, resched_irq);
@@ -66,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
66 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
67 70
68 cpu_init(); 71 cpu_init();
72 preempt_disable();
73
69 xen_enable_sysenter(); 74 xen_enable_sysenter();
75 xen_enable_syscall();
70 76
71 preempt_disable(); 77 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 78 smp_store_cpu_info(cpu);
79 cpu_data(cpu).x86_max_cores = 1;
80 set_cpu_sibling_map(cpu);
73 81
74 xen_setup_cpu_clockevents(); 82 xen_setup_cpu_clockevents();
75 83
84 cpu_set(cpu, cpu_online_map);
85 x86_write_percpu(cpu_state, CPU_ONLINE);
86 wmb();
87
76 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 89 local_irq_enable();
78 90
@@ -141,56 +153,39 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 153 return rc;
142} 154}
143 155
144void __init xen_fill_possible_map(void) 156static void __init xen_fill_possible_map(void)
145{ 157{
146 int i, rc; 158 int i, rc;
147 159
148 for (i = 0; i < NR_CPUS; i++) { 160 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 161 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 162 if (rc >= 0) {
163 num_processors++;
151 cpu_set(i, cpu_possible_map); 164 cpu_set(i, cpu_possible_map);
165 }
152 } 166 }
153} 167}
154 168
155void __init xen_smp_prepare_boot_cpu(void) 169static void __init xen_smp_prepare_boot_cpu(void)
156{ 170{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 171 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 172 native_smp_prepare_boot_cpu();
161 173
162 /* We've switched to the "real" per-cpu gdt, so make sure the 174 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 175 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 176 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 177
176 xen_setup_vcpu_info_placement(); 178 xen_setup_vcpu_info_placement();
177} 179}
178 180
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 181static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 182{
181 unsigned cpu; 183 unsigned cpu;
182 184
183 for_each_possible_cpu(cpu) { 185 xen_init_lock_cpu(0);
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192 186
193 smp_store_cpu_info(0); 187 smp_store_cpu_info(0);
188 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 189 set_cpu_sibling_map(0);
195 190
196 if (xen_smp_intr_init(0)) 191 if (xen_smp_intr_init(0))
@@ -225,7 +220,7 @@ static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 220cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 221{
227 struct vcpu_guest_context *ctxt; 222 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 223 struct desc_struct *gdt;
229 224
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 225 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 226 return 0;
@@ -234,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
234 if (ctxt == NULL) 229 if (ctxt == NULL)
235 return -ENOMEM; 230 return -ENOMEM;
236 231
232 gdt = get_cpu_gdt_table(cpu);
233
237 ctxt->flags = VGCF_IN_KERNEL; 234 ctxt->flags = VGCF_IN_KERNEL;
238 ctxt->user_regs.ds = __USER_DS; 235 ctxt->user_regs.ds = __USER_DS;
239 ctxt->user_regs.es = __USER_DS; 236 ctxt->user_regs.es = __USER_DS;
240 ctxt->user_regs.fs = __KERNEL_PERCPU;
241 ctxt->user_regs.gs = 0;
242 ctxt->user_regs.ss = __KERNEL_DS; 237 ctxt->user_regs.ss = __KERNEL_DS;
238#ifdef CONFIG_X86_32
239 ctxt->user_regs.fs = __KERNEL_PERCPU;
240#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
245 243
@@ -249,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
249 247
250 ctxt->ldt_ents = 0; 248 ctxt->ldt_ents = 0;
251 249
252 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 250 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
253 make_lowmem_page_readonly(gdt->gdt); 251 make_lowmem_page_readonly(gdt);
254 252
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 253 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
256 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 254 ctxt->gdt_ents = GDT_ENTRIES;
257 255
258 ctxt->user_regs.cs = __KERNEL_CS; 256 ctxt->user_regs.cs = __KERNEL_CS;
259 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 257 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
261 ctxt->kernel_ss = __KERNEL_DS; 259 ctxt->kernel_ss = __KERNEL_DS;
262 ctxt->kernel_sp = idle->thread.sp0; 260 ctxt->kernel_sp = idle->thread.sp0;
263 261
262#ifdef CONFIG_X86_32
264 ctxt->event_callback_cs = __KERNEL_CS; 263 ctxt->event_callback_cs = __KERNEL_CS;
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
266 ctxt->failsafe_callback_cs = __KERNEL_CS; 264 ctxt->failsafe_callback_cs = __KERNEL_CS;
265#endif
266 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
268 268
269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
276 return 0; 276 return 0;
277} 277}
278 278
279int __cpuinit xen_cpu_up(unsigned int cpu) 279static int __cpuinit xen_cpu_up(unsigned int cpu)
280{ 280{
281 struct task_struct *idle = idle_task(cpu); 281 struct task_struct *idle = idle_task(cpu);
282 int rc; 282 int rc;
@@ -287,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
287 return rc; 287 return rc;
288#endif 288#endif
289 289
290#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0);
293 if (cpu > 0) {
294 rc = get_local_pda(cpu);
295 if (rc)
296 return rc;
297 }
298#endif
299
300#ifdef CONFIG_X86_32
290 init_gdt(cpu); 301 init_gdt(cpu);
291 per_cpu(current_task, cpu) = idle; 302 per_cpu(current_task, cpu) = idle;
292 irq_ctx_init(cpu); 303 irq_ctx_init(cpu);
304#else
305 cpu_pda(cpu)->pcurrent = idle;
306 clear_tsk_thread_flag(idle, TIF_FORK);
307#endif
293 xen_setup_timer(cpu); 308 xen_setup_timer(cpu);
309 xen_init_lock_cpu(cpu);
310
311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
294 312
295 /* make sure interrupts start blocked */ 313 /* make sure interrupts start blocked */
296 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 314 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
306 if (rc) 324 if (rc)
307 return rc; 325 return rc;
308 326
309 smp_store_cpu_info(cpu);
310 set_cpu_sibling_map(cpu);
311 /* This must be done before setting cpu_online_map */
312 wmb();
313
314 cpu_set(cpu, cpu_online_map);
315
316 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 327 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
317 BUG_ON(rc); 328 BUG_ON(rc);
318 329
330 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
331 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
332 barrier();
333 }
334
319 return 0; 335 return 0;
320} 336}
321 337
322void xen_smp_cpus_done(unsigned int max_cpus) 338static void xen_smp_cpus_done(unsigned int max_cpus)
323{ 339{
324} 340}
325 341
@@ -335,12 +351,12 @@ static void stop_self(void *v)
335 BUG(); 351 BUG();
336} 352}
337 353
338void xen_smp_send_stop(void) 354static void xen_smp_send_stop(void)
339{ 355{
340 smp_call_function(stop_self, NULL, 0); 356 smp_call_function(stop_self, NULL, 0);
341} 357}
342 358
343void xen_smp_send_reschedule(int cpu) 359static void xen_smp_send_reschedule(int cpu)
344{ 360{
345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 361 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
346} 362}
@@ -355,7 +371,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
355 xen_send_IPI_one(cpu, vector); 371 xen_send_IPI_one(cpu, vector);
356} 372}
357 373
358void xen_smp_send_call_function_ipi(cpumask_t mask) 374static void xen_smp_send_call_function_ipi(cpumask_t mask)
359{ 375{
360 int cpu; 376 int cpu;
361 377
@@ -370,7 +386,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
370 } 386 }
371} 387}
372 388
373void xen_smp_send_call_function_single_ipi(int cpu) 389static void xen_smp_send_call_function_single_ipi(int cpu)
374{ 390{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 391 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376} 392}
@@ -379,7 +395,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
379{ 395{
380 irq_enter(); 396 irq_enter();
381 generic_smp_call_function_interrupt(); 397 generic_smp_call_function_interrupt();
398#ifdef CONFIG_X86_32
382 __get_cpu_var(irq_stat).irq_call_count++; 399 __get_cpu_var(irq_stat).irq_call_count++;
400#else
401 add_pda(irq_call_count, 1);
402#endif
383 irq_exit(); 403 irq_exit();
384 404
385 return IRQ_HANDLED; 405 return IRQ_HANDLED;
@@ -389,8 +409,196 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
389{ 409{
390 irq_enter(); 410 irq_enter();
391 generic_smp_call_function_single_interrupt(); 411 generic_smp_call_function_single_interrupt();
412#ifdef CONFIG_X86_32
392 __get_cpu_var(irq_stat).irq_call_count++; 413 __get_cpu_var(irq_stat).irq_call_count++;
414#else
415 add_pda(irq_call_count, 1);
416#endif
393 irq_exit(); 417 irq_exit();
394 418
395 return IRQ_HANDLED; 419 return IRQ_HANDLED;
396} 420}
421
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done,
591
592 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule,
594
595 .send_call_func_ipi = xen_smp_send_call_function_ipi,
596 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
597};
598
599void __init xen_smp_init(void)
600{
601 smp_ops = xen_smp_ops;
602 xen_fill_possible_map();
603 xen_init_spinlocks();
604}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d4..2a234db5949b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
38 xen_cpu_initialized_map = cpu_online_map; 38 xen_cpu_initialized_map = cpu_online_map;
39#endif 39#endif
40 xen_vcpu_restore(); 40 xen_vcpu_restore();
41 xen_timer_resume();
42 } 41 }
43 42
44} 43}
45 44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..4038cbfe3331
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
34 event status with one and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $VGCF_in_syscall
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0a..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
10#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
11 15
12 __INIT 16 __INIT
13ENTRY(startup_xen) 17ENTRY(startup_xen)
14 movl %esi,xen_start_info
15 cld 18 cld
16 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
17 jmp xen_start_kernel 26 jmp xen_start_kernel
18 27
19 __FINIT 28 __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
21.pushsection .text 30.pushsection .text
22 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
23ENTRY(hypercall_page) 32ENTRY(hypercall_page)
24 .skip 0x1000 33 .skip PAGE_SIZE_asm
25.popsection 34.popsection
26 35
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
28 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
29 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
30 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
31 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
32 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
33 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
36 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
37 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) 50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
38 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) 51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
39 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) 52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
40 54
41#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c2..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
26void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
27void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
28void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
29void xen_vcpu_restore(void); 30void xen_vcpu_restore(void);
30 31
31void __init xen_build_dynamic_phys_to_machine(void); 32void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
37unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
38int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
39unsigned long long xen_sched_clock(void); 40unsigned long long xen_sched_clock(void);
40void xen_timer_resume(void);
41 41
42irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 42irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
43 43
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
45 45
46void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
47 47
48void __init xen_fill_possible_map(void);
49
50void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
51void xen_smp_prepare_boot_cpu(void);
52void xen_smp_prepare_cpus(unsigned int max_cpus);
53int xen_cpu_up(unsigned int cpu);
54void xen_smp_cpus_done(unsigned int max_cpus);
55 49
56void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
57void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
58void xen_smp_send_call_function_ipi(cpumask_t mask);
59void xen_smp_send_call_function_single_ipi(int cpu);
60 52
61extern cpumask_t xen_cpu_initialized_map; 53extern cpumask_t xen_cpu_initialized_map;
54#else
55static inline void xen_smp_init(void) {}
56#endif
62 57
63 58
64/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
73DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
74DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
75 70
71/* These are not functions, and cannot be called normally */
76void xen_iret(void); 72void xen_iret(void);
77void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
78 77
79#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */