aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Kconfig.cpu4
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/boot/edd.c5
-rw-r--r--arch/x86/boot/pm.c6
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/ia32/ia32_signal.c11
-rw-r--r--arch/x86/ia32/ia32entry.S18
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/sleep.c10
-rw-r--r--arch/x86/kernel/amd_iommu.c231
-rw-r--r--arch/x86/kernel/amd_iommu_init.c357
-rw-r--r--arch/x86/kernel/aperture_64.c1
-rw-r--r--arch/x86/kernel/apic_32.c175
-rw-r--r--arch/x86/kernel/apic_64.c26
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/bios_uv.c48
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/amd_64.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c23
-rw-r--r--arch/x86/kernel/cpu/common_64.c15
-rw-r--r--arch/x86/kernel/cpu/intel.c10
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c7
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c4
-rw-r--r--arch/x86/kernel/e820.c33
-rw-r--r--arch/x86/kernel/early-quirks.c5
-rw-r--r--arch/x86/kernel/entry_32.S23
-rw-r--r--arch/x86/kernel/entry_64.S120
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c23
-rw-r--r--arch/x86/kernel/head64.c11
-rw-r--r--arch/x86/kernel/head_64.S1
-rw-r--r--arch/x86/kernel/io_apic_32.c53
-rw-r--r--arch/x86/kernel/io_apic_64.c41
-rw-r--r--arch/x86/kernel/io_delay.c3
-rw-r--r--arch/x86/kernel/ipi.c6
-rw-r--r--arch/x86/kernel/irq_32.c7
-rw-r--r--arch/x86/kernel/kdebugfs.c8
-rw-r--r--arch/x86/kernel/kprobes.c1
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/module_64.c10
-rw-r--r--arch/x86/kernel/mpparse.c208
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/numaq_32.c197
-rw-r--r--arch/x86/kernel/paravirt.c5
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c17
-rw-r--r--arch/x86/kernel/pci-gart_64.c1
-rw-r--r--arch/x86/kernel/pci-nommu.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c2
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/process_64.c56
-rw-r--r--arch/x86/kernel/ptrace.c151
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/setup.c22
-rw-r--r--arch/x86/kernel/signal_32.c8
-rw-r--r--arch/x86/kernel/signal_64.c6
-rw-r--r--arch/x86/kernel/smpboot.c56
-rw-r--r--arch/x86/kernel/smpcommon_32.c1
-rw-r--r--arch/x86/kernel/step.c35
-rw-r--r--arch/x86/kernel/time_32.c1
-rw-r--r--arch/x86/kernel/traps_32.c118
-rw-r--r--arch/x86/kernel/traps_64.c48
-rw-r--r--arch/x86/kernel/visws_quirks.c42
-rw-r--r--arch/x86/kernel/vmi_32.c1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/i8254.c24
-rw-r--r--arch/x86/kvm/i8259.c9
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c14
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c62
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h28
-rw-r--r--arch/x86/kvm/svm.c131
-rw-r--r--arch/x86/kvm/vmx.c230
-rw-r--r--arch/x86/kvm/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c299
-rw-r--r--arch/x86/kvm/x86_emulate.c257
-rw-r--r--arch/x86/lguest/boot.c1
-rw-r--r--arch/x86/mach-default/setup.c34
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/init_32.c5
-rw-r--r--arch/x86/mm/init_64.c112
-rw-r--r--arch/x86/mm/memtest.c123
-rw-r--r--arch/x86/mm/pat.c94
-rw-r--r--arch/x86/pci/Makefile12
-rw-r--r--arch/x86/pci/legacy.c9
-rw-r--r--arch/x86/pci/numaq_32.c (renamed from arch/x86/pci/numa.c)4
-rw-r--r--arch/x86/pci/pci.h3
-rw-r--r--arch/x86/pci/visws.c23
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/vdso/vdso32.S13
-rw-r--r--arch/x86/vdso/vma.c11
-rw-r--r--arch/x86/xen/Kconfig14
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c689
-rw-r--r--arch/x86/xen/mmu.c316
-rw-r--r--arch/x86/xen/mmu.h29
-rw-r--r--arch/x86/xen/multicalls.c1
-rw-r--r--arch/x86/xen/setup.c79
-rw-r--r--arch/x86/xen/smp.c137
-rw-r--r--arch/x86/xen/suspend.c5
-rw-r--r--arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)0
-rw-r--r--arch/x86/xen/xen-asm_64.S271
-rw-r--r--arch/x86/xen/xen-head.S28
-rw-r--r--arch/x86/xen/xen-ops.h21
108 files changed, 3642 insertions, 1824 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96e0c2ebc38..03980cb0429 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -447,7 +447,6 @@ config PARAVIRT_DEBUG
447 447
448config MEMTEST 448config MEMTEST
449 bool "Memtest" 449 bool "Memtest"
450 depends on X86_64
451 help 450 help
452 This option adds a kernel parameter 'memtest', which allows memtest 451 This option adds a kernel parameter 'memtest', which allows memtest
453 to be set. 452 to be set.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index abff1b84ed5..54b8c02c71e 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
362 def_bool y 362 def_bool y
363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
364 364
365config X86_GOOD_APIC
366 def_bool y
367 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
368
369config X86_INTEL_USERCOPY 365config X86_INTEL_USERCOPY
370 def_bool y 366 def_bool y
371 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 367 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ae36bfa814e..85a87d2ac0c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
5 5
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config NONPROMISC_DEVMEM 8config STRICT_DEVMEM
9 bool "Filter access to /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 help
11 If this option is left off, you allow userspace access to all 11 If this option is left on, you allow userspace (root) access to all
12 of memory, including kernel and userspace memory. Accidental 12 of memory, including kernel and userspace memory. Accidental
13 access to this is obviously disastrous, but specific access can 13 access to this is obviously disastrous, but specific access can
14 be used by people debugging the kernel. 14 be used by people debugging the kernel. Note that with PAT support
15 enabled, even in this case there are restrictions on /dev/mem
16 use due to the cache aliasing requirements.
15 17
16 If this option is switched on, the /dev/mem file only allows 18 If this option is switched on, the /dev/mem file only allows
17 userspace access to PCI space and the BIOS code and data regions. 19 userspace access to PCI space and the BIOS code and data regions.
@@ -287,7 +289,6 @@ config CPA_DEBUG
287 289
288config OPTIMIZE_INLINING 290config OPTIMIZE_INLINING
289 bool "Allow gcc to uninline functions marked 'inline'" 291 bool "Allow gcc to uninline functions marked 'inline'"
290 depends on BROKEN
291 help 292 help
292 This option determines if the kernel forces gcc to inline the functions 293 This option determines if the kernel forces gcc to inline the functions
293 developers have marked 'inline'. Doing so takes away freedom from gcc to 294 developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -298,5 +299,7 @@ config OPTIMIZE_INLINING
298 become the default in the future, until then this option is there to 299 become the default in the future, until then this option is there to
299 test gcc for this. 300 test gcc for this.
300 301
302 If unsure, say N.
303
301endmenu 304endmenu
302 305
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 03399d64013..d93cbc6464d 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -167,9 +167,8 @@ void query_edd(void)
167 * Scan the BIOS-supported hard disks and query EDD 167 * Scan the BIOS-supported hard disks and query EDD
168 * information... 168 * information...
169 */ 169 */
170 get_edd_info(devno, &ei); 170 if (!get_edd_info(devno, &ei)
171 171 && boot_params.eddbuf_entries < EDDMAXNR) {
172 if (boot_params.eddbuf_entries < EDDMAXNR) {
173 memcpy(edp, &ei, sizeof ei); 172 memcpy(edp, &ei, sizeof ei);
174 edp++; 173 edp++;
175 boot_params.eddbuf_entries++; 174 boot_params.eddbuf_entries++;
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 328956fdb59..85a1cd8a8ff 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
98/* 98/*
99 * Set up the GDT 99 * Set up the GDT
100 */ 100 */
101#define GDT_ENTRY(flags, base, limit) \
102 (((u64)(base & 0xff000000) << 32) | \
103 ((u64)flags << 40) | \
104 ((u64)(limit & 0x00ff0000) << 32) | \
105 ((u64)(base & 0x00ffffff) << 16) | \
106 ((u64)(limit & 0x0000ffff)))
107 101
108struct gdt_ptr { 102struct gdt_ptr {
109 u16 len; 103 u16 len;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 9bc34e2033e..4d73f53287b 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -2047,7 +2047,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2047# CONFIG_SAMPLES is not set 2047# CONFIG_SAMPLES is not set
2048# CONFIG_KGDB is not set 2048# CONFIG_KGDB is not set
2049CONFIG_HAVE_ARCH_KGDB=y 2049CONFIG_HAVE_ARCH_KGDB=y
2050# CONFIG_NONPROMISC_DEVMEM is not set 2050# CONFIG_STRICT_DEVMEM is not set
2051CONFIG_EARLY_PRINTK=y 2051CONFIG_EARLY_PRINTK=y
2052CONFIG_DEBUG_STACKOVERFLOW=y 2052CONFIG_DEBUG_STACKOVERFLOW=y
2053CONFIG_DEBUG_STACK_USAGE=y 2053CONFIG_DEBUG_STACK_USAGE=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ae5124e064d..a4045242962 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -2012,7 +2012,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2012# CONFIG_SAMPLES is not set 2012# CONFIG_SAMPLES is not set
2013# CONFIG_KGDB is not set 2013# CONFIG_KGDB is not set
2014CONFIG_HAVE_ARCH_KGDB=y 2014CONFIG_HAVE_ARCH_KGDB=y
2015# CONFIG_NONPROMISC_DEVMEM is not set 2015# CONFIG_STRICT_DEVMEM is not set
2016CONFIG_EARLY_PRINTK=y 2016CONFIG_EARLY_PRINTK=y
2017CONFIG_DEBUG_STACKOVERFLOW=y 2017CONFIG_DEBUG_STACKOVERFLOW=y
2018CONFIG_DEBUG_STACK_USAGE=y 2018CONFIG_DEBUG_STACK_USAGE=y
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb3856a18c8..20af4c79579 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
36 36
37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
38 38
39#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
40 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF)
43
39asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); 44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
40void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 45void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
41 46
@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
248 regs->ss |= 3; 253 regs->ss |= 3;
249 254
250 err |= __get_user(tmpflags, &sc->flags); 255 err |= __get_user(tmpflags, &sc->flags);
251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); 256 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
252 /* disable syscall checks */ 257 /* disable syscall checks */
253 regs->orig_ax = -1; 258 regs->orig_ax = -1;
254 259
@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
515 compat_sigset_t *set, struct pt_regs *regs) 520 compat_sigset_t *set, struct pt_regs *regs)
516{ 521{
517 struct rt_sigframe __user *frame; 522 struct rt_sigframe __user *frame;
518 struct exec_domain *ed = current_thread_info()->exec_domain;
519 void __user *restorer; 523 void __user *restorer;
520 int err = 0; 524 int err = 0;
521 525
@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
538 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 542 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
539 goto give_sigsegv; 543 goto give_sigsegv;
540 544
541 err |= __put_user((ed && ed->signal_invmap && sig < 32 545 err |= __put_user(sig, &frame->sig);
542 ? ed->signal_invmap[sig] : sig), &frame->sig);
543 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 546 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
544 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 547 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
545 err |= copy_siginfo_to_user32(&frame->info, info); 548 err |= copy_siginfo_to_user32(&frame->info, info);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e..23d146ce676 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -37,6 +37,11 @@
37 movq %rax,R8(%rsp) 37 movq %rax,R8(%rsp)
38 .endm 38 .endm
39 39
40 /*
41 * Reload arg registers from stack in case ptrace changed them.
42 * We don't reload %eax because syscall_trace_enter() returned
43 * the value it wants us to use in the table lookup.
44 */
40 .macro LOAD_ARGS32 offset 45 .macro LOAD_ARGS32 offset
41 movl \offset(%rsp),%r11d 46 movl \offset(%rsp),%r11d
42 movl \offset+8(%rsp),%r10d 47 movl \offset+8(%rsp),%r10d
@@ -46,7 +51,6 @@
46 movl \offset+48(%rsp),%edx 51 movl \offset+48(%rsp),%edx
47 movl \offset+56(%rsp),%esi 52 movl \offset+56(%rsp),%esi
48 movl \offset+64(%rsp),%edi 53 movl \offset+64(%rsp),%edi
49 movl \offset+72(%rsp),%eax
50 .endm 54 .endm
51 55
52 .macro CFI_STARTPROC32 simple 56 .macro CFI_STARTPROC32 simple
@@ -137,13 +141,12 @@ ENTRY(ia32_sysenter_target)
137 .previous 141 .previous
138 GET_THREAD_INFO(%r10) 142 GET_THREAD_INFO(%r10)
139 orl $TS_COMPAT,TI_status(%r10) 143 orl $TS_COMPAT,TI_status(%r10)
140 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 144 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
141 TI_flags(%r10)
142 CFI_REMEMBER_STATE 145 CFI_REMEMBER_STATE
143 jnz sysenter_tracesys 146 jnz sysenter_tracesys
144sysenter_do_call:
145 cmpl $(IA32_NR_syscalls-1),%eax 147 cmpl $(IA32_NR_syscalls-1),%eax
146 ja ia32_badsys 148 ja ia32_badsys
149sysenter_do_call:
147 IA32_ARG_FIXUP 1 150 IA32_ARG_FIXUP 1
148 call *ia32_sys_call_table(,%rax,8) 151 call *ia32_sys_call_table(,%rax,8)
149 movq %rax,RAX-ARGOFFSET(%rsp) 152 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -242,8 +245,7 @@ ENTRY(ia32_cstar_target)
242 .previous 245 .previous
243 GET_THREAD_INFO(%r10) 246 GET_THREAD_INFO(%r10)
244 orl $TS_COMPAT,TI_status(%r10) 247 orl $TS_COMPAT,TI_status(%r10)
245 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 248 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
246 TI_flags(%r10)
247 CFI_REMEMBER_STATE 249 CFI_REMEMBER_STATE
248 jnz cstar_tracesys 250 jnz cstar_tracesys
249cstar_do_call: 251cstar_do_call:
@@ -321,6 +323,7 @@ ENTRY(ia32_syscall)
321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 323 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
322 /*CFI_REL_OFFSET cs,CS-RIP*/ 324 /*CFI_REL_OFFSET cs,CS-RIP*/
323 CFI_REL_OFFSET rip,RIP-RIP 325 CFI_REL_OFFSET rip,RIP-RIP
326 PARAVIRT_ADJUST_EXCEPTION_FRAME
324 SWAPGS 327 SWAPGS
325 /* 328 /*
326 * No need to follow this irqs on/off section: the syscall 329 * No need to follow this irqs on/off section: the syscall
@@ -336,8 +339,7 @@ ENTRY(ia32_syscall)
336 SAVE_ARGS 0,0,1 339 SAVE_ARGS 0,0,1
337 GET_THREAD_INFO(%r10) 340 GET_THREAD_INFO(%r10)
338 orl $TS_COMPAT,TI_status(%r10) 341 orl $TS_COMPAT,TI_status(%r10)
339 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 342 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
340 TI_flags(%r10)
341 jnz ia32_tracesys 343 jnz ia32_tracesys
342ia32_do_syscall: 344ia32_do_syscall:
343 cmpl $(IA32_NR_syscalls-1),%eax 345 cmpl $(IA32_NR_syscalls-1),%eax
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb5..b78a17b1281 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_OLPC) += olpc.o
102# 64 bit specific files 102# 64 bit specific files
103ifeq ($(CONFIG_X86_64),y) 103ifeq ($(CONFIG_X86_64),y)
104 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 104 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
105 obj-y += bios_uv.o
105 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 106 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
106 obj-$(CONFIG_AUDIT) += audit_64.o 107 obj-$(CONFIG_AUDIT) += audit_64.o
107 108
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 868de3d5c39..a3ddad18aaa 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h>
12 13
13#include "realmode/wakeup.h" 14#include "realmode/wakeup.h"
14#include "sleep.h" 15#include "sleep.h"
@@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
23static char temp_stack[10240]; 24static char temp_stack[10240];
24#endif 25#endif
25 26
26/* XXX: this macro should move to asm-x86/segment.h and be shared with the
27 boot code... */
28#define GDT_ENTRY(flags, base, limit) \
29 (((u64)(base & 0xff000000) << 32) | \
30 ((u64)flags << 40) | \
31 ((u64)(limit & 0x00ff0000) << 32) | \
32 ((u64)(base & 0x00ffffff) << 16) | \
33 ((u64)(limit & 0x0000ffff)))
34
35/** 27/**
36 * acpi_save_state_mem - save kernel state 28 * acpi_save_state_mem - save kernel state
37 * 29 *
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a..c25210e6ac8 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,7 +23,7 @@
23#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 24#include <linux/iommu-helper.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/gart.h> 26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h> 27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 28#include <asm/amd_iommu.h>
29 29
@@ -32,21 +32,37 @@
32#define to_pages(addr, size) \ 32#define to_pages(addr, size) \
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) 33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34 34
35#define EXIT_LOOP_COUNT 10000000
36
35static DEFINE_RWLOCK(amd_iommu_devtable_lock); 37static DEFINE_RWLOCK(amd_iommu_devtable_lock);
36 38
37struct command { 39/*
40 * general struct to manage commands send to an IOMMU
41 */
42struct iommu_cmd {
38 u32 data[4]; 43 u32 data[4];
39}; 44};
40 45
41static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 46static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
42 struct unity_map_entry *e); 47 struct unity_map_entry *e);
43 48
49/* returns !0 if the IOMMU is caching non-present entries in its TLB */
44static int iommu_has_npcache(struct amd_iommu *iommu) 50static int iommu_has_npcache(struct amd_iommu *iommu)
45{ 51{
46 return iommu->cap & IOMMU_CAP_NPCACHE; 52 return iommu->cap & IOMMU_CAP_NPCACHE;
47} 53}
48 54
49static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 55/****************************************************************************
56 *
57 * IOMMU command queuing functions
58 *
59 ****************************************************************************/
60
61/*
62 * Writes the command to the IOMMUs command buffer and informs the
63 * hardware about the new command. Must be called with iommu->lock held.
64 */
65static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
50{ 66{
51 u32 tail, head; 67 u32 tail, head;
52 u8 *target; 68 u8 *target;
@@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
63 return 0; 79 return 0;
64} 80}
65 81
66static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 82/*
83 * General queuing function for commands. Takes iommu->lock and calls
84 * __iommu_queue_command().
85 */
86static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
67{ 87{
68 unsigned long flags; 88 unsigned long flags;
69 int ret; 89 int ret;
@@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
75 return ret; 95 return ret;
76} 96}
77 97
98/*
99 * This function is called whenever we need to ensure that the IOMMU has
100 * completed execution of all commands we sent. It sends a
101 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
102 * us about that by writing a value to a physical address we pass with
103 * the command.
104 */
78static int iommu_completion_wait(struct amd_iommu *iommu) 105static int iommu_completion_wait(struct amd_iommu *iommu)
79{ 106{
80 int ret; 107 int ret;
81 struct command cmd; 108 struct iommu_cmd cmd;
82 volatile u64 ready = 0; 109 volatile u64 ready = 0;
83 unsigned long ready_phys = virt_to_phys(&ready); 110 unsigned long ready_phys = virt_to_phys(&ready);
111 unsigned long i = 0;
84 112
85 memset(&cmd, 0, sizeof(cmd)); 113 memset(&cmd, 0, sizeof(cmd));
86 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 114 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
87 cmd.data[1] = HIGH_U32(ready_phys); 115 cmd.data[1] = upper_32_bits(ready_phys);
88 cmd.data[2] = 1; /* value written to 'ready' */ 116 cmd.data[2] = 1; /* value written to 'ready' */
89 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 117 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
90 118
@@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
95 if (ret) 123 if (ret)
96 return ret; 124 return ret;
97 125
98 while (!ready) 126 while (!ready && (i < EXIT_LOOP_COUNT)) {
127 ++i;
99 cpu_relax(); 128 cpu_relax();
129 }
130
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
100 133
101 return 0; 134 return 0;
102} 135}
103 136
137/*
138 * Command send function for invalidating a device table entry
139 */
104static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
105{ 141{
106 struct command cmd; 142 struct iommu_cmd cmd;
107 143
108 BUG_ON(iommu == NULL); 144 BUG_ON(iommu == NULL);
109 145
@@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
116 return iommu_queue_command(iommu, &cmd); 152 return iommu_queue_command(iommu, &cmd);
117} 153}
118 154
155/*
156 * Generic command send function for invalidaing TLB entries
157 */
119static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 158static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
120 u64 address, u16 domid, int pde, int s) 159 u64 address, u16 domid, int pde, int s)
121{ 160{
122 struct command cmd; 161 struct iommu_cmd cmd;
123 162
124 memset(&cmd, 0, sizeof(cmd)); 163 memset(&cmd, 0, sizeof(cmd));
125 address &= PAGE_MASK; 164 address &= PAGE_MASK;
126 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 165 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
127 cmd.data[1] |= domid; 166 cmd.data[1] |= domid;
128 cmd.data[2] = LOW_U32(address); 167 cmd.data[2] = LOW_U32(address);
129 cmd.data[3] = HIGH_U32(address); 168 cmd.data[3] = upper_32_bits(address);
130 if (s) 169 if (s) /* size bit - we flush more than one 4kb page */
131 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 170 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
132 if (pde) 171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
133 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
134 173
135 iommu->need_sync = 1; 174 iommu->need_sync = 1;
@@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
137 return iommu_queue_command(iommu, &cmd); 176 return iommu_queue_command(iommu, &cmd);
138} 177}
139 178
179/*
180 * TLB invalidation function which is called from the mapping functions.
181 * It invalidates a single PTE if the range to flush is within a single
182 * page. Otherwise it flushes the whole TLB of the IOMMU.
183 */
140static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 184static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
141 u64 address, size_t size) 185 u64 address, size_t size)
142{ 186{
@@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
159 return 0; 203 return 0;
160} 204}
161 205
206/****************************************************************************
207 *
208 * The functions below are used the create the page table mappings for
209 * unity mapped regions.
210 *
211 ****************************************************************************/
212
213/*
214 * Generic mapping functions. It maps a physical address into a DMA
215 * address space. It allocates the page table pages if necessary.
216 * In the future it can be extended to a generic mapping function
217 * supporting all features of AMD IOMMU page tables like level skipping
218 * and full 64 bit address spaces.
219 */
162static int iommu_map(struct protection_domain *dom, 220static int iommu_map(struct protection_domain *dom,
163 unsigned long bus_addr, 221 unsigned long bus_addr,
164 unsigned long phys_addr, 222 unsigned long phys_addr,
@@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
209 return 0; 267 return 0;
210} 268}
211 269
270/*
271 * This function checks if a specific unity mapping entry is needed for
272 * this specific IOMMU.
273 */
212static int iommu_for_unity_map(struct amd_iommu *iommu, 274static int iommu_for_unity_map(struct amd_iommu *iommu,
213 struct unity_map_entry *entry) 275 struct unity_map_entry *entry)
214{ 276{
@@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
223 return 0; 285 return 0;
224} 286}
225 287
288/*
289 * Init the unity mappings for a specific IOMMU in the system
290 *
291 * Basically iterates over all unity mapping entries and applies them to
292 * the default domain DMA of that IOMMU if necessary.
293 */
226static int iommu_init_unity_mappings(struct amd_iommu *iommu) 294static int iommu_init_unity_mappings(struct amd_iommu *iommu)
227{ 295{
228 struct unity_map_entry *entry; 296 struct unity_map_entry *entry;
@@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
239 return 0; 307 return 0;
240} 308}
241 309
310/*
311 * This function actually applies the mapping to the page table of the
312 * dma_ops domain.
313 */
242static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 314static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
243 struct unity_map_entry *e) 315 struct unity_map_entry *e)
244{ 316{
@@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
261 return 0; 333 return 0;
262} 334}
263 335
336/*
337 * Inits the unity mappings required for a specific device
338 */
264static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 339static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
265 u16 devid) 340 u16 devid)
266{ 341{
@@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
278 return 0; 353 return 0;
279} 354}
280 355
356/****************************************************************************
357 *
358 * The next functions belong to the address allocator for the dma_ops
359 * interface functions. They work like the allocators in the other IOMMU
360 * drivers. Its basically a bitmap which marks the allocated pages in
361 * the aperture. Maybe it could be enhanced in the future to a more
362 * efficient allocator.
363 *
364 ****************************************************************************/
281static unsigned long dma_mask_to_pages(unsigned long mask) 365static unsigned long dma_mask_to_pages(unsigned long mask)
282{ 366{
283 return (mask >> PAGE_SHIFT) + 367 return (mask >> PAGE_SHIFT) +
284 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); 368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
285} 369}
286 370
371/*
372 * The address allocator core function.
373 *
374 * called with domain->lock held
375 */
287static unsigned long dma_ops_alloc_addresses(struct device *dev, 376static unsigned long dma_ops_alloc_addresses(struct device *dev,
288 struct dma_ops_domain *dom, 377 struct dma_ops_domain *dom,
289 unsigned int pages) 378 unsigned int pages)
@@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
317 return address; 406 return address;
318} 407}
319 408
409/*
410 * The address free function.
411 *
412 * called with domain->lock held
413 */
320static void dma_ops_free_addresses(struct dma_ops_domain *dom, 414static void dma_ops_free_addresses(struct dma_ops_domain *dom,
321 unsigned long address, 415 unsigned long address,
322 unsigned int pages) 416 unsigned int pages)
@@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
325 iommu_area_free(dom->bitmap, address, pages); 419 iommu_area_free(dom->bitmap, address, pages);
326} 420}
327 421
422/****************************************************************************
423 *
424 * The next functions belong to the domain allocation. A domain is
425 * allocated for every IOMMU as the default domain. If device isolation
426 * is enabled, every device get its own domain. The most important thing
427 * about domains is the page table mapping the DMA address space they
428 * contain.
429 *
430 ****************************************************************************/
431
328static u16 domain_id_alloc(void) 432static u16 domain_id_alloc(void)
329{ 433{
330 unsigned long flags; 434 unsigned long flags;
@@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
342 return id; 446 return id;
343} 447}
344 448
449/*
450 * Used to reserve address ranges in the aperture (e.g. for exclusion
451 * ranges.
452 */
345static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 453static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
346 unsigned long start_page, 454 unsigned long start_page,
347 unsigned int pages) 455 unsigned int pages)
@@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
382 free_page((unsigned long)p1); 490 free_page((unsigned long)p1);
383} 491}
384 492
493/*
494 * Free a domain, only used if something went wrong in the
495 * allocation path and we need to free an already allocated page table
496 */
385static void dma_ops_domain_free(struct dma_ops_domain *dom) 497static void dma_ops_domain_free(struct dma_ops_domain *dom)
386{ 498{
387 if (!dom) 499 if (!dom)
@@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
396 kfree(dom); 508 kfree(dom);
397} 509}
398 510
511/*
512 * Allocates a new protection domain usable for the dma_ops functions.
513 * It also intializes the page table and the address allocator data
514 * structures required for the dma_ops interface
515 */
399static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 516static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
400 unsigned order) 517 unsigned order)
401{ 518{
@@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
436 dma_dom->bitmap[0] = 1; 553 dma_dom->bitmap[0] = 1;
437 dma_dom->next_bit = 0; 554 dma_dom->next_bit = 0;
438 555
556 /* Intialize the exclusion range if necessary */
439 if (iommu->exclusion_start && 557 if (iommu->exclusion_start &&
440 iommu->exclusion_start < dma_dom->aperture_size) { 558 iommu->exclusion_start < dma_dom->aperture_size) {
441 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 559 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
444 dma_ops_reserve_addresses(dma_dom, startpage, pages); 562 dma_ops_reserve_addresses(dma_dom, startpage, pages);
445 } 563 }
446 564
565 /*
566 * At the last step, build the page tables so we don't need to
567 * allocate page table pages in the dma_ops mapping/unmapping
568 * path.
569 */
447 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 570 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
448 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 571 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
449 GFP_KERNEL); 572 GFP_KERNEL);
@@ -472,6 +595,10 @@ free_dma_dom:
472 return NULL; 595 return NULL;
473} 596}
474 597
598/*
599 * Find out the protection domain structure for a given PCI device. This
600 * will give us the pointer to the page table root for example.
601 */
475static struct protection_domain *domain_for_device(u16 devid) 602static struct protection_domain *domain_for_device(u16 devid)
476{ 603{
477 struct protection_domain *dom; 604 struct protection_domain *dom;
@@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
484 return dom; 611 return dom;
485} 612}
486 613
614/*
615 * If a device is not yet associated with a domain, this function does
616 * assigns it visible for the hardware
617 */
487static void set_device_domain(struct amd_iommu *iommu, 618static void set_device_domain(struct amd_iommu *iommu,
488 struct protection_domain *domain, 619 struct protection_domain *domain,
489 u16 devid) 620 u16 devid)
@@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
508 iommu->need_sync = 1; 639 iommu->need_sync = 1;
509} 640}
510 641
642/*****************************************************************************
643 *
644 * The next functions belong to the dma_ops mapping/unmapping code.
645 *
646 *****************************************************************************/
647
648/*
649 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device.
652 * If the device is not yet associated with a domain this is also done
653 * in this function.
654 */
511static int get_device_resources(struct device *dev, 655static int get_device_resources(struct device *dev,
512 struct amd_iommu **iommu, 656 struct amd_iommu **iommu,
513 struct protection_domain **domain, 657 struct protection_domain **domain,
@@ -520,8 +664,9 @@ static int get_device_resources(struct device *dev,
520 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
521 665
522 pcidev = to_pci_dev(dev); 666 pcidev = to_pci_dev(dev);
523 _bdf = (pcidev->bus->number << 8) | pcidev->devfn; 667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
524 668
669 /* device not translated by any IOMMU in the system? */
525 if (_bdf >= amd_iommu_last_bdf) { 670 if (_bdf >= amd_iommu_last_bdf) {
526 *iommu = NULL; 671 *iommu = NULL;
527 *domain = NULL; 672 *domain = NULL;
@@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
547 return 1; 692 return 1;
548} 693}
549 694
695/*
696 * This is the generic map function. It maps one 4kb page at paddr to
697 * the given address in the DMA address space for the domain.
698 */
550static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 699static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
551 struct dma_ops_domain *dom, 700 struct dma_ops_domain *dom,
552 unsigned long address, 701 unsigned long address,
@@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
578 return (dma_addr_t)address; 727 return (dma_addr_t)address;
579} 728}
580 729
730/*
731 * The generic unmapping function for on page in the DMA address space.
732 */
581static void dma_ops_domain_unmap(struct amd_iommu *iommu, 733static void dma_ops_domain_unmap(struct amd_iommu *iommu,
582 struct dma_ops_domain *dom, 734 struct dma_ops_domain *dom,
583 unsigned long address) 735 unsigned long address)
@@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
597 *pte = 0ULL; 749 *pte = 0ULL;
598} 750}
599 751
752/*
753 * This function contains common code for mapping of a physically
754 * contiguous memory region into DMA address space. It is uses by all
755 * mapping functions provided by this IOMMU driver.
756 * Must be called with the domain lock held.
757 */
600static dma_addr_t __map_single(struct device *dev, 758static dma_addr_t __map_single(struct device *dev,
601 struct amd_iommu *iommu, 759 struct amd_iommu *iommu,
602 struct dma_ops_domain *dma_dom, 760 struct dma_ops_domain *dma_dom,
@@ -628,6 +786,10 @@ out:
628 return address; 786 return address;
629} 787}
630 788
789/*
790 * Does the reverse of the __map_single function. Must be called with
791 * the domain lock held too
792 */
631static void __unmap_single(struct amd_iommu *iommu, 793static void __unmap_single(struct amd_iommu *iommu,
632 struct dma_ops_domain *dma_dom, 794 struct dma_ops_domain *dma_dom,
633 dma_addr_t dma_addr, 795 dma_addr_t dma_addr,
@@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
652 dma_ops_free_addresses(dma_dom, dma_addr, pages); 814 dma_ops_free_addresses(dma_dom, dma_addr, pages);
653} 815}
654 816
817/*
818 * The exported map_single function for dma_ops.
819 */
655static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, 820static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
656 size_t size, int dir) 821 size_t size, int dir)
657{ 822{
@@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
664 get_device_resources(dev, &iommu, &domain, &devid); 829 get_device_resources(dev, &iommu, &domain, &devid);
665 830
666 if (iommu == NULL || domain == NULL) 831 if (iommu == NULL || domain == NULL)
832 /* device not handled by any AMD IOMMU */
667 return (dma_addr_t)paddr; 833 return (dma_addr_t)paddr;
668 834
669 spin_lock_irqsave(&domain->lock, flags); 835 spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +849,9 @@ out:
683 return addr; 849 return addr;
684} 850}
685 851
852/*
853 * The exported unmap_single function for dma_ops.
854 */
686static void unmap_single(struct device *dev, dma_addr_t dma_addr, 855static void unmap_single(struct device *dev, dma_addr_t dma_addr,
687 size_t size, int dir) 856 size_t size, int dir)
688{ 857{
@@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
692 u16 devid; 861 u16 devid;
693 862
694 if (!get_device_resources(dev, &iommu, &domain, &devid)) 863 if (!get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */
695 return; 865 return;
696 866
697 spin_lock_irqsave(&domain->lock, flags); 867 spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
706 spin_unlock_irqrestore(&domain->lock, flags); 876 spin_unlock_irqrestore(&domain->lock, flags);
707} 877}
708 878
879/*
880 * This is a special map_sg function which is used if we should map a
881 * device which is not handled by an AMD IOMMU in the system.
882 */
709static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, 883static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
710 int nelems, int dir) 884 int nelems, int dir)
711{ 885{
@@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
720 return nelems; 894 return nelems;
721} 895}
722 896
897/*
898 * The exported map_sg function for dma_ops (handles scatter-gather
899 * lists).
900 */
723static int map_sg(struct device *dev, struct scatterlist *sglist, 901static int map_sg(struct device *dev, struct scatterlist *sglist,
724 int nelems, int dir) 902 int nelems, int dir)
725{ 903{
@@ -775,6 +953,10 @@ unmap:
775 goto out; 953 goto out;
776} 954}
777 955
956/*
957 * The exported map_sg function for dma_ops (handles scatter-gather
958 * lists).
959 */
778static void unmap_sg(struct device *dev, struct scatterlist *sglist, 960static void unmap_sg(struct device *dev, struct scatterlist *sglist,
779 int nelems, int dir) 961 int nelems, int dir)
780{ 962{
@@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
804 spin_unlock_irqrestore(&domain->lock, flags); 986 spin_unlock_irqrestore(&domain->lock, flags);
805} 987}
806 988
989/*
990 * The exported alloc_coherent function for dma_ops.
991 */
807static void *alloc_coherent(struct device *dev, size_t size, 992static void *alloc_coherent(struct device *dev, size_t size,
808 dma_addr_t *dma_addr, gfp_t flag) 993 dma_addr_t *dma_addr, gfp_t flag)
809{ 994{
@@ -851,6 +1036,11 @@ out:
851 return virt_addr; 1036 return virt_addr;
852} 1037}
853 1038
1039/*
1040 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */
854static void free_coherent(struct device *dev, size_t size, 1044static void free_coherent(struct device *dev, size_t size,
855 void *virt_addr, dma_addr_t dma_addr) 1045 void *virt_addr, dma_addr_t dma_addr)
856{ 1046{
@@ -879,6 +1069,8 @@ free_mem:
879} 1069}
880 1070
881/* 1071/*
1072 * The function for pre-allocating protection domains.
1073 *
882 * If the driver core informs the DMA layer if a driver grabs a device 1074 * If the driver core informs the DMA layer if a driver grabs a device
883 * we don't need to preallocate the protection domains anymore. 1075 * we don't need to preallocate the protection domains anymore.
884 * For now we have to. 1076 * For now we have to.
@@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
921 .unmap_sg = unmap_sg, 1113 .unmap_sg = unmap_sg,
922}; 1114};
923 1115
1116/*
1117 * The function which clues the AMD IOMMU driver into dma_ops.
1118 */
924int __init amd_iommu_init_dma_ops(void) 1119int __init amd_iommu_init_dma_ops(void)
925{ 1120{
926 struct amd_iommu *iommu; 1121 struct amd_iommu *iommu;
927 int order = amd_iommu_aperture_order; 1122 int order = amd_iommu_aperture_order;
928 int ret; 1123 int ret;
929 1124
1125 /*
1126 * first allocate a default protection domain for every IOMMU we
1127 * found in the system. Devices not assigned to any other
1128 * protection domain will be assigned to the default one.
1129 */
930 list_for_each_entry(iommu, &amd_iommu_list, list) { 1130 list_for_each_entry(iommu, &amd_iommu_list, list) {
931 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1131 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
932 if (iommu->default_dom == NULL) 1132 if (iommu->default_dom == NULL)
@@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
936 goto free_domains; 1136 goto free_domains;
937 } 1137 }
938 1138
1139 /*
1140 * If device isolation is enabled, pre-allocate the protection
1141 * domains for each device.
1142 */
939 if (amd_iommu_isolate) 1143 if (amd_iommu_isolate)
940 prealloc_protection_domains(); 1144 prealloc_protection_domains();
941 1145
@@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
947 gart_iommu_aperture = 0; 1151 gart_iommu_aperture = 0;
948#endif 1152#endif
949 1153
1154 /* Make the driver finally visible to the drivers */
950 dma_ops = &amd_iommu_dma_ops; 1155 dma_ops = &amd_iommu_dma_ops;
951 1156
952 return 0; 1157 return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437..c9d8ff2eb13 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -25,20 +25,13 @@
25#include <asm/pci-direct.h> 25#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 26#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 27#include <asm/amd_iommu.h>
28#include <asm/gart.h> 28#include <asm/iommu.h>
29 29
30/* 30/*
31 * definitions for the ACPI scanning code 31 * definitions for the ACPI scanning code
32 */ 32 */
33#define UPDATE_LAST_BDF(x) do {\
34 if ((x) > amd_iommu_last_bdf) \
35 amd_iommu_last_bdf = (x); \
36 } while (0);
37
38#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
39#define PCI_BUS(x) (((x) >> 8) & 0xff) 33#define PCI_BUS(x) (((x) >> 8) & 0xff)
40#define IVRS_HEADER_LENGTH 48 34#define IVRS_HEADER_LENGTH 48
41#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
42 35
43#define ACPI_IVHD_TYPE 0x10 36#define ACPI_IVHD_TYPE 0x10
44#define ACPI_IVMD_TYPE_ALL 0x20 37#define ACPI_IVMD_TYPE_ALL 0x20
@@ -71,6 +64,17 @@
71#define ACPI_DEVFLAG_LINT1 0x80 64#define ACPI_DEVFLAG_LINT1 0x80
72#define ACPI_DEVFLAG_ATSDIS 0x10000000 65#define ACPI_DEVFLAG_ATSDIS 0x10000000
73 66
67/*
68 * ACPI table definitions
69 *
70 * These data structures are laid over the table to parse the important values
71 * out of it.
72 */
73
74/*
75 * structure describing one IOMMU in the ACPI table. Typically followed by one
76 * or more ivhd_entrys.
77 */
74struct ivhd_header { 78struct ivhd_header {
75 u8 type; 79 u8 type;
76 u8 flags; 80 u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
83 u32 reserved; 87 u32 reserved;
84} __attribute__((packed)); 88} __attribute__((packed));
85 89
90/*
91 * A device entry describing which devices a specific IOMMU translates and
92 * which requestor ids they use.
93 */
86struct ivhd_entry { 94struct ivhd_entry {
87 u8 type; 95 u8 type;
88 u16 devid; 96 u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
90 u32 ext; 98 u32 ext;
91} __attribute__((packed)); 99} __attribute__((packed));
92 100
101/*
102 * An AMD IOMMU memory definition structure. It defines things like exclusion
103 * ranges for devices and regions that should be unity mapped.
104 */
93struct ivmd_header { 105struct ivmd_header {
94 u8 type; 106 u8 type;
95 u8 flags; 107 u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
103 115
104static int __initdata amd_iommu_detected; 116static int __initdata amd_iommu_detected;
105 117
106u16 amd_iommu_last_bdf; 118u16 amd_iommu_last_bdf; /* largest PCI device id we have
107struct list_head amd_iommu_unity_map; 119 to handle */
108unsigned amd_iommu_aperture_order = 26; 120LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
109int amd_iommu_isolate; 121 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */
124
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */
110 127
111struct list_head amd_iommu_list; 128/*
129 * Pointer to the device table which is shared by all AMD IOMMUs
130 * it is indexed by the PCI device id or the HT unit id and contains
131 * information about the domain the device belongs to as well as the
132 * page table root pointer.
133 */
112struct dev_table_entry *amd_iommu_dev_table; 134struct dev_table_entry *amd_iommu_dev_table;
135
136/*
137 * The alias table is a driver specific data structure which contains the
138 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
139 * More than one device can share the same requestor id.
140 */
113u16 *amd_iommu_alias_table; 141u16 *amd_iommu_alias_table;
142
143/*
144 * The rlookup table is used to find the IOMMU which is responsible
145 * for a specific device. It is also indexed by the PCI device id.
146 */
114struct amd_iommu **amd_iommu_rlookup_table; 147struct amd_iommu **amd_iommu_rlookup_table;
148
149/*
150 * The pd table (protection domain table) is used to find the protection domain
151 * data structure a device belongs to. Indexed with the PCI device id too.
152 */
115struct protection_domain **amd_iommu_pd_table; 153struct protection_domain **amd_iommu_pd_table;
154
155/*
156 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
157 * to know which ones are already in use.
158 */
116unsigned long *amd_iommu_pd_alloc_bitmap; 159unsigned long *amd_iommu_pd_alloc_bitmap;
117 160
118static u32 dev_table_size; 161static u32 dev_table_size; /* size of the device table */
119static u32 alias_table_size; 162static u32 alias_table_size; /* size of the alias table */
120static u32 rlookup_table_size; 163static u32 rlookup_table_size; /* size if the rlookup table */
121 164
165static inline void update_last_devid(u16 devid)
166{
167 if (devid > amd_iommu_last_bdf)
168 amd_iommu_last_bdf = devid;
169}
170
171static inline unsigned long tbl_size(int entry_size)
172{
173 unsigned shift = PAGE_SHIFT +
174 get_order(amd_iommu_last_bdf * entry_size);
175
176 return 1UL << shift;
177}
178
179/****************************************************************************
180 *
181 * AMD IOMMU MMIO register space handling functions
182 *
183 * These functions are used to program the IOMMU device registers in
184 * MMIO space required for that driver.
185 *
186 ****************************************************************************/
187
188/*
189 * This function set the exclusion range in the IOMMU. DMA accesses to the
190 * exclusion range are passed through untranslated
191 */
122static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 192static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
123{ 193{
124 u64 start = iommu->exclusion_start & PAGE_MASK; 194 u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
137 &entry, sizeof(entry)); 207 &entry, sizeof(entry));
138} 208}
139 209
210/* Programs the physical address of the device table into the IOMMU hardware */
140static void __init iommu_set_device_table(struct amd_iommu *iommu) 211static void __init iommu_set_device_table(struct amd_iommu *iommu)
141{ 212{
142 u32 entry; 213 u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
149 &entry, sizeof(entry)); 220 &entry, sizeof(entry));
150} 221}
151 222
223/* Generic functions to enable/disable certain features of the IOMMU. */
152static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 224static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
153{ 225{
154 u32 ctrl; 226 u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
167 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
168} 240}
169 241
242/* Function to enable the hardware */
170void __init iommu_enable(struct amd_iommu *iommu) 243void __init iommu_enable(struct amd_iommu *iommu)
171{ 244{
172 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
176 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
177} 250}
178 251
252/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one.
255 */
179static u8 * __init iommu_map_mmio_space(u64 address) 256static u8 * __init iommu_map_mmio_space(u64 address)
180{ 257{
181 u8 *ret; 258 u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
199 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); 276 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
200} 277}
201 278
279/****************************************************************************
280 *
281 * The functions below belong to the first pass of AMD IOMMU ACPI table
282 * parsing. In this pass we try to find out the highest device id this
283 * code has to handle. Upon this information the size of the shared data
284 * structures is determined later.
285 *
286 ****************************************************************************/
287
288/*
289 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU
291 */
202static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) 292static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
203{ 293{
204 u32 cap; 294 u32 cap;
205 295
206 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 296 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
207 UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); 297 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
208 298
209 return 0; 299 return 0;
210} 300}
211 301
302/*
303 * After reading the highest device id from the IOMMU PCI capability header
304 * this function looks if there is a higher device id defined in the ACPI table
305 */
212static int __init find_last_devid_from_ivhd(struct ivhd_header *h) 306static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
213{ 307{
214 u8 *p = (void *)h, *end = (void *)h; 308 u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
229 case IVHD_DEV_RANGE_END: 323 case IVHD_DEV_RANGE_END:
230 case IVHD_DEV_ALIAS: 324 case IVHD_DEV_ALIAS:
231 case IVHD_DEV_EXT_SELECT: 325 case IVHD_DEV_EXT_SELECT:
232 UPDATE_LAST_BDF(dev->devid); 326 /* all the above subfield types refer to device ids */
327 update_last_devid(dev->devid);
233 break; 328 break;
234 default: 329 default:
235 break; 330 break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
242 return 0; 337 return 0;
243} 338}
244 339
340/*
341 * Iterate over all IVHD entries in the ACPI table and find the highest device
342 * id which we need to handle. This is the first of three functions which parse
343 * the ACPI table. So we check the checksum here.
344 */
245static int __init find_last_devid_acpi(struct acpi_table_header *table) 345static int __init find_last_devid_acpi(struct acpi_table_header *table)
246{ 346{
247 int i; 347 int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
277 return 0; 377 return 0;
278} 378}
279 379
380/****************************************************************************
381 *
382 * The following functions belong the the code path which parses the ACPI table
383 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
384 * data structures, initialize the device/alias/rlookup table and also
385 * basically initialize the hardware.
386 *
387 ****************************************************************************/
388
389/*
390 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
391 * write commands to that buffer later and the IOMMU will execute them
392 * asynchronously
393 */
280static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) 394static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
281{ 395{
282 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, 396 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
283 get_order(CMD_BUFFER_SIZE)); 397 get_order(CMD_BUFFER_SIZE));
284 u64 entry = 0; 398 u64 entry;
285 399
286 if (cmd_buf == NULL) 400 if (cmd_buf == NULL)
287 return NULL; 401 return NULL;
288 402
289 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 403 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
290 404
291 memset(cmd_buf, 0, CMD_BUFFER_SIZE);
292
293 entry = (u64)virt_to_phys(cmd_buf); 405 entry = (u64)virt_to_phys(cmd_buf);
294 entry |= MMIO_CMD_SIZE_512; 406 entry |= MMIO_CMD_SIZE_512;
295 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 407 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
302 414
303static void __init free_command_buffer(struct amd_iommu *iommu) 415static void __init free_command_buffer(struct amd_iommu *iommu)
304{ 416{
305 if (iommu->cmd_buf) 417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
306 free_pages((unsigned long)iommu->cmd_buf,
307 get_order(CMD_BUFFER_SIZE));
308} 418}
309 419
420/* sets a specific bit in the device table entry. */
310static void set_dev_entry_bit(u16 devid, u8 bit) 421static void set_dev_entry_bit(u16 devid, u8 bit)
311{ 422{
312 int i = (bit >> 5) & 0x07; 423 int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
315 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 426 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
316} 427}
317 428
318static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) 429/* Writes the specific IOMMU for a device into the rlookup table */
430static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
431{
432 amd_iommu_rlookup_table[devid] = iommu;
433}
434
435/*
436 * This function takes the device specific flags read from the ACPI
437 * table and sets up the device table entry with that information
438 */
439static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
440 u16 devid, u32 flags, u32 ext_flags)
319{ 441{
320 if (flags & ACPI_DEVFLAG_INITPASS) 442 if (flags & ACPI_DEVFLAG_INITPASS)
321 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); 443 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
331 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); 453 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
332 if (flags & ACPI_DEVFLAG_LINT1) 454 if (flags & ACPI_DEVFLAG_LINT1)
333 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 455 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
334}
335 456
336static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 457 set_iommu_for_device(iommu, devid);
337{
338 amd_iommu_rlookup_table[devid] = iommu;
339} 458}
340 459
460/*
461 * Reads the device exclusion range from ACPI and initialize IOMMU with
462 * it
463 */
341static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) 464static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
342{ 465{
343 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 466 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
346 return; 469 return;
347 470
348 if (iommu) { 471 if (iommu) {
472 /*
473 * We only can configure exclusion ranges per IOMMU, not
474 * per device. But we can enable the exclusion range per
475 * device. This is done here
476 */
349 set_dev_entry_bit(m->devid, DEV_ENTRY_EX); 477 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
350 iommu->exclusion_start = m->range_start; 478 iommu->exclusion_start = m->range_start;
351 iommu->exclusion_length = m->range_length; 479 iommu->exclusion_length = m->range_length;
352 } 480 }
353} 481}
354 482
483/*
484 * This function reads some important data from the IOMMU PCI space and
485 * initializes the driver data structure with it. It reads the hardware
486 * capabilities and the first/last device entries
487 */
355static void __init init_iommu_from_pci(struct amd_iommu *iommu) 488static void __init init_iommu_from_pci(struct amd_iommu *iommu)
356{ 489{
357 int bus = PCI_BUS(iommu->devid); 490 int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
363 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
364 497
365 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
366 iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); 499 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
367 iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); 500 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range));
368} 503}
369 504
505/*
506 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
507 * initializes the hardware and our data structures with it.
508 */
370static void __init init_iommu_from_acpi(struct amd_iommu *iommu, 509static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
371 struct ivhd_header *h) 510 struct ivhd_header *h)
372{ 511{
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
374 u8 *end = p, flags = 0; 513 u8 *end = p, flags = 0;
375 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 514 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
376 u32 ext_flags = 0; 515 u32 ext_flags = 0;
377 bool alias = 0; 516 bool alias = false;
378 struct ivhd_entry *e; 517 struct ivhd_entry *e;
379 518
380 /* 519 /*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
414 case IVHD_DEV_ALL: 553 case IVHD_DEV_ALL:
415 for (dev_i = iommu->first_device; 554 for (dev_i = iommu->first_device;
416 dev_i <= iommu->last_device; ++dev_i) 555 dev_i <= iommu->last_device; ++dev_i)
417 set_dev_entry_from_acpi(dev_i, e->flags, 0); 556 set_dev_entry_from_acpi(iommu, dev_i,
557 e->flags, 0);
418 break; 558 break;
419 case IVHD_DEV_SELECT: 559 case IVHD_DEV_SELECT:
420 devid = e->devid; 560 devid = e->devid;
421 set_dev_entry_from_acpi(devid, e->flags, 0); 561 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
422 break; 562 break;
423 case IVHD_DEV_SELECT_RANGE_START: 563 case IVHD_DEV_SELECT_RANGE_START:
424 devid_start = e->devid; 564 devid_start = e->devid;
425 flags = e->flags; 565 flags = e->flags;
426 ext_flags = 0; 566 ext_flags = 0;
427 alias = 0; 567 alias = false;
428 break; 568 break;
429 case IVHD_DEV_ALIAS: 569 case IVHD_DEV_ALIAS:
430 devid = e->devid; 570 devid = e->devid;
431 devid_to = e->ext >> 8; 571 devid_to = e->ext >> 8;
432 set_dev_entry_from_acpi(devid, e->flags, 0); 572 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
433 amd_iommu_alias_table[devid] = devid_to; 573 amd_iommu_alias_table[devid] = devid_to;
434 break; 574 break;
435 case IVHD_DEV_ALIAS_RANGE: 575 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
437 flags = e->flags; 577 flags = e->flags;
438 devid_to = e->ext >> 8; 578 devid_to = e->ext >> 8;
439 ext_flags = 0; 579 ext_flags = 0;
440 alias = 1; 580 alias = true;
441 break; 581 break;
442 case IVHD_DEV_EXT_SELECT: 582 case IVHD_DEV_EXT_SELECT:
443 devid = e->devid; 583 devid = e->devid;
444 set_dev_entry_from_acpi(devid, e->flags, e->ext); 584 set_dev_entry_from_acpi(iommu, devid, e->flags,
585 e->ext);
445 break; 586 break;
446 case IVHD_DEV_EXT_SELECT_RANGE: 587 case IVHD_DEV_EXT_SELECT_RANGE:
447 devid_start = e->devid; 588 devid_start = e->devid;
448 flags = e->flags; 589 flags = e->flags;
449 ext_flags = e->ext; 590 ext_flags = e->ext;
450 alias = 0; 591 alias = false;
451 break; 592 break;
452 case IVHD_DEV_RANGE_END: 593 case IVHD_DEV_RANGE_END:
453 devid = e->devid; 594 devid = e->devid;
454 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 595 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
455 if (alias) 596 if (alias)
456 amd_iommu_alias_table[dev_i] = devid_to; 597 amd_iommu_alias_table[dev_i] = devid_to;
457 set_dev_entry_from_acpi( 598 set_dev_entry_from_acpi(iommu,
458 amd_iommu_alias_table[dev_i], 599 amd_iommu_alias_table[dev_i],
459 flags, ext_flags); 600 flags, ext_flags);
460 } 601 }
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
467 } 608 }
468} 609}
469 610
611/* Initializes the device->iommu mapping for the driver */
470static int __init init_iommu_devices(struct amd_iommu *iommu) 612static int __init init_iommu_devices(struct amd_iommu *iommu)
471{ 613{
472 u16 i; 614 u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
494 } 636 }
495} 637}
496 638
639/*
640 * This function clues the initialization function for one IOMMU
641 * together and also allocates the command buffer and programs the
642 * hardware. It does NOT enable the IOMMU. This is done afterwards.
643 */
497static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 644static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
498{ 645{
499 spin_lock_init(&iommu->lock); 646 spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
521 return 0; 668 return 0;
522} 669}
523 670
671/*
672 * Iterates over all IOMMU entries in the ACPI table, allocates the
673 * IOMMU structure and initializes it with init_iommu_one()
674 */
524static int __init init_iommu_all(struct acpi_table_header *table) 675static int __init init_iommu_all(struct acpi_table_header *table)
525{ 676{
526 u8 *p = (u8 *)table, *end = (u8 *)table; 677 u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
528 struct amd_iommu *iommu; 679 struct amd_iommu *iommu;
529 int ret; 680 int ret;
530 681
531 INIT_LIST_HEAD(&amd_iommu_list);
532
533 end += table->length; 682 end += table->length;
534 p += IVRS_HEADER_LENGTH; 683 p += IVRS_HEADER_LENGTH;
535 684
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
555 return 0; 704 return 0;
556} 705}
557 706
707/****************************************************************************
708 *
709 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges).
712 *
713 ****************************************************************************/
714
558static void __init free_unity_maps(void) 715static void __init free_unity_maps(void)
559{ 716{
560 struct unity_map_entry *entry, *next; 717 struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
565 } 722 }
566} 723}
567 724
725/* called when we find an exclusion range definition in ACPI */
568static int __init init_exclusion_range(struct ivmd_header *m) 726static int __init init_exclusion_range(struct ivmd_header *m)
569{ 727{
570 int i; 728 int i;
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
588 return 0; 746 return 0;
589} 747}
590 748
749/* called for unity map ACPI definition */
591static int __init init_unity_map_range(struct ivmd_header *m) 750static int __init init_unity_map_range(struct ivmd_header *m)
592{ 751{
593 struct unity_map_entry *e = 0; 752 struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
619 return 0; 778 return 0;
620} 779}
621 780
781/* iterates over all memory definitions we find in the ACPI table */
622static int __init init_memory_definitions(struct acpi_table_header *table) 782static int __init init_memory_definitions(struct acpi_table_header *table)
623{ 783{
624 u8 *p = (u8 *)table, *end = (u8 *)table; 784 u8 *p = (u8 *)table, *end = (u8 *)table;
625 struct ivmd_header *m; 785 struct ivmd_header *m;
626 786
627 INIT_LIST_HEAD(&amd_iommu_unity_map);
628
629 end += table->length; 787 end += table->length;
630 p += IVRS_HEADER_LENGTH; 788 p += IVRS_HEADER_LENGTH;
631 789
@@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
642 return 0; 800 return 0;
643} 801}
644 802
803/*
804 * This function finally enables all IOMMUs found in the system after
805 * they have been initialized
806 */
645static void __init enable_iommus(void) 807static void __init enable_iommus(void)
646{ 808{
647 struct amd_iommu *iommu; 809 struct amd_iommu *iommu;
@@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
678 .cls = &amd_iommu_sysdev_class, 840 .cls = &amd_iommu_sysdev_class,
679}; 841};
680 842
843/*
844 * This is the core init function for AMD IOMMU hardware in the system.
845 * This function is called from the generic x86 DMA layer initialization
846 * code.
847 *
848 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
849 * three times:
850 *
851 * 1 pass) Find the highest PCI device id the driver has to handle.
852 * Upon this information the size of the data structures is
853 * determined that needs to be allocated.
854 *
855 * 2 pass) Initialize the data structures just allocated with the
856 * information in the ACPI table about available AMD IOMMUs
857 * in the system. It also maps the PCI devices in the
858 * system to specific IOMMUs
859 *
860 * 3 pass) After the basic data structures are allocated and
861 * initialized we update them with information about memory
862 * remapping requirements parsed out of the ACPI table in
863 * this last pass.
864 *
865 * After that the hardware is initialized and ready to go. In the last
866 * step we do some Linux specific things like registering the driver in
867 * the dma_ops interface and initializing the suspend/resume support
868 * functions. Finally it prints some information about AMD IOMMUs and
869 * the driver state and enables the hardware.
870 */
681int __init amd_iommu_init(void) 871int __init amd_iommu_init(void)
682{ 872{
683 int i, ret = 0; 873 int i, ret = 0;
@@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
699 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 889 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
700 return -ENODEV; 890 return -ENODEV;
701 891
702 dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); 892 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
703 alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); 893 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
704 rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); 894 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
705 895
706 ret = -ENOMEM; 896 ret = -ENOMEM;
707 897
708 /* Device table - directly used by all IOMMUs */ 898 /* Device table - directly used by all IOMMUs */
709 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, 899 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
710 get_order(dev_table_size)); 900 get_order(dev_table_size));
711 if (amd_iommu_dev_table == NULL) 901 if (amd_iommu_dev_table == NULL)
712 goto out; 902 goto out;
@@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
730 * Protection Domain table - maps devices to protection domains 920 * Protection Domain table - maps devices to protection domains
731 * This table has the same size as the rlookup_table 921 * This table has the same size as the rlookup_table
732 */ 922 */
733 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, 923 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
734 get_order(rlookup_table_size)); 924 get_order(rlookup_table_size));
735 if (amd_iommu_pd_table == NULL) 925 if (amd_iommu_pd_table == NULL)
736 goto free; 926 goto free;
737 927
738 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, 928 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
929 GFP_KERNEL | __GFP_ZERO,
739 get_order(MAX_DOMAIN_ID/8)); 930 get_order(MAX_DOMAIN_ID/8));
740 if (amd_iommu_pd_alloc_bitmap == NULL) 931 if (amd_iommu_pd_alloc_bitmap == NULL)
741 goto free; 932 goto free;
742 933
743 /* 934 /*
744 * memory is allocated now; initialize the device table with all zeroes 935 * let all alias entries point to itself
745 * and let all alias entries point to itself
746 */ 936 */
747 memset(amd_iommu_dev_table, 0, dev_table_size);
748 for (i = 0; i < amd_iommu_last_bdf; ++i) 937 for (i = 0; i < amd_iommu_last_bdf; ++i)
749 amd_iommu_alias_table[i] = i; 938 amd_iommu_alias_table[i] = i;
750 939
751 memset(amd_iommu_pd_table, 0, rlookup_table_size);
752 memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
753
754 /* 940 /*
755 * never allocate domain 0 because its used as the non-allocated and 941 * never allocate domain 0 because its used as the non-allocated and
756 * error value placeholder 942 * error value placeholder
@@ -795,24 +981,19 @@ out:
795 return ret; 981 return ret;
796 982
797free: 983free:
798 if (amd_iommu_pd_alloc_bitmap) 984 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
799 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
800 985
801 if (amd_iommu_pd_table) 986 free_pages((unsigned long)amd_iommu_pd_table,
802 free_pages((unsigned long)amd_iommu_pd_table, 987 get_order(rlookup_table_size));
803 get_order(rlookup_table_size));
804 988
805 if (amd_iommu_rlookup_table) 989 free_pages((unsigned long)amd_iommu_rlookup_table,
806 free_pages((unsigned long)amd_iommu_rlookup_table, 990 get_order(rlookup_table_size));
807 get_order(rlookup_table_size));
808 991
809 if (amd_iommu_alias_table) 992 free_pages((unsigned long)amd_iommu_alias_table,
810 free_pages((unsigned long)amd_iommu_alias_table, 993 get_order(alias_table_size));
811 get_order(alias_table_size));
812 994
813 if (amd_iommu_dev_table) 995 free_pages((unsigned long)amd_iommu_dev_table,
814 free_pages((unsigned long)amd_iommu_dev_table, 996 get_order(dev_table_size));
815 get_order(dev_table_size));
816 997
817 free_iommu_all(); 998 free_iommu_all();
818 999
@@ -821,6 +1002,13 @@ free:
821 goto out; 1002 goto out;
822} 1003}
823 1004
1005/****************************************************************************
1006 *
1007 * Early detect code. This code runs at IOMMU detection time in the DMA
1008 * layer. It just looks if there is an IVRS ACPI table to detect AMD
1009 * IOMMUs
1010 *
1011 ****************************************************************************/
824static int __init early_amd_iommu_detect(struct acpi_table_header *table) 1012static int __init early_amd_iommu_detect(struct acpi_table_header *table)
825{ 1013{
826 return 0; 1014 return 0;
@@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
828 1016
829void __init amd_iommu_detect(void) 1017void __init amd_iommu_detect(void)
830{ 1018{
831 if (swiotlb || no_iommu || iommu_detected) 1019 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
832 return; 1020 return;
833 1021
834 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1022 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
841 } 1029 }
842} 1030}
843 1031
1032/****************************************************************************
1033 *
1034 * Parsing functions for the AMD IOMMU specific kernel command line
1035 * options.
1036 *
1037 ****************************************************************************/
1038
844static int __init parse_amd_iommu_options(char *str) 1039static int __init parse_amd_iommu_options(char *str)
845{ 1040{
846 for (; *str; ++str) { 1041 for (; *str; ++str) {
@@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
853 1048
854static int __init parse_amd_iommu_size_options(char *str) 1049static int __init parse_amd_iommu_size_options(char *str)
855{ 1050{
856 for (; *str; ++str) { 1051 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
857 if (strcmp(str, "32M") == 0) 1052
858 amd_iommu_aperture_order = 25; 1053 if ((order > 24) && (order < 31))
859 if (strcmp(str, "64M") == 0) 1054 amd_iommu_aperture_order = order;
860 amd_iommu_aperture_order = 26;
861 if (strcmp(str, "128M") == 0)
862 amd_iommu_aperture_order = 27;
863 if (strcmp(str, "256M") == 0)
864 amd_iommu_aperture_order = 28;
865 if (strcmp(str, "512M") == 0)
866 amd_iommu_aperture_order = 29;
867 if (strcmp(str, "1G") == 0)
868 amd_iommu_aperture_order = 30;
869 }
870 1055
871 return 1; 1056 return 1;
872} 1057}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a..44e21826db1 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index a437d027f20..d6c89835837 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
75/* 75/*
76 * Debug level, exported for io_apic.c 76 * Debug level, exported for io_apic.c
77 */ 77 */
78int apic_verbosity; 78unsigned int apic_verbosity;
79 79
80int pic_mode; 80int pic_mode;
81 81
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
177 /* Level triggered for 82489DX */ 177 /* Level triggered for 82489DX */
178 if (!lapic_is_integrated()) 178 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 179 v |= APIC_LVT_LEVEL_TRIGGER;
180 apic_write_around(APIC_LVT0, v); 180 apic_write(APIC_LVT0, v);
181} 181}
182 182
183/** 183/**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 212 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 213 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 214 * call this function only once, with the real, calibrated value.
215 *
216 * We do reads before writes even if unnecessary, to get around the
217 * P5 APIC double write bug.
218 */ 215 */
219static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
220{ 217{
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
229 if (!irqen) 226 if (!irqen)
230 lvtt_value |= APIC_LVT_MASKED; 227 lvtt_value |= APIC_LVT_MASKED;
231 228
232 apic_write_around(APIC_LVTT, lvtt_value); 229 apic_write(APIC_LVTT, lvtt_value);
233 230
234 /* 231 /*
235 * Divide PICLK by 16 232 * Divide PICLK by 16
236 */ 233 */
237 tmp_value = apic_read(APIC_TDCR); 234 tmp_value = apic_read(APIC_TDCR);
238 apic_write_around(APIC_TDCR, (tmp_value 235 apic_write(APIC_TDCR,
239 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
240 | APIC_TDR_DIV_16); 237 APIC_TDR_DIV_16);
241 238
242 if (!oneshot) 239 if (!oneshot)
243 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
244} 241}
245 242
246/* 243/*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
249static int lapic_next_event(unsigned long delta, 246static int lapic_next_event(unsigned long delta,
250 struct clock_event_device *evt) 247 struct clock_event_device *evt)
251{ 248{
252 apic_write_around(APIC_TMICT, delta); 249 apic_write(APIC_TMICT, delta);
253 return 0; 250 return 0;
254} 251}
255 252
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
278 case CLOCK_EVT_MODE_SHUTDOWN: 275 case CLOCK_EVT_MODE_SHUTDOWN:
279 v = apic_read(APIC_LVTT); 276 v = apic_read(APIC_LVTT);
280 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 277 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
281 apic_write_around(APIC_LVTT, v); 278 apic_write(APIC_LVTT, v);
282 break; 279 break;
283 case CLOCK_EVT_MODE_RESUME: 280 case CLOCK_EVT_MODE_RESUME:
284 /* Nothing to do here */ 281 /* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
372 } 369 }
373} 370}
374 371
375/* 372static int __init calibrate_APIC_clock(void)
376 * Setup the boot APIC
377 *
378 * Calibrate and verify the result.
379 */
380void __init setup_boot_APIC_clock(void)
381{ 373{
382 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 374 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
383 const long pm_100ms = PMTMR_TICKS_PER_SEC/10; 375 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
387 long delta, deltapm; 379 long delta, deltapm;
388 int pm_referenced = 0; 380 int pm_referenced = 0;
389 381
390 /*
391 * The local apic timer can be disabled via the kernel
392 * commandline or from the CPU detection code. Register the lapic
393 * timer as a dummy clock event source on SMP systems, so the
394 * broadcast mechanism is used. On UP systems simply ignore it.
395 */
396 if (local_apic_timer_disabled) {
397 /* No broadcast on UP ! */
398 if (num_possible_cpus() > 1) {
399 lapic_clockevent.mult = 1;
400 setup_APIC_timer();
401 }
402 return;
403 }
404
405 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
406 "calibrating APIC timer ...\n");
407
408 local_irq_disable(); 382 local_irq_disable();
409 383
410 /* Replace the global interrupt handler */ 384 /* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
489 calibration_result / (1000000 / HZ), 463 calibration_result / (1000000 / HZ),
490 calibration_result % (1000000 / HZ)); 464 calibration_result % (1000000 / HZ));
491 465
492 local_apic_timer_verify_ok = 1;
493
494 /* 466 /*
495 * Do a sanity check on the APIC calibration result 467 * Do a sanity check on the APIC calibration result
496 */ 468 */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
498 local_irq_enable(); 470 local_irq_enable();
499 printk(KERN_WARNING 471 printk(KERN_WARNING
500 "APIC frequency too slow, disabling apic timer\n"); 472 "APIC frequency too slow, disabling apic timer\n");
501 /* No broadcast on UP ! */ 473 return -1;
502 if (num_possible_cpus() > 1)
503 setup_APIC_timer();
504 return;
505 } 474 }
506 475
476 local_apic_timer_verify_ok = 1;
477
507 /* We trust the pm timer based calibration */ 478 /* We trust the pm timer based calibration */
508 if (!pm_referenced) { 479 if (!pm_referenced) {
509 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 480 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
543 if (!local_apic_timer_verify_ok) { 514 if (!local_apic_timer_verify_ok) {
544 printk(KERN_WARNING 515 printk(KERN_WARNING
545 "APIC timer disabled due to verification failure.\n"); 516 "APIC timer disabled due to verification failure.\n");
517 return -1;
518 }
519
520 return 0;
521}
522
523/*
524 * Setup the boot APIC
525 *
526 * Calibrate and verify the result.
527 */
528void __init setup_boot_APIC_clock(void)
529{
530 /*
531 * The local apic timer can be disabled via the kernel
532 * commandline or from the CPU detection code. Register the lapic
533 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it.
535 */
536 if (local_apic_timer_disabled) {
546 /* No broadcast on UP ! */ 537 /* No broadcast on UP ! */
547 if (num_possible_cpus() == 1) 538 if (num_possible_cpus() > 1) {
548 return; 539 lapic_clockevent.mult = 1;
549 } else { 540 setup_APIC_timer();
550 /* 541 }
551 * If nmi_watchdog is set to IO_APIC, we need the 542 return;
552 * PIT/HPET going. Otherwise register lapic as a dummy
553 * device.
554 */
555 if (nmi_watchdog != NMI_IO_APIC)
556 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
557 else
558 printk(KERN_WARNING "APIC timer registered as dummy,"
559 " due to nmi_watchdog=%d!\n", nmi_watchdog);
560 } 543 }
561 544
545 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
546 "calibrating APIC timer ...\n");
547
548 if (calibrate_APIC_clock()) {
549 /* No broadcast on UP ! */
550 if (num_possible_cpus() > 1)
551 setup_APIC_timer();
552 return;
553 }
554
555 /*
556 * If nmi_watchdog is set to IO_APIC, we need the
557 * PIT/HPET going. Otherwise register lapic as a dummy
558 * device.
559 */
560 if (nmi_watchdog != NMI_IO_APIC)
561 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
562 else
563 printk(KERN_WARNING "APIC timer registered as dummy,"
564 " due to nmi_watchdog=%d!\n", nmi_watchdog);
565
562 /* Setup the lapic or request the broadcast */ 566 /* Setup the lapic or request the broadcast */
563 setup_APIC_timer(); 567 setup_APIC_timer();
564} 568}
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
693 */ 697 */
694 if (maxlvt >= 3) { 698 if (maxlvt >= 3) {
695 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 699 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
696 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 700 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
697 } 701 }
698 /* 702 /*
699 * Careful: we have to set masks only first to deassert 703 * Careful: we have to set masks only first to deassert
700 * any level-triggered sources. 704 * any level-triggered sources.
701 */ 705 */
702 v = apic_read(APIC_LVTT); 706 v = apic_read(APIC_LVTT);
703 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 707 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
704 v = apic_read(APIC_LVT0); 708 v = apic_read(APIC_LVT0);
705 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 709 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
706 v = apic_read(APIC_LVT1); 710 v = apic_read(APIC_LVT1);
707 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 711 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
708 if (maxlvt >= 4) { 712 if (maxlvt >= 4) {
709 v = apic_read(APIC_LVTPC); 713 v = apic_read(APIC_LVTPC);
710 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 714 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
711 } 715 }
712 716
713 /* lets not touch this if we didn't frob it */ 717 /* lets not touch this if we didn't frob it */
714#ifdef CONFIG_X86_MCE_P4THERMAL 718#ifdef CONFIG_X86_MCE_P4THERMAL
715 if (maxlvt >= 5) { 719 if (maxlvt >= 5) {
716 v = apic_read(APIC_LVTTHMR); 720 v = apic_read(APIC_LVTTHMR);
717 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
718 } 722 }
719#endif 723#endif
720 /* 724 /*
721 * Clean APIC state for other OSs: 725 * Clean APIC state for other OSs:
722 */ 726 */
723 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 727 apic_write(APIC_LVTT, APIC_LVT_MASKED);
724 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 728 apic_write(APIC_LVT0, APIC_LVT_MASKED);
725 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 729 apic_write(APIC_LVT1, APIC_LVT_MASKED);
726 if (maxlvt >= 3) 730 if (maxlvt >= 3)
727 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 731 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
728 if (maxlvt >= 4) 732 if (maxlvt >= 4)
729 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 733 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
730 734
731#ifdef CONFIG_X86_MCE_P4THERMAL 735#ifdef CONFIG_X86_MCE_P4THERMAL
732 if (maxlvt >= 5) 736 if (maxlvt >= 5)
733 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); 737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
734#endif 738#endif
735 /* Integrated APIC (!82489DX) ? */ 739 /* Integrated APIC (!82489DX) ? */
736 if (lapic_is_integrated()) { 740 if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
756 */ 760 */
757 value = apic_read(APIC_SPIV); 761 value = apic_read(APIC_SPIV);
758 value &= ~APIC_SPIV_APIC_ENABLED; 762 value &= ~APIC_SPIV_APIC_ENABLED;
759 apic_write_around(APIC_SPIV, value); 763 apic_write(APIC_SPIV, value);
760 764
761 /* 765 /*
762 * When LAPIC was disabled by the BIOS and enabled by the kernel, 766 * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
865 apic_wait_icr_idle(); 869 apic_wait_icr_idle();
866 870
867 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
868 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 872 apic_write(APIC_ICR,
869 | APIC_DM_INIT); 873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
870} 874}
871 875
872/* 876/*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
902 else 906 else
903 value |= APIC_SPIV_FOCUS_DISABLED; 907 value |= APIC_SPIV_FOCUS_DISABLED;
904 value |= SPURIOUS_APIC_VECTOR; 908 value |= SPURIOUS_APIC_VECTOR;
905 apic_write_around(APIC_SPIV, value); 909 apic_write(APIC_SPIV, value);
906 910
907 /* 911 /*
908 * Set up the virtual wire mode. 912 * Set up the virtual wire mode.
909 */ 913 */
910 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 914 apic_write(APIC_LVT0, APIC_DM_EXTINT);
911 value = APIC_DM_NMI; 915 value = APIC_DM_NMI;
912 if (!lapic_is_integrated()) /* 82489DX */ 916 if (!lapic_is_integrated()) /* 82489DX */
913 value |= APIC_LVT_LEVEL_TRIGGER; 917 value |= APIC_LVT_LEVEL_TRIGGER;
914 apic_write_around(APIC_LVT1, value); 918 apic_write(APIC_LVT1, value);
915} 919}
916 920
917static void __cpuinit lapic_setup_esr(void) 921static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
926 930
927 /* enables sending errors */ 931 /* enables sending errors */
928 value = ERROR_APIC_VECTOR; 932 value = ERROR_APIC_VECTOR;
929 apic_write_around(APIC_LVTERR, value); 933 apic_write(APIC_LVTERR, value);
930 /* 934 /*
931 * spec says clear errors after enabling vector. 935 * spec says clear errors after enabling vector.
932 */ 936 */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
989 */ 993 */
990 value = apic_read(APIC_TASKPRI); 994 value = apic_read(APIC_TASKPRI);
991 value &= ~APIC_TPRI_MASK; 995 value &= ~APIC_TPRI_MASK;
992 apic_write_around(APIC_TASKPRI, value); 996 apic_write(APIC_TASKPRI, value);
993 997
994 /* 998 /*
995 * After a crash, we no longer service the interrupts and a pending 999 * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
1047 * Set spurious IRQ vector 1051 * Set spurious IRQ vector
1048 */ 1052 */
1049 value |= SPURIOUS_APIC_VECTOR; 1053 value |= SPURIOUS_APIC_VECTOR;
1050 apic_write_around(APIC_SPIV, value); 1054 apic_write(APIC_SPIV, value);
1051 1055
1052 /* 1056 /*
1053 * Set up LVT0, LVT1: 1057 * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
1069 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1073 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1070 smp_processor_id()); 1074 smp_processor_id());
1071 } 1075 }
1072 apic_write_around(APIC_LVT0, value); 1076 apic_write(APIC_LVT0, value);
1073 1077
1074 /* 1078 /*
1075 * only the BP should see the LINT1 NMI signal, obviously. 1079 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
1080 value = APIC_DM_NMI | APIC_LVT_MASKED; 1084 value = APIC_DM_NMI | APIC_LVT_MASKED;
1081 if (!integrated) /* 82489DX */ 1085 if (!integrated) /* 82489DX */
1082 value |= APIC_LVT_LEVEL_TRIGGER; 1086 value |= APIC_LVT_LEVEL_TRIGGER;
1083 apic_write_around(APIC_LVT1, value); 1087 apic_write(APIC_LVT1, value);
1084} 1088}
1085 1089
1086void __cpuinit end_local_APIC_setup(void) 1090void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
1091 /* Disable the local apic timer */ 1095 /* Disable the local apic timer */
1092 value = apic_read(APIC_LVTT); 1096 value = apic_read(APIC_LVTT);
1093 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1094 apic_write_around(APIC_LVTT, value); 1098 apic_write(APIC_LVTT, value);
1095 1099
1096 setup_apic_nmi_watchdog(NULL); 1100 setup_apic_nmi_watchdog(NULL);
1097 apic_pm_activate(); 1101 apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
1214 1218
1215int __init APIC_init_uniprocessor(void) 1219int __init APIC_init_uniprocessor(void)
1216{ 1220{
1217 if (disable_apic)
1218 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1219
1220 if (!smp_found_config && !cpu_has_apic) 1221 if (!smp_found_config && !cpu_has_apic)
1221 return -1; 1222 return -1;
1222 1223
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1419 value &= ~APIC_VECTOR_MASK; 1420 value &= ~APIC_VECTOR_MASK;
1420 value |= APIC_SPIV_APIC_ENABLED; 1421 value |= APIC_SPIV_APIC_ENABLED;
1421 value |= 0xf; 1422 value |= 0xf;
1422 apic_write_around(APIC_SPIV, value); 1423 apic_write(APIC_SPIV, value);
1423 1424
1424 if (!virt_wire_setup) { 1425 if (!virt_wire_setup) {
1425 /* 1426 /*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1432 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1433 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1434 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1435 apic_write_around(APIC_LVT0, value); 1436 apic_write(APIC_LVT0, value);
1436 } else { 1437 } else {
1437 /* Disable LVT0 */ 1438 /* Disable LVT0 */
1438 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1439 } 1440 }
1440 1441
1441 /* 1442 /*
@@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1449 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1450 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1451 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1452 apic_write_around(APIC_LVT1, value); 1453 apic_write(APIC_LVT1, value);
1453 } 1454 }
1454} 1455}
1455 1456
@@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
1700static int __init parse_nolapic(char *arg) 1701static int __init parse_nolapic(char *arg)
1701{ 1702{
1702 disable_apic = 1; 1703 disable_apic = 1;
1703 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1704 setup_clear_cpu_cap(X86_FEATURE_APIC);
1704 return 0; 1705 return 0;
1705} 1706}
1706early_param("nolapic", parse_nolapic); 1707early_param("nolapic", parse_nolapic);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1e3d32e27c1..7f1f030da7e 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
54/* 54/*
55 * Debug level, exported for io_apic.c 55 * Debug level, exported for io_apic.c
56 */ 56 */
57int apic_verbosity; 57unsigned int apic_verbosity;
58 58
59/* Have we found an MP table */ 59/* Have we found an MP table */
60int smp_found_config; 60int smp_found_config;
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
314 314
315#define TICK_COUNT 100000000 315#define TICK_COUNT 100000000
316 316
317static void __init calibrate_APIC_clock(void) 317static int __init calibrate_APIC_clock(void)
318{ 318{
319 unsigned apic, apic_start; 319 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start; 320 unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
368 clockevent_delta2ns(0xF, &lapic_clockevent); 368 clockevent_delta2ns(0xF, &lapic_clockevent);
369 369
370 calibration_result = result / HZ; 370 calibration_result = result / HZ;
371
372 /*
373 * Do a sanity check on the APIC calibration result
374 */
375 if (calibration_result < (1000000 / HZ)) {
376 printk(KERN_WARNING
377 "APIC frequency too slow, disabling apic timer\n");
378 return -1;
379 }
380
381 return 0;
371} 382}
372 383
373/* 384/*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
394 } 405 }
395 406
396 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 407 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock(); 408 if (calibrate_APIC_clock()) {
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */ 409 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1) 410 if (num_possible_cpus() > 1)
407 setup_APIC_timer(); 411 setup_APIC_timer();
@@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
1337static __init int setup_disableapic(char *str) 1341static __init int setup_disableapic(char *str)
1338{ 1342{
1339 disable_apic = 1; 1343 disable_apic = 1;
1340 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1344 setup_clear_cpu_cap(X86_FEATURE_APIC);
1341 return 0; 1345 return 0;
1342} 1346}
1343early_param("disableapic", setup_disableapic); 1347early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2..aa89387006f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 00000000000..c639bd55391
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <asm/uv/bios.h>
22
23const char *
24x86_bios_strerror(long status)
25{
26 const char *str;
27 switch (status) {
28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break;
33 }
34 return str;
35}
36
37long
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
39 unsigned long *drift_info)
40{
41 struct uv_bios_retval isrv;
42
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
44 *ticks_per_second = isrv.v0;
45 *drift_info = isrv.v1;
46 return isrv.status;
47}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d4..cae9cabc303 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,8 +24,6 @@
24extern void vide(void); 24extern void vide(void);
25__asm__(".align 4\nvide: ret"); 25__asm__(".align 4\nvide: ret");
26 26
27int force_mwait __cpuinitdata;
28
29static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
30{ 28{
31 if (cpuid_eax(0x80000000) >= 0x80000007) { 29 if (cpuid_eax(0x80000000) >= 0x80000007) {
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d..d1692b2a41f 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8)) 116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
118} 120}
119 121
120static void __cpuinit init_amd(struct cpuinfo_x86 *c) 122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338..c9b58a806e8 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -131,13 +131,7 @@ static void __init check_popad(void)
131 * (for due to lack of "invlpg" and working WP on a i386) 131 * (for due to lack of "invlpg" and working WP on a i386)
132 * - In order to run on anything without a TSC, we need to be 132 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486. 133 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine, 134 */
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used) as an
138 * otherwise config implies a properly working local APIC without
139 * the need to do extra reads from the APIC.
140*/
141 135
142static void __init check_config(void) 136static void __init check_config(void)
143{ 137{
@@ -151,21 +145,6 @@ static void __init check_config(void)
151 if (boot_cpu_data.x86 == 3) 145 if (boot_cpu_data.x86 == 3)
152 panic("Kernel requires i486+ for 'invlpg' and other features"); 146 panic("Kernel requires i486+ for 'invlpg' and other features");
153#endif 147#endif
154
155/*
156 * If we were told we had a good local APIC, check for buggy Pentia,
157 * i.e. all B steppings and the C2 stepping of P54C when using their
158 * integrated APIC (see 11AP erratum in "Pentium Processor
159 * Specification Update").
160 */
161#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
163 && cpu_has_apic
164 && boot_cpu_data.x86 == 5
165 && boot_cpu_data.x86_model == 2
166 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
167 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
168#endif
169} 148}
170 149
171 150
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb4..dd6e3f15017 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -7,15 +7,13 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/kgdb.h> 8#include <linux/kgdb.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/string.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
13#include <linux/module.h>
14#include <linux/percpu.h> 12#include <linux/percpu.h>
15#include <asm/processor.h>
16#include <asm/i387.h> 13#include <asm/i387.h>
17#include <asm/msr.h> 14#include <asm/msr.h>
18#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
19#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
20#include <asm/mtrr.h> 18#include <asm/mtrr.h>
21#include <asm/mce.h> 19#include <asm/mce.h>
@@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
305 c->x86_capability[2] = cpuid_edx(0x80860001); 303 c->x86_capability[2] = cpuid_edx(0x80860001);
306 } 304 }
307 305
308 c->extended_cpuid_level = cpuid_eax(0x80000000);
309 if (c->extended_cpuid_level >= 0x80000007) 306 if (c->extended_cpuid_level >= 0x80000007)
310 c->x86_power = cpuid_edx(0x80000007); 307 c->x86_power = cpuid_edx(0x80000007);
311 308
@@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
316 c->x86_phys_bits = eax & 0xff; 313 c->x86_phys_bits = eax & 0xff;
317 } 314 }
318 315
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 316 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init) 317 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c); 318 cpu_devs[c->x86_vendor]->c_early_init(c);
325 319
326 validate_pat_support(c); 320 validate_pat_support(c);
327
328 /* early_param could clear that, but recall get it set again */
329 if (disable_apic)
330 clear_cpu_cap(c, X86_FEATURE_APIC);
331} 321}
332 322
333/* 323/*
@@ -517,8 +507,7 @@ void pda_init(int cpu)
517} 507}
518 508
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 509char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ] 510 DEBUG_STKSZ] __page_aligned_bss;
521__attribute__((section(".bss.page_aligned")));
522 511
523extern asmlinkage void ignore_sysret(void); 512extern asmlinkage void ignore_sysret(void);
524 513
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1d..b75f2569b8f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
227 if (cpu_has_bts) 227 if (cpu_has_bts)
228 ds_init_intel(c); 228 ds_init_intel(c);
229 229
230 /*
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239
230#ifdef CONFIG_X86_NUMAQ 240#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable(); 241 numaq_tsc_disable();
232#endif 242#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e..ff517f0b8cc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
780 } 780 }
781 kobject_put(per_cpu(cache_kobject, cpu)); 781 kobject_put(per_cpu(cache_kobject, cpu));
782 cpuid4_cache_sysfs_exit(cpu); 782 cpuid4_cache_sysfs_exit(cpu);
783 break; 783 return retval;
784 } 784 }
785 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 785 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
786 } 786 }
787 if (!retval) 787 cpu_set(cpu, cache_dev_map);
788 cpu_set(cpu, cache_dev_map);
789 788
790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 789 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
791 return retval; 790 return 0;
792} 791}
793 792
794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 793static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bd..9b60fce09f7 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b38..9af89078f7b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) 877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
878 count++; 878 count++;
879 879
880 printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); 880 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
881 count, start, end);
881 for (i = 0; i < count; i++) { 882 for (i = 0; i < count; i++) {
882 struct early_res *r = &early_res[i]; 883 struct early_res *r = &early_res[i];
883 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, 884 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
1298 } 1299 }
1299} 1300}
1300 1301
1301/*
1302 * Non-standard memory setup can be specified via this quirk:
1303 */
1304char * (*arch_memory_setup_quirk)(void);
1305
1306char *__init default_machine_specific_memory_setup(void) 1302char *__init default_machine_specific_memory_setup(void)
1307{ 1303{
1308 char *who = "BIOS-e820"; 1304 char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
1343 1339
1344char *__init __attribute__((weak)) machine_specific_memory_setup(void) 1340char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1345{ 1341{
1346 if (arch_memory_setup_quirk) { 1342 if (x86_quirks->arch_memory_setup) {
1347 char *who = arch_memory_setup_quirk(); 1343 char *who = x86_quirks->arch_memory_setup();
1348 1344
1349 if (who) 1345 if (who)
1350 return who; 1346 return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
1367 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1363 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1368 e820_print_map(who); 1364 e820_print_map(who);
1369} 1365}
1370
1371#ifdef CONFIG_X86_64
1372int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
1373{
1374 int i;
1375
1376 if (slot < 0 || slot >= e820.nr_map)
1377 return -1;
1378 for (i = slot; i < e820.nr_map; i++) {
1379 if (e820.map[i].type != E820_RAM)
1380 continue;
1381 break;
1382 }
1383 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
1384 return -1;
1385 *addr = e820.map[i].addr;
1386 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
1387 max_pfn << PAGE_SHIFT) - *addr;
1388 return i + 1;
1389}
1390#endif
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc87..4353cf5e6fa 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index ad5264c29e9..cdfd94cc6b1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -332,7 +332,7 @@ sysenter_past_esp:
332 GET_THREAD_INFO(%ebp) 332 GET_THREAD_INFO(%ebp)
333 333
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 335 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
336 jnz syscall_trace_entry 336 jnz syscall_trace_entry
337 cmpl $(nr_syscalls), %eax 337 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys 338 jae syscall_badsys
@@ -370,7 +370,7 @@ ENTRY(system_call)
370 GET_THREAD_INFO(%ebp) 370 GET_THREAD_INFO(%ebp)
371 # system call tracing in operation / emulation 371 # system call tracing in operation / emulation
372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
373 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 373 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
374 jnz syscall_trace_entry 374 jnz syscall_trace_entry
375 cmpl $(nr_syscalls), %eax 375 cmpl $(nr_syscalls), %eax
376 jae syscall_badsys 376 jae syscall_badsys
@@ -383,10 +383,6 @@ syscall_exit:
383 # setting need_resched or sigpending 383 # setting need_resched or sigpending
384 # between sampling and the iret 384 # between sampling and the iret
385 TRACE_IRQS_OFF 385 TRACE_IRQS_OFF
386 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
387 jz no_singlestep
388 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
389no_singlestep:
390 movl TI_flags(%ebp), %ecx 386 movl TI_flags(%ebp), %ecx
391 testw $_TIF_ALLWORK_MASK, %cx # current->work 387 testw $_TIF_ALLWORK_MASK, %cx # current->work
392 jne syscall_exit_work 388 jne syscall_exit_work
@@ -514,12 +510,8 @@ END(work_pending)
514syscall_trace_entry: 510syscall_trace_entry:
515 movl $-ENOSYS,PT_EAX(%esp) 511 movl $-ENOSYS,PT_EAX(%esp)
516 movl %esp, %eax 512 movl %esp, %eax
517 xorl %edx,%edx 513 call syscall_trace_enter
518 call do_syscall_trace 514 /* What it returned is what we'll actually use. */
519 cmpl $0, %eax
520 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
521 # so must skip actual syscall
522 movl PT_ORIG_EAX(%esp), %eax
523 cmpl $(nr_syscalls), %eax 515 cmpl $(nr_syscalls), %eax
524 jnae syscall_call 516 jnae syscall_call
525 jmp syscall_exit 517 jmp syscall_exit
@@ -528,14 +520,13 @@ END(syscall_trace_entry)
528 # perform syscall exit tracing 520 # perform syscall exit tracing
529 ALIGN 521 ALIGN
530syscall_exit_work: 522syscall_exit_work:
531 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 523 testb $_TIF_WORK_SYSCALL_EXIT, %cl
532 jz work_pending 524 jz work_pending
533 TRACE_IRQS_ON 525 TRACE_IRQS_ON
534 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 526 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
535 # schedule() instead 527 # schedule() instead
536 movl %esp, %eax 528 movl %esp, %eax
537 movl $1, %edx 529 call syscall_trace_leave
538 call do_syscall_trace
539 jmp resume_userspace 530 jmp resume_userspace
540END(syscall_exit_work) 531END(syscall_exit_work)
541 CFI_ENDPROC 532 CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c34..8410e26f418 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs)
349 movq %rcx,RIP-ARGOFFSET(%rsp) 349 movq %rcx,RIP-ARGOFFSET(%rsp)
350 CFI_REL_OFFSET rip,RIP-ARGOFFSET 350 CFI_REL_OFFSET rip,RIP-ARGOFFSET
351 GET_THREAD_INFO(%rcx) 351 GET_THREAD_INFO(%rcx)
352 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 352 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
353 TI_flags(%rcx)
354 jnz tracesys 353 jnz tracesys
355 cmpq $__NR_syscall_max,%rax 354 cmpq $__NR_syscall_max,%rax
356 ja badsys 355 ja badsys
@@ -430,7 +429,12 @@ tracesys:
430 FIXUP_TOP_OF_STACK %rdi 429 FIXUP_TOP_OF_STACK %rdi
431 movq %rsp,%rdi 430 movq %rsp,%rdi
432 call syscall_trace_enter 431 call syscall_trace_enter
433 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 432 /*
433 * Reload arg registers from stack in case ptrace changed them.
434 * We don't reload %rax because syscall_trace_enter() returned
435 * the value it wants us to use in the table lookup.
436 */
437 LOAD_ARGS ARGOFFSET, 1
434 RESTORE_REST 438 RESTORE_REST
435 cmpq $__NR_syscall_max,%rax 439 cmpq $__NR_syscall_max,%rax
436 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 440 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -483,7 +487,7 @@ int_very_careful:
483 ENABLE_INTERRUPTS(CLBR_NONE) 487 ENABLE_INTERRUPTS(CLBR_NONE)
484 SAVE_REST 488 SAVE_REST
485 /* Check for syscall exit trace */ 489 /* Check for syscall exit trace */
486 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 490 testl $_TIF_WORK_SYSCALL_EXIT,%edx
487 jz int_signal 491 jz int_signal
488 pushq %rdi 492 pushq %rdi
489 CFI_ADJUST_CFA_OFFSET 8 493 CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +495,7 @@ int_very_careful:
491 call syscall_trace_leave 495 call syscall_trace_leave
492 popq %rdi 496 popq %rdi
493 CFI_ADJUST_CFA_OFFSET -8 497 CFI_ADJUST_CFA_OFFSET -8
494 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 498 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
495 jmp int_restore_rest 499 jmp int_restore_rest
496 500
497int_signal: 501int_signal:
@@ -1189,6 +1193,7 @@ END(device_not_available)
1189 /* runs on exception stack */ 1193 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1194KPROBE_ENTRY(debug)
1191 INTR_FRAME 1195 INTR_FRAME
1196 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1197 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1198 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1199 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1203,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1203 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1204KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1205 INTR_FRAME
1206 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1207 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1208 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1209 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1217,7 @@ KPROBE_END(nmi)
1211 1217
1212KPROBE_ENTRY(int3) 1218KPROBE_ENTRY(int3)
1213 INTR_FRAME 1219 INTR_FRAME
1220 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1221 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1222 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1223 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1244,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1244 /* runs on exception stack */
1238ENTRY(double_fault) 1245ENTRY(double_fault)
1239 XCPT_FRAME 1246 XCPT_FRAME
1247 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1248 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1249 jmp paranoid_exit1
1242 CFI_ENDPROC 1250 CFI_ENDPROC
@@ -1253,6 +1261,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1261 /* runs on exception stack */
1254ENTRY(stack_segment) 1262ENTRY(stack_segment)
1255 XCPT_FRAME 1263 XCPT_FRAME
1264 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1265 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1266 jmp paranoid_exit1
1258 CFI_ENDPROC 1267 CFI_ENDPROC
@@ -1278,6 +1287,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1287 /* runs on exception stack */
1279ENTRY(machine_check) 1288ENTRY(machine_check)
1280 INTR_FRAME 1289 INTR_FRAME
1290 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1291 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1292 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1293 paranoidentry do_machine_check
@@ -1312,3 +1322,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1322 sysret
1313 CFI_ENDPROC 1323 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1324ENDPROC(ignore_sysret)
1325
1326#ifdef CONFIG_XEN
1327ENTRY(xen_hypervisor_callback)
1328 zeroentry xen_do_hypervisor_callback
1329END(xen_hypervisor_callback)
1330
1331/*
1332# A note on the "critical region" in our callback handler.
1333# We want to avoid stacking callback handlers due to events occurring
1334# during handling of the last event. To do this, we keep events disabled
1335# until we've done all processing. HOWEVER, we must enable events before
1336# popping the stack frame (can't be done atomically) and so it would still
1337# be possible to get enough handler activations to overflow the stack.
1338# Although unlikely, bugs of that kind are hard to track down, so we'd
1339# like to avoid the possibility.
1340# So, on entry to the handler we detect whether we interrupted an
1341# existing activation in its critical region -- if so, we pop the current
1342# activation and restart the handler using the previous one.
1343*/
1344ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1345 CFI_STARTPROC
1346/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1347 see the correct pointer to the pt_regs */
1348 movq %rdi, %rsp # we don't return, adjust the stack frame
1349 CFI_ENDPROC
1350 CFI_DEFAULT_STACK
135111: incl %gs:pda_irqcount
1352 movq %rsp,%rbp
1353 CFI_DEF_CFA_REGISTER rbp
1354 cmovzq %gs:pda_irqstackptr,%rsp
1355 pushq %rbp # backlink for old unwinder
1356 call xen_evtchn_do_upcall
1357 popq %rsp
1358 CFI_DEF_CFA_REGISTER rsp
1359 decl %gs:pda_irqcount
1360 jmp error_exit
1361 CFI_ENDPROC
1362END(do_hypervisor_callback)
1363
1364/*
1365# Hypervisor uses this for application faults while it executes.
1366# We get here for two reasons:
1367# 1. Fault while reloading DS, ES, FS or GS
1368# 2. Fault while executing IRET
1369# Category 1 we do not need to fix up as Xen has already reloaded all segment
1370# registers that could be reloaded and zeroed the others.
1371# Category 2 we fix up by killing the current process. We cannot use the
1372# normal Linux return path in this case because if we use the IRET hypercall
1373# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1374# We distinguish between categories by comparing each saved segment register
1375# with its current contents: any discrepancy means we in category 1.
1376*/
1377ENTRY(xen_failsafe_callback)
1378 framesz = (RIP-0x30) /* workaround buggy gas */
1379 _frame framesz
1380 CFI_REL_OFFSET rcx, 0
1381 CFI_REL_OFFSET r11, 8
1382 movw %ds,%cx
1383 cmpw %cx,0x10(%rsp)
1384 CFI_REMEMBER_STATE
1385 jne 1f
1386 movw %es,%cx
1387 cmpw %cx,0x18(%rsp)
1388 jne 1f
1389 movw %fs,%cx
1390 cmpw %cx,0x20(%rsp)
1391 jne 1f
1392 movw %gs,%cx
1393 cmpw %cx,0x28(%rsp)
1394 jne 1f
1395 /* All segments match their saved values => Category 2 (Bad IRET). */
1396 movq (%rsp),%rcx
1397 CFI_RESTORE rcx
1398 movq 8(%rsp),%r11
1399 CFI_RESTORE r11
1400 addq $0x30,%rsp
1401 CFI_ADJUST_CFA_OFFSET -0x30
1402 pushq $0
1403 CFI_ADJUST_CFA_OFFSET 8
1404 pushq %r11
1405 CFI_ADJUST_CFA_OFFSET 8
1406 pushq %rcx
1407 CFI_ADJUST_CFA_OFFSET 8
1408 jmp general_protection
1409 CFI_RESTORE_STATE
14101: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1411 movq (%rsp),%rcx
1412 CFI_RESTORE rcx
1413 movq 8(%rsp),%r11
1414 CFI_RESTORE r11
1415 addq $0x30,%rsp
1416 CFI_ADJUST_CFA_OFFSET -0x30
1417 pushq $0
1418 CFI_ADJUST_CFA_OFFSET 8
1419 SAVE_ALL
1420 jmp error_exit
1421 CFI_ENDPROC
1422END(xen_failsafe_callback)
1423
1424#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b0..3c392934069 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -24,6 +24,7 @@
24#include <asm/pgtable.h> 24#include <asm/pgtable.h>
25#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
27 28
28DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
29EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
40short uv_possible_blades; 41short uv_possible_blades;
41EXPORT_SYMBOL_GPL(uv_possible_blades); 42EXPORT_SYMBOL_GPL(uv_possible_blades);
42 43
44unsigned long sn_rtc_cycles_per_second;
45EXPORT_SYMBOL(sn_rtc_cycles_per_second);
46
43/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 47/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
44 48
45static cpumask_t uv_target_cpus(void) 49static cpumask_t uv_target_cpus(void)
@@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
272 map_high("MMIOH", mmioh.s.base, shift, map_uc); 276 map_high("MMIOH", mmioh.s.base, shift, map_uc);
273} 277}
274 278
279static __init void uv_rtc_init(void)
280{
281 long status, ticks_per_sec, drift;
282
283 status =
284 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
285 &drift);
286 if (status != 0 || ticks_per_sec < 100000) {
287 printk(KERN_WARNING
288 "unable to determine platform RTC clock frequency, "
289 "guessing.\n");
290 /* BIOS gives wrong value for clock freq. so guess */
291 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
292 } else
293 sn_rtc_cycles_per_second = ticks_per_sec;
294}
295
275static __init void uv_system_init(void) 296static __init void uv_system_init(void)
276{ 297{
277 union uvh_si_addr_map_config_u m_n_config; 298 union uvh_si_addr_map_config_u m_n_config;
@@ -326,6 +347,8 @@ static __init void uv_system_init(void)
326 gnode_upper = (((unsigned long)node_id.s.node_id) & 347 gnode_upper = (((unsigned long)node_id.s.node_id) &
327 ~((1 << n_val) - 1)) << m_val; 348 ~((1 << n_val) - 1)) << m_val;
328 349
350 uv_rtc_init();
351
329 for_each_present_cpu(cpu) { 352 for_each_present_cpu(cpu) {
330 nid = cpu_to_node(cpu); 353 nid = cpu_to_node(cpu);
331 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 354 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c9781982914..1b318e903bf 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
102 109
103 early_printk("Kernel alive\n"); 110 early_printk("Kernel alive\n");
104 111
105 _cpu_pda = __cpu_pda; 112 x86_64_init_pda();
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 early_printk("Kernel really alive\n");
110 115
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217c..db3280afe88 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 558abf4c796..de9aa0e3a9c 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
756 /* 756 /*
757 * Send the IPI. The write to APIC_ICR fires this off. 757 * Send the IPI. The write to APIC_ICR fires this off.
758 */ 758 */
759 apic_write_around(APIC_ICR, cfg); 759 apic_write(APIC_ICR, cfg);
760} 760}
761#endif /* !CONFIG_SMP */ 761#endif /* !CONFIG_SMP */
762 762
@@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
2030 unsigned long v; 2030 unsigned long v;
2031 2031
2032 v = apic_read(APIC_LVT0); 2032 v = apic_read(APIC_LVT0);
2033 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 2033 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2034} 2034}
2035 2035
2036static void unmask_lapic_irq(unsigned int irq) 2036static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
2038 unsigned long v; 2038 unsigned long v;
2039 2039
2040 v = apic_read(APIC_LVT0); 2040 v = apic_read(APIC_LVT0);
2041 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); 2041 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2042} 2042}
2043 2043
2044static struct irq_chip lapic_chip __read_mostly = { 2044static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
2168 * The AEOI mode will finish them in the 8259A 2168 * The AEOI mode will finish them in the 8259A
2169 * automatically. 2169 * automatically.
2170 */ 2170 */
2171 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2171 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2172 init_8259A(1); 2172 init_8259A(1);
2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); 2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2174 2174
@@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
2177 pin2 = ioapic_i8259.pin; 2177 pin2 = ioapic_i8259.pin;
2178 apic2 = ioapic_i8259.apic; 2178 apic2 = ioapic_i8259.apic;
2179 2179
2180 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2180 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2181 vector, apic1, pin1, apic2, pin2); 2181 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2182 vector, apic1, pin1, apic2, pin2);
2182 2183
2183 /* 2184 /*
2184 * Some BIOS writers are clueless and report the ExtINTA 2185 * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
2216 } 2217 }
2217 clear_IO_APIC_pin(apic1, pin1); 2218 clear_IO_APIC_pin(apic1, pin1);
2218 if (!no_pin1) 2219 if (!no_pin1)
2219 printk(KERN_ERR "..MP-BIOS bug: " 2220 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2220 "8254 timer not connected to IO-APIC\n"); 2221 "8254 timer not connected to IO-APIC\n");
2221 2222
2222 printk(KERN_INFO "...trying to set up timer (IRQ0) " 2223 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2223 "through the 8259A ... "); 2224 "(IRQ0) through the 8259A ...\n");
2224 printk("\n..... (found pin %d) ...", pin2); 2225 apic_printk(APIC_QUIET, KERN_INFO
2226 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2225 /* 2227 /*
2226 * legacy devices should be connected to IO APIC #0 2228 * legacy devices should be connected to IO APIC #0
2227 */ 2229 */
@@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
2230 unmask_IO_APIC_irq(0); 2232 unmask_IO_APIC_irq(0);
2231 enable_8259A_irq(0); 2233 enable_8259A_irq(0);
2232 if (timer_irq_works()) { 2234 if (timer_irq_works()) {
2233 printk("works.\n"); 2235 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2234 timer_through_8259 = 1; 2236 timer_through_8259 = 1;
2235 if (nmi_watchdog == NMI_IO_APIC) { 2237 if (nmi_watchdog == NMI_IO_APIC) {
2236 disable_8259A_irq(0); 2238 disable_8259A_irq(0);
@@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
2244 */ 2246 */
2245 disable_8259A_irq(0); 2247 disable_8259A_irq(0);
2246 clear_IO_APIC_pin(apic2, pin2); 2248 clear_IO_APIC_pin(apic2, pin2);
2247 printk(" failed.\n"); 2249 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2248 } 2250 }
2249 2251
2250 if (nmi_watchdog == NMI_IO_APIC) { 2252 if (nmi_watchdog == NMI_IO_APIC) {
2251 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2253 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2254 "through the IO-APIC - disabling NMI Watchdog!\n");
2252 nmi_watchdog = NMI_NONE; 2255 nmi_watchdog = NMI_NONE;
2253 } 2256 }
2254 timer_ack = 0; 2257 timer_ack = 0;
2255 2258
2256 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2259 apic_printk(APIC_QUIET, KERN_INFO
2260 "...trying to set up timer as Virtual Wire IRQ...\n");
2257 2261
2258 lapic_register_intr(0, vector); 2262 lapic_register_intr(0, vector);
2259 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2263 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2260 enable_8259A_irq(0); 2264 enable_8259A_irq(0);
2261 2265
2262 if (timer_irq_works()) { 2266 if (timer_irq_works()) {
2263 printk(" works.\n"); 2267 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2264 goto out; 2268 goto out;
2265 } 2269 }
2266 disable_8259A_irq(0); 2270 disable_8259A_irq(0);
2267 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); 2271 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2268 printk(" failed.\n"); 2272 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2269 2273
2270 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2274 apic_printk(APIC_QUIET, KERN_INFO
2275 "...trying to set up timer as ExtINT IRQ...\n");
2271 2276
2272 init_8259A(0); 2277 init_8259A(0);
2273 make_8259A_irq(0); 2278 make_8259A_irq(0);
2274 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 2279 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2275 2280
2276 unlock_ExtINT_logic(); 2281 unlock_ExtINT_logic();
2277 2282
2278 if (timer_irq_works()) { 2283 if (timer_irq_works()) {
2279 printk(" works.\n"); 2284 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2280 goto out; 2285 goto out;
2281 } 2286 }
2282 printk(" failed :(.\n"); 2287 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2283 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2288 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2284 "report. Then try booting with the 'noapic' option"); 2289 "report. Then try booting with the 'noapic' option.\n");
2285out: 2290out:
2286 local_irq_restore(flags); 2291 local_irq_restore(flags);
2287} 2292}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 6510cde36b3..64a46affd85 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -45,6 +45,7 @@
45#include <asm/proto.h> 45#include <asm/proto.h>
46#include <asm/acpi.h> 46#include <asm/acpi.h>
47#include <asm/dma.h> 47#include <asm/dma.h>
48#include <asm/i8259.h>
48#include <asm/nmi.h> 49#include <asm/nmi.h>
49#include <asm/msidef.h> 50#include <asm/msidef.h>
50#include <asm/hypertransport.h> 51#include <asm/hypertransport.h>
@@ -1696,8 +1697,9 @@ static inline void __init check_timer(void)
1696 pin2 = ioapic_i8259.pin; 1697 pin2 = ioapic_i8259.pin;
1697 apic2 = ioapic_i8259.apic; 1698 apic2 = ioapic_i8259.apic;
1698 1699
1699 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 1700 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1700 cfg->vector, apic1, pin1, apic2, pin2); 1701 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
1702 cfg->vector, apic1, pin1, apic2, pin2);
1701 1703
1702 /* 1704 /*
1703 * Some BIOS writers are clueless and report the ExtINTA 1705 * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1737,13 @@ static inline void __init check_timer(void)
1735 } 1737 }
1736 clear_IO_APIC_pin(apic1, pin1); 1738 clear_IO_APIC_pin(apic1, pin1);
1737 if (!no_pin1) 1739 if (!no_pin1)
1738 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " 1740 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1739 "8254 timer not connected to IO-APIC\n"); 1741 "8254 timer not connected to IO-APIC\n");
1740 1742
1741 apic_printk(APIC_VERBOSE,KERN_INFO 1743 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1742 "...trying to set up timer (IRQ0) " 1744 "(IRQ0) through the 8259A ...\n");
1743 "through the 8259A ... "); 1745 apic_printk(APIC_QUIET, KERN_INFO
1744 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 1746 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1745 apic2, pin2);
1746 /* 1747 /*
1747 * legacy devices should be connected to IO APIC #0 1748 * legacy devices should be connected to IO APIC #0
1748 */ 1749 */
@@ -1751,7 +1752,7 @@ static inline void __init check_timer(void)
1751 unmask_IO_APIC_irq(0); 1752 unmask_IO_APIC_irq(0);
1752 enable_8259A_irq(0); 1753 enable_8259A_irq(0);
1753 if (timer_irq_works()) { 1754 if (timer_irq_works()) {
1754 apic_printk(APIC_VERBOSE," works.\n"); 1755 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1755 timer_through_8259 = 1; 1756 timer_through_8259 = 1;
1756 if (nmi_watchdog == NMI_IO_APIC) { 1757 if (nmi_watchdog == NMI_IO_APIC) {
1757 disable_8259A_irq(0); 1758 disable_8259A_irq(0);
@@ -1765,29 +1766,32 @@ static inline void __init check_timer(void)
1765 */ 1766 */
1766 disable_8259A_irq(0); 1767 disable_8259A_irq(0);
1767 clear_IO_APIC_pin(apic2, pin2); 1768 clear_IO_APIC_pin(apic2, pin2);
1768 apic_printk(APIC_VERBOSE," failed.\n"); 1769 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1769 } 1770 }
1770 1771
1771 if (nmi_watchdog == NMI_IO_APIC) { 1772 if (nmi_watchdog == NMI_IO_APIC) {
1772 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 1773 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
1774 "through the IO-APIC - disabling NMI Watchdog!\n");
1773 nmi_watchdog = NMI_NONE; 1775 nmi_watchdog = NMI_NONE;
1774 } 1776 }
1775 1777
1776 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 1778 apic_printk(APIC_QUIET, KERN_INFO
1779 "...trying to set up timer as Virtual Wire IRQ...\n");
1777 1780
1778 lapic_register_intr(0); 1781 lapic_register_intr(0);
1779 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 1782 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1780 enable_8259A_irq(0); 1783 enable_8259A_irq(0);
1781 1784
1782 if (timer_irq_works()) { 1785 if (timer_irq_works()) {
1783 apic_printk(APIC_VERBOSE," works.\n"); 1786 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1784 goto out; 1787 goto out;
1785 } 1788 }
1786 disable_8259A_irq(0); 1789 disable_8259A_irq(0);
1787 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 1790 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1788 apic_printk(APIC_VERBOSE," failed.\n"); 1791 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1789 1792
1790 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 1793 apic_printk(APIC_QUIET, KERN_INFO
1794 "...trying to set up timer as ExtINT IRQ...\n");
1791 1795
1792 init_8259A(0); 1796 init_8259A(0);
1793 make_8259A_irq(0); 1797 make_8259A_irq(0);
@@ -1796,11 +1800,12 @@ static inline void __init check_timer(void)
1796 unlock_ExtINT_logic(); 1800 unlock_ExtINT_logic();
1797 1801
1798 if (timer_irq_works()) { 1802 if (timer_irq_works()) {
1799 apic_printk(APIC_VERBOSE," works.\n"); 1803 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1800 goto out; 1804 goto out;
1801 } 1805 }
1802 apic_printk(APIC_VERBOSE," failed :(.\n"); 1806 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1803 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 1807 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
1808 "report. Then try booting with the 'noapic' option.\n");
1804out: 1809out:
1805 local_irq_restore(flags); 1810 local_irq_restore(flags);
1806} 1811}
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a64..1c3a66a67f8 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -103,6 +103,9 @@ void __init io_delay_init(void)
103 103
104static int __init io_delay_param(char *s) 104static int __init io_delay_param(char *s)
105{ 105{
106 if (!s)
107 return -EINVAL;
108
106 if (!strcmp(s, "0x80")) 109 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 110 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 111 else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad..3f7537b669d 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
70 /* 70 /*
71 * Send the IPI. The write to APIC_ICR fires this off. 71 * Send the IPI. The write to APIC_ICR fires this off.
72 */ 72 */
73 apic_write_around(APIC_ICR, cfg); 73 apic_write(APIC_ICR, cfg);
74} 74}
75 75
76void send_IPI_self(int vector) 76void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
98 * prepare target chip field 98 * prepare target chip field
99 */ 99 */
100 cfg = __prepare_ICR2(mask); 100 cfg = __prepare_ICR2(mask);
101 apic_write_around(APIC_ICR2, cfg); 101 apic_write(APIC_ICR2, cfg);
102 102
103 /* 103 /*
104 * program the ICR 104 * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
108 /* 108 /*
109 * Send the IPI. The write to APIC_ICR fires this off. 109 * Send the IPI. The write to APIC_ICR fires this off.
110 */ 110 */
111 apic_write_around(APIC_ICR, cfg); 111 apic_write(APIC_ICR, cfg);
112} 112}
113 113
114/* 114/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f1247..1cf8c1fcc08 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 85
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 88
92static void call_on_stack(void *func, void *stack) 89static void call_on_stack(void *func, void *stack)
93{ 90{
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c0320599171..f2d43bc7551 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
209{ 213{
210 int error = 0; 214 int error = 0;
211 215
216 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
217 if (!arch_debugfs_dir)
218 return -ENOMEM;
219
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 220#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 221 error = boot_params_kdebugfs_init();
214#endif 222#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13d..43c019f85f0 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 860
861 resume_execution(cur, regs, kcb); 861 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 862 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 863
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 864 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 865 kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1d..d02def06ca9 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 113#endif
114 114
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 116static void __init kvm_smp_prepare_boot_cpu(void)
117{ 117{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 119 native_smp_prepare_boot_cpu();
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f587..0e867676b5a 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 150 const Elf_Shdr *sechdrs,
151 struct module *me) 151 struct module *me)
152{ 152{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
154 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 155 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 156
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 157 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 161 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 162 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 163 locks= s;
164 if (!strcmp(".parainstructions", secstrings + s->sh_name))
165 para = s;
163 } 166 }
164 167
165 if (alt) { 168 if (alt) {
@@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 178 tseg, tseg + text->sh_size);
176 } 179 }
177 180
181 if (para) {
182 void *pseg = (void *)para->sh_addr;
183 apply_paravirt(pseg, pseg + para->sh_size);
184 }
185
178 return module_bug_finalize(hdr, sechdrs, me); 186 return module_bug_finalize(hdr, sechdrs, me);
179} 187}
180 188
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c..6ae005ccaed 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
30#include <asm/setup.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
48 return sum & 0xFF; 49 return sum & 0xFF;
49} 50}
50 51
51#ifdef CONFIG_X86_NUMAQ
52int found_numaq;
53/*
54 * Have to match translation table entries to main table entries by counter
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68
69static int mpc_record;
70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
119#endif
120
121static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 52static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
122{ 53{
123 int apicid; 54 int apicid;
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
127 disabled_cpus++; 58 disabled_cpus++;
128 return; 59 return;
129 } 60 }
130#ifdef CONFIG_X86_NUMAQ 61
131 if (found_numaq) 62 if (x86_quirks->mpc_apic_id)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]); 63 apicid = x86_quirks->mpc_apic_id(m);
133 else 64 else
134 apicid = m->mpc_apicid; 65 apicid = m->mpc_apicid;
135#else 66
136 apicid = m->mpc_apicid;
137#endif
138 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
139 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
140 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
151 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
152 str[6] = 0; 81 str[6] = 0;
153 82
154#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
155 if (found_numaq) 84 x86_quirks->mpc_oem_bus_info(m, str);
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 85 else
157#else 86 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
159#endif
160 87
161#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
162 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
174#endif 101#endif
175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
176#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
177 if (found_numaq) 104 x86_quirks->mpc_oem_pci_bus(m);
178 mpc_oem_pci_bus(m, translation_table[mpc_record]); 105
179#endif
180 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
181#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
316 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
317} 243}
318 244
319#ifdef CONFIG_X86_NUMAQ
320static void __init MP_translation_info(struct mpc_config_translation *m)
321{
322 printk(KERN_INFO
323 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
324 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
325 m->trans_local);
326
327 if (mpc_record >= MAX_MPC_ENTRY)
328 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
329 else
330 translation_table[mpc_record] = m; /* stash this for later */
331 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
332 node_set_online(m->trans_quad);
333}
334
335/*
336 * Read/parse the MPC oem tables
337 */
338
339static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
340 unsigned short oemsize)
341{
342 int count = sizeof(*oemtable); /* the header size */
343 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
344
345 mpc_record = 0;
346 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
347 oemtable);
348 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
349 printk(KERN_WARNING
350 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
351 oemtable->oem_signature[0], oemtable->oem_signature[1],
352 oemtable->oem_signature[2], oemtable->oem_signature[3]);
353 return;
354 }
355 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
356 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
357 return;
358 }
359 while (count < oemtable->oem_length) {
360 switch (*oemptr) {
361 case MP_TRANSLATION:
362 {
363 struct mpc_config_translation *m =
364 (struct mpc_config_translation *)oemptr;
365 MP_translation_info(m);
366 oemptr += sizeof(*m);
367 count += sizeof(*m);
368 ++mpc_record;
369 break;
370 }
371 default:
372 {
373 printk(KERN_WARNING
374 "Unrecognised OEM table entry type! - %d\n",
375 (int)*oemptr);
376 return;
377 }
378 }
379 }
380}
381
382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
383 char *productid)
384{
385 if (strncmp(oem, "IBM NUMA", 8))
386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
390 if (mpc->mpc_oemptr)
391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
392 mpc->mpc_oemsize);
393}
394#endif /* CONFIG_X86_NUMAQ */
395
396/* 245/*
397 * Read/parse the MPC 246 * Read/parse the MPC
398 */ 247 */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
457 } else 306 } else
458 mps_oem_check(mpc, oem, str); 307 mps_oem_check(mpc, oem, str);
459#endif 308#endif
460
461 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
462 if (!acpi_lapic) 310 if (!acpi_lapic)
463 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
465 if (early) 313 if (early)
466 return 1; 314 return 1;
467 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
468 /* 321 /*
469 * Now process the configuration blocks. 322 * Now process the configuration blocks.
470 */ 323 */
471#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
472 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
473#endif 326
474 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
475 switch (*mpt) { 328 switch (*mpt) {
476 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
536 count = mpc->mpc_length; 389 count = mpc->mpc_length;
537 break; 390 break;
538 } 391 }
539#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
540 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
541#endif
542 } 394 }
543 395
544#ifdef CONFIG_X86_GENERICARCH 396#ifdef CONFIG_X86_GENERICARCH
@@ -726,20 +578,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 578static struct intel_mp_floating *mpf_found;
727 579
728/* 580/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
735 * Scan the memory blocks for an SMP configuration block. 581 * Scan the memory blocks for an SMP configuration block.
736 */ 582 */
737static void __init __get_smp_config(unsigned int early) 583static void __init __get_smp_config(unsigned int early)
738{ 584{
739 struct intel_mp_floating *mpf = mpf_found; 585 struct intel_mp_floating *mpf = mpf_found;
740 586
741 if (mach_get_smp_config_quirk) { 587 if (x86_quirks->mach_get_smp_config) {
742 if (mach_get_smp_config_quirk(early)) 588 if (x86_quirks->mach_get_smp_config(early))
743 return; 589 return;
744 } 590 }
745 if (acpi_lapic && early) 591 if (acpi_lapic && early)
@@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
899 return 0; 745 return 0;
900} 746}
901 747
902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve) 748static void __init __find_smp_config(unsigned int reserve)
905{ 749{
906 unsigned int address; 750 unsigned int address;
907 751
908 if (mach_find_smp_config_quirk) { 752 if (x86_quirks->mach_find_smp_config) {
909 if (mach_find_smp_config_quirk(reserve)) 753 if (x86_quirks->mach_find_smp_config(reserve))
910 return; 754 return;
911 } 755 }
912 /* 756 /*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad..ac6d51222e7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
263 263
264static void __acpi_nmi_enable(void *__unused) 264static void __acpi_nmi_enable(void *__unused)
265{ 265{
266 apic_write_around(APIC_LVT0, APIC_DM_NMI); 266 apic_write(APIC_LVT0, APIC_DM_NMI);
267} 267}
268 268
269/* 269/*
@@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
277 277
278static void __acpi_nmi_disable(void *__unused) 278static void __acpi_nmi_disable(void *__unused)
279{ 279{
280 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); 280 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
281} 281}
282 282
283/* 283/*
@@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
448 448
449#ifdef CONFIG_SYSCTL 449#ifdef CONFIG_SYSCTL
450 450
451static int __init setup_unknown_nmi_panic(char *str)
452{
453 unknown_nmi_panic = 1;
454 return 1;
455}
456__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
457
451static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 458static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
452{ 459{
453 unsigned char reason = get_nmi_reason(); 460 unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9a..b8c45610b20 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h>
36 37
37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
38 39
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
71 } 72 }
72} 73}
73 74
75
76void __init numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL,
235 .arch_pre_intr_init = NULL,
236 .arch_memory_setup = NULL,
237 .arch_intr_init = NULL,
238 .arch_trap_init = NULL,
239 .mach_get_smp_config = NULL,
240 .mach_find_smp_config = NULL,
241 .mpc_record = &mpc_record,
242 .mpc_apic_id = mpc_apic_id,
243 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem,
246};
247
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
249 char *productid)
250{
251 if (strncmp(oem, "IBM NUMA", 8))
252 printk("Warning! Not a NUMA-Q system!\n");
253 else
254 found_numaq = 1;
255}
256
74static __init void early_check_numaq(void) 257static __init void early_check_numaq(void)
75{ 258{
76 /* 259 /*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
82 */ 265 */
83 if (smp_found_config) 266 if (smp_found_config)
84 early_get_smp_config(); 267 early_get_smp_config();
268
269 if (found_numaq)
270 x86_quirks = &numaq_x86_quirks;
85} 271}
86 272
87int __init get_memcfg_numaq(void) 273int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
92 smp_dump_qct(); 278 smp_dump_qct();
93 return 1; 279 return 1;
94} 280}
95
96void __init numaq_tsc_disable(void)
97{
98 if (!found_numaq)
99 return;
100
101 if (num_online_nodes() > 1) {
102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
103 setup_clear_cpu_cap(X86_FEATURE_TSC);
104 }
105}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c1..b4564d089b4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -361,7 +362,6 @@ struct pv_cpu_ops pv_cpu_ops = {
361struct pv_apic_ops pv_apic_ops = { 362struct pv_apic_ops pv_apic_ops = {
362#ifdef CONFIG_X86_LOCAL_APIC 363#ifdef CONFIG_X86_LOCAL_APIC
363 .apic_write = native_apic_write, 364 .apic_write = native_apic_write,
364 .apic_write_atomic = native_apic_write_atomic,
365 .apic_read = native_apic_read, 365 .apic_read = native_apic_read,
366 .setup_boot_clock = setup_boot_APIC_clock, 366 .setup_boot_clock = setup_boot_APIC_clock,
367 .setup_secondary_clock = setup_secondary_APIC_clock, 367 .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +373,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 373#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 374 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 375 .pagetable_setup_done = native_pagetable_setup_done,
376#else
377 .pagetable_setup_start = paravirt_nop,
378 .pagetable_setup_done = paravirt_nop,
376#endif 379#endif
377 380
378 .read_cr2 = native_read_cr2, 381 .read_cr2 = native_read_cr2,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df..151f2d171f7 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -36,7 +36,7 @@
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/scatterlist.h> 37#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 38#include <linux/iommu-helper.h>
39#include <asm/gart.h> 39#include <asm/iommu.h>
40#include <asm/calgary.h> 40#include <asm/calgary.h>
41#include <asm/tce.h> 41#include <asm/tce.h>
42#include <asm/pci-direct.h> 42#include <asm/pci-direct.h>
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8467ec2320f..a4213c00dff 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,12 +5,11 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
13EXPORT_SYMBOL(forbid_dac);
14 13
15const struct dma_mapping_ops *dma_ops; 14const struct dma_mapping_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
@@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
114 * The order of these functions is important for 113 * The order of these functions is important for
115 * fall-back/fail-over reasons 114 * fall-back/fail-over reasons
116 */ 115 */
117#ifdef CONFIG_GART_IOMMU
118 gart_iommu_hole_init(); 116 gart_iommu_hole_init();
119#endif
120 117
121#ifdef CONFIG_CALGARY_IOMMU
122 detect_calgary(); 118 detect_calgary();
123#endif
124 119
125 detect_intel_iommu(); 120 detect_intel_iommu();
126 121
127 amd_iommu_detect(); 122 amd_iommu_detect();
128 123
129#ifdef CONFIG_SWIOTLB
130 pci_swiotlb_init(); 124 pci_swiotlb_init();
131#endif
132} 125}
133#endif 126#endif
134 127
@@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
184 swiotlb = 1; 177 swiotlb = 1;
185#endif 178#endif
186 179
187#ifdef CONFIG_GART_IOMMU
188 gart_parse_options(p); 180 gart_parse_options(p);
189#endif
190 181
191#ifdef CONFIG_CALGARY_IOMMU 182#ifdef CONFIG_CALGARY_IOMMU
192 if (!strncmp(p, "calgary", 7)) 183 if (!strncmp(p, "calgary", 7))
@@ -500,17 +491,13 @@ EXPORT_SYMBOL(dma_free_coherent);
500 491
501static int __init pci_iommu_init(void) 492static int __init pci_iommu_init(void)
502{ 493{
503#ifdef CONFIG_CALGARY_IOMMU
504 calgary_iommu_init(); 494 calgary_iommu_init();
505#endif
506 495
507 intel_iommu_init(); 496 intel_iommu_init();
508 497
509 amd_iommu_init(); 498 amd_iommu_init();
510 499
511#ifdef CONFIG_GART_IOMMU
512 gart_iommu_init(); 500 gart_iommu_init();
513#endif
514 501
515 no_iommu_init(); 502 no_iommu_init();
516 return 0; 503 return 0;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d1..be60961f869 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -32,6 +32,7 @@
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49..792b9179eff 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,7 +7,7 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04..20df839b9c2 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f..7fc4d5b0a6a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,6 +15,7 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
18 19
19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
20{ 21{
@@ -199,6 +200,7 @@ static void poll_idle(void)
199 * 200 *
200 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
201 */ 202 */
203static int __cpuinitdata force_mwait;
202 204
203#define MWAIT_INFO 0x05 205#define MWAIT_INFO 0x05
204#define MWAIT_ECX_EXTENDED_INFO 0x01 206#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
326 328
327static int __init idle_setup(char *str) 329static int __init idle_setup(char *str)
328{ 330{
331 if (!str)
332 return -EINVAL;
333
329 if (!strcmp(str, "poll")) { 334 if (!strcmp(str, "poll")) {
330 printk("using polling idle threads.\n"); 335 printk("using polling idle threads.\n");
331 pm_idle = poll_idle; 336 pm_idle = poll_idle;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9..e8a8e1b9981 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 537struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 538__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 539{
540 struct thread_struct *prev = &prev_p->thread, 540 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 541 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 542 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 543 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 544 unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 586
587 /* 587 /*
588 * Switch FS and GS. 588 * Switch FS and GS.
589 *
590 * Segment register != 0 always requires a reload. Also
591 * reload when it has changed. When prev process used 64bit
592 * base always reload to avoid an information leak.
589 */ 593 */
590 { 594 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 595 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 596 /*
593 when prev process used 64bit base always reload 597 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 598 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 599 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 600 */
597 /* check if the user used a selector != 0 601 if (fsindex)
598 * if yes clear 64bit base, since overloaded base
599 * is always mapped to the Null selector
600 */
601 if (fsindex)
602 prev->fs = 0; 602 prev->fs = 0;
603 } 603 }
604 /* when next process has a 64bit base use it */ 604 /* when next process has a 64bit base use it */
605 if (next->fs) 605 if (next->fs)
606 wrmsrl(MSR_FS_BASE, next->fs); 606 wrmsrl(MSR_FS_BASE, next->fs);
607 prev->fsindex = fsindex; 607 prev->fsindex = fsindex;
608 608
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 610 load_gs_index(next->gsindex);
611 if (gsindex) 611 if (gsindex)
612 prev->gs = 0; 612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
618 617
619 /* Must be after DS reload */ 618 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 619 unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 write_pda(pcurrent, next_p); 626 write_pda(pcurrent, next_p);
628 627
629 write_pda(kernelstack, 628 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 629 (unsigned long)task_stack_page(next_p) +
630 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 632 write_pda(stack_canary, next_p->stack_canary);
633 /* 633 /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e..e37dccce85d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1357#endif
1358} 1358}
1359 1359
1360#ifdef CONFIG_X86_32
1361
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1361{
1364 struct siginfo info; 1362 struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1375 force_sig_info(SIGTRAP, &info, tsk);
1378} 1376}
1379 1377
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435
1436 /*
1437 * this isn't the same as continuing with a signal, but it will do
1438 * for normal use. strace only continues with a signal if the
1439 * stopping signal is not SIGTRAP. -brl
1440 */
1441 if (current->exit_code) {
1442 send_sig(current->exit_code, current, 1);
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460
1461static void syscall_trace(struct pt_regs *regs) 1378static void syscall_trace(struct pt_regs *regs)
1462{ 1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1463 1382
1464#if 0 1383#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", 1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
1481 } 1400 }
1482} 1401}
1483 1402
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs) 1403#ifdef CONFIG_X86_32
1404# define IS_IA32 1
1405#elif defined CONFIG_IA32_EMULATION
1406# define IS_IA32 test_thread_flag(TIF_IA32)
1407#else
1408# define IS_IA32 0
1409#endif
1410
1411/*
1412 * We must return the syscall number to actually look up in the table.
1413 * This can be -1L to skip running any syscall at all.
1414 */
1415asmregparm long syscall_trace_enter(struct pt_regs *regs)
1485{ 1416{
1417 long ret = 0;
1418
1419 /*
1420 * If we stepped into a sysenter/syscall insn, it trapped in
1421 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1422 * If user-mode had set TF itself, then it's still clear from
1423 * do_debug() and we need to set it again to restore the user
1424 * state. If we entered on the slow path, TF was already set.
1425 */
1426 if (test_thread_flag(TIF_SINGLESTEP))
1427 regs->flags |= X86_EFLAGS_TF;
1428
1486 /* do the secure computing check first */ 1429 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1430 secure_computing(regs->orig_ax);
1488 1431
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1433 ret = -1L;
1434
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
1491 syscall_trace(regs); 1436 syscall_trace(regs);
1492 1437
1493 if (unlikely(current->audit_context)) { 1438 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1439 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1440 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1441 regs->orig_ax,
1497 regs->bx, regs->cx, 1442 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1443 regs->dx, regs->si);
1499 } else { 1444#ifdef CONFIG_X86_64
1445 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1446 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1447 regs->orig_ax,
1502 regs->di, regs->si, 1448 regs->di, regs->si,
1503 regs->dx, regs->r10); 1449 regs->dx, regs->r10);
1504 } 1450#endif
1505 } 1451 }
1452
1453 return ret ?: regs->orig_ax;
1506} 1454}
1507 1455
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1456asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1457{
1510 if (unlikely(current->audit_context)) 1458 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1460
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1461 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP))
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs); 1462 syscall_trace(regs);
1517}
1518 1463
1519#endif /* CONFIG_X86_32 */ 1464 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of
1466 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1467 * We already reported this syscall instruction in
1468 * syscall_trace_enter(), so don't do any more now.
1469 */
1470 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1471 return;
1472
1473 /*
1474 * If we are single-stepping, synthesize a trap to follow the
1475 * system call instruction.
1476 */
1477 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED))
1479 send_sigtrap(current, regs, 0);
1480}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e15..9dcf39c0297 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 178 },
179 }, 179 },
180 { /* Handle problems with rebooting on Dell T5400's */
181 .callback = set_bios_reboot,
182 .ident = "Dell Precision T5400",
183 .matches = {
184 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
185 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
186 },
187 },
180 { /* Handle problems with rebooting on HP laptops */ 188 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 189 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 190 .ident = "HP Compaq Laptop",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81..ec952aa5394 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/user.h> 58#include <linux/user.h>
59#include <linux/delay.h> 59#include <linux/delay.h>
60#include <linux/highmem.h>
61 60
62#include <linux/kallsyms.h> 61#include <linux/kallsyms.h>
63#include <linux/edd.h>
64#include <linux/iscsi_ibft.h>
65#include <linux/kexec.h>
66#include <linux/cpufreq.h> 62#include <linux/cpufreq.h>
67#include <linux/dma-mapping.h> 63#include <linux/dma-mapping.h>
68#include <linux/ctype.h> 64#include <linux/ctype.h>
@@ -96,7 +92,7 @@
96#include <asm/smp.h> 92#include <asm/smp.h>
97#include <asm/desc.h> 93#include <asm/desc.h>
98#include <asm/dma.h> 94#include <asm/dma.h>
99#include <asm/gart.h> 95#include <asm/iommu.h>
100#include <asm/mmu_context.h> 96#include <asm/mmu_context.h>
101#include <asm/proto.h> 97#include <asm/proto.h>
102 98
@@ -104,7 +100,6 @@
104#include <asm/paravirt.h> 100#include <asm/paravirt.h>
105 101
106#include <asm/percpu.h> 102#include <asm/percpu.h>
107#include <asm/sections.h>
108#include <asm/topology.h> 103#include <asm/topology.h>
109#include <asm/apicdef.h> 104#include <asm/apicdef.h>
110#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
579early_param("elfcorehdr", setup_elfcorehdr); 574early_param("elfcorehdr", setup_elfcorehdr);
580#endif 575#endif
581 576
577static struct x86_quirks default_x86_quirks __initdata;
578
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580
582/* 581/*
583 * Determine if we were loaded by an EFI loader. If so, then we have also been 582 * Determine if we were loaded by an EFI loader. If so, then we have also been
584 * passed the efi memmap, systab, etc., so we should use these data structures 583 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -824,7 +823,10 @@ void __init setup_arch(char **cmdline_p)
824 vmi_init(); 823 vmi_init();
825#endif 824#endif
826 825
826 paravirt_pagetable_setup_start(swapper_pg_dir);
827 paging_init(); 827 paging_init();
828 paravirt_pagetable_setup_done(swapper_pg_dir);
829 paravirt_post_allocator_init();
828 830
829#ifdef CONFIG_X86_64 831#ifdef CONFIG_X86_64
830 map_vsyscall(); 832 map_vsyscall();
@@ -854,14 +856,6 @@ void __init setup_arch(char **cmdline_p)
854 init_cpu_to_node(); 856 init_cpu_to_node();
855#endif 857#endif
856 858
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
865 init_apic_mappings(); 859 init_apic_mappings();
866 ioapic_init_mappings(); 860 ioapic_init_mappings();
867 861
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d9237363096..07faaa5109c 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 212
213badframe: 213badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 214 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 215 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 216 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 218 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs)
657void 657void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 659{
660 /* Pending single-step? */
661 if (thread_info_flags & _TIF_SINGLESTEP) {
662 regs->flags |= X86_EFLAGS_TF;
663 clear_thread_flag(TIF_SINGLESTEP);
664 }
665
666 /* deal with pending signal delivery */ 660 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 662 do_signal(regs);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e..bf87684474f 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs)
487void do_notify_resume(struct pt_regs *regs, void *unused, 487void do_notify_resume(struct pt_regs *regs, void *unused,
488 __u32 thread_info_flags) 488 __u32 thread_info_flags)
489{ 489{
490 /* Pending single-step? */
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE 490#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 491 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 492 if (thread_info_flags & _TIF_MCE_NOTIFY)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e..27640196eb7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 546 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 547 "a previous APIC delivery may have failed\n");
548 548
549 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 549 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
550 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 550 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 551
552 timeout = 0; 552 timeout = 0;
553 do { 553 do {
@@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 579 int maxlvt;
580 580
581 /* Target chip */ 581 /* Target chip */
582 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 582 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583 583
584 /* Boot on the stack */ 584 /* Boot on the stack */
585 /* Kick the second */ 585 /* Kick the second */
586 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 586 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
587 587
588 Dprintk("Waiting for send to finish...\n"); 588 Dprintk("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 589 send_status = safe_apic_wait_icr_idle();
@@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
592 * Give the other CPU some time to accept the IPI. 592 * Give the other CPU some time to accept the IPI.
593 */ 593 */
594 udelay(200); 594 udelay(200);
595 /*
596 * Due to the Pentium erratum 3AP.
597 */
598 maxlvt = lapic_get_maxlvt(); 595 maxlvt = lapic_get_maxlvt();
599 if (maxlvt > 3) { 596 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
600 apic_read_around(APIC_SPIV);
601 apic_write(APIC_ESR, 0); 597 apic_write(APIC_ESR, 0);
602 }
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 598 accept_status = (apic_read(APIC_ESR) & 0xEF);
604 Dprintk("NMI sent.\n"); 599 Dprintk("NMI sent.\n");
605 600
@@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
625 return send_status; 620 return send_status;
626 } 621 }
627 622
623 maxlvt = lapic_get_maxlvt();
624
628 /* 625 /*
629 * Be paranoid about clearing APIC errors. 626 * Be paranoid about clearing APIC errors.
630 */ 627 */
631 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 628 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
632 apic_read_around(APIC_SPIV); 629 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
633 apic_write(APIC_ESR, 0); 630 apic_write(APIC_ESR, 0);
634 apic_read(APIC_ESR); 631 apic_read(APIC_ESR);
635 } 632 }
636 633
@@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
639 /* 636 /*
640 * Turn INIT on target chip 637 * Turn INIT on target chip
641 */ 638 */
642 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 639 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643 640
644 /* 641 /*
645 * Send IPI 642 * Send IPI
646 */ 643 */
647 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 644 apic_write(APIC_ICR,
648 | APIC_DM_INIT); 645 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
649 646
650 Dprintk("Waiting for send to finish...\n"); 647 Dprintk("Waiting for send to finish...\n");
651 send_status = safe_apic_wait_icr_idle(); 648 send_status = safe_apic_wait_icr_idle();
@@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
655 Dprintk("Deasserting INIT.\n"); 652 Dprintk("Deasserting INIT.\n");
656 653
657 /* Target chip */ 654 /* Target chip */
658 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 655 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
659 656
660 /* Send IPI */ 657 /* Send IPI */
661 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 658 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
662 659
663 Dprintk("Waiting for send to finish...\n"); 660 Dprintk("Waiting for send to finish...\n");
664 send_status = safe_apic_wait_icr_idle(); 661 send_status = safe_apic_wait_icr_idle();
@@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
689 */ 686 */
690 Dprintk("#startup loops: %d.\n", num_starts); 687 Dprintk("#startup loops: %d.\n", num_starts);
691 688
692 maxlvt = lapic_get_maxlvt();
693
694 for (j = 1; j <= num_starts; j++) { 689 for (j = 1; j <= num_starts; j++) {
695 Dprintk("Sending STARTUP #%d.\n", j); 690 Dprintk("Sending STARTUP #%d.\n", j);
696 apic_read_around(APIC_SPIV); 691 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
697 apic_write(APIC_ESR, 0); 692 apic_write(APIC_ESR, 0);
698 apic_read(APIC_ESR); 693 apic_read(APIC_ESR);
699 Dprintk("After apic_write.\n"); 694 Dprintk("After apic_write.\n");
700 695
@@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
703 */ 698 */
704 699
705 /* Target chip */ 700 /* Target chip */
706 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 701 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707 702
708 /* Boot on the stack */ 703 /* Boot on the stack */
709 /* Kick the second */ 704 /* Kick the second */
710 apic_write_around(APIC_ICR, APIC_DM_STARTUP 705 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
711 | (start_eip >> 12));
712 706
713 /* 707 /*
714 * Give the other CPU some time to accept the IPI. 708 * Give the other CPU some time to accept the IPI.
@@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
724 * Give the other CPU some time to accept the IPI. 718 * Give the other CPU some time to accept the IPI.
725 */ 719 */
726 udelay(200); 720 udelay(200);
727 /* 721 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
728 * Due to the Pentium erratum 3AP.
729 */
730 if (maxlvt > 3) {
731 apic_read_around(APIC_SPIV);
732 apic_write(APIC_ESR, 0); 722 apic_write(APIC_ESR, 0);
733 }
734 accept_status = (apic_read(APIC_ESR) & 0xEF); 723 accept_status = (apic_read(APIC_ESR) & 0xEF);
735 if (send_status || accept_status) 724 if (send_status || accept_status)
736 break; 725 break;
@@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
768 * 757 *
769 * Must be called after the _cpu_pda pointer table is initialized. 758 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 759 */
771static int __cpuinit get_local_pda(int cpu) 760int __cpuinit get_local_pda(int cpu)
772{ 761{
773 struct x8664_pda *oldpda, *newpda; 762 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 763 unsigned long size = sizeof(struct x8664_pda);
@@ -1311,7 +1300,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1311 cpu_clear(cpu, cpu_callout_map); 1300 cpu_clear(cpu, cpu_callout_map);
1312 cpu_clear(cpu, cpu_callin_map); 1301 cpu_clear(cpu, cpu_callin_map);
1313 /* was set by cpu_init() */ 1302 /* was set by cpu_init() */
1314 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1303 cpu_clear(cpu, cpu_initialized);
1315 numa_remove_cpu(cpu); 1304 numa_remove_cpu(cpu);
1316} 1305}
1317 1306
@@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
1390{ 1379{
1391 extern unsigned int maxcpus; 1380 extern unsigned int maxcpus;
1392 1381
1393 maxcpus = simple_strtoul(arg, NULL, 0); 1382 if (arg)
1383 maxcpus = simple_strtoul(arg, NULL, 0);
1394 return 0; 1384 return 0;
1395} 1385}
1396early_param("maxcpus", parse_maxcpus); 1386early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791..00000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee678..e8b9863ef8c 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b..ffe3c664afc 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
129 */ 129 */
130void __init time_init(void) 130void __init time_init(void)
131{ 131{
132 pre_time_init_hook();
132 tsc_init(); 133 tsc_init();
133 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
134} 135}
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 8a768973c4f..03df8e45e5a 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -58,6 +58,7 @@
58#include <asm/nmi.h> 58#include <asm/nmi.h>
59#include <asm/smp.h> 59#include <asm/smp.h>
60#include <asm/io.h> 60#include <asm/io.h>
61#include <asm/traps.h>
61 62
62#include "mach_traps.h" 63#include "mach_traps.h"
63 64
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
77gate_desc idt_table[256] 78gate_desc idt_table[256]
78 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
79 80
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int panic_on_unrecovered_nmi; 81int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24; 82int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 83static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
256 237
257static void 238static void
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl) 240 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 241{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
262 printk("%s =======================\n", log_lvl); 243 printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
383 return ud2 == 0x0b0f; 364 return ud2 == 0x0b0f;
384} 365}
385 366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
386int __kprobes __die(const char *str, struct pt_regs *regs, long err) 415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
387{ 416{
388 unsigned short ss; 417 unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
423 */ 452 */
424void die(const char *str, struct pt_regs *regs, long err) 453void die(const char *str, struct pt_regs *regs, long err)
425{ 454{
426 static struct { 455 unsigned long flags = oops_begin();
427 raw_spinlock_t lock;
428 u32 lock_owner;
429 int lock_owner_depth;
430 } die = {
431 .lock = __RAW_SPIN_LOCK_UNLOCKED,
432 .lock_owner = -1,
433 .lock_owner_depth = 0
434 };
435 unsigned long flags;
436
437 oops_enter();
438
439 if (die.lock_owner != raw_smp_processor_id()) {
440 console_verbose();
441 raw_local_irq_save(flags);
442 __raw_spin_lock(&die.lock);
443 die.lock_owner = smp_processor_id();
444 die.lock_owner_depth = 0;
445 bust_spinlocks(1);
446 } else {
447 raw_local_irq_save(flags);
448 }
449 456
450 if (++die.lock_owner_depth < 3) { 457 if (die_nest_count < 3) {
451 report_bug(regs->ip, regs); 458 report_bug(regs->ip, regs);
452 459
453 if (__die(str, regs, err)) 460 if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
456 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
457 } 464 }
458 465
459 bust_spinlocks(0); 466 oops_end(flags, regs, SIGSEGV);
460 die.lock_owner = -1;
461 add_taint(TAINT_DIE);
462 __raw_spin_unlock(&die.lock);
463 raw_local_irq_restore(flags);
464
465 if (!regs)
466 return;
467
468 if (kexec_should_crash(current))
469 crash_kexec(regs);
470
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473
474 if (panic_on_oops)
475 panic("Fatal exception");
476
477 oops_exit();
478 do_exit(SIGSEGV);
479} 467}
480 468
481static inline void 469static inline void
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 2696a683778..3f18d73f420 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -51,30 +51,10 @@
51#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
53#include <asm/pda.h> 53#include <asm/pda.h>
54#include <asm/traps.h>
54 55
55#include <mach_traps.h> 56#include <mach_traps.h>
56 57
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void);
75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77
78int panic_on_unrecovered_nmi; 58int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12; 59int kstack_depth_to_print = 12;
80static unsigned int code_bytes = 64; 60static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
355 .address = print_trace_address, 335 .address = print_trace_address,
356}; 336};
357 337
358void show_trace(struct task_struct *task, struct pt_regs *regs, 338static void
359 unsigned long *stack, unsigned long bp) 339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl)
360{ 341{
361 printk("\nCall Trace:\n"); 342 printk("\nCall Trace:\n");
362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); 343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
363 printk("\n"); 344 printk("\n");
364} 345}
365 346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
366static void 353static void
367_show_stack(struct task_struct *task, struct pt_regs *regs, 354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
368 unsigned long *sp, unsigned long bp) 355 unsigned long *sp, unsigned long bp, char *log_lvl)
369{ 356{
370 unsigned long *stack; 357 unsigned long *stack;
371 int i; 358 int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
399 printk(" %016lx", *stack++); 386 printk(" %016lx", *stack++);
400 touch_nmi_watchdog(); 387 touch_nmi_watchdog();
401 } 388 }
402 show_trace(task, regs, sp, bp); 389 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
403} 390}
404 391
405void show_stack(struct task_struct *task, unsigned long *sp) 392void show_stack(struct task_struct *task, unsigned long *sp)
406{ 393{
407 _show_stack(task, NULL, sp, 0); 394 show_stack_log_lvl(task, NULL, sp, 0, "");
408} 395}
409 396
410/* 397/*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
454 u8 *ip; 441 u8 *ip;
455 442
456 printk("Stack: "); 443 printk("Stack: ");
457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); 444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, "");
458 printk("\n"); 446 printk("\n");
459 447
460 printk(KERN_EMERG "Code: "); 448 printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
518} 506}
519 507
520void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 508void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
521{ 509{
522 die_owner = -1; 510 die_owner = -1;
523 bust_spinlocks(0); 511 bust_spinlocks(0);
524 die_nest_count--; 512 die_nest_count--;
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1..41e01b145c4 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -73,7 +73,7 @@ int is_visws_box(void)
73 return visws_board_type >= 0; 73 return visws_board_type >= 0;
74} 74}
75 75
76static int __init visws_time_init_quirk(void) 76static int __init visws_time_init(void)
77{ 77{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 78 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79 79
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
93 return 0; 93 return 0;
94} 94}
95 95
96static int __init visws_pre_intr_init_quirk(void) 96static int __init visws_pre_intr_init(void)
97{ 97{
98 init_VISWS_APIC_irqs(); 98 init_VISWS_APIC_irqs();
99 99
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
114 114
115long long mem_size __initdata = 0; 115long long mem_size __initdata = 0;
116 116
117static char * __init visws_memory_setup_quirk(void) 117static char * __init visws_memory_setup(void)
118{ 118{
119 long long gfx_mem_size = 8 * MB; 119 long long gfx_mem_size = 8 * MB;
120 120
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
176 outl(PIIX_SPECIAL_STOP, 0xCFC); 176 outl(PIIX_SPECIAL_STOP, 0xCFC);
177} 177}
178 178
179static int __init visws_get_smp_config_quirk(unsigned int early) 179static int __init visws_get_smp_config(unsigned int early)
180{ 180{
181 /* 181 /*
182 * Prevent MP-table parsing by the generic code: 182 * Prevent MP-table parsing by the generic code:
@@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
192 * No problem for Linux. 192 * No problem for Linux.
193 */ 193 */
194 194
195static void __init MP_processor_info (struct mpc_config_processor *m) 195static void __init MP_processor_info(struct mpc_config_processor *m)
196{ 196{
197 int ver, logical_apicid; 197 int ver, logical_apicid;
198 physid_mask_t apic_cpus; 198 physid_mask_t apic_cpus;
@@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
232 apic_version[m->mpc_apicid] = ver; 232 apic_version[m->mpc_apicid] = ver;
233} 233}
234 234
235int __init visws_find_smp_config_quirk(unsigned int reserve) 235static int __init visws_find_smp_config(unsigned int reserve)
236{ 236{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
258 return 1; 258 return 1;
259} 259}
260 260
261extern int visws_trap_init_quirk(void); 261static int visws_trap_init(void);
262
263static struct x86_quirks visws_x86_quirks __initdata = {
264 .arch_time_init = visws_time_init,
265 .arch_pre_intr_init = visws_pre_intr_init,
266 .arch_memory_setup = visws_memory_setup,
267 .arch_intr_init = NULL,
268 .arch_trap_init = visws_trap_init,
269 .mach_get_smp_config = visws_get_smp_config,
270 .mach_find_smp_config = visws_find_smp_config,
271};
262 272
263void __init visws_early_detect(void) 273void __init visws_early_detect(void)
264{ 274{
@@ -272,16 +282,10 @@ void __init visws_early_detect(void)
272 282
273 /* 283 /*
274 * Install special quirks for timer, interrupt and memory setup: 284 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps: 285 * Fall back to generic behavior for traps:
286 * Override generic MP-table parsing:
282 */ 287 */
283 arch_intr_init_quirk = NULL; 288 x86_quirks = &visws_x86_quirks;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285 289
286 /* 290 /*
287 * Install reboot quirks: 291 * Install reboot quirks:
@@ -294,12 +298,6 @@ void __init visws_early_detect(void)
294 */ 298 */
295 no_broadcast = 0; 299 no_broadcast = 0;
296 300
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC 301#ifdef CONFIG_X86_IO_APIC
304 /* 302 /*
305 * Turn off IO-APIC detection and initialization: 303 * Turn off IO-APIC detection and initialization:
@@ -426,7 +424,7 @@ static __init void cobalt_init(void)
426 co_apic_read(CO_APIC_ID)); 424 co_apic_read(CO_APIC_ID));
427} 425}
428 426
429int __init visws_trap_init_quirk(void) 427static int __init visws_trap_init(void)
430{ 428{
431 lithium_init(); 429 lithium_init();
432 cobalt_init(); 430 cobalt_init();
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7..0a1b1a9d922 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
906#ifdef CONFIG_X86_LOCAL_APIC 906#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 907 para_fill(pv_apic_ops.apic_read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 908 para_fill(pv_apic_ops.apic_write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 909#endif
911 910
912 /* 911 /*
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c97d35c218d..d0e940bb6f4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,8 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o)
6ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
7common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
8endif 9endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3829aa7b663..c0f7872a912 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
91 c->gate = val; 91 c->gate = val;
92} 92}
93 93
94int pit_get_gate(struct kvm *kvm, int channel) 94static int pit_get_gate(struct kvm *kvm, int channel)
95{ 95{
96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
97 97
@@ -193,19 +193,16 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 193 }
194} 194}
195 195
196int __pit_timer_fn(struct kvm_kpit_state *ps) 196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{ 197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; 198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer; 199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200 200
201 atomic_inc(&pt->pending); 201 if (!atomic_inc_and_test(&pt->pending))
202 smp_mb__after_atomic_inc();
203 if (vcpu0) {
204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 if (waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
208 }
209 } 206 }
210 207
211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -308,6 +305,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
308 create_pit_timer(&ps->pit_timer, val, 0); 305 create_pit_timer(&ps->pit_timer, val, 0);
309 break; 306 break;
310 case 2: 307 case 2:
308 case 3:
311 create_pit_timer(&ps->pit_timer, val, 1); 309 create_pit_timer(&ps->pit_timer, val, 1);
312 break; 310 break;
313 default: 311 default:
@@ -459,7 +457,8 @@ static void pit_ioport_read(struct kvm_io_device *this,
459 mutex_unlock(&pit_state->lock); 457 mutex_unlock(&pit_state->lock);
460} 458}
461 459
462static int pit_in_range(struct kvm_io_device *this, gpa_t addr) 460static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
461 int len, int is_write)
463{ 462{
464 return ((addr >= KVM_PIT_BASE_ADDRESS) && 463 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
465 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 464 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
@@ -500,7 +499,8 @@ static void speaker_ioport_read(struct kvm_io_device *this,
500 mutex_unlock(&pit_state->lock); 499 mutex_unlock(&pit_state->lock);
501} 500}
502 501
503static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) 502static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
503 int len, int is_write)
504{ 504{
505 return (addr == KVM_SPEAKER_BASE_ADDRESS); 505 return (addr == KVM_SPEAKER_BASE_ADDRESS);
506} 506}
@@ -575,7 +575,7 @@ void kvm_free_pit(struct kvm *kvm)
575 } 575 }
576} 576}
577 577
578void __inject_pit_timer_intr(struct kvm *kvm) 578static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 579{
580 mutex_lock(&kvm->lock); 580 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ab29cf2def4..c31164e8aa4 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -130,8 +130,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
130{ 130{
131 struct kvm_pic *s = opaque; 131 struct kvm_pic *s = opaque;
132 132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 133 if (irq >= 0 && irq < PIC_NUM_PINS) {
134 pic_update_irq(s); 134 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
135 pic_update_irq(s);
136 }
135} 137}
136 138
137/* 139/*
@@ -346,7 +348,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
346 return s->elcr; 348 return s->elcr;
347} 349}
348 350
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) 351static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
352 int len, int is_write)
350{ 353{
351 switch (addr) { 354 switch (addr) {
352 case 0x20: 355 case 0x20:
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2a15be2275c..7ca47cbb48b 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -30,6 +30,8 @@
30#include "ioapic.h" 30#include "ioapic.h"
31#include "lapic.h" 31#include "lapic.h"
32 32
33#define PIC_NUM_PINS 16
34
33struct kvm; 35struct kvm;
34struct kvm_vcpu; 36struct kvm_vcpu;
35 37
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ebc03f5ae16..73f43de69f6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -356,8 +356,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
356 case APIC_DM_SMI: 356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n"); 357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break; 358 break;
359
359 case APIC_DM_NMI: 360 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n"); 361 kvm_inject_nmi(vcpu);
361 break; 362 break;
362 363
363 case APIC_DM_INIT: 364 case APIC_DM_INIT:
@@ -572,6 +573,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{ 573{
573 u32 val = 0; 574 u32 val = 0;
574 575
576 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
577
575 if (offset >= LAPIC_MMIO_LENGTH) 578 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0; 579 return 0;
577 580
@@ -695,6 +698,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
695 698
696 offset &= 0xff0; 699 offset &= 0xff0;
697 700
701 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
702
698 switch (offset) { 703 switch (offset) {
699 case APIC_ID: /* Local APIC ID */ 704 case APIC_ID: /* Local APIC ID */
700 apic_set_reg(apic, APIC_ID, val); 705 apic_set_reg(apic, APIC_ID, val);
@@ -780,7 +785,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
780 785
781} 786}
782 787
783static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) 788static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
789 int len, int size)
784{ 790{
785 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 791 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
786 int ret = 0; 792 int ret = 0;
@@ -939,8 +945,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
939 int result = 0; 945 int result = 0;
940 wait_queue_head_t *q = &apic->vcpu->wq; 946 wait_queue_head_t *q = &apic->vcpu->wq;
941 947
942 atomic_inc(&apic->timer.pending); 948 if(!atomic_inc_and_test(&apic->timer.pending))
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
944 if (waitqueue_active(q)) { 950 if (waitqueue_active(q)) {
945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
946 wake_up_interruptible(q); 952 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 676c396c9ce..81858881287 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 35
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 36int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 37int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e7c3969f7a..b0e4ddca6c1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -66,7 +66,8 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
66#endif 66#endif
67 67
68#if defined(MMU_DEBUG) || defined(AUDIT) 68#if defined(MMU_DEBUG) || defined(AUDIT)
69static int dbg = 1; 69static int dbg = 0;
70module_param(dbg, bool, 0644);
70#endif 71#endif
71 72
72#ifndef MMU_DEBUG 73#ifndef MMU_DEBUG
@@ -776,6 +777,15 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
776 BUG(); 777 BUG();
777} 778}
778 779
780static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
781 struct kvm_mmu_page *sp)
782{
783 int i;
784
785 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
786 sp->spt[i] = shadow_trap_nonpresent_pte;
787}
788
779static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 789static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
780{ 790{
781 unsigned index; 791 unsigned index;
@@ -841,7 +851,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
841 hlist_add_head(&sp->hash_link, bucket); 851 hlist_add_head(&sp->hash_link, bucket);
842 if (!metaphysical) 852 if (!metaphysical)
843 rmap_write_protect(vcpu->kvm, gfn); 853 rmap_write_protect(vcpu->kvm, gfn);
844 vcpu->arch.mmu.prefetch_page(vcpu, sp); 854 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
855 vcpu->arch.mmu.prefetch_page(vcpu, sp);
856 else
857 nonpaging_prefetch_page(vcpu, sp);
845 return sp; 858 return sp;
846} 859}
847 860
@@ -917,14 +930,17 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
917 } 930 }
918 kvm_mmu_page_unlink_children(kvm, sp); 931 kvm_mmu_page_unlink_children(kvm, sp);
919 if (!sp->root_count) { 932 if (!sp->root_count) {
920 if (!sp->role.metaphysical) 933 if (!sp->role.metaphysical && !sp->role.invalid)
921 unaccount_shadowed(kvm, sp->gfn); 934 unaccount_shadowed(kvm, sp->gfn);
922 hlist_del(&sp->hash_link); 935 hlist_del(&sp->hash_link);
923 kvm_mmu_free_page(kvm, sp); 936 kvm_mmu_free_page(kvm, sp);
924 } else { 937 } else {
938 int invalid = sp->role.invalid;
925 list_move(&sp->link, &kvm->arch.active_mmu_pages); 939 list_move(&sp->link, &kvm->arch.active_mmu_pages);
926 sp->role.invalid = 1; 940 sp->role.invalid = 1;
927 kvm_reload_remote_mmus(kvm); 941 kvm_reload_remote_mmus(kvm);
942 if (!sp->role.metaphysical && !invalid)
943 unaccount_shadowed(kvm, sp->gfn);
928 } 944 }
929 kvm_mmu_reset_last_pte_updated(kvm); 945 kvm_mmu_reset_last_pte_updated(kvm);
930} 946}
@@ -1103,7 +1119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1103 mark_page_dirty(vcpu->kvm, gfn); 1119 mark_page_dirty(vcpu->kvm, gfn);
1104 1120
1105 pgprintk("%s: setting spte %llx\n", __func__, spte); 1121 pgprintk("%s: setting spte %llx\n", __func__, spte);
1106 pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", 1122 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1107 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", 1123 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1108 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); 1124 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1109 set_shadow_pte(shadow_pte, spte); 1125 set_shadow_pte(shadow_pte, spte);
@@ -1122,8 +1138,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1122 else 1138 else
1123 kvm_release_pfn_clean(pfn); 1139 kvm_release_pfn_clean(pfn);
1124 } 1140 }
1125 if (!ptwrite || !*ptwrite) 1141 if (speculative) {
1126 vcpu->arch.last_pte_updated = shadow_pte; 1142 vcpu->arch.last_pte_updated = shadow_pte;
1143 vcpu->arch.last_pte_gfn = gfn;
1144 }
1127} 1145}
1128 1146
1129static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 1147static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1171,9 +1189,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1171 return -ENOMEM; 1189 return -ENOMEM;
1172 } 1190 }
1173 1191
1174 table[index] = __pa(new_table->spt) 1192 set_shadow_pte(&table[index],
1175 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1193 __pa(new_table->spt)
1176 | shadow_user_mask | shadow_x_mask; 1194 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1195 | shadow_user_mask | shadow_x_mask);
1177 } 1196 }
1178 table_addr = table[index] & PT64_BASE_ADDR_MASK; 1197 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1179 } 1198 }
@@ -1211,15 +1230,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1211} 1230}
1212 1231
1213 1232
1214static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1215 struct kvm_mmu_page *sp)
1216{
1217 int i;
1218
1219 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1220 sp->spt[i] = shadow_trap_nonpresent_pte;
1221}
1222
1223static void mmu_free_roots(struct kvm_vcpu *vcpu) 1233static void mmu_free_roots(struct kvm_vcpu *vcpu)
1224{ 1234{
1225 int i; 1235 int i;
@@ -1671,6 +1681,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1671 vcpu->arch.update_pte.pfn = pfn; 1681 vcpu->arch.update_pte.pfn = pfn;
1672} 1682}
1673 1683
1684static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1685{
1686 u64 *spte = vcpu->arch.last_pte_updated;
1687
1688 if (spte
1689 && vcpu->arch.last_pte_gfn == gfn
1690 && shadow_accessed_mask
1691 && !(*spte & shadow_accessed_mask)
1692 && is_shadow_present_pte(*spte))
1693 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
1694}
1695
1674void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1696void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1675 const u8 *new, int bytes) 1697 const u8 *new, int bytes)
1676{ 1698{
@@ -1694,6 +1716,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1694 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 1716 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1695 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 1717 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1696 spin_lock(&vcpu->kvm->mmu_lock); 1718 spin_lock(&vcpu->kvm->mmu_lock);
1719 kvm_mmu_access_page(vcpu, gfn);
1697 kvm_mmu_free_some_pages(vcpu); 1720 kvm_mmu_free_some_pages(vcpu);
1698 ++vcpu->kvm->stat.mmu_pte_write; 1721 ++vcpu->kvm->stat.mmu_pte_write;
1699 kvm_mmu_audit(vcpu, "pre pte write"); 1722 kvm_mmu_audit(vcpu, "pre pte write");
@@ -1948,7 +1971,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
1948 kvm_flush_remote_tlbs(kvm); 1971 kvm_flush_remote_tlbs(kvm);
1949} 1972}
1950 1973
1951void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 1974static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
1952{ 1975{
1953 struct kvm_mmu_page *page; 1976 struct kvm_mmu_page *page;
1954 1977
@@ -1968,6 +1991,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1968 list_for_each_entry(kvm, &vm_list, vm_list) { 1991 list_for_each_entry(kvm, &vm_list, vm_list) {
1969 int npages; 1992 int npages;
1970 1993
1994 if (!down_read_trylock(&kvm->slots_lock))
1995 continue;
1971 spin_lock(&kvm->mmu_lock); 1996 spin_lock(&kvm->mmu_lock);
1972 npages = kvm->arch.n_alloc_mmu_pages - 1997 npages = kvm->arch.n_alloc_mmu_pages -
1973 kvm->arch.n_free_mmu_pages; 1998 kvm->arch.n_free_mmu_pages;
@@ -1980,6 +2005,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1980 nr_to_scan--; 2005 nr_to_scan--;
1981 2006
1982 spin_unlock(&kvm->mmu_lock); 2007 spin_unlock(&kvm->mmu_lock);
2008 up_read(&kvm->slots_lock);
1983 } 2009 }
1984 if (kvm_freed) 2010 if (kvm_freed)
1985 list_move_tail(&kvm_freed->vm_list, &vm_list); 2011 list_move_tail(&kvm_freed->vm_list, &vm_list);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1730757bbc7..258e5d56298 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -15,7 +15,8 @@
15#define PT_USER_MASK (1ULL << 2) 15#define PT_USER_MASK (1ULL << 2)
16#define PT_PWT_MASK (1ULL << 3) 16#define PT_PWT_MASK (1ULL << 3)
17#define PT_PCD_MASK (1ULL << 4) 17#define PT_PCD_MASK (1ULL << 4)
18#define PT_ACCESSED_MASK (1ULL << 5) 18#define PT_ACCESSED_SHIFT 5
19#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
19#define PT_DIRTY_MASK (1ULL << 6) 20#define PT_DIRTY_MASK (1ULL << 6)
20#define PT_PAGE_SIZE_MASK (1ULL << 7) 21#define PT_PAGE_SIZE_MASK (1ULL << 7)
21#define PT_PAT_MASK (1ULL << 7) 22#define PT_PAT_MASK (1ULL << 7)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 934c7b61939..4d918220bae 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -460,8 +460,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, 460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
461 struct kvm_mmu_page *sp) 461 struct kvm_mmu_page *sp)
462{ 462{
463 int i, offset = 0, r = 0; 463 int i, j, offset, r;
464 pt_element_t pt; 464 pt_element_t pt[256 / sizeof(pt_element_t)];
465 gpa_t pte_gpa;
465 466
466 if (sp->role.metaphysical 467 if (sp->role.metaphysical
467 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { 468 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -469,19 +470,20 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
469 return; 470 return;
470 } 471 }
471 472
472 if (PTTYPE == 32) 473 pte_gpa = gfn_to_gpa(sp->gfn);
474 if (PTTYPE == 32) {
473 offset = sp->role.quadrant << PT64_LEVEL_BITS; 475 offset = sp->role.quadrant << PT64_LEVEL_BITS;
476 pte_gpa += offset * sizeof(pt_element_t);
477 }
474 478
475 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 479 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
476 gpa_t pte_gpa = gfn_to_gpa(sp->gfn); 480 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
477 pte_gpa += (i+offset) * sizeof(pt_element_t); 481 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
478 482 for (j = 0; j < ARRAY_SIZE(pt); ++j)
479 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, 483 if (r || is_present_pte(pt[j]))
480 sizeof(pt_element_t)); 484 sp->spt[i+j] = shadow_trap_nonpresent_pte;
481 if (r || is_present_pte(pt)) 485 else
482 sp->spt[i] = shadow_trap_nonpresent_pte; 486 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
483 else
484 sp->spt[i] = shadow_notrap_nonpresent_pte;
485 } 487 }
486} 488}
487 489
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b0d5fa5bab..b756e876dce 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -27,6 +27,8 @@
27 27
28#include <asm/desc.h> 28#include <asm/desc.h>
29 29
30#define __ex(x) __kvm_handle_fault_on_reboot(x)
31
30MODULE_AUTHOR("Qumranet"); 32MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
32 34
@@ -129,17 +131,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
129 131
130static inline void clgi(void) 132static inline void clgi(void)
131{ 133{
132 asm volatile (SVM_CLGI); 134 asm volatile (__ex(SVM_CLGI));
133} 135}
134 136
135static inline void stgi(void) 137static inline void stgi(void)
136{ 138{
137 asm volatile (SVM_STGI); 139 asm volatile (__ex(SVM_STGI));
138} 140}
139 141
140static inline void invlpga(unsigned long addr, u32 asid) 142static inline void invlpga(unsigned long addr, u32 asid)
141{ 143{
142 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); 144 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
143} 145}
144 146
145static inline unsigned long kvm_read_cr2(void) 147static inline unsigned long kvm_read_cr2(void)
@@ -270,19 +272,11 @@ static int has_svm(void)
270 272
271static void svm_hardware_disable(void *garbage) 273static void svm_hardware_disable(void *garbage)
272{ 274{
273 struct svm_cpu_data *svm_data 275 uint64_t efer;
274 = per_cpu(svm_data, raw_smp_processor_id());
275
276 if (svm_data) {
277 uint64_t efer;
278 276
279 wrmsrl(MSR_VM_HSAVE_PA, 0); 277 wrmsrl(MSR_VM_HSAVE_PA, 0);
280 rdmsrl(MSR_EFER, efer); 278 rdmsrl(MSR_EFER, efer);
281 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); 279 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
282 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
283 __free_page(svm_data->save_area);
284 kfree(svm_data);
285 }
286} 280}
287 281
288static void svm_hardware_enable(void *garbage) 282static void svm_hardware_enable(void *garbage)
@@ -321,6 +315,19 @@ static void svm_hardware_enable(void *garbage)
321 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 315 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
322} 316}
323 317
318static void svm_cpu_uninit(int cpu)
319{
320 struct svm_cpu_data *svm_data
321 = per_cpu(svm_data, raw_smp_processor_id());
322
323 if (!svm_data)
324 return;
325
326 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
327 __free_page(svm_data->save_area);
328 kfree(svm_data);
329}
330
324static int svm_cpu_init(int cpu) 331static int svm_cpu_init(int cpu)
325{ 332{
326 struct svm_cpu_data *svm_data; 333 struct svm_cpu_data *svm_data;
@@ -458,6 +465,11 @@ err:
458 465
459static __exit void svm_hardware_unsetup(void) 466static __exit void svm_hardware_unsetup(void)
460{ 467{
468 int cpu;
469
470 for_each_online_cpu(cpu)
471 svm_cpu_uninit(cpu);
472
461 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 473 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
462 iopm_base = 0; 474 iopm_base = 0;
463} 475}
@@ -707,10 +719,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
707 rdtscll(vcpu->arch.host_tsc); 719 rdtscll(vcpu->arch.host_tsc);
708} 720}
709 721
710static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
711{
712}
713
714static void svm_cache_regs(struct kvm_vcpu *vcpu) 722static void svm_cache_regs(struct kvm_vcpu *vcpu)
715{ 723{
716 struct vcpu_svm *svm = to_svm(vcpu); 724 struct vcpu_svm *svm = to_svm(vcpu);
@@ -949,7 +957,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
949 957
950static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 958static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
951{ 959{
952 return to_svm(vcpu)->db_regs[dr]; 960 unsigned long val = to_svm(vcpu)->db_regs[dr];
961 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
962 return val;
953} 963}
954 964
955static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 965static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
@@ -1004,6 +1014,16 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1004 1014
1005 fault_address = svm->vmcb->control.exit_info_2; 1015 fault_address = svm->vmcb->control.exit_info_2;
1006 error_code = svm->vmcb->control.exit_info_1; 1016 error_code = svm->vmcb->control.exit_info_1;
1017
1018 if (!npt_enabled)
1019 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
1020 (u32)fault_address, (u32)(fault_address >> 32),
1021 handler);
1022 else
1023 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1024 (u32)fault_address, (u32)(fault_address >> 32),
1025 handler);
1026
1007 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1027 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1008} 1028}
1009 1029
@@ -1081,6 +1101,19 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1081 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1101 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1082} 1102}
1083 1103
1104static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1105{
1106 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1107 return 1;
1108}
1109
1110static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1111{
1112 ++svm->vcpu.stat.irq_exits;
1113 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1114 return 1;
1115}
1116
1084static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1117static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1085{ 1118{
1086 return 1; 1119 return 1;
@@ -1219,6 +1252,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1219 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1252 if (svm_get_msr(&svm->vcpu, ecx, &data))
1220 kvm_inject_gp(&svm->vcpu, 0); 1253 kvm_inject_gp(&svm->vcpu, 0);
1221 else { 1254 else {
1255 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1256 (u32)(data >> 32), handler);
1257
1222 svm->vmcb->save.rax = data & 0xffffffff; 1258 svm->vmcb->save.rax = data & 0xffffffff;
1223 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1259 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1224 svm->next_rip = svm->vmcb->save.rip + 2; 1260 svm->next_rip = svm->vmcb->save.rip + 2;
@@ -1284,16 +1320,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1284 case MSR_K7_EVNTSEL1: 1320 case MSR_K7_EVNTSEL1:
1285 case MSR_K7_EVNTSEL2: 1321 case MSR_K7_EVNTSEL2:
1286 case MSR_K7_EVNTSEL3: 1322 case MSR_K7_EVNTSEL3:
1323 case MSR_K7_PERFCTR0:
1324 case MSR_K7_PERFCTR1:
1325 case MSR_K7_PERFCTR2:
1326 case MSR_K7_PERFCTR3:
1287 /* 1327 /*
1288 * only support writing 0 to the performance counters for now 1328 * Just discard all writes to the performance counters; this
1289 * to make Windows happy. Should be replaced by a real 1329 * should keep both older linux and windows 64-bit guests
1290 * performance counter emulation later. 1330 * happy
1291 */ 1331 */
1292 if (data != 0) 1332 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
1293 goto unhandled; 1333
1294 break; 1334 break;
1295 default: 1335 default:
1296 unhandled:
1297 return kvm_set_msr_common(vcpu, ecx, data); 1336 return kvm_set_msr_common(vcpu, ecx, data);
1298 } 1337 }
1299 return 0; 1338 return 0;
@@ -1304,6 +1343,10 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1304 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1343 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1305 u64 data = (svm->vmcb->save.rax & -1u) 1344 u64 data = (svm->vmcb->save.rax & -1u)
1306 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1345 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1346
1347 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1348 handler);
1349
1307 svm->next_rip = svm->vmcb->save.rip + 2; 1350 svm->next_rip = svm->vmcb->save.rip + 2;
1308 if (svm_set_msr(&svm->vcpu, ecx, data)) 1351 if (svm_set_msr(&svm->vcpu, ecx, data))
1309 kvm_inject_gp(&svm->vcpu, 0); 1352 kvm_inject_gp(&svm->vcpu, 0);
@@ -1323,6 +1366,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1323static int interrupt_window_interception(struct vcpu_svm *svm, 1366static int interrupt_window_interception(struct vcpu_svm *svm,
1324 struct kvm_run *kvm_run) 1367 struct kvm_run *kvm_run)
1325{ 1368{
1369 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
1370
1326 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1371 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1327 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 1372 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1328 /* 1373 /*
@@ -1364,8 +1409,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1364 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1409 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1365 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1410 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1366 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 1411 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
1367 [SVM_EXIT_INTR] = nop_on_interception, 1412 [SVM_EXIT_INTR] = intr_interception,
1368 [SVM_EXIT_NMI] = nop_on_interception, 1413 [SVM_EXIT_NMI] = nmi_interception,
1369 [SVM_EXIT_SMI] = nop_on_interception, 1414 [SVM_EXIT_SMI] = nop_on_interception,
1370 [SVM_EXIT_INIT] = nop_on_interception, 1415 [SVM_EXIT_INIT] = nop_on_interception,
1371 [SVM_EXIT_VINTR] = interrupt_window_interception, 1416 [SVM_EXIT_VINTR] = interrupt_window_interception,
@@ -1397,6 +1442,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1397 struct vcpu_svm *svm = to_svm(vcpu); 1442 struct vcpu_svm *svm = to_svm(vcpu);
1398 u32 exit_code = svm->vmcb->control.exit_code; 1443 u32 exit_code = svm->vmcb->control.exit_code;
1399 1444
1445 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
1446 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
1447
1400 if (npt_enabled) { 1448 if (npt_enabled) {
1401 int mmu_reload = 0; 1449 int mmu_reload = 0;
1402 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 1450 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1470,6 +1518,8 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1470{ 1518{
1471 struct vmcb_control_area *control; 1519 struct vmcb_control_area *control;
1472 1520
1521 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1522
1473 control = &svm->vmcb->control; 1523 control = &svm->vmcb->control;
1474 control->int_vector = irq; 1524 control->int_vector = irq;
1475 control->int_ctl &= ~V_INTR_PRIO_MASK; 1525 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1660,9 +1710,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1660 sync_lapic_to_cr8(vcpu); 1710 sync_lapic_to_cr8(vcpu);
1661 1711
1662 save_host_msrs(vcpu); 1712 save_host_msrs(vcpu);
1663 fs_selector = read_fs(); 1713 fs_selector = kvm_read_fs();
1664 gs_selector = read_gs(); 1714 gs_selector = kvm_read_gs();
1665 ldt_selector = read_ldt(); 1715 ldt_selector = kvm_read_ldt();
1666 svm->host_cr2 = kvm_read_cr2(); 1716 svm->host_cr2 = kvm_read_cr2();
1667 svm->host_dr6 = read_dr6(); 1717 svm->host_dr6 = read_dr6();
1668 svm->host_dr7 = read_dr7(); 1718 svm->host_dr7 = read_dr7();
@@ -1716,17 +1766,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1716 /* Enter guest mode */ 1766 /* Enter guest mode */
1717 "push %%rax \n\t" 1767 "push %%rax \n\t"
1718 "mov %c[vmcb](%[svm]), %%rax \n\t" 1768 "mov %c[vmcb](%[svm]), %%rax \n\t"
1719 SVM_VMLOAD "\n\t" 1769 __ex(SVM_VMLOAD) "\n\t"
1720 SVM_VMRUN "\n\t" 1770 __ex(SVM_VMRUN) "\n\t"
1721 SVM_VMSAVE "\n\t" 1771 __ex(SVM_VMSAVE) "\n\t"
1722 "pop %%rax \n\t" 1772 "pop %%rax \n\t"
1723#else 1773#else
1724 /* Enter guest mode */ 1774 /* Enter guest mode */
1725 "push %%eax \n\t" 1775 "push %%eax \n\t"
1726 "mov %c[vmcb](%[svm]), %%eax \n\t" 1776 "mov %c[vmcb](%[svm]), %%eax \n\t"
1727 SVM_VMLOAD "\n\t" 1777 __ex(SVM_VMLOAD) "\n\t"
1728 SVM_VMRUN "\n\t" 1778 __ex(SVM_VMRUN) "\n\t"
1729 SVM_VMSAVE "\n\t" 1779 __ex(SVM_VMSAVE) "\n\t"
1730 "pop %%eax \n\t" 1780 "pop %%eax \n\t"
1731#endif 1781#endif
1732 1782
@@ -1795,9 +1845,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1795 write_dr7(svm->host_dr7); 1845 write_dr7(svm->host_dr7);
1796 kvm_write_cr2(svm->host_cr2); 1846 kvm_write_cr2(svm->host_cr2);
1797 1847
1798 load_fs(fs_selector); 1848 kvm_load_fs(fs_selector);
1799 load_gs(gs_selector); 1849 kvm_load_gs(gs_selector);
1800 load_ldt(ldt_selector); 1850 kvm_load_ldt(ldt_selector);
1801 load_host_msrs(vcpu); 1851 load_host_msrs(vcpu);
1802 1852
1803 reload_tss(vcpu); 1853 reload_tss(vcpu);
@@ -1889,7 +1939,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1889 .prepare_guest_switch = svm_prepare_guest_switch, 1939 .prepare_guest_switch = svm_prepare_guest_switch,
1890 .vcpu_load = svm_vcpu_load, 1940 .vcpu_load = svm_vcpu_load,
1891 .vcpu_put = svm_vcpu_put, 1941 .vcpu_put = svm_vcpu_put,
1892 .vcpu_decache = svm_vcpu_decache,
1893 1942
1894 .set_guest_debug = svm_guest_debug, 1943 .set_guest_debug = svm_guest_debug,
1895 .get_msr = svm_get_msr, 1944 .get_msr = svm_get_msr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10ce6ee4c49..0cac6370171 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -30,6 +30,8 @@
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33#define __ex(x) __kvm_handle_fault_on_reboot(x)
34
33MODULE_AUTHOR("Qumranet"); 35MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 36MODULE_LICENSE("GPL");
35 37
@@ -53,6 +55,7 @@ struct vmcs {
53 55
54struct vcpu_vmx { 56struct vcpu_vmx {
55 struct kvm_vcpu vcpu; 57 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link;
56 int launched; 59 int launched;
57 u8 fail; 60 u8 fail;
58 u32 idt_vectoring_info; 61 u32 idt_vectoring_info;
@@ -88,9 +91,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
88} 91}
89 92
90static int init_rmode(struct kvm *kvm); 93static int init_rmode(struct kvm *kvm);
94static u64 construct_eptp(unsigned long root_hpa);
91 95
92static DEFINE_PER_CPU(struct vmcs *, vmxarea); 96static DEFINE_PER_CPU(struct vmcs *, vmxarea);
93static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 97static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
98static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
94 99
95static struct page *vmx_io_bitmap_a; 100static struct page *vmx_io_bitmap_a;
96static struct page *vmx_io_bitmap_b; 101static struct page *vmx_io_bitmap_b;
@@ -260,6 +265,11 @@ static inline int cpu_has_vmx_vpid(void)
260 SECONDARY_EXEC_ENABLE_VPID); 265 SECONDARY_EXEC_ENABLE_VPID);
261} 266}
262 267
268static inline int cpu_has_virtual_nmis(void)
269{
270 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
271}
272
263static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 273static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
264{ 274{
265 int i; 275 int i;
@@ -278,7 +288,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
278 u64 gva; 288 u64 gva;
279 } operand = { vpid, 0, gva }; 289 } operand = { vpid, 0, gva };
280 290
281 asm volatile (ASM_VMX_INVVPID 291 asm volatile (__ex(ASM_VMX_INVVPID)
282 /* CF==1 or ZF==1 --> rc = -1 */ 292 /* CF==1 or ZF==1 --> rc = -1 */
283 "; ja 1f ; ud2 ; 1:" 293 "; ja 1f ; ud2 ; 1:"
284 : : "a"(&operand), "c"(ext) : "cc", "memory"); 294 : : "a"(&operand), "c"(ext) : "cc", "memory");
@@ -290,7 +300,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
290 u64 eptp, gpa; 300 u64 eptp, gpa;
291 } operand = {eptp, gpa}; 301 } operand = {eptp, gpa};
292 302
293 asm volatile (ASM_VMX_INVEPT 303 asm volatile (__ex(ASM_VMX_INVEPT)
294 /* CF==1 or ZF==1 --> rc = -1 */ 304 /* CF==1 or ZF==1 --> rc = -1 */
295 "; ja 1f ; ud2 ; 1:\n" 305 "; ja 1f ; ud2 ; 1:\n"
296 : : "a" (&operand), "c" (ext) : "cc", "memory"); 306 : : "a" (&operand), "c" (ext) : "cc", "memory");
@@ -311,7 +321,7 @@ static void vmcs_clear(struct vmcs *vmcs)
311 u64 phys_addr = __pa(vmcs); 321 u64 phys_addr = __pa(vmcs);
312 u8 error; 322 u8 error;
313 323
314 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" 324 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
315 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 325 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
316 : "cc", "memory"); 326 : "cc", "memory");
317 if (error) 327 if (error)
@@ -329,6 +339,9 @@ static void __vcpu_clear(void *arg)
329 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 339 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
330 per_cpu(current_vmcs, cpu) = NULL; 340 per_cpu(current_vmcs, cpu) = NULL;
331 rdtscll(vmx->vcpu.arch.host_tsc); 341 rdtscll(vmx->vcpu.arch.host_tsc);
342 list_del(&vmx->local_vcpus_link);
343 vmx->vcpu.cpu = -1;
344 vmx->launched = 0;
332} 345}
333 346
334static void vcpu_clear(struct vcpu_vmx *vmx) 347static void vcpu_clear(struct vcpu_vmx *vmx)
@@ -336,7 +349,6 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
336 if (vmx->vcpu.cpu == -1) 349 if (vmx->vcpu.cpu == -1)
337 return; 350 return;
338 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 351 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
339 vmx->launched = 0;
340} 352}
341 353
342static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 354static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
@@ -378,7 +390,7 @@ static unsigned long vmcs_readl(unsigned long field)
378{ 390{
379 unsigned long value; 391 unsigned long value;
380 392
381 asm volatile (ASM_VMX_VMREAD_RDX_RAX 393 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
382 : "=a"(value) : "d"(field) : "cc"); 394 : "=a"(value) : "d"(field) : "cc");
383 return value; 395 return value;
384} 396}
@@ -413,7 +425,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
413{ 425{
414 u8 error; 426 u8 error;
415 427
416 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 428 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
417 : "=q"(error) : "a"(value), "d"(field) : "cc"); 429 : "=q"(error) : "a"(value), "d"(field) : "cc");
418 if (unlikely(error)) 430 if (unlikely(error))
419 vmwrite_error(field, value); 431 vmwrite_error(field, value);
@@ -431,10 +443,8 @@ static void vmcs_write32(unsigned long field, u32 value)
431 443
432static void vmcs_write64(unsigned long field, u64 value) 444static void vmcs_write64(unsigned long field, u64 value)
433{ 445{
434#ifdef CONFIG_X86_64
435 vmcs_writel(field, value);
436#else
437 vmcs_writel(field, value); 446 vmcs_writel(field, value);
447#ifndef CONFIG_X86_64
438 asm volatile (""); 448 asm volatile ("");
439 vmcs_writel(field+1, value >> 32); 449 vmcs_writel(field+1, value >> 32);
440#endif 450#endif
@@ -474,7 +484,7 @@ static void reload_tss(void)
474 struct descriptor_table gdt; 484 struct descriptor_table gdt;
475 struct desc_struct *descs; 485 struct desc_struct *descs;
476 486
477 get_gdt(&gdt); 487 kvm_get_gdt(&gdt);
478 descs = (void *)gdt.base; 488 descs = (void *)gdt.base;
479 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 489 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
480 load_TR_desc(); 490 load_TR_desc();
@@ -530,9 +540,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
530 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 540 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
531 * allow segment selectors with cpl > 0 or ti == 1. 541 * allow segment selectors with cpl > 0 or ti == 1.
532 */ 542 */
533 vmx->host_state.ldt_sel = read_ldt(); 543 vmx->host_state.ldt_sel = kvm_read_ldt();
534 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 544 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
535 vmx->host_state.fs_sel = read_fs(); 545 vmx->host_state.fs_sel = kvm_read_fs();
536 if (!(vmx->host_state.fs_sel & 7)) { 546 if (!(vmx->host_state.fs_sel & 7)) {
537 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 547 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
538 vmx->host_state.fs_reload_needed = 0; 548 vmx->host_state.fs_reload_needed = 0;
@@ -540,7 +550,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
540 vmcs_write16(HOST_FS_SELECTOR, 0); 550 vmcs_write16(HOST_FS_SELECTOR, 0);
541 vmx->host_state.fs_reload_needed = 1; 551 vmx->host_state.fs_reload_needed = 1;
542 } 552 }
543 vmx->host_state.gs_sel = read_gs(); 553 vmx->host_state.gs_sel = kvm_read_gs();
544 if (!(vmx->host_state.gs_sel & 7)) 554 if (!(vmx->host_state.gs_sel & 7))
545 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 555 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
546 else { 556 else {
@@ -576,15 +586,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
576 ++vmx->vcpu.stat.host_state_reload; 586 ++vmx->vcpu.stat.host_state_reload;
577 vmx->host_state.loaded = 0; 587 vmx->host_state.loaded = 0;
578 if (vmx->host_state.fs_reload_needed) 588 if (vmx->host_state.fs_reload_needed)
579 load_fs(vmx->host_state.fs_sel); 589 kvm_load_fs(vmx->host_state.fs_sel);
580 if (vmx->host_state.gs_ldt_reload_needed) { 590 if (vmx->host_state.gs_ldt_reload_needed) {
581 load_ldt(vmx->host_state.ldt_sel); 591 kvm_load_ldt(vmx->host_state.ldt_sel);
582 /* 592 /*
583 * If we have to reload gs, we must take care to 593 * If we have to reload gs, we must take care to
584 * preserve our gs base. 594 * preserve our gs base.
585 */ 595 */
586 local_irq_save(flags); 596 local_irq_save(flags);
587 load_gs(vmx->host_state.gs_sel); 597 kvm_load_gs(vmx->host_state.gs_sel);
588#ifdef CONFIG_X86_64 598#ifdef CONFIG_X86_64
589 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 599 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
590#endif 600#endif
@@ -617,13 +627,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
617 vcpu_clear(vmx); 627 vcpu_clear(vmx);
618 kvm_migrate_timers(vcpu); 628 kvm_migrate_timers(vcpu);
619 vpid_sync_vcpu_all(vmx); 629 vpid_sync_vcpu_all(vmx);
630 local_irq_disable();
631 list_add(&vmx->local_vcpus_link,
632 &per_cpu(vcpus_on_cpu, cpu));
633 local_irq_enable();
620 } 634 }
621 635
622 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 636 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
623 u8 error; 637 u8 error;
624 638
625 per_cpu(current_vmcs, cpu) = vmx->vmcs; 639 per_cpu(current_vmcs, cpu) = vmx->vmcs;
626 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" 640 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
627 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 641 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
628 : "cc"); 642 : "cc");
629 if (error) 643 if (error)
@@ -640,8 +654,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
640 * Linux uses per-cpu TSS and GDT, so set these when switching 654 * Linux uses per-cpu TSS and GDT, so set these when switching
641 * processors. 655 * processors.
642 */ 656 */
643 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ 657 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
644 get_gdt(&dt); 658 kvm_get_gdt(&dt);
645 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 659 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
646 660
647 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 661 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
@@ -684,11 +698,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
684 update_exception_bitmap(vcpu); 698 update_exception_bitmap(vcpu);
685} 699}
686 700
687static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
688{
689 vcpu_clear(to_vmx(vcpu));
690}
691
692static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 701static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
693{ 702{
694 return vmcs_readl(GUEST_RFLAGS); 703 return vmcs_readl(GUEST_RFLAGS);
@@ -913,6 +922,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
913 case MSR_IA32_TIME_STAMP_COUNTER: 922 case MSR_IA32_TIME_STAMP_COUNTER:
914 guest_write_tsc(data); 923 guest_write_tsc(data);
915 break; 924 break;
925 case MSR_P6_PERFCTR0:
926 case MSR_P6_PERFCTR1:
927 case MSR_P6_EVNTSEL0:
928 case MSR_P6_EVNTSEL1:
929 /*
930 * Just discard all writes to the performance counters; this
931 * should keep both older linux and windows 64-bit guests
932 * happy
933 */
934 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
935
936 break;
916 default: 937 default:
917 vmx_load_host_state(vmx); 938 vmx_load_host_state(vmx);
918 msr = find_msr_entry(vmx, msr_index); 939 msr = find_msr_entry(vmx, msr_index);
@@ -1022,6 +1043,7 @@ static void hardware_enable(void *garbage)
1022 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1043 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1023 u64 old; 1044 u64 old;
1024 1045
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1025 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1026 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
1027 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
@@ -1032,13 +1054,25 @@ static void hardware_enable(void *garbage)
1032 MSR_IA32_FEATURE_CONTROL_LOCKED | 1054 MSR_IA32_FEATURE_CONTROL_LOCKED |
1033 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
1034 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1035 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) 1057 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr)
1036 : "memory", "cc"); 1059 : "memory", "cc");
1037} 1060}
1038 1061
1062static void vmclear_local_vcpus(void)
1063{
1064 int cpu = raw_smp_processor_id();
1065 struct vcpu_vmx *vmx, *n;
1066
1067 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1068 local_vcpus_link)
1069 __vcpu_clear(vmx);
1070}
1071
1039static void hardware_disable(void *garbage) 1072static void hardware_disable(void *garbage)
1040{ 1073{
1041 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 1074 vmclear_local_vcpus();
1075 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1042 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1076 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1043} 1077}
1044 1078
@@ -1072,7 +1106,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1072 u32 _vmentry_control = 0; 1106 u32 _vmentry_control = 0;
1073 1107
1074 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 1108 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1075 opt = 0; 1109 opt = PIN_BASED_VIRTUAL_NMIS;
1076 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 1110 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1077 &_pin_based_exec_control) < 0) 1111 &_pin_based_exec_control) < 0)
1078 return -EIO; 1112 return -EIO;
@@ -1389,6 +1423,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1389static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1423static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1390{ 1424{
1391 vpid_sync_vcpu_all(to_vmx(vcpu)); 1425 vpid_sync_vcpu_all(to_vmx(vcpu));
1426 if (vm_need_ept())
1427 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1392} 1428}
1393 1429
1394static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1430static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
@@ -1420,7 +1456,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1420 if (!(cr0 & X86_CR0_PG)) { 1456 if (!(cr0 & X86_CR0_PG)) {
1421 /* From paging/starting to nonpaging */ 1457 /* From paging/starting to nonpaging */
1422 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1458 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1423 vmcs_config.cpu_based_exec_ctrl | 1459 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1424 (CPU_BASED_CR3_LOAD_EXITING | 1460 (CPU_BASED_CR3_LOAD_EXITING |
1425 CPU_BASED_CR3_STORE_EXITING)); 1461 CPU_BASED_CR3_STORE_EXITING));
1426 vcpu->arch.cr0 = cr0; 1462 vcpu->arch.cr0 = cr0;
@@ -1430,7 +1466,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1430 } else if (!is_paging(vcpu)) { 1466 } else if (!is_paging(vcpu)) {
1431 /* From nonpaging to paging */ 1467 /* From nonpaging to paging */
1432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1468 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1433 vmcs_config.cpu_based_exec_ctrl & 1469 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1434 ~(CPU_BASED_CR3_LOAD_EXITING | 1470 ~(CPU_BASED_CR3_LOAD_EXITING |
1435 CPU_BASED_CR3_STORE_EXITING)); 1471 CPU_BASED_CR3_STORE_EXITING));
1436 vcpu->arch.cr0 = cr0; 1472 vcpu->arch.cr0 = cr0;
@@ -1821,7 +1857,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
1821 spin_unlock(&vmx_vpid_lock); 1857 spin_unlock(&vmx_vpid_lock);
1822} 1858}
1823 1859
1824void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 1860static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
1825{ 1861{
1826 void *va; 1862 void *va;
1827 1863
@@ -1907,8 +1943,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1907 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 1943 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1908 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1944 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1909 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1945 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1910 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ 1946 vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
1911 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ 1947 vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
1912 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1948 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1913#ifdef CONFIG_X86_64 1949#ifdef CONFIG_X86_64
1914 rdmsrl(MSR_FS_BASE, a); 1950 rdmsrl(MSR_FS_BASE, a);
@@ -1922,7 +1958,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1922 1958
1923 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 1959 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1924 1960
1925 get_idt(&dt); 1961 kvm_get_idt(&dt);
1926 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1962 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1927 1963
1928 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 1964 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
@@ -2114,6 +2150,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2114 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2150 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2115} 2151}
2116 2152
2153static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158}
2159
2117static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2118{ 2161{
2119 int word_index = __ffs(vcpu->arch.irq_summary); 2162 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2554,8 +2597,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2554 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 2597 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2555 offset = exit_qualification & 0xffful; 2598 offset = exit_qualification & 0xffful;
2556 2599
2557 KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
2558
2559 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 2600 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2560 2601
2561 if (er != EMULATE_DONE) { 2602 if (er != EMULATE_DONE) {
@@ -2639,6 +2680,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2639 return 1; 2680 return 1;
2640} 2681}
2641 2682
2683static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2684{
2685 u32 cpu_based_vm_exec_control;
2686
2687 /* clear pending NMI */
2688 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2689 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2690 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2691 ++vcpu->stat.nmi_window_exits;
2692
2693 return 1;
2694}
2695
2642/* 2696/*
2643 * The exit handlers return 1 if the exit was handled fully and guest execution 2697 * The exit handlers return 1 if the exit was handled fully and guest execution
2644 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2698 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2649,6 +2703,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2649 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 2703 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2650 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 2704 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2651 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 2705 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2706 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
2652 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 2707 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2653 [EXIT_REASON_CR_ACCESS] = handle_cr, 2708 [EXIT_REASON_CR_ACCESS] = handle_cr,
2654 [EXIT_REASON_DR_ACCESS] = handle_dr, 2709 [EXIT_REASON_DR_ACCESS] = handle_dr,
@@ -2736,17 +2791,52 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2736 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2791 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2737} 2792}
2738 2793
2794static void enable_nmi_window(struct kvm_vcpu *vcpu)
2795{
2796 u32 cpu_based_vm_exec_control;
2797
2798 if (!cpu_has_virtual_nmis())
2799 return;
2800
2801 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2802 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2803 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2804}
2805
2806static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
2807{
2808 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2809 return !(guest_intr & (GUEST_INTR_STATE_NMI |
2810 GUEST_INTR_STATE_MOV_SS |
2811 GUEST_INTR_STATE_STI));
2812}
2813
2814static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
2815{
2816 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2817 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
2818 GUEST_INTR_STATE_STI)) &&
2819 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
2820}
2821
2822static void enable_intr_window(struct kvm_vcpu *vcpu)
2823{
2824 if (vcpu->arch.nmi_pending)
2825 enable_nmi_window(vcpu);
2826 else if (kvm_cpu_has_interrupt(vcpu))
2827 enable_irq_window(vcpu);
2828}
2829
2739static void vmx_intr_assist(struct kvm_vcpu *vcpu) 2830static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2740{ 2831{
2741 struct vcpu_vmx *vmx = to_vmx(vcpu); 2832 struct vcpu_vmx *vmx = to_vmx(vcpu);
2742 u32 idtv_info_field, intr_info_field; 2833 u32 idtv_info_field, intr_info_field, exit_intr_info_field;
2743 int has_ext_irq, interrupt_window_open;
2744 int vector; 2834 int vector;
2745 2835
2746 update_tpr_threshold(vcpu); 2836 update_tpr_threshold(vcpu);
2747 2837
2748 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2749 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 2838 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2839 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
2750 idtv_info_field = vmx->idt_vectoring_info; 2840 idtv_info_field = vmx->idt_vectoring_info;
2751 if (intr_info_field & INTR_INFO_VALID_MASK) { 2841 if (intr_info_field & INTR_INFO_VALID_MASK) {
2752 if (idtv_info_field & INTR_INFO_VALID_MASK) { 2842 if (idtv_info_field & INTR_INFO_VALID_MASK) {
@@ -2754,8 +2844,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2754 if (printk_ratelimit()) 2844 if (printk_ratelimit())
2755 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 2845 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2756 } 2846 }
2757 if (has_ext_irq) 2847 enable_intr_window(vcpu);
2758 enable_irq_window(vcpu);
2759 return; 2848 return;
2760 } 2849 }
2761 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { 2850 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
@@ -2765,30 +2854,56 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2765 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; 2854 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2766 2855
2767 vmx_inject_irq(vcpu, vect); 2856 vmx_inject_irq(vcpu, vect);
2768 if (unlikely(has_ext_irq)) 2857 enable_intr_window(vcpu);
2769 enable_irq_window(vcpu);
2770 return; 2858 return;
2771 } 2859 }
2772 2860
2773 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); 2861 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2774 2862
2775 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2863 /*
2864 * SDM 3: 25.7.1.2
2865 * Clear bit "block by NMI" before VM entry if a NMI delivery
2866 * faulted.
2867 */
2868 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2869 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
2870 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2871 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2872 ~GUEST_INTR_STATE_NMI);
2873
2874 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
2875 & ~INTR_INFO_RESVD_BITS_MASK);
2776 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2876 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2777 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2877 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2778 2878
2779 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 2879 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
2780 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2880 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2781 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 2881 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2782 if (unlikely(has_ext_irq)) 2882 enable_intr_window(vcpu);
2783 enable_irq_window(vcpu);
2784 return; 2883 return;
2785 } 2884 }
2786 if (!has_ext_irq) 2885 if (cpu_has_virtual_nmis()) {
2886 /*
2887 * SDM 3: 25.7.1.2
2888 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2889 * a guest IRET fault.
2890 */
2891 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
2892 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
2893 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2894 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
2895 GUEST_INTR_STATE_NMI);
2896 else if (vcpu->arch.nmi_pending) {
2897 if (vmx_nmi_enabled(vcpu))
2898 vmx_inject_nmi(vcpu);
2899 enable_intr_window(vcpu);
2900 return;
2901 }
2902
2903 }
2904 if (!kvm_cpu_has_interrupt(vcpu))
2787 return; 2905 return;
2788 interrupt_window_open = 2906 if (vmx_irq_enabled(vcpu)) {
2789 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2790 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2791 if (interrupt_window_open) {
2792 vector = kvm_cpu_get_interrupt(vcpu); 2907 vector = kvm_cpu_get_interrupt(vcpu);
2793 vmx_inject_irq(vcpu, vector); 2908 vmx_inject_irq(vcpu, vector);
2794 kvm_timer_intr_post(vcpu, vector); 2909 kvm_timer_intr_post(vcpu, vector);
@@ -2838,7 +2953,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 "push %%edx; push %%ebp;" 2953 "push %%edx; push %%ebp;"
2839 "push %%ecx \n\t" 2954 "push %%ecx \n\t"
2840#endif 2955#endif
2841 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 2956 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
2842 /* Check if vmlaunch of vmresume is needed */ 2957 /* Check if vmlaunch of vmresume is needed */
2843 "cmpl $0, %c[launched](%0) \n\t" 2958 "cmpl $0, %c[launched](%0) \n\t"
2844 /* Load guest registers. Don't clobber flags. */ 2959 /* Load guest registers. Don't clobber flags. */
@@ -2873,9 +2988,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2873#endif 2988#endif
2874 /* Enter guest mode */ 2989 /* Enter guest mode */
2875 "jne .Llaunched \n\t" 2990 "jne .Llaunched \n\t"
2876 ASM_VMX_VMLAUNCH "\n\t" 2991 __ex(ASM_VMX_VMLAUNCH) "\n\t"
2877 "jmp .Lkvm_vmx_return \n\t" 2992 "jmp .Lkvm_vmx_return \n\t"
2878 ".Llaunched: " ASM_VMX_VMRESUME "\n\t" 2993 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2879 ".Lkvm_vmx_return: " 2994 ".Lkvm_vmx_return: "
2880 /* Save guest registers, load host registers, keep flags */ 2995 /* Save guest registers, load host registers, keep flags */
2881#ifdef CONFIG_X86_64 2996#ifdef CONFIG_X86_64
@@ -2949,7 +3064,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2949 fixup_rmode_irq(vmx); 3064 fixup_rmode_irq(vmx);
2950 3065
2951 vcpu->arch.interrupt_window_open = 3066 vcpu->arch.interrupt_window_open =
2952 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 3067 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3068 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
2953 3069
2954 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3070 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2955 vmx->launched = 1; 3071 vmx->launched = 1;
@@ -2957,7 +3073,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2957 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3073 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2958 3074
2959 /* We need to handle NMIs before interrupts are enabled */ 3075 /* We need to handle NMIs before interrupts are enabled */
2960 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ 3076 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
3077 (intr_info & INTR_INFO_VALID_MASK)) {
2961 KVMTRACE_0D(NMI, vcpu, handler); 3078 KVMTRACE_0D(NMI, vcpu, handler);
2962 asm("int $2"); 3079 asm("int $2");
2963 } 3080 }
@@ -2968,7 +3085,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2968 struct vcpu_vmx *vmx = to_vmx(vcpu); 3085 struct vcpu_vmx *vmx = to_vmx(vcpu);
2969 3086
2970 if (vmx->vmcs) { 3087 if (vmx->vmcs) {
2971 on_each_cpu(__vcpu_clear, vmx, 1); 3088 vcpu_clear(vmx);
2972 free_vmcs(vmx->vmcs); 3089 free_vmcs(vmx->vmcs);
2973 vmx->vmcs = NULL; 3090 vmx->vmcs = NULL;
2974 } 3091 }
@@ -3095,7 +3212,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
3095 .prepare_guest_switch = vmx_save_host_state, 3212 .prepare_guest_switch = vmx_save_host_state,
3096 .vcpu_load = vmx_vcpu_load, 3213 .vcpu_load = vmx_vcpu_load,
3097 .vcpu_put = vmx_vcpu_put, 3214 .vcpu_put = vmx_vcpu_put,
3098 .vcpu_decache = vmx_vcpu_decache,
3099 3215
3100 .set_guest_debug = set_guest_debug, 3216 .set_guest_debug = set_guest_debug,
3101 .guest_debug_pre = kvm_guest_debug_pre, 3217 .guest_debug_pre = kvm_guest_debug_pre,
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 79d94c610df..425a13436b3 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -40,6 +40,7 @@
40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
41#define CPU_BASED_CR8_STORE_EXITING 0x00100000 41#define CPU_BASED_CR8_STORE_EXITING 0x00100000
42#define CPU_BASED_TPR_SHADOW 0x00200000 42#define CPU_BASED_TPR_SHADOW 0x00200000
43#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
43#define CPU_BASED_MOV_DR_EXITING 0x00800000 44#define CPU_BASED_MOV_DR_EXITING 0x00800000
44#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 45#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
45#define CPU_BASED_USE_IO_BITMAPS 0x02000000 46#define CPU_BASED_USE_IO_BITMAPS 0x02000000
@@ -216,7 +217,7 @@ enum vmcs_field {
216#define EXIT_REASON_TRIPLE_FAULT 2 217#define EXIT_REASON_TRIPLE_FAULT 2
217 218
218#define EXIT_REASON_PENDING_INTERRUPT 7 219#define EXIT_REASON_PENDING_INTERRUPT 7
219 220#define EXIT_REASON_NMI_WINDOW 8
220#define EXIT_REASON_TASK_SWITCH 9 221#define EXIT_REASON_TASK_SWITCH 9
221#define EXIT_REASON_CPUID 10 222#define EXIT_REASON_CPUID 10
222#define EXIT_REASON_HLT 12 223#define EXIT_REASON_HLT 12
@@ -251,7 +252,9 @@ enum vmcs_field {
251#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ 252#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
252#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ 253#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
253#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ 254#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
255#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
254#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ 256#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
257#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
255 258
256#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK 259#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
257#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK 260#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
@@ -259,9 +262,16 @@ enum vmcs_field {
259#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK 262#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
260 263
261#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 264#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
265#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
262#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 266#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
263#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ 267#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
264 268
269/* GUEST_INTERRUPTIBILITY_INFO flags. */
270#define GUEST_INTR_STATE_STI 0x00000001
271#define GUEST_INTR_STATE_MOV_SS 0x00000002
272#define GUEST_INTR_STATE_SMI 0x00000004
273#define GUEST_INTR_STATE_NMI 0x00000008
274
265/* 275/*
266 * Exit Qualifications for MOV for Control Register Access 276 * Exit Qualifications for MOV for Control Register Access
267 */ 277 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0faa2546b1c..9f1cdb011cf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -72,6 +72,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
72 { "mmio_exits", VCPU_STAT(mmio_exits) }, 72 { "mmio_exits", VCPU_STAT(mmio_exits) },
73 { "signal_exits", VCPU_STAT(signal_exits) }, 73 { "signal_exits", VCPU_STAT(signal_exits) },
74 { "irq_window", VCPU_STAT(irq_window_exits) }, 74 { "irq_window", VCPU_STAT(irq_window_exits) },
75 { "nmi_window", VCPU_STAT(nmi_window_exits) },
75 { "halt_exits", VCPU_STAT(halt_exits) }, 76 { "halt_exits", VCPU_STAT(halt_exits) },
76 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 77 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
77 { "hypercalls", VCPU_STAT(hypercalls) }, 78 { "hypercalls", VCPU_STAT(hypercalls) },
@@ -173,6 +174,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
173 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 174 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
174} 175}
175 176
177void kvm_inject_nmi(struct kvm_vcpu *vcpu)
178{
179 vcpu->arch.nmi_pending = 1;
180}
181EXPORT_SYMBOL_GPL(kvm_inject_nmi);
182
176void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 183void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
177{ 184{
178 WARN_ON(vcpu->arch.exception.pending); 185 WARN_ON(vcpu->arch.exception.pending);
@@ -604,6 +611,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
604 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 611 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
605} 612}
606 613
614static bool msr_mtrr_valid(unsigned msr)
615{
616 switch (msr) {
617 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
618 case MSR_MTRRfix64K_00000:
619 case MSR_MTRRfix16K_80000:
620 case MSR_MTRRfix16K_A0000:
621 case MSR_MTRRfix4K_C0000:
622 case MSR_MTRRfix4K_C8000:
623 case MSR_MTRRfix4K_D0000:
624 case MSR_MTRRfix4K_D8000:
625 case MSR_MTRRfix4K_E0000:
626 case MSR_MTRRfix4K_E8000:
627 case MSR_MTRRfix4K_F0000:
628 case MSR_MTRRfix4K_F8000:
629 case MSR_MTRRdefType:
630 case MSR_IA32_CR_PAT:
631 return true;
632 case 0x2f8:
633 return true;
634 }
635 return false;
636}
637
638static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
639{
640 if (!msr_mtrr_valid(msr))
641 return 1;
642
643 vcpu->arch.mtrr[msr - 0x200] = data;
644 return 0;
645}
607 646
608int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 647int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
609{ 648{
@@ -625,8 +664,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
625 break; 664 break;
626 case MSR_IA32_UCODE_REV: 665 case MSR_IA32_UCODE_REV:
627 case MSR_IA32_UCODE_WRITE: 666 case MSR_IA32_UCODE_WRITE:
628 case 0x200 ... 0x2ff: /* MTRRs */
629 break; 667 break;
668 case 0x200 ... 0x2ff:
669 return set_msr_mtrr(vcpu, msr, data);
630 case MSR_IA32_APICBASE: 670 case MSR_IA32_APICBASE:
631 kvm_set_apic_base(vcpu, data); 671 kvm_set_apic_base(vcpu, data);
632 break; 672 break;
@@ -684,6 +724,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
684 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 724 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
685} 725}
686 726
727static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
728{
729 if (!msr_mtrr_valid(msr))
730 return 1;
731
732 *pdata = vcpu->arch.mtrr[msr - 0x200];
733 return 0;
734}
735
687int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 736int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
688{ 737{
689 u64 data; 738 u64 data;
@@ -705,11 +754,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
705 case MSR_IA32_MC0_MISC+16: 754 case MSR_IA32_MC0_MISC+16:
706 case MSR_IA32_UCODE_REV: 755 case MSR_IA32_UCODE_REV:
707 case MSR_IA32_EBL_CR_POWERON: 756 case MSR_IA32_EBL_CR_POWERON:
708 /* MTRR registers */
709 case 0xfe:
710 case 0x200 ... 0x2ff:
711 data = 0; 757 data = 0;
712 break; 758 break;
759 case MSR_MTRRcap:
760 data = 0x500 | KVM_NR_VAR_MTRR;
761 break;
762 case 0x200 ... 0x2ff:
763 return get_msr_mtrr(vcpu, msr, pdata);
713 case 0xcd: /* fsb frequency */ 764 case 0xcd: /* fsb frequency */
714 data = 3; 765 data = 3;
715 break; 766 break;
@@ -817,41 +868,6 @@ out:
817 return r; 868 return r;
818} 869}
819 870
820/*
821 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
822 * cached on it.
823 */
824void decache_vcpus_on_cpu(int cpu)
825{
826 struct kvm *vm;
827 struct kvm_vcpu *vcpu;
828 int i;
829
830 spin_lock(&kvm_lock);
831 list_for_each_entry(vm, &vm_list, vm_list)
832 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
833 vcpu = vm->vcpus[i];
834 if (!vcpu)
835 continue;
836 /*
837 * If the vcpu is locked, then it is running on some
838 * other cpu and therefore it is not cached on the
839 * cpu in question.
840 *
841 * If it's not locked, check the last cpu it executed
842 * on.
843 */
844 if (mutex_trylock(&vcpu->mutex)) {
845 if (vcpu->cpu == cpu) {
846 kvm_x86_ops->vcpu_decache(vcpu);
847 vcpu->cpu = -1;
848 }
849 mutex_unlock(&vcpu->mutex);
850 }
851 }
852 spin_unlock(&kvm_lock);
853}
854
855int kvm_dev_ioctl_check_extension(long ext) 871int kvm_dev_ioctl_check_extension(long ext)
856{ 872{
857 int r; 873 int r;
@@ -869,6 +885,9 @@ int kvm_dev_ioctl_check_extension(long ext)
869 case KVM_CAP_MP_STATE: 885 case KVM_CAP_MP_STATE:
870 r = 1; 886 r = 1;
871 break; 887 break;
888 case KVM_CAP_COALESCED_MMIO:
889 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
890 break;
872 case KVM_CAP_VAPIC: 891 case KVM_CAP_VAPIC:
873 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 892 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
874 break; 893 break;
@@ -1781,13 +1800,14 @@ static void kvm_init_msr_list(void)
1781 * Only apic need an MMIO device hook, so shortcut now.. 1800 * Only apic need an MMIO device hook, so shortcut now..
1782 */ 1801 */
1783static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 1802static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1784 gpa_t addr) 1803 gpa_t addr, int len,
1804 int is_write)
1785{ 1805{
1786 struct kvm_io_device *dev; 1806 struct kvm_io_device *dev;
1787 1807
1788 if (vcpu->arch.apic) { 1808 if (vcpu->arch.apic) {
1789 dev = &vcpu->arch.apic->dev; 1809 dev = &vcpu->arch.apic->dev;
1790 if (dev->in_range(dev, addr)) 1810 if (dev->in_range(dev, addr, len, is_write))
1791 return dev; 1811 return dev;
1792 } 1812 }
1793 return NULL; 1813 return NULL;
@@ -1795,13 +1815,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1795 1815
1796 1816
1797static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1817static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1798 gpa_t addr) 1818 gpa_t addr, int len,
1819 int is_write)
1799{ 1820{
1800 struct kvm_io_device *dev; 1821 struct kvm_io_device *dev;
1801 1822
1802 dev = vcpu_find_pervcpu_dev(vcpu, addr); 1823 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
1803 if (dev == NULL) 1824 if (dev == NULL)
1804 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); 1825 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1826 is_write);
1805 return dev; 1827 return dev;
1806} 1828}
1807 1829
@@ -1869,7 +1891,7 @@ mmio:
1869 * Is this MMIO handled locally? 1891 * Is this MMIO handled locally?
1870 */ 1892 */
1871 mutex_lock(&vcpu->kvm->lock); 1893 mutex_lock(&vcpu->kvm->lock);
1872 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1894 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
1873 if (mmio_dev) { 1895 if (mmio_dev) {
1874 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1896 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1875 mutex_unlock(&vcpu->kvm->lock); 1897 mutex_unlock(&vcpu->kvm->lock);
@@ -1924,7 +1946,7 @@ mmio:
1924 * Is this MMIO handled locally? 1946 * Is this MMIO handled locally?
1925 */ 1947 */
1926 mutex_lock(&vcpu->kvm->lock); 1948 mutex_lock(&vcpu->kvm->lock);
1927 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1949 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
1928 if (mmio_dev) { 1950 if (mmio_dev) {
1929 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1951 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1930 mutex_unlock(&vcpu->kvm->lock); 1952 mutex_unlock(&vcpu->kvm->lock);
@@ -2020,6 +2042,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2020 2042
2021int emulate_clts(struct kvm_vcpu *vcpu) 2043int emulate_clts(struct kvm_vcpu *vcpu)
2022{ 2044{
2045 KVMTRACE_0D(CLTS, vcpu, handler);
2023 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2046 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2024 return X86EMUL_CONTINUE; 2047 return X86EMUL_CONTINUE;
2025} 2048}
@@ -2053,21 +2076,19 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2053 2076
2054void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2077void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2055{ 2078{
2056 static int reported;
2057 u8 opcodes[4]; 2079 u8 opcodes[4];
2058 unsigned long rip = vcpu->arch.rip; 2080 unsigned long rip = vcpu->arch.rip;
2059 unsigned long rip_linear; 2081 unsigned long rip_linear;
2060 2082
2061 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2083 if (!printk_ratelimit())
2062
2063 if (reported)
2064 return; 2084 return;
2065 2085
2086 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2087
2066 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); 2088 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2067 2089
2068 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2090 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2069 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2091 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2070 reported = 1;
2071} 2092}
2072EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2093EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2073 2094
@@ -2105,27 +2126,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2105 ? X86EMUL_MODE_PROT64 : cs_db 2126 ? X86EMUL_MODE_PROT64 : cs_db
2106 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2127 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2107 2128
2108 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2109 vcpu->arch.emulate_ctxt.cs_base = 0;
2110 vcpu->arch.emulate_ctxt.ds_base = 0;
2111 vcpu->arch.emulate_ctxt.es_base = 0;
2112 vcpu->arch.emulate_ctxt.ss_base = 0;
2113 } else {
2114 vcpu->arch.emulate_ctxt.cs_base =
2115 get_segment_base(vcpu, VCPU_SREG_CS);
2116 vcpu->arch.emulate_ctxt.ds_base =
2117 get_segment_base(vcpu, VCPU_SREG_DS);
2118 vcpu->arch.emulate_ctxt.es_base =
2119 get_segment_base(vcpu, VCPU_SREG_ES);
2120 vcpu->arch.emulate_ctxt.ss_base =
2121 get_segment_base(vcpu, VCPU_SREG_SS);
2122 }
2123
2124 vcpu->arch.emulate_ctxt.gs_base =
2125 get_segment_base(vcpu, VCPU_SREG_GS);
2126 vcpu->arch.emulate_ctxt.fs_base =
2127 get_segment_base(vcpu, VCPU_SREG_FS);
2128
2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2130 2130
2131 /* Reject the instructions other than VMCALL/VMMCALL when 2131 /* Reject the instructions other than VMCALL/VMMCALL when
@@ -2300,9 +2300,10 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
2300} 2300}
2301 2301
2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2303 gpa_t addr) 2303 gpa_t addr, int len,
2304 int is_write)
2304{ 2305{
2305 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 2306 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2306} 2307}
2307 2308
2308int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2309int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
@@ -2331,11 +2332,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2331 2332
2332 kvm_x86_ops->cache_regs(vcpu); 2333 kvm_x86_ops->cache_regs(vcpu);
2333 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2334 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2334 kvm_x86_ops->decache_regs(vcpu);
2335 2335
2336 kvm_x86_ops->skip_emulated_instruction(vcpu); 2336 kvm_x86_ops->skip_emulated_instruction(vcpu);
2337 2337
2338 pio_dev = vcpu_find_pio_dev(vcpu, port); 2338 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2339 if (pio_dev) { 2339 if (pio_dev) {
2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2341 complete_pio(vcpu); 2341 complete_pio(vcpu);
@@ -2417,7 +2417,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2417 } 2417 }
2418 } 2418 }
2419 2419
2420 pio_dev = vcpu_find_pio_dev(vcpu, port); 2420 pio_dev = vcpu_find_pio_dev(vcpu, port,
2421 vcpu->arch.pio.cur_count,
2422 !vcpu->arch.pio.in);
2421 if (!vcpu->arch.pio.in) { 2423 if (!vcpu->arch.pio.in) {
2422 /* string PIO write */ 2424 /* string PIO write */
2423 ret = pio_copy_data(vcpu); 2425 ret = pio_copy_data(vcpu);
@@ -2600,27 +2602,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2600 2602
2601unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2603unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2602{ 2604{
2605 unsigned long value;
2606
2603 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2607 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2604 switch (cr) { 2608 switch (cr) {
2605 case 0: 2609 case 0:
2606 return vcpu->arch.cr0; 2610 value = vcpu->arch.cr0;
2611 break;
2607 case 2: 2612 case 2:
2608 return vcpu->arch.cr2; 2613 value = vcpu->arch.cr2;
2614 break;
2609 case 3: 2615 case 3:
2610 return vcpu->arch.cr3; 2616 value = vcpu->arch.cr3;
2617 break;
2611 case 4: 2618 case 4:
2612 return vcpu->arch.cr4; 2619 value = vcpu->arch.cr4;
2620 break;
2613 case 8: 2621 case 8:
2614 return kvm_get_cr8(vcpu); 2622 value = kvm_get_cr8(vcpu);
2623 break;
2615 default: 2624 default:
2616 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2625 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2617 return 0; 2626 return 0;
2618 } 2627 }
2628 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2629 (u32)((u64)value >> 32), handler);
2630
2631 return value;
2619} 2632}
2620 2633
2621void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2634void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2622 unsigned long *rflags) 2635 unsigned long *rflags)
2623{ 2636{
2637 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2638 (u32)((u64)val >> 32), handler);
2639
2624 switch (cr) { 2640 switch (cr) {
2625 case 0: 2641 case 0:
2626 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2642 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -2771,8 +2787,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2771 if (!apic || !apic->vapic_addr) 2787 if (!apic || !apic->vapic_addr)
2772 return; 2788 return;
2773 2789
2790 down_read(&vcpu->kvm->slots_lock);
2774 kvm_release_page_dirty(apic->vapic_page); 2791 kvm_release_page_dirty(apic->vapic_page);
2775 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2792 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2793 up_read(&vcpu->kvm->slots_lock);
2776} 2794}
2777 2795
2778static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2796static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -2928,9 +2946,7 @@ out:
2928 2946
2929 post_kvm_run_save(vcpu, kvm_run); 2947 post_kvm_run_save(vcpu, kvm_run);
2930 2948
2931 down_read(&vcpu->kvm->slots_lock);
2932 vapic_exit(vcpu); 2949 vapic_exit(vcpu);
2933 up_read(&vcpu->kvm->slots_lock);
2934 2950
2935 return r; 2951 return r;
2936} 2952}
@@ -2942,15 +2958,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2942 2958
2943 vcpu_load(vcpu); 2959 vcpu_load(vcpu);
2944 2960
2961 if (vcpu->sigset_active)
2962 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2963
2945 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 2964 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2946 kvm_vcpu_block(vcpu); 2965 kvm_vcpu_block(vcpu);
2947 vcpu_put(vcpu); 2966 r = -EAGAIN;
2948 return -EAGAIN; 2967 goto out;
2949 } 2968 }
2950 2969
2951 if (vcpu->sigset_active)
2952 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2953
2954 /* re-sync apic's tpr */ 2970 /* re-sync apic's tpr */
2955 if (!irqchip_in_kernel(vcpu->kvm)) 2971 if (!irqchip_in_kernel(vcpu->kvm))
2956 kvm_set_cr8(vcpu, kvm_run->cr8); 2972 kvm_set_cr8(vcpu, kvm_run->cr8);
@@ -3070,8 +3086,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3070 return 0; 3086 return 0;
3071} 3087}
3072 3088
3073static void get_segment(struct kvm_vcpu *vcpu, 3089void kvm_get_segment(struct kvm_vcpu *vcpu,
3074 struct kvm_segment *var, int seg) 3090 struct kvm_segment *var, int seg)
3075{ 3091{
3076 kvm_x86_ops->get_segment(vcpu, var, seg); 3092 kvm_x86_ops->get_segment(vcpu, var, seg);
3077} 3093}
@@ -3080,7 +3096,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3080{ 3096{
3081 struct kvm_segment cs; 3097 struct kvm_segment cs;
3082 3098
3083 get_segment(vcpu, &cs, VCPU_SREG_CS); 3099 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3084 *db = cs.db; 3100 *db = cs.db;
3085 *l = cs.l; 3101 *l = cs.l;
3086} 3102}
@@ -3094,15 +3110,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3094 3110
3095 vcpu_load(vcpu); 3111 vcpu_load(vcpu);
3096 3112
3097 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3113 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3098 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3114 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3099 get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3115 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3100 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3116 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3101 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3117 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3102 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3118 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3103 3119
3104 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3120 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3105 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3121 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3106 3122
3107 kvm_x86_ops->get_idt(vcpu, &dt); 3123 kvm_x86_ops->get_idt(vcpu, &dt);
3108 sregs->idt.limit = dt.limit; 3124 sregs->idt.limit = dt.limit;
@@ -3154,7 +3170,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3154 return 0; 3170 return 0;
3155} 3171}
3156 3172
3157static void set_segment(struct kvm_vcpu *vcpu, 3173static void kvm_set_segment(struct kvm_vcpu *vcpu,
3158 struct kvm_segment *var, int seg) 3174 struct kvm_segment *var, int seg)
3159{ 3175{
3160 kvm_x86_ops->set_segment(vcpu, var, seg); 3176 kvm_x86_ops->set_segment(vcpu, var, seg);
@@ -3191,7 +3207,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3191 if (selector & 1 << 2) { 3207 if (selector & 1 << 2) {
3192 struct kvm_segment kvm_seg; 3208 struct kvm_segment kvm_seg;
3193 3209
3194 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3210 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3195 3211
3196 if (kvm_seg.unusable) 3212 if (kvm_seg.unusable)
3197 dtable->limit = 0; 3213 dtable->limit = 0;
@@ -3297,7 +3313,7 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3297{ 3313{
3298 struct kvm_segment kvm_seg; 3314 struct kvm_segment kvm_seg;
3299 3315
3300 get_segment(vcpu, &kvm_seg, seg); 3316 kvm_get_segment(vcpu, &kvm_seg, seg);
3301 return kvm_seg.selector; 3317 return kvm_seg.selector;
3302} 3318}
3303 3319
@@ -3313,8 +3329,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3313 return 0; 3329 return 0;
3314} 3330}
3315 3331
3316static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3332int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3317 int type_bits, int seg) 3333 int type_bits, int seg)
3318{ 3334{
3319 struct kvm_segment kvm_seg; 3335 struct kvm_segment kvm_seg;
3320 3336
@@ -3327,7 +3343,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3327 if (!kvm_seg.s) 3343 if (!kvm_seg.s)
3328 kvm_seg.unusable = 1; 3344 kvm_seg.unusable = 1;
3329 3345
3330 set_segment(vcpu, &kvm_seg, seg); 3346 kvm_set_segment(vcpu, &kvm_seg, seg);
3331 return 0; 3347 return 0;
3332} 3348}
3333 3349
@@ -3373,25 +3389,25 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3373 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3389 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3374 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3390 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3375 3391
3376 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3392 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3377 return 1; 3393 return 1;
3378 3394
3379 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3395 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3380 return 1; 3396 return 1;
3381 3397
3382 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3398 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3383 return 1; 3399 return 1;
3384 3400
3385 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3401 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3386 return 1; 3402 return 1;
3387 3403
3388 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3404 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3389 return 1; 3405 return 1;
3390 3406
3391 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3407 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3392 return 1; 3408 return 1;
3393 3409
3394 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3410 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3395 return 1; 3411 return 1;
3396 return 0; 3412 return 0;
3397} 3413}
@@ -3432,24 +3448,24 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3432 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3448 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3433 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3449 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3434 3450
3435 if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3451 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3436 return 1; 3452 return 1;
3437 3453
3438 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3454 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3439 return 1; 3455 return 1;
3440 3456
3441 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3457 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3442 return 1; 3458 return 1;
3443 3459
3444 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3460 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3445 return 1; 3461 return 1;
3446 3462
3447 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3463 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3448 return 1; 3464 return 1;
3449 return 0; 3465 return 0;
3450} 3466}
3451 3467
3452int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3468static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3453 struct desc_struct *cseg_desc, 3469 struct desc_struct *cseg_desc,
3454 struct desc_struct *nseg_desc) 3470 struct desc_struct *nseg_desc)
3455{ 3471{
@@ -3472,7 +3488,7 @@ out:
3472 return ret; 3488 return ret;
3473} 3489}
3474 3490
3475int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3491static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3476 struct desc_struct *cseg_desc, 3492 struct desc_struct *cseg_desc,
3477 struct desc_struct *nseg_desc) 3493 struct desc_struct *nseg_desc)
3478{ 3494{
@@ -3502,7 +3518,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3502 struct desc_struct nseg_desc; 3518 struct desc_struct nseg_desc;
3503 int ret = 0; 3519 int ret = 0;
3504 3520
3505 get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3521 kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3506 3522
3507 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3523 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3508 goto out; 3524 goto out;
@@ -3561,7 +3577,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3561 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3577 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3562 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3578 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3563 tr_seg.type = 11; 3579 tr_seg.type = 11;
3564 set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3580 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3565out: 3581out:
3566 kvm_x86_ops->decache_regs(vcpu); 3582 kvm_x86_ops->decache_regs(vcpu);
3567 return ret; 3583 return ret;
@@ -3628,15 +3644,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3628 } 3644 }
3629 } 3645 }
3630 3646
3631 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3647 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3632 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3648 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3633 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3649 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3634 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3650 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3635 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3651 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3636 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3652 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3637 3653
3638 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3654 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3639 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3655 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3640 3656
3641 vcpu_put(vcpu); 3657 vcpu_put(vcpu);
3642 3658
@@ -3751,14 +3767,14 @@ void fx_init(struct kvm_vcpu *vcpu)
3751 * allocate ram with GFP_KERNEL. 3767 * allocate ram with GFP_KERNEL.
3752 */ 3768 */
3753 if (!used_math()) 3769 if (!used_math())
3754 fx_save(&vcpu->arch.host_fx_image); 3770 kvm_fx_save(&vcpu->arch.host_fx_image);
3755 3771
3756 /* Initialize guest FPU by resetting ours and saving into guest's */ 3772 /* Initialize guest FPU by resetting ours and saving into guest's */
3757 preempt_disable(); 3773 preempt_disable();
3758 fx_save(&vcpu->arch.host_fx_image); 3774 kvm_fx_save(&vcpu->arch.host_fx_image);
3759 fx_finit(); 3775 kvm_fx_finit();
3760 fx_save(&vcpu->arch.guest_fx_image); 3776 kvm_fx_save(&vcpu->arch.guest_fx_image);
3761 fx_restore(&vcpu->arch.host_fx_image); 3777 kvm_fx_restore(&vcpu->arch.host_fx_image);
3762 preempt_enable(); 3778 preempt_enable();
3763 3779
3764 vcpu->arch.cr0 |= X86_CR0_ET; 3780 vcpu->arch.cr0 |= X86_CR0_ET;
@@ -3775,8 +3791,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3775 return; 3791 return;
3776 3792
3777 vcpu->guest_fpu_loaded = 1; 3793 vcpu->guest_fpu_loaded = 1;
3778 fx_save(&vcpu->arch.host_fx_image); 3794 kvm_fx_save(&vcpu->arch.host_fx_image);
3779 fx_restore(&vcpu->arch.guest_fx_image); 3795 kvm_fx_restore(&vcpu->arch.guest_fx_image);
3780} 3796}
3781EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3797EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3782 3798
@@ -3786,8 +3802,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3786 return; 3802 return;
3787 3803
3788 vcpu->guest_fpu_loaded = 0; 3804 vcpu->guest_fpu_loaded = 0;
3789 fx_save(&vcpu->arch.guest_fx_image); 3805 kvm_fx_save(&vcpu->arch.guest_fx_image);
3790 fx_restore(&vcpu->arch.host_fx_image); 3806 kvm_fx_restore(&vcpu->arch.host_fx_image);
3791 ++vcpu->stat.fpu_reload; 3807 ++vcpu->stat.fpu_reload;
3792} 3808}
3793EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3809EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
@@ -4016,6 +4032,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4016 return 0; 4032 return 0;
4017} 4033}
4018 4034
4035void kvm_arch_flush_shadow(struct kvm *kvm)
4036{
4037 kvm_mmu_zap_all(kvm);
4038}
4039
4019int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4040int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4020{ 4041{
4021 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4042 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 932f216d890..f2f90468f8b 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -121,7 +121,7 @@ static u16 opcode_table[256] = {
121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
122 0, 0, 0, 0, 122 0, 0, 0, 0,
123 /* 0x68 - 0x6F */ 123 /* 0x68 - 0x6F */
124 0, 0, ImplicitOps | Mov | Stack, 0, 124 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
127 /* 0x70 - 0x77 */ 127 /* 0x70 - 0x77 */
@@ -138,9 +138,11 @@ static u16 opcode_table[256] = {
138 /* 0x88 - 0x8F */ 138 /* 0x88 - 0x8F */
139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
141 0, ModRM | DstReg, 0, Group | Group1A, 141 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
142 /* 0x90 - 0x9F */ 142 DstReg | SrcMem | ModRM | Mov, Group | Group1A,
143 0, 0, 0, 0, 0, 0, 0, 0, 143 /* 0x90 - 0x97 */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x98 - 0x9F */
144 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 146 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
145 /* 0xA0 - 0xA7 */ 147 /* 0xA0 - 0xA7 */
146 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 148 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -152,7 +154,8 @@ static u16 opcode_table[256] = {
152 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
153 ByteOp | ImplicitOps | String, ImplicitOps | String, 155 ByteOp | ImplicitOps | String, ImplicitOps | String,
154 /* 0xB0 - 0xBF */ 156 /* 0xB0 - 0xBF */
155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xC0 - 0xC7 */ 159 /* 0xC0 - 0xC7 */
157 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
158 0, ImplicitOps | Stack, 0, 0, 161 0, ImplicitOps | Stack, 0, 0,
@@ -168,7 +171,8 @@ static u16 opcode_table[256] = {
168 /* 0xE0 - 0xE7 */ 171 /* 0xE0 - 0xE7 */
169 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0,
170 /* 0xE8 - 0xEF */ 173 /* 0xE8 - 0xEF */
171 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 174 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps,
172 0, 0, 0, 0, 176 0, 0, 0, 0,
173 /* 0xF0 - 0xF7 */ 177 /* 0xF0 - 0xF7 */
174 0, 0, 0, 0, 178 0, 0, 0, 0,
@@ -215,7 +219,7 @@ static u16 twobyte_table[256] = {
215 /* 0xA0 - 0xA7 */ 219 /* 0xA0 - 0xA7 */
216 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 220 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
217 /* 0xA8 - 0xAF */ 221 /* 0xA8 - 0xAF */
218 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 222 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
219 /* 0xB0 - 0xB7 */ 223 /* 0xB0 - 0xB7 */
220 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 224 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
221 DstMem | SrcReg | ModRM | BitOp, 225 DstMem | SrcReg | ModRM | BitOp,
@@ -518,6 +522,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
518 register_address_increment(c, &c->eip, rel); 522 register_address_increment(c, &c->eip, rel);
519} 523}
520 524
525static void set_seg_override(struct decode_cache *c, int seg)
526{
527 c->has_seg_override = true;
528 c->seg_override = seg;
529}
530
531static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
532{
533 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
534 return 0;
535
536 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
537}
538
539static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
540 struct decode_cache *c)
541{
542 if (!c->has_seg_override)
543 return 0;
544
545 return seg_base(ctxt, c->seg_override);
546}
547
548static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
549{
550 return seg_base(ctxt, VCPU_SREG_ES);
551}
552
553static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
554{
555 return seg_base(ctxt, VCPU_SREG_SS);
556}
557
521static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 558static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
522 struct x86_emulate_ops *ops, 559 struct x86_emulate_ops *ops,
523 unsigned long linear, u8 *dest) 560 unsigned long linear, u8 *dest)
@@ -660,7 +697,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
660{ 697{
661 struct decode_cache *c = &ctxt->decode; 698 struct decode_cache *c = &ctxt->decode;
662 u8 sib; 699 u8 sib;
663 int index_reg = 0, base_reg = 0, scale, rip_relative = 0; 700 int index_reg = 0, base_reg = 0, scale;
664 int rc = 0; 701 int rc = 0;
665 702
666 if (c->rex_prefix) { 703 if (c->rex_prefix) {
@@ -731,47 +768,28 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
731 } 768 }
732 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 769 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
733 (c->modrm_rm == 6 && c->modrm_mod != 0)) 770 (c->modrm_rm == 6 && c->modrm_mod != 0))
734 if (!c->override_base) 771 if (!c->has_seg_override)
735 c->override_base = &ctxt->ss_base; 772 set_seg_override(c, VCPU_SREG_SS);
736 c->modrm_ea = (u16)c->modrm_ea; 773 c->modrm_ea = (u16)c->modrm_ea;
737 } else { 774 } else {
738 /* 32/64-bit ModR/M decode. */ 775 /* 32/64-bit ModR/M decode. */
739 switch (c->modrm_rm) { 776 if ((c->modrm_rm & 7) == 4) {
740 case 4:
741 case 12:
742 sib = insn_fetch(u8, 1, c->eip); 777 sib = insn_fetch(u8, 1, c->eip);
743 index_reg |= (sib >> 3) & 7; 778 index_reg |= (sib >> 3) & 7;
744 base_reg |= sib & 7; 779 base_reg |= sib & 7;
745 scale = sib >> 6; 780 scale = sib >> 6;
746 781
747 switch (base_reg) { 782 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
748 case 5: 783 c->modrm_ea += insn_fetch(s32, 4, c->eip);
749 if (c->modrm_mod != 0) 784 else
750 c->modrm_ea += c->regs[base_reg];
751 else
752 c->modrm_ea +=
753 insn_fetch(s32, 4, c->eip);
754 break;
755 default:
756 c->modrm_ea += c->regs[base_reg]; 785 c->modrm_ea += c->regs[base_reg];
757 } 786 if (index_reg != 4)
758 switch (index_reg) {
759 case 4:
760 break;
761 default:
762 c->modrm_ea += c->regs[index_reg] << scale; 787 c->modrm_ea += c->regs[index_reg] << scale;
763 } 788 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
764 break; 789 if (ctxt->mode == X86EMUL_MODE_PROT64)
765 case 5: 790 c->rip_relative = 1;
766 if (c->modrm_mod != 0) 791 } else
767 c->modrm_ea += c->regs[c->modrm_rm];
768 else if (ctxt->mode == X86EMUL_MODE_PROT64)
769 rip_relative = 1;
770 break;
771 default:
772 c->modrm_ea += c->regs[c->modrm_rm]; 792 c->modrm_ea += c->regs[c->modrm_rm];
773 break;
774 }
775 switch (c->modrm_mod) { 793 switch (c->modrm_mod) {
776 case 0: 794 case 0:
777 if (c->modrm_rm == 5) 795 if (c->modrm_rm == 5)
@@ -785,22 +803,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
785 break; 803 break;
786 } 804 }
787 } 805 }
788 if (rip_relative) {
789 c->modrm_ea += c->eip;
790 switch (c->d & SrcMask) {
791 case SrcImmByte:
792 c->modrm_ea += 1;
793 break;
794 case SrcImm:
795 if (c->d & ByteOp)
796 c->modrm_ea += 1;
797 else
798 if (c->op_bytes == 8)
799 c->modrm_ea += 4;
800 else
801 c->modrm_ea += c->op_bytes;
802 }
803 }
804done: 806done:
805 return rc; 807 return rc;
806} 808}
@@ -838,6 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
838 840
839 memset(c, 0, sizeof(struct decode_cache)); 841 memset(c, 0, sizeof(struct decode_cache));
840 c->eip = ctxt->vcpu->arch.rip; 842 c->eip = ctxt->vcpu->arch.rip;
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
841 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
842 845
843 switch (mode) { 846 switch (mode) {
@@ -876,23 +879,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
876 /* switch between 2/4 bytes */ 879 /* switch between 2/4 bytes */
877 c->ad_bytes = def_ad_bytes ^ 6; 880 c->ad_bytes = def_ad_bytes ^ 6;
878 break; 881 break;
882 case 0x26: /* ES override */
879 case 0x2e: /* CS override */ 883 case 0x2e: /* CS override */
880 c->override_base = &ctxt->cs_base; 884 case 0x36: /* SS override */
881 break;
882 case 0x3e: /* DS override */ 885 case 0x3e: /* DS override */
883 c->override_base = &ctxt->ds_base; 886 set_seg_override(c, (c->b >> 3) & 3);
884 break;
885 case 0x26: /* ES override */
886 c->override_base = &ctxt->es_base;
887 break; 887 break;
888 case 0x64: /* FS override */ 888 case 0x64: /* FS override */
889 c->override_base = &ctxt->fs_base;
890 break;
891 case 0x65: /* GS override */ 889 case 0x65: /* GS override */
892 c->override_base = &ctxt->gs_base; 890 set_seg_override(c, c->b & 7);
893 break;
894 case 0x36: /* SS override */
895 c->override_base = &ctxt->ss_base;
896 break; 891 break;
897 case 0x40 ... 0x4f: /* REX */ 892 case 0x40 ... 0x4f: /* REX */
898 if (mode != X86EMUL_MODE_PROT64) 893 if (mode != X86EMUL_MODE_PROT64)
@@ -964,15 +959,11 @@ done_prefixes:
964 if (rc) 959 if (rc)
965 goto done; 960 goto done;
966 961
967 if (!c->override_base) 962 if (!c->has_seg_override)
968 c->override_base = &ctxt->ds_base; 963 set_seg_override(c, VCPU_SREG_DS);
969 if (mode == X86EMUL_MODE_PROT64 &&
970 c->override_base != &ctxt->fs_base &&
971 c->override_base != &ctxt->gs_base)
972 c->override_base = NULL;
973 964
974 if (c->override_base) 965 if (!(!c->twobyte && c->b == 0x8d))
975 c->modrm_ea += *c->override_base; 966 c->modrm_ea += seg_override_base(ctxt, c);
976 967
977 if (c->ad_bytes != 8) 968 if (c->ad_bytes != 8)
978 c->modrm_ea = (u32)c->modrm_ea; 969 c->modrm_ea = (u32)c->modrm_ea;
@@ -1049,6 +1040,7 @@ done_prefixes:
1049 break; 1040 break;
1050 case DstMem: 1041 case DstMem:
1051 if ((c->d & ModRM) && c->modrm_mod == 3) { 1042 if ((c->d & ModRM) && c->modrm_mod == 3) {
1043 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1052 c->dst.type = OP_REG; 1044 c->dst.type = OP_REG;
1053 c->dst.val = c->dst.orig_val = c->modrm_val; 1045 c->dst.val = c->dst.orig_val = c->modrm_val;
1054 c->dst.ptr = c->modrm_ptr; 1046 c->dst.ptr = c->modrm_ptr;
@@ -1058,6 +1050,9 @@ done_prefixes:
1058 break; 1050 break;
1059 } 1051 }
1060 1052
1053 if (c->rip_relative)
1054 c->modrm_ea += c->eip;
1055
1061done: 1056done:
1062 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1057 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1063} 1058}
@@ -1070,7 +1065,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1070 c->dst.bytes = c->op_bytes; 1065 c->dst.bytes = c->op_bytes;
1071 c->dst.val = c->src.val; 1066 c->dst.val = c->src.val;
1072 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1067 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1073 c->dst.ptr = (void *) register_address(c, ctxt->ss_base, 1068 c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
1074 c->regs[VCPU_REGS_RSP]); 1069 c->regs[VCPU_REGS_RSP]);
1075} 1070}
1076 1071
@@ -1080,7 +1075,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1080 struct decode_cache *c = &ctxt->decode; 1075 struct decode_cache *c = &ctxt->decode;
1081 int rc; 1076 int rc;
1082 1077
1083 rc = ops->read_std(register_address(c, ctxt->ss_base, 1078 rc = ops->read_std(register_address(c, ss_base(ctxt),
1084 c->regs[VCPU_REGS_RSP]), 1079 c->regs[VCPU_REGS_RSP]),
1085 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1080 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1086 if (rc != 0) 1081 if (rc != 0)
@@ -1402,11 +1397,11 @@ special_insn:
1402 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1403 -c->op_bytes); 1398 -c->op_bytes);
1404 c->dst.ptr = (void *) register_address( 1399 c->dst.ptr = (void *) register_address(
1405 c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); 1400 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1406 break; 1401 break;
1407 case 0x58 ... 0x5f: /* pop reg */ 1402 case 0x58 ... 0x5f: /* pop reg */
1408 pop_instruction: 1403 pop_instruction:
1409 if ((rc = ops->read_std(register_address(c, ctxt->ss_base, 1404 if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
1410 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1405 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1411 c->op_bytes, ctxt->vcpu)) != 0) 1406 c->op_bytes, ctxt->vcpu)) != 0)
1412 goto done; 1407 goto done;
@@ -1420,9 +1415,8 @@ special_insn:
1420 goto cannot_emulate; 1415 goto cannot_emulate;
1421 c->dst.val = (s32) c->src.val; 1416 c->dst.val = (s32) c->src.val;
1422 break; 1417 break;
1418 case 0x68: /* push imm */
1423 case 0x6a: /* push imm8 */ 1419 case 0x6a: /* push imm8 */
1424 c->src.val = 0L;
1425 c->src.val = insn_fetch(s8, 1, c->eip);
1426 emulate_push(ctxt); 1420 emulate_push(ctxt);
1427 break; 1421 break;
1428 case 0x6c: /* insb */ 1422 case 0x6c: /* insb */
@@ -1433,7 +1427,7 @@ special_insn:
1433 c->rep_prefix ? 1427 c->rep_prefix ?
1434 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1428 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1435 (ctxt->eflags & EFLG_DF), 1429 (ctxt->eflags & EFLG_DF),
1436 register_address(c, ctxt->es_base, 1430 register_address(c, es_base(ctxt),
1437 c->regs[VCPU_REGS_RDI]), 1431 c->regs[VCPU_REGS_RDI]),
1438 c->rep_prefix, 1432 c->rep_prefix,
1439 c->regs[VCPU_REGS_RDX]) == 0) { 1433 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1449,9 +1443,8 @@ special_insn:
1449 c->rep_prefix ? 1443 c->rep_prefix ?
1450 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1444 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1451 (ctxt->eflags & EFLG_DF), 1445 (ctxt->eflags & EFLG_DF),
1452 register_address(c, c->override_base ? 1446 register_address(c,
1453 *c->override_base : 1447 seg_override_base(ctxt, c),
1454 ctxt->ds_base,
1455 c->regs[VCPU_REGS_RSI]), 1448 c->regs[VCPU_REGS_RSI]),
1456 c->rep_prefix, 1449 c->rep_prefix,
1457 c->regs[VCPU_REGS_RDX]) == 0) { 1450 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1490,6 +1483,7 @@ special_insn:
1490 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1483 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1491 break; 1484 break;
1492 case 0x86 ... 0x87: /* xchg */ 1485 case 0x86 ... 0x87: /* xchg */
1486 xchg:
1493 /* Write back the register source. */ 1487 /* Write back the register source. */
1494 switch (c->dst.bytes) { 1488 switch (c->dst.bytes) {
1495 case 1: 1489 case 1:
@@ -1514,14 +1508,60 @@ special_insn:
1514 break; 1508 break;
1515 case 0x88 ... 0x8b: /* mov */ 1509 case 0x88 ... 0x8b: /* mov */
1516 goto mov; 1510 goto mov;
1511 case 0x8c: { /* mov r/m, sreg */
1512 struct kvm_segment segreg;
1513
1514 if (c->modrm_reg <= 5)
1515 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
1516 else {
1517 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
1518 c->modrm);
1519 goto cannot_emulate;
1520 }
1521 c->dst.val = segreg.selector;
1522 break;
1523 }
1517 case 0x8d: /* lea r16/r32, m */ 1524 case 0x8d: /* lea r16/r32, m */
1518 c->dst.val = c->modrm_ea; 1525 c->dst.val = c->modrm_ea;
1519 break; 1526 break;
1527 case 0x8e: { /* mov seg, r/m16 */
1528 uint16_t sel;
1529 int type_bits;
1530 int err;
1531
1532 sel = c->src.val;
1533 if (c->modrm_reg <= 5) {
1534 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1535 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
1536 type_bits, c->modrm_reg);
1537 } else {
1538 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1539 c->modrm);
1540 goto cannot_emulate;
1541 }
1542
1543 if (err < 0)
1544 goto cannot_emulate;
1545
1546 c->dst.type = OP_NONE; /* Disable writeback. */
1547 break;
1548 }
1520 case 0x8f: /* pop (sole member of Grp1a) */ 1549 case 0x8f: /* pop (sole member of Grp1a) */
1521 rc = emulate_grp1a(ctxt, ops); 1550 rc = emulate_grp1a(ctxt, ops);
1522 if (rc != 0) 1551 if (rc != 0)
1523 goto done; 1552 goto done;
1524 break; 1553 break;
1554 case 0x90: /* nop / xchg r8,rax */
1555 if (!(c->rex_prefix & 1)) { /* nop */
1556 c->dst.type = OP_NONE;
1557 break;
1558 }
1559 case 0x91 ... 0x97: /* xchg reg,rax */
1560 c->src.type = c->dst.type = OP_REG;
1561 c->src.bytes = c->dst.bytes = c->op_bytes;
1562 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
1563 c->src.val = *(c->src.ptr);
1564 goto xchg;
1525 case 0x9c: /* pushf */ 1565 case 0x9c: /* pushf */
1526 c->src.val = (unsigned long) ctxt->eflags; 1566 c->src.val = (unsigned long) ctxt->eflags;
1527 emulate_push(ctxt); 1567 emulate_push(ctxt);
@@ -1540,11 +1580,10 @@ special_insn:
1540 c->dst.type = OP_MEM; 1580 c->dst.type = OP_MEM;
1541 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1581 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1542 c->dst.ptr = (unsigned long *)register_address(c, 1582 c->dst.ptr = (unsigned long *)register_address(c,
1543 ctxt->es_base, 1583 es_base(ctxt),
1544 c->regs[VCPU_REGS_RDI]); 1584 c->regs[VCPU_REGS_RDI]);
1545 if ((rc = ops->read_emulated(register_address(c, 1585 if ((rc = ops->read_emulated(register_address(c,
1546 c->override_base ? *c->override_base : 1586 seg_override_base(ctxt, c),
1547 ctxt->ds_base,
1548 c->regs[VCPU_REGS_RSI]), 1587 c->regs[VCPU_REGS_RSI]),
1549 &c->dst.val, 1588 &c->dst.val,
1550 c->dst.bytes, ctxt->vcpu)) != 0) 1589 c->dst.bytes, ctxt->vcpu)) != 0)
@@ -1560,8 +1599,7 @@ special_insn:
1560 c->src.type = OP_NONE; /* Disable writeback. */ 1599 c->src.type = OP_NONE; /* Disable writeback. */
1561 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1562 c->src.ptr = (unsigned long *)register_address(c, 1601 c->src.ptr = (unsigned long *)register_address(c,
1563 c->override_base ? *c->override_base : 1602 seg_override_base(ctxt, c),
1564 ctxt->ds_base,
1565 c->regs[VCPU_REGS_RSI]); 1603 c->regs[VCPU_REGS_RSI]);
1566 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 1604 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1567 &c->src.val, 1605 &c->src.val,
@@ -1572,7 +1610,7 @@ special_insn:
1572 c->dst.type = OP_NONE; /* Disable writeback. */ 1610 c->dst.type = OP_NONE; /* Disable writeback. */
1573 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1611 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1574 c->dst.ptr = (unsigned long *)register_address(c, 1612 c->dst.ptr = (unsigned long *)register_address(c,
1575 ctxt->es_base, 1613 es_base(ctxt),
1576 c->regs[VCPU_REGS_RDI]); 1614 c->regs[VCPU_REGS_RDI]);
1577 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1615 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1578 &c->dst.val, 1616 &c->dst.val,
@@ -1596,7 +1634,7 @@ special_insn:
1596 c->dst.type = OP_MEM; 1634 c->dst.type = OP_MEM;
1597 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1635 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1598 c->dst.ptr = (unsigned long *)register_address(c, 1636 c->dst.ptr = (unsigned long *)register_address(c,
1599 ctxt->es_base, 1637 es_base(ctxt),
1600 c->regs[VCPU_REGS_RDI]); 1638 c->regs[VCPU_REGS_RDI]);
1601 c->dst.val = c->regs[VCPU_REGS_RAX]; 1639 c->dst.val = c->regs[VCPU_REGS_RAX];
1602 register_address_increment(c, &c->regs[VCPU_REGS_RDI], 1640 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
@@ -1608,8 +1646,7 @@ special_insn:
1608 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1646 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1609 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1647 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1610 if ((rc = ops->read_emulated(register_address(c, 1648 if ((rc = ops->read_emulated(register_address(c,
1611 c->override_base ? *c->override_base : 1649 seg_override_base(ctxt, c),
1612 ctxt->ds_base,
1613 c->regs[VCPU_REGS_RSI]), 1650 c->regs[VCPU_REGS_RSI]),
1614 &c->dst.val, 1651 &c->dst.val,
1615 c->dst.bytes, 1652 c->dst.bytes,
@@ -1622,6 +1659,8 @@ special_insn:
1622 case 0xae ... 0xaf: /* scas */ 1659 case 0xae ... 0xaf: /* scas */
1623 DPRINTF("Urk! I don't handle SCAS.\n"); 1660 DPRINTF("Urk! I don't handle SCAS.\n");
1624 goto cannot_emulate; 1661 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */
1663 goto mov;
1625 case 0xc0 ... 0xc1: 1664 case 0xc0 ... 0xc1:
1626 emulate_grp2(ctxt); 1665 emulate_grp2(ctxt);
1627 break; 1666 break;
@@ -1660,13 +1699,39 @@ special_insn:
1660 break; 1699 break;
1661 } 1700 }
1662 case 0xe9: /* jmp rel */ 1701 case 0xe9: /* jmp rel */
1663 case 0xeb: /* jmp rel short */ 1702 goto jmp;
1703 case 0xea: /* jmp far */ {
1704 uint32_t eip;
1705 uint16_t sel;
1706
1707 switch (c->op_bytes) {
1708 case 2:
1709 eip = insn_fetch(u16, 2, c->eip);
1710 break;
1711 case 4:
1712 eip = insn_fetch(u32, 4, c->eip);
1713 break;
1714 default:
1715 DPRINTF("jmp far: Invalid op_bytes\n");
1716 goto cannot_emulate;
1717 }
1718 sel = insn_fetch(u16, 2, c->eip);
1719 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1720 DPRINTF("jmp far: Failed to load CS descriptor\n");
1721 goto cannot_emulate;
1722 }
1723
1724 c->eip = eip;
1725 break;
1726 }
1727 case 0xeb:
1728 jmp: /* jmp rel short */
1664 jmp_rel(c, c->src.val); 1729 jmp_rel(c, c->src.val);
1665 c->dst.type = OP_NONE; /* Disable writeback. */ 1730 c->dst.type = OP_NONE; /* Disable writeback. */
1666 break; 1731 break;
1667 case 0xf4: /* hlt */ 1732 case 0xf4: /* hlt */
1668 ctxt->vcpu->arch.halt_request = 1; 1733 ctxt->vcpu->arch.halt_request = 1;
1669 goto done; 1734 break;
1670 case 0xf5: /* cmc */ 1735 case 0xf5: /* cmc */
1671 /* complement carry flag from eflags reg */ 1736 /* complement carry flag from eflags reg */
1672 ctxt->eflags ^= EFLG_CF; 1737 ctxt->eflags ^= EFLG_CF;
@@ -1882,6 +1947,8 @@ twobyte_insn:
1882 c->src.val &= (c->dst.bytes << 3) - 1; 1947 c->src.val &= (c->dst.bytes << 3) - 1;
1883 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 1948 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1884 break; 1949 break;
1950 case 0xae: /* clflush */
1951 break;
1885 case 0xb0 ... 0xb1: /* cmpxchg */ 1952 case 0xb0 ... 0xb1: /* cmpxchg */
1886 /* 1953 /*
1887 * Save real source value, then compare EAX against 1954 * Save real source value, then compare EAX against
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 50dad44fb54..0313a5eec41 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -991,7 +991,6 @@ __init void lguest_init(void)
991#ifdef CONFIG_X86_LOCAL_APIC 991#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 992 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 993 pv_apic_ops.apic_write = lguest_apic_write;
994 pv_apic_ops.apic_write_atomic = lguest_apic_write;
995 pv_apic_ops.apic_read = lguest_apic_read; 994 pv_apic_ops.apic_read = lguest_apic_read;
996#endif 995#endif
997 996
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 48278fa7d3d..3d317836be9 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,14 +10,6 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13/*
14 * Any quirks to be performed to initialize timers/irqs/etc?
15 */
16int (*arch_time_init_quirk)(void);
17int (*arch_pre_intr_init_quirk)(void);
18int (*arch_intr_init_quirk)(void);
19int (*arch_trap_init_quirk)(void);
20
21#ifdef CONFIG_HOTPLUG_CPU 13#ifdef CONFIG_HOTPLUG_CPU
22#define DEFAULT_SEND_IPI (1) 14#define DEFAULT_SEND_IPI (1)
23#else 15#else
@@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
37 **/ 29 **/
38void __init pre_intr_init_hook(void) 30void __init pre_intr_init_hook(void)
39{ 31{
40 if (arch_pre_intr_init_quirk) { 32 if (x86_quirks->arch_pre_intr_init) {
41 if (arch_pre_intr_init_quirk()) 33 if (x86_quirks->arch_pre_intr_init())
42 return; 34 return;
43 } 35 }
44 init_ISA_irqs(); 36 init_ISA_irqs();
@@ -64,8 +56,8 @@ static struct irqaction irq2 = {
64 **/ 56 **/
65void __init intr_init_hook(void) 57void __init intr_init_hook(void)
66{ 58{
67 if (arch_intr_init_quirk) { 59 if (x86_quirks->arch_intr_init) {
68 if (arch_intr_init_quirk()) 60 if (x86_quirks->arch_intr_init())
69 return; 61 return;
70 } 62 }
71#ifdef CONFIG_X86_LOCAL_APIC 63#ifdef CONFIG_X86_LOCAL_APIC
@@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
97 **/ 89 **/
98void __init trap_init_hook(void) 90void __init trap_init_hook(void)
99{ 91{
100 if (arch_trap_init_quirk) { 92 if (x86_quirks->arch_trap_init) {
101 if (arch_trap_init_quirk()) 93 if (x86_quirks->arch_trap_init())
102 return; 94 return;
103 } 95 }
104} 96}
@@ -111,6 +103,16 @@ static struct irqaction irq0 = {
111}; 103};
112 104
113/** 105/**
106 * pre_time_init_hook - do any specific initialisations before.
107 *
108 **/
109void __init pre_time_init_hook(void)
110{
111 if (x86_quirks->arch_pre_time_init)
112 x86_quirks->arch_pre_time_init();
113}
114
115/**
114 * time_init_hook - do any specific initialisations for the system timer. 116 * time_init_hook - do any specific initialisations for the system timer.
115 * 117 *
116 * Description: 118 * Description:
@@ -119,13 +121,13 @@ static struct irqaction irq0 = {
119 **/ 121 **/
120void __init time_init_hook(void) 122void __init time_init_hook(void)
121{ 123{
122 if (arch_time_init_quirk) { 124 if (x86_quirks->arch_time_init) {
123 /* 125 /*
124 * A nonzero return code does not mean failure, it means 126 * A nonzero return code does not mean failure, it means
125 * that the architecture quirk does not want any 127 * that the architecture quirk does not want any
126 * generic (timer) setup to be performed after this: 128 * generic (timer) setup to be performed after this:
127 */ 129 */
128 if (arch_time_init_quirk()) 130 if (x86_quirks->arch_time_init())
129 return; 131 return;
130 } 132 }
131 133
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9873716e9f7..1fbb844c3d7 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o
21endif 21endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23 23
24obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e6..d37f29376b0 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
844 reserve_early(table_start << PAGE_SHIFT, 844 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE"); 845 table_end << PAGE_SHIFT, "PGTABLE");
846 846
847 if (!after_init_bootmem)
848 early_memtest(start, end);
849
847 return end >> PAGE_SHIFT; 850 return end >> PAGE_SHIFT;
848} 851}
849 852
@@ -868,8 +871,6 @@ void __init paging_init(void)
868 */ 871 */
869 sparse_init(); 872 sparse_init();
870 zone_sizes_init(); 873 zone_sizes_init();
871
872 paravirt_post_allocator_init();
873} 874}
874 875
875/* 876/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 306049edd55..ec37121f670 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -517,118 +517,6 @@ static void __init init_gbpages(void)
517 direct_gbpages = 0; 517 direct_gbpages = 0;
518} 518}
519 519
520#ifdef CONFIG_MEMTEST
521
522static void __init memtest(unsigned long start_phys, unsigned long size,
523 unsigned pattern)
524{
525 unsigned long i;
526 unsigned long *start;
527 unsigned long start_bad;
528 unsigned long last_bad;
529 unsigned long val;
530 unsigned long start_phys_aligned;
531 unsigned long count;
532 unsigned long incr;
533
534 switch (pattern) {
535 case 0:
536 val = 0UL;
537 break;
538 case 1:
539 val = -1UL;
540 break;
541 case 2:
542 val = 0x5555555555555555UL;
543 break;
544 case 3:
545 val = 0xaaaaaaaaaaaaaaaaUL;
546 break;
547 default:
548 return;
549 }
550
551 incr = sizeof(unsigned long);
552 start_phys_aligned = ALIGN(start_phys, incr);
553 count = (size - (start_phys_aligned - start_phys))/incr;
554 start = __va(start_phys_aligned);
555 start_bad = 0;
556 last_bad = 0;
557
558 for (i = 0; i < count; i++)
559 start[i] = val;
560 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
561 if (*start != val) {
562 if (start_phys_aligned == last_bad + incr) {
563 last_bad += incr;
564 } else {
565 if (start_bad) {
566 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
567 val, start_bad, last_bad + incr);
568 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
569 }
570 start_bad = last_bad = start_phys_aligned;
571 }
572 }
573 }
574 if (start_bad) {
575 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
576 val, start_bad, last_bad + incr);
577 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
578 }
579
580}
581
582/* default is disabled */
583static int memtest_pattern __initdata;
584
585static int __init parse_memtest(char *arg)
586{
587 if (arg)
588 memtest_pattern = simple_strtoul(arg, NULL, 0);
589 return 0;
590}
591
592early_param("memtest", parse_memtest);
593
594static void __init early_memtest(unsigned long start, unsigned long end)
595{
596 u64 t_start, t_size;
597 unsigned pattern;
598
599 if (!memtest_pattern)
600 return;
601
602 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
603 for (pattern = 0; pattern < memtest_pattern; pattern++) {
604 t_start = start;
605 t_size = 0;
606 while (t_start < end) {
607 t_start = find_e820_area_size(t_start, &t_size, 1);
608
609 /* done ? */
610 if (t_start >= end)
611 break;
612 if (t_start + t_size > end)
613 t_size = end - t_start;
614
615 printk(KERN_CONT "\n %016llx - %016llx pattern %d",
616 (unsigned long long)t_start,
617 (unsigned long long)t_start + t_size, pattern);
618
619 memtest(t_start, t_size, pattern);
620
621 t_start += t_size;
622 }
623 }
624 printk(KERN_CONT "\n");
625}
626#else
627static void __init early_memtest(unsigned long start, unsigned long end)
628{
629}
630#endif
631
632static unsigned long __init kernel_physical_mapping_init(unsigned long start, 520static unsigned long __init kernel_physical_mapping_init(unsigned long start,
633 unsigned long end, 521 unsigned long end,
634 unsigned long page_size_mask) 522 unsigned long page_size_mask)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 00000000000..672e17f8262
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index d4585077977..2fe30916d4b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,8 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
15 17
16#include <asm/msr.h> 18#include <asm/msr.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
@@ -373,8 +375,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
373 return vma_prot; 375 return vma_prot;
374} 376}
375 377
376#ifdef CONFIG_NONPROMISC_DEVMEM 378#ifdef CONFIG_STRICT_DEVMEM
377/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 379/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
378static inline int range_is_allowed(unsigned long pfn, unsigned long size) 380static inline int range_is_allowed(unsigned long pfn, unsigned long size)
379{ 381{
380 return 1; 382 return 1;
@@ -398,7 +400,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
398 } 400 }
399 return 1; 401 return 1;
400} 402}
401#endif /* CONFIG_NONPROMISC_DEVMEM */ 403#endif /* CONFIG_STRICT_DEVMEM */
402 404
403int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 405int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
404 unsigned long size, pgprot_t *vma_prot) 406 unsigned long size, pgprot_t *vma_prot)
@@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
489 491
490 free_memtype(addr, addr + size); 492 free_memtype(addr, addr + size);
491} 493}
494
495#if defined(CONFIG_DEBUG_FS)
496
497/* get Nth element of the linked list */
498static struct memtype *memtype_get_idx(loff_t pos)
499{
500 struct memtype *list_node, *print_entry;
501 int i = 1;
502
503 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
504 if (!print_entry)
505 return NULL;
506
507 spin_lock(&memtype_lock);
508 list_for_each_entry(list_node, &memtype_list, nd) {
509 if (pos == i) {
510 *print_entry = *list_node;
511 spin_unlock(&memtype_lock);
512 return print_entry;
513 }
514 ++i;
515 }
516 spin_unlock(&memtype_lock);
517 kfree(print_entry);
518 return NULL;
519}
520
521static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
522{
523 if (*pos == 0) {
524 ++*pos;
525 seq_printf(seq, "PAT memtype list:\n");
526 }
527
528 return memtype_get_idx(*pos);
529}
530
531static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
532{
533 ++*pos;
534 return memtype_get_idx(*pos);
535}
536
537static void memtype_seq_stop(struct seq_file *seq, void *v)
538{
539}
540
541static int memtype_seq_show(struct seq_file *seq, void *v)
542{
543 struct memtype *print_entry = (struct memtype *)v;
544
545 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
546 print_entry->start, print_entry->end);
547 kfree(print_entry);
548 return 0;
549}
550
551static struct seq_operations memtype_seq_ops = {
552 .start = memtype_seq_start,
553 .next = memtype_seq_next,
554 .stop = memtype_seq_stop,
555 .show = memtype_seq_show,
556};
557
558static int memtype_seq_open(struct inode *inode, struct file *file)
559{
560 return seq_open(file, &memtype_seq_ops);
561}
562
563static const struct file_operations memtype_fops = {
564 .open = memtype_seq_open,
565 .read = seq_read,
566 .llseek = seq_lseek,
567 .release = seq_release,
568};
569
570static int __init pat_memtype_list_init(void)
571{
572 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
573 NULL, &memtype_fops);
574 return 0;
575}
576
577late_initcall(pat_memtype_list_init);
578
579#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index e515e8db842..d49202e740e 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o 6obj-$(CONFIG_PCI_OLPC) += olpc.o
7 7
8pci-y := fixup.o 8obj-y += fixup.o
9pci-$(CONFIG_ACPI) += acpi.o 9obj-$(CONFIG_ACPI) += acpi.o
10pci-y += legacy.o irq.o 10obj-y += legacy.o irq.o
11 11
12pci-$(CONFIG_X86_VISWS) += visws.o 12obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14pci-$(CONFIG_X86_NUMAQ) += numa.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += $(pci-y) common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 132876cc6fc..ec9ce35e44d 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
57 57
58int __init pci_subsys_init(void) 58int __init pci_subsys_init(void)
59{ 59{
60#ifdef CONFIG_X86_NUMAQ
61 pci_numaq_init();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 pci_acpi_init(); 64 pci_acpi_init();
62#endif 65#endif
66#ifdef CONFIG_X86_VISWS
67 pci_visws_init();
68#endif
63 pci_legacy_init(); 69 pci_legacy_init();
64 pcibios_irq_init(); 70 pcibios_irq_init();
65#ifdef CONFIG_X86_NUMAQ
66 pci_numa_init();
67#endif
68 pcibios_init(); 71 pcibios_init();
69 72
70 return 0; 73 return 0;
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c
index 8b5ca196673..f4b16dc11da 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numaq_32.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines 2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */ 3 */
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
@@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
151} 151}
152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
153 153
154int __init pci_numa_init(void) 154int __init pci_numaq_init(void)
155{ 155{
156 int quad; 156 int quad;
157 157
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 3e25deb821a..15b9cf6be72 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
108/* some common used subsys_initcalls */ 108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void); 109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void); 110extern int __init pcibios_irq_init(void);
111extern int __init pci_numa_init(void); 111extern int __init pci_visws_init(void);
112extern int __init pci_numaq_init(void);
112extern int __init pcibios_init(void); 113extern int __init pcibios_init(void);
113 114
114/* pci-mmconfig.c */ 115/* pci-mmconfig.c */
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 1a7bed492bb..42f4cb19fac 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
87} 87}
88 88
89static int __init pci_visws_init(void) 89int __init pci_visws_init(void)
90{ 90{
91 if (!is_visws_box())
92 return -1;
93
94 pcibios_enable_irq = &pci_visws_enable_irq;
95 pcibios_disable_irq = &pci_visws_disable_irq;
96
91 /* The VISWS supports configuration access type 1 only */ 97 /* The VISWS supports configuration access type 1 only */
92 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 98 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
93 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); 99 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
105 pcibios_resource_survey(); 111 pcibios_resource_survey();
106 return 0; 112 return 0;
107} 113}
108
109static __init int pci_subsys_init(void)
110{
111 if (!is_visws_box())
112 return -1;
113
114 pcibios_enable_irq = &pci_visws_enable_irq;
115 pcibios_disable_irq = &pci_visws_disable_irq;
116
117 pci_visws_init();
118 pcibios_init();
119
120 return 0;
121}
122subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21..4d6ef0a336d 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a51..513f330c583 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
207 202
208/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
209void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
226#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
227 222
228#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
229 225
230void enable_sep_cpu(void) 226void enable_sep_cpu(void)
231{ 227{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
296 gate_vma_init(); 292 gate_vma_init();
297#endif 293#endif
298 294
299 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
300 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
301 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
302 } else { 298 } else if (vdso32_sysenter()){
303 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
304 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
305 } 304 }
306 305
307 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab8..2ce5f82c333 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 19a6cfaf5db..257ba4a10ab 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
21extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
23 23
24struct page **vdso_pages; 24static struct page **vdso_pages;
25static unsigned vdso_size;
25 26
26static inline void *var_ref(void *p, char *name) 27static inline void *var_ref(void *p, char *name)
27{ 28{
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
38 int i; 39 int i;
39 char *vbase; 40 char *vbase;
40 41
42 vdso_size = npages << PAGE_SHIFT;
41 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
42 if (!vdso_pages) 44 if (!vdso_pages)
43 goto oom; 45 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
101 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
102 unsigned long addr; 104 unsigned long addr;
103 int ret; 105 int ret;
104 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
105 106
106 if (!vdso_enabled) 107 if (!vdso_enabled)
107 return 0; 108 return 0;
108 109
109 down_write(&mm->mmap_sem); 110 down_write(&mm->mmap_sem);
110 addr = vdso_addr(mm->start_stack, len); 111 addr = vdso_addr(mm->start_stack, vdso_size);
111 addr = get_unmapped_area(NULL, addr, len, 0, 0); 112 addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
112 if (IS_ERR_VALUE(addr)) { 113 if (IS_ERR_VALUE(addr)) {
113 ret = addr; 114 ret = addr;
114 goto up_fail; 115 goto up_fail;
115 } 116 }
116 117
117 ret = install_special_mapping(mm, addr, len, 118 ret = install_special_mapping(mm, addr, vdso_size,
118 VM_READ|VM_EXEC| 119 VM_READ|VM_EXEC|
119 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
120 VM_ALWAYSDUMP, 121 VM_ALWAYSDUMP,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc9958087..3815e425f47 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d164913..59c1e539aed 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o xen-asm.o grant-table.o suspend.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5328e46d9cf..194bbd6e324 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,12 +41,12 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
47#include <asm/reboot.h> 49#include <asm/reboot.h>
48#include <asm/pgalloc.h>
49 50
50#include "xen-ops.h" 51#include "xen-ops.h"
51#include "mmu.h" 52#include "mmu.h"
@@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58 59
59/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
60 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
61 * 74 *
62 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -367,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
367 380
368static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 381static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
369{ 382{
370 xen_mc_batch();
371
372 load_TLS_descriptor(t, cpu, 0);
373 load_TLS_descriptor(t, cpu, 1);
374 load_TLS_descriptor(t, cpu, 2);
375
376 xen_mc_issue(PARAVIRT_LAZY_CPU);
377
378 /* 383 /*
379 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 384 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
380 * it means we're in a context switch, and %gs has just been 385 * it means we're in a context switch, and %gs has just been
@@ -383,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
383 * Either way, it has been saved, and the new value will get 388 * Either way, it has been saved, and the new value will get
384 * loaded properly. This will go away as soon as Xen has been 389 * loaded properly. This will go away as soon as Xen has been
385 * modified to not save/restore %gs for normal hypercalls. 390 * modified to not save/restore %gs for normal hypercalls.
391 *
392 * On x86_64, this hack is not used for %gs, because gs points
393 * to KERNEL_GS_BASE (and uses it for PDA references), so we
394 * must not zero %gs on x86_64
395 *
396 * For x86_64, we need to zero %fs, otherwise we may get an
397 * exception between the new %fs descriptor being loaded and
398 * %fs being effectively cleared at __switch_to().
386 */ 399 */
387 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 400 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
401#ifdef CONFIG_X86_32
388 loadsegment(gs, 0); 402 loadsegment(gs, 0);
403#else
404 loadsegment(fs, 0);
405#endif
406 }
407
408 xen_mc_batch();
409
410 load_TLS_descriptor(t, cpu, 0);
411 load_TLS_descriptor(t, cpu, 1);
412 load_TLS_descriptor(t, cpu, 2);
413
414 xen_mc_issue(PARAVIRT_LAZY_CPU);
415}
416
417#ifdef CONFIG_X86_64
418static void xen_load_gs_index(unsigned int idx)
419{
420 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
421 BUG();
389} 422}
423#endif
390 424
391static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
392 const void *ptr) 426 const void *ptr)
@@ -404,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
404 preempt_enable(); 438 preempt_enable();
405} 439}
406 440
407static int cvt_gate_to_trap(int vector, u32 low, u32 high, 441static int cvt_gate_to_trap(int vector, const gate_desc *val,
408 struct trap_info *info) 442 struct trap_info *info)
409{ 443{
410 u8 type, dpl; 444 if (val->type != 0xf && val->type != 0xe)
411
412 type = (high >> 8) & 0x1f;
413 dpl = (high >> 13) & 3;
414
415 if (type != 0xf && type != 0xe)
416 return 0; 445 return 0;
417 446
418 info->vector = vector; 447 info->vector = vector;
419 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 448 info->address = gate_offset(*val);
420 info->cs = low >> 16; 449 info->cs = gate_segment(*val);
421 info->flags = dpl; 450 info->flags = val->dpl;
422 /* interrupt gates clear IF */ 451 /* interrupt gates clear IF */
423 if (type == 0xe) 452 if (val->type == 0xe)
424 info->flags |= 4; 453 info->flags |= 4;
425 454
426 return 1; 455 return 1;
@@ -447,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
447 476
448 if (p >= start && (p + 8) <= end) { 477 if (p >= start && (p + 8) <= end) {
449 struct trap_info info[2]; 478 struct trap_info info[2];
450 u32 *desc = (u32 *)g;
451 479
452 info[1].address = 0; 480 info[1].address = 0;
453 481
454 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 482 if (cvt_gate_to_trap(entrynum, g, &info[0]))
455 if (HYPERVISOR_set_trap_table(info)) 483 if (HYPERVISOR_set_trap_table(info))
456 BUG(); 484 BUG();
457 } 485 }
@@ -464,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
464{ 492{
465 unsigned in, out, count; 493 unsigned in, out, count;
466 494
467 count = (desc->size+1) / 8; 495 count = (desc->size+1) / sizeof(gate_desc);
468 BUG_ON(count > 256); 496 BUG_ON(count > 256);
469 497
470 for (in = out = 0; in < count; in++) { 498 for (in = out = 0; in < count; in++) {
471 const u32 *entry = (u32 *)(desc->address + in * 8); 499 gate_desc *entry = (gate_desc*)(desc->address) + in;
472 500
473 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 501 if (cvt_gate_to_trap(in, entry, &traps[out]))
474 out++; 502 out++;
475 } 503 }
476 traps[out].address = 0; 504 traps[out].address = 0;
@@ -699,33 +727,89 @@ static void set_current_cr3(void *v)
699 x86_write_percpu(xen_current_cr3, (unsigned long)v); 727 x86_write_percpu(xen_current_cr3, (unsigned long)v);
700} 728}
701 729
702static void xen_write_cr3(unsigned long cr3) 730static void __xen_write_cr3(bool kernel, unsigned long cr3)
703{ 731{
704 struct mmuext_op *op; 732 struct mmuext_op *op;
705 struct multicall_space mcs; 733 struct multicall_space mcs;
706 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 734 unsigned long mfn;
707 735
708 BUG_ON(preemptible()); 736 if (cr3)
737 mfn = pfn_to_mfn(PFN_DOWN(cr3));
738 else
739 mfn = 0;
709 740
710 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 741 WARN_ON(mfn == 0 && kernel);
711 742
712 /* Update while interrupts are disabled, so its atomic with 743 mcs = __xen_mc_entry(sizeof(*op));
713 respect to ipis */
714 x86_write_percpu(xen_cr3, cr3);
715 744
716 op = mcs.args; 745 op = mcs.args;
717 op->cmd = MMUEXT_NEW_BASEPTR; 746 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
718 op->arg1.mfn = mfn; 747 op->arg1.mfn = mfn;
719 748
720 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 749 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
721 750
722 /* Update xen_update_cr3 once the batch has actually 751 if (kernel) {
723 been submitted. */ 752 x86_write_percpu(xen_cr3, cr3);
724 xen_mc_callback(set_current_cr3, (void *)cr3); 753
754 /* Update xen_current_cr3 once the batch has actually
755 been submitted. */
756 xen_mc_callback(set_current_cr3, (void *)cr3);
757 }
758}
759
760static void xen_write_cr3(unsigned long cr3)
761{
762 BUG_ON(preemptible());
763
764 xen_mc_batch(); /* disables interrupts */
765
766 /* Update while interrupts are disabled, so its atomic with
767 respect to ipis */
768 x86_write_percpu(xen_cr3, cr3);
769
770 __xen_write_cr3(true, cr3);
771
772#ifdef CONFIG_X86_64
773 {
774 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
775 if (user_pgd)
776 __xen_write_cr3(false, __pa(user_pgd));
777 else
778 __xen_write_cr3(false, 0);
779 }
780#endif
725 781
726 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 782 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
727} 783}
728 784
785static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
786{
787 int ret;
788
789 ret = 0;
790
791 switch(msr) {
792#ifdef CONFIG_X86_64
793 unsigned which;
794 u64 base;
795
796 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
797 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
798 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
799
800 set:
801 base = ((u64)high << 32) | low;
802 if (HYPERVISOR_set_segment_base(which, base) != 0)
803 ret = -EFAULT;
804 break;
805#endif
806 default:
807 ret = native_write_msr_safe(msr, low, high);
808 }
809
810 return ret;
811}
812
729/* Early in boot, while setting up the initial pagetable, assume 813/* Early in boot, while setting up the initial pagetable, assume
730 everything is pinned. */ 814 everything is pinned. */
731static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 815static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -782,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
782 xen_alloc_ptpage(mm, pfn, PT_PMD); 866 xen_alloc_ptpage(mm, pfn, PT_PMD);
783} 867}
784 868
869static int xen_pgd_alloc(struct mm_struct *mm)
870{
871 pgd_t *pgd = mm->pgd;
872 int ret = 0;
873
874 BUG_ON(PagePinned(virt_to_page(pgd)));
875
876#ifdef CONFIG_X86_64
877 {
878 struct page *page = virt_to_page(pgd);
879 pgd_t *user_pgd;
880
881 BUG_ON(page->private != 0);
882
883 ret = -ENOMEM;
884
885 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
886 page->private = (unsigned long)user_pgd;
887
888 if (user_pgd != NULL) {
889 user_pgd[pgd_index(VSYSCALL_START)] =
890 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
891 ret = 0;
892 }
893
894 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
895 }
896#endif
897
898 return ret;
899}
900
901static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
902{
903#ifdef CONFIG_X86_64
904 pgd_t *user_pgd = xen_get_user_pgd(pgd);
905
906 if (user_pgd)
907 free_page((unsigned long)user_pgd);
908#endif
909}
910
785/* This should never happen until we're OK to use struct page */ 911/* This should never happen until we're OK to use struct page */
786static void xen_release_ptpage(u32 pfn, unsigned level) 912static void xen_release_ptpage(u32 pfn, unsigned level)
787{ 913{
@@ -807,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
807 xen_release_ptpage(pfn, PT_PMD); 933 xen_release_ptpage(pfn, PT_PMD);
808} 934}
809 935
936#if PAGETABLE_LEVELS == 4
937static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
938{
939 xen_alloc_ptpage(mm, pfn, PT_PUD);
940}
941
942static void xen_release_pud(u32 pfn)
943{
944 xen_release_ptpage(pfn, PT_PUD);
945}
946#endif
947
810#ifdef CONFIG_HIGHPTE 948#ifdef CONFIG_HIGHPTE
811static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 949static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
812{ 950{
@@ -845,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
845 983
846static __init void xen_pagetable_setup_start(pgd_t *base) 984static __init void xen_pagetable_setup_start(pgd_t *base)
847{ 985{
848 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
849 int i;
850
851 /* special set_pte for pagetable initialization */
852 pv_mmu_ops.set_pte = xen_set_pte_init;
853
854 init_mm.pgd = base;
855 /*
856 * copy top-level of Xen-supplied pagetable into place. This
857 * is a stand-in while we copy the pmd pages.
858 */
859 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
860
861 /*
862 * For PAE, need to allocate new pmds, rather than
863 * share Xen's, since Xen doesn't like pmd's being
864 * shared between address spaces.
865 */
866 for (i = 0; i < PTRS_PER_PGD; i++) {
867 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
868 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
869
870 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
871 PAGE_SIZE);
872
873 make_lowmem_page_readonly(pmd);
874
875 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
876 } else
877 pgd_clear(&base[i]);
878 }
879
880 /* make sure zero_page is mapped RO so we can use it in pagetables */
881 make_lowmem_page_readonly(empty_zero_page);
882 make_lowmem_page_readonly(base);
883 /*
884 * Switch to new pagetable. This is done before
885 * pagetable_init has done anything so that the new pages
886 * added to the table can be prepared properly for Xen.
887 */
888 xen_write_cr3(__pa(base));
889
890 /* Unpin initial Xen pagetable */
891 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
892 PFN_DOWN(__pa(xen_start_info->pt_base)));
893} 986}
894 987
895void xen_setup_shared_info(void) 988void xen_setup_shared_info(void)
896{ 989{
897 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 990 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
898 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 991 set_fixmap(FIX_PARAVIRT_BOOTMAP,
899 992 xen_start_info->shared_info);
900 /* 993
901 * Create a mapping for the shared info page. 994 HYPERVISOR_shared_info =
902 * Should be set_fixmap(), but shared_info is a machine 995 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
903 * address with no corresponding pseudo-phys address.
904 */
905 set_pte_mfn(addr,
906 PFN_DOWN(xen_start_info->shared_info),
907 PAGE_KERNEL);
908
909 HYPERVISOR_shared_info = (struct shared_info *)addr;
910 } else 996 } else
911 HYPERVISOR_shared_info = 997 HYPERVISOR_shared_info =
912 (struct shared_info *)__va(xen_start_info->shared_info); 998 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -921,26 +1007,32 @@ void xen_setup_shared_info(void)
921 1007
922static __init void xen_pagetable_setup_done(pgd_t *base) 1008static __init void xen_pagetable_setup_done(pgd_t *base)
923{ 1009{
924 /* This will work as long as patching hasn't happened yet
925 (which it hasn't) */
926 pv_mmu_ops.alloc_pte = xen_alloc_pte;
927 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
928 pv_mmu_ops.release_pte = xen_release_pte;
929 pv_mmu_ops.release_pmd = xen_release_pmd;
930 pv_mmu_ops.set_pte = xen_set_pte;
931
932 xen_setup_shared_info(); 1010 xen_setup_shared_info();
933
934 /* Actually pin the pagetable down, but we can't set PG_pinned
935 yet because the page structures don't exist yet. */
936 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
937} 1011}
938 1012
939static __init void xen_post_allocator_init(void) 1013static __init void xen_post_allocator_init(void)
940{ 1014{
1015 pv_mmu_ops.set_pte = xen_set_pte;
941 pv_mmu_ops.set_pmd = xen_set_pmd; 1016 pv_mmu_ops.set_pmd = xen_set_pmd;
942 pv_mmu_ops.set_pud = xen_set_pud; 1017 pv_mmu_ops.set_pud = xen_set_pud;
1018#if PAGETABLE_LEVELS == 4
1019 pv_mmu_ops.set_pgd = xen_set_pgd;
1020#endif
1021
1022 /* This will work as long as patching hasn't happened yet
1023 (which it hasn't) */
1024 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1025 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1026 pv_mmu_ops.release_pte = xen_release_pte;
1027 pv_mmu_ops.release_pmd = xen_release_pmd;
1028#if PAGETABLE_LEVELS == 4
1029 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1030 pv_mmu_ops.release_pud = xen_release_pud;
1031#endif
943 1032
1033#ifdef CONFIG_X86_64
1034 SetPagePinned(virt_to_page(level3_user_vsyscall));
1035#endif
944 xen_mark_init_mm_pinned(); 1036 xen_mark_init_mm_pinned();
945} 1037}
946 1038
@@ -954,6 +1046,7 @@ void xen_setup_vcpu_info_placement(void)
954 1046
955 /* xen_vcpu_setup managed to place the vcpu_info within the 1047 /* xen_vcpu_setup managed to place the vcpu_info within the
956 percpu area for all cpus, so make use of it */ 1048 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
957 if (have_vcpu_info_placement) { 1050 if (have_vcpu_info_placement) {
958 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1051 printk(KERN_INFO "Xen: using vcpu_info placement\n");
959 1052
@@ -963,6 +1056,7 @@ void xen_setup_vcpu_info_placement(void)
963 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1056 pv_irq_ops.irq_enable = xen_irq_enable_direct;
964 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
965 } 1058 }
1059#endif
966} 1060}
967 1061
968static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -983,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
983 goto patch_site 1077 goto patch_site
984 1078
985 switch (type) { 1079 switch (type) {
1080#ifdef CONFIG_X86_32
986 SITE(pv_irq_ops, irq_enable); 1081 SITE(pv_irq_ops, irq_enable);
987 SITE(pv_irq_ops, irq_disable); 1082 SITE(pv_irq_ops, irq_disable);
988 SITE(pv_irq_ops, save_fl); 1083 SITE(pv_irq_ops, save_fl);
989 SITE(pv_irq_ops, restore_fl); 1084 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
990#undef SITE 1086#undef SITE
991 1087
992 patch_site: 1088 patch_site:
@@ -1029,8 +1125,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1029#ifdef CONFIG_X86_F00F_BUG 1125#ifdef CONFIG_X86_F00F_BUG
1030 case FIX_F00F_IDT: 1126 case FIX_F00F_IDT:
1031#endif 1127#endif
1128#ifdef CONFIG_X86_32
1032 case FIX_WP_TEST: 1129 case FIX_WP_TEST:
1033 case FIX_VDSO: 1130 case FIX_VDSO:
1131# ifdef CONFIG_HIGHMEM
1132 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1133# endif
1134#else
1135 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1136#endif
1034#ifdef CONFIG_X86_LOCAL_APIC 1137#ifdef CONFIG_X86_LOCAL_APIC
1035 case FIX_APIC_BASE: /* maps dummy local APIC */ 1138 case FIX_APIC_BASE: /* maps dummy local APIC */
1036#endif 1139#endif
@@ -1043,6 +1146,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1043 } 1146 }
1044 1147
1045 __native_set_fixmap(idx, pte); 1148 __native_set_fixmap(idx, pte);
1149
1150#ifdef CONFIG_X86_64
1151 /* Replicate changes to map the vsyscall page into the user
1152 pagetable vsyscall mapping. */
1153 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1154 unsigned long vaddr = __fix_to_virt(idx);
1155 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1156 }
1157#endif
1046} 1158}
1047 1159
1048static const struct pv_info xen_info __initdata = { 1160static const struct pv_info xen_info __initdata = {
@@ -1088,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1088 .wbinvd = native_wbinvd, 1200 .wbinvd = native_wbinvd,
1089 1201
1090 .read_msr = native_read_msr_safe, 1202 .read_msr = native_read_msr_safe,
1091 .write_msr = native_write_msr_safe, 1203 .write_msr = xen_write_msr_safe,
1092 .read_tsc = native_read_tsc, 1204 .read_tsc = native_read_tsc,
1093 .read_pmc = native_read_pmc, 1205 .read_pmc = native_read_pmc,
1094 1206
1095 .iret = xen_iret, 1207 .iret = xen_iret,
1096 .irq_enable_sysexit = xen_sysexit, 1208 .irq_enable_sysexit = xen_sysexit,
1209#ifdef CONFIG_X86_64
1210 .usergs_sysret32 = xen_sysret32,
1211 .usergs_sysret64 = xen_sysret64,
1212#endif
1097 1213
1098 .load_tr_desc = paravirt_nop, 1214 .load_tr_desc = paravirt_nop,
1099 .set_ldt = xen_set_ldt, 1215 .set_ldt = xen_set_ldt,
1100 .load_gdt = xen_load_gdt, 1216 .load_gdt = xen_load_gdt,
1101 .load_idt = xen_load_idt, 1217 .load_idt = xen_load_idt,
1102 .load_tls = xen_load_tls, 1218 .load_tls = xen_load_tls,
1219#ifdef CONFIG_X86_64
1220 .load_gs_index = xen_load_gs_index,
1221#endif
1103 1222
1104 .store_gdt = native_store_gdt, 1223 .store_gdt = native_store_gdt,
1105 .store_idt = native_store_idt, 1224 .store_idt = native_store_idt,
@@ -1113,14 +1232,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1113 .set_iopl_mask = xen_set_iopl_mask, 1232 .set_iopl_mask = xen_set_iopl_mask,
1114 .io_delay = xen_io_delay, 1233 .io_delay = xen_io_delay,
1115 1234
1235 /* Xen takes care of %gs when switching to usermode for us */
1236 .swapgs = paravirt_nop,
1237
1116 .lazy_mode = { 1238 .lazy_mode = {
1117 .enter = paravirt_enter_lazy_cpu, 1239 .enter = paravirt_enter_lazy_cpu,
1118 .leave = xen_leave_lazy, 1240 .leave = xen_leave_lazy,
1119 }, 1241 },
1120}; 1242};
1121 1243
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1122static const struct pv_irq_ops xen_irq_ops __initdata = { 1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1123 .init_IRQ = xen_init_IRQ, 1262 .init_IRQ = __xen_init_IRQ,
1124 .save_fl = xen_save_fl, 1263 .save_fl = xen_save_fl,
1125 .restore_fl = xen_restore_fl, 1264 .restore_fl = xen_restore_fl,
1126 .irq_disable = xen_irq_disable, 1265 .irq_disable = xen_irq_disable,
@@ -1128,14 +1267,13 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
1128 .safe_halt = xen_safe_halt, 1267 .safe_halt = xen_safe_halt,
1129 .halt = xen_halt, 1268 .halt = xen_halt,
1130#ifdef CONFIG_X86_64 1269#ifdef CONFIG_X86_64
1131 .adjust_exception_frame = paravirt_nop, 1270 .adjust_exception_frame = xen_adjust_exception_frame,
1132#endif 1271#endif
1133}; 1272};
1134 1273
1135static const struct pv_apic_ops xen_apic_ops __initdata = { 1274static const struct pv_apic_ops xen_apic_ops __initdata = {
1136#ifdef CONFIG_X86_LOCAL_APIC 1275#ifdef CONFIG_X86_LOCAL_APIC
1137 .apic_write = xen_apic_write, 1276 .apic_write = xen_apic_write,
1138 .apic_write_atomic = xen_apic_write,
1139 .apic_read = xen_apic_read, 1277 .apic_read = xen_apic_read,
1140 .setup_boot_clock = paravirt_nop, 1278 .setup_boot_clock = paravirt_nop,
1141 .setup_secondary_clock = paravirt_nop, 1279 .setup_secondary_clock = paravirt_nop,
@@ -1161,8 +1299,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1161 .pte_update = paravirt_nop, 1299 .pte_update = paravirt_nop,
1162 .pte_update_defer = paravirt_nop, 1300 .pte_update_defer = paravirt_nop,
1163 1301
1164 .pgd_alloc = __paravirt_pgd_alloc, 1302 .pgd_alloc = xen_pgd_alloc,
1165 .pgd_free = paravirt_nop, 1303 .pgd_free = xen_pgd_free,
1166 1304
1167 .alloc_pte = xen_alloc_pte_init, 1305 .alloc_pte = xen_alloc_pte_init,
1168 .release_pte = xen_release_pte_init, 1306 .release_pte = xen_release_pte_init,
@@ -1174,7 +1312,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1174 .kmap_atomic_pte = xen_kmap_atomic_pte, 1312 .kmap_atomic_pte = xen_kmap_atomic_pte,
1175#endif 1313#endif
1176 1314
1177 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1315#ifdef CONFIG_X86_64
1316 .set_pte = xen_set_pte,
1317#else
1318 .set_pte = xen_set_pte_init,
1319#endif
1178 .set_pte_at = xen_set_pte_at, 1320 .set_pte_at = xen_set_pte_at,
1179 .set_pmd = xen_set_pmd_hyper, 1321 .set_pmd = xen_set_pmd_hyper,
1180 1322
@@ -1188,15 +1330,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1188 .make_pte = xen_make_pte, 1330 .make_pte = xen_make_pte,
1189 .make_pgd = xen_make_pgd, 1331 .make_pgd = xen_make_pgd,
1190 1332
1333#ifdef CONFIG_X86_PAE
1191 .set_pte_atomic = xen_set_pte_atomic, 1334 .set_pte_atomic = xen_set_pte_atomic,
1192 .set_pte_present = xen_set_pte_at, 1335 .set_pte_present = xen_set_pte_at,
1193 .set_pud = xen_set_pud_hyper,
1194 .pte_clear = xen_pte_clear, 1336 .pte_clear = xen_pte_clear,
1195 .pmd_clear = xen_pmd_clear, 1337 .pmd_clear = xen_pmd_clear,
1338#endif /* CONFIG_X86_PAE */
1339 .set_pud = xen_set_pud_hyper,
1196 1340
1197 .make_pmd = xen_make_pmd, 1341 .make_pmd = xen_make_pmd,
1198 .pmd_val = xen_pmd_val, 1342 .pmd_val = xen_pmd_val,
1199 1343
1344#if PAGETABLE_LEVELS == 4
1345 .pud_val = xen_pud_val,
1346 .make_pud = xen_make_pud,
1347 .set_pgd = xen_set_pgd_hyper,
1348
1349 .alloc_pud = xen_alloc_pte_init,
1350 .release_pud = xen_release_pte_init,
1351#endif /* PAGETABLE_LEVELS == 4 */
1352
1200 .activate_mm = xen_activate_mm, 1353 .activate_mm = xen_activate_mm,
1201 .dup_mmap = xen_dup_mmap, 1354 .dup_mmap = xen_dup_mmap,
1202 .exit_mmap = xen_exit_mmap, 1355 .exit_mmap = xen_exit_mmap,
@@ -1209,21 +1362,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1209 .set_fixmap = xen_set_fixmap, 1362 .set_fixmap = xen_set_fixmap,
1210}; 1363};
1211 1364
1212#ifdef CONFIG_SMP
1213static const struct smp_ops xen_smp_ops __initdata = {
1214 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1215 .smp_prepare_cpus = xen_smp_prepare_cpus,
1216 .cpu_up = xen_cpu_up,
1217 .smp_cpus_done = xen_smp_cpus_done,
1218
1219 .smp_send_stop = xen_smp_send_stop,
1220 .smp_send_reschedule = xen_smp_send_reschedule,
1221
1222 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1223 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1224};
1225#endif /* CONFIG_SMP */
1226
1227static void xen_reboot(int reason) 1365static void xen_reboot(int reason)
1228{ 1366{
1229 struct sched_shutdown r = { .reason = reason }; 1367 struct sched_shutdown r = { .reason = reason };
@@ -1268,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1268 1406
1269static void __init xen_reserve_top(void) 1407static void __init xen_reserve_top(void)
1270{ 1408{
1409#ifdef CONFIG_X86_32
1271 unsigned long top = HYPERVISOR_VIRT_START; 1410 unsigned long top = HYPERVISOR_VIRT_START;
1272 struct xen_platform_parameters pp; 1411 struct xen_platform_parameters pp;
1273 1412
@@ -1275,7 +1414,247 @@ static void __init xen_reserve_top(void)
1275 top = pp.virt_start; 1414 top = pp.virt_start;
1276 1415
1277 reserve_top_address(-top + 2 * PAGE_SIZE); 1416 reserve_top_address(-top + 2 * PAGE_SIZE);
1417#endif /* CONFIG_X86_32 */
1418}
1419
1420/*
1421 * Like __va(), but returns address in the kernel mapping (which is
1422 * all we have until the physical memory mapping has been set up.
1423 */
1424static void *__ka(phys_addr_t paddr)
1425{
1426#ifdef CONFIG_X86_64
1427 return (void *)(paddr + __START_KERNEL_map);
1428#else
1429 return __va(paddr);
1430#endif
1431}
1432
1433/* Convert a machine address to physical address */
1434static unsigned long m2p(phys_addr_t maddr)
1435{
1436 phys_addr_t paddr;
1437
1438 maddr &= PTE_MASK;
1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1440
1441 return paddr;
1442}
1443
1444/* Convert a machine address to kernel virtual */
1445static void *m2v(phys_addr_t maddr)
1446{
1447 return __ka(m2p(maddr));
1448}
1449
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot)
1484{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot);
1487
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG();
1494}
1495
1496static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1497{
1498 unsigned pmdidx, pteidx;
1499 unsigned ident_pte;
1500 unsigned long pfn;
1501
1502 ident_pte = 0;
1503 pfn = 0;
1504 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1505 pte_t *pte_page;
1506
1507 /* Reuse or allocate a page of ptes */
1508 if (pmd_present(pmd[pmdidx]))
1509 pte_page = m2v(pmd[pmdidx].pmd);
1510 else {
1511 /* Check for free pte pages */
1512 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1513 break;
1514
1515 pte_page = &level1_ident_pgt[ident_pte];
1516 ident_pte += PTRS_PER_PTE;
1517
1518 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1519 }
1520
1521 /* Install mappings */
1522 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1523 pte_t pte;
1524
1525 if (pfn > max_pfn_mapped)
1526 max_pfn_mapped = pfn;
1527
1528 if (!pte_none(pte_page[pteidx]))
1529 continue;
1530
1531 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1532 pte_page[pteidx] = pte;
1533 }
1534 }
1535
1536 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1537 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1538
1539 set_page_prot(pmd, PAGE_KERNEL_RO);
1540}
1541
1542#ifdef CONFIG_X86_64
1543static void convert_pfn_mfn(void *v)
1544{
1545 pte_t *pte = v;
1546 int i;
1547
1548 /* All levels are converted the same way, so just treat them
1549 as ptes. */
1550 for(i = 0; i < PTRS_PER_PTE; i++)
1551 pte[i] = xen_make_pte(pte[i].pte);
1552}
1553
1554/*
1555 * Set up the inital kernel pagetable.
1556 *
1557 * We can construct this by grafting the Xen provided pagetable into
1558 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1559 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1560 * means that only the kernel has a physical mapping to start with -
1561 * but that's enough to get __va working. We need to fill in the rest
1562 * of the physical mapping once some sort of allocator has been set
1563 * up.
1564 */
1565static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1566{
1567 pud_t *l3;
1568 pmd_t *l2;
1569
1570 /* Zap identity mapping */
1571 init_level4_pgt[0] = __pgd(0);
1572
1573 /* Pre-constructed entries are in pfn, so convert to mfn */
1574 convert_pfn_mfn(init_level4_pgt);
1575 convert_pfn_mfn(level3_ident_pgt);
1576 convert_pfn_mfn(level3_kernel_pgt);
1577
1578 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1579 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1580
1581 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1582 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1583
1584 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1585 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1586 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1587
1588 /* Set up identity map */
1589 xen_map_identity_early(level2_ident_pgt, max_pfn);
1590
1591 /* Make pagetable pieces RO */
1592 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1593 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1595 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1596 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1597 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1598
1599 /* Pin down new L4 */
1600 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1601 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1602
1603 /* Unpin Xen-provided one */
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 /* Switch over */
1607 pgd = init_level4_pgt;
1608
1609 /*
1610 * At this stage there can be no user pgd, and no page
1611 * structure to attach it to, so make sure we just set kernel
1612 * pgd.
1613 */
1614 xen_mc_batch();
1615 __xen_write_cr3(true, __pa(pgd));
1616 xen_mc_issue(PARAVIRT_LAZY_CPU);
1617
1618 reserve_early(__pa(xen_start_info->pt_base),
1619 __pa(xen_start_info->pt_base +
1620 xen_start_info->nr_pt_frames * PAGE_SIZE),
1621 "XEN PAGETABLES");
1622
1623 return pgd;
1624}
1625#else /* !CONFIG_X86_64 */
1626static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1627
1628static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1629{
1630 pmd_t *kernel_pmd;
1631
1632 init_pg_tables_start = __pa(pgd);
1633 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1634 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1635
1636 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1637 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1638
1639 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1640
1641 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1642 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1643 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1644
1645 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1646 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1647 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1648
1649 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1650
1651 xen_write_cr3(__pa(swapper_pg_dir));
1652
1653 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1654
1655 return swapper_pg_dir;
1278} 1656}
1657#endif /* CONFIG_X86_64 */
1279 1658
1280/* First C function to be called on Xen boot */ 1659/* First C function to be called on Xen boot */
1281asmlinkage void __init xen_start_kernel(void) 1660asmlinkage void __init xen_start_kernel(void)
@@ -1305,53 +1684,56 @@ asmlinkage void __init xen_start_kernel(void)
1305 1684
1306 machine_ops = xen_machine_ops; 1685 machine_ops = xen_machine_ops;
1307 1686
1308#ifdef CONFIG_SMP 1687#ifdef CONFIG_X86_64
1309 smp_ops = xen_smp_ops; 1688 /* Disable until direct per-cpu data access. */
1689 have_vcpu_info_placement = 0;
1690 x86_64_init_pda();
1310#endif 1691#endif
1311 1692
1693 xen_smp_init();
1694
1312 /* Get mfn list */ 1695 /* Get mfn list */
1313 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1696 if (!xen_feature(XENFEAT_auto_translated_physmap))
1314 xen_build_dynamic_phys_to_machine(); 1697 xen_build_dynamic_phys_to_machine();
1315 1698
1316 pgd = (pgd_t *)xen_start_info->pt_base; 1699 pgd = (pgd_t *)xen_start_info->pt_base;
1317 1700
1318 init_pg_tables_start = __pa(pgd); 1701 /* Prevent unwanted bits from being set in PTEs. */
1319 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1702 __supported_pte_mask &= ~_PAGE_GLOBAL;
1320 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1703 if (!is_initial_xendomain())
1321 1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1322 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1323
1324 /* keep using Xen gdt for now; no urgent need to change it */
1325
1326 x86_write_percpu(xen_cr3, __pa(pgd));
1327 x86_write_percpu(xen_current_cr3, __pa(pgd));
1328 1705
1329 /* Don't do the full vcpu_info placement stuff until we have a 1706 /* Don't do the full vcpu_info placement stuff until we have a
1330 possible map and a non-dummy shared_info. */ 1707 possible map and a non-dummy shared_info. */
1331 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1708 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1332 1709
1710 xen_raw_console_write("mapping kernel into physical memory\n");
1711 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1712
1713 init_mm.pgd = pgd;
1714
1715 /* keep using Xen gdt for now; no urgent need to change it */
1716
1333 pv_info.kernel_rpl = 1; 1717 pv_info.kernel_rpl = 1;
1334 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1718 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1335 pv_info.kernel_rpl = 0; 1719 pv_info.kernel_rpl = 0;
1336 1720
1337 /* Prevent unwanted bits from being set in PTEs. */
1338 __supported_pte_mask &= ~_PAGE_GLOBAL;
1339 if (!is_initial_xendomain())
1340 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1341
1342 /* set the limit of our address space */ 1721 /* set the limit of our address space */
1343 xen_reserve_top(); 1722 xen_reserve_top();
1344 1723
1724#ifdef CONFIG_X86_32
1345 /* set up basic CPUID stuff */ 1725 /* set up basic CPUID stuff */
1346 cpu_detect(&new_cpu_data); 1726 cpu_detect(&new_cpu_data);
1347 new_cpu_data.hard_math = 1; 1727 new_cpu_data.hard_math = 1;
1348 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1728 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1729#endif
1349 1730
1350 /* Poke various useful things into boot_params */ 1731 /* Poke various useful things into boot_params */
1351 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1732 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1352 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1733 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1353 ? __pa(xen_start_info->mod_start) : 0; 1734 ? __pa(xen_start_info->mod_start) : 0;
1354 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1355 1737
1356 if (!is_initial_xendomain()) { 1738 if (!is_initial_xendomain()) {
1357 add_preferred_console("xenboot", 0, NULL); 1739 add_preferred_console("xenboot", 0, NULL);
@@ -1359,6 +1741,21 @@ asmlinkage void __init xen_start_kernel(void)
1359 add_preferred_console("hvc", 0, NULL); 1741 add_preferred_console("hvc", 0, NULL);
1360 } 1742 }
1361 1743
1744 xen_raw_console_write("about to get started...\n");
1745
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1754
1362 /* Start the world */ 1755 /* Start the world */
1756#ifdef CONFIG_X86_32
1363 i386_start_kernel(); 1757 i386_start_kernel();
1758#else
1759 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1760#endif
1364} 1761}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa..a44d56e38bd 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 70
62/* Placeholder for holes in the address space */ 71/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 74
67 /* Array of pointers to pages containing p2m entries */ 75 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 78
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 79/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 81
76static unsigned long p2m_top_mfn_list[ 82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 83 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 84
80static inline unsigned p2m_top_index(unsigned long pfn) 85static inline unsigned p2m_top_index(unsigned long pfn)
81{ 86{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 186 p2m_top[topidx][idx] = mfn;
182} 187}
183 188
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 190{
191 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 192 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
189 195
190 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
191 197
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 199}
194 200
195void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 262
257 xen_mc_batch(); 263 xen_mc_batch();
258 264
259 u.ptr = virt_to_machine(ptr).maddr; 265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 267 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 268 extend_mmu_update(&u);
262 269
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 290 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 292{
286 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 294}
316 295
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 397
419 xen_mc_batch(); 398 xen_mc_batch();
420 399
421 u.ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 402 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 403 extend_mmu_update(&u);
424 404
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
441 421
442void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 423{
424#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 426 smp_wmb();
446 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
447} 431}
448 432
433#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 435{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 437}
453 438
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 447{
463 set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
464} 449}
450#endif /* CONFIG_X86_PAE */
465 451
466pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
467{ 453{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
470} 456}
471 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
496/*
497 * Raw hypercall-based set_pgd, intended for in early boot before
498 * there's a page structure. This implies:
499 * 1. The only existing pagetable is the kernel's
500 * 2. It is always pinned
501 * 3. It has no user pagetable attached to it
502 */
503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
504{
505 preempt_disable();
506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
472/* 543/*
473 (Yet another) pagetable walker. This one is intended for pinning a 544 * (Yet another) pagetable walker. This one is intended for pinning a
474 pagetable. This means that it walks a pagetable and calls the 545 * pagetable. This means that it walks a pagetable and calls the
475 callback function on each page it finds making up the page table, 546 * callback function on each page it finds making up the page table,
476 at every level. It walks the entire pagetable, but it only bothers 547 * at every level. It walks the entire pagetable, but it only bothers
477 pinning pte pages which are below pte_limit. In the normal case 548 * pinning pte pages which are below limit. In the normal case this
478 this will be TASK_SIZE, but at boot we need to pin up to 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 550 * FIXADDR_TOP.
480 there, because then we start getting into Xen's ptes. 551 *
481*/ 552 * For 32-bit the important bit is that we don't pin beyond there,
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
483 unsigned long limit) 559 unsigned long limit)
484{ 560{
485 pgd_t *pgd = pgd_base;
486 int flush = 0; 561 int flush = 0;
487 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
489 565
490 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
491 569
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 571 return 0;
494 572
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole and so is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 596 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 597
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
500 600
501 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
502 continue; 602 continue;
503 603
504 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
505 605
506 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
508 608
509 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 610 pmd_t *pmd;
511 unsigned long pmd_limit;
512 611
513 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
514 613 pudidx > pudidx_limit)
515 if (pud_next < limit) 614 goto out;
516 pmd_limit = pud_next;
517 else
518 pmd_limit = limit;
519 615
520 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
521 continue; 617 continue;
522 618
523 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
524 620
525 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
527 623
528 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
530 if ((pmd_limit-1) < (addr-1)) { 626
531 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
532 break; 628 pudidx == pudidx_limit &&
533 } 629 pmdidx > pmdidx_limit)
630 goto out;
534 631
535 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
536 continue; 633 continue;
537 634
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
539 } 637 }
540 } 638 }
541 } 639 }
542 640out:
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
544 641
545 return flush; 642 return flush;
546} 643}
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
622{ 719{
623 xen_mc_batch(); 720 xen_mc_batch();
624 721
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 724 xen_mc_issue(0);
628 kmap_flush_unused(); 725 kmap_flush_unused();
629 xen_mc_batch(); 726 xen_mc_batch();
630 } 727 }
631 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 747 xen_mc_issue(0);
634} 748}
635 749
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 770 spin_unlock_irqrestore(&pgd_lock, flags);
657} 771}
658 772
659/* The init_mm pagetable is really pinned as soon as its created, but 773/*
660 that's before we have page structures to store the bits. So do all 774 * The init_mm pagetable is really pinned as soon as its created, but
661 the book-keeping now. */ 775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
662static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
663{ 779{
664 SetPagePinned(page); 780 SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
708 824
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 826
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
712 844
713 xen_mc_issue(0); 845 xen_mc_issue(0);
714} 846}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 859 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 860 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 861 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page)); 862 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 863 ClearPageSavePinned(page);
733 } 864 }
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
758{ 889{
759 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
760 898
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
763 901
764 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8b..0f59bd03f9e 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 32void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 43void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 46
53void xen_pmd_clear(pmd_t *pmdp); 47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 55
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed..9efd1c6c977 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde..b6acc3a0af4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7..f702199312a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
66 int cpu = smp_processor_id(); 66 int cpu = smp_processor_id();
67 67
68 cpu_init(); 68 cpu_init();
69 preempt_disable();
70
69 xen_enable_sysenter(); 71 xen_enable_sysenter();
72 xen_enable_syscall();
70 73
71 preempt_disable(); 74 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 75 smp_store_cpu_info(cpu);
76 cpu_data(cpu).x86_max_cores = 1;
77 set_cpu_sibling_map(cpu);
73 78
74 xen_setup_cpu_clockevents(); 79 xen_setup_cpu_clockevents();
75 80
81 cpu_set(cpu, cpu_online_map);
82 x86_write_percpu(cpu_state, CPU_ONLINE);
83 wmb();
84
76 /* We can take interrupts now: we're officially "up". */ 85 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 86 local_irq_enable();
78 87
@@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 150 return rc;
142} 151}
143 152
144void __init xen_fill_possible_map(void) 153static void __init xen_fill_possible_map(void)
145{ 154{
146 int i, rc; 155 int i, rc;
147 156
148 for (i = 0; i < NR_CPUS; i++) { 157 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 158 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 159 if (rc >= 0) {
160 num_processors++;
151 cpu_set(i, cpu_possible_map); 161 cpu_set(i, cpu_possible_map);
162 }
152 } 163 }
153} 164}
154 165
155void __init xen_smp_prepare_boot_cpu(void) 166static void __init xen_smp_prepare_boot_cpu(void)
156{ 167{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 168 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 169 native_smp_prepare_boot_cpu();
161 170
162 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 172 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 173 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 174
176 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
177} 176}
178 177
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 178static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 179{
181 unsigned cpu; 180 unsigned cpu;
182 181
183 for_each_possible_cpu(cpu) {
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192
193 smp_store_cpu_info(0); 182 smp_store_cpu_info(0);
183 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 184 set_cpu_sibling_map(0);
195 185
196 if (xen_smp_intr_init(0)) 186 if (xen_smp_intr_init(0))
@@ -225,7 +215,7 @@ static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 215cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 216{
227 struct vcpu_guest_context *ctxt; 217 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 218 struct desc_struct *gdt;
229 219
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 220 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 221 return 0;
@@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
234 if (ctxt == NULL) 224 if (ctxt == NULL)
235 return -ENOMEM; 225 return -ENOMEM;
236 226
227 gdt = get_cpu_gdt_table(cpu);
228
237 ctxt->flags = VGCF_IN_KERNEL; 229 ctxt->flags = VGCF_IN_KERNEL;
238 ctxt->user_regs.ds = __USER_DS; 230 ctxt->user_regs.ds = __USER_DS;
239 ctxt->user_regs.es = __USER_DS; 231 ctxt->user_regs.es = __USER_DS;
240 ctxt->user_regs.fs = __KERNEL_PERCPU;
241 ctxt->user_regs.gs = 0;
242 ctxt->user_regs.ss = __KERNEL_DS; 232 ctxt->user_regs.ss = __KERNEL_DS;
233#ifdef CONFIG_X86_32
234 ctxt->user_regs.fs = __KERNEL_PERCPU;
235#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
245 238
@@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
249 242
250 ctxt->ldt_ents = 0; 243 ctxt->ldt_ents = 0;
251 244
252 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 245 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
253 make_lowmem_page_readonly(gdt->gdt); 246 make_lowmem_page_readonly(gdt);
254 247
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 248 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
256 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 249 ctxt->gdt_ents = GDT_ENTRIES;
257 250
258 ctxt->user_regs.cs = __KERNEL_CS; 251 ctxt->user_regs.cs = __KERNEL_CS;
259 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
261 ctxt->kernel_ss = __KERNEL_DS; 254 ctxt->kernel_ss = __KERNEL_DS;
262 ctxt->kernel_sp = idle->thread.sp0; 255 ctxt->kernel_sp = idle->thread.sp0;
263 256
257#ifdef CONFIG_X86_32
264 ctxt->event_callback_cs = __KERNEL_CS; 258 ctxt->event_callback_cs = __KERNEL_CS;
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
266 ctxt->failsafe_callback_cs = __KERNEL_CS; 259 ctxt->failsafe_callback_cs = __KERNEL_CS;
260#endif
261 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 262 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
268 263
269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 264 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
276 return 0; 271 return 0;
277} 272}
278 273
279int __cpuinit xen_cpu_up(unsigned int cpu) 274static int __cpuinit xen_cpu_up(unsigned int cpu)
280{ 275{
281 struct task_struct *idle = idle_task(cpu); 276 struct task_struct *idle = idle_task(cpu);
282 int rc; 277 int rc;
@@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
287 return rc; 282 return rc;
288#endif 283#endif
289 284
285#ifdef CONFIG_X86_64
286 /* Allocate node local memory for AP pdas */
287 WARN_ON(cpu == 0);
288 if (cpu > 0) {
289 rc = get_local_pda(cpu);
290 if (rc)
291 return rc;
292 }
293#endif
294
295#ifdef CONFIG_X86_32
290 init_gdt(cpu); 296 init_gdt(cpu);
291 per_cpu(current_task, cpu) = idle; 297 per_cpu(current_task, cpu) = idle;
292 irq_ctx_init(cpu); 298 irq_ctx_init(cpu);
299#else
300 cpu_pda(cpu)->pcurrent = idle;
301 clear_tsk_thread_flag(idle, TIF_FORK);
302#endif
293 xen_setup_timer(cpu); 303 xen_setup_timer(cpu);
294 304
305 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
306
295 /* make sure interrupts start blocked */ 307 /* make sure interrupts start blocked */
296 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 308 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
297 309
@@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
306 if (rc) 318 if (rc)
307 return rc; 319 return rc;
308 320
309 smp_store_cpu_info(cpu);
310 set_cpu_sibling_map(cpu);
311 /* This must be done before setting cpu_online_map */
312 wmb();
313
314 cpu_set(cpu, cpu_online_map);
315
316 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 321 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
317 BUG_ON(rc); 322 BUG_ON(rc);
318 323
324 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
325 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
326 barrier();
327 }
328
319 return 0; 329 return 0;
320} 330}
321 331
322void xen_smp_cpus_done(unsigned int max_cpus) 332static void xen_smp_cpus_done(unsigned int max_cpus)
323{ 333{
324} 334}
325 335
@@ -335,12 +345,12 @@ static void stop_self(void *v)
335 BUG(); 345 BUG();
336} 346}
337 347
338void xen_smp_send_stop(void) 348static void xen_smp_send_stop(void)
339{ 349{
340 smp_call_function(stop_self, NULL, 0); 350 smp_call_function(stop_self, NULL, 0);
341} 351}
342 352
343void xen_smp_send_reschedule(int cpu) 353static void xen_smp_send_reschedule(int cpu)
344{ 354{
345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 355 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
346} 356}
@@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
355 xen_send_IPI_one(cpu, vector); 365 xen_send_IPI_one(cpu, vector);
356} 366}
357 367
358void xen_smp_send_call_function_ipi(cpumask_t mask) 368static void xen_smp_send_call_function_ipi(cpumask_t mask)
359{ 369{
360 int cpu; 370 int cpu;
361 371
@@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
370 } 380 }
371} 381}
372 382
373void xen_smp_send_call_function_single_ipi(int cpu) 383static void xen_smp_send_call_function_single_ipi(int cpu)
374{ 384{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 385 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376} 386}
@@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
379{ 389{
380 irq_enter(); 390 irq_enter();
381 generic_smp_call_function_interrupt(); 391 generic_smp_call_function_interrupt();
392#ifdef CONFIG_X86_32
382 __get_cpu_var(irq_stat).irq_call_count++; 393 __get_cpu_var(irq_stat).irq_call_count++;
394#else
395 add_pda(irq_call_count, 1);
396#endif
383 irq_exit(); 397 irq_exit();
384 398
385 return IRQ_HANDLED; 399 return IRQ_HANDLED;
@@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
389{ 403{
390 irq_enter(); 404 irq_enter();
391 generic_smp_call_function_single_interrupt(); 405 generic_smp_call_function_single_interrupt();
406#ifdef CONFIG_X86_32
392 __get_cpu_var(irq_stat).irq_call_count++; 407 __get_cpu_var(irq_stat).irq_call_count++;
408#else
409 add_pda(irq_call_count, 1);
410#endif
393 irq_exit(); 411 irq_exit();
394 412
395 return IRQ_HANDLED; 413 return IRQ_HANDLED;
396} 414}
415
416static const struct smp_ops xen_smp_ops __initdata = {
417 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
418 .smp_prepare_cpus = xen_smp_prepare_cpus,
419 .cpu_up = xen_cpu_up,
420 .smp_cpus_done = xen_smp_cpus_done,
421
422 .smp_send_stop = xen_smp_send_stop,
423 .smp_send_reschedule = xen_smp_send_reschedule,
424
425 .send_call_func_ipi = xen_smp_send_call_function_ipi,
426 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
427};
428
429void __init xen_smp_init(void)
430{
431 smp_ops = xen_smp_ops;
432 xen_fill_possible_map();
433}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d..2a234db5949 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
38 xen_cpu_initialized_map = cpu_online_map; 38 xen_cpu_initialized_map = cpu_online_map;
39#endif 39#endif
40 xen_vcpu_restore(); 40 xen_vcpu_restore();
41 xen_timer_resume();
42 } 41 }
43 42
44} 43}
45 44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41d..2497a30f41d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 00000000000..4038cbfe333
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
34 event status with one and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $VGCF_in_syscall
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0..63d49a523ed 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
10#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
11 15
12 __INIT 16 __INIT
13ENTRY(startup_xen) 17ENTRY(startup_xen)
14 movl %esi,xen_start_info
15 cld 18 cld
16 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
17 jmp xen_start_kernel 26 jmp xen_start_kernel
18 27
19 __FINIT 28 __FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
21.pushsection .text 30.pushsection .text
22 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
23ENTRY(hypercall_page) 32ENTRY(hypercall_page)
24 .skip 0x1000 33 .skip PAGE_SIZE_asm
25.popsection 34.popsection
26 35
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
28 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
29 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
30 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
31 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
32 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
33 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
36 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, 49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
37 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) 50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
38 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) 51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
39 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) 52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
40 54
41#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c..dd3c23152a2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
26void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
27void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
28void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
29void xen_vcpu_restore(void); 30void xen_vcpu_restore(void);
30 31
31void __init xen_build_dynamic_phys_to_machine(void); 32void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
37unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
38int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
39unsigned long long xen_sched_clock(void); 40unsigned long long xen_sched_clock(void);
40void xen_timer_resume(void);
41 41
42irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 42irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
43 43
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
45 45
46void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
47 47
48void __init xen_fill_possible_map(void);
49
50void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
51void xen_smp_prepare_boot_cpu(void);
52void xen_smp_prepare_cpus(unsigned int max_cpus);
53int xen_cpu_up(unsigned int cpu);
54void xen_smp_cpus_done(unsigned int max_cpus);
55 49
56void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
57void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
58void xen_smp_send_call_function_ipi(cpumask_t mask);
59void xen_smp_send_call_function_single_ipi(int cpu);
60 52
61extern cpumask_t xen_cpu_initialized_map; 53extern cpumask_t xen_cpu_initialized_map;
54#else
55static inline void xen_smp_init(void) {}
56#endif
62 57
63 58
64/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
73DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
74DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
75 70
71/* These are not functions, and cannot be called normally */
76void xen_iret(void); 72void xen_iret(void);
77void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
78 77
79#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */