Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/.gitignore | 1
-rw-r--r--  arch/x86/Kconfig | 169
-rw-r--r--  arch/x86/Kconfig.cpu | 3
-rw-r--r--  arch/x86/boot/.gitignore | 1
-rw-r--r--  arch/x86/boot/Makefile | 9
-rw-r--r--  arch/x86/boot/tools/build.c | 33
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 4
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c | 2
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 22
-rw-r--r--  arch/x86/ia32/ia32entry.S | 12
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 2
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 5
-rw-r--r--  arch/x86/include/asm/alternative.h | 12
-rw-r--r--  arch/x86/include/asm/amd_iommu.h | 35
-rw-r--r--  arch/x86/include/asm/amd_iommu_proto.h | 54
-rw-r--r--  arch/x86/include/asm/amd_iommu_types.h | 580
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 2
-rw-r--r--  arch/x86/include/asm/apb_timer.h | 23
-rw-r--r--  arch/x86/include/asm/apic.h | 4
-rw-r--r--  arch/x86/include/asm/asm.h | 5
-rw-r--r--  arch/x86/include/asm/atomic.h | 10
-rw-r--r--  arch/x86/include/asm/atomic64_32.h | 2
-rw-r--r--  arch/x86/include/asm/atomic64_64.h | 2
-rw-r--r--  arch/x86/include/asm/bitops.h | 5
-rw-r--r--  arch/x86/include/asm/calling.h | 130
-rw-r--r--  arch/x86/include/asm/clocksource.h | 18
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 48
-rw-r--r--  arch/x86/include/asm/cmpxchg_64.h | 45
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 12
-rw-r--r--  arch/x86/include/asm/delay.h | 25
-rw-r--r--  arch/x86/include/asm/desc.h | 4
-rw-r--r--  arch/x86/include/asm/device.h | 2
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 1
-rw-r--r--  arch/x86/include/asm/frame.h | 11
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 5
-rw-r--r--  arch/x86/include/asm/i8253.h | 20
-rw-r--r--  arch/x86/include/asm/idle.h | 7
-rw-r--r--  arch/x86/include/asm/io.h | 3
-rw-r--r--  arch/x86/include/asm/iommu.h | 1
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 6
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 7
-rw-r--r--  arch/x86/include/asm/irqflags.h | 11
-rw-r--r--  arch/x86/include/asm/kdebug.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 52
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 46
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 20
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h | 1
-rw-r--r--  arch/x86/include/asm/local.h | 2
-rw-r--r--  arch/x86/include/asm/mce.h | 21
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 2
-rw-r--r--  arch/x86/include/asm/mmzone_32.h | 6
-rw-r--r--  arch/x86/include/asm/msr-index.h | 15
-rw-r--r--  arch/x86/include/asm/olpc.h | 51
-rw-r--r--  arch/x86/include/asm/paravirt.h | 9
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 7
-rw-r--r--  arch/x86/include/asm/percpu.h | 11
-rw-r--r--  arch/x86/include/asm/perf_event.h | 5
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 33
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 6
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 1
-rw-r--r--  arch/x86/include/asm/processor.h | 2
-rw-r--r--  arch/x86/include/asm/prom.h | 13
-rw-r--r--  arch/x86/include/asm/ptrace.h | 19
-rw-r--r--  arch/x86/include/asm/pvclock.h | 2
-rw-r--r--  arch/x86/include/asm/rwlock.h | 43
-rw-r--r--  arch/x86/include/asm/segment.h | 2
-rw-r--r--  arch/x86/include/asm/smpboot_hooks.h | 8
-rw-r--r--  arch/x86/include/asm/spinlock.h | 39
-rw-r--r--  arch/x86/include/asm/spinlock_types.h | 6
-rw-r--r--  arch/x86/include/asm/system.h | 1
-rw-r--r--  arch/x86/include/asm/thread_info.h | 2
-rw-r--r--  arch/x86/include/asm/time.h | 6
-rw-r--r--  arch/x86/include/asm/timer.h | 23
-rw-r--r--  arch/x86/include/asm/traps.h | 2
-rw-r--r--  arch/x86/include/asm/tsc.h | 4
-rw-r--r--  arch/x86/include/asm/uaccess.h | 3
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 4
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 60
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 37
-rw-r--r--  arch/x86/include/asm/uv/uv_mmrs.h | 2889
-rw-r--r--  arch/x86/include/asm/vgtod.h | 3
-rw-r--r--  arch/x86/include/asm/vmx.h | 43
-rw-r--r--  arch/x86/include/asm/vsyscall.h | 10
-rw-r--r--  arch/x86/include/asm/vvar.h | 24
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 22
-rw-r--r--  arch/x86/include/asm/xen/page.h | 4
-rw-r--r--  arch/x86/include/asm/xen/pci.h | 5
-rw-r--r--  arch/x86/include/asm/xen/trace_types.h | 18
-rw-r--r--  arch/x86/kernel/Makefile | 22
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 23
-rw-r--r--  arch/x86/kernel/acpi/realmode/.gitignore | 3
-rw-r--r--  arch/x86/kernel/alternative.c | 23
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 2
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 2764
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 1572
-rw-r--r--  arch/x86/kernel/amd_nb.c | 31
-rw-r--r--  arch/x86/kernel/apb_timer.c | 410
-rw-r--r--  arch/x86/kernel/apic/apic.c | 62
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 20
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 213
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 10
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 8
-rw-r--r--  arch/x86/kernel/apm_32.c | 8
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 1
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 4
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 18
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 152
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 288
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 10
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 184
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 171
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 14
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 386
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 10
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 119
-rw-r--r--  arch/x86/kernel/devicetree.c | 60
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 37
-rw-r--r--  arch/x86/kernel/entry_32.S | 8
-rw-r--r--  arch/x86/kernel/entry_64.S | 83
-rw-r--r--  arch/x86/kernel/hpet.c | 35
-rw-r--r--  arch/x86/kernel/i387.c | 2
-rw-r--r--  arch/x86/kernel/i8253.c | 99
-rw-r--r--  arch/x86/kernel/i8259.c | 2
-rw-r--r--  arch/x86/kernel/irqinit.c | 5
-rw-r--r--  arch/x86/kernel/kgdb.c | 4
-rw-r--r--  arch/x86/kernel/kprobes.c | 4
-rw-r--r--  arch/x86/kernel/kvm.c | 72
-rw-r--r--  arch/x86/kernel/kvmclock.c | 7
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 21
-rw-r--r--  arch/x86/kernel/module.c | 37
-rw-r--r--  arch/x86/kernel/mpparse.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c | 13
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 2
-rw-r--r--  arch/x86/kernel/pci-dma.c | 11
-rw-r--r--  arch/x86/kernel/probe_roms.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 31
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 22
-rw-r--r--  arch/x86/kernel/ptrace.c | 5
-rw-r--r--  arch/x86/kernel/quirks.c | 5
-rw-r--r--  arch/x86/kernel/reboot.c | 13
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 2
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 2
-rw-r--r--  arch/x86/kernel/rtc.c | 23
-rw-r--r--  arch/x86/kernel/signal.c | 56
-rw-r--r--  arch/x86/kernel/smpboot.c | 15
-rw-r--r--  arch/x86/kernel/stacktrace.c | 2
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 2
-rw-r--r--  arch/x86/kernel/tboot.c | 1
-rw-r--r--  arch/x86/kernel/time.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 2
-rw-r--r--  arch/x86/kernel/tsc.c | 26
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 56
-rw-r--r--  arch/x86/kernel/vread_tsc_64.c | 36
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 340
-rw-r--r--  arch/x86/kernel/vsyscall_emu_64.S | 37
-rw-r--r--  arch/x86/kernel/vsyscall_trace.h | 29
-rw-r--r--  arch/x86/kvm/Kconfig | 5
-rw-r--r--  arch/x86/kvm/emulate.c | 1749
-rw-r--r--  arch/x86/kvm/i8254.c | 10
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 1228
-rw-r--r--  arch/x86/kvm/mmu.h | 25
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 12
-rw-r--r--  arch/x86/kvm/mmutrace.h | 48
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 258
-rw-r--r--  arch/x86/kvm/svm.c | 6
-rw-r--r--  arch/x86/kvm/timer.c | 2
-rw-r--r--  arch/x86/kvm/trace.h | 31
-rw-r--r--  arch/x86/kvm/vmx.c | 2784
-rw-r--r--  arch/x86/kvm/x86.c | 377
-rw-r--r--  arch/x86/kvm/x86.h | 44
-rw-r--r--  arch/x86/lguest/boot.c | 36
-rw-r--r--  arch/x86/lguest/i386_head.S | 35
-rw-r--r--  arch/x86/lib/Makefile | 9
-rw-r--r--  arch/x86/lib/atomic64_32.c | 2
-rw-r--r--  arch/x86/lib/copy_page_64.S | 9
-rw-r--r--  arch/x86/lib/memmove_64.S | 11
-rw-r--r--  arch/x86/lib/rwlock.S | 44
-rw-r--r--  arch/x86/lib/rwlock_64.S | 38
-rw-r--r--  arch/x86/lib/rwsem.S (renamed from arch/x86/lib/rwsem_64.S) | 75
-rw-r--r--  arch/x86/lib/semaphore_32.S | 124
-rw-r--r--  arch/x86/lib/thunk_64.S | 45
-rw-r--r--  arch/x86/lib/usercopy.c | 43
-rw-r--r--  arch/x86/mm/fault.c | 21
-rw-r--r--  arch/x86/mm/gup.c | 12
-rw-r--r--  arch/x86/mm/highmem_32.c | 2
-rw-r--r--  arch/x86/mm/init.c | 3
-rw-r--r--  arch/x86/mm/kmemcheck/error.c | 2
-rw-r--r--  arch/x86/mm/mmap.c | 4
-rw-r--r--  arch/x86/mm/mmio-mod.c | 2
-rw-r--r--  arch/x86/mm/numa.c | 15
-rw-r--r--  arch/x86/mm/numa_32.c | 6
-rw-r--r--  arch/x86/mm/pageattr-test.c | 3
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 40
-rw-r--r--  arch/x86/oprofile/backtrace.c | 21
-rw-r--r--  arch/x86/oprofile/init.c | 7
-rw-r--r--  arch/x86/pci/Makefile | 3
-rw-r--r--  arch/x86/pci/acpi.c | 50
-rw-r--r--  arch/x86/pci/amd_bus.c | 42
-rw-r--r--  arch/x86/pci/ce4100.c | 2
-rw-r--r--  arch/x86/pci/common.c | 14
-rw-r--r--  arch/x86/pci/direct.c | 6
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 3
-rw-r--r--  arch/x86/pci/numaq_32.c | 2
-rw-r--r--  arch/x86/pci/olpc.c | 4
-rw-r--r--  arch/x86/pci/pcbios.c | 2
-rw-r--r--  arch/x86/pci/visws.c | 2
-rw-r--r--  arch/x86/pci/xen.c | 371
-rw-r--r--  arch/x86/platform/efi/efi.c | 90
-rw-r--r--  arch/x86/platform/mrst/Makefile | 1
-rw-r--r--  arch/x86/platform/mrst/mrst.c | 24
-rw-r--r--  arch/x86/platform/mrst/pmu.c | 817
-rw-r--r--  arch/x86/platform/mrst/pmu.h | 234
-rw-r--r--  arch/x86/platform/mrst/vrtc.c | 9
-rw-r--r--  arch/x86/platform/olpc/Makefile | 5
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1-pm.c | 215
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1-rtc.c | 81
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1-sci.c | 614
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c | 146
-rw-r--r--  arch/x86/platform/olpc/olpc-xo15-sci.c | 168
-rw-r--r--  arch/x86/platform/olpc/olpc.c | 99
-rw-r--r--  arch/x86/platform/olpc/olpc_dt.c | 103
-rw-r--r--  arch/x86/platform/olpc/xo1-wakeup.S | 124
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 86
-rw-r--r--  arch/x86/realmode/rm/.gitignore | 3
-rw-r--r--  arch/x86/tools/.gitignore | 1
-rw-r--r--  arch/x86/vdso/.gitignore | 2
-rw-r--r--  arch/x86/vdso/Makefile | 1
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 103
-rw-r--r--  arch/x86/vdso/vdso.S | 16
-rw-r--r--  arch/x86/vdso/vdso32/sysenter.S | 2
-rw-r--r--  arch/x86/vdso/vma.c | 58
-rw-r--r--  arch/x86/xen/Makefile | 4
-rw-r--r--  arch/x86/xen/enlighten.c | 35
-rw-r--r--  arch/x86/xen/mmu.c | 153
-rw-r--r--  arch/x86/xen/multicalls.c | 169
-rw-r--r--  arch/x86/xen/multicalls.h | 6
-rw-r--r--  arch/x86/xen/platform-pci-unplug.c | 2
-rw-r--r--  arch/x86/xen/setup.c | 48
-rw-r--r--  arch/x86/xen/smp.c | 15
-rw-r--r--  arch/x86/xen/time.c | 5
-rw-r--r--  arch/x86/xen/trace.c | 62
-rw-r--r--  arch/x86/xen/vga.c | 67
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 8
-rw-r--r--  arch/x86/xen/xen-ops.h | 11
251 files changed, 12544 insertions, 11429 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
index 028079065af..7cab8c08e6d 100644
--- a/arch/x86/.gitignore
+++ b/arch/x86/.gitignore
@@ -1,3 +1,4 @@
1boot/compressed/vmlinux 1boot/compressed/vmlinux
2tools/test_get_len 2tools/test_get_len
3tools/insn_sanity
3 4
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9f5e14388e1..4ff921c9f84 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -20,6 +20,7 @@ config X86
20 select HAVE_UNSTABLE_SCHED_CLOCK 20 select HAVE_UNSTABLE_SCHED_CLOCK
21 select HAVE_IDE 21 select HAVE_IDE
22 select HAVE_OPROFILE 22 select HAVE_OPROFILE
23 select HAVE_PCSPKR_PLATFORM
23 select HAVE_PERF_EVENTS 24 select HAVE_PERF_EVENTS
24 select HAVE_IRQ_WORK 25 select HAVE_IRQ_WORK
25 select HAVE_IOREMAP_PROT 26 select HAVE_IOREMAP_PROT
@@ -70,6 +71,8 @@ config X86
70 select IRQ_FORCED_THREADING 71 select IRQ_FORCED_THREADING
71 select USE_GENERIC_SMP_HELPERS if SMP 72 select USE_GENERIC_SMP_HELPERS if SMP
72 select HAVE_BPF_JIT if (X86_64 && NET) 73 select HAVE_BPF_JIT if (X86_64 && NET)
74 select CLKEVT_I8253
75 select ARCH_HAVE_NMI_SAFE_CMPXCHG
73 76
74config INSTRUCTION_DECODER 77config INSTRUCTION_DECODER
75 def_bool (KPROBES || PERF_EVENTS) 78 def_bool (KPROBES || PERF_EVENTS)
@@ -93,6 +96,10 @@ config CLOCKSOURCE_WATCHDOG
93config GENERIC_CLOCKEVENTS 96config GENERIC_CLOCKEVENTS
94 def_bool y 97 def_bool y
95 98
99config ARCH_CLOCKSOURCE_DATA
100 def_bool y
101 depends on X86_64
102
96config GENERIC_CLOCKEVENTS_BROADCAST 103config GENERIC_CLOCKEVENTS_BROADCAST
97 def_bool y 104 def_bool y
98 depends on X86_64 || (X86_32 && X86_LOCAL_APIC) 105 depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
@@ -123,7 +130,7 @@ config SBUS
123 bool 130 bool
124 131
125config NEED_DMA_MAP_STATE 132config NEED_DMA_MAP_STATE
126 def_bool (X86_64 || DMAR || DMA_API_DEBUG) 133 def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG)
127 134
128config NEED_SG_DMA_LENGTH 135config NEED_SG_DMA_LENGTH
129 def_bool y 136 def_bool y
@@ -213,7 +220,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
213 220
214config HAVE_INTEL_TXT 221config HAVE_INTEL_TXT
215 def_bool y 222 def_bool y
216 depends on EXPERIMENTAL && DMAR && ACPI 223 depends on EXPERIMENTAL && INTEL_IOMMU && ACPI
217 224
218config X86_32_SMP 225config X86_32_SMP
219 def_bool y 226 def_bool y
@@ -280,7 +287,7 @@ config SMP
280 287
281config X86_X2APIC 288config X86_X2APIC
282 bool "Support x2apic" 289 bool "Support x2apic"
283 depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP 290 depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP
284 ---help--- 291 ---help---
285 This enables x2apic support on CPUs that have this feature. 292 This enables x2apic support on CPUs that have this feature.
286 293
@@ -384,12 +391,21 @@ config X86_INTEL_CE
384 This option compiles in support for the CE4100 SOC for settop 391 This option compiles in support for the CE4100 SOC for settop
385 boxes and media devices. 392 boxes and media devices.
386 393
394config X86_INTEL_MID
395 bool "Intel MID platform support"
396 depends on X86_32
397 depends on X86_EXTENDED_PLATFORM
398 ---help---
399 Select to build a kernel capable of supporting Intel MID platform
400 systems which do not have the PCI legacy interfaces (Moorestown,
401 Medfield). If you are building for a PC class system say N here.
402
403if X86_INTEL_MID
404
387config X86_MRST 405config X86_MRST
388 bool "Moorestown MID platform" 406 bool "Moorestown MID platform"
389 depends on PCI 407 depends on PCI
390 depends on PCI_GOANY 408 depends on PCI_GOANY
391 depends on X86_32
392 depends on X86_EXTENDED_PLATFORM
393 depends on X86_IO_APIC 409 depends on X86_IO_APIC
394 select APB_TIMER 410 select APB_TIMER
395 select I2C 411 select I2C
@@ -404,6 +420,8 @@ config X86_MRST
404 nor standard legacy replacement devices/features. e.g. Moorestown does 420 nor standard legacy replacement devices/features. e.g. Moorestown does
405 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. 421 not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
406 422
423endif
424
407config X86_RDC321X 425config X86_RDC321X
408 bool "RDC R-321x SoC" 426 bool "RDC R-321x SoC"
409 depends on X86_32 427 depends on X86_32
@@ -512,6 +530,18 @@ menuconfig PARAVIRT_GUEST
512 530
513if PARAVIRT_GUEST 531if PARAVIRT_GUEST
514 532
533config PARAVIRT_TIME_ACCOUNTING
534 bool "Paravirtual steal time accounting"
535 select PARAVIRT
536 default n
537 ---help---
538 Select this option to enable fine granularity task steal time
539 accounting. Time spent executing other tasks in parallel with
540 the current vCPU is discounted from the vCPU power. To account for
541 that, there can be a small performance impact.
542
543 If in doubt, say N here.
544
515source "arch/x86/xen/Kconfig" 545source "arch/x86/xen/Kconfig"
516 546
517config KVM_CLOCK 547config KVM_CLOCK
@@ -617,6 +647,7 @@ config HPET_EMULATE_RTC
617config APB_TIMER 647config APB_TIMER
618 def_bool y if MRST 648 def_bool y if MRST
619 prompt "Langwell APB Timer Support" if X86_MRST 649 prompt "Langwell APB Timer Support" if X86_MRST
650 select DW_APB_TIMER
620 help 651 help
621 APB timer is the replacement for 8254, HPET on X86 MID platforms. 652 APB timer is the replacement for 8254, HPET on X86 MID platforms.
622 The APBT provides a stable time base on SMP 653 The APBT provides a stable time base on SMP
@@ -680,33 +711,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
680 Calgary anyway, pass 'iommu=calgary' on the kernel command line. 711 Calgary anyway, pass 'iommu=calgary' on the kernel command line.
681 If unsure, say Y. 712 If unsure, say Y.
682 713
683config AMD_IOMMU
684 bool "AMD IOMMU support"
685 select SWIOTLB
686 select PCI_MSI
687 select PCI_IOV
688 depends on X86_64 && PCI && ACPI
689 ---help---
690 With this option you can enable support for AMD IOMMU hardware in
691 your system. An IOMMU is a hardware component which provides
692 remapping of DMA memory accesses from devices. With an AMD IOMMU you
693 can isolate the the DMA memory of different devices and protect the
694 system from misbehaving device drivers or hardware.
695
696 You can find out if your system has an AMD IOMMU if you look into
697 your BIOS for an option to enable it or if you have an IVRS ACPI
698 table.
699
700config AMD_IOMMU_STATS
701 bool "Export AMD IOMMU statistics to debugfs"
702 depends on AMD_IOMMU
703 select DEBUG_FS
704 ---help---
705 This option enables code in the AMD IOMMU driver to collect various
706 statistics about whats happening in the driver and exports that
707 information to userspace via debugfs.
708 If unsure, say N.
709
710# need this always selected by IOMMU for the VIA workaround 714# need this always selected by IOMMU for the VIA workaround
711config SWIOTLB 715config SWIOTLB
712 def_bool y if X86_64 716 def_bool y if X86_64
@@ -720,9 +724,6 @@ config SWIOTLB
720config IOMMU_HELPER 724config IOMMU_HELPER
721 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) 725 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
722 726
723config IOMMU_API
724 def_bool (AMD_IOMMU || DMAR)
725
726config MAXSMP 727config MAXSMP
727 bool "Enable Maximum number of SMP Processors and NUMA Nodes" 728 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
728 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL 729 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
@@ -1737,8 +1738,8 @@ menuconfig APM
1737 machines with more than one CPU. 1738 machines with more than one CPU.
1738 1739
1739 In order to use APM, you will need supporting software. For location 1740 In order to use APM, you will need supporting software. For location
1740 and more information, read <file:Documentation/power/pm.txt> and the 1741 and more information, read <file:Documentation/power/apm-acpi.txt>
1741 Battery Powered Linux mini-HOWTO, available from 1742 and the Battery Powered Linux mini-HOWTO, available from
1742 <http://www.tldp.org/docs.html#howto>. 1743 <http://www.tldp.org/docs.html#howto>.
1743 1744
1744 This driver does not spin down disk drives (see the hdparm(8) 1745 This driver does not spin down disk drives (see the hdparm(8)
@@ -1905,7 +1906,7 @@ config PCI_BIOS
1905# x86-64 doesn't support PCI BIOS access from long mode so always go direct. 1906# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1906config PCI_DIRECT 1907config PCI_DIRECT
1907 def_bool y 1908 def_bool y
1908 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC)) 1909 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
1909 1910
1910config PCI_MMCONFIG 1911config PCI_MMCONFIG
1911 def_bool y 1912 def_bool y
@@ -1942,55 +1943,6 @@ config PCI_CNB20LE_QUIRK
1942 1943
1943 You should say N unless you know you need this. 1944 You should say N unless you know you need this.
1944 1945
1945config DMAR
1946 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1947 depends on PCI_MSI && ACPI && EXPERIMENTAL
1948 help
1949 DMA remapping (DMAR) devices support enables independent address
1950 translations for Direct Memory Access (DMA) from devices.
1951 These DMA remapping devices are reported via ACPI tables
1952 and include PCI device scope covered by these DMA
1953 remapping devices.
1954
1955config DMAR_DEFAULT_ON
1956 def_bool y
1957 prompt "Enable DMA Remapping Devices by default"
1958 depends on DMAR
1959 help
1960 Selecting this option will enable a DMAR device at boot time if
1961 one is found. If this option is not selected, DMAR support can
1962 be enabled by passing intel_iommu=on to the kernel. It is
1963 recommended you say N here while the DMAR code remains
1964 experimental.
1965
1966config DMAR_BROKEN_GFX_WA
1967 bool "Workaround broken graphics drivers (going away soon)"
1968 depends on DMAR && BROKEN
1969 ---help---
1970 Current Graphics drivers tend to use physical address
1971 for DMA and avoid using DMA APIs. Setting this config
1972 option permits the IOMMU driver to set a unity map for
1973 all the OS-visible memory. Hence the driver can continue
1974 to use physical addresses for DMA, at least until this
1975 option is removed in the 2.6.32 kernel.
1976
1977config DMAR_FLOPPY_WA
1978 def_bool y
1979 depends on DMAR
1980 ---help---
1981 Floppy disk drivers are known to bypass DMA API calls
1982 thereby failing to work when IOMMU is enabled. This
1983 workaround will setup a 1:1 mapping for the first
1984 16MiB to make floppy (an ISA device) work.
1985
1986config INTR_REMAP
1987 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
1988 depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
1989 ---help---
1990 Supports Interrupt remapping for IO-APIC and MSI devices.
1991 To use x2apic mode in the CPU's which support x2APIC enhancements or
1992 to support platforms with CPU's having > 8 bit APIC ID, say Y.
1993
1994source "drivers/pci/pcie/Kconfig" 1946source "drivers/pci/pcie/Kconfig"
1995 1947
1996source "drivers/pci/Kconfig" 1948source "drivers/pci/Kconfig"
@@ -2073,11 +2025,44 @@ config OLPC
2073 Add support for detecting the unique features of the OLPC 2025 Add support for detecting the unique features of the OLPC
2074 XO hardware. 2026 XO hardware.
2075 2027
2076config OLPC_XO1 2028config OLPC_XO1_PM
2077 tristate "OLPC XO-1 support" 2029 bool "OLPC XO-1 Power Management"
2078 depends on OLPC && MFD_CS5535 2030 depends on OLPC && MFD_CS5535 && PM_SLEEP
2079 ---help--- 2031 select MFD_CORE
2080 Add support for non-essential features of the OLPC XO-1 laptop. 2032 ---help---
2033 Add support for poweroff and suspend of the OLPC XO-1 laptop.
2034
2035config OLPC_XO1_RTC
2036 bool "OLPC XO-1 Real Time Clock"
2037 depends on OLPC_XO1_PM && RTC_DRV_CMOS
2038 ---help---
2039 Add support for the XO-1 real time clock, which can be used as a
2040 programmable wakeup source.
2041
2042config OLPC_XO1_SCI
2043 bool "OLPC XO-1 SCI extras"
2044 depends on OLPC && OLPC_XO1_PM
2045 select POWER_SUPPLY
2046 select GPIO_CS5535
2047 select MFD_CORE
2048 ---help---
2049 Add support for SCI-based features of the OLPC XO-1 laptop:
2050 - EC-driven system wakeups
2051 - Power button
2052 - Ebook switch
2053 - Lid switch
2054 - AC adapter status updates
2055 - Battery status updates
2056
2057config OLPC_XO15_SCI
2058 bool "OLPC XO-1.5 SCI extras"
2059 depends on OLPC && ACPI
2060 select POWER_SUPPLY
2061 ---help---
2062 Add support for SCI-based features of the OLPC XO-1.5 laptop:
2063 - EC-driven system wakeups
2064 - AC adapter status updates
2065 - Battery status updates
2081 2066
2082endif # X86_32 2067endif # X86_32
2083 2068
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6a7cfdf8ff6..e3ca7e0d858 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -312,6 +312,9 @@ config X86_CMPXCHG
312config CMPXCHG_LOCAL 312config CMPXCHG_LOCAL
313 def_bool X86_64 || (X86_32 && !M386) 313 def_bool X86_64 || (X86_32 && !M386)
314 314
315config CMPXCHG_DOUBLE
316 def_bool y
317
315config X86_L1_CACHE_SHIFT 318config X86_L1_CACHE_SHIFT
316 int 319 int
317 default "7" if MPENTIUM4 || MPSC 320 default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 851fe936d24..e3cf9f682be 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -2,7 +2,6 @@ bootsect
2bzImage 2bzImage
3cpustr.h 3cpustr.h
4mkcpustr 4mkcpustr
5offsets.h
6voffset.h 5voffset.h
7zoffset.h 6zoffset.h
8setup 7setup
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index f7cb086b4ad..95365a82b6a 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,12 +9,6 @@
9# Changed by many, many contributors over the years. 9# Changed by many, many contributors over the years.
10# 10#
11 11
12# ROOT_DEV specifies the default root-device when making the image.
13# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
14# the default of FLOPPY is used by 'build'.
15
16ROOT_DEV := CURRENT
17
18# If you want to preset the SVGA mode, uncomment the next line and 12# If you want to preset the SVGA mode, uncomment the next line and
19# set SVGA_MODE to whatever number you want. 13# set SVGA_MODE to whatever number you want.
20# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 14# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
@@ -75,8 +69,7 @@ GCOV_PROFILE := n
75$(obj)/bzImage: asflags-y := $(SVGA_MODE) 69$(obj)/bzImage: asflags-y := $(SVGA_MODE)
76 70
77quiet_cmd_image = BUILD $@ 71quiet_cmd_image = BUILD $@
78cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ 72cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin > $@
79 $(ROOT_DEV) > $@
80 73
81$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE 74$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
82 $(call if_changed,image) 75 $(call if_changed,image)
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index ee3a4ea923a..fdc60a0b3c2 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
130 130
131static void usage(void) 131static void usage(void)
132{ 132{
133 die("Usage: build setup system [rootdev] [> image]"); 133 die("Usage: build setup system [> image]");
134} 134}
135 135
136int main(int argc, char ** argv) 136int main(int argc, char ** argv)
@@ -138,39 +138,14 @@ int main(int argc, char ** argv)
138 unsigned int i, sz, setup_sectors; 138 unsigned int i, sz, setup_sectors;
139 int c; 139 int c;
140 u32 sys_size; 140 u32 sys_size;
141 u8 major_root, minor_root;
142 struct stat sb; 141 struct stat sb;
143 FILE *file; 142 FILE *file;
144 int fd; 143 int fd;
145 void *kernel; 144 void *kernel;
146 u32 crc = 0xffffffffUL; 145 u32 crc = 0xffffffffUL;
147 146
148 if ((argc < 3) || (argc > 4)) 147 if (argc != 3)
149 usage(); 148 usage();
150 if (argc > 3) {
151 if (!strcmp(argv[3], "CURRENT")) {
152 if (stat("/", &sb)) {
153 perror("/");
154 die("Couldn't stat /");
155 }
156 major_root = major(sb.st_dev);
157 minor_root = minor(sb.st_dev);
158 } else if (strcmp(argv[3], "FLOPPY")) {
159 if (stat(argv[3], &sb)) {
160 perror(argv[3]);
161 die("Couldn't stat root device.");
162 }
163 major_root = major(sb.st_rdev);
164 minor_root = minor(sb.st_rdev);
165 } else {
166 major_root = 0;
167 minor_root = 0;
168 }
169 } else {
170 major_root = DEFAULT_MAJOR_ROOT;
171 minor_root = DEFAULT_MINOR_ROOT;
172 }
173 fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
174 149
175 /* Copy the setup code */ 150 /* Copy the setup code */
176 file = fopen(argv[1], "r"); 151 file = fopen(argv[1], "r");
@@ -193,8 +168,8 @@ int main(int argc, char ** argv)
193 memset(buf+c, 0, i-c); 168 memset(buf+c, 0, i-c);
194 169
195 /* Set the default root device */ 170 /* Set the default root device */
196 buf[508] = minor_root; 171 buf[508] = DEFAULT_MINOR_ROOT;
197 buf[509] = major_root; 172 buf[509] = DEFAULT_MAJOR_ROOT;
198 173
199 fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i); 174 fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
200 175
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 22a0dc8e51d..058a35b8286 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -67,8 +67,8 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
67CONFIG_CPU_FREQ_GOV_ONDEMAND=y 67CONFIG_CPU_FREQ_GOV_ONDEMAND=y
68CONFIG_X86_ACPI_CPUFREQ=y 68CONFIG_X86_ACPI_CPUFREQ=y
69CONFIG_PCI_MMCONFIG=y 69CONFIG_PCI_MMCONFIG=y
70CONFIG_DMAR=y 70CONFIG_INTEL_IOMMU=y
71# CONFIG_DMAR_DEFAULT_ON is not set 71# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set
72CONFIG_PCIEPORTBUS=y 72CONFIG_PCIEPORTBUS=y
73CONFIG_PCCARD=y 73CONFIG_PCCARD=y
74CONFIG_YENTA=y 74CONFIG_YENTA=y
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 7a6e68e4f74..976aa64d9a2 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -245,7 +245,7 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
245 crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) 245 crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
246 & CRYPTO_TFM_RES_MASK); 246 & CRYPTO_TFM_RES_MASK);
247 247
248 return 0; 248 return err;
249} 249}
250 250
251static int ghash_async_init_tfm(struct crypto_tfm *tfm) 251static int ghash_async_init_tfm(struct crypto_tfm *tfm)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 588a7aa937e..65577698cab 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -127,15 +127,17 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
127 127
128asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask) 128asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
129{ 129{
130 mask &= _BLOCKABLE; 130 sigset_t blocked;
131 spin_lock_irq(&current->sighand->siglock); 131
132 current->saved_sigmask = current->blocked; 132 current->saved_sigmask = current->blocked;
133 siginitset(&current->blocked, mask); 133
134 recalc_sigpending(); 134 mask &= _BLOCKABLE;
135 spin_unlock_irq(&current->sighand->siglock); 135 siginitset(&blocked, mask);
136 set_current_blocked(&blocked);
136 137
137 current->state = TASK_INTERRUPTIBLE; 138 current->state = TASK_INTERRUPTIBLE;
138 schedule(); 139 schedule();
140
139 set_restore_sigmask(); 141 set_restore_sigmask();
140 return -ERESTARTNOHAND; 142 return -ERESTARTNOHAND;
141} 143}
@@ -279,10 +281,7 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
279 goto badframe; 281 goto badframe;
280 282
281 sigdelsetmask(&set, ~_BLOCKABLE); 283 sigdelsetmask(&set, ~_BLOCKABLE);
282 spin_lock_irq(&current->sighand->siglock); 284 set_current_blocked(&set);
283 current->blocked = set;
284 recalc_sigpending();
285 spin_unlock_irq(&current->sighand->siglock);
286 285
287 if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) 286 if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
288 goto badframe; 287 goto badframe;
@@ -308,10 +307,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
308 goto badframe; 307 goto badframe;
309 308
310 sigdelsetmask(&set, ~_BLOCKABLE); 309 sigdelsetmask(&set, ~_BLOCKABLE);
311 spin_lock_irq(&current->sighand->siglock); 310 set_current_blocked(&set);
312 current->blocked = set;
313 recalc_sigpending();
314 spin_unlock_irq(&current->sighand->siglock);
315 311
316 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 312 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
317 goto badframe; 313 goto badframe;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index c1870dddd32..54edb207ff3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -143,7 +143,7 @@ ENTRY(ia32_sysenter_target)
143 CFI_REL_OFFSET rip,0 143 CFI_REL_OFFSET rip,0
144 pushq_cfi %rax 144 pushq_cfi %rax
145 cld 145 cld
146 SAVE_ARGS 0,0,1 146 SAVE_ARGS 0,1,0
147 /* no need to do an access_ok check here because rbp has been 147 /* no need to do an access_ok check here because rbp has been
148 32bit zero extended */ 148 32bit zero extended */
1491: movl (%rbp),%ebp 1491: movl (%rbp),%ebp
@@ -173,7 +173,7 @@ sysexit_from_sys_call:
173 andl $~0x200,EFLAGS-R11(%rsp) 173 andl $~0x200,EFLAGS-R11(%rsp)
174 movl RIP-R11(%rsp),%edx /* User %eip */ 174 movl RIP-R11(%rsp),%edx /* User %eip */
175 CFI_REGISTER rip,rdx 175 CFI_REGISTER rip,rdx
176 RESTORE_ARGS 1,24,1,1,1,1 176 RESTORE_ARGS 0,24,0,0,0,0
177 xorq %r8,%r8 177 xorq %r8,%r8
178 xorq %r9,%r9 178 xorq %r9,%r9
179 xorq %r10,%r10 179 xorq %r10,%r10
@@ -289,7 +289,7 @@ ENTRY(ia32_cstar_target)
289 * disabled irqs and here we enable it straight after entry: 289 * disabled irqs and here we enable it straight after entry:
290 */ 290 */
291 ENABLE_INTERRUPTS(CLBR_NONE) 291 ENABLE_INTERRUPTS(CLBR_NONE)
292 SAVE_ARGS 8,1,1 292 SAVE_ARGS 8,0,0
293 movl %eax,%eax /* zero extension */ 293 movl %eax,%eax /* zero extension */
294 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 294 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
295 movq %rcx,RIP-ARGOFFSET(%rsp) 295 movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -328,7 +328,7 @@ cstar_dispatch:
328 jnz sysretl_audit 328 jnz sysretl_audit
329sysretl_from_sys_call: 329sysretl_from_sys_call:
330 andl $~TS_COMPAT,TI_status(%r10) 330 andl $~TS_COMPAT,TI_status(%r10)
331 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 331 RESTORE_ARGS 0,-ARG_SKIP,0,0,0
332 movl RIP-ARGOFFSET(%rsp),%ecx 332 movl RIP-ARGOFFSET(%rsp),%ecx
333 CFI_REGISTER rip,rcx 333 CFI_REGISTER rip,rcx
334 movl EFLAGS-ARGOFFSET(%rsp),%r11d 334 movl EFLAGS-ARGOFFSET(%rsp),%r11d
@@ -419,7 +419,7 @@ ENTRY(ia32_syscall)
419 cld 419 cld
420 /* note the registers are not zero extended to the sf. 420 /* note the registers are not zero extended to the sf.
421 this could be a problem. */ 421 this could be a problem. */
422 SAVE_ARGS 0,0,1 422 SAVE_ARGS 0,1,0
423 GET_THREAD_INFO(%r10) 423 GET_THREAD_INFO(%r10)
424 orl $TS_COMPAT,TI_status(%r10) 424 orl $TS_COMPAT,TI_status(%r10)
425 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) 425 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
@@ -672,7 +672,7 @@ ia32_sys_call_table:
672 .quad sys32_vm86_warning /* vm86 */ 672 .quad sys32_vm86_warning /* vm86 */
673 .quad quiet_ni_syscall /* query_module */ 673 .quad quiet_ni_syscall /* query_module */
674 .quad sys_poll 674 .quad sys_poll
675 .quad compat_sys_nfsservctl 675 .quad quiet_ni_syscall /* old nfsservctl */
676 .quad sys_setresgid16 /* 170 */ 676 .quad sys_setresgid16 /* 170 */
677 .quad sys_getresgid16 677 .quad sys_getresgid16
678 .quad sys_prctl 678 .quad sys_prctl
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 5852519b2d0..f6f5c53dc90 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -43,7 +43,7 @@
43#include <asm/mman.h> 43#include <asm/mman.h>
44#include <asm/types.h> 44#include <asm/types.h>
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47#include <asm/vgtod.h> 47#include <asm/vgtod.h>
48#include <asm/sys_ia32.h> 48#include <asm/sys_ia32.h>
49 49
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 94d420b360d..091508b533b 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -16,9 +16,8 @@
16#endif 16#endif
17 17
18.macro altinstruction_entry orig alt feature orig_len alt_len 18.macro altinstruction_entry orig alt feature orig_len alt_len
19 .align 8 19 .long \orig - .
20 .quad \orig 20 .long \alt - .
21 .quad \alt
22 .word \feature 21 .word \feature
23 .byte \orig_len 22 .byte \orig_len
24 .byte \alt_len 23 .byte \alt_len
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bf535f947e8..37ad100a221 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -43,14 +43,11 @@
43#endif 43#endif
44 44
45struct alt_instr { 45struct alt_instr {
46 u8 *instr; /* original instruction */ 46 s32 instr_offset; /* original instruction */
47 u8 *replacement; 47 s32 repl_offset; /* offset to replacement instruction */
48 u16 cpuid; /* cpuid bit set for replacement */ 48 u16 cpuid; /* cpuid bit set for replacement */
49 u8 instrlen; /* length of original instruction */ 49 u8 instrlen; /* length of original instruction */
50 u8 replacementlen; /* length of new instruction, <= instrlen */ 50 u8 replacementlen; /* length of new instruction, <= instrlen */
51#ifdef CONFIG_X86_64
52 u32 pad2;
53#endif
54}; 51};
55 52
56extern void alternative_instructions(void); 53extern void alternative_instructions(void);
@@ -83,9 +80,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
83 \ 80 \
84 "661:\n\t" oldinstr "\n662:\n" \ 81 "661:\n\t" oldinstr "\n662:\n" \
85 ".section .altinstructions,\"a\"\n" \ 82 ".section .altinstructions,\"a\"\n" \
86 _ASM_ALIGN "\n" \ 83 " .long 661b - .\n" /* label */ \
87 _ASM_PTR "661b\n" /* label */ \ 84 " .long 663f - .\n" /* new instruction */ \
88 _ASM_PTR "663f\n" /* new instruction */ \
89 " .word " __stringify(feature) "\n" /* feature bit */ \ 85 " .word " __stringify(feature) "\n" /* feature bit */ \
90 " .byte 662b-661b\n" /* sourcelen */ \ 86 " .byte 662b-661b\n" /* sourcelen */ \
91 " .byte 664f-663f\n" /* replacementlen */ \ 87 " .byte 664f-663f\n" /* replacementlen */ \
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
deleted file mode 100644
index a6863a2dec1..00000000000
--- a/arch/x86/include/asm/amd_iommu.h
+++ /dev/null
@@ -1,35 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#ifndef _ASM_X86_AMD_IOMMU_H
21#define _ASM_X86_AMD_IOMMU_H
22
23#include <linux/irqreturn.h>
24
25#ifdef CONFIG_AMD_IOMMU
26
27extern int amd_iommu_detect(void);
28
29#else
30
31static inline int amd_iommu_detect(void) { return -ENODEV; }
32
33#endif
34
35#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
deleted file mode 100644
index 55d95eb789b..00000000000
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ /dev/null
@@ -1,54 +0,0 @@
1/*
2 * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published
7 * by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H
21
22#include <asm/amd_iommu_types.h>
23
24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
27extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
28extern void amd_iommu_apply_erratum_63(u16 devid);
29extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
30extern int amd_iommu_init_devices(void);
31extern void amd_iommu_uninit_devices(void);
32extern void amd_iommu_init_notifier(void);
33extern void amd_iommu_init_api(void);
34#ifndef CONFIG_AMD_IOMMU_STATS
35
36static inline void amd_iommu_stats_init(void) { }
37
38#endif /* !CONFIG_AMD_IOMMU_STATS */
39
40static inline bool is_rd890_iommu(struct pci_dev *pdev)
41{
42 return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
43 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
44}
45
46static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
47{
48 if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
49 return false;
50
51 return !!(iommu->features & f);
52}
53
54#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
deleted file mode 100644
index 4c998299541..00000000000
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ /dev/null
@@ -1,580 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
21#define _ASM_X86_AMD_IOMMU_TYPES_H
22
23#include <linux/types.h>
24#include <linux/mutex.h>
25#include <linux/list.h>
26#include <linux/spinlock.h>
27
28/*
29 * Maximum number of IOMMUs supported
30 */
31#define MAX_IOMMUS 32
32
33/*
34 * some size calculation constants
35 */
36#define DEV_TABLE_ENTRY_SIZE 32
37#define ALIAS_TABLE_ENTRY_SIZE 2
38#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
39
40/* Length of the MMIO region for the AMD IOMMU */
41#define MMIO_REGION_LENGTH 0x4000
42
43/* Capability offsets used by the driver */
44#define MMIO_CAP_HDR_OFFSET 0x00
45#define MMIO_RANGE_OFFSET 0x0c
46#define MMIO_MISC_OFFSET 0x10
47
48/* Masks, shifts and macros to parse the device range capability */
49#define MMIO_RANGE_LD_MASK 0xff000000
50#define MMIO_RANGE_FD_MASK 0x00ff0000
51#define MMIO_RANGE_BUS_MASK 0x0000ff00
52#define MMIO_RANGE_LD_SHIFT 24
53#define MMIO_RANGE_FD_SHIFT 16
54#define MMIO_RANGE_BUS_SHIFT 8
55#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
56#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
57#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
58#define MMIO_MSI_NUM(x) ((x) & 0x1f)
59
60/* Flag masks for the AMD IOMMU exclusion range */
61#define MMIO_EXCL_ENABLE_MASK 0x01ULL
62#define MMIO_EXCL_ALLOW_MASK 0x02ULL
63
64/* Used offsets into the MMIO space */
65#define MMIO_DEV_TABLE_OFFSET 0x0000
66#define MMIO_CMD_BUF_OFFSET 0x0008
67#define MMIO_EVT_BUF_OFFSET 0x0010
68#define MMIO_CONTROL_OFFSET 0x0018
69#define MMIO_EXCL_BASE_OFFSET 0x0020
70#define MMIO_EXCL_LIMIT_OFFSET 0x0028
71#define MMIO_EXT_FEATURES 0x0030
72#define MMIO_CMD_HEAD_OFFSET 0x2000
73#define MMIO_CMD_TAIL_OFFSET 0x2008
74#define MMIO_EVT_HEAD_OFFSET 0x2010
75#define MMIO_EVT_TAIL_OFFSET 0x2018
76#define MMIO_STATUS_OFFSET 0x2020
77
78
79/* Extended Feature Bits */
80#define FEATURE_PREFETCH (1ULL<<0)
81#define FEATURE_PPR (1ULL<<1)
82#define FEATURE_X2APIC (1ULL<<2)
83#define FEATURE_NX (1ULL<<3)
84#define FEATURE_GT (1ULL<<4)
85#define FEATURE_IA (1ULL<<6)
86#define FEATURE_GA (1ULL<<7)
87#define FEATURE_HE (1ULL<<8)
88#define FEATURE_PC (1ULL<<9)
89
90/* MMIO status bits */
91#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
92
93/* event logging constants */
94#define EVENT_ENTRY_SIZE 0x10
95#define EVENT_TYPE_SHIFT 28
96#define EVENT_TYPE_MASK 0xf
97#define EVENT_TYPE_ILL_DEV 0x1
98#define EVENT_TYPE_IO_FAULT 0x2
99#define EVENT_TYPE_DEV_TAB_ERR 0x3
100#define EVENT_TYPE_PAGE_TAB_ERR 0x4
101#define EVENT_TYPE_ILL_CMD 0x5
102#define EVENT_TYPE_CMD_HARD_ERR 0x6
103#define EVENT_TYPE_IOTLB_INV_TO 0x7
104#define EVENT_TYPE_INV_DEV_REQ 0x8
105#define EVENT_DEVID_MASK 0xffff
106#define EVENT_DEVID_SHIFT 0
107#define EVENT_DOMID_MASK 0xffff
108#define EVENT_DOMID_SHIFT 0
109#define EVENT_FLAGS_MASK 0xfff
110#define EVENT_FLAGS_SHIFT 0x10
111
112/* feature control bits */
113#define CONTROL_IOMMU_EN 0x00ULL
114#define CONTROL_HT_TUN_EN 0x01ULL
115#define CONTROL_EVT_LOG_EN 0x02ULL
116#define CONTROL_EVT_INT_EN 0x03ULL
117#define CONTROL_COMWAIT_EN 0x04ULL
118#define CONTROL_PASSPW_EN 0x08ULL
119#define CONTROL_RESPASSPW_EN 0x09ULL
120#define CONTROL_COHERENT_EN 0x0aULL
121#define CONTROL_ISOC_EN 0x0bULL
122#define CONTROL_CMDBUF_EN 0x0cULL
123#define CONTROL_PPFLOG_EN 0x0dULL
124#define CONTROL_PPFINT_EN 0x0eULL
125
126/* command specific defines */
127#define CMD_COMPL_WAIT 0x01
128#define CMD_INV_DEV_ENTRY 0x02
129#define CMD_INV_IOMMU_PAGES 0x03
130#define CMD_INV_IOTLB_PAGES 0x04
131#define CMD_INV_ALL 0x08
132
133#define CMD_COMPL_WAIT_STORE_MASK 0x01
134#define CMD_COMPL_WAIT_INT_MASK 0x02
135#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
136#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
137
138#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
139
140/* macros and definitions for device table entries */
141#define DEV_ENTRY_VALID 0x00
142#define DEV_ENTRY_TRANSLATION 0x01
143#define DEV_ENTRY_IR 0x3d
144#define DEV_ENTRY_IW 0x3e
145#define DEV_ENTRY_NO_PAGE_FAULT 0x62
146#define DEV_ENTRY_EX 0x67
147#define DEV_ENTRY_SYSMGT1 0x68
148#define DEV_ENTRY_SYSMGT2 0x69
149#define DEV_ENTRY_INIT_PASS 0xb8
150#define DEV_ENTRY_EINT_PASS 0xb9
151#define DEV_ENTRY_NMI_PASS 0xba
152#define DEV_ENTRY_LINT0_PASS 0xbe
153#define DEV_ENTRY_LINT1_PASS 0xbf
154#define DEV_ENTRY_MODE_MASK 0x07
155#define DEV_ENTRY_MODE_SHIFT 0x09
156
157/* constants to configure the command buffer */
158#define CMD_BUFFER_SIZE 8192
159#define CMD_BUFFER_UNINITIALIZED 1
160#define CMD_BUFFER_ENTRIES 512
161#define MMIO_CMD_SIZE_SHIFT 56
162#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
163
164/* constants for event buffer handling */
165#define EVT_BUFFER_SIZE 8192 /* 512 entries */
166#define EVT_LEN_MASK (0x9ULL << 56)
167
168#define PAGE_MODE_NONE 0x00
169#define PAGE_MODE_1_LEVEL 0x01
170#define PAGE_MODE_2_LEVEL 0x02
171#define PAGE_MODE_3_LEVEL 0x03
172#define PAGE_MODE_4_LEVEL 0x04
173#define PAGE_MODE_5_LEVEL 0x05
174#define PAGE_MODE_6_LEVEL 0x06
175
176#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
177#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
178 ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
179 (0xffffffffffffffffULL))
180#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
181#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
182#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
183 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
184#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
185
186#define PM_MAP_4k 0
187#define PM_ADDR_MASK 0x000ffffffffff000ULL
188#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
189 (~((1ULL << (12 + ((lvl) * 9))) - 1)))
190#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
191
192/*
193 * Returns the page table level to use for a given page size
194 * Pagesize is expected to be a power-of-two
195 */
196#define PAGE_SIZE_LEVEL(pagesize) \
197 ((__ffs(pagesize) - 12) / 9)
198/*
199 * Returns the number of ptes to use for a given page size
200 * Pagesize is expected to be a power-of-two
201 */
202#define PAGE_SIZE_PTE_COUNT(pagesize) \
203 (1ULL << ((__ffs(pagesize) - 12) % 9))
204
205/*
206 * Aligns a given io-virtual address to a given page size
207 * Pagesize is expected to be a power-of-two
208 */
209#define PAGE_SIZE_ALIGN(address, pagesize) \
210 ((address) & ~((pagesize) - 1))
211/*
212 * Creates an IOMMU PTE for an address an a given pagesize
213 * The PTE has no permission bits set
214 * Pagesize is expected to be a power-of-two larger than 4096
215 */
216#define PAGE_SIZE_PTE(address, pagesize) \
217 (((address) | ((pagesize) - 1)) & \
218 (~(pagesize >> 1)) & PM_ADDR_MASK)
219
220/*
221 * Takes a PTE value with mode=0x07 and returns the page size it maps
222 */
223#define PTE_PAGE_SIZE(pte) \
224 (1ULL << (1 + ffz(((pte) | 0xfffULL))))
225
226#define IOMMU_PTE_P (1ULL << 0)
227#define IOMMU_PTE_TV (1ULL << 1)
228#define IOMMU_PTE_U (1ULL << 59)
229#define IOMMU_PTE_FC (1ULL << 60)
230#define IOMMU_PTE_IR (1ULL << 61)
231#define IOMMU_PTE_IW (1ULL << 62)
232
233#define DTE_FLAG_IOTLB 0x01
234
235#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
236#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
237#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
238#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
239
240#define IOMMU_PROT_MASK 0x03
241#define IOMMU_PROT_IR 0x01
242#define IOMMU_PROT_IW 0x02
243
244/* IOMMU capabilities */
245#define IOMMU_CAP_IOTLB 24
246#define IOMMU_CAP_NPCACHE 26
247#define IOMMU_CAP_EFR 27
248
249#define MAX_DOMAIN_ID 65536
250
251/* FIXME: move this macro to <linux/pci.h> */
252#define PCI_BUS(x) (((x) >> 8) & 0xff)
253
254/* Protection domain flags */
255#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
256#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
257 domain for an IOMMU */
258#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
259 translation */
260
261extern bool amd_iommu_dump;
262#define DUMP_printk(format, arg...) \
263 do { \
264 if (amd_iommu_dump) \
265 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
266 } while(0);
267
268/* global flag if IOMMUs cache non-present entries */
269extern bool amd_iommu_np_cache;
270/* Only true if all IOMMUs support device IOTLBs */
271extern bool amd_iommu_iotlb_sup;
272
273/*
274 * Make iterating over all IOMMUs easier
275 */
276#define for_each_iommu(iommu) \
277 list_for_each_entry((iommu), &amd_iommu_list, list)
278#define for_each_iommu_safe(iommu, next) \
279 list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
280
281#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
282#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
283#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
284#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
285#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
286#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
287
288/*
289 * This structure contains generic data for IOMMU protection domains
290 * independent of their use.
291 */
292struct protection_domain {
293 struct list_head list; /* for list of all protection domains */
294 struct list_head dev_list; /* List of all devices in this domain */
295 spinlock_t lock; /* mostly used to lock the page table*/
296 struct mutex api_lock; /* protect page tables in the iommu-api path */
297 u16 id; /* the domain id written to the device table */
298 int mode; /* paging mode (0-6 levels) */
299 u64 *pt_root; /* page table root pointer */
300 unsigned long flags; /* flags to find out type of domain */
301 bool updated; /* complete domain flush required */
302 unsigned dev_cnt; /* devices assigned to this domain */
303 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
304 void *priv; /* private data */
305
306};
307
308/*
309 * This struct contains device specific data for the IOMMU
310 */
311struct iommu_dev_data {
312 struct list_head list; /* For domain->dev_list */
313 struct device *dev; /* Device this data belong to */
314 struct device *alias; /* The Alias Device */
315 struct protection_domain *domain; /* Domain the device is bound to */
316 atomic_t bind; /* Domain attach reverent count */
317};
318
319/*
320 * For dynamic growth the aperture size is split into ranges of 128MB of
321 * DMA address space each. This struct represents one such range.
322 */
323struct aperture_range {
324
325 /* address allocation bitmap */
326 unsigned long *bitmap;
327
328 /*
329 * Array of PTE pages for the aperture. In this array we save all the
330 * leaf pages of the domain page table used for the aperture. This way
331 * we don't need to walk the page table to find a specific PTE. We can
332 * just calculate its address in constant time.
333 */
334 u64 *pte_pages[64];
335
336 unsigned long offset;
337};
338
339/*
340 * Data container for a dma_ops specific protection domain
341 */
342struct dma_ops_domain {
343 struct list_head list;
344
345 /* generic protection domain information */
346 struct protection_domain domain;
347
348 /* size of the aperture for the mappings */
349 unsigned long aperture_size;
350
351 /* address we start to search for free addresses */
352 unsigned long next_address;
353
354 /* address space relevant data */
355 struct aperture_range *aperture[APERTURE_MAX_RANGES];
356
357 /* This will be set to true when TLB needs to be flushed */
358 bool need_flush;
359
360 /*
361 * if this is a preallocated domain, keep the device for which it was
362 * preallocated in this variable
363 */
364 u16 target_dev;
365};
366
367/*
368 * Structure where we save information about one hardware AMD IOMMU in the
369 * system.
370 */
371struct amd_iommu {
372 struct list_head list;
373
374 /* Index within the IOMMU array */
375 int index;
376
377 /* locks the accesses to the hardware */
378 spinlock_t lock;
379
380 /* Pointer to PCI device of this IOMMU */
381 struct pci_dev *dev;
382
383 /* physical address of MMIO space */
384 u64 mmio_phys;
385 /* virtual address of MMIO space */
386 u8 *mmio_base;
387
388 /* capabilities of that IOMMU read from ACPI */
389 u32 cap;
390
391 /* flags read from acpi table */
392 u8 acpi_flags;
393
394 /* Extended features */
395 u64 features;
396
397 /*
398 * Capability pointer. There could be more than one IOMMU per PCI
399 * device function if there are more than one AMD IOMMU capability
400 * pointers.
401 */
402 u16 cap_ptr;
403
404 /* pci domain of this IOMMU */
405 u16 pci_seg;
406
407 /* first device this IOMMU handles. read from PCI */
408 u16 first_device;
409 /* last device this IOMMU handles. read from PCI */
410 u16 last_device;
411
412 /* start of exclusion range of that IOMMU */
413 u64 exclusion_start;
414 /* length of exclusion range of that IOMMU */
415 u64 exclusion_length;
416
417 /* command buffer virtual address */
418 u8 *cmd_buf;
419 /* size of command buffer */
420 u32 cmd_buf_size;
421
422 /* size of event buffer */
423 u32 evt_buf_size;
424 /* event buffer virtual address */
425 u8 *evt_buf;
426 /* MSI number for event interrupt */
427 u16 evt_msi_num;
428
429 /* true if interrupts for this IOMMU are already enabled */
430 bool int_enabled;
431
432 /* if one, we need to send a completion wait command */
433 bool need_sync;
434
435 /* default dma_ops domain for that IOMMU */
436 struct dma_ops_domain *default_dom;
437
438 /*
439 * We can't rely on the BIOS to restore all values on reinit, so we
440 * need to stash them
441 */
442
443 /* The iommu BAR */
444 u32 stored_addr_lo;
445 u32 stored_addr_hi;
446
447 /*
448 * Each iommu has 6 l1s, each of which is documented as having 0x12
449 * registers
450 */
451 u32 stored_l1[6][0x12];
452
453 /* The l2 indirect registers */
454 u32 stored_l2[0x83];
455};
456
457/*
458 * List with all IOMMUs in the system. This list is not locked because it is
459 * only written and read at driver initialization or suspend time
460 */
461extern struct list_head amd_iommu_list;
462
463/*
464 * Array with pointers to each IOMMU struct
465 * The indices are referenced in the protection domains
466 */
467extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
468
469/* Number of IOMMUs present in the system */
470extern int amd_iommus_present;
471
472/*
473 * Declarations for the global list of all protection domains
474 */
475extern spinlock_t amd_iommu_pd_lock;
476extern struct list_head amd_iommu_pd_list;
477
478/*
479 * Structure defining one entry in the device table
480 */
481struct dev_table_entry {
482 u32 data[8];
483};
484
485/*
486 * One entry for unity mappings parsed out of the ACPI table.
487 */
488struct unity_map_entry {
489 struct list_head list;
490
491 /* starting device id this entry is used for (including) */
492 u16 devid_start;
493 /* end device id this entry is used for (including) */
494 u16 devid_end;
495
496 /* start address to unity map (including) */
497 u64 address_start;
498 /* end address to unity map (including) */
499 u64 address_end;
500
501 /* required protection */
502 int prot;
503};
504
505/*
506 * List of all unity mappings. It is not locked because as runtime it is only
507 * read. It is created at ACPI table parsing time.
508 */
509extern struct list_head amd_iommu_unity_map;
510
511/*
512 * Data structures for device handling
513 */
514
515/*
516 * Device table used by hardware. Read and write accesses by software are
517 * locked with the amd_iommu_pd_table lock.
518 */
519extern struct dev_table_entry *amd_iommu_dev_table;
520
521/*
522 * Alias table to find requestor ids to device ids. Not locked because only
523 * read on runtime.
524 */
525extern u16 *amd_iommu_alias_table;
526
527/*
528 * Reverse lookup table to find the IOMMU which translates a specific device.
529 */
530extern struct amd_iommu **amd_iommu_rlookup_table;
531
532/* size of the dma_ops aperture as power of 2 */
533extern unsigned amd_iommu_aperture_order;
534
535/* largest PCI device id we expect translation requests for */
536extern u16 amd_iommu_last_bdf;
537
538/* allocation bitmap for domain ids */
539extern unsigned long *amd_iommu_pd_alloc_bitmap;
540
541/*
542 * If true, the addresses will be flushed at unmap time, not when
543 * they are reused
544 */
545extern bool amd_iommu_unmap_flush;
546
547/* takes bus and device/function and returns the device id
548 * FIXME: should that be in generic PCI code? */
549static inline u16 calc_devid(u8 bus, u8 devfn)
550{
551 return (((u16)bus) << 8) | devfn;
552}
553
554#ifdef CONFIG_AMD_IOMMU_STATS
555
556struct __iommu_counter {
557 char *name;
558 struct dentry *dent;
559 u64 value;
560};
561
562#define DECLARE_STATS_COUNTER(nm) \
563 static struct __iommu_counter nm = { \
564 .name = #nm, \
565 }
566
567#define INC_STATS_COUNTER(name) name.value += 1
568#define ADD_STATS_COUNTER(name, x) name.value += (x)
569#define SUB_STATS_COUNTER(name, x) name.value -= (x)
570
571#else /* CONFIG_AMD_IOMMU_STATS */
572
573#define DECLARE_STATS_COUNTER(name)
574#define INC_STATS_COUNTER(name)
575#define ADD_STATS_COUNTER(name, x)
576#define SUB_STATS_COUNTER(name, x)
577
578#endif /* CONFIG_AMD_IOMMU_STATS */
579
580#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
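
The requestor/device ID used to index the device, alias and rlookup tables declared above is just the PCI bus/devfn pair packed into 16 bits, as calc_devid() shows. A minimal sketch of deriving one concrete ID (the bus/slot/function values are arbitrary; PCI_DEVFN() is the standard helper from <linux/pci.h>):

#include <linux/pci.h>

/* sketch: device id for bus 0x3a, slot 2, function 1 */
static u16 example_devid(void)
{
        return calc_devid(0x3a, PCI_DEVFN(2, 1));       /* yields 0x3a11 */
}
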
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 67f87f25761..78a1eff7422 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_AMD_NB_H 1#ifndef _ASM_X86_AMD_NB_H
2#define _ASM_X86_AMD_NB_H 2#define _ASM_X86_AMD_NB_H
3 3
4#include <linux/ioport.h>
4#include <linux/pci.h> 5#include <linux/pci.h>
5 6
6struct amd_nb_bus_dev_range { 7struct amd_nb_bus_dev_range {
@@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 14extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14 15
15extern bool early_is_amd_nb(u32 value); 16extern bool early_is_amd_nb(u32 value);
17extern struct resource *amd_get_mmconfig_range(struct resource *res);
16extern int amd_cache_northbridges(void); 18extern int amd_cache_northbridges(void);
17extern void amd_flush_garts(void); 19extern void amd_flush_garts(void);
18extern int amd_numa_init(void); 20extern int amd_numa_init(void);
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index af60d8a2e28..0acbac299e4 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -18,24 +18,6 @@
18 18
19#ifdef CONFIG_APB_TIMER 19#ifdef CONFIG_APB_TIMER
20 20
21/* Langwell DW APB timer registers */
22#define APBTMR_N_LOAD_COUNT 0x00
23#define APBTMR_N_CURRENT_VALUE 0x04
24#define APBTMR_N_CONTROL 0x08
25#define APBTMR_N_EOI 0x0c
26#define APBTMR_N_INT_STATUS 0x10
27
28#define APBTMRS_INT_STATUS 0xa0
29#define APBTMRS_EOI 0xa4
30#define APBTMRS_RAW_INT_STATUS 0xa8
31#define APBTMRS_COMP_VERSION 0xac
32#define APBTMRS_REG_SIZE 0x14
33
34/* register bits */
35#define APBTMR_CONTROL_ENABLE (1<<0)
36#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */
37#define APBTMR_CONTROL_INT (1<<2)
38
39/* default memory mapped register base */ 21/* default memory mapped register base */
40#define LNW_SCU_ADDR 0xFF100000 22#define LNW_SCU_ADDR 0xFF100000
41#define LNW_EXT_TIMER_OFFSET 0x1B800 23#define LNW_EXT_TIMER_OFFSET 0x1B800
@@ -43,14 +25,13 @@
43#define LNW_EXT_TIMER_PGOFFSET 0x800 25#define LNW_EXT_TIMER_PGOFFSET 0x800
44 26
45/* APBT clock speed range from PCLK to fabric base, 25-100MHz */ 27/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
46#define APBT_MAX_FREQ 50 28#define APBT_MAX_FREQ 50000000
47#define APBT_MIN_FREQ 1 29#define APBT_MIN_FREQ 1000000
48#define APBT_MMAP_SIZE 1024 30#define APBT_MMAP_SIZE 1024
49 31
50#define APBT_DEV_USED 1 32#define APBT_DEV_USED 1
51 33
52extern void apbt_time_init(void); 34extern void apbt_time_init(void);
53extern struct clock_event_device *global_clock_event;
54extern unsigned long apbt_quick_calibrate(void); 35extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); 36extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void); 37extern void apbt_setup_secondary_clock(void);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4a0b7c7e2cc..9b7273cb219 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -8,7 +8,7 @@
8#include <asm/cpufeature.h> 8#include <asm/cpufeature.h>
9#include <asm/processor.h> 9#include <asm/processor.h>
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/atomic.h> 11#include <linux/atomic.h>
12#include <asm/fixmap.h> 12#include <asm/fixmap.h>
13#include <asm/mpspec.h> 13#include <asm/mpspec.h>
14#include <asm/system.h> 14#include <asm/system.h>
@@ -495,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
495 return; 495 return;
496} 496}
497 497
498extern struct apic *generic_bigsmp_probe(void); 498extern void generic_bigsmp_probe(void);
499 499
500 500
501#ifdef CONFIG_X86_LOCAL_APIC 501#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b3ed1e1460f..9412d6558c8 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,9 +3,11 @@
3 3
4#ifdef __ASSEMBLY__ 4#ifdef __ASSEMBLY__
5# define __ASM_FORM(x) x 5# define __ASM_FORM(x) x
6# define __ASM_FORM_COMMA(x) x,
6# define __ASM_EX_SEC .section __ex_table, "a" 7# define __ASM_EX_SEC .section __ex_table, "a"
7#else 8#else
8# define __ASM_FORM(x) " " #x " " 9# define __ASM_FORM(x) " " #x " "
10# define __ASM_FORM_COMMA(x) " " #x ","
9# define __ASM_EX_SEC " .section __ex_table,\"a\"\n" 11# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
10#endif 12#endif
11 13
@@ -15,7 +17,8 @@
15# define __ASM_SEL(a,b) __ASM_FORM(b) 17# define __ASM_SEL(a,b) __ASM_FORM(b)
16#endif 18#endif
17 19
18#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q) 20#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
21 inst##q##__VA_ARGS__)
19#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) 22#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
20 23
21#define _ASM_PTR __ASM_SEL(.long, .quad) 24#define _ASM_PTR __ASM_SEL(.long, .quad)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 952a826ac4e..10572e309ab 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -221,15 +221,15 @@ static inline int atomic_xchg(atomic_t *v, int new)
221} 221}
222 222
223/** 223/**
224 * atomic_add_unless - add unless the number is already a given value 224 * __atomic_add_unless - add unless the number is already a given value
225 * @v: pointer of type atomic_t 225 * @v: pointer of type atomic_t
226 * @a: the amount to add to v... 226 * @a: the amount to add to v...
227 * @u: ...unless v is equal to u. 227 * @u: ...unless v is equal to u.
228 * 228 *
229 * Atomically adds @a to @v, so long as @v was not already @u. 229 * Atomically adds @a to @v, so long as @v was not already @u.
230 * Returns non-zero if @v was not @u, and zero otherwise. 230 * Returns the old value of @v.
231 */ 231 */
232static inline int atomic_add_unless(atomic_t *v, int a, int u) 232static inline int __atomic_add_unless(atomic_t *v, int a, int u)
233{ 233{
234 int c, old; 234 int c, old;
235 c = atomic_read(v); 235 c = atomic_read(v);
@@ -241,10 +241,9 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
241 break; 241 break;
242 c = old; 242 c = old;
243 } 243 }
244 return c != (u); 244 return c;
245} 245}
246 246
247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
248 247
249/* 248/*
250 * atomic_dec_if_positive - decrement by 1 if old value positive 249 * atomic_dec_if_positive - decrement by 1 if old value positive
@@ -319,5 +318,4 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
319# include "atomic64_64.h" 318# include "atomic64_64.h"
320#endif 319#endif
321 320
322#include <asm-generic/atomic-long.h>
323#endif /* _ASM_X86_ATOMIC_H */ 321#endif /* _ASM_X86_ATOMIC_H */
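
With this change the x86 primitive reports the old value instead of a truth value, and the local atomic_inc_not_zero() definition is dropped. A minimal sketch of the generic wrappers this enables, assuming they live in the common <linux/atomic.h>:

/* sketch: generic helpers layered on the renamed primitive */
static inline int atomic_add_unless(atomic_t *v, int a, int u)
{
        return __atomic_add_unless(v, a, u) != u;
}

#define atomic_inc_not_zero(v)  atomic_add_unless((v), 1, 0)
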
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 2a934aa19a4..24098aafce0 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -263,7 +263,7 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v)
263 * @u: ...unless v is equal to u. 263 * @u: ...unless v is equal to u.
264 * 264 *
265 * Atomically adds @a to @v, so long as it was not @u. 265 * Atomically adds @a to @v, so long as it was not @u.
266 * Returns non-zero if @v was not @u, and zero otherwise. 266 * Returns the old value of @v.
267 */ 267 */
268static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) 268static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
269{ 269{
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 49fd1ea2295..017594d403f 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -202,7 +202,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
202 * @u: ...unless v is equal to u. 202 * @u: ...unless v is equal to u.
203 * 203 *
204 * Atomically adds @a to @v, so long as it was not @u. 204 * Atomically adds @a to @v, so long as it was not @u.
205 * Returns non-zero if @v was not @u, and zero otherwise. 205 * Returns the old value of @v.
206 */ 206 */
207static inline int atomic64_add_unless(atomic64_t *v, long a, long u) 207static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
208{ 208{
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 69d58131bc8..1775d6e5920 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -458,10 +458,7 @@ static inline int fls(int x)
458 458
459#include <asm-generic/bitops/le.h> 459#include <asm-generic/bitops/le.h>
460 460
461#define ext2_set_bit_atomic(lock, nr, addr) \ 461#include <asm-generic/bitops/ext2-atomic-setbit.h>
462 test_and_set_bit((nr), (unsigned long *)(addr))
463#define ext2_clear_bit_atomic(lock, nr, addr) \
464 test_and_clear_bit((nr), (unsigned long *)(addr))
465 462
466#endif /* __KERNEL__ */ 463#endif /* __KERNEL__ */
467#endif /* _ASM_X86_BITOPS_H */ 464#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 30af5a83216..a9e3a740f69 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,6 +46,7 @@ For 32-bit we have the following conventions - kernel is built with
46 46
47*/ 47*/
48 48
49#include "dwarf2.h"
49 50
50/* 51/*
51 * 64-bit system call stack frame layout defines and helpers, for 52 * 64-bit system call stack frame layout defines and helpers, for
@@ -84,72 +85,57 @@ For 32-bit we have the following conventions - kernel is built with
84#define ARGOFFSET R11 85#define ARGOFFSET R11
85#define SWFRAME ORIG_RAX 86#define SWFRAME ORIG_RAX
86 87
87 .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0 88 .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
88 subq $9*8+\addskip, %rsp 89 subq $9*8+\addskip, %rsp
89 CFI_ADJUST_CFA_OFFSET 9*8+\addskip 90 CFI_ADJUST_CFA_OFFSET 9*8+\addskip
90 movq %rdi, 8*8(%rsp) 91 movq_cfi rdi, 8*8
91 CFI_REL_OFFSET rdi, 8*8 92 movq_cfi rsi, 7*8
92 movq %rsi, 7*8(%rsp) 93 movq_cfi rdx, 6*8
93 CFI_REL_OFFSET rsi, 7*8 94
94 movq %rdx, 6*8(%rsp) 95 .if \save_rcx
95 CFI_REL_OFFSET rdx, 6*8 96 movq_cfi rcx, 5*8
96 .if \norcx
97 .else
98 movq %rcx, 5*8(%rsp)
99 CFI_REL_OFFSET rcx, 5*8
100 .endif 97 .endif
101 movq %rax, 4*8(%rsp) 98
102 CFI_REL_OFFSET rax, 4*8 99 movq_cfi rax, 4*8
103 .if \nor891011 100
104 .else 101 .if \save_r891011
105 movq %r8, 3*8(%rsp) 102 movq_cfi r8, 3*8
106 CFI_REL_OFFSET r8, 3*8 103 movq_cfi r9, 2*8
107 movq %r9, 2*8(%rsp) 104 movq_cfi r10, 1*8
108 CFI_REL_OFFSET r9, 2*8 105 movq_cfi r11, 0*8
109 movq %r10, 1*8(%rsp)
110 CFI_REL_OFFSET r10, 1*8
111 movq %r11, (%rsp)
112 CFI_REL_OFFSET r11, 0*8
113 .endif 106 .endif
107
114 .endm 108 .endm
115 109
116#define ARG_SKIP (9*8) 110#define ARG_SKIP (9*8)
117 111
118 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ 112 .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
119 skipr8910=0, skiprdx=0 113 rstor_r8910=1, rstor_rdx=1
120 .if \skipr11 114 .if \rstor_r11
121 .else 115 movq_cfi_restore 0*8, r11
122 movq (%rsp), %r11
123 CFI_RESTORE r11
124 .endif 116 .endif
125 .if \skipr8910 117
126 .else 118 .if \rstor_r8910
127 movq 1*8(%rsp), %r10 119 movq_cfi_restore 1*8, r10
128 CFI_RESTORE r10 120 movq_cfi_restore 2*8, r9
129 movq 2*8(%rsp), %r9 121 movq_cfi_restore 3*8, r8
130 CFI_RESTORE r9
131 movq 3*8(%rsp), %r8
132 CFI_RESTORE r8
133 .endif 122 .endif
134 .if \skiprax 123
135 .else 124 .if \rstor_rax
136 movq 4*8(%rsp), %rax 125 movq_cfi_restore 4*8, rax
137 CFI_RESTORE rax
138 .endif 126 .endif
139 .if \skiprcx 127
140 .else 128 .if \rstor_rcx
141 movq 5*8(%rsp), %rcx 129 movq_cfi_restore 5*8, rcx
142 CFI_RESTORE rcx
143 .endif 130 .endif
144 .if \skiprdx 131
145 .else 132 .if \rstor_rdx
146 movq 6*8(%rsp), %rdx 133 movq_cfi_restore 6*8, rdx
147 CFI_RESTORE rdx
148 .endif 134 .endif
149 movq 7*8(%rsp), %rsi 135
150 CFI_RESTORE rsi 136 movq_cfi_restore 7*8, rsi
151 movq 8*8(%rsp), %rdi 137 movq_cfi_restore 8*8, rdi
152 CFI_RESTORE rdi 138
153 .if ARG_SKIP+\addskip > 0 139 .if ARG_SKIP+\addskip > 0
154 addq $ARG_SKIP+\addskip, %rsp 140 addq $ARG_SKIP+\addskip, %rsp
155 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) 141 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
@@ -176,33 +162,21 @@ For 32-bit we have the following conventions - kernel is built with
176 .macro SAVE_REST 162 .macro SAVE_REST
177 subq $REST_SKIP, %rsp 163 subq $REST_SKIP, %rsp
178 CFI_ADJUST_CFA_OFFSET REST_SKIP 164 CFI_ADJUST_CFA_OFFSET REST_SKIP
179 movq %rbx, 5*8(%rsp) 165 movq_cfi rbx, 5*8
180 CFI_REL_OFFSET rbx, 5*8 166 movq_cfi rbp, 4*8
181 movq %rbp, 4*8(%rsp) 167 movq_cfi r12, 3*8
182 CFI_REL_OFFSET rbp, 4*8 168 movq_cfi r13, 2*8
183 movq %r12, 3*8(%rsp) 169 movq_cfi r14, 1*8
184 CFI_REL_OFFSET r12, 3*8 170 movq_cfi r15, 0*8
185 movq %r13, 2*8(%rsp)
186 CFI_REL_OFFSET r13, 2*8
187 movq %r14, 1*8(%rsp)
188 CFI_REL_OFFSET r14, 1*8
189 movq %r15, (%rsp)
190 CFI_REL_OFFSET r15, 0*8
191 .endm 171 .endm
192 172
193 .macro RESTORE_REST 173 .macro RESTORE_REST
194 movq (%rsp), %r15 174 movq_cfi_restore 0*8, r15
195 CFI_RESTORE r15 175 movq_cfi_restore 1*8, r14
196 movq 1*8(%rsp), %r14 176 movq_cfi_restore 2*8, r13
197 CFI_RESTORE r14 177 movq_cfi_restore 3*8, r12
198 movq 2*8(%rsp), %r13 178 movq_cfi_restore 4*8, rbp
199 CFI_RESTORE r13 179 movq_cfi_restore 5*8, rbx
200 movq 3*8(%rsp), %r12
201 CFI_RESTORE r12
202 movq 4*8(%rsp), %rbp
203 CFI_RESTORE rbp
204 movq 5*8(%rsp), %rbx
205 CFI_RESTORE rbx
206 addq $REST_SKIP, %rsp 180 addq $REST_SKIP, %rsp
207 CFI_ADJUST_CFA_OFFSET -(REST_SKIP) 181 CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
208 .endm 182 .endm
@@ -214,7 +188,7 @@ For 32-bit we have the following conventions - kernel is built with
214 188
215 .macro RESTORE_ALL addskip=0 189 .macro RESTORE_ALL addskip=0
216 RESTORE_REST 190 RESTORE_REST
217 RESTORE_ARGS 0, \addskip 191 RESTORE_ARGS 1, \addskip
218 .endm 192 .endm
219 193
220 .macro icebp 194 .macro icebp
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 00000000000..0bdbbb3b9ce
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,18 @@
1/* x86-specific clocksource additions */
2
3#ifndef _ASM_X86_CLOCKSOURCE_H
4#define _ASM_X86_CLOCKSOURCE_H
5
6#ifdef CONFIG_X86_64
7
8#define VCLOCK_NONE 0 /* No vDSO clock available. */
9#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
10#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
11
12struct arch_clocksource_data {
13 int vclock_mode;
14};
15
16#endif /* CONFIG_X86_64 */
17
18#endif /* _ASM_X86_CLOCKSOURCE_H */
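
A hedged sketch of how an x86-64 clocksource could advertise its vDSO clock mode through this hook, assuming struct clocksource carries an archdata member of the type defined here (the clocksource itself is illustrative):

#include <linux/clocksource.h>
#include <asm/tsc.h>

static cycle_t example_read_tsc(struct clocksource *cs)
{
        return (cycle_t)get_cycles();
}

static struct clocksource example_tsc_clocksource = {
        .name           = "tsc-example",
        .rating         = 300,
        .read           = example_read_tsc,
        .mask           = CLOCKSOURCE_MASK(64),
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
        .archdata       = { .vclock_mode = VCLOCK_TSC },
};
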
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 284a6e8f7ce..3deb7250624 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -280,4 +280,52 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
280 280
281#endif 281#endif
282 282
283#define cmpxchg8b(ptr, o1, o2, n1, n2) \
284({ \
285 char __ret; \
286 __typeof__(o2) __dummy; \
287 __typeof__(*(ptr)) __old1 = (o1); \
288 __typeof__(o2) __old2 = (o2); \
289 __typeof__(*(ptr)) __new1 = (n1); \
290 __typeof__(o2) __new2 = (n2); \
291 asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \
292 : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
293 : "a" (__old1), "d"(__old2), \
294 "b" (__new1), "c" (__new2) \
295 : "memory"); \
296 __ret; })
297
298
299#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \
300({ \
301 char __ret; \
302 __typeof__(o2) __dummy; \
303 __typeof__(*(ptr)) __old1 = (o1); \
304 __typeof__(o2) __old2 = (o2); \
305 __typeof__(*(ptr)) __new1 = (n1); \
306 __typeof__(o2) __new2 = (n2); \
307 asm volatile("cmpxchg8b %2; setz %1" \
308 : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
309 : "a" (__old1), "d"(__old2), \
310 "b" (__new1), "c" (__new2) \
311 : "memory"); \
312 __ret; })
313
314
315#define cmpxchg_double(ptr, o1, o2, n1, n2) \
316({ \
317 BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
318 VM_BUG_ON((unsigned long)(ptr) % 8); \
319 cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \
320})
321
322#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
323({ \
324 BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
325 VM_BUG_ON((unsigned long)(ptr) % 8); \
326 cmpxchg8b_local((ptr), (o1), (o2), (n1), (n2)); \
327})
328
329#define system_has_cmpxchg_double() cpu_has_cx8
330
283#endif /* _ASM_X86_CMPXCHG_32_H */ 331#endif /* _ASM_X86_CMPXCHG_32_H */
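
A minimal sketch of what the new 32-bit double-word primitive is for: swapping a value and its adjacent sequence counter as one atomic unit. The struct and field names are illustrative; the hard requirements are two adjacent 32-bit words, 8-byte alignment and a CPU with cmpxchg8b (cpu_has_cx8).

/* sketch: value + sequence counter updated atomically */
struct val_seq {
        unsigned int val;       /* first word: compared via EAX, replaced from EBX */
        unsigned int seq;       /* second word: compared via EDX, replaced from ECX */
} __attribute__((aligned(8)));

static int update_val(struct val_seq *p, unsigned int new_val)
{
        unsigned int old_val = p->val;
        unsigned int old_seq = p->seq;

        /* returns 0 if either word changed since it was read */
        return cmpxchg_double(&p->val, old_val, old_seq,
                              new_val, old_seq + 1);
}
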
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 423ae58aa02..7cf5c0a2443 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -151,4 +151,49 @@ extern void __cmpxchg_wrong_size(void);
151 cmpxchg_local((ptr), (o), (n)); \ 151 cmpxchg_local((ptr), (o), (n)); \
152}) 152})
153 153
154#define cmpxchg16b(ptr, o1, o2, n1, n2) \
155({ \
156 char __ret; \
157 __typeof__(o2) __junk; \
158 __typeof__(*(ptr)) __old1 = (o1); \
159 __typeof__(o2) __old2 = (o2); \
160 __typeof__(*(ptr)) __new1 = (n1); \
161 __typeof__(o2) __new2 = (n2); \
162 asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \
163 : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
164 : "b"(__new1), "c"(__new2), \
165 "a"(__old1), "d"(__old2)); \
166 __ret; })
167
168
169#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \
170({ \
171 char __ret; \
172 __typeof__(o2) __junk; \
173 __typeof__(*(ptr)) __old1 = (o1); \
174 __typeof__(o2) __old2 = (o2); \
175 __typeof__(*(ptr)) __new1 = (n1); \
176 __typeof__(o2) __new2 = (n2); \
177 asm volatile("cmpxchg16b %2;setz %1" \
178 : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
179 : "b"(__new1), "c"(__new2), \
180 "a"(__old1), "d"(__old2)); \
181 __ret; })
182
183#define cmpxchg_double(ptr, o1, o2, n1, n2) \
184({ \
185 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
186 VM_BUG_ON((unsigned long)(ptr) % 16); \
187 cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \
188})
189
190#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
191({ \
192 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
193 VM_BUG_ON((unsigned long)(ptr) % 16); \
194 cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
195})
196
197#define system_has_cmpxchg_double() cpu_has_cx16
198
154#endif /* _ASM_X86_CMPXCHG_64_H */ 199#endif /* _ASM_X86_CMPXCHG_64_H */
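
The 64-bit variant mirrors the 32-bit one with 8-byte words and 16-byte alignment; since not every x86-64 CPU has cmpxchg16b, callers are expected to gate on the feature test added here. A hedged sketch (the freelist layout and the lock fallback are illustrative):

#include <linux/types.h>

struct freelist {
        void            *head;          /* first 8-byte word */
        unsigned long   counter;        /* second 8-byte word */
} __attribute__((aligned(16)));

static bool freelist_replace(struct freelist *fl, void *old_head,
                             unsigned long old_counter, void *new_head)
{
        if (!system_has_cmpxchg_double())
                return false;           /* caller falls back to a lock */

        return cmpxchg_double(&fl->head, old_head, old_counter,
                              new_head, old_counter + 1);
}
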
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 71cc3800712..88b23a43f34 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -288,6 +288,8 @@ extern const char * const x86_power_flags[32];
288#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 288#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
289#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) 289#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
290#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) 290#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
291#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8)
292#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
291 293
292#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 294#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
293# define cpu_has_invlpg 1 295# define cpu_has_invlpg 1
@@ -330,9 +332,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
330 asm goto("1: jmp %l[t_no]\n" 332 asm goto("1: jmp %l[t_no]\n"
331 "2:\n" 333 "2:\n"
332 ".section .altinstructions,\"a\"\n" 334 ".section .altinstructions,\"a\"\n"
333 _ASM_ALIGN "\n" 335 " .long 1b - .\n"
334 _ASM_PTR "1b\n" 336 " .long 0\n" /* no replacement */
335 _ASM_PTR "0\n" /* no replacement */
336 " .word %P0\n" /* feature bit */ 337 " .word %P0\n" /* feature bit */
337 " .byte 2b - 1b\n" /* source len */ 338 " .byte 2b - 1b\n" /* source len */
338 " .byte 0\n" /* replacement len */ 339 " .byte 0\n" /* replacement len */
@@ -348,9 +349,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
348 asm volatile("1: movb $0,%0\n" 349 asm volatile("1: movb $0,%0\n"
349 "2:\n" 350 "2:\n"
350 ".section .altinstructions,\"a\"\n" 351 ".section .altinstructions,\"a\"\n"
351 _ASM_ALIGN "\n" 352 " .long 1b - .\n"
352 _ASM_PTR "1b\n" 353 " .long 3f - .\n"
353 _ASM_PTR "3f\n"
354 " .word %P1\n" /* feature bit */ 354 " .word %P1\n" /* feature bit */
355 " .byte 2b - 1b\n" /* source len */ 355 " .byte 2b - 1b\n" /* source len */
356 " .byte 4f - 3f\n" /* replacement len */ 356 " .byte 4f - 3f\n" /* replacement len */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 409a649204a..9b3b4f2754c 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -1,30 +1,7 @@
1#ifndef _ASM_X86_DELAY_H 1#ifndef _ASM_X86_DELAY_H
2#define _ASM_X86_DELAY_H 2#define _ASM_X86_DELAY_H
3 3
4/* 4#include <asm-generic/delay.h>
5 * Copyright (C) 1993 Linus Torvalds
6 *
7 * Delay routines calling functions in arch/x86/lib/delay.c
8 */
9
10/* Undefined functions to get compile-time errors */
11extern void __bad_udelay(void);
12extern void __bad_ndelay(void);
13
14extern void __udelay(unsigned long usecs);
15extern void __ndelay(unsigned long nsecs);
16extern void __const_udelay(unsigned long xloops);
17extern void __delay(unsigned long loops);
18
19/* 0x10c7 is 2**32 / 1000000 (rounded up) */
20#define udelay(n) (__builtin_constant_p(n) ? \
21 ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
22 __udelay(n))
23
24/* 0x5 is 2**32 / 1000000000 (rounded up) */
25#define ndelay(n) (__builtin_constant_p(n) ? \
26 ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
27 __ndelay(n))
28 5
29void use_tsc_delay(void); 6void use_tsc_delay(void);
30 7
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 7b439d9aea2..41935fadfdf 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -27,8 +27,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
27 27
28 desc->base2 = (info->base_addr & 0xff000000) >> 24; 28 desc->base2 = (info->base_addr & 0xff000000) >> 24;
29 /* 29 /*
30 * Don't allow setting of the lm bit. It is useless anyway 30 * Don't allow setting of the lm bit. It would confuse
31 * because 64bit system calls require __USER_CS: 31 * user_64bit_mode and would get overridden by sysret anyway.
32 */ 32 */
33 desc->l = 0; 33 desc->l = 0;
34} 34}
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 029f230ab63..63a2a03d7d5 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,7 +8,7 @@ struct dev_archdata {
8#ifdef CONFIG_X86_64 8#ifdef CONFIG_X86_64
9struct dma_map_ops *dma_ops; 9struct dma_map_ops *dma_ops;
10#endif 10#endif
11#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU) 11#if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU)
12 void *iommu; /* hook for IOMMU specific extension */ 12 void *iommu; /* hook for IOMMU specific extension */
13#endif 13#endif
14}; 14};
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 3b0d7ef959b..e2c555f2191 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -54,8 +54,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
54BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) 54BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
55#endif 55#endif
56 56
57#ifdef CONFIG_X86_MCE
58BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
59#endif
60
61#endif 57#endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4729b2b6311..460c74e4852 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,6 +78,7 @@ enum fixed_addresses {
78 VSYSCALL_LAST_PAGE, 78 VSYSCALL_LAST_PAGE,
79 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE 79 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
80 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, 80 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
81 VVAR_PAGE,
81 VSYSCALL_HPET, 82 VSYSCALL_HPET,
82#endif 83#endif
83 FIX_DBGP_BASE, 84 FIX_DBGP_BASE,
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 2c6fc9e6281..3b629f47eb6 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,5 +1,6 @@
1#ifdef __ASSEMBLY__ 1#ifdef __ASSEMBLY__
2 2
3#include <asm/asm.h>
3#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
4 5
5/* The annotation hides the frame from the unwinder and makes it look 6/* The annotation hides the frame from the unwinder and makes it look
@@ -7,13 +8,13 @@
7 frame pointer later */ 8 frame pointer later */
8#ifdef CONFIG_FRAME_POINTER 9#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME 10 .macro FRAME
10 pushl_cfi %ebp 11 __ASM_SIZE(push,_cfi) %__ASM_REG(bp)
11 CFI_REL_OFFSET ebp,0 12 CFI_REL_OFFSET __ASM_REG(bp), 0
12 movl %esp,%ebp 13 __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp)
13 .endm 14 .endm
14 .macro ENDFRAME 15 .macro ENDFRAME
15 popl_cfi %ebp 16 __ASM_SIZE(pop,_cfi) %__ASM_REG(bp)
16 CFI_RESTORE ebp 17 CFI_RESTORE __ASM_REG(bp)
17 .endm 18 .endm
18#else 19#else
19 .macro FRAME 20 .macro FRAME
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index c490d89a9b7..8f1e5445d37 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -21,7 +21,7 @@
21#include <linux/profile.h> 21#include <linux/profile.h>
22#include <linux/smp.h> 22#include <linux/smp.h>
23 23
24#include <asm/atomic.h> 24#include <linux/atomic.h>
25#include <asm/irq.h> 25#include <asm/irq.h>
26#include <asm/sections.h> 26#include <asm/sections.h>
27 27
@@ -34,7 +34,6 @@ extern void irq_work_interrupt(void);
34extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
35extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
36extern void reschedule_interrupt(void); 36extern void reschedule_interrupt(void);
37extern void mce_self_interrupt(void);
38 37
39extern void invalidate_interrupt(void); 38extern void invalidate_interrupt(void);
40extern void invalidate_interrupt0(void); 39extern void invalidate_interrupt0(void);
@@ -122,7 +121,7 @@ struct irq_cfg {
122 cpumask_var_t old_domain; 121 cpumask_var_t old_domain;
123 u8 vector; 122 u8 vector;
124 u8 move_in_progress : 1; 123 u8 move_in_progress : 1;
125#ifdef CONFIG_INTR_REMAP 124#ifdef CONFIG_IRQ_REMAP
126 struct irq_2_iommu irq_2_iommu; 125 struct irq_2_iommu irq_2_iommu;
127#endif 126#endif
128}; 127};
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
deleted file mode 100644
index 65aaa91d585..00000000000
--- a/arch/x86/include/asm/i8253.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef _ASM_X86_I8253_H
2#define _ASM_X86_I8253_H
3
4/* i8253A PIT registers */
5#define PIT_MODE 0x43
6#define PIT_CH0 0x40
7#define PIT_CH2 0x42
8
9#define PIT_LATCH LATCH
10
11extern raw_spinlock_t i8253_lock;
12
13extern struct clock_event_device *global_clock_event;
14
15extern void setup_pit_timer(void);
16
17#define inb_pit inb_p
18#define outb_pit outb_p
19
20#endif /* _ASM_X86_I8253_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index f49253d7571..f1e4268ef3c 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -1,13 +1,6 @@
1#ifndef _ASM_X86_IDLE_H 1#ifndef _ASM_X86_IDLE_H
2#define _ASM_X86_IDLE_H 2#define _ASM_X86_IDLE_H
3 3
4#define IDLE_START 1
5#define IDLE_END 2
6
7struct notifier_block;
8void idle_notifier_register(struct notifier_block *n);
9void idle_notifier_unregister(struct notifier_block *n);
10
11#ifdef CONFIG_X86_64 4#ifdef CONFIG_X86_64
12void enter_idle(void); 5void enter_idle(void);
13void exit_idle(void); 6void exit_idle(void);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d02804d650c..d8e8eefbe24 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,8 +40,6 @@
40#include <linux/compiler.h> 40#include <linux/compiler.h>
41#include <asm/page.h> 41#include <asm/page.h>
42 42
43#include <xen/xen.h>
44
45#define build_mmio_read(name, size, type, reg, barrier) \ 43#define build_mmio_read(name, size, type, reg, barrier) \
46static inline type name(const volatile void __iomem *addr) \ 44static inline type name(const volatile void __iomem *addr) \
47{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ 45{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -334,6 +332,7 @@ extern void fixup_early_ioremap(void);
334extern bool is_early_ioremap_ptep(pte_t *ptep); 332extern bool is_early_ioremap_ptep(pte_t *ptep);
335 333
336#ifdef CONFIG_XEN 334#ifdef CONFIG_XEN
335#include <xen/xen.h>
337struct bio_vec; 336struct bio_vec;
338 337
339extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, 338extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 345c99cef15..dffc38ee625 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops;
5extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
6extern int iommu_detected; 6extern int iommu_detected;
7extern int iommu_pass_through; 7extern int iommu_pass_through;
8extern int iommu_group_mf;
8 9
9/* 10 seconds */ 10/* 10 seconds */
10#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 1c23360fb2d..47d99934580 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,7 +3,8 @@
3 3
4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
5 5
6#ifdef CONFIG_INTR_REMAP 6#ifdef CONFIG_IRQ_REMAP
7static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
7static inline void prepare_irte(struct irte *irte, int vector, 8static inline void prepare_irte(struct irte *irte, int vector,
8 unsigned int dest) 9 unsigned int dest)
9{ 10{
@@ -36,6 +37,9 @@ static inline bool irq_remapped(struct irq_cfg *cfg)
36{ 37{
37 return false; 38 return false;
38} 39}
40static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
41{
42}
39#endif 43#endif
40 44
41#endif /* _ASM_X86_IRQ_REMAPPING_H */ 45#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 99a44cf9845..7de6ad70365 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -17,7 +17,7 @@
17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
18 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
19 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
22 * 22 *
23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
@@ -109,11 +109,6 @@
109 109
110#define UV_BAU_MESSAGE 0xf5 110#define UV_BAU_MESSAGE 0xf5
111 111
112/*
113 * Self IPI vector for machine checks
114 */
115#define MCE_SELF_VECTOR 0xf4
116
117/* Xen vector callback to receive events in a HVM domain */ 112/* Xen vector callback to receive events in a HVM domain */
118#define XEN_HVM_EVTCHN_CALLBACK 0xf3 113#define XEN_HVM_EVTCHN_CALLBACK 0xf3
119 114
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 5745ce8bf10..bba3cf88e62 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -60,23 +60,24 @@ static inline void native_halt(void)
60#include <asm/paravirt.h> 60#include <asm/paravirt.h>
61#else 61#else
62#ifndef __ASSEMBLY__ 62#ifndef __ASSEMBLY__
63#include <linux/types.h>
63 64
64static inline unsigned long arch_local_save_flags(void) 65static inline notrace unsigned long arch_local_save_flags(void)
65{ 66{
66 return native_save_fl(); 67 return native_save_fl();
67} 68}
68 69
69static inline void arch_local_irq_restore(unsigned long flags) 70static inline notrace void arch_local_irq_restore(unsigned long flags)
70{ 71{
71 native_restore_fl(flags); 72 native_restore_fl(flags);
72} 73}
73 74
74static inline void arch_local_irq_disable(void) 75static inline notrace void arch_local_irq_disable(void)
75{ 76{
76 native_irq_disable(); 77 native_irq_disable();
77} 78}
78 79
79static inline void arch_local_irq_enable(void) 80static inline notrace void arch_local_irq_enable(void)
80{ 81{
81 native_irq_enable(); 82 native_irq_enable();
82} 83}
@@ -102,7 +103,7 @@ static inline void halt(void)
102/* 103/*
103 * For spinlocks, etc: 104 * For spinlocks, etc:
104 */ 105 */
105static inline unsigned long arch_local_irq_save(void) 106static inline notrace unsigned long arch_local_irq_save(void)
106{ 107{
107 unsigned long flags = arch_local_save_flags(); 108 unsigned long flags = arch_local_save_flags();
108 arch_local_irq_disable(); 109 arch_local_irq_disable();
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fe2cc6e105f..d73f1571bde 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -28,7 +28,6 @@ extern void show_registers(struct pt_regs *regs);
28extern void show_trace(struct task_struct *t, struct pt_regs *regs, 28extern void show_trace(struct task_struct *t, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp); 29 unsigned long *sp, unsigned long bp);
30extern void __show_regs(struct pt_regs *regs, int all); 30extern void __show_regs(struct pt_regs *regs, int all);
31extern void show_regs(struct pt_regs *regs);
32extern unsigned long oops_begin(void); 31extern unsigned long oops_begin(void);
33extern void oops_end(unsigned long, struct pt_regs *, int signr); 32extern void oops_end(unsigned long, struct pt_regs *, int signr);
34#ifdef CONFIG_KEXEC 33#ifdef CONFIG_KEXEC
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0049211959c..6040d115ef5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -229,7 +229,26 @@ struct read_cache {
229 unsigned long end; 229 unsigned long end;
230}; 230};
231 231
232struct decode_cache { 232struct x86_emulate_ctxt {
233 struct x86_emulate_ops *ops;
234
235 /* Register state before/after emulation. */
236 unsigned long eflags;
237 unsigned long eip; /* eip before instruction emulation */
238 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
239 int mode;
240
241 /* interruptibility state, as a result of execution of STI or MOV SS */
242 int interruptibility;
243
244 bool guest_mode; /* guest running a nested guest */
245 bool perm_ok; /* do not check permissions if true */
246 bool only_vendor_specific_insn;
247
248 bool have_exception;
249 struct x86_exception exception;
250
251 /* decode cache */
233 u8 twobyte; 252 u8 twobyte;
234 u8 b; 253 u8 b;
235 u8 intercept; 254 u8 intercept;
@@ -246,8 +265,6 @@ struct decode_cache {
246 unsigned int d; 265 unsigned int d;
247 int (*execute)(struct x86_emulate_ctxt *ctxt); 266 int (*execute)(struct x86_emulate_ctxt *ctxt);
248 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 267 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
249 unsigned long regs[NR_VCPU_REGS];
250 unsigned long eip;
251 /* modrm */ 268 /* modrm */
252 u8 modrm; 269 u8 modrm;
253 u8 modrm_mod; 270 u8 modrm_mod;
@@ -255,34 +272,14 @@ struct decode_cache {
255 u8 modrm_rm; 272 u8 modrm_rm;
256 u8 modrm_seg; 273 u8 modrm_seg;
257 bool rip_relative; 274 bool rip_relative;
275 unsigned long _eip;
276 /* Fields above regs are cleared together. */
277 unsigned long regs[NR_VCPU_REGS];
258 struct fetch_cache fetch; 278 struct fetch_cache fetch;
259 struct read_cache io_read; 279 struct read_cache io_read;
260 struct read_cache mem_read; 280 struct read_cache mem_read;
261}; 281};
262 282
263struct x86_emulate_ctxt {
264 struct x86_emulate_ops *ops;
265
266 /* Register state before/after emulation. */
267 unsigned long eflags;
268 unsigned long eip; /* eip before instruction emulation */
269 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
270 int mode;
271
272 /* interruptibility state, as a result of execution of STI or MOV SS */
273 int interruptibility;
274
275 bool guest_mode; /* guest running a nested guest */
276 bool perm_ok; /* do not check permissions if true */
277 bool only_vendor_specific_insn;
278
279 bool have_exception;
280 struct x86_exception exception;
281
282 /* decode cache */
283 struct decode_cache decode;
284};
285
286/* Repeat String Operation Prefix */ 283/* Repeat String Operation Prefix */
287#define REPE_PREFIX 0xf3 284#define REPE_PREFIX 0xf3
288#define REPNE_PREFIX 0xf2 285#define REPNE_PREFIX 0xf2
@@ -373,6 +370,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
373int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 370int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
374 u16 tss_selector, int reason, 371 u16 tss_selector, int reason,
375 bool has_error_code, u32 error_code); 372 bool has_error_code, u32 error_code);
376int emulate_int_real(struct x86_emulate_ctxt *ctxt, 373int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
377 struct x86_emulate_ops *ops, int irq);
378#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 374#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2ee89..dd51c83aa5d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,7 +48,7 @@
48 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 48 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
49 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 49 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
50 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 50 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
51 | X86_CR4_OSXSAVE \ 51 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
53 53
54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -205,6 +205,7 @@ union kvm_mmu_page_role {
205 unsigned invalid:1; 205 unsigned invalid:1;
206 unsigned nxe:1; 206 unsigned nxe:1;
207 unsigned cr0_wp:1; 207 unsigned cr0_wp:1;
208 unsigned smep_andnot_wp:1;
208 }; 209 };
209}; 210};
210 211
@@ -227,15 +228,17 @@ struct kvm_mmu_page {
227 * in this shadow page. 228 * in this shadow page.
228 */ 229 */
229 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 230 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
230 bool multimapped; /* More than one parent_pte? */
231 bool unsync; 231 bool unsync;
232 int root_count; /* Currently serving as active root */ 232 int root_count; /* Currently serving as active root */
233 unsigned int unsync_children; 233 unsigned int unsync_children;
234 union { 234 unsigned long parent_ptes; /* Reverse mapping for parent_pte */
235 u64 *parent_pte; /* !multimapped */
236 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
237 };
238 DECLARE_BITMAP(unsync_child_bitmap, 512); 235 DECLARE_BITMAP(unsync_child_bitmap, 512);
236
237#ifdef CONFIG_X86_32
238 int clear_spte_count;
239#endif
240
241 struct rcu_head rcu;
239}; 242};
240 243
241struct kvm_pv_mmu_op_buffer { 244struct kvm_pv_mmu_op_buffer {
@@ -269,8 +272,6 @@ struct kvm_mmu {
269 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 272 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
270 struct x86_exception *exception); 273 struct x86_exception *exception);
271 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 274 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
272 void (*prefetch_page)(struct kvm_vcpu *vcpu,
273 struct kvm_mmu_page *page);
274 int (*sync_page)(struct kvm_vcpu *vcpu, 275 int (*sync_page)(struct kvm_vcpu *vcpu,
275 struct kvm_mmu_page *sp); 276 struct kvm_mmu_page *sp);
276 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 277 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -346,8 +347,7 @@ struct kvm_vcpu_arch {
346 * put it here to avoid allocation */ 347 * put it here to avoid allocation */
347 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 348 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
348 349
349 struct kvm_mmu_memory_cache mmu_pte_chain_cache; 350 struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
350 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
351 struct kvm_mmu_memory_cache mmu_page_cache; 351 struct kvm_mmu_memory_cache mmu_page_cache;
352 struct kvm_mmu_memory_cache mmu_page_header_cache; 352 struct kvm_mmu_memory_cache mmu_page_header_cache;
353 353
@@ -393,6 +393,15 @@ struct kvm_vcpu_arch {
393 unsigned int hw_tsc_khz; 393 unsigned int hw_tsc_khz;
394 unsigned int time_offset; 394 unsigned int time_offset;
395 struct page *time_page; 395 struct page *time_page;
396
397 struct {
398 u64 msr_val;
399 u64 last_steal;
400 u64 accum_steal;
401 struct gfn_to_hva_cache stime;
402 struct kvm_steal_time steal;
403 } st;
404
396 u64 last_guest_tsc; 405 u64 last_guest_tsc;
397 u64 last_kernel_ns; 406 u64 last_kernel_ns;
398 u64 last_tsc_nsec; 407 u64 last_tsc_nsec;
@@ -419,6 +428,11 @@ struct kvm_vcpu_arch {
419 u64 mcg_ctl; 428 u64 mcg_ctl;
420 u64 *mce_banks; 429 u64 *mce_banks;
421 430
431 /* Cache MMIO info */
432 u64 mmio_gva;
433 unsigned access;
434 gfn_t mmio_gfn;
435
422 /* used for guest single stepping over the given code position */ 436 /* used for guest single stepping over the given code position */
423 unsigned long singlestep_rip; 437 unsigned long singlestep_rip;
424 438
@@ -441,6 +455,7 @@ struct kvm_arch {
441 unsigned int n_used_mmu_pages; 455 unsigned int n_used_mmu_pages;
442 unsigned int n_requested_mmu_pages; 456 unsigned int n_requested_mmu_pages;
443 unsigned int n_max_mmu_pages; 457 unsigned int n_max_mmu_pages;
458 unsigned int indirect_shadow_pages;
444 atomic_t invlpg_counter; 459 atomic_t invlpg_counter;
445 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 460 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
446 /* 461 /*
@@ -477,6 +492,8 @@ struct kvm_arch {
477 u64 hv_guest_os_id; 492 u64 hv_guest_os_id;
478 u64 hv_hypercall; 493 u64 hv_hypercall;
479 494
495 atomic_t reader_counter;
496
480 #ifdef CONFIG_KVM_MMU_AUDIT 497 #ifdef CONFIG_KVM_MMU_AUDIT
481 int audit_point; 498 int audit_point;
482 #endif 499 #endif
@@ -559,7 +576,7 @@ struct kvm_x86_ops {
559 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 576 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
560 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 577 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
561 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 578 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
562 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); 579 int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
563 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); 580 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
564 void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 581 void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
565 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 582 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -636,7 +653,6 @@ void kvm_mmu_module_exit(void);
636void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 653void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
637int kvm_mmu_create(struct kvm_vcpu *vcpu); 654int kvm_mmu_create(struct kvm_vcpu *vcpu);
638int kvm_mmu_setup(struct kvm_vcpu *vcpu); 655int kvm_mmu_setup(struct kvm_vcpu *vcpu);
639void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
640void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 656void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
641 u64 dirty_mask, u64 nx_mask, u64 x_mask); 657 u64 dirty_mask, u64 nx_mask, u64 x_mask);
642 658
@@ -830,11 +846,12 @@ enum {
830asmlinkage void kvm_spurious_fault(void); 846asmlinkage void kvm_spurious_fault(void);
831extern bool kvm_rebooting; 847extern bool kvm_rebooting;
832 848
833#define __kvm_handle_fault_on_reboot(insn) \ 849#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
834 "666: " insn "\n\t" \ 850 "666: " insn "\n\t" \
835 "668: \n\t" \ 851 "668: \n\t" \
836 ".pushsection .fixup, \"ax\" \n" \ 852 ".pushsection .fixup, \"ax\" \n" \
837 "667: \n\t" \ 853 "667: \n\t" \
854 cleanup_insn "\n\t" \
838 "cmpb $0, kvm_rebooting \n\t" \ 855 "cmpb $0, kvm_rebooting \n\t" \
839 "jne 668b \n\t" \ 856 "jne 668b \n\t" \
840 __ASM_SIZE(push) " $666b \n\t" \ 857 __ASM_SIZE(push) " $666b \n\t" \
@@ -844,6 +861,9 @@ extern bool kvm_rebooting;
844 _ASM_PTR " 666b, 667b \n\t" \ 861 _ASM_PTR " 666b, 667b \n\t" \
845 ".popsection" 862 ".popsection"
846 863
864#define __kvm_handle_fault_on_reboot(insn) \
865 ____kvm_handle_fault_on_reboot(insn, "")
866
847#define KVM_ARCH_WANT_MMU_NOTIFIER 867#define KVM_ARCH_WANT_MMU_NOTIFIER
848int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 868int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
849int kvm_age_hva(struct kvm *kvm, unsigned long hva); 869int kvm_age_hva(struct kvm *kvm, unsigned long hva);
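
The new cleanup_insn argument lets a caller run one instruction in the fixup path before the reboot check, typically to zero the destination register of a faulted VMREAD so stale data is not returned. A hedged sketch of such a wrapper (the macro name is illustrative, not defined by this patch):

/* sketch: clear 'reg' if 'insn' faults while the machine is rebooting */
#define __ex_clear(insn, reg) \
        ____kvm_handle_fault_on_reboot(insn, "xor " reg " , " reg)
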
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index a427bf77a93..734c3767cfa 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
21 */ 21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4 23#define KVM_FEATURE_ASYNC_PF 4
24#define KVM_FEATURE_STEAL_TIME 5
24 25
25/* The last 8 bits are used to indicate how to interpret the flags field 26/* The last 8 bits are used to indicate how to interpret the flags field
26 * in pvclock structure. If no bits are set, all flags are ignored. 27 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -30,10 +31,23 @@
30#define MSR_KVM_WALL_CLOCK 0x11 31#define MSR_KVM_WALL_CLOCK 0x11
31#define MSR_KVM_SYSTEM_TIME 0x12 32#define MSR_KVM_SYSTEM_TIME 0x12
32 33
34#define KVM_MSR_ENABLED 1
33/* Custom MSRs fall in the range 0x4b564d00-0x4b564dff */ 35
34#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 36#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
35#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 37#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
36#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 38#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
39#define MSR_KVM_STEAL_TIME 0x4b564d03
40
41struct kvm_steal_time {
42 __u64 steal;
43 __u32 version;
44 __u32 flags;
45 __u32 pad[12];
46};
47
48#define KVM_STEAL_ALIGNMENT_BITS 5
49#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
50#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
37 51
38#define KVM_MAX_MMU_OP_BATCH 32 52#define KVM_MAX_MMU_OP_BATCH 32
39 53
@@ -178,6 +192,7 @@ void __init kvm_guest_init(void);
178void kvm_async_pf_task_wait(u32 token); 192void kvm_async_pf_task_wait(u32 token);
179void kvm_async_pf_task_wake(u32 token); 193void kvm_async_pf_task_wake(u32 token);
180u32 kvm_read_and_reset_pf_reason(void); 194u32 kvm_read_and_reset_pf_reason(void);
195extern void kvm_disable_steal_time(void);
181#else 196#else
182#define kvm_guest_init() do { } while (0) 197#define kvm_guest_init() do { } while (0)
183#define kvm_async_pf_task_wait(T) do {} while(0) 198#define kvm_async_pf_task_wait(T) do {} while(0)
@@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void)
186{ 201{
187 return 0; 202 return 0;
188} 203}
204
205static inline void kvm_disable_steal_time(void)
206{
207 return;
208}
189#endif 209#endif
190 210
191#endif /* __KERNEL__ */ 211#endif /* __KERNEL__ */
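
A hedged sketch of how a guest would switch the new steal-time reporting on: publish the physical address of a 64-byte-aligned kvm_steal_time area through MSR_KVM_STEAL_TIME with the enable bit set (variable and function names are illustrative):

#include <linux/kvm_para.h>
#include <asm/msr.h>
#include <asm/page.h>

static struct kvm_steal_time steal_time_area __aligned(64);

static void example_enable_steal_time(void)
{
        if (!kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
                return;

        /* the low bits are reserved/enable, hence the 64-byte alignment */
        wrmsrl(MSR_KVM_STEAL_TIME,
               __pa(&steal_time_area) | KVM_MSR_ENABLED);
}
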
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b60f2924c41..879fd7d3387 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -61,6 +61,7 @@ hcall(unsigned long call,
61 : "memory"); 61 : "memory");
62 return call; 62 return call;
63} 63}
64/*:*/
64 65
65/* Can't use our min() macro here: needs to be a constant */ 66/* Can't use our min() macro here: needs to be a constant */
66#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 67#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 2e9972468a5..9cdae5d47e8 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -4,7 +4,7 @@
4#include <linux/percpu.h> 4#include <linux/percpu.h>
5 5
6#include <asm/system.h> 6#include <asm/system.h>
7#include <asm/atomic.h> 7#include <linux/atomic.h>
8#include <asm/asm.h> 8#include <asm/asm.h>
9 9
10typedef struct { 10typedef struct {
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 021979a6e23..c9321f34e55 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -8,6 +8,7 @@
8 * Machine Check support for x86 8 * Machine Check support for x86
9 */ 9 */
10 10
11/* MCG_CAP register defines */
11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */ 12#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
12#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ 13#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ 14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
@@ -17,10 +18,12 @@
17#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) 18#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
18#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ 19#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
19 20
21/* MCG_STATUS register defines */
20#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ 22#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
21#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ 23#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
22#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ 24#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
23 25
26/* MCi_STATUS register defines */
24#define MCI_STATUS_VAL (1ULL<<63) /* valid error */ 27#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
25#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ 28#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
26#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ 29#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
@@ -31,12 +34,14 @@
31#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ 34#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
32#define MCI_STATUS_AR (1ULL<<55) /* Action required */ 35#define MCI_STATUS_AR (1ULL<<55) /* Action required */
33 36
34/* MISC register defines */ 37/* MCi_MISC register defines */
35#define MCM_ADDR_SEGOFF 0 /* segment offset */ 38#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f)
36#define MCM_ADDR_LINEAR 1 /* linear address */ 39#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7)
37#define MCM_ADDR_PHYS 2 /* physical address */ 40#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */
38#define MCM_ADDR_MEM 3 /* memory address */ 41#define MCI_MISC_ADDR_LINEAR 1 /* linear address */
39#define MCM_ADDR_GENERIC 7 /* generic */ 42#define MCI_MISC_ADDR_PHYS 2 /* physical address */
43#define MCI_MISC_ADDR_MEM 3 /* memory address */
44#define MCI_MISC_ADDR_GENERIC 7 /* generic */
40 45
41/* CTL2 register defines */ 46/* CTL2 register defines */
42#define MCI_CTL2_CMCI_EN (1ULL << 30) 47#define MCI_CTL2_CMCI_EN (1ULL << 30)
@@ -119,7 +124,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain;
119 124
120#include <linux/percpu.h> 125#include <linux/percpu.h>
121#include <linux/init.h> 126#include <linux/init.h>
122#include <asm/atomic.h> 127#include <linux/atomic.h>
123 128
124extern int mce_disabled; 129extern int mce_disabled;
125extern int mce_p5_enabled; 130extern int mce_p5_enabled;
@@ -144,7 +149,7 @@ static inline void enable_p5_mce(void) {}
144 149
145void mce_setup(struct mce *m); 150void mce_setup(struct mce *m);
146void mce_log(struct mce *m); 151void mce_log(struct mce *m);
147DECLARE_PER_CPU(struct sys_device, mce_dev); 152DECLARE_PER_CPU(struct sys_device, mce_sysdev);
148 153
149/* 154/*
150 * Maximum banks number. 155 * Maximum banks number.
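
A sketch of what the renamed MCi_MISC accessors are for: extracting the address mode and the least-significant valid address bit from a bank's MISC value, which would typically come from rdmsrl() of MSR_IA32_MCx_MISC(bank):

#include <linux/kernel.h>

/* sketch: report how much of the matching MCi_ADDR value is meaningful */
static void example_report_misc(u64 misc)
{
        unsigned int lsb  = MCI_MISC_ADDR_LSB(misc);
        unsigned int mode = MCI_MISC_ADDR_MODE(misc);

        if (mode == MCI_MISC_ADDR_PHYS)
                pr_info("physical error address valid down to bit %u\n", lsb);
}
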
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8b5393ec108..69021528b43 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -2,7 +2,7 @@
2#define _ASM_X86_MMU_CONTEXT_H 2#define _ASM_X86_MMU_CONTEXT_H
3 3
4#include <asm/desc.h> 4#include <asm/desc.h>
5#include <asm/atomic.h> 5#include <linux/atomic.h>
6#include <asm/pgalloc.h> 6#include <asm/pgalloc.h>
7#include <asm/tlbflush.h> 7#include <asm/tlbflush.h>
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ffa037f28d3..55728e12147 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -34,15 +34,15 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
34 * 64Gb / 4096bytes/page = 16777216 pages 34 * 64Gb / 4096bytes/page = 16777216 pages
35 */ 35 */
36#define MAX_NR_PAGES 16777216 36#define MAX_NR_PAGES 16777216
37#define MAX_ELEMENTS 1024 37#define MAX_SECTIONS 1024
38#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) 38#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)
39 39
40extern s8 physnode_map[]; 40extern s8 physnode_map[];
41 41
42static inline int pfn_to_nid(unsigned long pfn) 42static inline int pfn_to_nid(unsigned long pfn)
43{ 43{
44#ifdef CONFIG_NUMA 44#ifdef CONFIG_NUMA
45 return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]); 45 return((int) physnode_map[(pfn) / PAGES_PER_SECTION]);
46#else 46#else
47 return 0; 47 return 0;
48#endif 48#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 485b4f1f079..d52609aeeab 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -259,6 +259,9 @@
259#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 259#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
260 260
261#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 261#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
262#define ENERGY_PERF_BIAS_PERFORMANCE 0
263#define ENERGY_PERF_BIAS_NORMAL 6
264#define ENERGY_PERF_BIAS_POWERSAVE 15
262 265
263#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 266#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
264 267
@@ -438,6 +441,18 @@
438#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a 441#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
439#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b 442#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
440#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c 443#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
444#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
445#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
446#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
447#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
448
449/* VMX_BASIC bits and bitmasks */
450#define VMX_BASIC_VMCS_SIZE_SHIFT 32
451#define VMX_BASIC_64 0x0001000000000000LLU
452#define VMX_BASIC_MEM_TYPE_SHIFT 50
453#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
454#define VMX_BASIC_MEM_TYPE_WB 6LLU
455#define VMX_BASIC_INOUT 0x0040000000000000LLU
441 456
442/* AMD-V MSRs */ 457/* AMD-V MSRs */
443 458
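The VMX_BASIC_* additions above carve named fields out of the IA32_VMX_BASIC MSR (0x480). A minimal standalone sketch of how those shifts and masks decode a raw value; the sample MSR contents and the 13-bit width of the VMCS-size field are assumptions here, only the macro values come from the header:

#include <stdio.h>
#include <stdint.h>

#define VMX_BASIC_VMCS_SIZE_SHIFT	32
#define VMX_BASIC_64			0x0001000000000000LLU
#define VMX_BASIC_MEM_TYPE_SHIFT	50
#define VMX_BASIC_MEM_TYPE_MASK		0x003c000000000000LLU
#define VMX_BASIC_MEM_TYPE_WB		6LLU
#define VMX_BASIC_INOUT			0x0040000000000000LLU

int main(void)
{
	uint64_t basic = 0x00da040000000010ULL;	/* hypothetical MSR contents */
	unsigned int vmcs_size = (basic >> VMX_BASIC_VMCS_SIZE_SHIFT) & 0x1fff;
	uint64_t mem_type = (basic & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT;

	printf("VMCS region size     : %u bytes\n", vmcs_size);
	printf("32-bit address limit : %s\n", (basic & VMX_BASIC_64) ? "yes" : "no");
	printf("VMCS memory type     : %s\n",
	       mem_type == VMX_BASIC_MEM_TYPE_WB ? "write-back" : "other");
	printf("INS/OUTS exit info   : %s\n",
	       (basic & VMX_BASIC_INOUT) ? "reported" : "not reported");
	return 0;
}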
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 5ca6801b75f..87bdbca72f9 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -13,6 +13,7 @@ struct olpc_platform_t {
13 13
14#define OLPC_F_PRESENT 0x01 14#define OLPC_F_PRESENT 0x01
15#define OLPC_F_DCON 0x02 15#define OLPC_F_DCON 0x02
16#define OLPC_F_EC_WIDE_SCI 0x04
16 17
17#ifdef CONFIG_OLPC 18#ifdef CONFIG_OLPC
18 19
@@ -62,6 +63,13 @@ static inline int olpc_board_at_least(uint32_t rev)
62 return olpc_platform_info.boardrev >= rev; 63 return olpc_platform_info.boardrev >= rev;
63} 64}
64 65
66extern void olpc_ec_wakeup_set(u16 value);
67extern void olpc_ec_wakeup_clear(u16 value);
68extern bool olpc_ec_wakeup_available(void);
69
70extern int olpc_ec_mask_write(u16 bits);
71extern int olpc_ec_sci_query(u16 *sci_value);
72
65#else 73#else
66 74
67static inline int machine_is_olpc(void) 75static inline int machine_is_olpc(void)
@@ -74,6 +82,20 @@ static inline int olpc_has_dcon(void)
74 return 0; 82 return 0;
75} 83}
76 84
85static inline void olpc_ec_wakeup_set(u16 value) { }
86static inline void olpc_ec_wakeup_clear(u16 value) { }
87
88static inline bool olpc_ec_wakeup_available(void)
89{
90 return false;
91}
92
93#endif
94
95#ifdef CONFIG_OLPC_XO1_PM
96extern void do_olpc_suspend_lowlevel(void);
97extern void olpc_xo1_pm_wakeup_set(u16 value);
98extern void olpc_xo1_pm_wakeup_clear(u16 value);
77#endif 99#endif
78 100
79extern int pci_olpc_init(void); 101extern int pci_olpc_init(void);
@@ -83,14 +105,19 @@ extern int pci_olpc_init(void);
83extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, 105extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
84 unsigned char *outbuf, size_t outlen); 106 unsigned char *outbuf, size_t outlen);
85 107
86extern int olpc_ec_mask_set(uint8_t bits);
87extern int olpc_ec_mask_unset(uint8_t bits);
88
89/* EC commands */ 108/* EC commands */
90 109
91#define EC_FIRMWARE_REV 0x08 110#define EC_FIRMWARE_REV 0x08
92#define EC_WLAN_ENTER_RESET 0x35 111#define EC_WRITE_SCI_MASK 0x1b
93#define EC_WLAN_LEAVE_RESET 0x25 112#define EC_WAKE_UP_WLAN 0x24
113#define EC_WLAN_LEAVE_RESET 0x25
114#define EC_READ_EB_MODE 0x2a
115#define EC_SET_SCI_INHIBIT 0x32
116#define EC_SET_SCI_INHIBIT_RELEASE 0x34
117#define EC_WLAN_ENTER_RESET 0x35
118#define EC_WRITE_EXT_SCI_MASK 0x38
119#define EC_SCI_QUERY 0x84
120#define EC_EXT_SCI_QUERY 0x85
94 121
95/* SCI source values */ 122/* SCI source values */
96 123
@@ -99,10 +126,12 @@ extern int olpc_ec_mask_unset(uint8_t bits);
99#define EC_SCI_SRC_BATTERY 0x02 126#define EC_SCI_SRC_BATTERY 0x02
100#define EC_SCI_SRC_BATSOC 0x04 127#define EC_SCI_SRC_BATSOC 0x04
101#define EC_SCI_SRC_BATERR 0x08 128#define EC_SCI_SRC_BATERR 0x08
102#define EC_SCI_SRC_EBOOK 0x10 129#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */
103#define EC_SCI_SRC_WLAN 0x20 130#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */
104#define EC_SCI_SRC_ACPWR 0x40 131#define EC_SCI_SRC_ACPWR 0x40
105#define EC_SCI_SRC_ALL 0x7F 132#define EC_SCI_SRC_BATCRIT 0x80
133#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */
134#define EC_SCI_SRC_ALL 0x1FF
106 135
107/* GPIO assignments */ 136/* GPIO assignments */
108 137
@@ -116,7 +145,7 @@ extern int olpc_ec_mask_unset(uint8_t bits);
116#define OLPC_GPIO_SMB_CLK 14 145#define OLPC_GPIO_SMB_CLK 14
117#define OLPC_GPIO_SMB_DATA 15 146#define OLPC_GPIO_SMB_DATA 15
118#define OLPC_GPIO_WORKAUX geode_gpio(24) 147#define OLPC_GPIO_WORKAUX geode_gpio(24)
119#define OLPC_GPIO_LID geode_gpio(26) 148#define OLPC_GPIO_LID 26
120#define OLPC_GPIO_ECSCI geode_gpio(27) 149#define OLPC_GPIO_ECSCI 27
121 150
122#endif /* _ASM_X86_OLPC_H */ 151#endif /* _ASM_X86_OLPC_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8ab17..a7d2db9a74f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
230 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); 230 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
231} 231}
232 232
233struct jump_label_key;
234extern struct jump_label_key paravirt_steal_enabled;
235extern struct jump_label_key paravirt_steal_rq_enabled;
236
237static inline u64 paravirt_steal_clock(int cpu)
238{
239 return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
240}
241
233static inline unsigned long long paravirt_read_pmc(int counter) 242static inline unsigned long long paravirt_read_pmc(int counter)
234{ 243{
235 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); 244 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 82885099c86..8e8b9a4987e 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -41,6 +41,7 @@
41 41
42#include <asm/desc_defs.h> 42#include <asm/desc_defs.h>
43#include <asm/kmap_types.h> 43#include <asm/kmap_types.h>
44#include <asm/pgtable_types.h>
44 45
45struct page; 46struct page;
46struct thread_struct; 47struct thread_struct;
@@ -63,6 +64,11 @@ struct paravirt_callee_save {
63struct pv_info { 64struct pv_info {
64 unsigned int kernel_rpl; 65 unsigned int kernel_rpl;
65 int shared_kernel_pmd; 66 int shared_kernel_pmd;
67
68#ifdef CONFIG_X86_64
69 u16 extra_user_64bit_cs; /* __USER_CS if none */
70#endif
71
66 int paravirt_enabled; 72 int paravirt_enabled;
67 const char *name; 73 const char *name;
68}; 74};
@@ -89,6 +95,7 @@ struct pv_lazy_ops {
89 95
90struct pv_time_ops { 96struct pv_time_ops {
91 unsigned long long (*sched_clock)(void); 97 unsigned long long (*sched_clock)(void);
98 unsigned long long (*steal_clock)(int cpu);
92 unsigned long (*get_tsc_khz)(void); 99 unsigned long (*get_tsc_khz)(void);
93}; 100};
94 101
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a0a9779084d..3470c9d0ebb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -388,12 +388,9 @@ do { \
388#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 388#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
389#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 389#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
390#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 390#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
391/* 391#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val)
392 * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much 392#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val)
393 * faster than an xchg with forced lock semantics. 393#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val)
394 */
395#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
396#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
397 394
398#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 395#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
399#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 396#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -485,6 +482,8 @@ do { \
485#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 482#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
486#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 483#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
487#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) 484#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
485#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
486#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
488 487
489#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 488#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
490#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 489#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index d9d4dae305f..094fb30817a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
152 (regs)->bp = caller_frame_pointer(); \ 152 (regs)->bp = caller_frame_pointer(); \
153 (regs)->cs = __KERNEL_CS; \ 153 (regs)->cs = __KERNEL_CS; \
154 regs->flags = 0; \ 154 regs->flags = 0; \
155 asm volatile( \
156 _ASM_MOV "%%"_ASM_SP ", %0\n" \
157 : "=m" ((regs)->sp) \
158 :: "memory" \
159 ); \
155} 160}
156 161
157#else 162#else
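The asm block added to perf_arch_fetch_caller_regs() snapshots the current stack pointer into regs->sp. A standalone x86-64 sketch of the same idiom; the kernel version goes through the _ASM_MOV/_ASM_SP macros so one definition serves both 32- and 64-bit builds:

#include <stdio.h>

int main(void)
{
	unsigned long sp;

	/* Capture the stack pointer at this point, as the perf code does
	 * when it fabricates a pt_regs for the current context. */
	asm volatile("mov %%rsp, %0" : "=r" (sp) : : "memory");

	printf("sp = %#lx\n", sp);
	return 0;
}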
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 56fd9e3abbd..4f7e67e2345 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -102,6 +102,14 @@
102#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) 102#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
103 103
104/* 104/*
105 * If an event has alias it should be marked
106 * with a special bit. (Don't forget to check
107 * P4_PEBS_CONFIG_MASK and related bits on
108 * modification.)
109 */
110#define P4_CONFIG_ALIASABLE (1 << 9)
111
112/*
105 * The bits we allow to pass for RAW events 113 * The bits we allow to pass for RAW events
106 */ 114 */
107#define P4_CONFIG_MASK_ESCR \ 115#define P4_CONFIG_MASK_ESCR \
@@ -123,6 +131,31 @@
123 (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ 131 (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
124 (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) 132 (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
125 133
134/*
135 * In case of event aliasing we need to preserve some
136 * caller bits, otherwise the mapping won't be complete.
137 */
138#define P4_CONFIG_EVENT_ALIAS_MASK \
139 (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \
140 p4_config_pack_cccr(P4_CCCR_EDGE | \
141 P4_CCCR_THRESHOLD_MASK | \
142 P4_CCCR_COMPLEMENT | \
143 P4_CCCR_COMPARE))
144
145#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \
146 ((P4_CONFIG_HT) | \
147 p4_config_pack_escr(P4_ESCR_T0_OS | \
148 P4_ESCR_T0_USR | \
149 P4_ESCR_T1_OS | \
150 P4_ESCR_T1_USR) | \
151 p4_config_pack_cccr(P4_CCCR_OVF | \
152 P4_CCCR_CASCADE | \
153 P4_CCCR_FORCE_OVF | \
154 P4_CCCR_THREAD_ANY | \
155 P4_CCCR_OVF_PMI_T0 | \
156 P4_CCCR_OVF_PMI_T1 | \
157 P4_CONFIG_ALIASABLE))
158
126static inline bool p4_is_event_cascaded(u64 config) 159static inline bool p4_is_event_cascaded(u64 config)
127{ 160{
128 u32 cccr = p4_config_unpack_cccr(config); 161 u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d56187c6b83..013286a10c2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -107,7 +107,8 @@
107#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) 107#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
108#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) 108#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
109#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) 109#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
110#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) 110#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
111#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
111#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 112#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
112#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) 113#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
113#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 114#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -129,7 +130,8 @@
129#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) 130#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
130#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) 131#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
131#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) 132#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
132#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) 133#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
134#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
133 135
134#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 136#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
135#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 137#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 59ab4dffa37..2dddb317bb3 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -59,6 +59,7 @@
59#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ 59#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
62#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 63#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
63#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ 64#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
64 65
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b844edc69fe..7e6a7b66203 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -755,8 +755,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
755 :: "a" (eax), "c" (ecx)); 755 :: "a" (eax), "c" (ecx));
756} 756}
757 757
758extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
759
760extern void select_idle_routine(const struct cpuinfo_x86 *c); 758extern void select_idle_routine(const struct cpuinfo_x86 *c);
761extern void init_amd_e400_c1e_mask(void); 759extern void init_amd_e400_c1e_mask(void);
762 760
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 971e0b46446..644dd885f05 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -19,7 +19,7 @@
19#include <linux/pci.h> 19#include <linux/pci.h>
20 20
21#include <asm/irq.h> 21#include <asm/irq.h>
22#include <asm/atomic.h> 22#include <linux/atomic.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include <asm/irq_controller.h> 24#include <asm/irq_controller.h>
25 25
@@ -30,17 +30,6 @@ extern void add_dtb(u64 data);
30extern void x86_add_irq_domains(void); 30extern void x86_add_irq_domains(void);
31void __cpuinit x86_of_pci_init(void); 31void __cpuinit x86_of_pci_init(void);
32void x86_dtb_init(void); 32void x86_dtb_init(void);
33
34static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
35{
36 return pdev ? pdev->dev.of_node : NULL;
37}
38
39static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
40{
41 return pci_device_to_OF_node(bus->self);
42}
43
44#else 33#else
45static inline void add_dtb(u64 data) { } 34static inline void add_dtb(u64 data) { }
46static inline void x86_add_irq_domains(void) { } 35static inline void x86_add_irq_domains(void) { }
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 94e7618fcac..35664547125 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -131,6 +131,9 @@ struct pt_regs {
131#ifdef __KERNEL__ 131#ifdef __KERNEL__
132 132
133#include <linux/init.h> 133#include <linux/init.h>
134#ifdef CONFIG_PARAVIRT
135#include <asm/paravirt_types.h>
136#endif
134 137
135struct cpuinfo_x86; 138struct cpuinfo_x86;
136struct task_struct; 139struct task_struct;
@@ -187,6 +190,22 @@ static inline int v8086_mode(struct pt_regs *regs)
187#endif 190#endif
188} 191}
189 192
193#ifdef CONFIG_X86_64
194static inline bool user_64bit_mode(struct pt_regs *regs)
195{
196#ifndef CONFIG_PARAVIRT
197 /*
198 * On non-paravirt systems, this is the only long mode CPL 3
199 * selector. We do not allow long mode selectors in the LDT.
200 */
201 return regs->cs == __USER_CS;
202#else
203 /* Headers are too twisted for this to go in paravirt.h. */
204 return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
205#endif
206}
207#endif
208
190/* 209/*
191 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode 210 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
192 * when it traps. The previous stack will be directly underneath the saved 211 * when it traps. The previous stack will be directly underneath the saved
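user_64bit_mode() distinguishes 64-bit from compat user space purely by the saved CS selector. A small user-space model of the non-paravirt branch; the selector values follow from the GDT entries in segment.h (entry * 8 + RPL 3), and the paravirt branch additionally accepts pv_info.extra_user_64bit_cs:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define GDT_ENTRY_DEFAULT_USER32_CS	4
#define GDT_ENTRY_DEFAULT_USER_CS	6
#define __USER32_CS	(GDT_ENTRY_DEFAULT_USER32_CS*8+3)	/* 0x23 */
#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8+3)		/* 0x33 */

/* Non-paravirt case: __USER_CS is the only long-mode CPL 3 selector. */
static bool user_64bit_mode(uint16_t cs)
{
	return cs == __USER_CS;
}

int main(void)
{
	printf("cs=%#x -> %s\n", __USER_CS,
	       user_64bit_mode(__USER_CS) ? "64-bit" : "32-bit/compat");
	printf("cs=%#x -> %s\n", __USER32_CS,
	       user_64bit_mode(__USER32_CS) ? "64-bit" : "32-bit/compat");
	return 0;
}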
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index a518c0a4504..c59cc97fe6c 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -44,7 +44,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
44 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); 44 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
45#elif defined(__x86_64__) 45#elif defined(__x86_64__)
46 __asm__ ( 46 __asm__ (
47 "mul %[mul_frac] ; shrd $32, %[hi], %[lo]" 47 "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
48 : [lo]"=a"(product), 48 : [lo]"=a"(product),
49 [hi]"=d"(tmp) 49 [hi]"=d"(tmp)
50 : "0"(delta), 50 : "0"(delta),
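The mul to mulq change matters because the scaling needs the full 64x64 to 128-bit product before the shift; a 32-bit multiply would drop the upper half of delta. A standalone sketch of the arithmetic, using GCC/Clang unsigned __int128 in place of mulq + shrd and omitting the pre-shift the full helper applies to delta:

#include <stdio.h>
#include <stdint.h>

/* (delta * mul_frac) >> 32 with a full 128-bit intermediate product,
 * matching what mulq + shrd compute in the inline asm. */
static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac)
{
	unsigned __int128 product = (unsigned __int128)delta * mul_frac;

	return (uint64_t)(product >> 32);
}

int main(void)
{
	uint64_t delta = 0x123456789ULL;	/* > 2^32: the high bits must survive */
	uint32_t mul_frac = 0x80000000u;	/* 0.5 in 32.32 fixed point */

	printf("scaled delta = %#llx\n",
	       (unsigned long long)scale_delta(delta, mul_frac));
	return 0;
}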
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
index 6a8c0d64510..a5370a03d90 100644
--- a/arch/x86/include/asm/rwlock.h
+++ b/arch/x86/include/asm/rwlock.h
@@ -1,7 +1,48 @@
1#ifndef _ASM_X86_RWLOCK_H 1#ifndef _ASM_X86_RWLOCK_H
2#define _ASM_X86_RWLOCK_H 2#define _ASM_X86_RWLOCK_H
3 3
4#define RW_LOCK_BIAS 0x01000000 4#include <asm/asm.h>
5
6#if CONFIG_NR_CPUS <= 2048
7
8#ifndef __ASSEMBLY__
9typedef union {
10 s32 lock;
11 s32 write;
12} arch_rwlock_t;
13#endif
14
15#define RW_LOCK_BIAS 0x00100000
16#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##l)
17#define READ_LOCK_ATOMIC(n) atomic_##n
18#define WRITE_LOCK_ADD(n) __ASM_FORM_COMMA(addl n)
19#define WRITE_LOCK_SUB(n) __ASM_FORM_COMMA(subl n)
20#define WRITE_LOCK_CMP RW_LOCK_BIAS
21
22#else /* CONFIG_NR_CPUS > 2048 */
23
24#include <linux/const.h>
25
26#ifndef __ASSEMBLY__
27typedef union {
28 s64 lock;
29 struct {
30 u32 read;
31 s32 write;
32 };
33} arch_rwlock_t;
34#endif
35
36#define RW_LOCK_BIAS (_AC(1,L) << 32)
37#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##q)
38#define READ_LOCK_ATOMIC(n) atomic64_##n
39#define WRITE_LOCK_ADD(n) __ASM_FORM(incl)
40#define WRITE_LOCK_SUB(n) __ASM_FORM(decl)
41#define WRITE_LOCK_CMP 1
42
43#endif /* CONFIG_NR_CPUS */
44
45#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
5 46
6/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */ 47/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
7 48
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index cd84f7208f7..5e641715c3f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -162,7 +162,7 @@
162#define GDT_ENTRY_DEFAULT_USER32_CS 4 162#define GDT_ENTRY_DEFAULT_USER32_CS 4
163#define GDT_ENTRY_DEFAULT_USER_DS 5 163#define GDT_ENTRY_DEFAULT_USER_DS 5
164#define GDT_ENTRY_DEFAULT_USER_CS 6 164#define GDT_ENTRY_DEFAULT_USER_CS 6
165#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3) 165#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
166#define __USER32_DS __USER_DS 166#define __USER32_DS __USER_DS
167 167
168#define GDT_ENTRY_TSS 8 /* needs two entries */ 168#define GDT_ENTRY_TSS 8 /* needs two entries */
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 725b7783199..49adfd7bb4a 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -10,7 +10,11 @@ static inline void smpboot_clear_io_apic_irqs(void)
10 10
11static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) 11static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
12{ 12{
13 unsigned long flags;
14
15 spin_lock_irqsave(&rtc_lock, flags);
13 CMOS_WRITE(0xa, 0xf); 16 CMOS_WRITE(0xa, 0xf);
17 spin_unlock_irqrestore(&rtc_lock, flags);
14 local_flush_tlb(); 18 local_flush_tlb();
15 pr_debug("1.\n"); 19 pr_debug("1.\n");
16 *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) = 20 *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
@@ -23,6 +27,8 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
23 27
24static inline void smpboot_restore_warm_reset_vector(void) 28static inline void smpboot_restore_warm_reset_vector(void)
25{ 29{
30 unsigned long flags;
31
26 /* 32 /*
27 * Install writable page 0 entry to set BIOS data area. 33 * Install writable page 0 entry to set BIOS data area.
28 */ 34 */
@@ -32,7 +38,9 @@ static inline void smpboot_restore_warm_reset_vector(void)
32 * Paranoid: Set warm reset code and vector here back 38 * Paranoid: Set warm reset code and vector here back
33 * to default values. 39 * to default values.
34 */ 40 */
41 spin_lock_irqsave(&rtc_lock, flags);
35 CMOS_WRITE(0, 0xf); 42 CMOS_WRITE(0, 0xf);
43 spin_unlock_irqrestore(&rtc_lock, flags);
36 44
37 *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0; 45 *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
38} 46}
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3089f70c0c5..ee67edf86fd 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -1,8 +1,7 @@
1#ifndef _ASM_X86_SPINLOCK_H 1#ifndef _ASM_X86_SPINLOCK_H
2#define _ASM_X86_SPINLOCK_H 2#define _ASM_X86_SPINLOCK_H
3 3
4#include <asm/atomic.h> 4#include <linux/atomic.h>
5#include <asm/rwlock.h>
6#include <asm/page.h> 5#include <asm/page.h>
7#include <asm/processor.h> 6#include <asm/processor.h>
8#include <linux/compiler.h> 7#include <linux/compiler.h>
@@ -234,7 +233,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
234 */ 233 */
235static inline int arch_read_can_lock(arch_rwlock_t *lock) 234static inline int arch_read_can_lock(arch_rwlock_t *lock)
236{ 235{
237 return (int)(lock)->lock > 0; 236 return lock->lock > 0;
238} 237}
239 238
240/** 239/**
@@ -243,12 +242,12 @@ static inline int arch_read_can_lock(arch_rwlock_t *lock)
243 */ 242 */
244static inline int arch_write_can_lock(arch_rwlock_t *lock) 243static inline int arch_write_can_lock(arch_rwlock_t *lock)
245{ 244{
246 return (lock)->lock == RW_LOCK_BIAS; 245 return lock->write == WRITE_LOCK_CMP;
247} 246}
248 247
249static inline void arch_read_lock(arch_rwlock_t *rw) 248static inline void arch_read_lock(arch_rwlock_t *rw)
250{ 249{
251 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" 250 asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
252 "jns 1f\n" 251 "jns 1f\n"
253 "call __read_lock_failed\n\t" 252 "call __read_lock_failed\n\t"
254 "1:\n" 253 "1:\n"
@@ -257,47 +256,55 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
257 256
258static inline void arch_write_lock(arch_rwlock_t *rw) 257static inline void arch_write_lock(arch_rwlock_t *rw)
259{ 258{
260 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" 259 asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
261 "jz 1f\n" 260 "jz 1f\n"
262 "call __write_lock_failed\n\t" 261 "call __write_lock_failed\n\t"
263 "1:\n" 262 "1:\n"
264 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); 263 ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
264 : "memory");
265} 265}
266 266
267static inline int arch_read_trylock(arch_rwlock_t *lock) 267static inline int arch_read_trylock(arch_rwlock_t *lock)
268{ 268{
269 atomic_t *count = (atomic_t *)lock; 269 READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
270 270
271 if (atomic_dec_return(count) >= 0) 271 if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
272 return 1; 272 return 1;
273 atomic_inc(count); 273 READ_LOCK_ATOMIC(inc)(count);
274 return 0; 274 return 0;
275} 275}
276 276
277static inline int arch_write_trylock(arch_rwlock_t *lock) 277static inline int arch_write_trylock(arch_rwlock_t *lock)
278{ 278{
279 atomic_t *count = (atomic_t *)lock; 279 atomic_t *count = (atomic_t *)&lock->write;
280 280
281 if (atomic_sub_and_test(RW_LOCK_BIAS, count)) 281 if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
282 return 1; 282 return 1;
283 atomic_add(RW_LOCK_BIAS, count); 283 atomic_add(WRITE_LOCK_CMP, count);
284 return 0; 284 return 0;
285} 285}
286 286
287static inline void arch_read_unlock(arch_rwlock_t *rw) 287static inline void arch_read_unlock(arch_rwlock_t *rw)
288{ 288{
289 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); 289 asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
290 :"+m" (rw->lock) : : "memory");
290} 291}
291 292
292static inline void arch_write_unlock(arch_rwlock_t *rw) 293static inline void arch_write_unlock(arch_rwlock_t *rw)
293{ 294{
294 asm volatile(LOCK_PREFIX "addl %1, %0" 295 asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
295 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); 296 : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
296} 297}
297 298
298#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) 299#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
299#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) 300#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
300 301
302#undef READ_LOCK_SIZE
303#undef READ_LOCK_ATOMIC
304#undef WRITE_LOCK_ADD
305#undef WRITE_LOCK_SUB
306#undef WRITE_LOCK_CMP
307
301#define arch_spin_relax(lock) cpu_relax() 308#define arch_spin_relax(lock) cpu_relax()
302#define arch_read_relax(lock) cpu_relax() 309#define arch_read_relax(lock) cpu_relax()
303#define arch_write_relax(lock) cpu_relax() 310#define arch_write_relax(lock) cpu_relax()
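With the READ_LOCK_*/WRITE_LOCK_* macros folded in, the NR_CPUS <= 2048 configuration keeps the classic biased-counter scheme: the lock starts at RW_LOCK_BIAS, each reader decrements by one, and a writer subtracts the whole bias and succeeds only if the result is zero. A single-threaded model of those trylock paths; the real code of course uses LOCK-prefixed instructions rather than plain C arithmetic:

#include <stdio.h>
#include <stdbool.h>

#define RW_LOCK_BIAS 0x00100000		/* value for the <= 2048 CPU case */

static int lock = RW_LOCK_BIAS;		/* unlocked */

static bool read_trylock(void)
{
	if (--lock >= 0)
		return true;		/* still non-negative: no writer holds it */
	++lock;				/* undo: a writer owns the lock */
	return false;
}

static bool write_trylock(void)
{
	lock -= RW_LOCK_BIAS;
	if (lock == 0)
		return true;		/* nobody else held it */
	lock += RW_LOCK_BIAS;		/* undo: readers and/or a writer present */
	return false;
}

int main(void)
{
	printf("read_trylock : %d\n", read_trylock());	/* 1 */
	printf("write_trylock: %d\n", write_trylock());	/* 0, blocked by the reader */
	++lock;						/* read_unlock */
	printf("write_trylock: %d\n", write_trylock());	/* 1 */
	return 0;
}

The NR_CPUS > 2048 layout instead splits the word into separate read and write halves, which the same macros select at build time.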
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index dcb48b2edc1..7c7a486fcb6 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -11,10 +11,6 @@ typedef struct arch_spinlock {
11 11
12#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } 12#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
13 13
14typedef struct { 14#include <asm/rwlock.h>
15 unsigned int lock;
16} arch_rwlock_t;
17
18#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
19 15
20#endif /* _ASM_X86_SPINLOCK_TYPES_H */ 16#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index c2ff2a1d845..2d2f01ce6dc 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -401,6 +401,7 @@ extern unsigned long arch_align_stack(unsigned long sp);
401extern void free_init_pages(char *what, unsigned long begin, unsigned long end); 401extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
402 402
403void default_idle(void); 403void default_idle(void);
404bool set_pm_idle_to_default(void);
404 405
405void stop_this_cpu(void *dummy); 406void stop_this_cpu(void *dummy);
406 407
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1f2e61e2898..a1fe5c127b5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -21,7 +21,7 @@ struct task_struct;
21struct exec_domain; 21struct exec_domain;
22#include <asm/processor.h> 22#include <asm/processor.h>
23#include <asm/ftrace.h> 23#include <asm/ftrace.h>
24#include <asm/atomic.h> 24#include <linux/atomic.h>
25 25
26struct thread_info { 26struct thread_info {
27 struct task_struct *task; /* main task structure */ 27 struct task_struct *task; /* main task structure */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 7bdec4e9b73..92b8aec0697 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -1,10 +1,12 @@
1#ifndef _ASM_X86_TIME_H 1#ifndef _ASM_X86_TIME_H
2#define _ASM_X86_TIME_H 2#define _ASM_X86_TIME_H
3 3
4extern void hpet_time_init(void); 4#include <linux/clocksource.h>
5
6#include <asm/mc146818rtc.h> 5#include <asm/mc146818rtc.h>
7 6
7extern void hpet_time_init(void);
8extern void time_init(void); 8extern void time_init(void);
9 9
10extern struct clock_event_device *global_clock_event;
11
10#endif /* _ASM_X86_TIME_H */ 12#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fa7b9176b76..431793e5d48 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -32,6 +32,22 @@ extern int no_timer_check;
32 * (mathieu.desnoyers@polymtl.ca) 32 * (mathieu.desnoyers@polymtl.ca)
33 * 33 *
34 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 34 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
35 *
36 * In:
37 *
38 * ns = cycles * cyc2ns_scale / SC
39 *
40 * Although we may still have enough bits to store the value of ns,
41 * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
42 * leading to an incorrect result.
43 *
44 * To avoid this, we can decompose 'cycles' into quotient and remainder
45 * of division by SC. Then,
46 *
47 * ns = (quot * SC + rem) * cyc2ns_scale / SC
48 * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
49 *
50 * - sqazi@google.com
35 */ 51 */
36 52
37DECLARE_PER_CPU(unsigned long, cyc2ns); 53DECLARE_PER_CPU(unsigned long, cyc2ns);
@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
41 57
42static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 58static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
43{ 59{
60 unsigned long long quot;
61 unsigned long long rem;
44 int cpu = smp_processor_id(); 62 int cpu = smp_processor_id();
45 unsigned long long ns = per_cpu(cyc2ns_offset, cpu); 63 unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
46 ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; 64 quot = (cyc >> CYC2NS_SCALE_FACTOR);
65 rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
66 ns += quot * per_cpu(cyc2ns, cpu) +
67 ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
47 return ns; 68 return ns;
48} 69}
49 70
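A standalone illustration of the overflow the new comment describes: for large cycle counts the 64-bit product cyc * cyc2ns_scale wraps, while the quotient/remainder split stays exact. CYC2NS_SCALE_FACTOR is not visible in this hunk, so 10 is assumed here, the sample scale is only indicative of a ~2.9 GHz part, and the 128-bit reference needs GCC/Clang:

#include <stdio.h>
#include <stdint.h>

#define CYC2NS_SCALE_FACTOR 10			/* assumed; SC = 2^10 */

static uint64_t cyc2ns_naive(uint64_t cyc, uint64_t scale)
{
	return (cyc * scale) >> CYC2NS_SCALE_FACTOR;	/* 64-bit product can wrap */
}

static uint64_t cyc2ns_split(uint64_t cyc, uint64_t scale)
{
	uint64_t quot = cyc >> CYC2NS_SCALE_FACTOR;
	uint64_t rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);

	return quot * scale + ((rem * scale) >> CYC2NS_SCALE_FACTOR);
}

int main(void)
{
	uint64_t cyc = 1ULL << 56;	/* ~287 days of cycles at ~2.9 GHz */
	uint64_t scale = 350;		/* roughly (10^6 << 10) / cpu_khz for 2.9 GHz */

	unsigned __int128 exact = ((unsigned __int128)cyc * scale) >> CYC2NS_SCALE_FACTOR;

	printf("naive: %llu\n", (unsigned long long)cyc2ns_naive(cyc, scale));
	printf("split: %llu\n", (unsigned long long)cyc2ns_split(cyc, scale));
	printf("exact: %llu\n", (unsigned long long)(uint64_t)exact);
	return 0;
}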
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0310da67307..0012d0902c5 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_TRAPS_H 1#ifndef _ASM_X86_TRAPS_H
2#define _ASM_X86_TRAPS_H 2#define _ASM_X86_TRAPS_H
3 3
4#include <linux/kprobes.h>
5
4#include <asm/debugreg.h> 6#include <asm/debugreg.h>
5#include <asm/siginfo.h> /* TRAP_TRACE, ... */ 7#include <asm/siginfo.h> /* TRAP_TRACE, ... */
6 8
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9db5583b6d3..83e2efd181e 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,10 +51,6 @@ extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void); 52extern unsigned long native_calibrate_tsc(void);
53 53
54#ifdef CONFIG_X86_64
55extern cycles_t vread_tsc(void);
56#endif
57
58/* 54/*
59 * Boot-time check whether the TSCs are synchronized across 55 * Boot-time check whether the TSCs are synchronized across
60 * all CPUs/cores: 56 * all CPUs/cores:
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 99ddd148a76..36361bf6fdd 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -555,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; };
555 555
556#endif /* CONFIG_X86_WP_WORKS_OK */ 556#endif /* CONFIG_X86_WP_WORKS_OK */
557 557
558extern unsigned long
559copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
560
558/* 561/*
559 * movsl can be slow when source and dest are not both 8-byte aligned 562 * movsl can be slow when source and dest are not both 8-byte aligned
560 */ 563 */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index e347f077378..f6f37d0ca33 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -414,7 +414,7 @@ __SYSCALL(__NR_query_module, sys_ni_syscall)
414__SYSCALL(__NR_quotactl, sys_quotactl) 414__SYSCALL(__NR_quotactl, sys_quotactl)
415 415
416#define __NR_nfsservctl 180 416#define __NR_nfsservctl 180
417__SYSCALL(__NR_nfsservctl, sys_nfsservctl) 417__SYSCALL(__NR_nfsservctl, sys_ni_syscall)
418 418
419/* reserved for LiS/STREAMS */ 419/* reserved for LiS/STREAMS */
420#define __NR_getpmsg 181 420#define __NR_getpmsg 181
@@ -681,6 +681,8 @@ __SYSCALL(__NR_syncfs, sys_syncfs)
681__SYSCALL(__NR_sendmmsg, sys_sendmmsg) 681__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
682#define __NR_setns 308 682#define __NR_setns 308
683__SYSCALL(__NR_setns, sys_setns) 683__SYSCALL(__NR_setns, sys_setns)
684#define __NR_getcpu 309
685__SYSCALL(__NR_getcpu, sys_getcpu)
684 686
685#define __NR_LITMUS 309 687#define __NR_LITMUS 309
686 688
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index a291c40efd4..0c767a8e000 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -55,6 +55,7 @@
55#define UV_BAU_TUNABLES_DIR "sgi_uv" 55#define UV_BAU_TUNABLES_DIR "sgi_uv"
56#define UV_BAU_TUNABLES_FILE "bau_tunables" 56#define UV_BAU_TUNABLES_FILE "bau_tunables"
57#define WHITESPACE " \t\n" 57#define WHITESPACE " \t\n"
58#define uv_mmask ((1UL << uv_hub_info->m_val) - 1)
58#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) 59#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
59#define cpubit_isset(cpu, bau_local_cpumask) \ 60#define cpubit_isset(cpu, bau_local_cpumask) \
60 test_bit((cpu), (bau_local_cpumask).bits) 61 test_bit((cpu), (bau_local_cpumask).bits)
@@ -67,7 +68,7 @@
67 * we're using 655us, similar to UV1: 65 units of 10us 68 * we're using 655us, similar to UV1: 65 units of 10us
68 */ 69 */
69#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) 70#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
70#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (65*10UL) 71#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
71 72
72#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \ 73#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
73 UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ 74 UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
@@ -106,12 +107,20 @@
106#define DS_SOURCE_TIMEOUT 3 107#define DS_SOURCE_TIMEOUT 3
107/* 108/*
108 * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2 109 * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
109 * values 1 and 5 will not occur 110 * values 1 and 3 will not occur
111 * Decoded meaning ERROR BUSY AUX ERR
112 * ------------------------------- ---- ----- -------
113 * IDLE 0 0 0
114 * BUSY (active) 0 1 0
115 * SW Ack Timeout (destination) 1 0 0
116 * SW Ack INTD rejected (strong NACK) 1 0 1
117 * Source Side Time Out Detected 1 1 0
118 * Destination Side PUT Failed 1 1 1
110 */ 119 */
111#define UV2H_DESC_IDLE 0 120#define UV2H_DESC_IDLE 0
112#define UV2H_DESC_DEST_TIMEOUT 2 121#define UV2H_DESC_BUSY 2
113#define UV2H_DESC_DEST_STRONG_NACK 3 122#define UV2H_DESC_DEST_TIMEOUT 4
114#define UV2H_DESC_BUSY 4 123#define UV2H_DESC_DEST_STRONG_NACK 5
115#define UV2H_DESC_SOURCE_TIMEOUT 6 124#define UV2H_DESC_SOURCE_TIMEOUT 6
116#define UV2H_DESC_DEST_PUT_ERR 7 125#define UV2H_DESC_DEST_PUT_ERR 7
117 126
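The rewritten UV2H_DESC_* values follow the decode table in the comment: bit 2 is ERROR, bit 1 is BUSY, bit 0 is AUX ERR, which is why values 1 and 3 cannot occur. A trivial standalone decoder over those values:

#include <stdio.h>

#define UV2H_DESC_IDLE			0
#define UV2H_DESC_BUSY			2
#define UV2H_DESC_DEST_TIMEOUT		4
#define UV2H_DESC_DEST_STRONG_NACK	5
#define UV2H_DESC_SOURCE_TIMEOUT	6
#define UV2H_DESC_DEST_PUT_ERR		7

static const char *uv2_desc_status_name(unsigned int status)
{
	switch (status) {
	case UV2H_DESC_IDLE:			return "idle";
	case UV2H_DESC_BUSY:			return "busy (active)";
	case UV2H_DESC_DEST_TIMEOUT:		return "sw ack timeout (destination)";
	case UV2H_DESC_DEST_STRONG_NACK:	return "sw ack INTD rejected (strong NACK)";
	case UV2H_DESC_SOURCE_TIMEOUT:		return "source side timeout";
	case UV2H_DESC_DEST_PUT_ERR:		return "destination side PUT failed";
	default:				return "cannot occur";
	}
}

int main(void)
{
	for (unsigned int s = 0; s < 8; s++)
		printf("%u: %s\n", s, uv2_desc_status_name(s));
	return 0;
}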
@@ -183,7 +192,7 @@
183 * 'base_dest_nasid' field of the header corresponds to the 192 * 'base_dest_nasid' field of the header corresponds to the
184 * destination nodeID associated with that specified bit. 193 * destination nodeID associated with that specified bit.
185 */ 194 */
186struct bau_targ_hubmask { 195struct pnmask {
187 unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; 196 unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
188}; 197};
189 198
@@ -314,7 +323,7 @@ struct bau_msg_header {
314 * Should be 64 bytes 323 * Should be 64 bytes
315 */ 324 */
316struct bau_desc { 325struct bau_desc {
317 struct bau_targ_hubmask distribution; 326 struct pnmask distribution;
318 /* 327 /*
319 * message template, consisting of header and payload: 328 * message template, consisting of header and payload:
320 */ 329 */
@@ -488,6 +497,7 @@ struct bau_control {
488 struct bau_control *uvhub_master; 497 struct bau_control *uvhub_master;
489 struct bau_control *socket_master; 498 struct bau_control *socket_master;
490 struct ptc_stats *statp; 499 struct ptc_stats *statp;
500 cpumask_t *cpumask;
491 unsigned long timeout_interval; 501 unsigned long timeout_interval;
492 unsigned long set_bau_on_time; 502 unsigned long set_bau_on_time;
493 atomic_t active_descriptor_count; 503 atomic_t active_descriptor_count;
@@ -526,90 +536,90 @@ struct bau_control {
526 struct hub_and_pnode *thp; 536 struct hub_and_pnode *thp;
527}; 537};
528 538
529static unsigned long read_mmr_uv2_status(void) 539static inline unsigned long read_mmr_uv2_status(void)
530{ 540{
531 return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); 541 return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
532} 542}
533 543
534static void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) 544static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
535{ 545{
536 write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); 546 write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
537} 547}
538 548
539static void write_mmr_descriptor_base(int pnode, unsigned long mmr_image) 549static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
540{ 550{
541 write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image); 551 write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
542} 552}
543 553
544static void write_mmr_activation(unsigned long index) 554static inline void write_mmr_activation(unsigned long index)
545{ 555{
546 write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 556 write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
547} 557}
548 558
549static void write_gmmr_activation(int pnode, unsigned long mmr_image) 559static inline void write_gmmr_activation(int pnode, unsigned long mmr_image)
550{ 560{
551 write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image); 561 write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
552} 562}
553 563
554static void write_mmr_payload_first(int pnode, unsigned long mmr_image) 564static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image)
555{ 565{
556 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image); 566 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
557} 567}
558 568
559static void write_mmr_payload_tail(int pnode, unsigned long mmr_image) 569static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
560{ 570{
561 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image); 571 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
562} 572}
563 573
564static void write_mmr_payload_last(int pnode, unsigned long mmr_image) 574static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image)
565{ 575{
566 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image); 576 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
567} 577}
568 578
569static void write_mmr_misc_control(int pnode, unsigned long mmr_image) 579static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image)
570{ 580{
571 write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 581 write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
572} 582}
573 583
574static unsigned long read_mmr_misc_control(int pnode) 584static inline unsigned long read_mmr_misc_control(int pnode)
575{ 585{
576 return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL); 586 return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
577} 587}
578 588
579static void write_mmr_sw_ack(unsigned long mr) 589static inline void write_mmr_sw_ack(unsigned long mr)
580{ 590{
581 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); 591 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
582} 592}
583 593
584static unsigned long read_mmr_sw_ack(void) 594static inline unsigned long read_mmr_sw_ack(void)
585{ 595{
586 return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 596 return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
587} 597}
588 598
589static unsigned long read_gmmr_sw_ack(int pnode) 599static inline unsigned long read_gmmr_sw_ack(int pnode)
590{ 600{
591 return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 601 return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
592} 602}
593 603
594static void write_mmr_data_config(int pnode, unsigned long mr) 604static inline void write_mmr_data_config(int pnode, unsigned long mr)
595{ 605{
596 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr); 606 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
597} 607}
598 608
599static inline int bau_uvhub_isset(int uvhub, struct bau_targ_hubmask *dstp) 609static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp)
600{ 610{
601 return constant_test_bit(uvhub, &dstp->bits[0]); 611 return constant_test_bit(uvhub, &dstp->bits[0]);
602} 612}
603static inline void bau_uvhub_set(int pnode, struct bau_targ_hubmask *dstp) 613static inline void bau_uvhub_set(int pnode, struct pnmask *dstp)
604{ 614{
605 __set_bit(pnode, &dstp->bits[0]); 615 __set_bit(pnode, &dstp->bits[0]);
606} 616}
607static inline void bau_uvhubs_clear(struct bau_targ_hubmask *dstp, 617static inline void bau_uvhubs_clear(struct pnmask *dstp,
608 int nbits) 618 int nbits)
609{ 619{
610 bitmap_zero(&dstp->bits[0], nbits); 620 bitmap_zero(&dstp->bits[0], nbits);
611} 621}
612static inline int bau_uvhub_weight(struct bau_targ_hubmask *dstp) 622static inline int bau_uvhub_weight(struct pnmask *dstp)
613{ 623{
614 return bitmap_weight((unsigned long *)&dstp->bits[0], 624 return bitmap_weight((unsigned long *)&dstp->bits[0],
615 UV_DISTRIBUTION_SIZE); 625 UV_DISTRIBUTION_SIZE);
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index f26544a1521..54a13aaebc4 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -46,6 +46,13 @@
46 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant 46 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
47 * of the nasid for socket usage. 47 * of the nasid for socket usage.
48 * 48 *
49 * GPA - (global physical address) a socket physical address converted
50 * so that it can be used by the GRU as a global address. Socket
51 * physical addresses 1) need additional NASID (node) bits added
52 * to the high end of the address, and 2) unaliased if the
53 * partition does not have a physical address 0. In addition, on
54 * UV2 rev 1, GPAs need the gnode left shifted to bits 39 or 40.
55 *
49 * 56 *
50 * NumaLink Global Physical Address Format: 57 * NumaLink Global Physical Address Format:
51 * +--------------------------------+---------------------+ 58 * +--------------------------------+---------------------+
@@ -141,6 +148,8 @@ struct uv_hub_info_s {
141 unsigned int gnode_extra; 148 unsigned int gnode_extra;
142 unsigned char hub_revision; 149 unsigned char hub_revision;
143 unsigned char apic_pnode_shift; 150 unsigned char apic_pnode_shift;
151 unsigned char m_shift;
152 unsigned char n_lshift;
144 unsigned long gnode_upper; 153 unsigned long gnode_upper;
145 unsigned long lowmem_remap_top; 154 unsigned long lowmem_remap_top;
146 unsigned long lowmem_remap_base; 155 unsigned long lowmem_remap_base;
@@ -177,6 +186,16 @@ static inline int is_uv2_hub(void)
177 return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; 186 return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
178} 187}
179 188
189static inline int is_uv2_1_hub(void)
190{
191 return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE;
192}
193
194static inline int is_uv2_2_hub(void)
195{
196 return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE + 1;
197}
198
180union uvh_apicid { 199union uvh_apicid {
181 unsigned long v; 200 unsigned long v;
182 struct uvh_apicid_s { 201 struct uvh_apicid_s {
@@ -276,7 +295,10 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
276{ 295{
277 if (paddr < uv_hub_info->lowmem_remap_top) 296 if (paddr < uv_hub_info->lowmem_remap_top)
278 paddr |= uv_hub_info->lowmem_remap_base; 297 paddr |= uv_hub_info->lowmem_remap_base;
279 return paddr | uv_hub_info->gnode_upper; 298 paddr |= uv_hub_info->gnode_upper;
299 paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) |
300 ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift);
301 return paddr;
280} 302}
281 303
282 304
@@ -300,16 +322,19 @@ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
300 unsigned long remap_base = uv_hub_info->lowmem_remap_base; 322 unsigned long remap_base = uv_hub_info->lowmem_remap_base;
301 unsigned long remap_top = uv_hub_info->lowmem_remap_top; 323 unsigned long remap_top = uv_hub_info->lowmem_remap_top;
302 324
325 gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) |
326 ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val);
327 gpa = gpa & uv_hub_info->gpa_mask;
303 if (paddr >= remap_base && paddr < remap_base + remap_top) 328 if (paddr >= remap_base && paddr < remap_base + remap_top)
304 paddr -= remap_base; 329 paddr -= remap_base;
305 return paddr; 330 return paddr;
306} 331}
307 332
308 333
309/* gnode -> pnode */ 334/* gpa -> pnode */
310static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) 335static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
311{ 336{
312 return gpa >> uv_hub_info->m_val; 337 return gpa >> uv_hub_info->n_lshift;
313} 338}
314 339
315/* gpa -> pnode */ 340/* gpa -> pnode */
@@ -320,6 +345,12 @@ static inline int uv_gpa_to_pnode(unsigned long gpa)
320 return uv_gpa_to_gnode(gpa) & n_mask; 345 return uv_gpa_to_gnode(gpa) & n_mask;
321} 346}
322 347
348/* gpa -> node offset*/
349static inline unsigned long uv_gpa_to_offset(unsigned long gpa)
350{
351 return (gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift;
352}
353
323/* pnode, offset --> socket virtual */ 354/* pnode, offset --> socket virtual */
324static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) 355static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
325{ 356{
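The new m_shift/n_lshift based conversions split a GPA into a node-local offset (the low bits kept by the double shift) and a gnode/pnode (the bits at n_lshift). A hedged standalone sketch with purely hypothetical field positions; on real UV2 hardware the shifts come from the hub info and, per the comment above, the node bits can sit at bit 39 or 40 rather than directly above the offset:

#include <stdio.h>
#include <stdint.h>

/* Illustrative geometry only: offset in the low M_VAL bits, node field
 * starting at N_LSHIFT (chosen equal to M_VAL here for simplicity). */
#define M_VAL		46
#define M_SHIFT		(64 - M_VAL)
#define N_LSHIFT	46

static uint64_t gpa_to_offset(uint64_t gpa)
{
	return (gpa << M_SHIFT) >> M_SHIFT;	/* keep the low M_VAL bits */
}

static uint64_t gpa_to_gnode(uint64_t gpa)
{
	return gpa >> N_LSHIFT;
}

int main(void)
{
	uint64_t gpa = (5ULL << N_LSHIFT) | 0x1234000ULL;	/* node 5, offset 0x1234000 */

	printf("gpa    = %#llx\n", (unsigned long long)gpa);
	printf("offset = %#llx\n", (unsigned long long)gpa_to_offset(gpa));
	printf("gnode  = %llu\n", (unsigned long long)gpa_to_gnode(gpa));
	return 0;
}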
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 4be52c86344..10474fb1185 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -61,1689 +61,2016 @@
61/* Compat: if this #define is present, UV headers support UV2 */ 61/* Compat: if this #define is present, UV headers support UV2 */
62#define UV2_HUB_IS_SUPPORTED 1 62#define UV2_HUB_IS_SUPPORTED 1
63 63
64/* KABI compat: if this #define is present, KABI hacks are present */
65#define UV2_HUB_KABI_HACKS 1
66
67/* ========================================================================= */ 64/* ========================================================================= */
68/* UVH_BAU_DATA_BROADCAST */ 65/* UVH_BAU_DATA_BROADCAST */
69/* ========================================================================= */ 66/* ========================================================================= */
70#define UVH_BAU_DATA_BROADCAST 0x61688UL 67#define UVH_BAU_DATA_BROADCAST 0x61688UL
71#define UVH_BAU_DATA_BROADCAST_32 0x440 68#define UVH_BAU_DATA_BROADCAST_32 0x440
72 69
73#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 70#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
74#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL 71#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
75 72
76union uvh_bau_data_broadcast_u { 73union uvh_bau_data_broadcast_u {
77 unsigned long v; 74 unsigned long v;
78 struct uvh_bau_data_broadcast_s { 75 struct uvh_bau_data_broadcast_s {
79 unsigned long enable : 1; /* RW */ 76 unsigned long enable:1; /* RW */
80 unsigned long rsvd_1_63: 63; /* */ 77 unsigned long rsvd_1_63:63;
81 } s; 78 } s;
82}; 79};
83 80
84/* ========================================================================= */ 81/* ========================================================================= */
85/* UVH_BAU_DATA_CONFIG */ 82/* UVH_BAU_DATA_CONFIG */
86/* ========================================================================= */ 83/* ========================================================================= */
87#define UVH_BAU_DATA_CONFIG 0x61680UL 84#define UVH_BAU_DATA_CONFIG 0x61680UL
88#define UVH_BAU_DATA_CONFIG_32 0x438 85#define UVH_BAU_DATA_CONFIG_32 0x438
89 86
90#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 87#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
91#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL 88#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
92#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 89#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
93#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL 90#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
94#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 91#define UVH_BAU_DATA_CONFIG_P_SHFT 13
95#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL 92#define UVH_BAU_DATA_CONFIG_T_SHFT 15
96#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 93#define UVH_BAU_DATA_CONFIG_M_SHFT 16
97#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL 94#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
98#define UVH_BAU_DATA_CONFIG_P_SHFT 13 95#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
99#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL 96#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
100#define UVH_BAU_DATA_CONFIG_T_SHFT 15 97#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
101#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL 98#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
102#define UVH_BAU_DATA_CONFIG_M_SHFT 16 99#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
103#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL 100#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
104#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 101#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
105#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL 102#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
106 103
107union uvh_bau_data_config_u { 104union uvh_bau_data_config_u {
108 unsigned long v; 105 unsigned long v;
109 struct uvh_bau_data_config_s { 106 struct uvh_bau_data_config_s {
110 unsigned long vector_ : 8; /* RW */ 107 unsigned long vector_:8; /* RW */
111 unsigned long dm : 3; /* RW */ 108 unsigned long dm:3; /* RW */
112 unsigned long destmode : 1; /* RW */ 109 unsigned long destmode:1; /* RW */
113 unsigned long status : 1; /* RO */ 110 unsigned long status:1; /* RO */
114 unsigned long p : 1; /* RO */ 111 unsigned long p:1; /* RO */
115 unsigned long rsvd_14 : 1; /* */ 112 unsigned long rsvd_14:1;
116 unsigned long t : 1; /* RO */ 113 unsigned long t:1; /* RO */
117 unsigned long m : 1; /* RW */ 114 unsigned long m:1; /* RW */
118 unsigned long rsvd_17_31: 15; /* */ 115 unsigned long rsvd_17_31:15;
119 unsigned long apic_id : 32; /* RW */ 116 unsigned long apic_id:32; /* RW */
120 } s; 117 } s;
121}; 118};
122 119
123/* ========================================================================= */ 120/* ========================================================================= */
124/* UVH_EVENT_OCCURRED0 */ 121/* UVH_EVENT_OCCURRED0 */
125/* ========================================================================= */ 122/* ========================================================================= */
126#define UVH_EVENT_OCCURRED0 0x70000UL 123#define UVH_EVENT_OCCURRED0 0x70000UL
127#define UVH_EVENT_OCCURRED0_32 0x5e8 124#define UVH_EVENT_OCCURRED0_32 0x5e8
128 125
129#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 126#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
130#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL 127#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
131#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 128#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
132#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL 129#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
133#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 130#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
134#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL 131#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
135#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3 132#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
136#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL 133#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
137#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4 134#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
138#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL 135#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
139#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5 136#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
140#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL 137#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
141#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6 138#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
142#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL 139#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
143#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 140#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
144#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL 141#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
145#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 142#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
146#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL 143#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
147#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 144#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
148#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL 145#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
149#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 146#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
150#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL 147#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
151#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 148#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
152#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL 149#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
153#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 150#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
154#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL 151#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
155#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 152#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
156#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL 153#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
157#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 154#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
158#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL 155#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
159#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 156#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
160#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL 157#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
161#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 158#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
162#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL 159#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
163#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 160#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
164#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL 161#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
165#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 162#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
166#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL 163#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
167#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 164#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
168#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL 165#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
169#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 166#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
170#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL 167#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
171#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 168#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
172#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL 169#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
173#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 170#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
174#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL 171#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
175#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 172#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
176#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL 173#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
177#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 174#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
178#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL 175#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
179#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 176#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
180#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL 177#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
181#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 178#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
182#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL 179#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
183#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 180#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
184#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL 181#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
185#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 182#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
186#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL 183#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
187#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 184#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
188#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL 185#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
189#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 186#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
190#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL 187#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
191#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 188#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
192#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL 189#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
193#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 190#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
194#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL 191#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
195#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 192#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
196#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL 193#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
197#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 194#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
198#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL 195#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
199#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 196#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
200#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL 197#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
201#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 198#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
202#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL 199#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
203#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 200#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
204#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL 201#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
205#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 202#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
206#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL 203#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
207#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 204#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
208#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL 205#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
209#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 206#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
210#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL 207#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
211#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 208#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
212#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL 209#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
213#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 210#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
214#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL 211#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
215#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43 212#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
216#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL 213#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
217#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 214#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
218#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL 215#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
219#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45 216#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
220#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL 217#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
221#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 218#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
222#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL 219#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
223#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 220#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
224#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL 221#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
225#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 222#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
226#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL 223#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
227#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 224#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
228#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL 225#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
229#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 226#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
230#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL 227#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
231#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51 228#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
232#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL 229#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
233#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52 230#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
234#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL 231#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
235#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53 232#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
236#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL 233#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
237#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54 234#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
238#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL 235#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
239#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55 236#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
240#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL 237#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
241#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 238#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
242#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL 239#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
243 240
244#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 241#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
245#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL 242#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
246#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 243#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
247#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL 244#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
248#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2 245#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
249#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL 246#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
250#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 247#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
251#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL 248#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
252#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 249#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
253#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL 250#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
254#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 251#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
255#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL 252#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
256#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 253#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
257#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL 254#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
258#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 255#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
259#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL 256#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
260#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 257#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
261#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL 258#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
262#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 259#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
263#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL 260#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
264#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 261#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
265#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL 262#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
266#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 263#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
267#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL 264#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
268#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 265#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
269#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL 266#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
270#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 267#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
271#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL 268#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
272#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 269#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
273#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL 270#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
274#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 271#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
275#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL 272#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
276#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 273#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
277#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL 274#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
278#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 275#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
279#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL 276#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
280#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 277#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
281#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL 278#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
282#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 279#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
283#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL 280#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
284#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 281#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
285#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL 282#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
286#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 283#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
287#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL 284#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
288#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 285#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
289#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL 286#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
290#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 287#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
291#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL 288#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
292#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 289#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
293#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL 290#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
294#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 291#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
295#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL 292#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
296#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 293#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
297#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL 294#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
298#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 295#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
299#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL 296#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
300#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 297#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
301#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL 298#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
302#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 299#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
303#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL 300#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
304#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 301#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
305#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL 302#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
306#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 303#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
307#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL 304#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
308#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 305#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
309#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL 306#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
310#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 307#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
311#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL 308#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
312#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 309#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
313#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL 310#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
314#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 311#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
315#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL 312#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
316#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 313#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
317#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL 314#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
318#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 315#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
319#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL 316#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
320#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 317#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
321#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL 318#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
322#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 319#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
323#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL 320#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
324#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 321#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
325#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL 322#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
326#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 323#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
327#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL 324#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
328#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 325#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
329#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL 326#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
330#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 327#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
331#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL 328#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
332#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 329#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
333#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL 330#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
334#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 331#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
335#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL 332#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
336#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 333#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
337#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL 334#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
338#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 335#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
339#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL 336#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
340#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 337#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
341#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL 338#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
342#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 339#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
343#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL 340#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
344#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 341#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
345#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL 342#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
346#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 343#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
347#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL 344#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
348#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 345#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
349#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL 346#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
350#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 347#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
351#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL 348#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
352#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 349#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
353#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL 350#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
354#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 351#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
355#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL 352#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
356#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 353#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
357#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL 354#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
358#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 355#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
359#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL 356#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
360#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 357#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
361#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL 358#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
362 359
363union uvh_event_occurred0_u { 360union uvh_event_occurred0_u {
364 unsigned long v; 361 unsigned long v;
365 struct uv1h_event_occurred0_s { 362 struct uv1h_event_occurred0_s {
366 unsigned long lb_hcerr : 1; /* RW, W1C */ 363 unsigned long lb_hcerr:1; /* RW, W1C */
367 unsigned long gr0_hcerr : 1; /* RW, W1C */ 364 unsigned long gr0_hcerr:1; /* RW, W1C */
368 unsigned long gr1_hcerr : 1; /* RW, W1C */ 365 unsigned long gr1_hcerr:1; /* RW, W1C */
369 unsigned long lh_hcerr : 1; /* RW, W1C */ 366 unsigned long lh_hcerr:1; /* RW, W1C */
370 unsigned long rh_hcerr : 1; /* RW, W1C */ 367 unsigned long rh_hcerr:1; /* RW, W1C */
371 unsigned long xn_hcerr : 1; /* RW, W1C */ 368 unsigned long xn_hcerr:1; /* RW, W1C */
372 unsigned long si_hcerr : 1; /* RW, W1C */ 369 unsigned long si_hcerr:1; /* RW, W1C */
373 unsigned long lb_aoerr0 : 1; /* RW, W1C */ 370 unsigned long lb_aoerr0:1; /* RW, W1C */
374 unsigned long gr0_aoerr0 : 1; /* RW, W1C */ 371 unsigned long gr0_aoerr0:1; /* RW, W1C */
375 unsigned long gr1_aoerr0 : 1; /* RW, W1C */ 372 unsigned long gr1_aoerr0:1; /* RW, W1C */
376 unsigned long lh_aoerr0 : 1; /* RW, W1C */ 373 unsigned long lh_aoerr0:1; /* RW, W1C */
377 unsigned long rh_aoerr0 : 1; /* RW, W1C */ 374 unsigned long rh_aoerr0:1; /* RW, W1C */
378 unsigned long xn_aoerr0 : 1; /* RW, W1C */ 375 unsigned long xn_aoerr0:1; /* RW, W1C */
379 unsigned long si_aoerr0 : 1; /* RW, W1C */ 376 unsigned long si_aoerr0:1; /* RW, W1C */
380 unsigned long lb_aoerr1 : 1; /* RW, W1C */ 377 unsigned long lb_aoerr1:1; /* RW, W1C */
381 unsigned long gr0_aoerr1 : 1; /* RW, W1C */ 378 unsigned long gr0_aoerr1:1; /* RW, W1C */
382 unsigned long gr1_aoerr1 : 1; /* RW, W1C */ 379 unsigned long gr1_aoerr1:1; /* RW, W1C */
383 unsigned long lh_aoerr1 : 1; /* RW, W1C */ 380 unsigned long lh_aoerr1:1; /* RW, W1C */
384 unsigned long rh_aoerr1 : 1; /* RW, W1C */ 381 unsigned long rh_aoerr1:1; /* RW, W1C */
385 unsigned long xn_aoerr1 : 1; /* RW, W1C */ 382 unsigned long xn_aoerr1:1; /* RW, W1C */
386 unsigned long si_aoerr1 : 1; /* RW, W1C */ 383 unsigned long si_aoerr1:1; /* RW, W1C */
387 unsigned long rh_vpi_int : 1; /* RW, W1C */ 384 unsigned long rh_vpi_int:1; /* RW, W1C */
388 unsigned long system_shutdown_int : 1; /* RW, W1C */ 385 unsigned long system_shutdown_int:1; /* RW, W1C */
389 unsigned long lb_irq_int_0 : 1; /* RW, W1C */ 386 unsigned long lb_irq_int_0:1; /* RW, W1C */
390 unsigned long lb_irq_int_1 : 1; /* RW, W1C */ 387 unsigned long lb_irq_int_1:1; /* RW, W1C */
391 unsigned long lb_irq_int_2 : 1; /* RW, W1C */ 388 unsigned long lb_irq_int_2:1; /* RW, W1C */
392 unsigned long lb_irq_int_3 : 1; /* RW, W1C */ 389 unsigned long lb_irq_int_3:1; /* RW, W1C */
393 unsigned long lb_irq_int_4 : 1; /* RW, W1C */ 390 unsigned long lb_irq_int_4:1; /* RW, W1C */
394 unsigned long lb_irq_int_5 : 1; /* RW, W1C */ 391 unsigned long lb_irq_int_5:1; /* RW, W1C */
395 unsigned long lb_irq_int_6 : 1; /* RW, W1C */ 392 unsigned long lb_irq_int_6:1; /* RW, W1C */
396 unsigned long lb_irq_int_7 : 1; /* RW, W1C */ 393 unsigned long lb_irq_int_7:1; /* RW, W1C */
397 unsigned long lb_irq_int_8 : 1; /* RW, W1C */ 394 unsigned long lb_irq_int_8:1; /* RW, W1C */
398 unsigned long lb_irq_int_9 : 1; /* RW, W1C */ 395 unsigned long lb_irq_int_9:1; /* RW, W1C */
399 unsigned long lb_irq_int_10 : 1; /* RW, W1C */ 396 unsigned long lb_irq_int_10:1; /* RW, W1C */
400 unsigned long lb_irq_int_11 : 1; /* RW, W1C */ 397 unsigned long lb_irq_int_11:1; /* RW, W1C */
401 unsigned long lb_irq_int_12 : 1; /* RW, W1C */ 398 unsigned long lb_irq_int_12:1; /* RW, W1C */
402 unsigned long lb_irq_int_13 : 1; /* RW, W1C */ 399 unsigned long lb_irq_int_13:1; /* RW, W1C */
403 unsigned long lb_irq_int_14 : 1; /* RW, W1C */ 400 unsigned long lb_irq_int_14:1; /* RW, W1C */
404 unsigned long lb_irq_int_15 : 1; /* RW, W1C */ 401 unsigned long lb_irq_int_15:1; /* RW, W1C */
405 unsigned long l1_nmi_int : 1; /* RW, W1C */ 402 unsigned long l1_nmi_int:1; /* RW, W1C */
406 unsigned long stop_clock : 1; /* RW, W1C */ 403 unsigned long stop_clock:1; /* RW, W1C */
407 unsigned long asic_to_l1 : 1; /* RW, W1C */ 404 unsigned long asic_to_l1:1; /* RW, W1C */
408 unsigned long l1_to_asic : 1; /* RW, W1C */ 405 unsigned long l1_to_asic:1; /* RW, W1C */
409 unsigned long ltc_int : 1; /* RW, W1C */ 406 unsigned long ltc_int:1; /* RW, W1C */
410 unsigned long la_seq_trigger : 1; /* RW, W1C */ 407 unsigned long la_seq_trigger:1; /* RW, W1C */
411 unsigned long ipi_int : 1; /* RW, W1C */ 408 unsigned long ipi_int:1; /* RW, W1C */
412 unsigned long extio_int0 : 1; /* RW, W1C */ 409 unsigned long extio_int0:1; /* RW, W1C */
413 unsigned long extio_int1 : 1; /* RW, W1C */ 410 unsigned long extio_int1:1; /* RW, W1C */
414 unsigned long extio_int2 : 1; /* RW, W1C */ 411 unsigned long extio_int2:1; /* RW, W1C */
415 unsigned long extio_int3 : 1; /* RW, W1C */ 412 unsigned long extio_int3:1; /* RW, W1C */
416 unsigned long profile_int : 1; /* RW, W1C */ 413 unsigned long profile_int:1; /* RW, W1C */
417 unsigned long rtc0 : 1; /* RW, W1C */ 414 unsigned long rtc0:1; /* RW, W1C */
418 unsigned long rtc1 : 1; /* RW, W1C */ 415 unsigned long rtc1:1; /* RW, W1C */
419 unsigned long rtc2 : 1; /* RW, W1C */ 416 unsigned long rtc2:1; /* RW, W1C */
420 unsigned long rtc3 : 1; /* RW, W1C */ 417 unsigned long rtc3:1; /* RW, W1C */
421 unsigned long bau_data : 1; /* RW, W1C */ 418 unsigned long bau_data:1; /* RW, W1C */
422 unsigned long power_management_req : 1; /* RW, W1C */ 419 unsigned long power_management_req:1; /* RW, W1C */
423 unsigned long rsvd_57_63 : 7; /* */ 420 unsigned long rsvd_57_63:7;
424 } s1; 421 } s1;
425 struct uv2h_event_occurred0_s { 422 struct uv2h_event_occurred0_s {
426 unsigned long lb_hcerr : 1; /* RW */ 423 unsigned long lb_hcerr:1; /* RW */
427 unsigned long qp_hcerr : 1; /* RW */ 424 unsigned long qp_hcerr:1; /* RW */
428 unsigned long rh_hcerr : 1; /* RW */ 425 unsigned long rh_hcerr:1; /* RW */
429 unsigned long lh0_hcerr : 1; /* RW */ 426 unsigned long lh0_hcerr:1; /* RW */
430 unsigned long lh1_hcerr : 1; /* RW */ 427 unsigned long lh1_hcerr:1; /* RW */
431 unsigned long gr0_hcerr : 1; /* RW */ 428 unsigned long gr0_hcerr:1; /* RW */
432 unsigned long gr1_hcerr : 1; /* RW */ 429 unsigned long gr1_hcerr:1; /* RW */
433 unsigned long ni0_hcerr : 1; /* RW */ 430 unsigned long ni0_hcerr:1; /* RW */
434 unsigned long ni1_hcerr : 1; /* RW */ 431 unsigned long ni1_hcerr:1; /* RW */
435 unsigned long lb_aoerr0 : 1; /* RW */ 432 unsigned long lb_aoerr0:1; /* RW */
436 unsigned long qp_aoerr0 : 1; /* RW */ 433 unsigned long qp_aoerr0:1; /* RW */
437 unsigned long rh_aoerr0 : 1; /* RW */ 434 unsigned long rh_aoerr0:1; /* RW */
438 unsigned long lh0_aoerr0 : 1; /* RW */ 435 unsigned long lh0_aoerr0:1; /* RW */
439 unsigned long lh1_aoerr0 : 1; /* RW */ 436 unsigned long lh1_aoerr0:1; /* RW */
440 unsigned long gr0_aoerr0 : 1; /* RW */ 437 unsigned long gr0_aoerr0:1; /* RW */
441 unsigned long gr1_aoerr0 : 1; /* RW */ 438 unsigned long gr1_aoerr0:1; /* RW */
442 unsigned long xb_aoerr0 : 1; /* RW */ 439 unsigned long xb_aoerr0:1; /* RW */
443 unsigned long rt_aoerr0 : 1; /* RW */ 440 unsigned long rt_aoerr0:1; /* RW */
444 unsigned long ni0_aoerr0 : 1; /* RW */ 441 unsigned long ni0_aoerr0:1; /* RW */
445 unsigned long ni1_aoerr0 : 1; /* RW */ 442 unsigned long ni1_aoerr0:1; /* RW */
446 unsigned long lb_aoerr1 : 1; /* RW */ 443 unsigned long lb_aoerr1:1; /* RW */
447 unsigned long qp_aoerr1 : 1; /* RW */ 444 unsigned long qp_aoerr1:1; /* RW */
448 unsigned long rh_aoerr1 : 1; /* RW */ 445 unsigned long rh_aoerr1:1; /* RW */
449 unsigned long lh0_aoerr1 : 1; /* RW */ 446 unsigned long lh0_aoerr1:1; /* RW */
450 unsigned long lh1_aoerr1 : 1; /* RW */ 447 unsigned long lh1_aoerr1:1; /* RW */
451 unsigned long gr0_aoerr1 : 1; /* RW */ 448 unsigned long gr0_aoerr1:1; /* RW */
452 unsigned long gr1_aoerr1 : 1; /* RW */ 449 unsigned long gr1_aoerr1:1; /* RW */
453 unsigned long xb_aoerr1 : 1; /* RW */ 450 unsigned long xb_aoerr1:1; /* RW */
454 unsigned long rt_aoerr1 : 1; /* RW */ 451 unsigned long rt_aoerr1:1; /* RW */
455 unsigned long ni0_aoerr1 : 1; /* RW */ 452 unsigned long ni0_aoerr1:1; /* RW */
456 unsigned long ni1_aoerr1 : 1; /* RW */ 453 unsigned long ni1_aoerr1:1; /* RW */
457 unsigned long system_shutdown_int : 1; /* RW */ 454 unsigned long system_shutdown_int:1; /* RW */
458 unsigned long lb_irq_int_0 : 1; /* RW */ 455 unsigned long lb_irq_int_0:1; /* RW */
459 unsigned long lb_irq_int_1 : 1; /* RW */ 456 unsigned long lb_irq_int_1:1; /* RW */
460 unsigned long lb_irq_int_2 : 1; /* RW */ 457 unsigned long lb_irq_int_2:1; /* RW */
461 unsigned long lb_irq_int_3 : 1; /* RW */ 458 unsigned long lb_irq_int_3:1; /* RW */
462 unsigned long lb_irq_int_4 : 1; /* RW */ 459 unsigned long lb_irq_int_4:1; /* RW */
463 unsigned long lb_irq_int_5 : 1; /* RW */ 460 unsigned long lb_irq_int_5:1; /* RW */
464 unsigned long lb_irq_int_6 : 1; /* RW */ 461 unsigned long lb_irq_int_6:1; /* RW */
465 unsigned long lb_irq_int_7 : 1; /* RW */ 462 unsigned long lb_irq_int_7:1; /* RW */
466 unsigned long lb_irq_int_8 : 1; /* RW */ 463 unsigned long lb_irq_int_8:1; /* RW */
467 unsigned long lb_irq_int_9 : 1; /* RW */ 464 unsigned long lb_irq_int_9:1; /* RW */
468 unsigned long lb_irq_int_10 : 1; /* RW */ 465 unsigned long lb_irq_int_10:1; /* RW */
469 unsigned long lb_irq_int_11 : 1; /* RW */ 466 unsigned long lb_irq_int_11:1; /* RW */
470 unsigned long lb_irq_int_12 : 1; /* RW */ 467 unsigned long lb_irq_int_12:1; /* RW */
471 unsigned long lb_irq_int_13 : 1; /* RW */ 468 unsigned long lb_irq_int_13:1; /* RW */
472 unsigned long lb_irq_int_14 : 1; /* RW */ 469 unsigned long lb_irq_int_14:1; /* RW */
473 unsigned long lb_irq_int_15 : 1; /* RW */ 470 unsigned long lb_irq_int_15:1; /* RW */
474 unsigned long l1_nmi_int : 1; /* RW */ 471 unsigned long l1_nmi_int:1; /* RW */
475 unsigned long stop_clock : 1; /* RW */ 472 unsigned long stop_clock:1; /* RW */
476 unsigned long asic_to_l1 : 1; /* RW */ 473 unsigned long asic_to_l1:1; /* RW */
477 unsigned long l1_to_asic : 1; /* RW */ 474 unsigned long l1_to_asic:1; /* RW */
478 unsigned long la_seq_trigger : 1; /* RW */ 475 unsigned long la_seq_trigger:1; /* RW */
479 unsigned long ipi_int : 1; /* RW */ 476 unsigned long ipi_int:1; /* RW */
480 unsigned long extio_int0 : 1; /* RW */ 477 unsigned long extio_int0:1; /* RW */
481 unsigned long extio_int1 : 1; /* RW */ 478 unsigned long extio_int1:1; /* RW */
482 unsigned long extio_int2 : 1; /* RW */ 479 unsigned long extio_int2:1; /* RW */
483 unsigned long extio_int3 : 1; /* RW */ 480 unsigned long extio_int3:1; /* RW */
484 unsigned long profile_int : 1; /* RW */ 481 unsigned long profile_int:1; /* RW */
485 unsigned long rsvd_59_63 : 5; /* */ 482 unsigned long rsvd_59_63:5;
486 } s2; 483 } s2;
487}; 484};
488 485
489/* ========================================================================= */ 486/* ========================================================================= */
490/* UVH_EVENT_OCCURRED0_ALIAS */ 487/* UVH_EVENT_OCCURRED0_ALIAS */
491/* ========================================================================= */ 488/* ========================================================================= */
492#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL 489#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
493#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 490#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
494 491
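The UV1 bitfield comments above mark these events as W1C, and the ALIAS register just defined sits 8 bytes past EVENT_OCCURRED0. A minimal sketch of how a driver could poll one event bit and acknowledge it through the alias, assuming the uv_read_local_mmr()/uv_write_local_mmr() accessors from <asm/uv/uv_hub.h> and the UVH_EVENT_OCCURRED0 offset defined earlier in this header (function name and usage are illustrative, not part of this patch):

static void uv1_ack_profile_int(void)
{
	unsigned long ev = uv_read_local_mmr(UVH_EVENT_OCCURRED0);

	if (ev & UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK)
		/* W1C: writing the set bit back (via the alias) clears it */
		uv_write_local_mmr(UVH_EVENT_OCCURRED0_ALIAS,
				   UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK);
}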
495/* ========================================================================= */ 492/* ========================================================================= */
496/* UVH_GR0_TLB_INT0_CONFIG */ 493/* UVH_GR0_TLB_INT0_CONFIG */
497/* ========================================================================= */ 494/* ========================================================================= */
498#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL 495#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
499 496
500#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 497#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
501#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL 498#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
502#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 499#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
503#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL 500#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
504#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11 501#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
505#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL 502#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
506#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12 503#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
507#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL 504#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
508#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13 505#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
509#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL 506#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
510#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15 507#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
511#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL 508#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
512#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16 509#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
513#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL 510#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
514#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32 511#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
515#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL 512#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
516 513
517union uvh_gr0_tlb_int0_config_u { 514union uvh_gr0_tlb_int0_config_u {
518 unsigned long v; 515 unsigned long v;
519 struct uvh_gr0_tlb_int0_config_s { 516 struct uvh_gr0_tlb_int0_config_s {
520 unsigned long vector_ : 8; /* RW */ 517 unsigned long vector_:8; /* RW */
521 unsigned long dm : 3; /* RW */ 518 unsigned long dm:3; /* RW */
522 unsigned long destmode : 1; /* RW */ 519 unsigned long destmode:1; /* RW */
523 unsigned long status : 1; /* RO */ 520 unsigned long status:1; /* RO */
524 unsigned long p : 1; /* RO */ 521 unsigned long p:1; /* RO */
525 unsigned long rsvd_14 : 1; /* */ 522 unsigned long rsvd_14:1;
526 unsigned long t : 1; /* RO */ 523 unsigned long t:1; /* RO */
527 unsigned long m : 1; /* RW */ 524 unsigned long m:1; /* RW */
528 unsigned long rsvd_17_31: 15; /* */ 525 unsigned long rsvd_17_31:15;
529 unsigned long apic_id : 32; /* RW */ 526 unsigned long apic_id:32; /* RW */
530 } s; 527 } s;
531}; 528};
532 529
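The union view above is the intended way to build an interrupt-config value without open-coding shifts. A short sketch of programming GR0 TLB interrupt 0, assuming uv_write_local_mmr() from <asm/uv/uv_hub.h>; vector and apicid are caller-supplied and the delivery-mode encodings follow the usual APIC convention (illustrative only):

static void uv_setup_gr0_tlb_int0(unsigned char vector, unsigned int apicid)
{
	union uvh_gr0_tlb_int0_config_u cfg;

	cfg.v = 0;
	cfg.s.vector_ = vector;		/* bits 7:0 */
	cfg.s.dm = 0;			/* 0 = fixed delivery mode */
	cfg.s.destmode = 0;		/* 0 = physical destination */
	cfg.s.m = 0;			/* presumably the mask bit; leave unmasked */
	cfg.s.apic_id = apicid;		/* bits 63:32 */

	uv_write_local_mmr(UVH_GR0_TLB_INT0_CONFIG, cfg.v);
}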
533/* ========================================================================= */ 530/* ========================================================================= */
534/* UVH_GR0_TLB_INT1_CONFIG */ 531/* UVH_GR0_TLB_INT1_CONFIG */
535/* ========================================================================= */ 532/* ========================================================================= */
536#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL 533#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
537 534
538#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 535#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
539#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL 536#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
540#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 537#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
541#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL 538#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
542#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11 539#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
543#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL 540#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
544#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12 541#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
545#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL 542#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
546#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13 543#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
547#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL 544#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
548#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15 545#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
549#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL 546#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
550#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16 547#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
551#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL 548#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
552#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32 549#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
553#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL 550#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
554 551
555union uvh_gr0_tlb_int1_config_u { 552union uvh_gr0_tlb_int1_config_u {
556 unsigned long v; 553 unsigned long v;
557 struct uvh_gr0_tlb_int1_config_s { 554 struct uvh_gr0_tlb_int1_config_s {
558 unsigned long vector_ : 8; /* RW */ 555 unsigned long vector_:8; /* RW */
559 unsigned long dm : 3; /* RW */ 556 unsigned long dm:3; /* RW */
560 unsigned long destmode : 1; /* RW */ 557 unsigned long destmode:1; /* RW */
561 unsigned long status : 1; /* RO */ 558 unsigned long status:1; /* RO */
562 unsigned long p : 1; /* RO */ 559 unsigned long p:1; /* RO */
563 unsigned long rsvd_14 : 1; /* */ 560 unsigned long rsvd_14:1;
564 unsigned long t : 1; /* RO */ 561 unsigned long t:1; /* RO */
565 unsigned long m : 1; /* RW */ 562 unsigned long m:1; /* RW */
566 unsigned long rsvd_17_31: 15; /* */ 563 unsigned long rsvd_17_31:15;
567 unsigned long apic_id : 32; /* RW */ 564 unsigned long apic_id:32; /* RW */
568 } s; 565 } s;
566};
567
568/* ========================================================================= */
569/* UVH_GR0_TLB_MMR_CONTROL */
570/* ========================================================================= */
571#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL
572#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL
573#define UVH_GR0_TLB_MMR_CONTROL (is_uv1_hub() ? \
574 UV1H_GR0_TLB_MMR_CONTROL : \
575 UV2H_GR0_TLB_MMR_CONTROL)
576
577#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
578#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
579#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
580#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
581#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
582#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
583#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
584#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
585#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
586#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
587#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
588#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
589
590#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
591#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
592#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
593#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
594#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
595#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
596#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
597#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
598#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
599#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
600#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
601#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
602#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
603#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
604#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
605#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
606#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
607#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
608#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
609#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
610#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
611#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
612
613#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
614#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
615#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
616#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
617#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
618#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
619#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
620#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
621#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
622#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
623#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
624#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
625#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
626#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
627#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
628#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
629#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
630#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
631
632union uvh_gr0_tlb_mmr_control_u {
633 unsigned long v;
634 struct uvh_gr0_tlb_mmr_control_s {
635 unsigned long index:12; /* RW */
636 unsigned long mem_sel:2; /* RW */
637 unsigned long rsvd_14_15:2;
638 unsigned long auto_valid_en:1; /* RW */
639 unsigned long rsvd_17_19:3;
640 unsigned long mmr_hash_index_en:1; /* RW */
641 unsigned long rsvd_21_29:9;
642 unsigned long mmr_write:1; /* WP */
643 unsigned long mmr_read:1; /* WP */
644 unsigned long rsvd_32_63:32;
645 } s;
646 struct uv1h_gr0_tlb_mmr_control_s {
647 unsigned long index:12; /* RW */
648 unsigned long mem_sel:2; /* RW */
649 unsigned long rsvd_14_15:2;
650 unsigned long auto_valid_en:1; /* RW */
651 unsigned long rsvd_17_19:3;
652 unsigned long mmr_hash_index_en:1; /* RW */
653 unsigned long rsvd_21_29:9;
654 unsigned long mmr_write:1; /* WP */
655 unsigned long mmr_read:1; /* WP */
656 unsigned long rsvd_32_47:16;
657 unsigned long mmr_inj_con:1; /* RW */
658 unsigned long rsvd_49_51:3;
659 unsigned long mmr_inj_tlbram:1; /* RW */
660 unsigned long rsvd_53:1;
661 unsigned long mmr_inj_tlbpgsize:1; /* RW */
662 unsigned long rsvd_55:1;
663 unsigned long mmr_inj_tlbrreg:1; /* RW */
664 unsigned long rsvd_57_59:3;
665 unsigned long mmr_inj_tlblruv:1; /* RW */
666 unsigned long rsvd_61_63:3;
667 } s1;
668 struct uv2h_gr0_tlb_mmr_control_s {
669 unsigned long index:12; /* RW */
670 unsigned long mem_sel:2; /* RW */
671 unsigned long rsvd_14_15:2;
672 unsigned long auto_valid_en:1; /* RW */
673 unsigned long rsvd_17_19:3;
674 unsigned long mmr_hash_index_en:1; /* RW */
675 unsigned long rsvd_21_29:9;
676 unsigned long mmr_write:1; /* WP */
677 unsigned long mmr_read:1; /* WP */
678 unsigned long mmr_op_done:1; /* RW */
679 unsigned long rsvd_33_47:15;
680 unsigned long mmr_inj_con:1; /* RW */
681 unsigned long rsvd_49_51:3;
682 unsigned long mmr_inj_tlbram:1; /* RW */
683 unsigned long rsvd_53_63:11;
684 } s2;
685};
686
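Unlike the interrupt-config MMRs, the TLB control MMR lives at different offsets on UV1 and UV2, so the UVH_GR0_TLB_MMR_CONTROL name resolves at run time via is_uv1_hub(), and only the fields common to both hubs appear in the .s view of the union. A hub-independent access might look like the following sketch (uv_read_local_mmr()/uv_write_local_mmr() from <asm/uv/uv_hub.h> are assumed; the function is illustrative):

static void uv_gr0_tlb_select_entry(unsigned int index)
{
	union uvh_gr0_tlb_mmr_control_u ctl;

	/* the macro expands to the UV1 or UV2 offset for this hub */
	ctl.v = uv_read_local_mmr(UVH_GR0_TLB_MMR_CONTROL);
	ctl.s.index = index;		/* common field, valid on both hubs */
	uv_write_local_mmr(UVH_GR0_TLB_MMR_CONTROL, ctl.v);
}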
687/* ========================================================================= */
688/* UVH_GR0_TLB_MMR_READ_DATA_HI */
689/* ========================================================================= */
690#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL
691#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
692#define UVH_GR0_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
693 UV1H_GR0_TLB_MMR_READ_DATA_HI : \
694 UV2H_GR0_TLB_MMR_READ_DATA_HI)
695
696#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
697#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
698#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
699#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
700#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
701#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
702#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
703#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
704
705union uvh_gr0_tlb_mmr_read_data_hi_u {
706 unsigned long v;
707 struct uvh_gr0_tlb_mmr_read_data_hi_s {
708 unsigned long pfn:41; /* RO */
709 unsigned long gaa:2; /* RO */
710 unsigned long dirty:1; /* RO */
711 unsigned long larger:1; /* RO */
712 unsigned long rsvd_45_63:19;
713 } s;
714};
715
716/* ========================================================================= */
717/* UVH_GR0_TLB_MMR_READ_DATA_LO */
718/* ========================================================================= */
719#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL
720#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
721#define UVH_GR0_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
722 UV1H_GR0_TLB_MMR_READ_DATA_LO : \
723 UV2H_GR0_TLB_MMR_READ_DATA_LO)
724
725#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
726#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
727#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
728#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
729#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
730#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
731
732union uvh_gr0_tlb_mmr_read_data_lo_u {
733 unsigned long v;
734 struct uvh_gr0_tlb_mmr_read_data_lo_s {
735 unsigned long vpn:39; /* RO */
736 unsigned long asid:24; /* RO */
737 unsigned long valid:1; /* RO */
738 } s;
569}; 739};
570 740
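Taken together, the control/read-data trio added above suggests an indirect read path: select a TLB entry through GR0_TLB_MMR_CONTROL, pulse mmr_read, then fetch the result from READ_DATA_HI/LO. The header does not spell out the handshake, so the following is only an illustrative guess at a UV2 sequence (field usage, the completion poll, and the uv_*_local_mmr() accessors are all assumptions):

static unsigned long uv2_gr0_tlb_read_pfn(unsigned int index)
{
	union uvh_gr0_tlb_mmr_control_u ctl;
	union uvh_gr0_tlb_mmr_read_data_hi_u hi;

	ctl.v = 0;
	ctl.s.index = index;
	ctl.s.mmr_read = 1;		/* WP field: start the indirect read */
	uv_write_local_mmr(UVH_GR0_TLB_MMR_CONTROL, ctl.v);

	/* UV2 exposes a completion flag; UV1 has no mmr_op_done bit */
	while (!(uv_read_local_mmr(UVH_GR0_TLB_MMR_CONTROL) &
		 UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK))
		cpu_relax();

	hi.v = uv_read_local_mmr(UVH_GR0_TLB_MMR_READ_DATA_HI);
	return hi.s.pfn;
}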
571/* ========================================================================= */ 741/* ========================================================================= */
572/* UVH_GR1_TLB_INT0_CONFIG */ 742/* UVH_GR1_TLB_INT0_CONFIG */
573/* ========================================================================= */ 743/* ========================================================================= */
574#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL 744#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
575 745
576#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 746#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
577#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL 747#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
578#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 748#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
579#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL 749#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
580#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11 750#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
581#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL 751#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
582#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12 752#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
583#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL 753#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
584#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13 754#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
585#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL 755#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
586#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15 756#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
587#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL 757#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
588#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16 758#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
589#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL 759#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
590#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32 760#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
591#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL 761#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
592 762
593union uvh_gr1_tlb_int0_config_u { 763union uvh_gr1_tlb_int0_config_u {
594 unsigned long v; 764 unsigned long v;
595 struct uvh_gr1_tlb_int0_config_s { 765 struct uvh_gr1_tlb_int0_config_s {
596 unsigned long vector_ : 8; /* RW */ 766 unsigned long vector_:8; /* RW */
597 unsigned long dm : 3; /* RW */ 767 unsigned long dm:3; /* RW */
598 unsigned long destmode : 1; /* RW */ 768 unsigned long destmode:1; /* RW */
599 unsigned long status : 1; /* RO */ 769 unsigned long status:1; /* RO */
600 unsigned long p : 1; /* RO */ 770 unsigned long p:1; /* RO */
601 unsigned long rsvd_14 : 1; /* */ 771 unsigned long rsvd_14:1;
602 unsigned long t : 1; /* RO */ 772 unsigned long t:1; /* RO */
603 unsigned long m : 1; /* RW */ 773 unsigned long m:1; /* RW */
604 unsigned long rsvd_17_31: 15; /* */ 774 unsigned long rsvd_17_31:15;
605 unsigned long apic_id : 32; /* RW */ 775 unsigned long apic_id:32; /* RW */
606 } s; 776 } s;
607}; 777};
608 778
609/* ========================================================================= */ 779/* ========================================================================= */
610/* UVH_GR1_TLB_INT1_CONFIG */ 780/* UVH_GR1_TLB_INT1_CONFIG */
611/* ========================================================================= */ 781/* ========================================================================= */
612#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL 782#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
613 783
614#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 784#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
615#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL 785#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
616#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 786#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
617#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL 787#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
618#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11 788#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
619#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL 789#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
620#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12 790#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
621#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL 791#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
622#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13 792#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
623#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL 793#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
624#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15 794#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
625#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL 795#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
626#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16 796#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
627#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL 797#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
628#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32 798#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
629#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL 799#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
630 800
631union uvh_gr1_tlb_int1_config_u { 801union uvh_gr1_tlb_int1_config_u {
632 unsigned long v; 802 unsigned long v;
633 struct uvh_gr1_tlb_int1_config_s { 803 struct uvh_gr1_tlb_int1_config_s {
634 unsigned long vector_ : 8; /* RW */ 804 unsigned long vector_:8; /* RW */
635 unsigned long dm : 3; /* RW */ 805 unsigned long dm:3; /* RW */
636 unsigned long destmode : 1; /* RW */ 806 unsigned long destmode:1; /* RW */
637 unsigned long status : 1; /* RO */ 807 unsigned long status:1; /* RO */
638 unsigned long p : 1; /* RO */ 808 unsigned long p:1; /* RO */
639 unsigned long rsvd_14 : 1; /* */ 809 unsigned long rsvd_14:1;
640 unsigned long t : 1; /* RO */ 810 unsigned long t:1; /* RO */
641 unsigned long m : 1; /* RW */ 811 unsigned long m:1; /* RW */
642 unsigned long rsvd_17_31: 15; /* */ 812 unsigned long rsvd_17_31:15;
643 unsigned long apic_id : 32; /* RW */ 813 unsigned long apic_id:32; /* RW */
644 } s; 814 } s;
815};
816
817/* ========================================================================= */
818/* UVH_GR1_TLB_MMR_CONTROL */
819/* ========================================================================= */
820#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL
821#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL
822#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ? \
823 UV1H_GR1_TLB_MMR_CONTROL : \
824 UV2H_GR1_TLB_MMR_CONTROL)
825
826#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
827#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
828#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
829#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
830#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
831#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
832#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
833#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
834#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
835#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
836#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
837#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
838
839#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
840#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
841#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
842#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
843#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
844#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
845#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
846#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
847#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
848#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
849#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
850#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
851#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
852#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
853#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
854#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
855#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
856#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
857#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
858#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
859#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
860#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
861
862#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
863#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
864#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
865#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
866#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
867#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
868#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
869#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
870#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
871#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
872#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
873#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
874#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
875#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
876#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
877#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
878#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
879#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
880
881union uvh_gr1_tlb_mmr_control_u {
882 unsigned long v;
883 struct uvh_gr1_tlb_mmr_control_s {
884 unsigned long index:12; /* RW */
885 unsigned long mem_sel:2; /* RW */
886 unsigned long rsvd_14_15:2;
887 unsigned long auto_valid_en:1; /* RW */
888 unsigned long rsvd_17_19:3;
889 unsigned long mmr_hash_index_en:1; /* RW */
890 unsigned long rsvd_21_29:9;
891 unsigned long mmr_write:1; /* WP */
892 unsigned long mmr_read:1; /* WP */
893 unsigned long rsvd_32_63:32;
894 } s;
895 struct uv1h_gr1_tlb_mmr_control_s {
896 unsigned long index:12; /* RW */
897 unsigned long mem_sel:2; /* RW */
898 unsigned long rsvd_14_15:2;
899 unsigned long auto_valid_en:1; /* RW */
900 unsigned long rsvd_17_19:3;
901 unsigned long mmr_hash_index_en:1; /* RW */
902 unsigned long rsvd_21_29:9;
903 unsigned long mmr_write:1; /* WP */
904 unsigned long mmr_read:1; /* WP */
905 unsigned long rsvd_32_47:16;
906 unsigned long mmr_inj_con:1; /* RW */
907 unsigned long rsvd_49_51:3;
908 unsigned long mmr_inj_tlbram:1; /* RW */
909 unsigned long rsvd_53:1;
910 unsigned long mmr_inj_tlbpgsize:1; /* RW */
911 unsigned long rsvd_55:1;
912 unsigned long mmr_inj_tlbrreg:1; /* RW */
913 unsigned long rsvd_57_59:3;
914 unsigned long mmr_inj_tlblruv:1; /* RW */
915 unsigned long rsvd_61_63:3;
916 } s1;
917 struct uv2h_gr1_tlb_mmr_control_s {
918 unsigned long index:12; /* RW */
919 unsigned long mem_sel:2; /* RW */
920 unsigned long rsvd_14_15:2;
921 unsigned long auto_valid_en:1; /* RW */
922 unsigned long rsvd_17_19:3;
923 unsigned long mmr_hash_index_en:1; /* RW */
924 unsigned long rsvd_21_29:9;
925 unsigned long mmr_write:1; /* WP */
926 unsigned long mmr_read:1; /* WP */
927 unsigned long mmr_op_done:1; /* RW */
928 unsigned long rsvd_33_47:15;
929 unsigned long mmr_inj_con:1; /* RW */
930 unsigned long rsvd_49_51:3;
931 unsigned long mmr_inj_tlbram:1; /* RW */
932 unsigned long rsvd_53_63:11;
933 } s2;
934};
935
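A stand-alone sketch of why UVH_GR1_TLB_MMR_CONTROL is defined as an is_uv1_hub() ternary and why the union carries s1/s2 variants: UV1 and UV2 hubs expose the same logical register at different offsets, and some bits, such as MMR_OP_DONE, exist only on UV2. Both is_uv1_hub() and the MMR read are stubbed so the fragment compiles by itself; in the kernel the read would come from an accessor like uv_read_local_mmr() from uv_hub.h (assumed, not part of this header).

#include <stdio.h>

#define UV1H_GR1_TLB_MMR_CONTROL			0x801080UL
#define UV2H_GR1_TLB_MMR_CONTROL			0x1001080UL
#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK	0x0000000100000000UL

static int is_uv1_hub(void)			/* stub: pretend this is a UV2 hub */
{
	return 0;
}

static unsigned long read_mmr(unsigned long offset)	/* stub MMR read */
{
	return offset == UV2H_GR1_TLB_MMR_CONTROL ?
		UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK : 0;
}

#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ?		\
		UV1H_GR1_TLB_MMR_CONTROL :		\
		UV2H_GR1_TLB_MMR_CONTROL)

int main(void)
{
	unsigned long ctl = read_mmr(UVH_GR1_TLB_MMR_CONTROL);

	/* MMR_OP_DONE exists only on UV2; UV1 code must not test it. */
	if (!is_uv1_hub() && (ctl & UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK))
		printf("TLB MMR operation complete\n");

	return 0;
}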
936/* ========================================================================= */
937/* UVH_GR1_TLB_MMR_READ_DATA_HI */
938/* ========================================================================= */
939#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL
940#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
941#define UVH_GR1_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
942 UV1H_GR1_TLB_MMR_READ_DATA_HI : \
943 UV2H_GR1_TLB_MMR_READ_DATA_HI)
944
945#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
946#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
947#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
948#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
949#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
950#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
951#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
952#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
953
954union uvh_gr1_tlb_mmr_read_data_hi_u {
955 unsigned long v;
956 struct uvh_gr1_tlb_mmr_read_data_hi_s {
957 unsigned long pfn:41; /* RO */
958 unsigned long gaa:2; /* RO */
959 unsigned long dirty:1; /* RO */
960 unsigned long larger:1; /* RO */
961 unsigned long rsvd_45_63:19;
962 } s;
963};
964
965/* ========================================================================= */
966/* UVH_GR1_TLB_MMR_READ_DATA_LO */
967/* ========================================================================= */
968#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL
969#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
970#define UVH_GR1_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
971 UV1H_GR1_TLB_MMR_READ_DATA_LO : \
972 UV2H_GR1_TLB_MMR_READ_DATA_LO)
973
974#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
975#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
976#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
977#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
978#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
979#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
980
981union uvh_gr1_tlb_mmr_read_data_lo_u {
982 unsigned long v;
983 struct uvh_gr1_tlb_mmr_read_data_lo_s {
984 unsigned long vpn:39; /* RO */
985 unsigned long asid:24; /* RO */
986 unsigned long valid:1; /* RO */
987 } s;
645}; 988};
646 989
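With the read-data registers the masks are used in the opposite direction, to decode a raw read-back. A stand-alone sketch with made-up sample values (on hardware they would come from the two MMRs after a read operation is started through UVH_GR1_TLB_MMR_CONTROL):

#include <stdio.h>

#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT		0
#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK		0x000001ffffffffffUL
#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT		43
#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK		0x0000080000000000UL
#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT		0
#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK		0x0000007fffffffffUL
#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT		63
#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK		0x8000000000000000UL

int main(void)
{
	unsigned long hi = 0x0000080000012345UL;	/* sample read-back */
	unsigned long lo = 0x8000000000054321UL;	/* sample read-back */

	printf("pfn   0x%lx\n", (hi & UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK)
				>> UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT);
	printf("dirty %lu\n",   (hi & UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK)
				>> UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT);
	printf("vpn   0x%lx\n", (lo & UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK)
				>> UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT);
	printf("valid %lu\n",   (lo & UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK)
				>> UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT);
	return 0;
}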
647/* ========================================================================= */ 990/* ========================================================================= */
648/* UVH_INT_CMPB */ 991/* UVH_INT_CMPB */
649/* ========================================================================= */ 992/* ========================================================================= */
650#define UVH_INT_CMPB 0x22080UL 993#define UVH_INT_CMPB 0x22080UL
651 994
652#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 995#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
653#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL 996#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
654 997
655union uvh_int_cmpb_u { 998union uvh_int_cmpb_u {
656 unsigned long v; 999 unsigned long v;
657 struct uvh_int_cmpb_s { 1000 struct uvh_int_cmpb_s {
658 unsigned long real_time_cmpb : 56; /* RW */ 1001 unsigned long real_time_cmpb:56; /* RW */
659 unsigned long rsvd_56_63 : 8; /* */ 1002 unsigned long rsvd_56_63:8;
660 } s; 1003 } s;
661}; 1004};
662 1005
663/* ========================================================================= */ 1006/* ========================================================================= */
664/* UVH_INT_CMPC */ 1007/* UVH_INT_CMPC */
665/* ========================================================================= */ 1008/* ========================================================================= */
666#define UVH_INT_CMPC 0x22100UL 1009#define UVH_INT_CMPC 0x22100UL
667 1010
668#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 1011#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
669#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 1012#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
670#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT (is_uv1_hub() ? \
671 UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT : \
672 UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT)
673#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
674#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
675#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK (is_uv1_hub() ? \
676 UV1H_INT_CMPC_REAL_TIME_CMPC_MASK : \
677 UV2H_INT_CMPC_REAL_TIME_CMPC_MASK)
678 1013
679union uvh_int_cmpc_u { 1014union uvh_int_cmpc_u {
680 unsigned long v; 1015 unsigned long v;
681 struct uvh_int_cmpc_s { 1016 struct uvh_int_cmpc_s {
682 unsigned long real_time_cmpc : 56; /* RW */ 1017 unsigned long real_time_cmpc:56; /* RW */
683 unsigned long rsvd_56_63 : 8; /* */ 1018 unsigned long rsvd_56_63:8;
684 } s; 1019 } s;
685}; 1020};
686 1021
687/* ========================================================================= */ 1022/* ========================================================================= */
688/* UVH_INT_CMPD */ 1023/* UVH_INT_CMPD */
689/* ========================================================================= */ 1024/* ========================================================================= */
690#define UVH_INT_CMPD 0x22180UL 1025#define UVH_INT_CMPD 0x22180UL
691 1026
692#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 1027#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
693#define UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 1028#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
694#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT (is_uv1_hub() ? \
695 UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT : \
696 UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT)
697#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
698#define UV2H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
699#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK (is_uv1_hub() ? \
700 UV1H_INT_CMPD_REAL_TIME_CMPD_MASK : \
701 UV2H_INT_CMPD_REAL_TIME_CMPD_MASK)
702 1029
703union uvh_int_cmpd_u { 1030union uvh_int_cmpd_u {
704 unsigned long v; 1031 unsigned long v;
705 struct uvh_int_cmpd_s { 1032 struct uvh_int_cmpd_s {
706 unsigned long real_time_cmpd : 56; /* RW */ 1033 unsigned long real_time_cmpd:56; /* RW */
707 unsigned long rsvd_56_63 : 8; /* */ 1034 unsigned long rsvd_56_63:8;
708 } s; 1035 } s;
709}; 1036};
710 1037
711/* ========================================================================= */ 1038/* ========================================================================= */
712/* UVH_IPI_INT */ 1039/* UVH_IPI_INT */
713/* ========================================================================= */ 1040/* ========================================================================= */
714#define UVH_IPI_INT 0x60500UL 1041#define UVH_IPI_INT 0x60500UL
715#define UVH_IPI_INT_32 0x348 1042#define UVH_IPI_INT_32 0x348
716 1043
717#define UVH_IPI_INT_VECTOR_SHFT 0 1044#define UVH_IPI_INT_VECTOR_SHFT 0
718#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL 1045#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
719#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 1046#define UVH_IPI_INT_DESTMODE_SHFT 11
720#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL 1047#define UVH_IPI_INT_APIC_ID_SHFT 16
721#define UVH_IPI_INT_DESTMODE_SHFT 11 1048#define UVH_IPI_INT_SEND_SHFT 63
722#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL 1049#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
723#define UVH_IPI_INT_APIC_ID_SHFT 16 1050#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
724#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL 1051#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
725#define UVH_IPI_INT_SEND_SHFT 63 1052#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
726#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL 1053#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
727 1054
728union uvh_ipi_int_u { 1055union uvh_ipi_int_u {
729 unsigned long v; 1056 unsigned long v;
730 struct uvh_ipi_int_s { 1057 struct uvh_ipi_int_s {
731 unsigned long vector_ : 8; /* RW */ 1058 unsigned long vector_:8; /* RW */
732 unsigned long delivery_mode : 3; /* RW */ 1059 unsigned long delivery_mode:3; /* RW */
733 unsigned long destmode : 1; /* RW */ 1060 unsigned long destmode:1; /* RW */
734 unsigned long rsvd_12_15 : 4; /* */ 1061 unsigned long rsvd_12_15:4;
735 unsigned long apic_id : 32; /* RW */ 1062 unsigned long apic_id:32; /* RW */
736 unsigned long rsvd_48_62 : 15; /* */ 1063 unsigned long rsvd_48_62:15;
737 unsigned long send : 1; /* WP */ 1064 unsigned long send:1; /* WP */
738 } s; 1065 } s;
739}; 1066};
740 1067
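The same shift/mask pattern works in the write direction. A stand-alone sketch of composing a value for UVH_IPI_INT (the field values are arbitrary; in the kernel the result would be written out through a uv_hub.h accessor such as uv_write_global_mmr64(), which is assumed here and not defined in this header):

#include <assert.h>

#define UVH_IPI_INT_VECTOR_SHFT		0
#define UVH_IPI_INT_DELIVERY_MODE_SHFT	8
#define UVH_IPI_INT_APIC_ID_SHFT	16
#define UVH_IPI_INT_SEND_SHFT		63
#define UVH_IPI_INT_APIC_ID_MASK	0x0000ffffffff0000UL
#define UVH_IPI_INT_SEND_MASK		0x8000000000000000UL

int main(void)
{
	unsigned long vector = 0xf0;	/* arbitrary interrupt vector */
	unsigned long apicid = 0x3c;	/* arbitrary destination APIC */
	unsigned long dmode  = 0;	/* fixed delivery mode        */

	unsigned long val = (1UL    << UVH_IPI_INT_SEND_SHFT)          |
			    (dmode  << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
			    (apicid << UVH_IPI_INT_APIC_ID_SHFT)       |
			    (vector << UVH_IPI_INT_VECTOR_SHFT);

	/* The send trigger is the top bit; the APIC id lands in bits 16..47. */
	assert(val & UVH_IPI_INT_SEND_MASK);
	assert(((val & UVH_IPI_INT_APIC_ID_MASK) >> UVH_IPI_INT_APIC_ID_SHFT) == 0x3c);
	return 0;
}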
741/* ========================================================================= */ 1068/* ========================================================================= */
742/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ 1069/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
743/* ========================================================================= */ 1070/* ========================================================================= */
744#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL 1071#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
745#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 1072#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
746 1073
747#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 1074#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
748#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
749#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 1075#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
1076#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
750#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL 1077#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
751 1078
752union uvh_lb_bau_intd_payload_queue_first_u { 1079union uvh_lb_bau_intd_payload_queue_first_u {
753 unsigned long v; 1080 unsigned long v;
754 struct uvh_lb_bau_intd_payload_queue_first_s { 1081 struct uvh_lb_bau_intd_payload_queue_first_s {
755 unsigned long rsvd_0_3: 4; /* */ 1082 unsigned long rsvd_0_3:4;
756 unsigned long address : 39; /* RW */ 1083 unsigned long address:39; /* RW */
757 unsigned long rsvd_43_48: 6; /* */ 1084 unsigned long rsvd_43_48:6;
758 unsigned long node_id : 14; /* RW */ 1085 unsigned long node_id:14; /* RW */
759 unsigned long rsvd_63 : 1; /* */ 1086 unsigned long rsvd_63:1;
760 } s; 1087 } s;
761}; 1088};
762 1089
763/* ========================================================================= */ 1090/* ========================================================================= */
764/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ 1091/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
765/* ========================================================================= */ 1092/* ========================================================================= */
766#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL 1093#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
767#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 1094#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
768 1095
769#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 1096#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
770#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL 1097#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
771 1098
772union uvh_lb_bau_intd_payload_queue_last_u { 1099union uvh_lb_bau_intd_payload_queue_last_u {
773 unsigned long v; 1100 unsigned long v;
774 struct uvh_lb_bau_intd_payload_queue_last_s { 1101 struct uvh_lb_bau_intd_payload_queue_last_s {
775 unsigned long rsvd_0_3: 4; /* */ 1102 unsigned long rsvd_0_3:4;
776 unsigned long address : 39; /* RW */ 1103 unsigned long address:39; /* RW */
777 unsigned long rsvd_43_63: 21; /* */ 1104 unsigned long rsvd_43_63:21;
778 } s; 1105 } s;
779}; 1106};
780 1107
781/* ========================================================================= */ 1108/* ========================================================================= */
782/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ 1109/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
783/* ========================================================================= */ 1110/* ========================================================================= */
784#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL 1111#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
785#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 1112#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
786 1113
787#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 1114#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
788#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL 1115#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
789 1116
790union uvh_lb_bau_intd_payload_queue_tail_u { 1117union uvh_lb_bau_intd_payload_queue_tail_u {
791 unsigned long v; 1118 unsigned long v;
792 struct uvh_lb_bau_intd_payload_queue_tail_s { 1119 struct uvh_lb_bau_intd_payload_queue_tail_s {
793 unsigned long rsvd_0_3: 4; /* */ 1120 unsigned long rsvd_0_3:4;
794 unsigned long address : 39; /* RW */ 1121 unsigned long address:39; /* RW */
795 unsigned long rsvd_43_63: 21; /* */ 1122 unsigned long rsvd_43_63:21;
796 } s; 1123 } s;
797}; 1124};
798 1125
799/* ========================================================================= */ 1126/* ========================================================================= */
800/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ 1127/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
801/* ========================================================================= */ 1128/* ========================================================================= */
802#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL 1129#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
803#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 1130#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
804 1131
805#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 1132#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
806#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
807#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 1133#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
808#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
809#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 1134#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
810#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
811#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 1135#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
812#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
813#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 1136#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
814#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
815#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 1137#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
816#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
817#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 1138#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
818#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
819#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 1139#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
820#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
821#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 1140#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
822#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
823#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 1141#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
824#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
825#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 1142#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
826#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
827#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 1143#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
828#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
829#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 1144#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
830#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
831#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 1145#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
832#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
833#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 1146#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
834#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
835#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 1147#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
1148#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
1149#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
1150#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
1151#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
1152#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
1153#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
1154#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
1155#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
1156#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
1157#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
1158#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
1159#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
1160#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
1161#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
1162#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
836#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL 1163#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
837 1164
838union uvh_lb_bau_intd_software_acknowledge_u { 1165union uvh_lb_bau_intd_software_acknowledge_u {
839 unsigned long v; 1166 unsigned long v;
840 struct uvh_lb_bau_intd_software_acknowledge_s { 1167 struct uvh_lb_bau_intd_software_acknowledge_s {
841 unsigned long pending_0 : 1; /* RW, W1C */ 1168 unsigned long pending_0:1; /* RW, W1C */
842 unsigned long pending_1 : 1; /* RW, W1C */ 1169 unsigned long pending_1:1; /* RW, W1C */
843 unsigned long pending_2 : 1; /* RW, W1C */ 1170 unsigned long pending_2:1; /* RW, W1C */
844 unsigned long pending_3 : 1; /* RW, W1C */ 1171 unsigned long pending_3:1; /* RW, W1C */
845 unsigned long pending_4 : 1; /* RW, W1C */ 1172 unsigned long pending_4:1; /* RW, W1C */
846 unsigned long pending_5 : 1; /* RW, W1C */ 1173 unsigned long pending_5:1; /* RW, W1C */
847 unsigned long pending_6 : 1; /* RW, W1C */ 1174 unsigned long pending_6:1; /* RW, W1C */
848 unsigned long pending_7 : 1; /* RW, W1C */ 1175 unsigned long pending_7:1; /* RW, W1C */
849 unsigned long timeout_0 : 1; /* RW, W1C */ 1176 unsigned long timeout_0:1; /* RW, W1C */
850 unsigned long timeout_1 : 1; /* RW, W1C */ 1177 unsigned long timeout_1:1; /* RW, W1C */
851 unsigned long timeout_2 : 1; /* RW, W1C */ 1178 unsigned long timeout_2:1; /* RW, W1C */
852 unsigned long timeout_3 : 1; /* RW, W1C */ 1179 unsigned long timeout_3:1; /* RW, W1C */
853 unsigned long timeout_4 : 1; /* RW, W1C */ 1180 unsigned long timeout_4:1; /* RW, W1C */
854 unsigned long timeout_5 : 1; /* RW, W1C */ 1181 unsigned long timeout_5:1; /* RW, W1C */
855 unsigned long timeout_6 : 1; /* RW, W1C */ 1182 unsigned long timeout_6:1; /* RW, W1C */
856 unsigned long timeout_7 : 1; /* RW, W1C */ 1183 unsigned long timeout_7:1; /* RW, W1C */
857 unsigned long rsvd_16_63: 48; /* */ 1184 unsigned long rsvd_16_63:48;
858 } s; 1185 } s;
859}; 1186};
860 1187
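The pending_*/timeout_* bits above are marked RW, W1C: writing a 1 back to a set bit clears it, while written zeros leave the other slots alone. A tiny stand-alone sketch of that arithmetic (the sample status is made up, and where the clearing write is actually issued is driver policy, not shown here):

#include <assert.h>

#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK	0x0000000000000008UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK	0x0000000000000800UL

int main(void)
{
	/* Sample read-back: message slot 3 pending and timed out. */
	unsigned long status = UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK |
			       UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK;

	/* W1C: writing exactly these bits back retires both conditions
	 * and leaves every other slot untouched. */
	unsigned long clear = status;

	assert(clear & UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK);
	return 0;
}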
861/* ========================================================================= */ 1188/* ========================================================================= */
862/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ 1189/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
863/* ========================================================================= */ 1190/* ========================================================================= */
864#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL 1191#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
865#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 1192#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
866 1193
867/* ========================================================================= */ 1194/* ========================================================================= */
868/* UVH_LB_BAU_MISC_CONTROL */ 1195/* UVH_LB_BAU_MISC_CONTROL */
869/* ========================================================================= */ 1196/* ========================================================================= */
870#define UVH_LB_BAU_MISC_CONTROL 0x320170UL 1197#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
871#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 1198#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
872 1199
873#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 1200#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
874#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL 1201#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
875#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 1202#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
876#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL 1203#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
877#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
878#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
879#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
880#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
881#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 1204#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
882#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
883#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 1205#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
884#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
885#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 1206#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
886#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
887#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 1207#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
888#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
889#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 1208#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
890#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
891#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 1209#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
892#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
893#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 1210#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
894#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
895#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 1211#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
896#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
897#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 1212#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
898#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
899#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 1213#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
900#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
901#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 1214#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
1215#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
1216#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
1217#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
1218#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
1219#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
1220#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
1221#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
1222#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
1223#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
1224#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
1225#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
1226#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
1227#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
1228#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
902#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL 1229#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
903 1230
904#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 1231#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
905#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL 1232#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
906#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 1233#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
907#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL 1234#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
908#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
909#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
910#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
911#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
912#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 1235#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
913#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
914#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 1236#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
915#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
916#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 1237#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
917#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
918#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 1238#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
919#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
920#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 1239#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
921#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
922#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 1240#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
923#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
924#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 1241#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
925#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
926#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 1242#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
927#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
928#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 1243#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
929#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
930#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 1244#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
931#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
932#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 1245#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
1246#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
1247#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
1248#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
1249#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
1250#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
1251#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
1252#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
1253#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
1254#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
1255#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
1256#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
1257#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
1258#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
1259#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
1260#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
933#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL 1261#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
934#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 1262#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
935#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL 1263
936 1264#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
937#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 1265#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
938#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL 1266#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
939#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 1267#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
940#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
941#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
942#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
943#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
944#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
945#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 1268#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
946#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
947#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 1269#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
948#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
949#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 1270#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
950#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
951#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 1271#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
952#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
953#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 1272#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
954#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
955#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 1273#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
956#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
957#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 1274#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
958#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
959#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 1275#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
960#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
961#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 1276#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
962#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
963#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 1277#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
964#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
965#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 1278#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
966#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
967#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 1279#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
968#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL 1280#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
969#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
970#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
971#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 1281#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
972#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
973#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 1282#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
974#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
975#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 1283#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
976#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
977#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 1284#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
978#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
979#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 1285#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
1286#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
1287#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
1288#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
1289#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
1290#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
1291#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
1292#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
1293#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
1294#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
1295#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
1296#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
1297#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
1298#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
1299#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
1300#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
1301#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
1302#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
1303#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
1304#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
1305#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
1306#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
1307#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
980#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL 1308#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
981#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 1309#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
982#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
983 1310
984union uvh_lb_bau_misc_control_u { 1311union uvh_lb_bau_misc_control_u {
985 unsigned long v; 1312 unsigned long v;
986 struct uvh_lb_bau_misc_control_s { 1313 struct uvh_lb_bau_misc_control_s {
987 unsigned long rejection_delay : 8; /* RW */ 1314 unsigned long rejection_delay:8; /* RW */
988 unsigned long apic_mode : 1; /* RW */ 1315 unsigned long apic_mode:1; /* RW */
989 unsigned long force_broadcast : 1; /* RW */ 1316 unsigned long force_broadcast:1; /* RW */
990 unsigned long force_lock_nop : 1; /* RW */ 1317 unsigned long force_lock_nop:1; /* RW */
991 unsigned long qpi_agent_presence_vector : 3; /* RW */ 1318 unsigned long qpi_agent_presence_vector:3; /* RW */
992 unsigned long descriptor_fetch_mode : 1; /* RW */ 1319 unsigned long descriptor_fetch_mode:1; /* RW */
993 unsigned long enable_intd_soft_ack_mode : 1; /* RW */ 1320 unsigned long enable_intd_soft_ack_mode:1; /* RW */
994 unsigned long intd_soft_ack_timeout_period : 4; /* RW */ 1321 unsigned long intd_soft_ack_timeout_period:4; /* RW */
995 unsigned long enable_dual_mapping_mode : 1; /* RW */ 1322 unsigned long enable_dual_mapping_mode:1; /* RW */
996 unsigned long vga_io_port_decode_enable : 1; /* RW */ 1323 unsigned long vga_io_port_decode_enable:1; /* RW */
997 unsigned long vga_io_port_16_bit_decode : 1; /* RW */ 1324 unsigned long vga_io_port_16_bit_decode:1; /* RW */
998 unsigned long suppress_dest_registration : 1; /* RW */ 1325 unsigned long suppress_dest_registration:1; /* RW */
999 unsigned long programmed_initial_priority : 3; /* RW */ 1326 unsigned long programmed_initial_priority:3; /* RW */
1000 unsigned long use_incoming_priority : 1; /* RW */ 1327 unsigned long use_incoming_priority:1; /* RW */
1001 unsigned long enable_programmed_initial_priority : 1; /* RW */ 1328 unsigned long enable_programmed_initial_priority:1;/* RW */
1002 unsigned long rsvd_29_63 : 35; 1329 unsigned long rsvd_29_63:35;
1003 } s; 1330 } s;
1004 struct uv1h_lb_bau_misc_control_s { 1331 struct uv1h_lb_bau_misc_control_s {
1005 unsigned long rejection_delay : 8; /* RW */ 1332 unsigned long rejection_delay:8; /* RW */
1006 unsigned long apic_mode : 1; /* RW */ 1333 unsigned long apic_mode:1; /* RW */
1007 unsigned long force_broadcast : 1; /* RW */ 1334 unsigned long force_broadcast:1; /* RW */
1008 unsigned long force_lock_nop : 1; /* RW */ 1335 unsigned long force_lock_nop:1; /* RW */
1009 unsigned long qpi_agent_presence_vector : 3; /* RW */ 1336 unsigned long qpi_agent_presence_vector:3; /* RW */
1010 unsigned long descriptor_fetch_mode : 1; /* RW */ 1337 unsigned long descriptor_fetch_mode:1; /* RW */
1011 unsigned long enable_intd_soft_ack_mode : 1; /* RW */ 1338 unsigned long enable_intd_soft_ack_mode:1; /* RW */
1012 unsigned long intd_soft_ack_timeout_period : 4; /* RW */ 1339 unsigned long intd_soft_ack_timeout_period:4; /* RW */
1013 unsigned long enable_dual_mapping_mode : 1; /* RW */ 1340 unsigned long enable_dual_mapping_mode:1; /* RW */
1014 unsigned long vga_io_port_decode_enable : 1; /* RW */ 1341 unsigned long vga_io_port_decode_enable:1; /* RW */
1015 unsigned long vga_io_port_16_bit_decode : 1; /* RW */ 1342 unsigned long vga_io_port_16_bit_decode:1; /* RW */
1016 unsigned long suppress_dest_registration : 1; /* RW */ 1343 unsigned long suppress_dest_registration:1; /* RW */
1017 unsigned long programmed_initial_priority : 3; /* RW */ 1344 unsigned long programmed_initial_priority:3; /* RW */
1018 unsigned long use_incoming_priority : 1; /* RW */ 1345 unsigned long use_incoming_priority:1; /* RW */
1019 unsigned long enable_programmed_initial_priority : 1; /* RW */ 1346 unsigned long enable_programmed_initial_priority:1;/* RW */
1020 unsigned long rsvd_29_47 : 19; /* */ 1347 unsigned long rsvd_29_47:19;
1021 unsigned long fun : 16; /* RW */ 1348 unsigned long fun:16; /* RW */
1022 } s1; 1349 } s1;
1023 struct uv2h_lb_bau_misc_control_s { 1350 struct uv2h_lb_bau_misc_control_s {
1024 unsigned long rejection_delay : 8; /* RW */ 1351 unsigned long rejection_delay:8; /* RW */
1025 unsigned long apic_mode : 1; /* RW */ 1352 unsigned long apic_mode:1; /* RW */
1026 unsigned long force_broadcast : 1; /* RW */ 1353 unsigned long force_broadcast:1; /* RW */
1027 unsigned long force_lock_nop : 1; /* RW */ 1354 unsigned long force_lock_nop:1; /* RW */
1028 unsigned long qpi_agent_presence_vector : 3; /* RW */ 1355 unsigned long qpi_agent_presence_vector:3; /* RW */
1029 unsigned long descriptor_fetch_mode : 1; /* RW */ 1356 unsigned long descriptor_fetch_mode:1; /* RW */
1030 unsigned long enable_intd_soft_ack_mode : 1; /* RW */ 1357 unsigned long enable_intd_soft_ack_mode:1; /* RW */
1031 unsigned long intd_soft_ack_timeout_period : 4; /* RW */ 1358 unsigned long intd_soft_ack_timeout_period:4; /* RW */
1032 unsigned long enable_dual_mapping_mode : 1; /* RW */ 1359 unsigned long enable_dual_mapping_mode:1; /* RW */
1033 unsigned long vga_io_port_decode_enable : 1; /* RW */ 1360 unsigned long vga_io_port_decode_enable:1; /* RW */
1034 unsigned long vga_io_port_16_bit_decode : 1; /* RW */ 1361 unsigned long vga_io_port_16_bit_decode:1; /* RW */
1035 unsigned long suppress_dest_registration : 1; /* RW */ 1362 unsigned long suppress_dest_registration:1; /* RW */
1036 unsigned long programmed_initial_priority : 3; /* RW */ 1363 unsigned long programmed_initial_priority:3; /* RW */
1037 unsigned long use_incoming_priority : 1; /* RW */ 1364 unsigned long use_incoming_priority:1; /* RW */
1038 unsigned long enable_programmed_initial_priority : 1; /* RW */ 1365 unsigned long enable_programmed_initial_priority:1;/* RW */
1039 unsigned long enable_automatic_apic_mode_selection : 1; /* RW */ 1366 unsigned long enable_automatic_apic_mode_selection:1;/* RW */
1040 unsigned long apic_mode_status : 1; /* RO */ 1367 unsigned long apic_mode_status:1; /* RO */
1041 unsigned long suppress_interrupts_to_self : 1; /* RW */ 1368 unsigned long suppress_interrupts_to_self:1; /* RW */
1042 unsigned long enable_lock_based_system_flush : 1; /* RW */ 1369 unsigned long enable_lock_based_system_flush:1;/* RW */
1043 unsigned long enable_extended_sb_status : 1; /* RW */ 1370 unsigned long enable_extended_sb_status:1; /* RW */
1044 unsigned long suppress_int_prio_udt_to_self : 1; /* RW */ 1371 unsigned long suppress_int_prio_udt_to_self:1;/* RW */
1045 unsigned long use_legacy_descriptor_formats : 1; /* RW */ 1372 unsigned long use_legacy_descriptor_formats:1;/* RW */
1046 unsigned long rsvd_36_47 : 12; /* */ 1373 unsigned long rsvd_36_47:12;
1047 unsigned long fun : 16; /* RW */ 1374 unsigned long fun:16; /* RW */
1048 } s2; 1375 } s2;
1049}; 1376};
1050 1377
1051/* ========================================================================= */ 1378/* ========================================================================= */
1052/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ 1379/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
1053/* ========================================================================= */ 1380/* ========================================================================= */
1054#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL 1381#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
1055#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 1382#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
1056 1383
1057#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 1384#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
1058#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL 1385#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
1059#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 1386#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
1060#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL 1387#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
1061#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63 1388#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
1062#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL 1389#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
1063 1390
1064union uvh_lb_bau_sb_activation_control_u { 1391union uvh_lb_bau_sb_activation_control_u {
1065 unsigned long v; 1392 unsigned long v;
1066 struct uvh_lb_bau_sb_activation_control_s { 1393 struct uvh_lb_bau_sb_activation_control_s {
1067 unsigned long index : 6; /* RW */ 1394 unsigned long index:6; /* RW */
1068 unsigned long rsvd_6_61: 56; /* */ 1395 unsigned long rsvd_6_61:56;
1069 unsigned long push : 1; /* WP */ 1396 unsigned long push:1; /* WP */
1070 unsigned long init : 1; /* WP */ 1397 unsigned long init:1; /* WP */
1071 } s; 1398 } s;
1072}; 1399};
1073 1400
1074/* ========================================================================= */ 1401/* ========================================================================= */
1075/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ 1402/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
1076/* ========================================================================= */ 1403/* ========================================================================= */
1077#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL 1404#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
1078#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 1405#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
1079 1406
1080#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 1407#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
1081#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL 1408#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
1082 1409
1083union uvh_lb_bau_sb_activation_status_0_u { 1410union uvh_lb_bau_sb_activation_status_0_u {
1084 unsigned long v; 1411 unsigned long v;
1085 struct uvh_lb_bau_sb_activation_status_0_s { 1412 struct uvh_lb_bau_sb_activation_status_0_s {
1086 unsigned long status : 64; /* RW */ 1413 unsigned long status:64; /* RW */
1087 } s; 1414 } s;
1088}; 1415};
1089 1416
1090/* ========================================================================= */ 1417/* ========================================================================= */
1091/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ 1418/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
1092/* ========================================================================= */ 1419/* ========================================================================= */
1093#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL 1420#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
1094#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 1421#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
1095 1422
1096#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 1423#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
1097#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL 1424#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
1098 1425
1099union uvh_lb_bau_sb_activation_status_1_u { 1426union uvh_lb_bau_sb_activation_status_1_u {
1100 unsigned long v; 1427 unsigned long v;
1101 struct uvh_lb_bau_sb_activation_status_1_s { 1428 struct uvh_lb_bau_sb_activation_status_1_s {
1102 unsigned long status : 64; /* RW */ 1429 unsigned long status:64; /* RW */
1103 } s; 1430 } s;
1104}; 1431};
1105 1432
1106/* ========================================================================= */ 1433/* ========================================================================= */
1107/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ 1434/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
1108/* ========================================================================= */ 1435/* ========================================================================= */
1109#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL 1436#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
1110#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 1437#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
1111 1438
1112#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 1439#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
1113#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL 1440#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
1114#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 1441#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
1115#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL 1442#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
1116 1443
1117union uvh_lb_bau_sb_descriptor_base_u { 1444union uvh_lb_bau_sb_descriptor_base_u {
1118 unsigned long v; 1445 unsigned long v;
1119 struct uvh_lb_bau_sb_descriptor_base_s { 1446 struct uvh_lb_bau_sb_descriptor_base_s {
1120 unsigned long rsvd_0_11 : 12; /* */ 1447 unsigned long rsvd_0_11:12;
1121 unsigned long page_address : 31; /* RW */ 1448 unsigned long page_address:31; /* RW */
1122 unsigned long rsvd_43_48 : 6; /* */ 1449 unsigned long rsvd_43_48:6;
1123 unsigned long node_id : 14; /* RW */ 1450 unsigned long node_id:14; /* RW */
1124 unsigned long rsvd_63 : 1; /* */ 1451 unsigned long rsvd_63:1;
1125 } s; 1452 } s;
1126}; 1453};
1127 1454
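A stand-alone sketch of the programming arithmetic for UVH_LB_BAU_SB_DESCRIPTOR_BASE: because the page_address field occupies bits 12..42, masking a 4K-aligned physical address with PAGE_ADDRESS_MASK places it in the register without any further shifting. The address and node id below are made up.

#include <assert.h>

#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK	0x000007fffffff000UL
#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT	49

int main(void)
{
	unsigned long phys    = 0x0000000123456000UL;	/* 4K-aligned sample address */
	unsigned long node_id = 5;			/* sample node id            */

	unsigned long v = (phys & UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK) |
			  (node_id << UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT);

	/* The low 12 bits stay zero: the descriptor base is page aligned. */
	assert((v & 0xfffUL) == 0);
	return 0;
}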
1128/* ========================================================================= */ 1455/* ========================================================================= */
1129/* UVH_NODE_ID */ 1456/* UVH_NODE_ID */
1130/* ========================================================================= */ 1457/* ========================================================================= */
1131#define UVH_NODE_ID 0x0UL 1458#define UVH_NODE_ID 0x0UL
1132 1459
1133#define UVH_NODE_ID_FORCE1_SHFT 0 1460#define UVH_NODE_ID_FORCE1_SHFT 0
1134#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL 1461#define UVH_NODE_ID_MANUFACTURER_SHFT 1
1135#define UVH_NODE_ID_MANUFACTURER_SHFT 1 1462#define UVH_NODE_ID_PART_NUMBER_SHFT 12
1136#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL 1463#define UVH_NODE_ID_REVISION_SHFT 28
1137#define UVH_NODE_ID_PART_NUMBER_SHFT 12 1464#define UVH_NODE_ID_NODE_ID_SHFT 32
1138#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL 1465#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
1139#define UVH_NODE_ID_REVISION_SHFT 28 1466#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
1140#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL 1467#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
1141#define UVH_NODE_ID_NODE_ID_SHFT 32 1468#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
1142#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL 1469#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
1143 1470
1144#define UV1H_NODE_ID_FORCE1_SHFT 0 1471#define UV1H_NODE_ID_FORCE1_SHFT 0
1145#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL 1472#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
1146#define UV1H_NODE_ID_MANUFACTURER_SHFT 1 1473#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
1147#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL 1474#define UV1H_NODE_ID_REVISION_SHFT 28
1148#define UV1H_NODE_ID_PART_NUMBER_SHFT 12 1475#define UV1H_NODE_ID_NODE_ID_SHFT 32
1149#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL 1476#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
1150#define UV1H_NODE_ID_REVISION_SHFT 28 1477#define UV1H_NODE_ID_NI_PORT_SHFT 56
1151#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL 1478#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
1152#define UV1H_NODE_ID_NODE_ID_SHFT 32 1479#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
1153#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL 1480#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
1154#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48 1481#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
1155#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL 1482#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
1156#define UV1H_NODE_ID_NI_PORT_SHFT 56 1483#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
1157#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL 1484#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
1158 1485
1159#define UV2H_NODE_ID_FORCE1_SHFT 0 1486#define UV2H_NODE_ID_FORCE1_SHFT 0
1160#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL 1487#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
1161#define UV2H_NODE_ID_MANUFACTURER_SHFT 1 1488#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
1162#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL 1489#define UV2H_NODE_ID_REVISION_SHFT 28
1163#define UV2H_NODE_ID_PART_NUMBER_SHFT 12 1490#define UV2H_NODE_ID_NODE_ID_SHFT 32
1164#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL 1491#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
1165#define UV2H_NODE_ID_REVISION_SHFT 28 1492#define UV2H_NODE_ID_NI_PORT_SHFT 57
1166#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL 1493#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
1167#define UV2H_NODE_ID_NODE_ID_SHFT 32 1494#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
1168#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL 1495#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
1169#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50 1496#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
1170#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL 1497#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
1171#define UV2H_NODE_ID_NI_PORT_SHFT 57 1498#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
1172#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL 1499#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
1173 1500
1174union uvh_node_id_u { 1501union uvh_node_id_u {
1175 unsigned long v; 1502 unsigned long v;
1176 struct uvh_node_id_s { 1503 struct uvh_node_id_s {
1177 unsigned long force1 : 1; /* RO */ 1504 unsigned long force1:1; /* RO */
1178 unsigned long manufacturer : 11; /* RO */ 1505 unsigned long manufacturer:11; /* RO */
1179 unsigned long part_number : 16; /* RO */ 1506 unsigned long part_number:16; /* RO */
1180 unsigned long revision : 4; /* RO */ 1507 unsigned long revision:4; /* RO */
1181 unsigned long node_id : 15; /* RW */ 1508 unsigned long node_id:15; /* RW */
1182 unsigned long rsvd_47_63 : 17; 1509 unsigned long rsvd_47_63:17;
1183 } s; 1510 } s;
1184 struct uv1h_node_id_s { 1511 struct uv1h_node_id_s {
1185 unsigned long force1 : 1; /* RO */ 1512 unsigned long force1:1; /* RO */
1186 unsigned long manufacturer : 11; /* RO */ 1513 unsigned long manufacturer:11; /* RO */
1187 unsigned long part_number : 16; /* RO */ 1514 unsigned long part_number:16; /* RO */
1188 unsigned long revision : 4; /* RO */ 1515 unsigned long revision:4; /* RO */
1189 unsigned long node_id : 15; /* RW */ 1516 unsigned long node_id:15; /* RW */
1190 unsigned long rsvd_47 : 1; /* */ 1517 unsigned long rsvd_47:1;
1191 unsigned long nodes_per_bit : 7; /* RW */ 1518 unsigned long nodes_per_bit:7; /* RW */
1192 unsigned long rsvd_55 : 1; /* */ 1519 unsigned long rsvd_55:1;
1193 unsigned long ni_port : 4; /* RO */ 1520 unsigned long ni_port:4; /* RO */
1194 unsigned long rsvd_60_63 : 4; /* */ 1521 unsigned long rsvd_60_63:4;
1195 } s1; 1522 } s1;
1196 struct uv2h_node_id_s { 1523 struct uv2h_node_id_s {
1197 unsigned long force1 : 1; /* RO */ 1524 unsigned long force1:1; /* RO */
1198 unsigned long manufacturer : 11; /* RO */ 1525 unsigned long manufacturer:11; /* RO */
1199 unsigned long part_number : 16; /* RO */ 1526 unsigned long part_number:16; /* RO */
1200 unsigned long revision : 4; /* RO */ 1527 unsigned long revision:4; /* RO */
1201 unsigned long node_id : 15; /* RW */ 1528 unsigned long node_id:15; /* RW */
1202 unsigned long rsvd_47_49 : 3; /* */ 1529 unsigned long rsvd_47_49:3;
1203 unsigned long nodes_per_bit : 7; /* RO */ 1530 unsigned long nodes_per_bit:7; /* RO */
1204 unsigned long ni_port : 5; /* RO */ 1531 unsigned long ni_port:5; /* RO */
1205 unsigned long rsvd_62_63 : 2; /* */ 1532 unsigned long rsvd_62_63:2;
1206 } s2; 1533 } s2;
1207}; 1534};
1208 1535
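These unions follow one idiom throughout the file: read the raw MMR into .v, then pick fields out of the bit-field view (.s for fields common to all hub revisions, .s1/.s2 for UV1/UV2-specific ones). An illustrative sketch, assuming uv_read_local_mmr() from <asm/uv/uv_hub.h>:

#include <linux/printk.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_mmrs.h>

static unsigned long example_read_node_id(void)
{
        union uvh_node_id_u node_id;

        node_id.v = uv_read_local_mmr(UVH_NODE_ID);

        /* fields shared by all hub revisions live in .s */
        pr_debug("hub part %#x rev %u\n",
                 (unsigned int)node_id.s.part_number,
                 (unsigned int)node_id.s.revision);
        return (unsigned long)node_id.s.node_id;
}
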
1209/* ========================================================================= */ 1536/* ========================================================================= */
1210/* UVH_NODE_PRESENT_TABLE */ 1537/* UVH_NODE_PRESENT_TABLE */
1211/* ========================================================================= */ 1538/* ========================================================================= */
1212#define UVH_NODE_PRESENT_TABLE 0x1400UL 1539#define UVH_NODE_PRESENT_TABLE 0x1400UL
1213#define UVH_NODE_PRESENT_TABLE_DEPTH 16 1540#define UVH_NODE_PRESENT_TABLE_DEPTH 16
1214 1541
1215#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 1542#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
1216#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL 1543#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
1217 1544
1218union uvh_node_present_table_u { 1545union uvh_node_present_table_u {
1219 unsigned long v; 1546 unsigned long v;
1220 struct uvh_node_present_table_s { 1547 struct uvh_node_present_table_s {
1221 unsigned long nodes : 64; /* RW */ 1548 unsigned long nodes:64; /* RW */
1222 } s; 1549 } s;
1223}; 1550};
1224 1551
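UVH_NODE_PRESENT_TABLE is a small array of 64-bit words, one bit per node, UVH_NODE_PRESENT_TABLE_DEPTH entries deep. A sketch of scanning it, assuming the read helper above and an 8-byte stride between entries:

#include <linux/bitops.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_mmrs.h>

static int example_count_present_nodes(void)
{
        int i, count = 0;

        for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
                /* each successive 64-bit word covers the next 64 node ids */
                unsigned long nodes =
                        uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);

                count += hweight64(nodes);
        }
        return count;
}
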
1225/* ========================================================================= */ 1552/* ========================================================================= */
1226/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ 1553/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
1227/* ========================================================================= */ 1554/* ========================================================================= */
1228#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL 1555#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
1229 1556
1230#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 1557#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
1231#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
1232#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 1558#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
1233#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
1234#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 1559#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
1560#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
1561#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
1235#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL 1562#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
1236 1563
1237union uvh_rh_gam_alias210_overlay_config_0_mmr_u { 1564union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
1238 unsigned long v; 1565 unsigned long v;
1239 struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { 1566 struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
1240 unsigned long rsvd_0_23: 24; /* */ 1567 unsigned long rsvd_0_23:24;
1241 unsigned long base : 8; /* RW */ 1568 unsigned long base:8; /* RW */
1242 unsigned long rsvd_32_47: 16; /* */ 1569 unsigned long rsvd_32_47:16;
1243 unsigned long m_alias : 5; /* RW */ 1570 unsigned long m_alias:5; /* RW */
1244 unsigned long rsvd_53_62: 10; /* */ 1571 unsigned long rsvd_53_62:10;
1245 unsigned long enable : 1; /* RW */ 1572 unsigned long enable:1; /* RW */
1246 } s; 1573 } s;
1247}; 1574};
1248 1575
1249/* ========================================================================= */ 1576/* ========================================================================= */
1250/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ 1577/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
1251/* ========================================================================= */ 1578/* ========================================================================= */
1252#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL 1579#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
1253 1580
1254#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 1581#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
1255#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
1256#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 1582#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
1257#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
1258#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 1583#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
1584#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
1585#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
1259#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL 1586#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
1260 1587
1261union uvh_rh_gam_alias210_overlay_config_1_mmr_u { 1588union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
1262 unsigned long v; 1589 unsigned long v;
1263 struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { 1590 struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
1264 unsigned long rsvd_0_23: 24; /* */ 1591 unsigned long rsvd_0_23:24;
1265 unsigned long base : 8; /* RW */ 1592 unsigned long base:8; /* RW */
1266 unsigned long rsvd_32_47: 16; /* */ 1593 unsigned long rsvd_32_47:16;
1267 unsigned long m_alias : 5; /* RW */ 1594 unsigned long m_alias:5; /* RW */
1268 unsigned long rsvd_53_62: 10; /* */ 1595 unsigned long rsvd_53_62:10;
1269 unsigned long enable : 1; /* RW */ 1596 unsigned long enable:1; /* RW */
1270 } s; 1597 } s;
1271}; 1598};
1272 1599
1273/* ========================================================================= */ 1600/* ========================================================================= */
1274/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ 1601/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
1275/* ========================================================================= */ 1602/* ========================================================================= */
1276#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL 1603#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
1277 1604
1278#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 1605#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
1279#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
1280#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 1606#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
1281#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
1282#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 1607#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
1608#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
1609#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
1283#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL 1610#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
1284 1611
1285union uvh_rh_gam_alias210_overlay_config_2_mmr_u { 1612union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
1286 unsigned long v; 1613 unsigned long v;
1287 struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { 1614 struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
1288 unsigned long rsvd_0_23: 24; /* */ 1615 unsigned long rsvd_0_23:24;
1289 unsigned long base : 8; /* RW */ 1616 unsigned long base:8; /* RW */
1290 unsigned long rsvd_32_47: 16; /* */ 1617 unsigned long rsvd_32_47:16;
1291 unsigned long m_alias : 5; /* RW */ 1618 unsigned long m_alias:5; /* RW */
1292 unsigned long rsvd_53_62: 10; /* */ 1619 unsigned long rsvd_53_62:10;
1293 unsigned long enable : 1; /* RW */ 1620 unsigned long enable:1; /* RW */
1294 } s; 1621 } s;
1295}; 1622};
1296 1623
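The three ALIAS210 overlay registers share this layout, so one helper can serve all of them. A hedged sketch of enabling window 0 through the union view; the meaning of base and m_alias beyond their bit positions is not taken from this patch:

#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_mmrs.h>

static void example_enable_alias210_window0(unsigned long base, unsigned long m_alias)
{
        union uvh_rh_gam_alias210_overlay_config_0_mmr_u ov;

        ov.v = 0;
        ov.s.base    = base;    /* 8-bit field at bits 31:24 */
        ov.s.m_alias = m_alias; /* 5-bit field at bits 52:48 */
        ov.s.enable  = 1;
        uv_write_local_mmr(UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR, ov.v);
}
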
1297/* ========================================================================= */ 1624/* ========================================================================= */
1298/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ 1625/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
1299/* ========================================================================= */ 1626/* ========================================================================= */
1300#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL 1627#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
1301 1628
1302#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 1629#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
1303#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL 1630#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
1304 1631
1305union uvh_rh_gam_alias210_redirect_config_0_mmr_u { 1632union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
1306 unsigned long v; 1633 unsigned long v;
1307 struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { 1634 struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
1308 unsigned long rsvd_0_23 : 24; /* */ 1635 unsigned long rsvd_0_23:24;
1309 unsigned long dest_base : 22; /* RW */ 1636 unsigned long dest_base:22; /* RW */
1310 unsigned long rsvd_46_63: 18; /* */ 1637 unsigned long rsvd_46_63:18;
1311 } s; 1638 } s;
1312}; 1639};
1313 1640
1314/* ========================================================================= */ 1641/* ========================================================================= */
1315/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ 1642/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
1316/* ========================================================================= */ 1643/* ========================================================================= */
1317#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL 1644#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
1318 1645
1319#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 1646#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
1320#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL 1647#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
1321 1648
1322union uvh_rh_gam_alias210_redirect_config_1_mmr_u { 1649union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
1323 unsigned long v; 1650 unsigned long v;
1324 struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { 1651 struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
1325 unsigned long rsvd_0_23 : 24; /* */ 1652 unsigned long rsvd_0_23:24;
1326 unsigned long dest_base : 22; /* RW */ 1653 unsigned long dest_base:22; /* RW */
1327 unsigned long rsvd_46_63: 18; /* */ 1654 unsigned long rsvd_46_63:18;
1328 } s; 1655 } s;
1329}; 1656};
1330 1657
1331/* ========================================================================= */ 1658/* ========================================================================= */
1332/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ 1659/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
1333/* ========================================================================= */ 1660/* ========================================================================= */
1334#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL 1661#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
1335 1662
1336#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 1663#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
1337#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL 1664#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
1338 1665
1339union uvh_rh_gam_alias210_redirect_config_2_mmr_u { 1666union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
1340 unsigned long v; 1667 unsigned long v;
1341 struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { 1668 struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
1342 unsigned long rsvd_0_23 : 24; /* */ 1669 unsigned long rsvd_0_23:24;
1343 unsigned long dest_base : 22; /* RW */ 1670 unsigned long dest_base:22; /* RW */
1344 unsigned long rsvd_46_63: 18; /* */ 1671 unsigned long rsvd_46_63:18;
1345 } s; 1672 } s;
1346}; 1673};
1347 1674
1348/* ========================================================================= */ 1675/* ========================================================================= */
1349/* UVH_RH_GAM_CONFIG_MMR */ 1676/* UVH_RH_GAM_CONFIG_MMR */
1350/* ========================================================================= */ 1677/* ========================================================================= */
1351#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL 1678#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
1352 1679
1353#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 1680#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1354#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL 1681#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1355#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 1682#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1356#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL 1683#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
1357 1684
1358#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 1685#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1359#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL 1686#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1360#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 1687#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
1361#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL 1688#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1362#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 1689#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
1363#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL 1690#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
1364 1691
1365#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 1692#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1366#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL 1693#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1367#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 1694#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1368#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL 1695#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
1369 1696
1370union uvh_rh_gam_config_mmr_u { 1697union uvh_rh_gam_config_mmr_u {
1371 unsigned long v; 1698 unsigned long v;
1372 struct uvh_rh_gam_config_mmr_s { 1699 struct uvh_rh_gam_config_mmr_s {
1373 unsigned long m_skt : 6; /* RW */ 1700 unsigned long m_skt:6; /* RW */
1374 unsigned long n_skt : 4; /* RW */ 1701 unsigned long n_skt:4; /* RW */
1375 unsigned long rsvd_10_63 : 54; 1702 unsigned long rsvd_10_63:54;
1376 } s; 1703 } s;
1377 struct uv1h_rh_gam_config_mmr_s { 1704 struct uv1h_rh_gam_config_mmr_s {
1378 unsigned long m_skt : 6; /* RW */ 1705 unsigned long m_skt:6; /* RW */
1379 unsigned long n_skt : 4; /* RW */ 1706 unsigned long n_skt:4; /* RW */
1380 unsigned long rsvd_10_11: 2; /* */ 1707 unsigned long rsvd_10_11:2;
1381 unsigned long mmiol_cfg : 1; /* RW */ 1708 unsigned long mmiol_cfg:1; /* RW */
1382 unsigned long rsvd_13_63: 51; /* */ 1709 unsigned long rsvd_13_63:51;
1383 } s1; 1710 } s1;
1384 struct uv2h_rh_gam_config_mmr_s { 1711 struct uv2h_rh_gam_config_mmr_s {
1385 unsigned long m_skt : 6; /* RW */ 1712 unsigned long m_skt:6; /* RW */
1386 unsigned long n_skt : 4; /* RW */ 1713 unsigned long n_skt:4; /* RW */
1387 unsigned long rsvd_10_63: 54; /* */ 1714 unsigned long rsvd_10_63:54;
1388 } s2; 1715 } s2;
1389}; 1716};
1390 1717
1391/* ========================================================================= */ 1718/* ========================================================================= */
1392/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ 1719/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
1393/* ========================================================================= */ 1720/* ========================================================================= */
1394#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL 1721#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
1395 1722
1396#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 1723#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
1397#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL 1724#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
1398 1725
1399#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 1726#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
1400#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL 1727#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
1401#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 1728#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
1402#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL 1729#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1403#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 1730#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
1404#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL 1731#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
1405#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1732#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
1406#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1733#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1407 1734
1408#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 1735#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
1409#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL 1736#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
1410#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 1737#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1411#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL 1738#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
1412#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1739#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
1413#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1740#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1414 1741
1415union uvh_rh_gam_gru_overlay_config_mmr_u { 1742union uvh_rh_gam_gru_overlay_config_mmr_u {
1416 unsigned long v; 1743 unsigned long v;
1417 struct uvh_rh_gam_gru_overlay_config_mmr_s { 1744 struct uvh_rh_gam_gru_overlay_config_mmr_s {
1418 unsigned long rsvd_0_27: 28; /* */ 1745 unsigned long rsvd_0_27:28;
1419 unsigned long base : 18; /* RW */ 1746 unsigned long base:18; /* RW */
1420 unsigned long rsvd_46_62 : 17; 1747 unsigned long rsvd_46_62:17;
1421 unsigned long enable : 1; /* RW */ 1748 unsigned long enable:1; /* RW */
1422 } s; 1749 } s;
1423 struct uv1h_rh_gam_gru_overlay_config_mmr_s { 1750 struct uv1h_rh_gam_gru_overlay_config_mmr_s {
1424 unsigned long rsvd_0_27: 28; /* */ 1751 unsigned long rsvd_0_27:28;
1425 unsigned long base : 18; /* RW */ 1752 unsigned long base:18; /* RW */
1426 unsigned long rsvd_46_47: 2; /* */ 1753 unsigned long rsvd_46_47:2;
1427 unsigned long gr4 : 1; /* RW */ 1754 unsigned long gr4:1; /* RW */
1428 unsigned long rsvd_49_51: 3; /* */ 1755 unsigned long rsvd_49_51:3;
1429 unsigned long n_gru : 4; /* RW */ 1756 unsigned long n_gru:4; /* RW */
1430 unsigned long rsvd_56_62: 7; /* */ 1757 unsigned long rsvd_56_62:7;
1431 unsigned long enable : 1; /* RW */ 1758 unsigned long enable:1; /* RW */
1432 } s1; 1759 } s1;
1433 struct uv2h_rh_gam_gru_overlay_config_mmr_s { 1760 struct uv2h_rh_gam_gru_overlay_config_mmr_s {
1434 unsigned long rsvd_0_27: 28; /* */ 1761 unsigned long rsvd_0_27:28;
1435 unsigned long base : 18; /* RW */ 1762 unsigned long base:18; /* RW */
1436 unsigned long rsvd_46_51: 6; /* */ 1763 unsigned long rsvd_46_51:6;
1437 unsigned long n_gru : 4; /* RW */ 1764 unsigned long n_gru:4; /* RW */
1438 unsigned long rsvd_56_62: 7; /* */ 1765 unsigned long rsvd_56_62:7;
1439 unsigned long enable : 1; /* RW */ 1766 unsigned long enable:1; /* RW */
1440 } s2; 1767 } s2;
1441}; 1768};
1442 1769
1443/* ========================================================================= */ 1770/* ========================================================================= */
1444/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ 1771/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
1445/* ========================================================================= */ 1772/* ========================================================================= */
1446#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL 1773#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
1447 1774
1448#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 1775#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
1449#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL 1776#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
1450#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 1777#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1451#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1452#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1453#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1454#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1778#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1779#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
1780#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1781#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1455#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1782#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1456 1783
1457#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 1784#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
1458#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL 1785#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
1459#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 1786#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1460#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1461#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1462#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1463#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1787#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1788#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
1789#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1790#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1464#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1791#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1465 1792
1466union uvh_rh_gam_mmioh_overlay_config_mmr_u { 1793union uvh_rh_gam_mmioh_overlay_config_mmr_u {
1467 unsigned long v; 1794 unsigned long v;
1468 struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { 1795 struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
1469 unsigned long rsvd_0_29: 30; /* */ 1796 unsigned long rsvd_0_29:30;
1470 unsigned long base : 16; /* RW */ 1797 unsigned long base:16; /* RW */
1471 unsigned long m_io : 6; /* RW */ 1798 unsigned long m_io:6; /* RW */
1472 unsigned long n_io : 4; /* RW */ 1799 unsigned long n_io:4; /* RW */
1473 unsigned long rsvd_56_62: 7; /* */ 1800 unsigned long rsvd_56_62:7;
1474 unsigned long enable : 1; /* RW */ 1801 unsigned long enable:1; /* RW */
1475 } s1; 1802 } s1;
1476 struct uv2h_rh_gam_mmioh_overlay_config_mmr_s { 1803 struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
1477 unsigned long rsvd_0_26: 27; /* */ 1804 unsigned long rsvd_0_26:27;
1478 unsigned long base : 19; /* RW */ 1805 unsigned long base:19; /* RW */
1479 unsigned long m_io : 6; /* RW */ 1806 unsigned long m_io:6; /* RW */
1480 unsigned long n_io : 4; /* RW */ 1807 unsigned long n_io:4; /* RW */
1481 unsigned long rsvd_56_62: 7; /* */ 1808 unsigned long rsvd_56_62:7;
1482 unsigned long enable : 1; /* RW */ 1809 unsigned long enable:1; /* RW */
1483 } s2; 1810 } s2;
1484}; 1811};
1485 1812
1486/* ========================================================================= */ 1813/* ========================================================================= */
1487/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ 1814/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
1488/* ========================================================================= */ 1815/* ========================================================================= */
1489#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL 1816#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
1490 1817
1491#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 1818#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1492#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL 1819#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1493 1820
1494#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 1821#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1495#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1496#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 1822#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
1823#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1824#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1497#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL 1825#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
1498#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1826#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1499#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1500 1827
1501#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 1828#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1502#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL 1829#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1503#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1830#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1504#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1831#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1505 1832
1506union uvh_rh_gam_mmr_overlay_config_mmr_u { 1833union uvh_rh_gam_mmr_overlay_config_mmr_u {
1507 unsigned long v; 1834 unsigned long v;
1508 struct uvh_rh_gam_mmr_overlay_config_mmr_s { 1835 struct uvh_rh_gam_mmr_overlay_config_mmr_s {
1509 unsigned long rsvd_0_25: 26; /* */ 1836 unsigned long rsvd_0_25:26;
1510 unsigned long base : 20; /* RW */ 1837 unsigned long base:20; /* RW */
1511 unsigned long rsvd_46_62 : 17; 1838 unsigned long rsvd_46_62:17;
1512 unsigned long enable : 1; /* RW */ 1839 unsigned long enable:1; /* RW */
1513 } s; 1840 } s;
1514 struct uv1h_rh_gam_mmr_overlay_config_mmr_s { 1841 struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
1515 unsigned long rsvd_0_25: 26; /* */ 1842 unsigned long rsvd_0_25:26;
1516 unsigned long base : 20; /* RW */ 1843 unsigned long base:20; /* RW */
1517 unsigned long dual_hub : 1; /* RW */ 1844 unsigned long dual_hub:1; /* RW */
1518 unsigned long rsvd_47_62: 16; /* */ 1845 unsigned long rsvd_47_62:16;
1519 unsigned long enable : 1; /* RW */ 1846 unsigned long enable:1; /* RW */
1520 } s1; 1847 } s1;
1521 struct uv2h_rh_gam_mmr_overlay_config_mmr_s { 1848 struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
1522 unsigned long rsvd_0_25: 26; /* */ 1849 unsigned long rsvd_0_25:26;
1523 unsigned long base : 20; /* RW */ 1850 unsigned long base:20; /* RW */
1524 unsigned long rsvd_46_62: 17; /* */ 1851 unsigned long rsvd_46_62:17;
1525 unsigned long enable : 1; /* RW */ 1852 unsigned long enable:1; /* RW */
1526 } s2; 1853 } s2;
1527}; 1854};
1528 1855
1529/* ========================================================================= */ 1856/* ========================================================================= */
1530/* UVH_RTC */ 1857/* UVH_RTC */
1531/* ========================================================================= */ 1858/* ========================================================================= */
1532#define UVH_RTC 0x340000UL 1859#define UVH_RTC 0x340000UL
1533 1860
1534#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 1861#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
1535#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL 1862#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
1536 1863
1537union uvh_rtc_u { 1864union uvh_rtc_u {
1538 unsigned long v; 1865 unsigned long v;
1539 struct uvh_rtc_s { 1866 struct uvh_rtc_s {
1540 unsigned long real_time_clock : 56; /* RW */ 1867 unsigned long real_time_clock:56; /* RW */
1541 unsigned long rsvd_56_63 : 8; /* */ 1868 unsigned long rsvd_56_63:8;
1542 } s; 1869 } s;
1543}; 1870};
1544 1871
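UVH_RTC is a 56-bit counter with the top byte reserved, so a plain masked read is enough. A minimal sketch, using the same assumed read helper as above:

#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_mmrs.h>

static unsigned long example_read_uv_rtc(void)
{
        /* bits 55:0 carry the counter value; bits 63:56 are reserved */
        return uv_read_local_mmr(UVH_RTC) & UVH_RTC_REAL_TIME_CLOCK_MASK;
}
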
1545/* ========================================================================= */ 1872/* ========================================================================= */
1546/* UVH_RTC1_INT_CONFIG */ 1873/* UVH_RTC1_INT_CONFIG */
1547/* ========================================================================= */ 1874/* ========================================================================= */
1548#define UVH_RTC1_INT_CONFIG 0x615c0UL 1875#define UVH_RTC1_INT_CONFIG 0x615c0UL
1549 1876
1550#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 1877#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
1551#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL 1878#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
1552#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 1879#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
1553#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL 1880#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
1554#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 1881#define UVH_RTC1_INT_CONFIG_P_SHFT 13
1555#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL 1882#define UVH_RTC1_INT_CONFIG_T_SHFT 15
1556#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 1883#define UVH_RTC1_INT_CONFIG_M_SHFT 16
1557#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL 1884#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
1558#define UVH_RTC1_INT_CONFIG_P_SHFT 13 1885#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1559#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL 1886#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
1560#define UVH_RTC1_INT_CONFIG_T_SHFT 15 1887#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1561#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL 1888#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1562#define UVH_RTC1_INT_CONFIG_M_SHFT 16 1889#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
1563#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL 1890#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
1564#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 1891#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
1565#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL 1892#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1566 1893
1567union uvh_rtc1_int_config_u { 1894union uvh_rtc1_int_config_u {
1568 unsigned long v; 1895 unsigned long v;
1569 struct uvh_rtc1_int_config_s { 1896 struct uvh_rtc1_int_config_s {
1570 unsigned long vector_ : 8; /* RW */ 1897 unsigned long vector_:8; /* RW */
1571 unsigned long dm : 3; /* RW */ 1898 unsigned long dm:3; /* RW */
1572 unsigned long destmode : 1; /* RW */ 1899 unsigned long destmode:1; /* RW */
1573 unsigned long status : 1; /* RO */ 1900 unsigned long status:1; /* RO */
1574 unsigned long p : 1; /* RO */ 1901 unsigned long p:1; /* RO */
1575 unsigned long rsvd_14 : 1; /* */ 1902 unsigned long rsvd_14:1;
1576 unsigned long t : 1; /* RO */ 1903 unsigned long t:1; /* RO */
1577 unsigned long m : 1; /* RW */ 1904 unsigned long m:1; /* RW */
1578 unsigned long rsvd_17_31: 15; /* */ 1905 unsigned long rsvd_17_31:15;
1579 unsigned long apic_id : 32; /* RW */ 1906 unsigned long apic_id:32; /* RW */
1580 } s; 1907 } s;
1581}; 1908};
1582 1909
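A sketch of composing an RTC1 interrupt configuration from a vector and an APIC id with the SHFT/MASK pairs above; the remaining fields are simply left at zero, and the write helper is assumed as before:

#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_mmrs.h>

static void example_arm_rtc1_interrupt(unsigned long vector, unsigned long apicid)
{
        unsigned long cfg;

        cfg  = (vector << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) &
               UVH_RTC1_INT_CONFIG_VECTOR_MASK;
        cfg |= (apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT) &
               UVH_RTC1_INT_CONFIG_APIC_ID_MASK;
        uv_write_local_mmr(UVH_RTC1_INT_CONFIG, cfg);
}
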
1583/* ========================================================================= */ 1910/* ========================================================================= */
1584/* UVH_SCRATCH5 */ 1911/* UVH_SCRATCH5 */
1585/* ========================================================================= */ 1912/* ========================================================================= */
1586#define UVH_SCRATCH5 0x2d0200UL 1913#define UVH_SCRATCH5 0x2d0200UL
1587#define UVH_SCRATCH5_32 0x778 1914#define UVH_SCRATCH5_32 0x778
1588 1915
1589#define UVH_SCRATCH5_SCRATCH5_SHFT 0 1916#define UVH_SCRATCH5_SCRATCH5_SHFT 0
1590#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL 1917#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
1591 1918
1592union uvh_scratch5_u { 1919union uvh_scratch5_u {
1593 unsigned long v; 1920 unsigned long v;
1594 struct uvh_scratch5_s { 1921 struct uvh_scratch5_s {
1595 unsigned long scratch5 : 64; /* RW, W1CS */ 1922 unsigned long scratch5:64; /* RW, W1CS */
1596 } s; 1923 } s;
1597}; 1924};
1598 1925
1599/* ========================================================================= */ 1926/* ========================================================================= */
1600/* UV2H_EVENT_OCCURRED2 */ 1927/* UV2H_EVENT_OCCURRED2 */
1601/* ========================================================================= */ 1928/* ========================================================================= */
1602#define UV2H_EVENT_OCCURRED2 0x70100UL 1929#define UV2H_EVENT_OCCURRED2 0x70100UL
1603#define UV2H_EVENT_OCCURRED2_32 0xb68 1930#define UV2H_EVENT_OCCURRED2_32 0xb68
1604 1931
1605#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 1932#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
1606#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL 1933#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
1607#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 1934#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
1608#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL 1935#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
1609#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 1936#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
1610#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL 1937#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
1611#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 1938#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
1612#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL 1939#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
1613#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 1940#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
1614#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL 1941#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
1615#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 1942#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
1616#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL 1943#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
1617#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 1944#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
1618#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL 1945#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
1619#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 1946#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
1620#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL 1947#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
1621#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 1948#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
1622#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL 1949#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
1623#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 1950#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
1624#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL 1951#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
1625#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 1952#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
1626#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL 1953#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
1627#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 1954#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
1628#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL 1955#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
1629#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 1956#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
1630#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL 1957#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
1631#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 1958#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
1632#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL 1959#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
1633#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 1960#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
1634#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL 1961#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
1635#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 1962#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
1636#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL 1963#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
1637#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 1964#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
1638#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL 1965#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
1639#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 1966#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
1640#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL 1967#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
1641#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 1968#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
1642#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL 1969#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
1643#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 1970#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
1644#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL 1971#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
1645#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 1972#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
1646#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL 1973#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
1647#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 1974#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
1648#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL 1975#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
1649#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 1976#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
1650#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL 1977#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
1651#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 1978#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
1652#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL 1979#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
1653#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 1980#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
1654#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL 1981#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
1655#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 1982#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
1656#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL 1983#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
1657#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 1984#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
1658#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL 1985#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
1659#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 1986#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
1660#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL 1987#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
1661#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 1988#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
1662#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL 1989#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
1663#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 1990#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
1664#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL 1991#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
1665#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 1992#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
1666#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL 1993#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
1667#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 1994#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
1668#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL 1995#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
1669 1996
1670union uv2h_event_occurred2_u { 1997union uv2h_event_occurred2_u {
1671 unsigned long v; 1998 unsigned long v;
1672 struct uv2h_event_occurred2_s { 1999 struct uv2h_event_occurred2_s {
1673 unsigned long rtc_0 : 1; /* RW */ 2000 unsigned long rtc_0:1; /* RW */
1674 unsigned long rtc_1 : 1; /* RW */ 2001 unsigned long rtc_1:1; /* RW */
1675 unsigned long rtc_2 : 1; /* RW */ 2002 unsigned long rtc_2:1; /* RW */
1676 unsigned long rtc_3 : 1; /* RW */ 2003 unsigned long rtc_3:1; /* RW */
1677 unsigned long rtc_4 : 1; /* RW */ 2004 unsigned long rtc_4:1; /* RW */
1678 unsigned long rtc_5 : 1; /* RW */ 2005 unsigned long rtc_5:1; /* RW */
1679 unsigned long rtc_6 : 1; /* RW */ 2006 unsigned long rtc_6:1; /* RW */
1680 unsigned long rtc_7 : 1; /* RW */ 2007 unsigned long rtc_7:1; /* RW */
1681 unsigned long rtc_8 : 1; /* RW */ 2008 unsigned long rtc_8:1; /* RW */
1682 unsigned long rtc_9 : 1; /* RW */ 2009 unsigned long rtc_9:1; /* RW */
1683 unsigned long rtc_10 : 1; /* RW */ 2010 unsigned long rtc_10:1; /* RW */
1684 unsigned long rtc_11 : 1; /* RW */ 2011 unsigned long rtc_11:1; /* RW */
1685 unsigned long rtc_12 : 1; /* RW */ 2012 unsigned long rtc_12:1; /* RW */
1686 unsigned long rtc_13 : 1; /* RW */ 2013 unsigned long rtc_13:1; /* RW */
1687 unsigned long rtc_14 : 1; /* RW */ 2014 unsigned long rtc_14:1; /* RW */
1688 unsigned long rtc_15 : 1; /* RW */ 2015 unsigned long rtc_15:1; /* RW */
1689 unsigned long rtc_16 : 1; /* RW */ 2016 unsigned long rtc_16:1; /* RW */
1690 unsigned long rtc_17 : 1; /* RW */ 2017 unsigned long rtc_17:1; /* RW */
1691 unsigned long rtc_18 : 1; /* RW */ 2018 unsigned long rtc_18:1; /* RW */
1692 unsigned long rtc_19 : 1; /* RW */ 2019 unsigned long rtc_19:1; /* RW */
1693 unsigned long rtc_20 : 1; /* RW */ 2020 unsigned long rtc_20:1; /* RW */
1694 unsigned long rtc_21 : 1; /* RW */ 2021 unsigned long rtc_21:1; /* RW */
1695 unsigned long rtc_22 : 1; /* RW */ 2022 unsigned long rtc_22:1; /* RW */
1696 unsigned long rtc_23 : 1; /* RW */ 2023 unsigned long rtc_23:1; /* RW */
1697 unsigned long rtc_24 : 1; /* RW */ 2024 unsigned long rtc_24:1; /* RW */
1698 unsigned long rtc_25 : 1; /* RW */ 2025 unsigned long rtc_25:1; /* RW */
1699 unsigned long rtc_26 : 1; /* RW */ 2026 unsigned long rtc_26:1; /* RW */
1700 unsigned long rtc_27 : 1; /* RW */ 2027 unsigned long rtc_27:1; /* RW */
1701 unsigned long rtc_28 : 1; /* RW */ 2028 unsigned long rtc_28:1; /* RW */
1702 unsigned long rtc_29 : 1; /* RW */ 2029 unsigned long rtc_29:1; /* RW */
1703 unsigned long rtc_30 : 1; /* RW */ 2030 unsigned long rtc_30:1; /* RW */
1704 unsigned long rtc_31 : 1; /* RW */ 2031 unsigned long rtc_31:1; /* RW */
1705 unsigned long rsvd_32_63: 32; /* */ 2032 unsigned long rsvd_32_63:32;
1706 } s1; 2033 } s1;
1707}; 2034};
1708 2035
1709/* ========================================================================= */ 2036/* ========================================================================= */
1710/* UV2H_EVENT_OCCURRED2_ALIAS */ 2037/* UV2H_EVENT_OCCURRED2_ALIAS */
1711/* ========================================================================= */ 2038/* ========================================================================= */
1712#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL 2039#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
1713#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 2040#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
1714 2041
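A sketch of testing and acknowledging one of the RTC event bits defined above. Treating the _ALIAS offset as a write-to-clear alias of EVENT_OCCURRED2 is an assumption here, not something this hunk states:

#include <linux/types.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_mmrs.h>

static bool example_rtc0_event_pending(void)
{
        union uv2h_event_occurred2_u ev;

        ev.v = uv_read_local_mmr(UV2H_EVENT_OCCURRED2);
        if (!ev.s1.rtc_0)
                return false;

        /* assumed write-to-clear through the alias register */
        uv_write_local_mmr(UV2H_EVENT_OCCURRED2_ALIAS,
                           UV2H_EVENT_OCCURRED2_RTC_0_MASK);
        return true;
}
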
1715/* ========================================================================= */ 2042/* ========================================================================= */
1716/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */ 2043/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */
1717/* ========================================================================= */ 2044/* ========================================================================= */
1718#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL 2045#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
1719#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 2046#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
1720 2047
1721#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 2048#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
1722#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL 2049#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
1723 2050
1724union uv2h_lb_bau_sb_activation_status_2_u { 2051union uv2h_lb_bau_sb_activation_status_2_u {
1725 unsigned long v; 2052 unsigned long v;
1726 struct uv2h_lb_bau_sb_activation_status_2_s { 2053 struct uv2h_lb_bau_sb_activation_status_2_s {
1727 unsigned long aux_error : 64; /* RW */ 2054 unsigned long aux_error:64; /* RW */
1728 } s1; 2055 } s1;
1729}; 2056};
1730 2057
1731/* ========================================================================= */ 2058/* ========================================================================= */
1732/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */ 2059/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */
1733/* ========================================================================= */ 2060/* ========================================================================= */
1734#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL 2061#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
1735#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0 2062#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
1736 2063
1737#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0 2064#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
1738#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL 2065#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
1739 2066
1740union uv1h_lb_target_physical_apic_id_mask_u { 2067union uv1h_lb_target_physical_apic_id_mask_u {
1741 unsigned long v; 2068 unsigned long v;
1742 struct uv1h_lb_target_physical_apic_id_mask_s { 2069 struct uv1h_lb_target_physical_apic_id_mask_s {
1743 unsigned long bit_enables : 32; /* RW */ 2070 unsigned long bit_enables:32; /* RW */
1744 unsigned long rsvd_32_63 : 32; /* */ 2071 unsigned long rsvd_32_63:32;
1745 } s1; 2072 } s1;
1746}; 2073};
1747 2074
1748 2075
1749#endif /* __ASM_UV_MMRS_X86_H__ */ 2076#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 646b4c1ca69..815285bcace 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -11,10 +11,9 @@ struct vsyscall_gtod_data {
11 time_t wall_time_sec; 11 time_t wall_time_sec;
12 u32 wall_time_nsec; 12 u32 wall_time_nsec;
13 13
14 int sysctl_enabled;
15 struct timezone sys_tz; 14 struct timezone sys_tz;
16 struct { /* extract of a clocksource struct */ 15 struct { /* extract of a clocksource struct */
17 cycle_t (*vread)(void); 16 int vclock_mode;
18 cycle_t cycle_last; 17 cycle_t cycle_last;
19 cycle_t mask; 18 cycle_t mask;
20 u32 mult; 19 u32 mult;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 84471b81046..2caf290e989 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -132,6 +132,8 @@ enum vmcs_field {
132 GUEST_IA32_PAT_HIGH = 0x00002805, 132 GUEST_IA32_PAT_HIGH = 0x00002805,
133 GUEST_IA32_EFER = 0x00002806, 133 GUEST_IA32_EFER = 0x00002806,
134 GUEST_IA32_EFER_HIGH = 0x00002807, 134 GUEST_IA32_EFER_HIGH = 0x00002807,
135 GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
136 GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
135 GUEST_PDPTR0 = 0x0000280a, 137 GUEST_PDPTR0 = 0x0000280a,
136 GUEST_PDPTR0_HIGH = 0x0000280b, 138 GUEST_PDPTR0_HIGH = 0x0000280b,
137 GUEST_PDPTR1 = 0x0000280c, 139 GUEST_PDPTR1 = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
144 HOST_IA32_PAT_HIGH = 0x00002c01, 146 HOST_IA32_PAT_HIGH = 0x00002c01,
145 HOST_IA32_EFER = 0x00002c02, 147 HOST_IA32_EFER = 0x00002c02,
146 HOST_IA32_EFER_HIGH = 0x00002c03, 148 HOST_IA32_EFER_HIGH = 0x00002c03,
149 HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
150 HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
147 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 151 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
148 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 152 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
149 EXCEPTION_BITMAP = 0x00004004, 153 EXCEPTION_BITMAP = 0x00004004,
@@ -426,4 +430,43 @@ struct vmx_msr_entry {
426 u64 value; 430 u64 value;
427} __aligned(16); 431} __aligned(16);
428 432
433/*
434 * Exit Qualifications for entry failure during or after loading guest state
435 */
436#define ENTRY_FAIL_DEFAULT 0
437#define ENTRY_FAIL_PDPTE 2
438#define ENTRY_FAIL_NMI 3
439#define ENTRY_FAIL_VMCS_LINK_PTR 4
440
441/*
442 * VM-instruction error numbers
443 */
444enum vm_instruction_error_number {
445 VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
446 VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
447 VMXERR_VMCLEAR_VMXON_POINTER = 3,
448 VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
449 VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
450 VMXERR_VMRESUME_AFTER_VMXOFF = 6,
451 VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
452 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
453 VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
454 VMXERR_VMPTRLD_VMXON_POINTER = 10,
455 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
456 VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
457 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
458 VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
459 VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
460 VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
461 VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
462 VMXERR_VMCALL_NONCLEAR_VMCS = 19,
463 VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
464 VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
465 VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
466 VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
467 VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
468 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
469 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
470};
471
429#endif 472#endif
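A short sketch (not from this patch) of how the new error-number enum might be consumed, e.g. after reading the VM_INSTRUCTION_ERROR VMCS field; only a few entries are shown.

static const char *vmx_insn_error_name(u32 err)
{
	switch (err) {
	case VMXERR_VMCALL_IN_VMX_ROOT_OPERATION:
		return "VMCALL executed in VMX root operation";
	case VMXERR_VMLAUNCH_NONCLEAR_VMCS:
		return "VMLAUNCH with non-clear VMCS";
	case VMXERR_ENTRY_INVALID_CONTROL_FIELD:
		return "VM entry with invalid control field(s)";
	default:
		return "unrecognized VM-instruction error";
	}
}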
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d55597351f6..eaea1d31f75 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,10 +16,6 @@ enum vsyscall_num {
16#ifdef __KERNEL__ 16#ifdef __KERNEL__
17#include <linux/seqlock.h> 17#include <linux/seqlock.h>
18 18
19/* Definitions for CONFIG_GENERIC_TIME definitions */
20#define __vsyscall_fn \
21 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
22
23#define VGETCPU_RDTSCP 1 19#define VGETCPU_RDTSCP 1
24#define VGETCPU_LSL 2 20#define VGETCPU_LSL 2
25 21
@@ -31,6 +27,12 @@ extern struct timezone sys_tz;
31 27
32extern void map_vsyscall(void); 28extern void map_vsyscall(void);
33 29
30/*
31 * Called on instruction fetch fault in vsyscall page.
32 * Returns true if handled.
33 */
34extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
35
34#endif /* __KERNEL__ */ 36#endif /* __KERNEL__ */
35 37
36#endif /* _ASM_X86_VSYSCALL_H */ 38#endif /* _ASM_X86_VSYSCALL_H */
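A minimal sketch (simplified; not the actual fault-handling code) of how a page-fault path could use the new hook: an instruction-fetch fault landing in the vsyscall page is offered to the emulator before being treated as a bad access. VSYSCALL_START is assumed to come from the fixmap definitions.

static bool try_vsyscall_emulation(struct pt_regs *regs, unsigned long address)
{
	/* Only faults inside the vsyscall page are candidates. */
	if ((address & PAGE_MASK) != VSYSCALL_START)
		return false;

	return emulate_vsyscall(regs, address);	/* true if emulated */
}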
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 341b3559452..de656ac2af4 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -10,15 +10,14 @@
10 * In normal kernel code, they are used like any other variable. 10 * In normal kernel code, they are used like any other variable.
11 * In user code, they are accessed through the VVAR macro. 11 * In user code, they are accessed through the VVAR macro.
12 * 12 *
13 * Each of these variables lives in the vsyscall page, and each 13 * These variables live in a page of kernel data that has an extra RO
14 * one needs a unique offset within the little piece of the page 14 * mapping for userspace. Each variable needs a unique offset within
15 * reserved for vvars. Specify that offset in DECLARE_VVAR. 15 * that page; specify that offset with the DECLARE_VVAR macro. (If
16 * (There are 896 bytes available. If you mess up, the linker will 16 * you mess up, the linker will catch it.)
17 * catch it.)
18 */ 17 */
19 18
20/* Offset of vars within vsyscall page */ 19/* Base address of vvars. This is not ABI. */
21#define VSYSCALL_VARS_OFFSET (3072 + 128) 20#define VVAR_ADDRESS (-10*1024*1024 - 4096)
22 21
23#if defined(__VVAR_KERNEL_LDS) 22#if defined(__VVAR_KERNEL_LDS)
24 23
@@ -26,17 +25,17 @@
26 * right place. 25 * right place.
27 */ 26 */
28#define DECLARE_VVAR(offset, type, name) \ 27#define DECLARE_VVAR(offset, type, name) \
29 EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset) 28 EMIT_VVAR(name, offset)
30 29
31#else 30#else
32 31
33#define DECLARE_VVAR(offset, type, name) \ 32#define DECLARE_VVAR(offset, type, name) \
34 static type const * const vvaraddr_ ## name = \ 33 static type const * const vvaraddr_ ## name = \
35 (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset)); 34 (void *)(VVAR_ADDRESS + (offset));
36 35
37#define DEFINE_VVAR(type, name) \ 36#define DEFINE_VVAR(type, name) \
38 type __vvar_ ## name \ 37 type name \
39 __attribute__((section(".vsyscall_var_" #name), aligned(16))) 38 __attribute__((section(".vvar_" #name), aligned(16)))
40 39
41#define VVAR(name) (*vvaraddr_ ## name) 40#define VVAR(name) (*vvaraddr_ ## name)
42 41
@@ -45,8 +44,7 @@
45/* DECLARE_VVAR(offset, type, name) */ 44/* DECLARE_VVAR(offset, type, name) */
46 45
47DECLARE_VVAR(0, volatile unsigned long, jiffies) 46DECLARE_VVAR(0, volatile unsigned long, jiffies)
48DECLARE_VVAR(8, int, vgetcpu_mode) 47DECLARE_VVAR(16, int, vgetcpu_mode)
49DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) 48DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
50 49
51#undef DECLARE_VVAR 50#undef DECLARE_VVAR
52#undef VSYSCALL_VARS_OFFSET
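A small usage sketch (the variable name is hypothetical, purely for illustration): kernel code defines and updates a vvar directly, while vDSO/userspace-side code reads the read-only alias through the VVAR() accessor.

/* kernel side: places the variable in its own .vvar_* section */
DEFINE_VVAR(volatile unsigned long, example_counter);

/* reader side: a matching DECLARE_VVAR(<offset>, volatile unsigned long,
 * example_counter) would be added to the list above; then: */
static unsigned long read_example_counter(void)
{
	return VVAR(example_counter);
}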
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index d240ea95051..417777de5a4 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -39,6 +39,8 @@
39#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/types.h> 40#include <linux/types.h>
41 41
42#include <trace/events/xen.h>
43
42#include <asm/page.h> 44#include <asm/page.h>
43#include <asm/pgtable.h> 45#include <asm/pgtable.h>
44 46
@@ -459,6 +461,8 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
459{ 461{
460 mcl->op = __HYPERVISOR_fpu_taskswitch; 462 mcl->op = __HYPERVISOR_fpu_taskswitch;
461 mcl->args[0] = set; 463 mcl->args[0] = set;
464
465 trace_xen_mc_entry(mcl, 1);
462} 466}
463 467
464static inline void 468static inline void
@@ -475,6 +479,8 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
475 mcl->args[2] = new_val.pte >> 32; 479 mcl->args[2] = new_val.pte >> 32;
476 mcl->args[3] = flags; 480 mcl->args[3] = flags;
477 } 481 }
482
483 trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4);
478} 484}
479 485
480static inline void 486static inline void
@@ -485,6 +491,8 @@ MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
485 mcl->args[0] = cmd; 491 mcl->args[0] = cmd;
486 mcl->args[1] = (unsigned long)uop; 492 mcl->args[1] = (unsigned long)uop;
487 mcl->args[2] = count; 493 mcl->args[2] = count;
494
495 trace_xen_mc_entry(mcl, 3);
488} 496}
489 497
490static inline void 498static inline void
@@ -504,6 +512,8 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
504 mcl->args[3] = flags; 512 mcl->args[3] = flags;
505 mcl->args[4] = domid; 513 mcl->args[4] = domid;
506 } 514 }
515
516 trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5);
507} 517}
508 518
509static inline void 519static inline void
@@ -520,6 +530,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
520 mcl->args[2] = desc.a; 530 mcl->args[2] = desc.a;
521 mcl->args[3] = desc.b; 531 mcl->args[3] = desc.b;
522 } 532 }
533
534 trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
523} 535}
524 536
525static inline void 537static inline void
@@ -528,6 +540,8 @@ MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
528 mcl->op = __HYPERVISOR_memory_op; 540 mcl->op = __HYPERVISOR_memory_op;
529 mcl->args[0] = cmd; 541 mcl->args[0] = cmd;
530 mcl->args[1] = (unsigned long)arg; 542 mcl->args[1] = (unsigned long)arg;
543
544 trace_xen_mc_entry(mcl, 2);
531} 545}
532 546
533static inline void 547static inline void
@@ -539,6 +553,8 @@ MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
539 mcl->args[1] = count; 553 mcl->args[1] = count;
540 mcl->args[2] = (unsigned long)success_count; 554 mcl->args[2] = (unsigned long)success_count;
541 mcl->args[3] = domid; 555 mcl->args[3] = domid;
556
557 trace_xen_mc_entry(mcl, 4);
542} 558}
543 559
544static inline void 560static inline void
@@ -550,6 +566,8 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
550 mcl->args[1] = count; 566 mcl->args[1] = count;
551 mcl->args[2] = (unsigned long)success_count; 567 mcl->args[2] = (unsigned long)success_count;
552 mcl->args[3] = domid; 568 mcl->args[3] = domid;
569
570 trace_xen_mc_entry(mcl, 4);
553} 571}
554 572
555static inline void 573static inline void
@@ -558,6 +576,8 @@ MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
558 mcl->op = __HYPERVISOR_set_gdt; 576 mcl->op = __HYPERVISOR_set_gdt;
559 mcl->args[0] = (unsigned long)frames; 577 mcl->args[0] = (unsigned long)frames;
560 mcl->args[1] = entries; 578 mcl->args[1] = entries;
579
580 trace_xen_mc_entry(mcl, 2);
561} 581}
562 582
563static inline void 583static inline void
@@ -567,6 +587,8 @@ MULTI_stack_switch(struct multicall_entry *mcl,
567 mcl->op = __HYPERVISOR_stack_switch; 587 mcl->op = __HYPERVISOR_stack_switch;
568 mcl->args[0] = ss; 588 mcl->args[0] = ss;
569 mcl->args[1] = esp; 589 mcl->args[1] = esp;
590
591 trace_xen_mc_entry(mcl, 2);
570} 592}
571 593
572#endif /* _ASM_X86_XEN_HYPERCALL_H */ 594#endif /* _ASM_X86_XEN_HYPERCALL_H */
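The pattern this hunk establishes is the same in every MULTI_* constructor: fill in mcl->op and mcl->args, then report the entry and the number of argument slots actually used to the xen_mc_entry tracepoint. A hypothetical constructor following the same pattern (not part of this patch; the hypercall number is a stand-in) might look like:

static inline void
MULTI_example_op(struct multicall_entry *mcl, unsigned long arg0,
		 unsigned long arg1)
{
	mcl->op = __HYPERVISOR_xen_version;	/* stand-in hypercall number */
	mcl->args[0] = arg0;
	mcl->args[1] = arg1;

	trace_xen_mc_entry(mcl, 2);		/* two argument slots populated */
}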
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 64a619d47d3..7ff4669580c 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -39,7 +39,7 @@ typedef struct xpaddr {
39 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) 39 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
40 40
41extern unsigned long *machine_to_phys_mapping; 41extern unsigned long *machine_to_phys_mapping;
42extern unsigned int machine_to_phys_order; 42extern unsigned long machine_to_phys_nr;
43 43
44extern unsigned long get_phys_to_machine(unsigned long pfn); 44extern unsigned long get_phys_to_machine(unsigned long pfn);
45extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 45extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -87,7 +87,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
87 if (xen_feature(XENFEAT_auto_translated_physmap)) 87 if (xen_feature(XENFEAT_auto_translated_physmap))
88 return mfn; 88 return mfn;
89 89
90 if (unlikely((mfn >> machine_to_phys_order) != 0)) { 90 if (unlikely(mfn >= machine_to_phys_nr)) {
91 pfn = ~0; 91 pfn = ~0;
92 goto try_override; 92 goto try_override;
93 } 93 }
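A tiny sketch of the semantic change (illustrative only): the M2P table size is now tracked as an entry count rather than a power-of-two order, so the range check becomes a plain comparison instead of a shift.

static inline bool mfn_in_m2p_range(unsigned long mfn)
{
	/* old form: out of range when (mfn >> machine_to_phys_order) != 0 */
	return mfn < machine_to_phys_nr;	/* new form */
}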
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 4fbda9a3f33..968d57dd54c 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -14,13 +14,14 @@ static inline int pci_xen_hvm_init(void)
14} 14}
15#endif 15#endif
16#if defined(CONFIG_XEN_DOM0) 16#if defined(CONFIG_XEN_DOM0)
17void __init xen_setup_pirqs(void); 17int __init pci_xen_initial_domain(void);
18int xen_find_device_domain_owner(struct pci_dev *dev); 18int xen_find_device_domain_owner(struct pci_dev *dev);
19int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); 19int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
20int xen_unregister_device_domain_owner(struct pci_dev *dev); 20int xen_unregister_device_domain_owner(struct pci_dev *dev);
21#else 21#else
22static inline void __init xen_setup_pirqs(void) 22static inline int __init pci_xen_initial_domain(void)
23{ 23{
24 return -1;
24} 25}
25static inline int xen_find_device_domain_owner(struct pci_dev *dev) 26static inline int xen_find_device_domain_owner(struct pci_dev *dev)
26{ 27{
diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h
new file mode 100644
index 00000000000..21e1874c0a0
--- /dev/null
+++ b/arch/x86/include/asm/xen/trace_types.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_XEN_TRACE_TYPES_H
2#define _ASM_XEN_TRACE_TYPES_H
3
4enum xen_mc_flush_reason {
5 XEN_MC_FL_NONE, /* explicit flush */
6 XEN_MC_FL_BATCH, /* out of hypercall space */
7 XEN_MC_FL_ARGS, /* out of argument space */
8 XEN_MC_FL_CALLBACK, /* out of callback space */
9};
10
11enum xen_mc_extend_args {
12 XEN_MC_XE_OK,
13 XEN_MC_XE_BAD_OP,
14 XEN_MC_XE_NO_SPACE
15};
16typedef void (*xen_mc_callback_fn_t)(void *);
17
18#endif /* _ASM_XEN_TRACE_TYPES_H */
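A brief sketch (hypothetical callback, for illustration only) of a function matching the xen_mc_callback_fn_t signature declared above, e.g. something a multicall batch could invoke once it has been flushed:

static void example_mc_done(void *data)
{
	unsigned long *completed = data;	/* caller-provided counter */

	(*completed)++;
}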
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d727f8f9433..c84954ad12f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -17,24 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg
17CFLAGS_REMOVE_early_printk.o = -pg 17CFLAGS_REMOVE_early_printk.o = -pg
18endif 18endif
19 19
20#
21# vsyscalls (which work on the user stack) should have
22# no stack-protector checks:
23#
24nostackp := $(call cc-option, -fno-stack-protector)
25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
26CFLAGS_hpet.o := $(nostackp)
27CFLAGS_vread_tsc_64.o := $(nostackp)
28CFLAGS_paravirt.o := $(nostackp)
29GCOV_PROFILE_vsyscall_64.o := n
30GCOV_PROFILE_hpet.o := n
31GCOV_PROFILE_tsc.o := n
32GCOV_PROFILE_vread_tsc_64.o := n
33GCOV_PROFILE_paravirt.o := n
34
35# vread_tsc_64 is hot and should be fully optimized:
36CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
37
38obj-y := process_$(BITS).o signal.o entry_$(BITS).o 20obj-y := process_$(BITS).o signal.o entry_$(BITS).o
39obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 21obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
40obj-y += time.o ioport.o ldt.o dumpstack.o 22obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -43,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
43obj-y += probe_roms.o 25obj-y += probe_roms.o
44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o 28obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
29obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
47obj-y += bootflag.o e820.o 30obj-y += bootflag.o e820.o
48obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 31obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
49obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 32obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -125,7 +108,6 @@ ifeq ($(CONFIG_X86_64),y)
125 108
126 obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o 109 obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
127 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 110 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
128 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
129 111
130 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 112 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
131 obj-y += vsmp_64.o 113 obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 5812404a0d4..f50e7fb2a20 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
149} 149}
150EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 150EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
151 151
152/*
153 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
154 * which can obviate IPI to trigger checking of need_resched.
155 * We execute MONITOR against need_resched and enter optimized wait state
156 * through MWAIT. Whenever someone changes need_resched, we would be woken
157 * up from MWAIT (without an IPI).
158 *
159 * New with Core Duo processors, MWAIT can take some hints based on CPU
160 * capability.
161 */
162void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
163{
164 if (!need_resched()) {
165 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
166 clflush((void *)&current_thread_info()->flags);
167
168 __monitor((void *)&current_thread_info()->flags, 0, 0);
169 smp_mb();
170 if (!need_resched())
171 __mwait(ax, cx);
172 }
173}
174
152void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) 175void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
153{ 176{
154 unsigned int cpu = smp_processor_id(); 177 unsigned int cpu = smp_processor_id();
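A minimal caller sketch for the function added above (hypothetical hint values; real eax hints are CPU-specific C-state encodings): ecx=1 requests the "interrupt break" extension so a pending interrupt wakes MWAIT even with interrupts masked.

static void example_enter_mwait_cstate(void)
{
	unsigned long eax_hint = 0x10;	/* hypothetical C-state sub-state hint */
	unsigned long ecx_hint = 1;	/* MWAIT interrupt-break extension */

	mwait_idle_with_hints(eax_hint, ecx_hint);
}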
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f..00000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
1wakeup.bin
2wakeup.elf
3wakeup.lds
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a81f2d52f86..c6382281624 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/mce.h> 15#include <asm/mce.h>
16#include <asm/nmi.h> 16#include <asm/nmi.h>
17#include <asm/vsyscall.h>
18#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
19#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
20#include <asm/io.h> 19#include <asm/io.h>
@@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
250 249
251extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 250extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
252extern s32 __smp_locks[], __smp_locks_end[]; 251extern s32 __smp_locks[], __smp_locks_end[];
253extern char __vsyscall_0;
254void *text_poke_early(void *addr, const void *opcode, size_t len); 252void *text_poke_early(void *addr, const void *opcode, size_t len);
255 253
256/* Replace instructions with better alternatives for this CPU type. 254/* Replace instructions with better alternatives for this CPU type.
@@ -263,6 +261,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
263 struct alt_instr *end) 261 struct alt_instr *end)
264{ 262{
265 struct alt_instr *a; 263 struct alt_instr *a;
264 u8 *instr, *replacement;
266 u8 insnbuf[MAX_PATCH_LEN]; 265 u8 insnbuf[MAX_PATCH_LEN];
267 266
268 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 267 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
@@ -276,25 +275,23 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
276 * order. 275 * order.
277 */ 276 */
278 for (a = start; a < end; a++) { 277 for (a = start; a < end; a++) {
279 u8 *instr = a->instr; 278 instr = (u8 *)&a->instr_offset + a->instr_offset;
279 replacement = (u8 *)&a->repl_offset + a->repl_offset;
280 BUG_ON(a->replacementlen > a->instrlen); 280 BUG_ON(a->replacementlen > a->instrlen);
281 BUG_ON(a->instrlen > sizeof(insnbuf)); 281 BUG_ON(a->instrlen > sizeof(insnbuf));
282 BUG_ON(a->cpuid >= NCAPINTS*32); 282 BUG_ON(a->cpuid >= NCAPINTS*32);
283 if (!boot_cpu_has(a->cpuid)) 283 if (!boot_cpu_has(a->cpuid))
284 continue; 284 continue;
285#ifdef CONFIG_X86_64 285
286 /* vsyscall code is not mapped yet. resolve it manually. */ 286 memcpy(insnbuf, replacement, a->replacementlen);
287 if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { 287
288 instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); 288 /* 0xe8 is a relative jump; fix the offset. */
289 DPRINTK("%s: vsyscall fixup: %p => %p\n",
290 __func__, a->instr, instr);
291 }
292#endif
293 memcpy(insnbuf, a->replacement, a->replacementlen);
294 if (*insnbuf == 0xe8 && a->replacementlen == 5) 289 if (*insnbuf == 0xe8 && a->replacementlen == 5)
295 *(s32 *)(insnbuf + 1) += a->replacement - a->instr; 290 *(s32 *)(insnbuf + 1) += replacement - instr;
291
296 add_nops(insnbuf + a->replacementlen, 292 add_nops(insnbuf + a->replacementlen,
297 a->instrlen - a->replacementlen); 293 a->instrlen - a->replacementlen);
294
298 text_poke_early(instr, insnbuf, a->instrlen); 295 text_poke_early(instr, insnbuf, a->instrlen);
299 } 296 }
300} 297}
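The offset arithmetic above is worth spelling out: a rel32 CALL (opcode 0xe8) encodes the target minus the address of the next instruction, so copying the 5-byte replacement to the original instruction's address keeps the same absolute target only if the displacement is adjusted by (replacement - instr). A standalone sketch of that adjustment (illustrative, not part of the patch):

static s32 fixup_rel32_displacement(s32 disp, const u8 *replacement,
				    const u8 *instr)
{
	/* old target = replacement + 5 + disp
	 * new target = instr + 5 + new_disp  =>  new_disp = disp + (replacement - instr)
	 */
	return disp + (s32)(replacement - instr);
}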
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b117efd24f7..8a439d364b9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -30,7 +30,7 @@
30#include <linux/syscore_ops.h> 30#include <linux/syscore_ops.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <asm/atomic.h> 33#include <linux/atomic.h>
34#include <asm/mtrr.h> 34#include <asm/mtrr.h>
35#include <asm/pgtable.h> 35#include <asm/pgtable.h>
36#include <asm/proto.h> 36#include <asm/proto.h>
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 7c3a95e54ec..00000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2764 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/bitmap.h>
23#include <linux/slab.h>
24#include <linux/debugfs.h>
25#include <linux/scatterlist.h>
26#include <linux/dma-mapping.h>
27#include <linux/iommu-helper.h>
28#include <linux/iommu.h>
29#include <linux/delay.h>
30#include <asm/proto.h>
31#include <asm/iommu.h>
32#include <asm/gart.h>
33#include <asm/dma.h>
34#include <asm/amd_iommu_proto.h>
35#include <asm/amd_iommu_types.h>
36#include <asm/amd_iommu.h>
37
38#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
39
40#define LOOP_TIMEOUT 100000
41
42static DEFINE_RWLOCK(amd_iommu_devtable_lock);
43
44/* A list of preallocated protection domains */
45static LIST_HEAD(iommu_pd_list);
46static DEFINE_SPINLOCK(iommu_pd_list_lock);
47
48/*
49 * Domain for untranslated devices - only allocated
50 * if iommu=pt passed on kernel cmd line.
51 */
52static struct protection_domain *pt_domain;
53
54static struct iommu_ops amd_iommu_ops;
55
56/*
 57 * general struct to manage commands sent to an IOMMU
58 */
59struct iommu_cmd {
60 u32 data[4];
61};
62
63static void update_domain(struct protection_domain *domain);
64
65/****************************************************************************
66 *
67 * Helper functions
68 *
69 ****************************************************************************/
70
71static inline u16 get_device_id(struct device *dev)
72{
73 struct pci_dev *pdev = to_pci_dev(dev);
74
75 return calc_devid(pdev->bus->number, pdev->devfn);
76}
77
78static struct iommu_dev_data *get_dev_data(struct device *dev)
79{
80 return dev->archdata.iommu;
81}
82
83/*
84 * In this function the list of preallocated protection domains is traversed to
85 * find the domain for a specific device
86 */
87static struct dma_ops_domain *find_protection_domain(u16 devid)
88{
89 struct dma_ops_domain *entry, *ret = NULL;
90 unsigned long flags;
91 u16 alias = amd_iommu_alias_table[devid];
92
93 if (list_empty(&iommu_pd_list))
94 return NULL;
95
96 spin_lock_irqsave(&iommu_pd_list_lock, flags);
97
98 list_for_each_entry(entry, &iommu_pd_list, list) {
99 if (entry->target_dev == devid ||
100 entry->target_dev == alias) {
101 ret = entry;
102 break;
103 }
104 }
105
106 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
107
108 return ret;
109}
110
111/*
112 * This function checks if the driver got a valid device from the caller to
113 * avoid dereferencing invalid pointers.
114 */
115static bool check_device(struct device *dev)
116{
117 u16 devid;
118
119 if (!dev || !dev->dma_mask)
120 return false;
121
122 /* No device or no PCI device */
123 if (dev->bus != &pci_bus_type)
124 return false;
125
126 devid = get_device_id(dev);
127
128 /* Out of our scope? */
129 if (devid > amd_iommu_last_bdf)
130 return false;
131
132 if (amd_iommu_rlookup_table[devid] == NULL)
133 return false;
134
135 return true;
136}
137
138static int iommu_init_device(struct device *dev)
139{
140 struct iommu_dev_data *dev_data;
141 struct pci_dev *pdev;
142 u16 devid, alias;
143
144 if (dev->archdata.iommu)
145 return 0;
146
147 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
148 if (!dev_data)
149 return -ENOMEM;
150
151 dev_data->dev = dev;
152
153 devid = get_device_id(dev);
154 alias = amd_iommu_alias_table[devid];
155 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
156 if (pdev)
157 dev_data->alias = &pdev->dev;
158 else {
159 kfree(dev_data);
160 return -ENOTSUPP;
161 }
162
163 atomic_set(&dev_data->bind, 0);
164
165 dev->archdata.iommu = dev_data;
166
167
168 return 0;
169}
170
171static void iommu_ignore_device(struct device *dev)
172{
173 u16 devid, alias;
174
175 devid = get_device_id(dev);
176 alias = amd_iommu_alias_table[devid];
177
178 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
179 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
180
181 amd_iommu_rlookup_table[devid] = NULL;
182 amd_iommu_rlookup_table[alias] = NULL;
183}
184
185static void iommu_uninit_device(struct device *dev)
186{
187 kfree(dev->archdata.iommu);
188}
189
190void __init amd_iommu_uninit_devices(void)
191{
192 struct pci_dev *pdev = NULL;
193
194 for_each_pci_dev(pdev) {
195
196 if (!check_device(&pdev->dev))
197 continue;
198
199 iommu_uninit_device(&pdev->dev);
200 }
201}
202
203int __init amd_iommu_init_devices(void)
204{
205 struct pci_dev *pdev = NULL;
206 int ret = 0;
207
208 for_each_pci_dev(pdev) {
209
210 if (!check_device(&pdev->dev))
211 continue;
212
213 ret = iommu_init_device(&pdev->dev);
214 if (ret == -ENOTSUPP)
215 iommu_ignore_device(&pdev->dev);
216 else if (ret)
217 goto out_free;
218 }
219
220 return 0;
221
222out_free:
223
224 amd_iommu_uninit_devices();
225
226 return ret;
227}
228#ifdef CONFIG_AMD_IOMMU_STATS
229
230/*
231 * Initialization code for statistics collection
232 */
233
234DECLARE_STATS_COUNTER(compl_wait);
235DECLARE_STATS_COUNTER(cnt_map_single);
236DECLARE_STATS_COUNTER(cnt_unmap_single);
237DECLARE_STATS_COUNTER(cnt_map_sg);
238DECLARE_STATS_COUNTER(cnt_unmap_sg);
239DECLARE_STATS_COUNTER(cnt_alloc_coherent);
240DECLARE_STATS_COUNTER(cnt_free_coherent);
241DECLARE_STATS_COUNTER(cross_page);
242DECLARE_STATS_COUNTER(domain_flush_single);
243DECLARE_STATS_COUNTER(domain_flush_all);
244DECLARE_STATS_COUNTER(alloced_io_mem);
245DECLARE_STATS_COUNTER(total_map_requests);
246
247static struct dentry *stats_dir;
248static struct dentry *de_fflush;
249
250static void amd_iommu_stats_add(struct __iommu_counter *cnt)
251{
252 if (stats_dir == NULL)
253 return;
254
255 cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
256 &cnt->value);
257}
258
259static void amd_iommu_stats_init(void)
260{
261 stats_dir = debugfs_create_dir("amd-iommu", NULL);
262 if (stats_dir == NULL)
263 return;
264
265 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
266 (u32 *)&amd_iommu_unmap_flush);
267
268 amd_iommu_stats_add(&compl_wait);
269 amd_iommu_stats_add(&cnt_map_single);
270 amd_iommu_stats_add(&cnt_unmap_single);
271 amd_iommu_stats_add(&cnt_map_sg);
272 amd_iommu_stats_add(&cnt_unmap_sg);
273 amd_iommu_stats_add(&cnt_alloc_coherent);
274 amd_iommu_stats_add(&cnt_free_coherent);
275 amd_iommu_stats_add(&cross_page);
276 amd_iommu_stats_add(&domain_flush_single);
277 amd_iommu_stats_add(&domain_flush_all);
278 amd_iommu_stats_add(&alloced_io_mem);
279 amd_iommu_stats_add(&total_map_requests);
280}
281
282#endif
283
284/****************************************************************************
285 *
286 * Interrupt handling functions
287 *
288 ****************************************************************************/
289
290static void dump_dte_entry(u16 devid)
291{
292 int i;
293
294 for (i = 0; i < 8; ++i)
295 pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
296 amd_iommu_dev_table[devid].data[i]);
297}
298
299static void dump_command(unsigned long phys_addr)
300{
301 struct iommu_cmd *cmd = phys_to_virt(phys_addr);
302 int i;
303
304 for (i = 0; i < 4; ++i)
305 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
306}
307
308static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
309{
310 u32 *event = __evt;
311 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
312 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
313 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
314 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
315 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
316
317 printk(KERN_ERR "AMD-Vi: Event logged [");
318
319 switch (type) {
320 case EVENT_TYPE_ILL_DEV:
321 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
322 "address=0x%016llx flags=0x%04x]\n",
323 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
324 address, flags);
325 dump_dte_entry(devid);
326 break;
327 case EVENT_TYPE_IO_FAULT:
328 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
329 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
330 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
331 domid, address, flags);
332 break;
333 case EVENT_TYPE_DEV_TAB_ERR:
334 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
335 "address=0x%016llx flags=0x%04x]\n",
336 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
337 address, flags);
338 break;
339 case EVENT_TYPE_PAGE_TAB_ERR:
340 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
341 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
342 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
343 domid, address, flags);
344 break;
345 case EVENT_TYPE_ILL_CMD:
346 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
347 dump_command(address);
348 break;
349 case EVENT_TYPE_CMD_HARD_ERR:
350 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
351 "flags=0x%04x]\n", address, flags);
352 break;
353 case EVENT_TYPE_IOTLB_INV_TO:
354 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
355 "address=0x%016llx]\n",
356 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
357 address);
358 break;
359 case EVENT_TYPE_INV_DEV_REQ:
360 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
361 "address=0x%016llx flags=0x%04x]\n",
362 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
363 address, flags);
364 break;
365 default:
366 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
367 }
368}
369
370static void iommu_poll_events(struct amd_iommu *iommu)
371{
372 u32 head, tail;
373 unsigned long flags;
374
375 spin_lock_irqsave(&iommu->lock, flags);
376
377 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
378 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
379
380 while (head != tail) {
381 iommu_print_event(iommu, iommu->evt_buf + head);
382 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
383 }
384
385 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
386
387 spin_unlock_irqrestore(&iommu->lock, flags);
388}
389
390irqreturn_t amd_iommu_int_thread(int irq, void *data)
391{
392 struct amd_iommu *iommu;
393
394 for_each_iommu(iommu)
395 iommu_poll_events(iommu);
396
397 return IRQ_HANDLED;
398}
399
400irqreturn_t amd_iommu_int_handler(int irq, void *data)
401{
402 return IRQ_WAKE_THREAD;
403}
404
405/****************************************************************************
406 *
407 * IOMMU command queuing functions
408 *
409 ****************************************************************************/
410
411static int wait_on_sem(volatile u64 *sem)
412{
413 int i = 0;
414
415 while (*sem == 0 && i < LOOP_TIMEOUT) {
416 udelay(1);
417 i += 1;
418 }
419
420 if (i == LOOP_TIMEOUT) {
421 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
422 return -EIO;
423 }
424
425 return 0;
426}
427
428static void copy_cmd_to_buffer(struct amd_iommu *iommu,
429 struct iommu_cmd *cmd,
430 u32 tail)
431{
432 u8 *target;
433
434 target = iommu->cmd_buf + tail;
435 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
436
437 /* Copy command to buffer */
438 memcpy(target, cmd, sizeof(*cmd));
439
440 /* Tell the IOMMU about it */
441 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
442}
443
444static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
445{
446 WARN_ON(address & 0x7ULL);
447
448 memset(cmd, 0, sizeof(*cmd));
449 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
450 cmd->data[1] = upper_32_bits(__pa(address));
451 cmd->data[2] = 1;
452 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
453}
454
455static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
456{
457 memset(cmd, 0, sizeof(*cmd));
458 cmd->data[0] = devid;
459 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
460}
461
462static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
463 size_t size, u16 domid, int pde)
464{
465 u64 pages;
466 int s;
467
468 pages = iommu_num_pages(address, size, PAGE_SIZE);
469 s = 0;
470
471 if (pages > 1) {
472 /*
473 * If we have to flush more than one page, flush all
474 * TLB entries for this domain
475 */
476 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
477 s = 1;
478 }
479
480 address &= PAGE_MASK;
481
482 memset(cmd, 0, sizeof(*cmd));
483 cmd->data[1] |= domid;
484 cmd->data[2] = lower_32_bits(address);
485 cmd->data[3] = upper_32_bits(address);
486 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
487 if (s) /* size bit - we flush more than one 4kb page */
488 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 489	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
490 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
491}
492
493static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
494 u64 address, size_t size)
495{
496 u64 pages;
497 int s;
498
499 pages = iommu_num_pages(address, size, PAGE_SIZE);
500 s = 0;
501
502 if (pages > 1) {
503 /*
504 * If we have to flush more than one page, flush all
505 * TLB entries for this domain
506 */
507 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
508 s = 1;
509 }
510
511 address &= PAGE_MASK;
512
513 memset(cmd, 0, sizeof(*cmd));
514 cmd->data[0] = devid;
515 cmd->data[0] |= (qdep & 0xff) << 24;
516 cmd->data[1] = devid;
517 cmd->data[2] = lower_32_bits(address);
518 cmd->data[3] = upper_32_bits(address);
519 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
520 if (s)
521 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
522}
523
524static void build_inv_all(struct iommu_cmd *cmd)
525{
526 memset(cmd, 0, sizeof(*cmd));
527 CMD_SET_TYPE(cmd, CMD_INV_ALL);
528}
529
530/*
 531 * Writes the command to the IOMMU's command buffer and informs the
532 * hardware about the new command.
533 */
534static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
535{
536 u32 left, tail, head, next_tail;
537 unsigned long flags;
538
539 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
540
541again:
542 spin_lock_irqsave(&iommu->lock, flags);
543
544 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
545 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
546 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
547 left = (head - next_tail) % iommu->cmd_buf_size;
548
549 if (left <= 2) {
550 struct iommu_cmd sync_cmd;
551 volatile u64 sem = 0;
552 int ret;
553
554 build_completion_wait(&sync_cmd, (u64)&sem);
555 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
556
557 spin_unlock_irqrestore(&iommu->lock, flags);
558
559 if ((ret = wait_on_sem(&sem)) != 0)
560 return ret;
561
562 goto again;
563 }
564
565 copy_cmd_to_buffer(iommu, cmd, tail);
566
567 /* We need to sync now to make sure all commands are processed */
568 iommu->need_sync = true;
569
570 spin_unlock_irqrestore(&iommu->lock, flags);
571
572 return 0;
573}
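An aside on the free-space computation in iommu_queue_command() above (illustrative sketch; assumes the command buffer size is a power of two, as the ring arithmetic requires): free space is the distance from the next tail back around to the head, and when it drops to a couple of entries a COMPLETION_WAIT is injected and waited on before retrying.

static u32 cmd_ring_space(u32 head, u32 tail, u32 buf_size, u32 cmd_size)
{
	u32 next_tail = (tail + cmd_size) % buf_size;

	/* Unsigned wrap-around plus the modulo gives the correct distance
	 * for a power-of-two buf_size, mirroring the check above. */
	return (head - next_tail) % buf_size;
}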
574
575/*
576 * This function queues a completion wait command into the command
577 * buffer of an IOMMU
578 */
579static int iommu_completion_wait(struct amd_iommu *iommu)
580{
581 struct iommu_cmd cmd;
582 volatile u64 sem = 0;
583 int ret;
584
585 if (!iommu->need_sync)
586 return 0;
587
588 build_completion_wait(&cmd, (u64)&sem);
589
590 ret = iommu_queue_command(iommu, &cmd);
591 if (ret)
592 return ret;
593
594 return wait_on_sem(&sem);
595}
596
597static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
598{
599 struct iommu_cmd cmd;
600
601 build_inv_dte(&cmd, devid);
602
603 return iommu_queue_command(iommu, &cmd);
604}
605
606static void iommu_flush_dte_all(struct amd_iommu *iommu)
607{
608 u32 devid;
609
610 for (devid = 0; devid <= 0xffff; ++devid)
611 iommu_flush_dte(iommu, devid);
612
613 iommu_completion_wait(iommu);
614}
615
616/*
617 * This function uses heavy locking and may disable irqs for some time. But
618 * this is no issue because it is only called during resume.
619 */
620static void iommu_flush_tlb_all(struct amd_iommu *iommu)
621{
622 u32 dom_id;
623
624 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
625 struct iommu_cmd cmd;
626 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
627 dom_id, 1);
628 iommu_queue_command(iommu, &cmd);
629 }
630
631 iommu_completion_wait(iommu);
632}
633
634static void iommu_flush_all(struct amd_iommu *iommu)
635{
636 struct iommu_cmd cmd;
637
638 build_inv_all(&cmd);
639
640 iommu_queue_command(iommu, &cmd);
641 iommu_completion_wait(iommu);
642}
643
644void iommu_flush_all_caches(struct amd_iommu *iommu)
645{
646 if (iommu_feature(iommu, FEATURE_IA)) {
647 iommu_flush_all(iommu);
648 } else {
649 iommu_flush_dte_all(iommu);
650 iommu_flush_tlb_all(iommu);
651 }
652}
653
654/*
655 * Command send function for flushing on-device TLB
656 */
657static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
658{
659 struct pci_dev *pdev = to_pci_dev(dev);
660 struct amd_iommu *iommu;
661 struct iommu_cmd cmd;
662 u16 devid;
663 int qdep;
664
665 qdep = pci_ats_queue_depth(pdev);
666 devid = get_device_id(dev);
667 iommu = amd_iommu_rlookup_table[devid];
668
669 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
670
671 return iommu_queue_command(iommu, &cmd);
672}
673
674/*
675 * Command send function for invalidating a device table entry
676 */
677static int device_flush_dte(struct device *dev)
678{
679 struct amd_iommu *iommu;
680 struct pci_dev *pdev;
681 u16 devid;
682 int ret;
683
684 pdev = to_pci_dev(dev);
685 devid = get_device_id(dev);
686 iommu = amd_iommu_rlookup_table[devid];
687
688 ret = iommu_flush_dte(iommu, devid);
689 if (ret)
690 return ret;
691
692 if (pci_ats_enabled(pdev))
693 ret = device_flush_iotlb(dev, 0, ~0UL);
694
695 return ret;
696}
697
698/*
699 * TLB invalidation function which is called from the mapping functions.
700 * It invalidates a single PTE if the range to flush is within a single
701 * page. Otherwise it flushes the whole TLB of the IOMMU.
702 */
703static void __domain_flush_pages(struct protection_domain *domain,
704 u64 address, size_t size, int pde)
705{
706 struct iommu_dev_data *dev_data;
707 struct iommu_cmd cmd;
708 int ret = 0, i;
709
710 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
711
712 for (i = 0; i < amd_iommus_present; ++i) {
713 if (!domain->dev_iommu[i])
714 continue;
715
716 /*
717 * Devices of this domain are behind this IOMMU
718 * We need a TLB flush
719 */
720 ret |= iommu_queue_command(amd_iommus[i], &cmd);
721 }
722
723 list_for_each_entry(dev_data, &domain->dev_list, list) {
724 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
725
726 if (!pci_ats_enabled(pdev))
727 continue;
728
729 ret |= device_flush_iotlb(dev_data->dev, address, size);
730 }
731
732 WARN_ON(ret);
733}
734
735static void domain_flush_pages(struct protection_domain *domain,
736 u64 address, size_t size)
737{
738 __domain_flush_pages(domain, address, size, 0);
739}
740
741/* Flush the whole IO/TLB for a given protection domain */
742static void domain_flush_tlb(struct protection_domain *domain)
743{
744 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
745}
746
747/* Flush the whole IO/TLB for a given protection domain - including PDE */
748static void domain_flush_tlb_pde(struct protection_domain *domain)
749{
750 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
751}
752
753static void domain_flush_complete(struct protection_domain *domain)
754{
755 int i;
756
757 for (i = 0; i < amd_iommus_present; ++i) {
758 if (!domain->dev_iommu[i])
759 continue;
760
761 /*
762 * Devices of this domain are behind this IOMMU
763 * We need to wait for completion of all commands.
764 */
765 iommu_completion_wait(amd_iommus[i]);
766 }
767}
768
769
770/*
771 * This function flushes the DTEs for all devices in domain
772 */
773static void domain_flush_devices(struct protection_domain *domain)
774{
775 struct iommu_dev_data *dev_data;
776 unsigned long flags;
777
778 spin_lock_irqsave(&domain->lock, flags);
779
780 list_for_each_entry(dev_data, &domain->dev_list, list)
781 device_flush_dte(dev_data->dev);
782
783 spin_unlock_irqrestore(&domain->lock, flags);
784}
785
786/****************************************************************************
787 *
 788 * The functions below are used to create the page table mappings for
789 * unity mapped regions.
790 *
791 ****************************************************************************/
792
793/*
794 * This function is used to add another level to an IO page table. Adding
795 * another level increases the size of the address space by 9 bits to a size up
796 * to 64 bits.
797 */
798static bool increase_address_space(struct protection_domain *domain,
799 gfp_t gfp)
800{
801 u64 *pte;
802
803 if (domain->mode == PAGE_MODE_6_LEVEL)
804 /* address space already 64 bit large */
805 return false;
806
807 pte = (void *)get_zeroed_page(gfp);
808 if (!pte)
809 return false;
810
811 *pte = PM_LEVEL_PDE(domain->mode,
812 virt_to_phys(domain->pt_root));
813 domain->pt_root = pte;
814 domain->mode += 1;
815 domain->updated = true;
816
817 return true;
818}
819
820static u64 *alloc_pte(struct protection_domain *domain,
821 unsigned long address,
822 unsigned long page_size,
823 u64 **pte_page,
824 gfp_t gfp)
825{
826 int level, end_lvl;
827 u64 *pte, *page;
828
829 BUG_ON(!is_power_of_2(page_size));
830
831 while (address > PM_LEVEL_SIZE(domain->mode))
832 increase_address_space(domain, gfp);
833
834 level = domain->mode - 1;
835 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
836 address = PAGE_SIZE_ALIGN(address, page_size);
837 end_lvl = PAGE_SIZE_LEVEL(page_size);
838
839 while (level > end_lvl) {
840 if (!IOMMU_PTE_PRESENT(*pte)) {
841 page = (u64 *)get_zeroed_page(gfp);
842 if (!page)
843 return NULL;
844 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
845 }
846
847 /* No level skipping support yet */
848 if (PM_PTE_LEVEL(*pte) != level)
849 return NULL;
850
851 level -= 1;
852
853 pte = IOMMU_PTE_PAGE(*pte);
854
855 if (pte_page && level == end_lvl)
856 *pte_page = pte;
857
858 pte = &pte[PM_LEVEL_INDEX(level, address)];
859 }
860
861 return pte;
862}
863
864/*
865 * This function checks if there is a PTE for a given dma address. If
866 * there is one, it returns the pointer to it.
867 */
868static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
869{
870 int level;
871 u64 *pte;
872
873 if (address > PM_LEVEL_SIZE(domain->mode))
874 return NULL;
875
876 level = domain->mode - 1;
877 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
878
879 while (level > 0) {
880
881 /* Not Present */
882 if (!IOMMU_PTE_PRESENT(*pte))
883 return NULL;
884
885 /* Large PTE */
886 if (PM_PTE_LEVEL(*pte) == 0x07) {
887 unsigned long pte_mask, __pte;
888
889 /*
890 * If we have a series of large PTEs, make
891 * sure to return a pointer to the first one.
892 */
893 pte_mask = PTE_PAGE_SIZE(*pte);
894 pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
895 __pte = ((unsigned long)pte) & pte_mask;
896
897 return (u64 *)__pte;
898 }
899
900 /* No level skipping support yet */
901 if (PM_PTE_LEVEL(*pte) != level)
902 return NULL;
903
904 level -= 1;
905
906 /* Walk to the next level */
907 pte = IOMMU_PTE_PAGE(*pte);
908 pte = &pte[PM_LEVEL_INDEX(level, address)];
909 }
910
911 return pte;
912}
913
914/*
 915 * Generic mapping function. It maps a physical address into a DMA
916 * address space. It allocates the page table pages if necessary.
917 * In the future it can be extended to a generic mapping function
918 * supporting all features of AMD IOMMU page tables like level skipping
919 * and full 64 bit address spaces.
920 */
921static int iommu_map_page(struct protection_domain *dom,
922 unsigned long bus_addr,
923 unsigned long phys_addr,
924 int prot,
925 unsigned long page_size)
926{
927 u64 __pte, *pte;
928 int i, count;
929
930 if (!(prot & IOMMU_PROT_MASK))
931 return -EINVAL;
932
933 bus_addr = PAGE_ALIGN(bus_addr);
934 phys_addr = PAGE_ALIGN(phys_addr);
935 count = PAGE_SIZE_PTE_COUNT(page_size);
936 pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
937
938 for (i = 0; i < count; ++i)
939 if (IOMMU_PTE_PRESENT(pte[i]))
940 return -EBUSY;
941
942 if (page_size > PAGE_SIZE) {
943 __pte = PAGE_SIZE_PTE(phys_addr, page_size);
944 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
945 } else
946 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
947
948 if (prot & IOMMU_PROT_IR)
949 __pte |= IOMMU_PTE_IR;
950 if (prot & IOMMU_PROT_IW)
951 __pte |= IOMMU_PTE_IW;
952
953 for (i = 0; i < count; ++i)
954 pte[i] = __pte;
955
956 update_domain(dom);
957
958 return 0;
959}
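A hypothetical caller sketch for the helper above (assumes the IOMMU_PROT_* flags from amd_iommu_types.h): identity-map a single 4 KiB page read/write into a protection domain.

static int example_identity_map_page(struct protection_domain *dom,
				     unsigned long addr)
{
	return iommu_map_page(dom, addr, addr,
			      IOMMU_PROT_IR | IOMMU_PROT_IW, PAGE_SIZE);
}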
960
961static unsigned long iommu_unmap_page(struct protection_domain *dom,
962 unsigned long bus_addr,
963 unsigned long page_size)
964{
965 unsigned long long unmap_size, unmapped;
966 u64 *pte;
967
968 BUG_ON(!is_power_of_2(page_size));
969
970 unmapped = 0;
971
972 while (unmapped < page_size) {
973
974 pte = fetch_pte(dom, bus_addr);
975
976 if (!pte) {
977 /*
978 * No PTE for this address
979 * move forward in 4kb steps
980 */
981 unmap_size = PAGE_SIZE;
982 } else if (PM_PTE_LEVEL(*pte) == 0) {
983 /* 4kb PTE found for this address */
984 unmap_size = PAGE_SIZE;
985 *pte = 0ULL;
986 } else {
987 int count, i;
988
989 /* Large PTE found which maps this address */
990 unmap_size = PTE_PAGE_SIZE(*pte);
991 count = PAGE_SIZE_PTE_COUNT(unmap_size);
992 for (i = 0; i < count; i++)
993 pte[i] = 0ULL;
994 }
995
996 bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
997 unmapped += unmap_size;
998 }
999
1000 BUG_ON(!is_power_of_2(unmapped));
1001
1002 return unmapped;
1003}
1004
1005/*
1006 * This function checks if a specific unity mapping entry is needed for
1007 * this specific IOMMU.
1008 */
1009static int iommu_for_unity_map(struct amd_iommu *iommu,
1010 struct unity_map_entry *entry)
1011{
1012 u16 bdf, i;
1013
1014 for (i = entry->devid_start; i <= entry->devid_end; ++i) {
1015 bdf = amd_iommu_alias_table[i];
1016 if (amd_iommu_rlookup_table[bdf] == iommu)
1017 return 1;
1018 }
1019
1020 return 0;
1021}
1022
1023/*
1024 * This function actually applies the mapping to the page table of the
1025 * dma_ops domain.
1026 */
1027static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
1028 struct unity_map_entry *e)
1029{
1030 u64 addr;
1031 int ret;
1032
1033 for (addr = e->address_start; addr < e->address_end;
1034 addr += PAGE_SIZE) {
1035 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
1036 PAGE_SIZE);
1037 if (ret)
1038 return ret;
1039 /*
1040 * if unity mapping is in aperture range mark the page
1041 * as allocated in the aperture
1042 */
1043 if (addr < dma_dom->aperture_size)
1044 __set_bit(addr >> PAGE_SHIFT,
1045 dma_dom->aperture[0]->bitmap);
1046 }
1047
1048 return 0;
1049}
1050
1051/*
1052 * Init the unity mappings for a specific IOMMU in the system
1053 *
1054 * Basically iterates over all unity mapping entries and applies them to
 1055 * the default DMA domain of that IOMMU if necessary.
1056 */
1057static int iommu_init_unity_mappings(struct amd_iommu *iommu)
1058{
1059 struct unity_map_entry *entry;
1060 int ret;
1061
1062 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
1063 if (!iommu_for_unity_map(iommu, entry))
1064 continue;
1065 ret = dma_ops_unity_map(iommu->default_dom, entry);
1066 if (ret)
1067 return ret;
1068 }
1069
1070 return 0;
1071}
1072
1073/*
1074 * Inits the unity mappings required for a specific device
1075 */
1076static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
1077 u16 devid)
1078{
1079 struct unity_map_entry *e;
1080 int ret;
1081
1082 list_for_each_entry(e, &amd_iommu_unity_map, list) {
1083 if (!(devid >= e->devid_start && devid <= e->devid_end))
1084 continue;
1085 ret = dma_ops_unity_map(dma_dom, e);
1086 if (ret)
1087 return ret;
1088 }
1089
1090 return 0;
1091}
1092
1093/****************************************************************************
1094 *
1095 * The next functions belong to the address allocator for the dma_ops
1096 * interface functions. They work like the allocators in the other IOMMU
 1097 * drivers. It's basically a bitmap which marks the allocated pages in
1098 * the aperture. Maybe it could be enhanced in the future to a more
1099 * efficient allocator.
1100 *
1101 ****************************************************************************/
1102
1103/*
1104 * The address allocator core functions.
1105 *
1106 * called with domain->lock held
1107 */
1108
1109/*
1110 * Used to reserve address ranges in the aperture (e.g. for exclusion
1111 * ranges.
1112 */
1113static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1114 unsigned long start_page,
1115 unsigned int pages)
1116{
1117 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1118
1119 if (start_page + pages > last_page)
1120 pages = last_page - start_page;
1121
1122 for (i = start_page; i < start_page + pages; ++i) {
1123 int index = i / APERTURE_RANGE_PAGES;
1124 int page = i % APERTURE_RANGE_PAGES;
1125 __set_bit(page, dom->aperture[index]->bitmap);
1126 }
1127}
1128
1129/*
1130 * This function is used to add a new aperture range to an existing
1131 * aperture in case of dma_ops domain allocation or address allocation
1132 * failure.
1133 */
1134static int alloc_new_range(struct dma_ops_domain *dma_dom,
1135 bool populate, gfp_t gfp)
1136{
1137 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
1138 struct amd_iommu *iommu;
1139 unsigned long i;
1140
1141#ifdef CONFIG_IOMMU_STRESS
1142 populate = false;
1143#endif
1144
1145 if (index >= APERTURE_MAX_RANGES)
1146 return -ENOMEM;
1147
1148 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
1149 if (!dma_dom->aperture[index])
1150 return -ENOMEM;
1151
1152 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
1153 if (!dma_dom->aperture[index]->bitmap)
1154 goto out_free;
1155
1156 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
1157
1158 if (populate) {
1159 unsigned long address = dma_dom->aperture_size;
1160 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
1161 u64 *pte, *pte_page;
1162
1163 for (i = 0; i < num_ptes; ++i) {
1164 pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
1165 &pte_page, gfp);
1166 if (!pte)
1167 goto out_free;
1168
1169 dma_dom->aperture[index]->pte_pages[i] = pte_page;
1170
1171 address += APERTURE_RANGE_SIZE / 64;
1172 }
1173 }
1174
1175 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1176
1177 /* Initialize the exclusion range if necessary */
1178 for_each_iommu(iommu) {
1179 if (iommu->exclusion_start &&
1180 iommu->exclusion_start >= dma_dom->aperture[index]->offset
1181 && iommu->exclusion_start < dma_dom->aperture_size) {
1182 unsigned long startpage;
1183 int pages = iommu_num_pages(iommu->exclusion_start,
1184 iommu->exclusion_length,
1185 PAGE_SIZE);
1186 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1187 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1188 }
1189 }
1190
1191 /*
1192 * Check for areas already mapped as present in the new aperture
1193 * range and mark those pages as reserved in the allocator. Such
1194 * mappings may already exist as a result of requested unity
1195 * mappings for devices.
1196 */
1197 for (i = dma_dom->aperture[index]->offset;
1198 i < dma_dom->aperture_size;
1199 i += PAGE_SIZE) {
1200 u64 *pte = fetch_pte(&dma_dom->domain, i);
1201 if (!pte || !IOMMU_PTE_PRESENT(*pte))
1202 continue;
1203
 1204		dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT, 1);
1205 }
1206
1207 update_domain(&dma_dom->domain);
1208
1209 return 0;
1210
1211out_free:
1212 update_domain(&dma_dom->domain);
1213
1214 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
1215
1216 kfree(dma_dom->aperture[index]);
1217 dma_dom->aperture[index] = NULL;
1218
1219 return -ENOMEM;
1220}
1221
1222static unsigned long dma_ops_area_alloc(struct device *dev,
1223 struct dma_ops_domain *dom,
1224 unsigned int pages,
1225 unsigned long align_mask,
1226 u64 dma_mask,
1227 unsigned long start)
1228{
1229 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
1230 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
1231 int i = start >> APERTURE_RANGE_SHIFT;
1232 unsigned long boundary_size;
1233 unsigned long address = -1;
1234 unsigned long limit;
1235
1236 next_bit >>= PAGE_SHIFT;
1237
1238 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
1239 PAGE_SIZE) >> PAGE_SHIFT;
1240
1241 for (;i < max_index; ++i) {
1242 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
1243
1244 if (dom->aperture[i]->offset >= dma_mask)
1245 break;
1246
1247 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
1248 dma_mask >> PAGE_SHIFT);
1249
1250 address = iommu_area_alloc(dom->aperture[i]->bitmap,
1251 limit, next_bit, pages, 0,
1252 boundary_size, align_mask);
1253 if (address != -1) {
1254 address = dom->aperture[i]->offset +
1255 (address << PAGE_SHIFT);
1256 dom->next_address = address + (pages << PAGE_SHIFT);
1257 break;
1258 }
1259
1260 next_bit = 0;
1261 }
1262
1263 return address;
1264}
1265
1266static unsigned long dma_ops_alloc_addresses(struct device *dev,
1267 struct dma_ops_domain *dom,
1268 unsigned int pages,
1269 unsigned long align_mask,
1270 u64 dma_mask)
1271{
1272 unsigned long address;
1273
1274#ifdef CONFIG_IOMMU_STRESS
1275 dom->next_address = 0;
1276 dom->need_flush = true;
1277#endif
1278
1279 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1280 dma_mask, dom->next_address);
1281
1282 if (address == -1) {
1283 dom->next_address = 0;
1284 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1285 dma_mask, 0);
1286 dom->need_flush = true;
1287 }
1288
1289 if (unlikely(address == -1))
1290 address = DMA_ERROR_CODE;
1291
1292 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
1293
1294 return address;
1295}
1296
1297/*
1298 * The address free function.
1299 *
1300 * called with domain->lock held
1301 */
1302static void dma_ops_free_addresses(struct dma_ops_domain *dom,
1303 unsigned long address,
1304 unsigned int pages)
1305{
1306 unsigned i = address >> APERTURE_RANGE_SHIFT;
1307 struct aperture_range *range = dom->aperture[i];
1308
1309 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
1310
1311#ifdef CONFIG_IOMMU_STRESS
1312 if (i < 4)
1313 return;
1314#endif
1315
1316 if (address >= dom->next_address)
1317 dom->need_flush = true;
1318
1319 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
1320
1321 bitmap_clear(range->bitmap, address, pages);
1322
1323}
1324
1325/****************************************************************************
1326 *
1327 * The next functions belong to the domain allocation. A domain is
1328 * allocated for every IOMMU as the default domain. If device isolation
 1329 * is enabled, every device gets its own domain. The most important thing
1330 * about domains is the page table mapping the DMA address space they
1331 * contain.
1332 *
1333 ****************************************************************************/
1334
1335/*
1336 * This function adds a protection domain to the global protection domain list
1337 */
1338static void add_domain_to_list(struct protection_domain *domain)
1339{
1340 unsigned long flags;
1341
1342 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1343 list_add(&domain->list, &amd_iommu_pd_list);
1344 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1345}
1346
1347/*
 1348 * This function removes a protection domain from the global
1349 * protection domain list
1350 */
1351static void del_domain_from_list(struct protection_domain *domain)
1352{
1353 unsigned long flags;
1354
1355 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1356 list_del(&domain->list);
1357 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1358}
1359
1360static u16 domain_id_alloc(void)
1361{
1362 unsigned long flags;
1363 int id;
1364
1365 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1366 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1367 BUG_ON(id == 0);
1368 if (id > 0 && id < MAX_DOMAIN_ID)
1369 __set_bit(id, amd_iommu_pd_alloc_bitmap);
1370 else
1371 id = 0;
1372 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1373
1374 return id;
1375}
1376
1377static void domain_id_free(int id)
1378{
1379 unsigned long flags;
1380
1381 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1382 if (id > 0 && id < MAX_DOMAIN_ID)
1383 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
1384 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1385}
1386
1387static void free_pagetable(struct protection_domain *domain)
1388{
1389 int i, j;
1390 u64 *p1, *p2, *p3;
1391
1392 p1 = domain->pt_root;
1393
1394 if (!p1)
1395 return;
1396
1397 for (i = 0; i < 512; ++i) {
1398 if (!IOMMU_PTE_PRESENT(p1[i]))
1399 continue;
1400
1401 p2 = IOMMU_PTE_PAGE(p1[i]);
1402 for (j = 0; j < 512; ++j) {
1403 if (!IOMMU_PTE_PRESENT(p2[j]))
1404 continue;
1405 p3 = IOMMU_PTE_PAGE(p2[j]);
1406 free_page((unsigned long)p3);
1407 }
1408
1409 free_page((unsigned long)p2);
1410 }
1411
1412 free_page((unsigned long)p1);
1413
1414 domain->pt_root = NULL;
1415}
1416
1417/*
1418 * Free a domain, only used if something went wrong in the
1419 * allocation path and we need to free an already allocated page table
1420 */
1421static void dma_ops_domain_free(struct dma_ops_domain *dom)
1422{
1423 int i;
1424
1425 if (!dom)
1426 return;
1427
1428 del_domain_from_list(&dom->domain);
1429
1430 free_pagetable(&dom->domain);
1431
1432 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
1433 if (!dom->aperture[i])
1434 continue;
1435 free_page((unsigned long)dom->aperture[i]->bitmap);
1436 kfree(dom->aperture[i]);
1437 }
1438
1439 kfree(dom);
1440}
1441
1442/*
1443 * Allocates a new protection domain usable for the dma_ops functions.
1444 * It also initializes the page table and the address allocator data
1445 * structures required for the dma_ops interface
1446 */
1447static struct dma_ops_domain *dma_ops_domain_alloc(void)
1448{
1449 struct dma_ops_domain *dma_dom;
1450
1451 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
1452 if (!dma_dom)
1453 return NULL;
1454
1455 spin_lock_init(&dma_dom->domain.lock);
1456
1457 dma_dom->domain.id = domain_id_alloc();
1458 if (dma_dom->domain.id == 0)
1459 goto free_dma_dom;
1460 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1461 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1462 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1463 dma_dom->domain.flags = PD_DMA_OPS_MASK;
1464 dma_dom->domain.priv = dma_dom;
1465 if (!dma_dom->domain.pt_root)
1466 goto free_dma_dom;
1467
1468 dma_dom->need_flush = false;
1469 dma_dom->target_dev = 0xffff;
1470
1471 add_domain_to_list(&dma_dom->domain);
1472
1473 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1474 goto free_dma_dom;
1475
1476 /*
1477 * mark the first page as allocated so we never return 0 as
1478 * a valid dma-address, which allows us to use 0 as the error value
1479 */
1480 dma_dom->aperture[0]->bitmap[0] = 1;
1481 dma_dom->next_address = 0;
1482
1483
1484 return dma_dom;
1485
1486free_dma_dom:
1487 dma_ops_domain_free(dma_dom);
1488
1489 return NULL;
1490}
1491
1492/*
1493 * little helper function to check whether a given protection domain is a
1494 * dma_ops domain
1495 */
1496static bool dma_ops_domain(struct protection_domain *domain)
1497{
1498 return domain->flags & PD_DMA_OPS_MASK;
1499}
1500
1501static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1502{
1503 u64 pte_root = virt_to_phys(domain->pt_root);
1504 u32 flags = 0;
1505
1506 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1507 << DEV_ENTRY_MODE_SHIFT;
1508 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1509
1510 if (ats)
1511 flags |= DTE_FLAG_IOTLB;
1512
1513 amd_iommu_dev_table[devid].data[3] |= flags;
1514 amd_iommu_dev_table[devid].data[2] = domain->id;
1515 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1516 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1517}
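/*
 * Illustrative summary (added, not from the original source) of the device
 * table entry that set_dte_entry() builds above: data[0] and data[1] hold
 * the lower and upper 32 bits of the page-table root pointer combined with
 * the paging mode and the IOMMU_PTE_P/TV/IR/IW bits, data[2] carries the
 * 16-bit domain id, and data[3] takes extra flags such as DTE_FLAG_IOTLB
 * when ATS is enabled for the device.
 */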
1518
1519static void clear_dte_entry(u16 devid)
1520{
1521 /* remove entry from the device table seen by the hardware */
1522 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1523 amd_iommu_dev_table[devid].data[1] = 0;
1524 amd_iommu_dev_table[devid].data[2] = 0;
1525
1526 amd_iommu_apply_erratum_63(devid);
1527}
1528
1529static void do_attach(struct device *dev, struct protection_domain *domain)
1530{
1531 struct iommu_dev_data *dev_data;
1532 struct amd_iommu *iommu;
1533 struct pci_dev *pdev;
1534 bool ats = false;
1535 u16 devid;
1536
1537 devid = get_device_id(dev);
1538 iommu = amd_iommu_rlookup_table[devid];
1539 dev_data = get_dev_data(dev);
1540 pdev = to_pci_dev(dev);
1541
1542 if (amd_iommu_iotlb_sup)
1543 ats = pci_ats_enabled(pdev);
1544
1545 /* Update data structures */
1546 dev_data->domain = domain;
1547 list_add(&dev_data->list, &domain->dev_list);
1548 set_dte_entry(devid, domain, ats);
1549
1550 /* Do reference counting */
1551 domain->dev_iommu[iommu->index] += 1;
1552 domain->dev_cnt += 1;
1553
1554 /* Flush the DTE entry */
1555 device_flush_dte(dev);
1556}
1557
1558static void do_detach(struct device *dev)
1559{
1560 struct iommu_dev_data *dev_data;
1561 struct amd_iommu *iommu;
1562 u16 devid;
1563
1564 devid = get_device_id(dev);
1565 iommu = amd_iommu_rlookup_table[devid];
1566 dev_data = get_dev_data(dev);
1567
1568 /* decrease reference counters */
1569 dev_data->domain->dev_iommu[iommu->index] -= 1;
1570 dev_data->domain->dev_cnt -= 1;
1571
1572 /* Update data structures */
1573 dev_data->domain = NULL;
1574 list_del(&dev_data->list);
1575 clear_dte_entry(devid);
1576
1577 /* Flush the DTE entry */
1578 device_flush_dte(dev);
1579}
1580
1581/*
1582 * If a device is not yet associated with a domain, this function
1583 * assigns it to the domain and makes it visible to the hardware
1584 */
1585static int __attach_device(struct device *dev,
1586 struct protection_domain *domain)
1587{
1588 struct iommu_dev_data *dev_data, *alias_data;
1589 int ret;
1590
1591 dev_data = get_dev_data(dev);
1592 alias_data = get_dev_data(dev_data->alias);
1593
1594 if (!alias_data)
1595 return -EINVAL;
1596
1597 /* lock domain */
1598 spin_lock(&domain->lock);
1599
1600 /* Some sanity checks */
1601 ret = -EBUSY;
1602 if (alias_data->domain != NULL &&
1603 alias_data->domain != domain)
1604 goto out_unlock;
1605
1606 if (dev_data->domain != NULL &&
1607 dev_data->domain != domain)
1608 goto out_unlock;
1609
1610 /* Do real assignment */
1611 if (dev_data->alias != dev) {
1612 alias_data = get_dev_data(dev_data->alias);
1613 if (alias_data->domain == NULL)
1614 do_attach(dev_data->alias, domain);
1615
1616 atomic_inc(&alias_data->bind);
1617 }
1618
1619 if (dev_data->domain == NULL)
1620 do_attach(dev, domain);
1621
1622 atomic_inc(&dev_data->bind);
1623
1624 ret = 0;
1625
1626out_unlock:
1627
1628 /* ready */
1629 spin_unlock(&domain->lock);
1630
1631 return ret;
1632}
1633
1634/*
1635 * If a device is not yet associated with a domain, this function
1636 * assigns it to the domain and makes it visible to the hardware
1637 */
1638static int attach_device(struct device *dev,
1639 struct protection_domain *domain)
1640{
1641 struct pci_dev *pdev = to_pci_dev(dev);
1642 unsigned long flags;
1643 int ret;
1644
1645 if (amd_iommu_iotlb_sup)
1646 pci_enable_ats(pdev, PAGE_SHIFT);
1647
1648 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1649 ret = __attach_device(dev, domain);
1650 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1651
1652 /*
1653 * We might boot into a crash-kernel here. The crashed kernel
1654 * left the caches in the IOMMU dirty. So we have to flush
1655 * here to evict all dirty stuff.
1656 */
1657 domain_flush_tlb_pde(domain);
1658
1659 return ret;
1660}
1661
1662/*
1663 * Removes a device from a protection domain (unlocked)
1664 */
1665static void __detach_device(struct device *dev)
1666{
1667 struct iommu_dev_data *dev_data = get_dev_data(dev);
1668 struct iommu_dev_data *alias_data;
1669 struct protection_domain *domain;
1670 unsigned long flags;
1671
1672 BUG_ON(!dev_data->domain);
1673
1674 domain = dev_data->domain;
1675
1676 spin_lock_irqsave(&domain->lock, flags);
1677
1678 if (dev_data->alias != dev) {
1679 alias_data = get_dev_data(dev_data->alias);
1680 if (atomic_dec_and_test(&alias_data->bind))
1681 do_detach(dev_data->alias);
1682 }
1683
1684 if (atomic_dec_and_test(&dev_data->bind))
1685 do_detach(dev);
1686
1687 spin_unlock_irqrestore(&domain->lock, flags);
1688
1689 /*
1690 * If we run in passthrough mode the device must be assigned to the
1691 * passthrough domain if it is detached from any other domain.
1692 * Make sure we can deassign from the pt_domain itself.
1693 */
1694 if (iommu_pass_through &&
1695 (dev_data->domain == NULL && domain != pt_domain))
1696 __attach_device(dev, pt_domain);
1697}
1698
1699/*
1700 * Removes a device from a protection domain (with devtable_lock held)
1701 */
1702static void detach_device(struct device *dev)
1703{
1704 struct pci_dev *pdev = to_pci_dev(dev);
1705 unsigned long flags;
1706
1707 /* lock device table */
1708 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1709 __detach_device(dev);
1710 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1711
1712 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1713 pci_disable_ats(pdev);
1714}
1715
1716/*
1717 * Find out the protection domain structure for a given PCI device. This
1718 * will give us the pointer to the page table root for example.
1719 */
1720static struct protection_domain *domain_for_device(struct device *dev)
1721{
1722 struct protection_domain *dom;
1723 struct iommu_dev_data *dev_data, *alias_data;
1724 unsigned long flags;
1725 u16 devid;
1726
1727 devid = get_device_id(dev);
1728 dev_data = get_dev_data(dev);
1729 alias_data = get_dev_data(dev_data->alias);
1730 if (!alias_data)
1731 return NULL;
1732
1733 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1734 dom = dev_data->domain;
1735 if (dom == NULL &&
1736 alias_data->domain != NULL) {
1737 __attach_device(dev, alias_data->domain);
1738 dom = alias_data->domain;
1739 }
1740
1741 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1742
1743 return dom;
1744}
1745
1746static int device_change_notifier(struct notifier_block *nb,
1747 unsigned long action, void *data)
1748{
1749 struct device *dev = data;
1750 u16 devid;
1751 struct protection_domain *domain;
1752 struct dma_ops_domain *dma_domain;
1753 struct amd_iommu *iommu;
1754 unsigned long flags;
1755
1756 if (!check_device(dev))
1757 return 0;
1758
1759 devid = get_device_id(dev);
1760 iommu = amd_iommu_rlookup_table[devid];
1761
1762 switch (action) {
1763 case BUS_NOTIFY_UNBOUND_DRIVER:
1764
1765 domain = domain_for_device(dev);
1766
1767 if (!domain)
1768 goto out;
1769 if (iommu_pass_through)
1770 break;
1771 detach_device(dev);
1772 break;
1773 case BUS_NOTIFY_ADD_DEVICE:
1774
1775 iommu_init_device(dev);
1776
1777 domain = domain_for_device(dev);
1778
1779 /* allocate a protection domain if a device is added */
1780 dma_domain = find_protection_domain(devid);
1781 if (dma_domain)
1782 goto out;
1783 dma_domain = dma_ops_domain_alloc();
1784 if (!dma_domain)
1785 goto out;
1786 dma_domain->target_dev = devid;
1787
1788 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1789 list_add_tail(&dma_domain->list, &iommu_pd_list);
1790 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1791
1792 break;
1793 case BUS_NOTIFY_DEL_DEVICE:
1794
1795 iommu_uninit_device(dev);
1796
1797 default:
1798 goto out;
1799 }
1800
1801 device_flush_dte(dev);
1802 iommu_completion_wait(iommu);
1803
1804out:
1805 return 0;
1806}
1807
1808static struct notifier_block device_nb = {
1809 .notifier_call = device_change_notifier,
1810};
1811
1812void amd_iommu_init_notifier(void)
1813{
1814 bus_register_notifier(&pci_bus_type, &device_nb);
1815}
1816
1817/*****************************************************************************
1818 *
1819 * The next functions belong to the dma_ops mapping/unmapping code.
1820 *
1821 *****************************************************************************/
1822
1823/*
1824 * In the dma_ops path we only have the struct device. This function
1825 * finds the corresponding IOMMU, the protection domain and the
1826 * requestor id for a given device.
1827 * If the device is not yet associated with a domain this is also done
1828 * in this function.
1829 */
1830static struct protection_domain *get_domain(struct device *dev)
1831{
1832 struct protection_domain *domain;
1833 struct dma_ops_domain *dma_dom;
1834 u16 devid = get_device_id(dev);
1835
1836 if (!check_device(dev))
1837 return ERR_PTR(-EINVAL);
1838
1839 domain = domain_for_device(dev);
1840 if (domain != NULL && !dma_ops_domain(domain))
1841 return ERR_PTR(-EBUSY);
1842
1843 if (domain != NULL)
1844 return domain;
1845
1846 /* Device not bound yet - bind it */
1847 dma_dom = find_protection_domain(devid);
1848 if (!dma_dom)
1849 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1850 attach_device(dev, &dma_dom->domain);
1851 DUMP_printk("Using protection domain %d for device %s\n",
1852 dma_dom->domain.id, dev_name(dev));
1853
1854 return &dma_dom->domain;
1855}
1856
1857static void update_device_table(struct protection_domain *domain)
1858{
1859 struct iommu_dev_data *dev_data;
1860
1861 list_for_each_entry(dev_data, &domain->dev_list, list) {
1862 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1863 u16 devid = get_device_id(dev_data->dev);
1864 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1865 }
1866}
1867
1868static void update_domain(struct protection_domain *domain)
1869{
1870 if (!domain->updated)
1871 return;
1872
1873 update_device_table(domain);
1874
1875 domain_flush_devices(domain);
1876 domain_flush_tlb_pde(domain);
1877
1878 domain->updated = false;
1879}
1880
1881/*
1882 * This function fetches the PTE for a given address in the aperture
1883 */
1884static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1885 unsigned long address)
1886{
1887 struct aperture_range *aperture;
1888 u64 *pte, *pte_page;
1889
1890 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1891 if (!aperture)
1892 return NULL;
1893
1894 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1895 if (!pte) {
1896 pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
1897 GFP_ATOMIC);
1898 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1899 } else
1900 pte += PM_LEVEL_INDEX(0, address);
1901
1902 update_domain(&dom->domain);
1903
1904 return pte;
1905}
1906
1907/*
1908 * This is the generic map function. It maps one 4 KB page at paddr to
1909 * the given address in the DMA address space for the domain.
1910 */
1911static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1912 unsigned long address,
1913 phys_addr_t paddr,
1914 int direction)
1915{
1916 u64 *pte, __pte;
1917
1918 WARN_ON(address > dom->aperture_size);
1919
1920 paddr &= PAGE_MASK;
1921
1922 pte = dma_ops_get_pte(dom, address);
1923 if (!pte)
1924 return DMA_ERROR_CODE;
1925
1926 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1927
1928 if (direction == DMA_TO_DEVICE)
1929 __pte |= IOMMU_PTE_IR;
1930 else if (direction == DMA_FROM_DEVICE)
1931 __pte |= IOMMU_PTE_IW;
1932 else if (direction == DMA_BIDIRECTIONAL)
1933 __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
1934
1935 WARN_ON(*pte);
1936
1937 *pte = __pte;
1938
1939 return (dma_addr_t)address;
1940}
1941
1942/*
1943 * The generic unmapping function for one page in the DMA address space.
1944 */
1945static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1946 unsigned long address)
1947{
1948 struct aperture_range *aperture;
1949 u64 *pte;
1950
1951 if (address >= dom->aperture_size)
1952 return;
1953
1954 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1955 if (!aperture)
1956 return;
1957
1958 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1959 if (!pte)
1960 return;
1961
1962 pte += PM_LEVEL_INDEX(0, address);
1963
1964 WARN_ON(!*pte);
1965
1966 *pte = 0ULL;
1967}
1968
1969/*
1970 * This function contains common code for mapping of a physically
1971 * contiguous memory region into DMA address space. It is used by all
1972 * mapping functions provided with this IOMMU driver.
1973 * Must be called with the domain lock held.
1974 */
1975static dma_addr_t __map_single(struct device *dev,
1976 struct dma_ops_domain *dma_dom,
1977 phys_addr_t paddr,
1978 size_t size,
1979 int dir,
1980 bool align,
1981 u64 dma_mask)
1982{
1983 dma_addr_t offset = paddr & ~PAGE_MASK;
1984 dma_addr_t address, start, ret;
1985 unsigned int pages;
1986 unsigned long align_mask = 0;
1987 int i;
1988
1989 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
1990 paddr &= PAGE_MASK;
1991
1992 INC_STATS_COUNTER(total_map_requests);
1993
1994 if (pages > 1)
1995 INC_STATS_COUNTER(cross_page);
1996
1997 if (align)
1998 align_mask = (1UL << get_order(size)) - 1;
1999
2000retry:
2001 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
2002 dma_mask);
2003 if (unlikely(address == DMA_ERROR_CODE)) {
2004 /*
2005 * setting next_address here will let the address
2006 * allocator scan only the newly allocated range in the
2007 * first run. This is a small optimization.
2008 */
2009 dma_dom->next_address = dma_dom->aperture_size;
2010
2011 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
2012 goto out;
2013
2014 /*
2015 * aperture was successfully enlarged by 128 MB, try
2016 * allocation again
2017 */
2018 goto retry;
2019 }
2020
2021 start = address;
2022 for (i = 0; i < pages; ++i) {
2023 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
2024 if (ret == DMA_ERROR_CODE)
2025 goto out_unmap;
2026
2027 paddr += PAGE_SIZE;
2028 start += PAGE_SIZE;
2029 }
2030 address += offset;
2031
2032 ADD_STATS_COUNTER(alloced_io_mem, size);
2033
2034 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
2035 domain_flush_tlb(&dma_dom->domain);
2036 dma_dom->need_flush = false;
2037 } else if (unlikely(amd_iommu_np_cache))
2038 domain_flush_pages(&dma_dom->domain, address, size);
2039
2040out:
2041 return address;
2042
2043out_unmap:
2044
2045 for (--i; i >= 0; --i) {
2046 start -= PAGE_SIZE;
2047 dma_ops_domain_unmap(dma_dom, start);
2048 }
2049
2050 dma_ops_free_addresses(dma_dom, address, pages);
2051
2052 return DMA_ERROR_CODE;
2053}
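/*
 * Worked example (illustrative, not from the original source), assuming
 * 4 KB pages: for paddr == 0x12345678 and size == 0x2000,
 *
 *	offset = paddr & ~PAGE_MASK = 0x678
 *	pages  = iommu_num_pages(paddr, size, PAGE_SIZE) = 3
 *
 * because 0x2000 bytes starting at page offset 0x678 span three 4 KB
 * pages. Three IOMMU pages are therefore mapped at the allocated aperture
 * address, and the value returned to the caller is that address plus the
 * 0x678 byte offset into the first page.
 */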
2054
2055/*
2056 * Does the reverse of the __map_single function. Must be called with
2057 * the domain lock held too
2058 */
2059static void __unmap_single(struct dma_ops_domain *dma_dom,
2060 dma_addr_t dma_addr,
2061 size_t size,
2062 int dir)
2063{
2064 dma_addr_t flush_addr;
2065 dma_addr_t i, start;
2066 unsigned int pages;
2067
2068 if ((dma_addr == DMA_ERROR_CODE) ||
2069 (dma_addr + size > dma_dom->aperture_size))
2070 return;
2071
2072 flush_addr = dma_addr;
2073 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
2074 dma_addr &= PAGE_MASK;
2075 start = dma_addr;
2076
2077 for (i = 0; i < pages; ++i) {
2078 dma_ops_domain_unmap(dma_dom, start);
2079 start += PAGE_SIZE;
2080 }
2081
2082 SUB_STATS_COUNTER(alloced_io_mem, size);
2083
2084 dma_ops_free_addresses(dma_dom, dma_addr, pages);
2085
2086 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
2087 domain_flush_pages(&dma_dom->domain, flush_addr, size);
2088 dma_dom->need_flush = false;
2089 }
2090}
2091
2092/*
2093 * The exported map_single function for dma_ops.
2094 */
2095static dma_addr_t map_page(struct device *dev, struct page *page,
2096 unsigned long offset, size_t size,
2097 enum dma_data_direction dir,
2098 struct dma_attrs *attrs)
2099{
2100 unsigned long flags;
2101 struct protection_domain *domain;
2102 dma_addr_t addr;
2103 u64 dma_mask;
2104 phys_addr_t paddr = page_to_phys(page) + offset;
2105
2106 INC_STATS_COUNTER(cnt_map_single);
2107
2108 domain = get_domain(dev);
2109 if (PTR_ERR(domain) == -EINVAL)
2110 return (dma_addr_t)paddr;
2111 else if (IS_ERR(domain))
2112 return DMA_ERROR_CODE;
2113
2114 dma_mask = *dev->dma_mask;
2115
2116 spin_lock_irqsave(&domain->lock, flags);
2117
2118 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
2119 dma_mask);
2120 if (addr == DMA_ERROR_CODE)
2121 goto out;
2122
2123 domain_flush_complete(domain);
2124
2125out:
2126 spin_unlock_irqrestore(&domain->lock, flags);
2127
2128 return addr;
2129}
2130
2131/*
2132 * The exported unmap_single function for dma_ops.
2133 */
2134static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2135 enum dma_data_direction dir, struct dma_attrs *attrs)
2136{
2137 unsigned long flags;
2138 struct protection_domain *domain;
2139
2140 INC_STATS_COUNTER(cnt_unmap_single);
2141
2142 domain = get_domain(dev);
2143 if (IS_ERR(domain))
2144 return;
2145
2146 spin_lock_irqsave(&domain->lock, flags);
2147
2148 __unmap_single(domain->priv, dma_addr, size, dir);
2149
2150 domain_flush_complete(domain);
2151
2152 spin_unlock_irqrestore(&domain->lock, flags);
2153}
2154
2155/*
2156 * This is a special map_sg function which is used to map a
2157 * device that is not handled by an AMD IOMMU in the system.
2158 */
2159static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
2160 int nelems, int dir)
2161{
2162 struct scatterlist *s;
2163 int i;
2164
2165 for_each_sg(sglist, s, nelems, i) {
2166 s->dma_address = (dma_addr_t)sg_phys(s);
2167 s->dma_length = s->length;
2168 }
2169
2170 return nelems;
2171}
2172
2173/*
2174 * The exported map_sg function for dma_ops (handles scatter-gather
2175 * lists).
2176 */
2177static int map_sg(struct device *dev, struct scatterlist *sglist,
2178 int nelems, enum dma_data_direction dir,
2179 struct dma_attrs *attrs)
2180{
2181 unsigned long flags;
2182 struct protection_domain *domain;
2183 int i;
2184 struct scatterlist *s;
2185 phys_addr_t paddr;
2186 int mapped_elems = 0;
2187 u64 dma_mask;
2188
2189 INC_STATS_COUNTER(cnt_map_sg);
2190
2191 domain = get_domain(dev);
2192 if (PTR_ERR(domain) == -EINVAL)
2193 return map_sg_no_iommu(dev, sglist, nelems, dir);
2194 else if (IS_ERR(domain))
2195 return 0;
2196
2197 dma_mask = *dev->dma_mask;
2198
2199 spin_lock_irqsave(&domain->lock, flags);
2200
2201 for_each_sg(sglist, s, nelems, i) {
2202 paddr = sg_phys(s);
2203
2204 s->dma_address = __map_single(dev, domain->priv,
2205 paddr, s->length, dir, false,
2206 dma_mask);
2207
2208 if (s->dma_address) {
2209 s->dma_length = s->length;
2210 mapped_elems++;
2211 } else
2212 goto unmap;
2213 }
2214
2215 domain_flush_complete(domain);
2216
2217out:
2218 spin_unlock_irqrestore(&domain->lock, flags);
2219
2220 return mapped_elems;
2221unmap:
2222 for_each_sg(sglist, s, mapped_elems, i) {
2223 if (s->dma_address)
2224 __unmap_single(domain->priv, s->dma_address,
2225 s->dma_length, dir);
2226 s->dma_address = s->dma_length = 0;
2227 }
2228
2229 mapped_elems = 0;
2230
2231 goto out;
2232}
2233
2234/*
2235 * The exported unmap_sg function for dma_ops (handles scatter-gather
2236 * lists).
2237 */
2238static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2239 int nelems, enum dma_data_direction dir,
2240 struct dma_attrs *attrs)
2241{
2242 unsigned long flags;
2243 struct protection_domain *domain;
2244 struct scatterlist *s;
2245 int i;
2246
2247 INC_STATS_COUNTER(cnt_unmap_sg);
2248
2249 domain = get_domain(dev);
2250 if (IS_ERR(domain))
2251 return;
2252
2253 spin_lock_irqsave(&domain->lock, flags);
2254
2255 for_each_sg(sglist, s, nelems, i) {
2256 __unmap_single(domain->priv, s->dma_address,
2257 s->dma_length, dir);
2258 s->dma_address = s->dma_length = 0;
2259 }
2260
2261 domain_flush_complete(domain);
2262
2263 spin_unlock_irqrestore(&domain->lock, flags);
2264}
2265
2266/*
2267 * The exported alloc_coherent function for dma_ops.
2268 */
2269static void *alloc_coherent(struct device *dev, size_t size,
2270 dma_addr_t *dma_addr, gfp_t flag)
2271{
2272 unsigned long flags;
2273 void *virt_addr;
2274 struct protection_domain *domain;
2275 phys_addr_t paddr;
2276 u64 dma_mask = dev->coherent_dma_mask;
2277
2278 INC_STATS_COUNTER(cnt_alloc_coherent);
2279
2280 domain = get_domain(dev);
2281 if (PTR_ERR(domain) == -EINVAL) {
2282 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2283 *dma_addr = __pa(virt_addr);
2284 return virt_addr;
2285 } else if (IS_ERR(domain))
2286 return NULL;
2287
2288 dma_mask = dev->coherent_dma_mask;
2289 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2290 flag |= __GFP_ZERO;
2291
2292 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2293 if (!virt_addr)
2294 return NULL;
2295
2296 paddr = virt_to_phys(virt_addr);
2297
2298 if (!dma_mask)
2299 dma_mask = *dev->dma_mask;
2300
2301 spin_lock_irqsave(&domain->lock, flags);
2302
2303 *dma_addr = __map_single(dev, domain->priv, paddr,
2304 size, DMA_BIDIRECTIONAL, true, dma_mask);
2305
2306 if (*dma_addr == DMA_ERROR_CODE) {
2307 spin_unlock_irqrestore(&domain->lock, flags);
2308 goto out_free;
2309 }
2310
2311 domain_flush_complete(domain);
2312
2313 spin_unlock_irqrestore(&domain->lock, flags);
2314
2315 return virt_addr;
2316
2317out_free:
2318
2319 free_pages((unsigned long)virt_addr, get_order(size));
2320
2321 return NULL;
2322}
2323
2324/*
2325 * The exported free_coherent function for dma_ops.
2326 */
2327static void free_coherent(struct device *dev, size_t size,
2328 void *virt_addr, dma_addr_t dma_addr)
2329{
2330 unsigned long flags;
2331 struct protection_domain *domain;
2332
2333 INC_STATS_COUNTER(cnt_free_coherent);
2334
2335 domain = get_domain(dev);
2336 if (IS_ERR(domain))
2337 goto free_mem;
2338
2339 spin_lock_irqsave(&domain->lock, flags);
2340
2341 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2342
2343 domain_flush_complete(domain);
2344
2345 spin_unlock_irqrestore(&domain->lock, flags);
2346
2347free_mem:
2348 free_pages((unsigned long)virt_addr, get_order(size));
2349}
2350
2351/*
2352 * This function is called by the DMA layer to find out if we can handle a
2353 * particular device. It is part of the dma_ops.
2354 */
2355static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2356{
2357 return check_device(dev);
2358}
2359
2360/*
2361 * The function for pre-allocating protection domains.
2362 *
2363 * Once the driver core informs the DMA layer when a driver grabs a
2364 * device, we won't need to preallocate the protection domains anymore.
2365 * For now we have to.
2366 */
2367static void prealloc_protection_domains(void)
2368{
2369 struct pci_dev *dev = NULL;
2370 struct dma_ops_domain *dma_dom;
2371 u16 devid;
2372
2373 for_each_pci_dev(dev) {
2374
2375 /* Do we handle this device? */
2376 if (!check_device(&dev->dev))
2377 continue;
2378
2379 /* Is there already any domain for it? */
2380 if (domain_for_device(&dev->dev))
2381 continue;
2382
2383 devid = get_device_id(&dev->dev);
2384
2385 dma_dom = dma_ops_domain_alloc();
2386 if (!dma_dom)
2387 continue;
2388 init_unity_mappings_for_device(dma_dom, devid);
2389 dma_dom->target_dev = devid;
2390
2391 attach_device(&dev->dev, &dma_dom->domain);
2392
2393 list_add_tail(&dma_dom->list, &iommu_pd_list);
2394 }
2395}
2396
2397static struct dma_map_ops amd_iommu_dma_ops = {
2398 .alloc_coherent = alloc_coherent,
2399 .free_coherent = free_coherent,
2400 .map_page = map_page,
2401 .unmap_page = unmap_page,
2402 .map_sg = map_sg,
2403 .unmap_sg = unmap_sg,
2404 .dma_supported = amd_iommu_dma_supported,
2405};
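/*
 * Illustrative only (added, not part of the original file): once a
 * device's archdata.dma_ops points at amd_iommu_dma_ops (see
 * device_dma_ops_init() below), an ordinary DMA-API call from a driver,
 * for example
 *
 *	dma_addr_t dma = dma_map_page(&pdev->dev, page, 0, PAGE_SIZE,
 *				      DMA_TO_DEVICE);
 *
 * is dispatched through get_dma_ops() to the map_page() callback above.
 */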
2406
2407static unsigned device_dma_ops_init(void)
2408{
2409 struct pci_dev *pdev = NULL;
2410 unsigned unhandled = 0;
2411
2412 for_each_pci_dev(pdev) {
2413 if (!check_device(&pdev->dev)) {
2414 unhandled += 1;
2415 continue;
2416 }
2417
2418 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
2419 }
2420
2421 return unhandled;
2422}
2423
2424/*
2425 * The function which hooks the AMD IOMMU driver into dma_ops.
2426 */
2427
2428void __init amd_iommu_init_api(void)
2429{
2430 register_iommu(&amd_iommu_ops);
2431}
2432
2433int __init amd_iommu_init_dma_ops(void)
2434{
2435 struct amd_iommu *iommu;
2436 int ret, unhandled;
2437
2438 /*
2439 * first allocate a default protection domain for every IOMMU we
2440 * found in the system. Devices not assigned to any other
2441 * protection domain will be assigned to the default one.
2442 */
2443 for_each_iommu(iommu) {
2444 iommu->default_dom = dma_ops_domain_alloc();
2445 if (iommu->default_dom == NULL)
2446 return -ENOMEM;
2447 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
2448 ret = iommu_init_unity_mappings(iommu);
2449 if (ret)
2450 goto free_domains;
2451 }
2452
2453 /*
2454 * Pre-allocate the protection domains for each device.
2455 */
2456 prealloc_protection_domains();
2457
2458 iommu_detected = 1;
2459 swiotlb = 0;
2460
2461 /* Finally make our dma_ops visible to the drivers */
2462 unhandled = device_dma_ops_init();
2463 if (unhandled && max_pfn > MAX_DMA32_PFN) {
2464 /* There are unhandled devices - initialize swiotlb for them */
2465 swiotlb = 1;
2466 }
2467
2468 amd_iommu_stats_init();
2469
2470 return 0;
2471
2472free_domains:
2473
2474 for_each_iommu(iommu) {
2475 if (iommu->default_dom)
2476 dma_ops_domain_free(iommu->default_dom);
2477 }
2478
2479 return ret;
2480}
2481
2482/*****************************************************************************
2483 *
2484 * The following functions belong to the exported interface of AMD IOMMU
2485 *
2486 * This interface allows access to lower level functions of the IOMMU
2487 * like protection domain handling and assignment of devices to domains
2488 * which is not possible with the dma_ops interface.
2489 *
2490 *****************************************************************************/
2491
2492static void cleanup_domain(struct protection_domain *domain)
2493{
2494 struct iommu_dev_data *dev_data, *next;
2495 unsigned long flags;
2496
2497 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2498
2499 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2500 struct device *dev = dev_data->dev;
2501
2502 __detach_device(dev);
2503 atomic_set(&dev_data->bind, 0);
2504 }
2505
2506 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2507}
2508
2509static void protection_domain_free(struct protection_domain *domain)
2510{
2511 if (!domain)
2512 return;
2513
2514 del_domain_from_list(domain);
2515
2516 if (domain->id)
2517 domain_id_free(domain->id);
2518
2519 kfree(domain);
2520}
2521
2522static struct protection_domain *protection_domain_alloc(void)
2523{
2524 struct protection_domain *domain;
2525
2526 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2527 if (!domain)
2528 return NULL;
2529
2530 spin_lock_init(&domain->lock);
2531 mutex_init(&domain->api_lock);
2532 domain->id = domain_id_alloc();
2533 if (!domain->id)
2534 goto out_err;
2535 INIT_LIST_HEAD(&domain->dev_list);
2536
2537 add_domain_to_list(domain);
2538
2539 return domain;
2540
2541out_err:
2542 kfree(domain);
2543
2544 return NULL;
2545}
2546
2547static int amd_iommu_domain_init(struct iommu_domain *dom)
2548{
2549 struct protection_domain *domain;
2550
2551 domain = protection_domain_alloc();
2552 if (!domain)
2553 goto out_free;
2554
2555 domain->mode = PAGE_MODE_3_LEVEL;
2556 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2557 if (!domain->pt_root)
2558 goto out_free;
2559
2560 dom->priv = domain;
2561
2562 return 0;
2563
2564out_free:
2565 protection_domain_free(domain);
2566
2567 return -ENOMEM;
2568}
2569
2570static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2571{
2572 struct protection_domain *domain = dom->priv;
2573
2574 if (!domain)
2575 return;
2576
2577 if (domain->dev_cnt > 0)
2578 cleanup_domain(domain);
2579
2580 BUG_ON(domain->dev_cnt != 0);
2581
2582 free_pagetable(domain);
2583
2584 protection_domain_free(domain);
2585
2586 dom->priv = NULL;
2587}
2588
2589static void amd_iommu_detach_device(struct iommu_domain *dom,
2590 struct device *dev)
2591{
2592 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2593 struct amd_iommu *iommu;
2594 u16 devid;
2595
2596 if (!check_device(dev))
2597 return;
2598
2599 devid = get_device_id(dev);
2600
2601 if (dev_data->domain != NULL)
2602 detach_device(dev);
2603
2604 iommu = amd_iommu_rlookup_table[devid];
2605 if (!iommu)
2606 return;
2607
2608 device_flush_dte(dev);
2609 iommu_completion_wait(iommu);
2610}
2611
2612static int amd_iommu_attach_device(struct iommu_domain *dom,
2613 struct device *dev)
2614{
2615 struct protection_domain *domain = dom->priv;
2616 struct iommu_dev_data *dev_data;
2617 struct amd_iommu *iommu;
2618 int ret;
2619 u16 devid;
2620
2621 if (!check_device(dev))
2622 return -EINVAL;
2623
2624 dev_data = dev->archdata.iommu;
2625
2626 devid = get_device_id(dev);
2627
2628 iommu = amd_iommu_rlookup_table[devid];
2629 if (!iommu)
2630 return -EINVAL;
2631
2632 if (dev_data->domain)
2633 detach_device(dev);
2634
2635 ret = attach_device(dev, domain);
2636
2637 iommu_completion_wait(iommu);
2638
2639 return ret;
2640}
2641
2642static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
2643 phys_addr_t paddr, int gfp_order, int iommu_prot)
2644{
2645 unsigned long page_size = 0x1000UL << gfp_order;
2646 struct protection_domain *domain = dom->priv;
2647 int prot = 0;
2648 int ret;
2649
2650 if (iommu_prot & IOMMU_READ)
2651 prot |= IOMMU_PROT_IR;
2652 if (iommu_prot & IOMMU_WRITE)
2653 prot |= IOMMU_PROT_IW;
2654
2655 mutex_lock(&domain->api_lock);
2656 ret = iommu_map_page(domain, iova, paddr, prot, page_size);
2657 mutex_unlock(&domain->api_lock);
2658
2659 return ret;
2660}
2661
2662static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2663 int gfp_order)
2664{
2665 struct protection_domain *domain = dom->priv;
2666 unsigned long page_size, unmap_size;
2667
2668 page_size = 0x1000UL << gfp_order;
2669
2670 mutex_lock(&domain->api_lock);
2671 unmap_size = iommu_unmap_page(domain, iova, page_size);
2672 mutex_unlock(&domain->api_lock);
2673
2674 domain_flush_tlb_pde(domain);
2675
2676 return get_order(unmap_size);
2677}
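/*
 * Example (added for illustration): gfp_order expresses the mapping size
 * as a power-of-two number of 4 KB pages, so
 *
 *	gfp_order == 0  ->  page_size = 0x1000UL << 0 = 4 KB
 *	gfp_order == 9  ->  page_size = 0x1000UL << 9 = 2 MB
 *
 * the latter being one of the larger page sizes the IOMMU page tables can
 * map with a single PTE.
 */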
2678
2679static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2680 unsigned long iova)
2681{
2682 struct protection_domain *domain = dom->priv;
2683 unsigned long offset_mask;
2684 phys_addr_t paddr;
2685 u64 *pte, __pte;
2686
2687 pte = fetch_pte(domain, iova);
2688
2689 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2690 return 0;
2691
2692 if (PM_PTE_LEVEL(*pte) == 0)
2693 offset_mask = PAGE_SIZE - 1;
2694 else
2695 offset_mask = PTE_PAGE_SIZE(*pte) - 1;
2696
2697 __pte = *pte & PM_ADDR_MASK;
2698 paddr = (__pte & ~offset_mask) | (iova & offset_mask);
2699
2700 return paddr;
2701}
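/*
 * Worked example (illustrative, added): if fetch_pte() returns a
 * hypothetical 2 MB PTE (PTE_PAGE_SIZE(*pte) == 0x200000) pointing at
 * physical address 0x40000000, then for iova == 0x00123456:
 *
 *	offset_mask = 0x200000 - 1 = 0x1fffff
 *	paddr       = (0x40000000 & ~0x1fffff) | (0x123456 & 0x1fffff)
 *	            = 0x40123456
 */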
2702
2703static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
2704 unsigned long cap)
2705{
2706 switch (cap) {
2707 case IOMMU_CAP_CACHE_COHERENCY:
2708 return 1;
2709 }
2710
2711 return 0;
2712}
2713
2714static struct iommu_ops amd_iommu_ops = {
2715 .domain_init = amd_iommu_domain_init,
2716 .domain_destroy = amd_iommu_domain_destroy,
2717 .attach_dev = amd_iommu_attach_device,
2718 .detach_dev = amd_iommu_detach_device,
2719 .map = amd_iommu_map,
2720 .unmap = amd_iommu_unmap,
2721 .iova_to_phys = amd_iommu_iova_to_phys,
2722 .domain_has_cap = amd_iommu_domain_has_cap,
2723};
2724
2725/*****************************************************************************
2726 *
2727 * The next functions do a basic initialization of the IOMMU for pass through
2728 * mode
2729 *
2730 * In passthrough mode the IOMMU is initialized and enabled but not used for
2731 * DMA-API translation.
2732 *
2733 *****************************************************************************/
2734
2735int __init amd_iommu_init_passthrough(void)
2736{
2737 struct amd_iommu *iommu;
2738 struct pci_dev *dev = NULL;
2739 u16 devid;
2740
2741 /* allocate passthrough domain */
2742 pt_domain = protection_domain_alloc();
2743 if (!pt_domain)
2744 return -ENOMEM;
2745
2746 pt_domain->mode |= PAGE_MODE_NONE;
2747
2748 for_each_pci_dev(dev) {
2749 if (!check_device(&dev->dev))
2750 continue;
2751
2752 devid = get_device_id(&dev->dev);
2753
2754 iommu = amd_iommu_rlookup_table[devid];
2755 if (!iommu)
2756 continue;
2757
2758 attach_device(&dev->dev, pt_domain);
2759 }
2760
2761 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
2762
2763 return 0;
2764}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index bfc8453bd98..00000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1572 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/acpi.h>
22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
29#include <asm/amd_iommu_types.h>
30#include <asm/amd_iommu.h>
31#include <asm/iommu.h>
32#include <asm/gart.h>
33#include <asm/x86_init.h>
34#include <asm/iommu_table.h>
35/*
36 * definitions for the ACPI scanning code
37 */
38#define IVRS_HEADER_LENGTH 48
39
40#define ACPI_IVHD_TYPE 0x10
41#define ACPI_IVMD_TYPE_ALL 0x20
42#define ACPI_IVMD_TYPE 0x21
43#define ACPI_IVMD_TYPE_RANGE 0x22
44
45#define IVHD_DEV_ALL 0x01
46#define IVHD_DEV_SELECT 0x02
47#define IVHD_DEV_SELECT_RANGE_START 0x03
48#define IVHD_DEV_RANGE_END 0x04
49#define IVHD_DEV_ALIAS 0x42
50#define IVHD_DEV_ALIAS_RANGE 0x43
51#define IVHD_DEV_EXT_SELECT 0x46
52#define IVHD_DEV_EXT_SELECT_RANGE 0x47
53
54#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
55#define IVHD_FLAG_PASSPW_EN_MASK 0x02
56#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
57#define IVHD_FLAG_ISOC_EN_MASK 0x08
58
59#define IVMD_FLAG_EXCL_RANGE 0x08
60#define IVMD_FLAG_UNITY_MAP 0x01
61
62#define ACPI_DEVFLAG_INITPASS 0x01
63#define ACPI_DEVFLAG_EXTINT 0x02
64#define ACPI_DEVFLAG_NMI 0x04
65#define ACPI_DEVFLAG_SYSMGT1 0x10
66#define ACPI_DEVFLAG_SYSMGT2 0x20
67#define ACPI_DEVFLAG_LINT0 0x40
68#define ACPI_DEVFLAG_LINT1 0x80
69#define ACPI_DEVFLAG_ATSDIS 0x10000000
70
71/*
72 * ACPI table definitions
73 *
74 * These data structures are laid over the table to parse the important values
75 * out of it.
76 */
77
78/*
79 * structure describing one IOMMU in the ACPI table. Typically followed by one
80 * or more ivhd_entries.
81 */
82struct ivhd_header {
83 u8 type;
84 u8 flags;
85 u16 length;
86 u16 devid;
87 u16 cap_ptr;
88 u64 mmio_phys;
89 u16 pci_seg;
90 u16 info;
91 u32 reserved;
92} __attribute__((packed));
93
94/*
95 * A device entry describing which devices a specific IOMMU translates and
96 * which requestor ids they use.
97 */
98struct ivhd_entry {
99 u8 type;
100 u16 devid;
101 u8 flags;
102 u32 ext;
103} __attribute__((packed));
104
105/*
106 * An AMD IOMMU memory definition structure. It defines things like exclusion
107 * ranges for devices and regions that should be unity mapped.
108 */
109struct ivmd_header {
110 u8 type;
111 u8 flags;
112 u16 length;
113 u16 devid;
114 u16 aux;
115 u64 resv;
116 u64 range_start;
117 u64 range_length;
118} __attribute__((packed));
119
120bool amd_iommu_dump;
121
122static int __initdata amd_iommu_detected;
123static bool __initdata amd_iommu_disabled;
124
125u16 amd_iommu_last_bdf; /* largest PCI device id we have
126 to handle */
127LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
128 we find in ACPI */
129bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
130
131LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
132 system */
133
134/* Array to assign indices to IOMMUs */
135struct amd_iommu *amd_iommus[MAX_IOMMUS];
136int amd_iommus_present;
137
138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
141
142/*
143 * The ACPI table parsing functions set this variable on an error
144 */
145static int __initdata amd_iommu_init_err;
146
147/*
148 * List of protection domains - used during resume
149 */
150LIST_HEAD(amd_iommu_pd_list);
151spinlock_t amd_iommu_pd_lock;
152
153/*
154 * Pointer to the device table which is shared by all AMD IOMMUs
155 * it is indexed by the PCI device id or the HT unit id and contains
156 * information about the domain the device belongs to as well as the
157 * page table root pointer.
158 */
159struct dev_table_entry *amd_iommu_dev_table;
160
161/*
162 * The alias table is a driver specific data structure which contains the
163 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
164 * More than one device can share the same requestor id.
165 */
166u16 *amd_iommu_alias_table;
167
168/*
169 * The rlookup table is used to find the IOMMU which is responsible
170 * for a specific device. It is also indexed by the PCI device id.
171 */
172struct amd_iommu **amd_iommu_rlookup_table;
173
174/*
175 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
176 * to know which ones are already in use.
177 */
178unsigned long *amd_iommu_pd_alloc_bitmap;
179
180static u32 dev_table_size; /* size of the device table */
181static u32 alias_table_size; /* size of the alias table */
182static u32 rlookup_table_size; /* size of the rlookup table */
183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
190static inline void update_last_devid(u16 devid)
191{
192 if (devid > amd_iommu_last_bdf)
193 amd_iommu_last_bdf = devid;
194}
195
196static inline unsigned long tbl_size(int entry_size)
197{
198 unsigned shift = PAGE_SHIFT +
199 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
200
201 return 1UL << shift;
202}
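/*
 * Worked example (added, not from the original source), assuming 4 KB
 * pages and a hypothetical entry_size of 32 bytes: with
 * amd_iommu_last_bdf == 0xffff,
 *
 *	(0xffff + 1) * 32 = 2 MB  ->  get_order(2 MB) = 9
 *	shift = PAGE_SHIFT + 9 = 21  ->  tbl_size() = 1UL << 21 = 2 MB
 *
 * i.e. each table is rounded up to a power-of-two number of pages.
 */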
203
204/* Access to l1 and l2 indexed register spaces */
205
206static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
207{
208 u32 val;
209
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
211 pci_read_config_dword(iommu->dev, 0xfc, &val);
212 return val;
213}
214
215static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
216{
217 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
218 pci_write_config_dword(iommu->dev, 0xfc, val);
219 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
220}
221
222static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
223{
224 u32 val;
225
226 pci_write_config_dword(iommu->dev, 0xf0, address);
227 pci_read_config_dword(iommu->dev, 0xf4, &val);
228 return val;
229}
230
231static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
232{
233 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
234 pci_write_config_dword(iommu->dev, 0xf4, val);
235}
236
237/****************************************************************************
238 *
239 * AMD IOMMU MMIO register space handling functions
240 *
241 * These functions are used to program the IOMMU device registers in
242 * MMIO space required for that driver.
243 *
244 ****************************************************************************/
245
246/*
247 * This function sets the exclusion range in the IOMMU. DMA accesses to the
248 * exclusion range are passed through untranslated
249 */
250static void iommu_set_exclusion_range(struct amd_iommu *iommu)
251{
252 u64 start = iommu->exclusion_start & PAGE_MASK;
253 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
254 u64 entry;
255
256 if (!iommu->exclusion_start)
257 return;
258
259 entry = start | MMIO_EXCL_ENABLE_MASK;
260 memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
261 &entry, sizeof(entry));
262
263 entry = limit;
264 memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
265 &entry, sizeof(entry));
266}
267
268/* Programs the physical address of the device table into the IOMMU hardware */
269static void __init iommu_set_device_table(struct amd_iommu *iommu)
270{
271 u64 entry;
272
273 BUG_ON(iommu->mmio_base == NULL);
274
275 entry = virt_to_phys(amd_iommu_dev_table);
276 entry |= (dev_table_size >> 12) - 1;
277 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
278 &entry, sizeof(entry));
279}
280
281/* Generic functions to enable/disable certain features of the IOMMU. */
282static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
283{
284 u32 ctrl;
285
286 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
287 ctrl |= (1 << bit);
288 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
289}
290
291static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
292{
293 u32 ctrl;
294
295 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
296 ctrl &= ~(1 << bit);
297 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
298}
299
300/* Function to enable the hardware */
301static void iommu_enable(struct amd_iommu *iommu)
302{
303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
321}
322
323static void iommu_disable(struct amd_iommu *iommu)
324{
325 /* Disable command buffer */
326 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
327
328 /* Disable event logging and event interrupts */
329 iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
330 iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
331
332 /* Disable IOMMU hardware itself */
333 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
334}
335
336/*
337 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
338 * the system has one.
339 */
340static u8 * __init iommu_map_mmio_space(u64 address)
341{
342 u8 *ret;
343
344 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
345 pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
346 address);
347 pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
348 return NULL;
349 }
350
351 ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
352 if (ret != NULL)
353 return ret;
354
355 release_mem_region(address, MMIO_REGION_LENGTH);
356
357 return NULL;
358}
359
360static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
361{
362 if (iommu->mmio_base)
363 iounmap(iommu->mmio_base);
364 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
365}
366
367/****************************************************************************
368 *
369 * The functions below belong to the first pass of AMD IOMMU ACPI table
370 * parsing. In this pass we try to find out the highest device id this
371 * code has to handle. Based on this information, the size of the shared data
372 * structures is determined later.
373 *
374 ****************************************************************************/
375
376/*
377 * This function calculates the length of a given IVHD entry
378 */
379static inline int ivhd_entry_length(u8 *ivhd)
380{
381 return 0x04 << (*ivhd >> 6);
382}
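/*
 * Example (illustrative, added): the top two bits of the entry type encode
 * the entry length as 4 << (type >> 6) bytes, so
 *
 *	IVHD_DEV_SELECT (0x02):  0x02 >> 6 = 0  ->  4 byte entry
 *	IVHD_DEV_ALIAS  (0x42):  0x42 >> 6 = 1  ->  8 byte entry
 */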
383
384/*
385 * This function reads the last device id the IOMMU has to handle from the PCI
386 * capability header for this IOMMU
387 */
388static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
389{
390 u32 cap;
391
392 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
393 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
394
395 return 0;
396}
397
398/*
399 * After reading the highest device id from the IOMMU PCI capability header
400 * this function checks whether there is a higher device id defined in the ACPI table
401 */
402static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
403{
404 u8 *p = (void *)h, *end = (void *)h;
405 struct ivhd_entry *dev;
406
407 p += sizeof(*h);
408 end += h->length;
409
410 find_last_devid_on_pci(PCI_BUS(h->devid),
411 PCI_SLOT(h->devid),
412 PCI_FUNC(h->devid),
413 h->cap_ptr);
414
415 while (p < end) {
416 dev = (struct ivhd_entry *)p;
417 switch (dev->type) {
418 case IVHD_DEV_SELECT:
419 case IVHD_DEV_RANGE_END:
420 case IVHD_DEV_ALIAS:
421 case IVHD_DEV_EXT_SELECT:
422 /* all the above subfield types refer to device ids */
423 update_last_devid(dev->devid);
424 break;
425 default:
426 break;
427 }
428 p += ivhd_entry_length(p);
429 }
430
431 WARN_ON(p != end);
432
433 return 0;
434}
435
436/*
437 * Iterate over all IVHD entries in the ACPI table and find the highest device
438 * id which we need to handle. This is the first of three functions which parse
439 * the ACPI table. So we check the checksum here.
440 */
441static int __init find_last_devid_acpi(struct acpi_table_header *table)
442{
443 int i;
444 u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
445 struct ivhd_header *h;
446
447 /*
448 * Validate checksum here so we don't need to do it when
449 * we actually parse the table
450 */
451 for (i = 0; i < table->length; ++i)
452 checksum += p[i];
453 if (checksum != 0) {
454 /* ACPI table corrupt */
455 amd_iommu_init_err = -ENODEV;
456 return 0;
457 }
458
459 p += IVRS_HEADER_LENGTH;
460
461 end += table->length;
462 while (p < end) {
463 h = (struct ivhd_header *)p;
464 switch (h->type) {
465 case ACPI_IVHD_TYPE:
466 find_last_devid_from_ivhd(h);
467 break;
468 default:
469 break;
470 }
471 p += h->length;
472 }
473 WARN_ON(p != end);
474
475 return 0;
476}
477
478/****************************************************************************
479 *
480 * The following functions belong to the code path which parses the ACPI table
481 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
482 * data structures, initialize the device/alias/rlookup table and also
483 * basically initialize the hardware.
484 *
485 ****************************************************************************/
486
487/*
488 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
489 * write commands to that buffer later and the IOMMU will execute them
490 * asynchronously
491 */
492static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
493{
494 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
495 get_order(CMD_BUFFER_SIZE));
496
497 if (cmd_buf == NULL)
498 return NULL;
499
500 iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
501
502 return cmd_buf;
503}
504
505/*
506 * This function resets the command buffer if the IOMMU stopped fetching
507 * commands from it.
508 */
509void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
510{
511 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
512
513 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
514 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
515
516 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
517}
518
519/*
520 * This function writes the command buffer address to the hardware and
521 * enables it.
522 */
523static void iommu_enable_command_buffer(struct amd_iommu *iommu)
524{
525 u64 entry;
526
527 BUG_ON(iommu->cmd_buf == NULL);
528
529 entry = (u64)virt_to_phys(iommu->cmd_buf);
530 entry |= MMIO_CMD_SIZE_512;
531
532 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
533 &entry, sizeof(entry));
534
535 amd_iommu_reset_cmd_buffer(iommu);
536 iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
537}
538
539static void __init free_command_buffer(struct amd_iommu *iommu)
540{
541 free_pages((unsigned long)iommu->cmd_buf,
542 get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
543}
544
545/* allocates the memory where the IOMMU will log its events to */
546static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
547{
548 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
549 get_order(EVT_BUFFER_SIZE));
550
551 if (iommu->evt_buf == NULL)
552 return NULL;
553
554 iommu->evt_buf_size = EVT_BUFFER_SIZE;
555
556 return iommu->evt_buf;
557}
558
559static void iommu_enable_event_buffer(struct amd_iommu *iommu)
560{
561 u64 entry;
562
563 BUG_ON(iommu->evt_buf == NULL);
564
565 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
566
567 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
568 &entry, sizeof(entry));
569
570 /* set head and tail to zero manually */
571 writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
572 writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
573
574 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
575}
576
577static void __init free_event_buffer(struct amd_iommu *iommu)
578{
579 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
580}
581
582/* sets a specific bit in the device table entry. */
583static void set_dev_entry_bit(u16 devid, u8 bit)
584{
585 int i = (bit >> 5) & 0x07;
586 int _bit = bit & 0x1f;
587
588 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
589}
590
591static int get_dev_entry_bit(u16 devid, u8 bit)
592{
593 int i = (bit >> 5) & 0x07;
594 int _bit = bit & 0x1f;
595
596 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
597}
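/*
 * Worked example (added for illustration): for a hypothetical bit number
 * 0x67,
 *
 *	i    = (0x67 >> 5) & 0x07 = 3
 *	_bit = 0x67 & 0x1f        = 7
 *
 * so set_dev_entry_bit() sets bit 7 of amd_iommu_dev_table[devid].data[3];
 * the device table entry is addressed as an array of 32-bit words.
 */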
598
599
600void amd_iommu_apply_erratum_63(u16 devid)
601{
602 int sysmgt;
603
604 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
605 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
606
607 if (sysmgt == 0x01)
608 set_dev_entry_bit(devid, DEV_ENTRY_IW);
609}
610
611/* Writes the specific IOMMU for a device into the rlookup table */
612static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
613{
614 amd_iommu_rlookup_table[devid] = iommu;
615}
616
617/*
618 * This function takes the device specific flags read from the ACPI
619 * table and sets up the device table entry with that information
620 */
621static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
622 u16 devid, u32 flags, u32 ext_flags)
623{
624 if (flags & ACPI_DEVFLAG_INITPASS)
625 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
626 if (flags & ACPI_DEVFLAG_EXTINT)
627 set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
628 if (flags & ACPI_DEVFLAG_NMI)
629 set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
630 if (flags & ACPI_DEVFLAG_SYSMGT1)
631 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
632 if (flags & ACPI_DEVFLAG_SYSMGT2)
633 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
634 if (flags & ACPI_DEVFLAG_LINT0)
635 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
636 if (flags & ACPI_DEVFLAG_LINT1)
637 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
638
639 amd_iommu_apply_erratum_63(devid);
640
641 set_iommu_for_device(iommu, devid);
642}
643
644/*
645 * Reads the device exclusion range from ACPI and initializes the IOMMU with
646 * it
647 */
648static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
649{
650 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
651
652 if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
653 return;
654
655 if (iommu) {
656 /*
657 * We can only configure exclusion ranges per IOMMU, not
658 * per device. But we can enable the exclusion range per
659 * device. This is done here
660 */
661 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
662 iommu->exclusion_start = m->range_start;
663 iommu->exclusion_length = m->range_length;
664 }
665}
666
667/*
668 * This function reads some important data from the IOMMU PCI space and
669 * initializes the driver data structure with it. It reads the hardware
670 * capabilities and the first/last device entries
671 */
672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
673{
674 int cap_ptr = iommu->cap_ptr;
675 u32 range, misc, low, high;
676 int i, j;
677
678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
679 &iommu->cap);
680 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
681 &range);
682 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
683 &misc);
684
685 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
686 MMIO_GET_FD(range));
687 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
688 MMIO_GET_LD(range));
689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
690
691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
692 amd_iommu_iotlb_sup = false;
693
694 /* read extended feature bits */
695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
700 if (!is_rd890_iommu(iommu->dev))
701 return;
702
703 /*
704 * Some rd890 systems may not be fully reconfigured by the BIOS, so
705 * it's necessary for us to store this information so it can be
706 * reprogrammed on resume
707 */
708
709 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
710 &iommu->stored_addr_lo);
711 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
712 &iommu->stored_addr_hi);
713
714 /* Low bit locks writes to configuration space */
715 iommu->stored_addr_lo &= ~1;
716
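	/*
	 * Save the L1 (6 banks x 0x12 registers) and L2 (0x83 registers)
	 * indirect register state so that iommu_apply_resume_quirks() can
	 * restore it after a suspend/resume cycle.
	 */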
717 for (i = 0; i < 6; i++)
718 for (j = 0; j < 0x12; j++)
719 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
720
721 for (i = 0; i < 0x83; i++)
722 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
723}
724
725/*
726 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
727 * initializes the hardware and our data structures with it.
728 */
729static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
730 struct ivhd_header *h)
731{
732 u8 *p = (u8 *)h;
733 u8 *end = p, flags = 0;
734 u16 devid = 0, devid_start = 0, devid_to = 0;
735 u32 dev_i, ext_flags = 0;
736 bool alias = false;
737 struct ivhd_entry *e;
738
739 /*
740 * First save the recommended feature enable bits from ACPI
741 */
742 iommu->acpi_flags = h->flags;
743
744 /*
745 * Done. Now parse the device entries
746 */
747 p += sizeof(struct ivhd_header);
748 end += h->length;
749
750
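	/*
	 * The *_RANGE_START entries only record devid_start/flags/ext_flags
	 * (and devid_to for alias ranges); the settings are applied to every
	 * device in [devid_start, devid] once the matching DEV_RANGE_END
	 * entry is reached.
	 */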
751 while (p < end) {
752 e = (struct ivhd_entry *)p;
753 switch (e->type) {
754 case IVHD_DEV_ALL:
755
756 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
757 " last device %02x:%02x.%x flags: %02x\n",
758 PCI_BUS(iommu->first_device),
759 PCI_SLOT(iommu->first_device),
760 PCI_FUNC(iommu->first_device),
761 PCI_BUS(iommu->last_device),
762 PCI_SLOT(iommu->last_device),
763 PCI_FUNC(iommu->last_device),
764 e->flags);
765
766 for (dev_i = iommu->first_device;
767 dev_i <= iommu->last_device; ++dev_i)
768 set_dev_entry_from_acpi(iommu, dev_i,
769 e->flags, 0);
770 break;
771 case IVHD_DEV_SELECT:
772
773 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
774 "flags: %02x\n",
775 PCI_BUS(e->devid),
776 PCI_SLOT(e->devid),
777 PCI_FUNC(e->devid),
778 e->flags);
779
780 devid = e->devid;
781 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
782 break;
783 case IVHD_DEV_SELECT_RANGE_START:
784
785 DUMP_printk(" DEV_SELECT_RANGE_START\t "
786 "devid: %02x:%02x.%x flags: %02x\n",
787 PCI_BUS(e->devid),
788 PCI_SLOT(e->devid),
789 PCI_FUNC(e->devid),
790 e->flags);
791
792 devid_start = e->devid;
793 flags = e->flags;
794 ext_flags = 0;
795 alias = false;
796 break;
797 case IVHD_DEV_ALIAS:
798
799 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
800 "flags: %02x devid_to: %02x:%02x.%x\n",
801 PCI_BUS(e->devid),
802 PCI_SLOT(e->devid),
803 PCI_FUNC(e->devid),
804 e->flags,
805 PCI_BUS(e->ext >> 8),
806 PCI_SLOT(e->ext >> 8),
807 PCI_FUNC(e->ext >> 8));
808
809 devid = e->devid;
810 devid_to = e->ext >> 8;
811			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
812 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
813 amd_iommu_alias_table[devid] = devid_to;
814 break;
815 case IVHD_DEV_ALIAS_RANGE:
816
817 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
818 "devid: %02x:%02x.%x flags: %02x "
819 "devid_to: %02x:%02x.%x\n",
820 PCI_BUS(e->devid),
821 PCI_SLOT(e->devid),
822 PCI_FUNC(e->devid),
823 e->flags,
824 PCI_BUS(e->ext >> 8),
825 PCI_SLOT(e->ext >> 8),
826 PCI_FUNC(e->ext >> 8));
827
828 devid_start = e->devid;
829 flags = e->flags;
830 devid_to = e->ext >> 8;
831 ext_flags = 0;
832 alias = true;
833 break;
834 case IVHD_DEV_EXT_SELECT:
835
836 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
837 "flags: %02x ext: %08x\n",
838 PCI_BUS(e->devid),
839 PCI_SLOT(e->devid),
840 PCI_FUNC(e->devid),
841 e->flags, e->ext);
842
843 devid = e->devid;
844 set_dev_entry_from_acpi(iommu, devid, e->flags,
845 e->ext);
846 break;
847 case IVHD_DEV_EXT_SELECT_RANGE:
848
849 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
850 "%02x:%02x.%x flags: %02x ext: %08x\n",
851 PCI_BUS(e->devid),
852 PCI_SLOT(e->devid),
853 PCI_FUNC(e->devid),
854 e->flags, e->ext);
855
856 devid_start = e->devid;
857 flags = e->flags;
858 ext_flags = e->ext;
859 alias = false;
860 break;
861 case IVHD_DEV_RANGE_END:
862
863 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
864 PCI_BUS(e->devid),
865 PCI_SLOT(e->devid),
866 PCI_FUNC(e->devid));
867
868 devid = e->devid;
869 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
870 if (alias) {
871 amd_iommu_alias_table[dev_i] = devid_to;
872 set_dev_entry_from_acpi(iommu,
873 devid_to, flags, ext_flags);
874 }
875 set_dev_entry_from_acpi(iommu, dev_i,
876 flags, ext_flags);
877 }
878 break;
879 default:
880 break;
881 }
882
883 p += ivhd_entry_length(p);
884 }
885}
886
887/* Initializes the device->iommu mapping for the driver */
888static int __init init_iommu_devices(struct amd_iommu *iommu)
889{
890 u32 i;
891
892 for (i = iommu->first_device; i <= iommu->last_device; ++i)
893 set_iommu_for_device(iommu, i);
894
895 return 0;
896}
897
898static void __init free_iommu_one(struct amd_iommu *iommu)
899{
900 free_command_buffer(iommu);
901 free_event_buffer(iommu);
902 iommu_unmap_mmio_space(iommu);
903}
904
905static void __init free_iommu_all(void)
906{
907 struct amd_iommu *iommu, *next;
908
909 for_each_iommu_safe(iommu, next) {
910 list_del(&iommu->list);
911 free_iommu_one(iommu);
912 kfree(iommu);
913 }
914}
915
916/*
917 * This function glues the initialization of one IOMMU together. It
918 * also allocates the command buffer and programs the hardware. It
919 * does NOT enable the IOMMU; that is done afterwards.
920 */
921static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
922{
923 spin_lock_init(&iommu->lock);
924
925 /* Add IOMMU to internal data structures */
926 list_add_tail(&iommu->list, &amd_iommu_list);
927 iommu->index = amd_iommus_present++;
928
929 if (unlikely(iommu->index >= MAX_IOMMUS)) {
930 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
931 return -ENOSYS;
932 }
933
934 /* Index is fine - add IOMMU to the array */
935 amd_iommus[iommu->index] = iommu;
936
937 /*
938 * Copy data from ACPI table entry to the iommu struct
939 */
940 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
941 if (!iommu->dev)
942 return 1;
943
944 iommu->cap_ptr = h->cap_ptr;
945 iommu->pci_seg = h->pci_seg;
946 iommu->mmio_phys = h->mmio_phys;
947 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
948 if (!iommu->mmio_base)
949 return -ENOMEM;
950
951 iommu->cmd_buf = alloc_command_buffer(iommu);
952 if (!iommu->cmd_buf)
953 return -ENOMEM;
954
955 iommu->evt_buf = alloc_event_buffer(iommu);
956 if (!iommu->evt_buf)
957 return -ENOMEM;
958
959 iommu->int_enabled = false;
960
961 init_iommu_from_pci(iommu);
962 init_iommu_from_acpi(iommu, h);
963 init_iommu_devices(iommu);
964
965 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
966 amd_iommu_np_cache = true;
967
968 return pci_enable_device(iommu->dev);
969}
970
971/*
972 * Iterates over all IOMMU entries in the ACPI table, allocates the
973 * IOMMU structure and initializes it with init_iommu_one()
974 */
975static int __init init_iommu_all(struct acpi_table_header *table)
976{
977 u8 *p = (u8 *)table, *end = (u8 *)table;
978 struct ivhd_header *h;
979 struct amd_iommu *iommu;
980 int ret;
981
982 end += table->length;
983 p += IVRS_HEADER_LENGTH;
984
985 while (p < end) {
986 h = (struct ivhd_header *)p;
987 switch (*p) {
988 case ACPI_IVHD_TYPE:
989
990 DUMP_printk("device: %02x:%02x.%01x cap: %04x "
991 "seg: %d flags: %01x info %04x\n",
992 PCI_BUS(h->devid), PCI_SLOT(h->devid),
993 PCI_FUNC(h->devid), h->cap_ptr,
994 h->pci_seg, h->flags, h->info);
995 DUMP_printk(" mmio-addr: %016llx\n",
996 h->mmio_phys);
997
998 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
999 if (iommu == NULL) {
1000 amd_iommu_init_err = -ENOMEM;
1001 return 0;
1002 }
1003
1004 ret = init_iommu_one(iommu, h);
1005 if (ret) {
1006 amd_iommu_init_err = ret;
1007 return 0;
1008 }
1009 break;
1010 default:
1011 break;
1012 }
1013 p += h->length;
1014
1015 }
1016 WARN_ON(p != end);
1017
1018 return 0;
1019}
1020
1021/****************************************************************************
1022 *
1023 * The following functions initialize the MSI interrupts for all IOMMUs
1024 * in the system. It's a bit challenging because there could be multiple
1025 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
1026 * pci_dev.
1027 *
1028 ****************************************************************************/
1029
1030static int iommu_setup_msi(struct amd_iommu *iommu)
1031{
1032 int r;
1033
1034 if (pci_enable_msi(iommu->dev))
1035 return 1;
1036
1037 r = request_threaded_irq(iommu->dev->irq,
1038 amd_iommu_int_handler,
1039 amd_iommu_int_thread,
1040 0, "AMD-Vi",
1041 iommu->dev);
1042
1043 if (r) {
1044 pci_disable_msi(iommu->dev);
1045 return 1;
1046 }
1047
1048 iommu->int_enabled = true;
1049 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
1050
1051 return 0;
1052}
1053
1054static int iommu_init_msi(struct amd_iommu *iommu)
1055{
1056 if (iommu->int_enabled)
1057 return 0;
1058
1059 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
1060 return iommu_setup_msi(iommu);
1061
1062 return 1;
1063}
1064
1065/****************************************************************************
1066 *
1067 * The next functions belong to the third pass of parsing the ACPI
1068 * table. In this last pass the memory mapping requirements are
1069 * gathered (like exclusion and unity mapping ranges).
1070 *
1071 ****************************************************************************/
1072
1073static void __init free_unity_maps(void)
1074{
1075 struct unity_map_entry *entry, *next;
1076
1077 list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
1078 list_del(&entry->list);
1079 kfree(entry);
1080 }
1081}
1082
1083/* called when we find an exclusion range definition in ACPI */
1084static int __init init_exclusion_range(struct ivmd_header *m)
1085{
1086 int i;
1087
1088 switch (m->type) {
1089 case ACPI_IVMD_TYPE:
1090 set_device_exclusion_range(m->devid, m);
1091 break;
1092 case ACPI_IVMD_TYPE_ALL:
1093 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1094 set_device_exclusion_range(i, m);
1095 break;
1096 case ACPI_IVMD_TYPE_RANGE:
1097 for (i = m->devid; i <= m->aux; ++i)
1098 set_device_exclusion_range(i, m);
1099 break;
1100 default:
1101 break;
1102 }
1103
1104 return 0;
1105}
1106
1107/* called for unity map ACPI definition */
1108static int __init init_unity_map_range(struct ivmd_header *m)
1109{
1110	struct unity_map_entry *e = NULL;
1111 char *s;
1112
1113 e = kzalloc(sizeof(*e), GFP_KERNEL);
1114 if (e == NULL)
1115 return -ENOMEM;
1116
1117 switch (m->type) {
1118 default:
1119 kfree(e);
1120 return 0;
1121 case ACPI_IVMD_TYPE:
1122		s = "IVMD_TYPE\t\t\t";
1123 e->devid_start = e->devid_end = m->devid;
1124 break;
1125 case ACPI_IVMD_TYPE_ALL:
1126 s = "IVMD_TYPE_ALL\t\t";
1127 e->devid_start = 0;
1128 e->devid_end = amd_iommu_last_bdf;
1129 break;
1130 case ACPI_IVMD_TYPE_RANGE:
1131 s = "IVMD_TYPE_RANGE\t\t";
1132 e->devid_start = m->devid;
1133 e->devid_end = m->aux;
1134 break;
1135 }
1136 e->address_start = PAGE_ALIGN(m->range_start);
1137 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
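	/*
	 * Bit 0 of the IVMD flags is the unity-map bit; shifting it out
	 * leaves the IR/IW bits as the IOMMU_PROT_IR/IOMMU_PROT_IW
	 * protection value, e.g. flags = 0x07 (unity + IR + IW) gives
	 * prot = 0x03.
	 */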
1138 e->prot = m->flags >> 1;
1139
1140 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
1141 " range_start: %016llx range_end: %016llx flags: %x\n", s,
1142 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
1143 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
1144 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
1145 e->address_start, e->address_end, m->flags);
1146
1147 list_add_tail(&e->list, &amd_iommu_unity_map);
1148
1149 return 0;
1150}
1151
1152/* iterates over all memory definitions we find in the ACPI table */
1153static int __init init_memory_definitions(struct acpi_table_header *table)
1154{
1155 u8 *p = (u8 *)table, *end = (u8 *)table;
1156 struct ivmd_header *m;
1157
1158 end += table->length;
1159 p += IVRS_HEADER_LENGTH;
1160
1161 while (p < end) {
1162 m = (struct ivmd_header *)p;
1163 if (m->flags & IVMD_FLAG_EXCL_RANGE)
1164 init_exclusion_range(m);
1165 else if (m->flags & IVMD_FLAG_UNITY_MAP)
1166 init_unity_map_range(m);
1167
1168 p += m->length;
1169 }
1170
1171 return 0;
1172}
1173
1174/*
1175 * Init the device table so that DMA access is not allowed for any
1176 * device and all page faults are suppressed
1177 */
1178static void init_device_table(void)
1179{
1180 u32 devid;
1181
1182 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
1183 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
1184 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
1185 }
1186}
1187
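/*
 * Translate the feature bits recommended by the IVHD header (saved in
 * iommu->acpi_flags by init_iommu_from_acpi()) into the corresponding
 * IOMMU control register bits.
 */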
1188static void iommu_init_flags(struct amd_iommu *iommu)
1189{
1190 iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
1191 iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
1192 iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
1193
1194 iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
1195 iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
1196 iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
1197
1198 iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
1199 iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
1200 iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
1201
1202 iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
1203 iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
1204 iommu_feature_disable(iommu, CONTROL_ISOC_EN);
1205
1206 /*
1207 * make IOMMU memory accesses cache coherent
1208 */
1209 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1210}
1211
1212static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
1213{
1214 int i, j;
1215 u32 ioc_feature_control;
1216 struct pci_dev *pdev = NULL;
1217
1218 /* RD890 BIOSes may not have completely reconfigured the iommu */
1219 if (!is_rd890_iommu(iommu->dev))
1220 return;
1221
1222 /*
1223 * First, we need to ensure that the iommu is enabled. This is
1224 * controlled by a register in the northbridge
1225 */
1226 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1227
1228 if (!pdev)
1229 return;
1230
1231 /* Select Northbridge indirect register 0x75 and enable writing */
1232 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1233 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1234
1235 /* Enable the iommu */
1236 if (!(ioc_feature_control & 0x1))
1237 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1238
1239 pci_dev_put(pdev);
1240
1241 /* Restore the iommu BAR */
1242 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1243 iommu->stored_addr_lo);
1244 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1245 iommu->stored_addr_hi);
1246
1247 /* Restore the l1 indirect regs for each of the 6 l1s */
1248 for (i = 0; i < 6; i++)
1249 for (j = 0; j < 0x12; j++)
1250 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1251
1252 /* Restore the l2 indirect regs */
1253 for (i = 0; i < 0x83; i++)
1254 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1255
1256 /* Lock PCI setup registers */
1257 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1258 iommu->stored_addr_lo | 1);
1259}
1260
1261/*
1262 * This function finally enables all IOMMUs found in the system after
1263 * they have been initialized
1264 */
1265static void enable_iommus(void)
1266{
1267 struct amd_iommu *iommu;
1268
1269 for_each_iommu(iommu) {
1270 iommu_disable(iommu);
1271 iommu_init_flags(iommu);
1272 iommu_set_device_table(iommu);
1273 iommu_enable_command_buffer(iommu);
1274 iommu_enable_event_buffer(iommu);
1275 iommu_set_exclusion_range(iommu);
1276 iommu_init_msi(iommu);
1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1279 }
1280}
1281
1282static void disable_iommus(void)
1283{
1284 struct amd_iommu *iommu;
1285
1286 for_each_iommu(iommu)
1287 iommu_disable(iommu);
1288}
1289
1290/*
1291 * Suspend/Resume support
1292 * The IOMMUs are disabled on suspend and re-programmed/re-enabled on resume.
1293 */
1294
1295static void amd_iommu_resume(void)
1296{
1297 struct amd_iommu *iommu;
1298
1299 for_each_iommu(iommu)
1300 iommu_apply_resume_quirks(iommu);
1301
1302 /* re-load the hardware */
1303 enable_iommus();
1304
1305 /*
1306 * we have to flush after the IOMMUs are enabled because a
1307 * disabled IOMMU will never execute the commands we send
1308 */
1309 for_each_iommu(iommu)
1310 iommu_flush_all_caches(iommu);
1311}
1312
1313static int amd_iommu_suspend(void)
1314{
1315 /* disable IOMMUs to go out of the way for BIOS */
1316 disable_iommus();
1317
1318 return 0;
1319}
1320
1321static struct syscore_ops amd_iommu_syscore_ops = {
1322 .suspend = amd_iommu_suspend,
1323 .resume = amd_iommu_resume,
1324};
1325
1326/*
1327 * This is the core init function for AMD IOMMU hardware in the system.
1328 * This function is called from the generic x86 DMA layer initialization
1329 * code.
1330 *
1331 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
1332 * three times:
1333 *
1334 * 1 pass) Find the highest PCI device id the driver has to handle.
1335 *         Based on this information the size of the data structures
1336 *         that need to be allocated is determined.
1337 *
1338 * 2 pass) Initialize the data structures just allocated with the
1339 * information in the ACPI table about available AMD IOMMUs
1340 * in the system. It also maps the PCI devices in the
1341 * system to specific IOMMUs
1342 *
1343 * 3 pass) After the basic data structures are allocated and
1344 * initialized we update them with information about memory
1345 * remapping requirements parsed out of the ACPI table in
1346 * this last pass.
1347 *
1348 * After that the hardware is initialized and ready to go. In the last
1349 * step we do some Linux specific things like registering the driver in
1350 * the dma_ops interface and initializing the suspend/resume support
1351 * functions. Finally we print some information about the AMD IOMMUs
1352 * and the driver state and enable the hardware.
1353 */
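/*
 * In the code below the three passes correspond to the three
 * acpi_table_parse("IVRS", ...) calls: find_last_devid_acpi(),
 * init_iommu_all() and init_memory_definitions().
 */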
1354static int __init amd_iommu_init(void)
1355{
1356 int i, ret = 0;
1357
1358 /*
1359 * First parse ACPI tables to find the largest Bus/Dev/Func
1360	 * we need to handle. Based on this information the shared data
1361	 * structures for the IOMMUs in the system will be allocated.
1362 */
1363 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1364 return -ENODEV;
1365
1366 ret = amd_iommu_init_err;
1367 if (ret)
1368 goto out;
1369
1370 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1371 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1372 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
1373
1374 ret = -ENOMEM;
1375
1376 /* Device table - directly used by all IOMMUs */
1377 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1378 get_order(dev_table_size));
1379 if (amd_iommu_dev_table == NULL)
1380 goto out;
1381
1382 /*
1383 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
1384 * IOMMU see for that device
1385 */
1386 amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
1387 get_order(alias_table_size));
1388 if (amd_iommu_alias_table == NULL)
1389 goto free;
1390
1391 /* IOMMU rlookup table - find the IOMMU for a specific device */
1392 amd_iommu_rlookup_table = (void *)__get_free_pages(
1393 GFP_KERNEL | __GFP_ZERO,
1394 get_order(rlookup_table_size));
1395 if (amd_iommu_rlookup_table == NULL)
1396 goto free;
1397
1398 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1399 GFP_KERNEL | __GFP_ZERO,
1400 get_order(MAX_DOMAIN_ID/8));
1401 if (amd_iommu_pd_alloc_bitmap == NULL)
1402 goto free;
1403
1404 /* init the device table */
1405 init_device_table();
1406
1407 /*
1408	 * let all alias entries point to themselves
1409 */
1410 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1411 amd_iommu_alias_table[i] = i;
1412
1413 /*
1414	 * never allocate domain 0 because it's used as the non-allocated and
1415 * error value placeholder
1416 */
1417 amd_iommu_pd_alloc_bitmap[0] = 1;
1418
1419 spin_lock_init(&amd_iommu_pd_lock);
1420
1421 /*
1422	 * now the data structures are allocated and basically initialized;
1423	 * start the real ACPI table scan
1424 */
1425 ret = -ENODEV;
1426 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1427 goto free;
1428
1429 if (amd_iommu_init_err) {
1430 ret = amd_iommu_init_err;
1431 goto free;
1432 }
1433
1434 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1435 goto free;
1436
1437 if (amd_iommu_init_err) {
1438 ret = amd_iommu_init_err;
1439 goto free;
1440 }
1441
1442 ret = amd_iommu_init_devices();
1443 if (ret)
1444 goto free;
1445
1446 enable_iommus();
1447
1448 if (iommu_pass_through)
1449 ret = amd_iommu_init_passthrough();
1450 else
1451 ret = amd_iommu_init_dma_ops();
1452
1453 if (ret)
1454 goto free_disable;
1455
1456 amd_iommu_init_api();
1457
1458 amd_iommu_init_notifier();
1459
1460 register_syscore_ops(&amd_iommu_syscore_ops);
1461
1462 if (iommu_pass_through)
1463 goto out;
1464
1465 if (amd_iommu_unmap_flush)
1466 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1467 else
1468 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1469
1470 x86_platform.iommu_shutdown = disable_iommus;
1471out:
1472 return ret;
1473
1474free_disable:
1475 disable_iommus();
1476
1477free:
1478 amd_iommu_uninit_devices();
1479
1480 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1481 get_order(MAX_DOMAIN_ID/8));
1482
1483 free_pages((unsigned long)amd_iommu_rlookup_table,
1484 get_order(rlookup_table_size));
1485
1486 free_pages((unsigned long)amd_iommu_alias_table,
1487 get_order(alias_table_size));
1488
1489 free_pages((unsigned long)amd_iommu_dev_table,
1490 get_order(dev_table_size));
1491
1492 free_iommu_all();
1493
1494 free_unity_maps();
1495
1496#ifdef CONFIG_GART_IOMMU
1497 /*
1498	 * We failed to initialize the AMD IOMMU - try to fall back to GART
1499 * if possible.
1500 */
1501 gart_iommu_init();
1502
1503#endif
1504
1505 goto out;
1506}
1507
1508/****************************************************************************
1509 *
1510 * Early detect code. This code runs at IOMMU detection time in the DMA
1511 * layer. It just checks whether an IVRS ACPI table is present to
1512 * detect AMD IOMMUs.
1513 *
1514 ****************************************************************************/
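/*
 * The callback itself does nothing; acpi_table_parse() returning 0 is
 * enough to tell us that an IVRS table is present.
 */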
1515static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1516{
1517 return 0;
1518}
1519
1520int __init amd_iommu_detect(void)
1521{
1522 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1523 return -ENODEV;
1524
1525 if (amd_iommu_disabled)
1526 return -ENODEV;
1527
1528 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1529 iommu_detected = 1;
1530 amd_iommu_detected = 1;
1531 x86_init.iommu.iommu_init = amd_iommu_init;
1532
1533 /* Make sure ACS will be enabled */
1534 pci_request_acs();
1535 return 1;
1536 }
1537 return -ENODEV;
1538}
1539
1540/****************************************************************************
1541 *
1542 * Parsing functions for the AMD IOMMU specific kernel command line
1543 * options.
1544 *
1545 ****************************************************************************/
1546
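/*
 * Example kernel command line usage handled below:
 *   amd_iommu_dump       - print the parsed IVRS table content
 *   amd_iommu=fullflush  - flush the IO/TLB on every unmap
 *   amd_iommu=off        - disable the AMD IOMMU driver
 */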
1547static int __init parse_amd_iommu_dump(char *str)
1548{
1549 amd_iommu_dump = true;
1550
1551 return 1;
1552}
1553
1554static int __init parse_amd_iommu_options(char *str)
1555{
1556 for (; *str; ++str) {
1557 if (strncmp(str, "fullflush", 9) == 0)
1558 amd_iommu_unmap_flush = true;
1559 if (strncmp(str, "off", 3) == 0)
1560 amd_iommu_disabled = true;
1561 }
1562
1563 return 1;
1564}
1565
1566__setup("amd_iommu_dump", parse_amd_iommu_dump);
1567__setup("amd_iommu=", parse_amd_iommu_options);
1568
1569IOMMU_INIT_FINISH(amd_iommu_detect,
1570 gart_iommu_hole_init,
1571 0,
1572 0);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8fac..bae1efe6d51 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device)
119 return false; 119 return false;
120} 120}
121 121
122struct resource *amd_get_mmconfig_range(struct resource *res)
123{
124 u32 address;
125 u64 base, msr;
126 unsigned segn_busn_bits;
127
128 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
129 return NULL;
130
131 /* assume all cpus from fam10h have mmconfig */
132 if (boot_cpu_data.x86 < 0x10)
133 return NULL;
134
135 address = MSR_FAM10H_MMIO_CONF_BASE;
136 rdmsrl(address, msr);
137
138 /* mmconfig is not enabled */
139 if (!(msr & FAM10H_MMIO_CONF_ENABLE))
140 return NULL;
141
142 base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
143
144 segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
145 FAM10H_MMIO_CONF_BUSRANGE_MASK;
146
147 res->flags = IORESOURCE_MEM;
148 res->start = base;
149 res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
150 return res;
151}
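/*
 * Illustrative example: if the BUSRANGE field in the MSR reports 8, the
 * window covers 2^8 = 256 buses at 1 MB of config space each, so the
 * resource spans 1ULL << (8 + 20) = 256 MB starting at the MSR base.
 */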
152
122int amd_get_subcaches(int cpu) 153int amd_get_subcaches(int cpu)
123{ 154{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; 155 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 289e92862fd..afdc3f756de 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
27 * timer, but by default APB timer has higher rating than local APIC timers. 27 * timer, but by default APB timer has higher rating than local APIC timers.
28 */ 28 */
29 29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h> 30#include <linux/delay.h>
31#include <linux/dw_apb_timer.h>
33#include <linux/errno.h> 32#include <linux/errno.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/slab.h> 34#include <linux/slab.h>
37#include <linux/pm.h> 35#include <linux/pm.h>
38#include <linux/pci.h>
39#include <linux/sfi.h> 36#include <linux/sfi.h>
40#include <linux/interrupt.h> 37#include <linux/interrupt.h>
41#include <linux/cpu.h> 38#include <linux/cpu.h>
@@ -44,76 +41,48 @@
44#include <asm/fixmap.h> 41#include <asm/fixmap.h>
45#include <asm/apb_timer.h> 42#include <asm/apb_timer.h>
46#include <asm/mrst.h> 43#include <asm/mrst.h>
44#include <asm/time.h>
47 45
48#define APBT_MASK CLOCKSOURCE_MASK(32)
49#define APBT_SHIFT 22
50#define APBT_CLOCKEVENT_RATING 110 46#define APBT_CLOCKEVENT_RATING 110
51#define APBT_CLOCKSOURCE_RATING 250 47#define APBT_CLOCKSOURCE_RATING 250
52#define APBT_MIN_DELTA_USEC 200
53 48
54#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
55#define APBT_CLOCKEVENT0_NUM (0) 49#define APBT_CLOCKEVENT0_NUM (0)
56#define APBT_CLOCKEVENT1_NUM (1)
57#define APBT_CLOCKSOURCE_NUM (2) 50#define APBT_CLOCKSOURCE_NUM (2)
58 51
59static unsigned long apbt_address; 52static phys_addr_t apbt_address;
60static int apb_timer_block_enabled; 53static int apb_timer_block_enabled;
61static void __iomem *apbt_virt_address; 54static void __iomem *apbt_virt_address;
62static int phy_cs_timer_id;
63 55
64/* 56/*
65 * Common DW APB timer info 57 * Common DW APB timer info
66 */ 58 */
67static uint64_t apbt_freq; 59static unsigned long apbt_freq;
68
69static void apbt_set_mode(enum clock_event_mode mode,
70 struct clock_event_device *evt);
71static int apbt_next_event(unsigned long delta,
72 struct clock_event_device *evt);
73static cycle_t apbt_read_clocksource(struct clocksource *cs);
74static void apbt_restart_clocksource(struct clocksource *cs);
75 60
76struct apbt_dev { 61struct apbt_dev {
77 struct clock_event_device evt; 62 struct dw_apb_clock_event_device *timer;
78 unsigned int num; 63 unsigned int num;
79 int cpu; 64 int cpu;
80 unsigned int irq; 65 unsigned int irq;
81 unsigned int tick; 66 char name[10];
82 unsigned int count;
83 unsigned int flags;
84 char name[10];
85}; 67};
86 68
87static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); 69static struct dw_apb_clocksource *clocksource_apbt;
88 70
89#ifdef CONFIG_SMP 71static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
90static unsigned int apbt_num_timers_used;
91static struct apbt_dev *apbt_devs;
92#endif
93
94static inline unsigned long apbt_readl_reg(unsigned long a)
95{ 72{
96 return readl(apbt_virt_address + a); 73 return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
97} 74}
98 75
99static inline void apbt_writel_reg(unsigned long d, unsigned long a) 76static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
100{
101 writel(d, apbt_virt_address + a);
102}
103
104static inline unsigned long apbt_readl(int n, unsigned long a)
105{
106 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
107}
108 77
109static inline void apbt_writel(int n, unsigned long d, unsigned long a) 78#ifdef CONFIG_SMP
110{ 79static unsigned int apbt_num_timers_used;
111 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); 80#endif
112}
113 81
114static inline void apbt_set_mapping(void) 82static inline void apbt_set_mapping(void)
115{ 83{
116 struct sfi_timer_table_entry *mtmr; 84 struct sfi_timer_table_entry *mtmr;
85 int phy_cs_timer_id = 0;
117 86
118 if (apbt_virt_address) { 87 if (apbt_virt_address) {
119 pr_debug("APBT base already mapped\n"); 88 pr_debug("APBT base already mapped\n");
@@ -125,21 +94,18 @@ static inline void apbt_set_mapping(void)
125 APBT_CLOCKEVENT0_NUM); 94 APBT_CLOCKEVENT0_NUM);
126 return; 95 return;
127 } 96 }
128 apbt_address = (unsigned long)mtmr->phys_addr; 97 apbt_address = (phys_addr_t)mtmr->phys_addr;
129 if (!apbt_address) { 98 if (!apbt_address) {
130 printk(KERN_WARNING "No timer base from SFI, use default\n"); 99 printk(KERN_WARNING "No timer base from SFI, use default\n");
131 apbt_address = APBT_DEFAULT_BASE; 100 apbt_address = APBT_DEFAULT_BASE;
132 } 101 }
133 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); 102 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
134 if (apbt_virt_address) { 103 if (!apbt_virt_address) {
135 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ 104 pr_debug("Failed mapping APBT phy address at %lu\n",\
136 (void *)apbt_address, (void *)apbt_virt_address); 105 (unsigned long)apbt_address);
137 } else {
138 pr_debug("Failed mapping APBT phy address at %p\n",\
139 (void *)apbt_address);
140 goto panic_noapbt; 106 goto panic_noapbt;
141 } 107 }
142 apbt_freq = mtmr->freq_hz / USEC_PER_SEC; 108 apbt_freq = mtmr->freq_hz;
143 sfi_free_mtmr(mtmr); 109 sfi_free_mtmr(mtmr);
144 110
145 /* Now figure out the physical timer id for clocksource device */ 111 /* Now figure out the physical timer id for clocksource device */
@@ -148,9 +114,14 @@ static inline void apbt_set_mapping(void)
148 goto panic_noapbt; 114 goto panic_noapbt;
149 115
150 /* Now figure out the physical timer id */ 116 /* Now figure out the physical timer id */
151 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) 117 pr_debug("Use timer %d for clocksource\n",
152 / APBTMRS_REG_SIZE; 118 (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
153 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); 119 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
120 APBTMRS_REG_SIZE;
121
122 clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
123 "apbt0", apbt_virt_address + phy_cs_timer_id *
124 APBTMRS_REG_SIZE, apbt_freq);
154 return; 125 return;
155 126
156panic_noapbt: 127panic_noapbt:
@@ -172,82 +143,6 @@ static inline int is_apbt_capable(void)
172 return apbt_virt_address ? 1 : 0; 143 return apbt_virt_address ? 1 : 0;
173} 144}
174 145
175static struct clocksource clocksource_apbt = {
176 .name = "apbt",
177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK,
180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
181 .resume = apbt_restart_clocksource,
182};
183
184/* boot APB clock event device */
185static struct clock_event_device apbt_clockevent = {
186 .name = "apbt0",
187 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
188 .set_mode = apbt_set_mode,
189 .set_next_event = apbt_next_event,
190 .shift = APBT_SHIFT,
191 .irq = 0,
192 .rating = APBT_CLOCKEVENT_RATING,
193};
194
195/*
196 * start count down from 0xffff_ffff. this is done by toggling the enable bit
197 * then load initial load count to ~0.
198 */
199static void apbt_start_counter(int n)
200{
201 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
202
203 ctrl &= ~APBTMR_CONTROL_ENABLE;
204 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
205 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
206 /* enable, mask interrupt */
207 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
208 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
209 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
210 /* read it once to get cached counter value initialized */
211 apbt_read_clocksource(&clocksource_apbt);
212}
213
214static irqreturn_t apbt_interrupt_handler(int irq, void *data)
215{
216 struct apbt_dev *dev = (struct apbt_dev *)data;
217 struct clock_event_device *aevt = &dev->evt;
218
219 if (!aevt->event_handler) {
220 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
221 dev->num);
222 return IRQ_NONE;
223 }
224 aevt->event_handler(aevt);
225 return IRQ_HANDLED;
226}
227
228static void apbt_restart_clocksource(struct clocksource *cs)
229{
230 apbt_start_counter(phy_cs_timer_id);
231}
232
233static void apbt_enable_int(int n)
234{
235 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
236 /* clear pending intr */
237 apbt_readl(n, APBTMR_N_EOI);
238 ctrl &= ~APBTMR_CONTROL_INT;
239 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
240}
241
242static void apbt_disable_int(int n)
243{
244 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
245
246 ctrl |= APBTMR_CONTROL_INT;
247 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
248}
249
250
251static int __init apbt_clockevent_register(void) 146static int __init apbt_clockevent_register(void)
252{ 147{
253 struct sfi_timer_table_entry *mtmr; 148 struct sfi_timer_table_entry *mtmr;
@@ -260,45 +155,21 @@ static int __init apbt_clockevent_register(void)
260 return -ENODEV; 155 return -ENODEV;
261 } 156 }
262 157
263 /*
264 * We need to calculate the scaled math multiplication factor for
265 * nanosecond to apbt tick conversion.
266 * mult = (nsec/cycle)*2^APBT_SHIFT
267 */
268 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
269 , NSEC_PER_SEC, APBT_SHIFT);
270
271 /* Calculate the min / max delta */
272 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
273 &apbt_clockevent);
274 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
275 APBT_MIN_DELTA_USEC*apbt_freq,
276 &apbt_clockevent);
277 /*
278 * Start apbt with the boot cpu mask and make it
279 * global if not used for per cpu timer.
280 */
281 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
282 adev->num = smp_processor_id(); 158 adev->num = smp_processor_id();
283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 159 adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
160 mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
161 APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
162 adev_virt_addr(adev), 0, apbt_freq);
163 /* Firmware does EOI handling for us. */
164 adev->timer->eoi = NULL;
284 165
285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 166 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
286 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100; 167 global_clock_event = &adev->timer->ced;
287 global_clock_event = &adev->evt;
288 printk(KERN_DEBUG "%s clockevent registered as global\n", 168 printk(KERN_DEBUG "%s clockevent registered as global\n",
289 global_clock_event->name); 169 global_clock_event->name);
290 } 170 }
291 171
292 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, 172 dw_apb_clockevent_register(adev->timer);
293 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
294 apbt_clockevent.name, adev)) {
295 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
296 apbt_clockevent.irq);
297 }
298
299 clockevents_register_device(&adev->evt);
300 /* Start APBT 0 interrupts */
301 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
302 173
303 sfi_free_mtmr(mtmr); 174 sfi_free_mtmr(mtmr);
304 return 0; 175 return 0;
@@ -316,52 +187,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); 187 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
317 /* APB timer irqs are set up as mp_irqs, timer is edge type */ 188 /* APB timer irqs are set up as mp_irqs, timer is edge type */
318 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); 189 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
319
320 if (system_state == SYSTEM_BOOTING) {
321 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED |
323 IRQF_NOBALANCING,
324 adev->name, adev)) {
325 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
326 adev->num);
327 }
328 } else
329 enable_irq(adev->irq);
330} 190}
331 191
332/* Should be called with per cpu */ 192/* Should be called with per cpu */
333void apbt_setup_secondary_clock(void) 193void apbt_setup_secondary_clock(void)
334{ 194{
335 struct apbt_dev *adev; 195 struct apbt_dev *adev;
336 struct clock_event_device *aevt;
337 int cpu; 196 int cpu;
338 197
339 /* Don't register boot CPU clockevent */ 198 /* Don't register boot CPU clockevent */
340 cpu = smp_processor_id(); 199 cpu = smp_processor_id();
341 if (!cpu) 200 if (!cpu)
342 return; 201 return;
343 /*
344 * We need to calculate the scaled math multiplication factor for
345 * nanosecond to apbt tick conversion.
346 * mult = (nsec/cycle)*2^APBT_SHIFT
347 */
348 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
349 adev = &per_cpu(cpu_apbt_dev, cpu);
350 aevt = &adev->evt;
351 202
352 memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); 203 adev = &__get_cpu_var(cpu_apbt_dev);
353 aevt->cpumask = cpumask_of(cpu); 204 if (!adev->timer) {
354 aevt->name = adev->name; 205 adev->timer = dw_apb_clockevent_init(cpu, adev->name,
355 aevt->mode = CLOCK_EVT_MODE_UNUSED; 206 APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
207 adev->irq, apbt_freq);
208 adev->timer->eoi = NULL;
209 } else {
210 dw_apb_clockevent_resume(adev->timer);
211 }
356 212
357 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", 213 printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
358 cpu, aevt->name, *(u32 *)aevt->cpumask); 214 cpu, adev->name, adev->cpu);
359 215
360 apbt_setup_irq(adev); 216 apbt_setup_irq(adev);
361 217 dw_apb_clockevent_register(adev->timer);
362 clockevents_register_device(aevt);
363
364 apbt_enable_int(cpu);
365 218
366 return; 219 return;
367} 220}
@@ -384,13 +237,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
384 237
385 switch (action & 0xf) { 238 switch (action & 0xf) {
386 case CPU_DEAD: 239 case CPU_DEAD:
387 disable_irq(adev->irq); 240 dw_apb_clockevent_pause(adev->timer);
388 apbt_disable_int(cpu);
389 if (system_state == SYSTEM_RUNNING) { 241 if (system_state == SYSTEM_RUNNING) {
390 pr_debug("skipping APBT CPU %lu offline\n", cpu); 242 pr_debug("skipping APBT CPU %lu offline\n", cpu);
391 } else if (adev) { 243 } else if (adev) {
392 pr_debug("APBT clockevent for cpu %lu offline\n", cpu); 244 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
393 free_irq(adev->irq, adev); 245 dw_apb_clockevent_stop(adev->timer);
394 } 246 }
395 break; 247 break;
396 default: 248 default:
@@ -415,116 +267,16 @@ void apbt_setup_secondary_clock(void) {}
415 267
416#endif /* CONFIG_SMP */ 268#endif /* CONFIG_SMP */
417 269
418static void apbt_set_mode(enum clock_event_mode mode,
419 struct clock_event_device *evt)
420{
421 unsigned long ctrl;
422 uint64_t delta;
423 int timer_num;
424 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
425
426 BUG_ON(!apbt_virt_address);
427
428 timer_num = adev->num;
429 pr_debug("%s CPU %d timer %d mode=%d\n",
430 __func__, first_cpu(*evt->cpumask), timer_num, mode);
431
432 switch (mode) {
433 case CLOCK_EVT_MODE_PERIODIC:
434 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
435 delta >>= apbt_clockevent.shift;
436 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
437 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
438 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
439 /*
440 * DW APB p. 46, have to disable timer before load counter,
441 * may cause sync problem.
442 */
443 ctrl &= ~APBTMR_CONTROL_ENABLE;
444 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
445 udelay(1);
446 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
447 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
448 ctrl |= APBTMR_CONTROL_ENABLE;
449 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
450 break;
451 /* APB timer does not have one-shot mode, use free running mode */
452 case CLOCK_EVT_MODE_ONESHOT:
453 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
454 /*
455 * set free running mode, this mode will let timer reload max
456 * timeout which will give time (3min on 25MHz clock) to rearm
457 * the next event, therefore emulate the one-shot mode.
458 */
459 ctrl &= ~APBTMR_CONTROL_ENABLE;
460 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
461
462 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
463 /* write again to set free running mode */
464 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
465
466 /*
467 * DW APB p. 46, load counter with all 1s before starting free
468 * running mode.
469 */
470 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
471 ctrl &= ~APBTMR_CONTROL_INT;
472 ctrl |= APBTMR_CONTROL_ENABLE;
473 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
474 break;
475
476 case CLOCK_EVT_MODE_UNUSED:
477 case CLOCK_EVT_MODE_SHUTDOWN:
478 apbt_disable_int(timer_num);
479 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
480 ctrl &= ~APBTMR_CONTROL_ENABLE;
481 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
482 break;
483
484 case CLOCK_EVT_MODE_RESUME:
485 apbt_enable_int(timer_num);
486 break;
487 }
488}
489
490static int apbt_next_event(unsigned long delta,
491 struct clock_event_device *evt)
492{
493 unsigned long ctrl;
494 int timer_num;
495
496 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
497
498 timer_num = adev->num;
499 /* Disable timer */
500 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
501 ctrl &= ~APBTMR_CONTROL_ENABLE;
502 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
503 /* write new count */
504 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
505 ctrl |= APBTMR_CONTROL_ENABLE;
506 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
507 return 0;
508}
509
510static cycle_t apbt_read_clocksource(struct clocksource *cs)
511{
512 unsigned long current_count;
513
514 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
515 return (cycle_t)~current_count;
516}
517
518static int apbt_clocksource_register(void) 270static int apbt_clocksource_register(void)
519{ 271{
520 u64 start, now; 272 u64 start, now;
521 cycle_t t1; 273 cycle_t t1;
522 274
523 /* Start the counter, use timer 2 as source, timer 0/1 for event */ 275 /* Start the counter, use timer 2 as source, timer 0/1 for event */
524 apbt_start_counter(phy_cs_timer_id); 276 dw_apb_clocksource_start(clocksource_apbt);
525 277
526 /* Verify whether apbt counter works */ 278 /* Verify whether apbt counter works */
527 t1 = apbt_read_clocksource(&clocksource_apbt); 279 t1 = dw_apb_clocksource_read(clocksource_apbt);
528 rdtscll(start); 280 rdtscll(start);
529 281
530 /* 282 /*
@@ -539,10 +291,10 @@ static int apbt_clocksource_register(void)
539 } while ((now - start) < 200000UL); 291 } while ((now - start) < 200000UL);
540 292
541 /* APBT is the only always on clocksource, it has to work! */ 293 /* APBT is the only always on clocksource, it has to work! */
542 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 294 if (t1 == dw_apb_clocksource_read(clocksource_apbt))
543 panic("APBT counter not counting. APBT disabled\n"); 295 panic("APBT counter not counting. APBT disabled\n");
544 296
545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); 297 dw_apb_clocksource_register(clocksource_apbt);
546 298
547 return 0; 299 return 0;
548} 300}
@@ -566,10 +318,7 @@ void __init apbt_time_init(void)
566 if (apb_timer_block_enabled) 318 if (apb_timer_block_enabled)
567 return; 319 return;
568 apbt_set_mapping(); 320 apbt_set_mapping();
569 if (apbt_virt_address) { 321 if (!apbt_virt_address)
570 pr_debug("Found APBT version 0x%lx\n",\
571 apbt_readl_reg(APBTMRS_COMP_VERSION));
572 } else
573 goto out_noapbt; 322 goto out_noapbt;
574 /* 323 /*
575 * Read the frequency and check for a sane value, for ESL model 324 * Read the frequency and check for a sane value, for ESL model
@@ -577,7 +326,7 @@ void __init apbt_time_init(void)
577 */ 326 */
578 327
579 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { 328 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
580 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); 329 pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
581 goto out_noapbt; 330 goto out_noapbt;
582 } 331 }
583 if (apbt_clocksource_register()) { 332 if (apbt_clocksource_register()) {
@@ -603,30 +352,20 @@ void __init apbt_time_init(void)
603 } else { 352 } else {
604 percpu_timer = 0; 353 percpu_timer = 0;
605 apbt_num_timers_used = 1; 354 apbt_num_timers_used = 1;
606 adev = &per_cpu(cpu_apbt_dev, 0);
607 adev->flags &= ~APBT_DEV_USED;
608 } 355 }
609 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); 356 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
610 357
611 /* here we set up per CPU timer data structure */ 358 /* here we set up per CPU timer data structure */
612 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
613 GFP_KERNEL);
614 if (!apbt_devs) {
615 printk(KERN_ERR "Failed to allocate APB timer devices\n");
616 return;
617 }
618 for (i = 0; i < apbt_num_timers_used; i++) { 359 for (i = 0; i < apbt_num_timers_used; i++) {
619 adev = &per_cpu(cpu_apbt_dev, i); 360 adev = &per_cpu(cpu_apbt_dev, i);
620 adev->num = i; 361 adev->num = i;
621 adev->cpu = i; 362 adev->cpu = i;
622 p_mtmr = sfi_get_mtmr(i); 363 p_mtmr = sfi_get_mtmr(i);
623 if (p_mtmr) { 364 if (p_mtmr)
624 adev->tick = p_mtmr->freq_hz;
625 adev->irq = p_mtmr->irq; 365 adev->irq = p_mtmr->irq;
626 } else 366 else
627 printk(KERN_ERR "Failed to get timer for cpu %d\n", i); 367 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
628 adev->count = 0; 368 snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
629 sprintf(adev->name, "apbt%d", i);
630 } 369 }
631#endif 370#endif
632 371
@@ -638,17 +377,8 @@ out_noapbt:
638 panic("failed to enable APB timer\n"); 377 panic("failed to enable APB timer\n");
639} 378}
640 379
641static inline void apbt_disable(int n)
642{
643 if (is_apbt_capable()) {
644 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
645 ctrl &= ~APBTMR_CONTROL_ENABLE;
646 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
647 }
648}
649
650/* called before apb_timer_enable, use early map */ 380/* called before apb_timer_enable, use early map */
651unsigned long apbt_quick_calibrate() 381unsigned long apbt_quick_calibrate(void)
652{ 382{
653 int i, scale; 383 int i, scale;
654 u64 old, new; 384 u64 old, new;
@@ -657,31 +387,31 @@ unsigned long apbt_quick_calibrate()
657 u32 loop, shift; 387 u32 loop, shift;
658 388
659 apbt_set_mapping(); 389 apbt_set_mapping();
660 apbt_start_counter(phy_cs_timer_id); 390 dw_apb_clocksource_start(clocksource_apbt);
661 391
662 /* check if the timer can count down, otherwise return */ 392 /* check if the timer can count down, otherwise return */
663 old = apbt_read_clocksource(&clocksource_apbt); 393 old = dw_apb_clocksource_read(clocksource_apbt);
664 i = 10000; 394 i = 10000;
665 while (--i) { 395 while (--i) {
666 if (old != apbt_read_clocksource(&clocksource_apbt)) 396 if (old != dw_apb_clocksource_read(clocksource_apbt))
667 break; 397 break;
668 } 398 }
669 if (!i) 399 if (!i)
670 goto failed; 400 goto failed;
671 401
672 /* count 16 ms */ 402 /* count 16 ms */
673 loop = (apbt_freq * 1000) << 4; 403 loop = (apbt_freq / 1000) << 4;
674 404
675 /* restart the timer to ensure it won't get to 0 in the calibration */ 405 /* restart the timer to ensure it won't get to 0 in the calibration */
676 apbt_start_counter(phy_cs_timer_id); 406 dw_apb_clocksource_start(clocksource_apbt);
677 407
678 old = apbt_read_clocksource(&clocksource_apbt); 408 old = dw_apb_clocksource_read(clocksource_apbt);
679 old += loop; 409 old += loop;
680 410
681 t1 = __native_read_tsc(); 411 t1 = __native_read_tsc();
682 412
683 do { 413 do {
684 new = apbt_read_clocksource(&clocksource_apbt); 414 new = dw_apb_clocksource_read(clocksource_apbt);
685 } while (new < old); 415 } while (new < old);
686 416
687 t2 = __native_read_tsc(); 417 t2 = __native_read_tsc();
@@ -693,7 +423,7 @@ unsigned long apbt_quick_calibrate()
693 return 0; 423 return 0;
694 } 424 }
695 scale = (int)div_u64((t2 - t1), loop >> shift); 425 scale = (int)div_u64((t2 - t1), loop >> shift);
696 khz = (scale * apbt_freq * 1000) >> shift; 426 khz = (scale * (apbt_freq / 1000)) >> shift;
697 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); 427 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
698 return khz; 428 return khz;
699failed: 429failed:
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b9338b8cf42..a2fd72e0ab3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
27#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/i8253.h>
30#include <linux/dmar.h> 31#include <linux/dmar.h>
31#include <linux/init.h> 32#include <linux/init.h>
32#include <linux/cpu.h> 33#include <linux/cpu.h>
@@ -37,9 +38,8 @@
37#include <asm/perf_event.h> 38#include <asm/perf_event.h>
38#include <asm/x86_init.h> 39#include <asm/x86_init.h>
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/atomic.h> 41#include <linux/atomic.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
42#include <asm/i8253.h>
43#include <asm/i8259.h> 43#include <asm/i8259.h>
44#include <asm/proto.h> 44#include <asm/proto.h>
45#include <asm/apic.h> 45#include <asm/apic.h>
@@ -48,6 +48,7 @@
48#include <asm/hpet.h> 48#include <asm/hpet.h>
49#include <asm/idle.h> 49#include <asm/idle.h>
50#include <asm/mtrr.h> 50#include <asm/mtrr.h>
51#include <asm/time.h>
51#include <asm/smp.h> 52#include <asm/smp.h>
52#include <asm/mce.h> 53#include <asm/mce.h>
53#include <asm/tsc.h> 54#include <asm/tsc.h>
@@ -1429,34 +1430,28 @@ void enable_x2apic(void)
1429 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1430 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1430 if (!(msr & X2APIC_ENABLE)) { 1431 if (!(msr & X2APIC_ENABLE)) {
1431 printk_once(KERN_INFO "Enabling x2apic\n"); 1432 printk_once(KERN_INFO "Enabling x2apic\n");
1432 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1433 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
1433 } 1434 }
1434} 1435}
1435#endif /* CONFIG_X86_X2APIC */ 1436#endif /* CONFIG_X86_X2APIC */
1436 1437
1437int __init enable_IR(void) 1438int __init enable_IR(void)
1438{ 1439{
1439#ifdef CONFIG_INTR_REMAP 1440#ifdef CONFIG_IRQ_REMAP
1440 if (!intr_remapping_supported()) { 1441 if (!intr_remapping_supported()) {
1441 pr_debug("intr-remapping not supported\n"); 1442 pr_debug("intr-remapping not supported\n");
1442 return 0; 1443 return -1;
1443 } 1444 }
1444 1445
1445 if (!x2apic_preenabled && skip_ioapic_setup) { 1446 if (!x2apic_preenabled && skip_ioapic_setup) {
1446 pr_info("Skipped enabling intr-remap because of skipping " 1447 pr_info("Skipped enabling intr-remap because of skipping "
1447 "io-apic setup\n"); 1448 "io-apic setup\n");
1448 return 0; 1449 return -1;
1449 } 1450 }
1450 1451
1451 if (enable_intr_remapping(x2apic_supported())) 1452 return enable_intr_remapping();
1452 return 0;
1453
1454 pr_info("Enabled Interrupt-remapping\n");
1455
1456 return 1;
1457
1458#endif 1453#endif
1459 return 0; 1454 return -1;
1460} 1455}
1461 1456
1462void __init enable_IR_x2apic(void) 1457void __init enable_IR_x2apic(void)
@@ -1480,11 +1475,11 @@ void __init enable_IR_x2apic(void)
1480 mask_ioapic_entries(); 1475 mask_ioapic_entries();
1481 1476
1482 if (dmar_table_init_ret) 1477 if (dmar_table_init_ret)
1483 ret = 0; 1478 ret = -1;
1484 else 1479 else
1485 ret = enable_IR(); 1480 ret = enable_IR();
1486 1481
1487 if (!ret) { 1482 if (ret < 0) {
1488 /* IR is required if there is APIC ID > 255 even when running 1483 /* IR is required if there is APIC ID > 255 even when running
1489 * under KVM 1484 * under KVM
1490 */ 1485 */
@@ -1498,6 +1493,9 @@ void __init enable_IR_x2apic(void)
1498 x2apic_force_phys(); 1493 x2apic_force_phys();
1499 } 1494 }
1500 1495
1496 if (ret == IRQ_REMAP_XAPIC_MODE)
1497 goto nox2apic;
1498
1501 x2apic_enabled = 1; 1499 x2apic_enabled = 1;
1502 1500
1503 if (x2apic_supported() && !x2apic_mode) { 1501 if (x2apic_supported() && !x2apic_mode) {
@@ -1507,19 +1505,21 @@ void __init enable_IR_x2apic(void)
1507 } 1505 }
1508 1506
1509nox2apic: 1507nox2apic:
1510 if (!ret) /* IR enabling failed */ 1508 if (ret < 0) /* IR enabling failed */
1511 restore_ioapic_entries(); 1509 restore_ioapic_entries();
1512 legacy_pic->restore_mask(); 1510 legacy_pic->restore_mask();
1513 local_irq_restore(flags); 1511 local_irq_restore(flags);
1514 1512
1515out: 1513out:
1516 if (x2apic_enabled) 1514 if (x2apic_enabled || !x2apic_supported())
1517 return; 1515 return;
1518 1516
1519 if (x2apic_preenabled) 1517 if (x2apic_preenabled)
1520 panic("x2apic: enabled by BIOS but kernel init failed."); 1518 panic("x2apic: enabled by BIOS but kernel init failed.");
1521 else if (cpu_has_x2apic) 1519 else if (ret == IRQ_REMAP_XAPIC_MODE)
1522 pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); 1520 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1521 else if (ret < 0)
1522 pr_info("x2apic not enabled, IRQ remapping init failed\n");
1523} 1523}
1524 1524
1525#ifdef CONFIG_X86_64 1525#ifdef CONFIG_X86_64
@@ -1943,10 +1943,28 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1943 1943
1944void __cpuinit generic_processor_info(int apicid, int version) 1944void __cpuinit generic_processor_info(int apicid, int version)
1945{ 1945{
1946 int cpu; 1946 int cpu, max = nr_cpu_ids;
1947 bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
1948 phys_cpu_present_map);
1949
1950 /*
1951	 * If the boot cpu has not been detected yet, then only allow up to
1952	 * nr_cpu_ids - 1 processors and keep one slot free for the boot cpu
1953 */
1954 if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
1955 apicid != boot_cpu_physical_apicid) {
1956 int thiscpu = max + disabled_cpus - 1;
1957
1958 pr_warning(
1959 "ACPI: NR_CPUS/possible_cpus limit of %i almost"
1960 " reached. Keeping one slot for boot cpu."
1961 " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
1962
1963 disabled_cpus++;
1964 return;
1965 }
1947 1966
1948 if (num_processors >= nr_cpu_ids) { 1967 if (num_processors >= nr_cpu_ids) {
1949 int max = nr_cpu_ids;
1950 int thiscpu = max + disabled_cpus; 1968 int thiscpu = max + disabled_cpus;
1951 1969
1952 pr_warning( 1970 pr_warning(
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index efd737e827f..521bead0113 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -255,12 +255,24 @@ static struct apic apic_bigsmp = {
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, 255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
256}; 256};
257 257
258struct apic * __init generic_bigsmp_probe(void) 258void __init generic_bigsmp_probe(void)
259{ 259{
260 if (probe_bigsmp()) 260 unsigned int cpu;
261 return &apic_bigsmp;
262 261
263 return NULL; 262 if (!probe_bigsmp())
263 return;
264
265 apic = &apic_bigsmp;
266
267 for_each_possible_cpu(cpu) {
268 if (early_per_cpu(x86_cpu_to_logical_apicid,
269 cpu) == BAD_APICID)
270 continue;
271 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
272 bigsmp_early_logical_apicid(cpu);
273 }
274
275 pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
264} 276}
265 277
266apic_driver(apic_bigsmp); 278apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 9536b3fe43f..5d513bc47b6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -48,7 +48,7 @@
48#include <linux/io.h> 48#include <linux/io.h>
49 49
50#include <asm/apicdef.h> 50#include <asm/apicdef.h>
51#include <asm/atomic.h> 51#include <linux/atomic.h>
52#include <asm/fixmap.h> 52#include <asm/fixmap.h>
53#include <asm/mpspec.h> 53#include <asm/mpspec.h>
54#include <asm/setup.h> 54#include <asm/setup.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e5293394b54..620da6fed6b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1202,7 +1202,6 @@ void __setup_vector_irq(int cpu)
1202} 1202}
1203 1203
1204static struct irq_chip ioapic_chip; 1204static struct irq_chip ioapic_chip;
1205static struct irq_chip ir_ioapic_chip;
1206 1205
1207#ifdef CONFIG_X86_32 1206#ifdef CONFIG_X86_32
1208static inline int IO_APIC_irq_trigger(int irq) 1207static inline int IO_APIC_irq_trigger(int irq)
@@ -1246,7 +1245,7 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1246 1245
1247 if (irq_remapped(cfg)) { 1246 if (irq_remapped(cfg)) {
1248 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 1247 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1249 chip = &ir_ioapic_chip; 1248 irq_remap_modify_chip_defaults(chip);
1250 fasteoi = trigger != 0; 1249 fasteoi = trigger != 0;
1251 } 1250 }
1252 1251
@@ -1295,6 +1294,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
1295 * irq handler will do the explicit EOI to the io-apic. 1294 * irq handler will do the explicit EOI to the io-apic.
1296 */ 1295 */
1297 ir_entry->vector = pin; 1296 ir_entry->vector = pin;
1297
1298 apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
1299 "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
1300 "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
1301 "Avail:%X Vector:%02X Dest:%08X "
1302 "SID:%04X SQ:%X SVT:%X)\n",
1303 apic_id, irte.present, irte.fpd, irte.dst_mode,
1304 irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
1305 irte.avail, irte.vector, irte.dest_id,
1306 irte.sid, irte.sq, irte.svt);
1298 } else { 1307 } else {
1299 entry->delivery_mode = apic->irq_delivery_mode; 1308 entry->delivery_mode = apic->irq_delivery_mode;
1300 entry->dest_mode = apic->irq_dest_mode; 1309 entry->dest_mode = apic->irq_dest_mode;
@@ -1337,9 +1346,9 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1337 1346
1338 apic_printk(APIC_VERBOSE,KERN_DEBUG 1347 apic_printk(APIC_VERBOSE,KERN_DEBUG
1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1348 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1340 "IRQ %d Mode:%i Active:%i)\n", 1349 "IRQ %d Mode:%i Active:%i Dest:%d)\n",
1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, 1350 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1342 irq, trigger, polarity); 1351 irq, trigger, polarity, dest);
1343 1352
1344 1353
1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, 1354 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
@@ -1522,10 +1531,12 @@ __apicdebuginit(void) print_IO_APIC(void)
1522 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); 1531 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1523 1532
1524 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 1533 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1525 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1534 printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
1535 reg_01.bits.entries);
1526 1536
1527 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1537 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1528 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1538 printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
1539 reg_01.bits.version);
1529 1540
1530 /* 1541 /*
1531 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, 1542 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1550,31 +1561,60 @@ __apicdebuginit(void) print_IO_APIC(void)
1550 1561
1551 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1562 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1552 1563
1553 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1564 if (intr_remapping_enabled) {
1554 " Stat Dmod Deli Vect:\n"); 1565 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
1566 " Pol Stat Indx2 Zero Vect:\n");
1567 } else {
1568 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1569 " Stat Dmod Deli Vect:\n");
1570 }
1555 1571
1556 for (i = 0; i <= reg_01.bits.entries; i++) { 1572 for (i = 0; i <= reg_01.bits.entries; i++) {
1557 struct IO_APIC_route_entry entry; 1573 if (intr_remapping_enabled) {
1558 1574 struct IO_APIC_route_entry entry;
1559 entry = ioapic_read_entry(apic, i); 1575 struct IR_IO_APIC_route_entry *ir_entry;
1560 1576
1561 printk(KERN_DEBUG " %02x %03X ", 1577 entry = ioapic_read_entry(apic, i);
1562 i, 1578 ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
1563 entry.dest 1579 printk(KERN_DEBUG " %02x %04X ",
1564 ); 1580 i,
1581 ir_entry->index
1582 );
1583 printk("%1d %1d %1d %1d %1d "
1584 "%1d %1d %X %02X\n",
1585 ir_entry->format,
1586 ir_entry->mask,
1587 ir_entry->trigger,
1588 ir_entry->irr,
1589 ir_entry->polarity,
1590 ir_entry->delivery_status,
1591 ir_entry->index2,
1592 ir_entry->zero,
1593 ir_entry->vector
1594 );
1595 } else {
1596 struct IO_APIC_route_entry entry;
1565 1597
1566 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", 1598 entry = ioapic_read_entry(apic, i);
1567 entry.mask, 1599 printk(KERN_DEBUG " %02x %02X ",
1568 entry.trigger, 1600 i,
1569 entry.irr, 1601 entry.dest
1570 entry.polarity, 1602 );
1571 entry.delivery_status, 1603 printk("%1d %1d %1d %1d %1d "
1572 entry.dest_mode, 1604 "%1d %1d %02X\n",
1573 entry.delivery_mode, 1605 entry.mask,
1574 entry.vector 1606 entry.trigger,
1575 ); 1607 entry.irr,
1608 entry.polarity,
1609 entry.delivery_status,
1610 entry.dest_mode,
1611 entry.delivery_mode,
1612 entry.vector
1613 );
1614 }
1576 } 1615 }
1577 } 1616 }
1617
1578 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1618 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1579 for_each_active_irq(irq) { 1619 for_each_active_irq(irq) {
1580 struct irq_pin_list *entry; 1620 struct irq_pin_list *entry;
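
When interrupt remapping is enabled, the dump above reinterprets each IO-APIC routing entry as an IR_IO_APIC_route_entry and prints the remap-format fields instead. A self-contained sketch of that reinterpretation; the field order and widths below are made up for the example, the real layout lives in the kernel headers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented layout, for illustration only. */
struct ir_rte {
	uint64_t vector		: 8;
	uint64_t zero		: 3;
	uint64_t index2		: 1;
	uint64_t delivery_status: 1;
	uint64_t polarity	: 1;
	uint64_t irr		: 1;
	uint64_t trigger	: 1;
	uint64_t mask		: 1;
	uint64_t reserved	: 31;
	uint64_t format		: 1;
	uint64_t index		: 15;
};

static void dump_rte(uint64_t raw, int remapped)
{
	if (!remapped) {
		printf("plain RTE: %016llx\n", (unsigned long long)raw);
		return;
	}
	struct ir_rte e;
	memcpy(&e, &raw, sizeof(e));	/* reinterpret, as the cast above does */
	printf("Indx %04x Fmt %u Mask %u Trig %u IRR %u Vect %02x\n",
	       (unsigned)e.index, (unsigned)e.format, (unsigned)e.mask,
	       (unsigned)e.trigger, (unsigned)e.irr, (unsigned)e.vector);
}

int main(void)
{
	dump_rte(0x8001000000010030ull, 1);
	dump_rte(0x8001000000010030ull, 0);
	return 0;
}
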
@@ -1792,7 +1832,7 @@ __apicdebuginit(int) print_ICs(void)
1792 return 0; 1832 return 0;
1793} 1833}
1794 1834
1795fs_initcall(print_ICs); 1835late_initcall(print_ICs);
1796 1836
1797 1837
1798/* Where if anywhere is the i8259 connect in external int mode */ 1838/* Where if anywhere is the i8259 connect in external int mode */
@@ -2214,7 +2254,7 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2214 return ret; 2254 return ret;
2215} 2255}
2216 2256
2217#ifdef CONFIG_INTR_REMAP 2257#ifdef CONFIG_IRQ_REMAP
2218 2258
2219/* 2259/*
2220 * Migrate the IO-APIC irq in the presence of intr-remapping. 2260 * Migrate the IO-APIC irq in the presence of intr-remapping.
@@ -2226,6 +2266,9 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2226 * updated vector information), by using a virtual vector (io-apic pin number). 2266 * updated vector information), by using a virtual vector (io-apic pin number).
2227 * Real vector that is used for interrupting cpu will be coming from 2267 * Real vector that is used for interrupting cpu will be coming from
2228 * the interrupt-remapping table entry. 2268 * the interrupt-remapping table entry.
2269 *
2270 * As the migration is a simple atomic update of IRTE, the same mechanism
2271 * is used to migrate MSI irq's in the presence of interrupt-remapping.
2229 */ 2272 */
2230static int 2273static int
2231ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, 2274ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
@@ -2250,10 +2293,16 @@ ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2250 irte.dest_id = IRTE_DEST(dest); 2293 irte.dest_id = IRTE_DEST(dest);
2251 2294
2252 /* 2295 /*
2253 * Modified the IRTE and flushes the Interrupt entry cache. 2296 * Atomically updates the IRTE with the new destination, vector
2297 * and flushes the interrupt entry cache.
2254 */ 2298 */
2255 modify_irte(irq, &irte); 2299 modify_irte(irq, &irte);
2256 2300
2301 /*
2302 * After this point, all the interrupts will start arriving
2303 * at the new destination. So, time to cleanup the previous
2304 * vector allocation.
2305 */
2257 if (cfg->move_in_progress) 2306 if (cfg->move_in_progress)
2258 send_cleanup_vector(cfg); 2307 send_cleanup_vector(cfg);
2259 2308
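
The added comments describe why one mechanism covers both IO-APIC and MSI migration: retargeting is a single atomic update of the interrupt-remapping table entry, so an interrupt in flight sees either the old or the new (destination, vector) pair. A userspace sketch of that property, with the IRTE packing invented for the example:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy IRTE: destination in the high 32 bits, vector in the low 8.
 * Both fields change in one atomic store, so a reader never sees a
 * mix of the old destination with the new vector or vice versa. */
static _Atomic uint64_t irte;

static void modify_irte(uint32_t dest, uint8_t vector)
{
	uint64_t e = ((uint64_t)dest << 32) | vector;
	atomic_store_explicit(&irte, e, memory_order_release);
}

static void deliver(void)
{
	uint64_t e = atomic_load_explicit(&irte, memory_order_acquire);
	printf("deliver vector 0x%02x to apic 0x%x\n",
	       (unsigned)(e & 0xff), (unsigned)(e >> 32));
}

int main(void)
{
	modify_irte(0x01, 0x31);
	deliver();
	modify_irte(0x03, 0x41);	/* set_affinity: retarget in one store */
	deliver();
	return 0;
}
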
@@ -2511,7 +2560,7 @@ static void ack_apic_level(struct irq_data *data)
2511 } 2560 }
2512} 2561}
2513 2562
2514#ifdef CONFIG_INTR_REMAP 2563#ifdef CONFIG_IRQ_REMAP
2515static void ir_ack_apic_edge(struct irq_data *data) 2564static void ir_ack_apic_edge(struct irq_data *data)
2516{ 2565{
2517 ack_APIC_irq(); 2566 ack_APIC_irq();
@@ -2522,7 +2571,23 @@ static void ir_ack_apic_level(struct irq_data *data)
2522 ack_APIC_irq(); 2571 ack_APIC_irq();
2523 eoi_ioapic_irq(data->irq, data->chip_data); 2572 eoi_ioapic_irq(data->irq, data->chip_data);
2524} 2573}
2525#endif /* CONFIG_INTR_REMAP */ 2574
2575static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
2576{
2577 seq_printf(p, " IR-%s", data->chip->name);
2578}
2579
2580static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
2581{
2582 chip->irq_print_chip = ir_print_prefix;
2583 chip->irq_ack = ir_ack_apic_edge;
2584 chip->irq_eoi = ir_ack_apic_level;
2585
2586#ifdef CONFIG_SMP
2587 chip->irq_set_affinity = ir_ioapic_set_affinity;
2588#endif
2589}
2590#endif /* CONFIG_IRQ_REMAP */
2526 2591
2527static struct irq_chip ioapic_chip __read_mostly = { 2592static struct irq_chip ioapic_chip __read_mostly = {
2528 .name = "IO-APIC", 2593 .name = "IO-APIC",
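
irq_remap_modify_chip_defaults() replaces the separate IR-* irq_chip copies that the later hunks delete: rather than duplicating the whole structure, the remapping case overrides only the callbacks that differ. A small userspace sketch of that pattern:

#include <stdio.h>

struct irq_chip {
	const char *name;
	void (*irq_ack)(void);
	void (*irq_eoi)(void);
};

static void apic_ack(void) { puts("plain ack"); }
static void apic_eoi(void) { puts("plain eoi"); }
static void ir_ack(void)   { puts("remapped ack"); }
static void ir_eoi(void)   { puts("remapped eoi"); }

/* One base chip; the remapping case tweaks only what differs instead
 * of keeping a second, mostly duplicated struct. */
static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
{
	chip->irq_ack = ir_ack;
	chip->irq_eoi = ir_eoi;
}

int main(void)
{
	struct irq_chip chip = { "IO-APIC", apic_ack, apic_eoi };

	irq_remap_modify_chip_defaults(&chip);
	printf("%s:\n", chip.name);
	chip.irq_ack();
	chip.irq_eoi();
	return 0;
}
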
@@ -2537,21 +2602,6 @@ static struct irq_chip ioapic_chip __read_mostly = {
2537 .irq_retrigger = ioapic_retrigger_irq, 2602 .irq_retrigger = ioapic_retrigger_irq,
2538}; 2603};
2539 2604
2540static struct irq_chip ir_ioapic_chip __read_mostly = {
2541 .name = "IR-IO-APIC",
2542 .irq_startup = startup_ioapic_irq,
2543 .irq_mask = mask_ioapic_irq,
2544 .irq_unmask = unmask_ioapic_irq,
2545#ifdef CONFIG_INTR_REMAP
2546 .irq_ack = ir_ack_apic_edge,
2547 .irq_eoi = ir_ack_apic_level,
2548#ifdef CONFIG_SMP
2549 .irq_set_affinity = ir_ioapic_set_affinity,
2550#endif
2551#endif
2552 .irq_retrigger = ioapic_retrigger_irq,
2553};
2554
2555static inline void init_IO_APIC_traps(void) 2605static inline void init_IO_APIC_traps(void)
2556{ 2606{
2557 struct irq_cfg *cfg; 2607 struct irq_cfg *cfg;
@@ -3103,45 +3153,6 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3103 3153
3104 return 0; 3154 return 0;
3105} 3155}
3106#ifdef CONFIG_INTR_REMAP
3107/*
3108 * Migrate the MSI irq to another cpumask. This migration is
3109 * done in the process context using interrupt-remapping hardware.
3110 */
3111static int
3112ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3113 bool force)
3114{
3115 struct irq_cfg *cfg = data->chip_data;
3116 unsigned int dest, irq = data->irq;
3117 struct irte irte;
3118
3119 if (get_irte(irq, &irte))
3120 return -1;
3121
3122 if (__ioapic_set_affinity(data, mask, &dest))
3123 return -1;
3124
3125 irte.vector = cfg->vector;
3126 irte.dest_id = IRTE_DEST(dest);
3127
3128 /*
3129 * atomically update the IRTE with the new destination and vector.
3130 */
3131 modify_irte(irq, &irte);
3132
3133 /*
3134 * After this point, all the interrupts will start arriving
3135 * at the new destination. So, time to cleanup the previous
3136 * vector allocation.
3137 */
3138 if (cfg->move_in_progress)
3139 send_cleanup_vector(cfg);
3140
3141 return 0;
3142}
3143
3144#endif
3145#endif /* CONFIG_SMP */ 3156#endif /* CONFIG_SMP */
3146 3157
3147/* 3158/*
@@ -3159,19 +3170,6 @@ static struct irq_chip msi_chip = {
3159 .irq_retrigger = ioapic_retrigger_irq, 3170 .irq_retrigger = ioapic_retrigger_irq,
3160}; 3171};
3161 3172
3162static struct irq_chip msi_ir_chip = {
3163 .name = "IR-PCI-MSI",
3164 .irq_unmask = unmask_msi_irq,
3165 .irq_mask = mask_msi_irq,
3166#ifdef CONFIG_INTR_REMAP
3167 .irq_ack = ir_ack_apic_edge,
3168#ifdef CONFIG_SMP
3169 .irq_set_affinity = ir_msi_set_affinity,
3170#endif
3171#endif
3172 .irq_retrigger = ioapic_retrigger_irq,
3173};
3174
3175/* 3173/*
3176 * Map the PCI dev to the corresponding remapping hardware unit 3174 * Map the PCI dev to the corresponding remapping hardware unit
3177 * and allocate 'nvec' consecutive interrupt-remapping table entries 3175 * and allocate 'nvec' consecutive interrupt-remapping table entries
@@ -3214,7 +3212,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3214 3212
3215 if (irq_remapped(irq_get_chip_data(irq))) { 3213 if (irq_remapped(irq_get_chip_data(irq))) {
3216 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3214 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3217 chip = &msi_ir_chip; 3215 irq_remap_modify_chip_defaults(chip);
3218 } 3216 }
3219 3217
3220 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); 3218 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
@@ -3287,7 +3285,7 @@ void native_teardown_msi_irq(unsigned int irq)
3287 destroy_irq(irq); 3285 destroy_irq(irq);
3288} 3286}
3289 3287
3290#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3288#ifdef CONFIG_DMAR_TABLE
3291#ifdef CONFIG_SMP 3289#ifdef CONFIG_SMP
3292static int 3290static int
3293dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, 3291dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
@@ -3368,19 +3366,6 @@ static int hpet_msi_set_affinity(struct irq_data *data,
3368 3366
3369#endif /* CONFIG_SMP */ 3367#endif /* CONFIG_SMP */
3370 3368
3371static struct irq_chip ir_hpet_msi_type = {
3372 .name = "IR-HPET_MSI",
3373 .irq_unmask = hpet_msi_unmask,
3374 .irq_mask = hpet_msi_mask,
3375#ifdef CONFIG_INTR_REMAP
3376 .irq_ack = ir_ack_apic_edge,
3377#ifdef CONFIG_SMP
3378 .irq_set_affinity = ir_msi_set_affinity,
3379#endif
3380#endif
3381 .irq_retrigger = ioapic_retrigger_irq,
3382};
3383
3384static struct irq_chip hpet_msi_type = { 3369static struct irq_chip hpet_msi_type = {
3385 .name = "HPET_MSI", 3370 .name = "HPET_MSI",
3386 .irq_unmask = hpet_msi_unmask, 3371 .irq_unmask = hpet_msi_unmask,
@@ -3417,7 +3402,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3417 hpet_msi_write(irq_get_handler_data(irq), &msg); 3402 hpet_msi_write(irq_get_handler_data(irq), &msg);
3418 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3403 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3419 if (irq_remapped(irq_get_chip_data(irq))) 3404 if (irq_remapped(irq_get_chip_data(irq)))
3420 chip = &ir_hpet_msi_type; 3405 irq_remap_modify_chip_defaults(chip);
3421 3406
3422 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); 3407 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3423 return 0; 3408 return 0;
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index b5254ad044a..0787bb3412f 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -200,14 +200,8 @@ void __init default_setup_apic_routing(void)
200 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 200 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
201 */ 201 */
202 202
203 if (!cmdline_apic && apic == &apic_default) { 203 if (!cmdline_apic && apic == &apic_default)
204 struct apic *bigsmp = generic_bigsmp_probe(); 204 generic_bigsmp_probe();
205 if (bigsmp) {
206 apic = bigsmp;
207 printk(KERN_INFO "Overriding APIC driver with %s\n",
208 apic->name);
209 }
210 }
211#endif 205#endif
212 206
213 if (apic->setup_apic_routing) 207 if (apic->setup_apic_routing)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index adc66c3a1fe..cfeb978f49f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -207,7 +207,6 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
207 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 207 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
208 APIC_DM_INIT; 208 APIC_DM_INIT;
209 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 209 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
210 mdelay(10);
211 210
212 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 211 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
213 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 212 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
@@ -833,6 +832,10 @@ void __init uv_system_init(void)
833 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; 832 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
834 uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; 833 uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
835 834
835 uv_cpu_hub_info(cpu)->m_shift = 64 - m_val;
836 uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ?
837 (m_val == 40 ? 40 : 39) : m_val;
838
836 pnode = uv_apicid_to_pnode(apicid); 839 pnode = uv_apicid_to_pnode(apicid);
837 blade = boot_pnode_to_blade(pnode); 840 blade = boot_pnode_to_blade(pnode);
838 lcpu = uv_blade_info[blade].nr_possible_cpus; 841 lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -863,8 +866,7 @@ void __init uv_system_init(void)
863 if (uv_node_to_blade[nid] >= 0) 866 if (uv_node_to_blade[nid] >= 0)
864 continue; 867 continue;
865 paddr = node_start_pfn(nid) << PAGE_SHIFT; 868 paddr = node_start_pfn(nid) << PAGE_SHIFT;
866 paddr = uv_soc_phys_ram_to_gpa(paddr); 869 pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr));
867 pnode = (paddr >> m_val) & pnode_mask;
868 blade = boot_pnode_to_blade(pnode); 870 blade = boot_pnode_to_blade(pnode);
869 uv_node_to_blade[nid] = blade; 871 uv_node_to_blade[nid] = blade;
870 } 872 }
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 965a7666c28..0371c484bb8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,11 +229,11 @@
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h> 230#include <linux/acpi.h>
231#include <linux/syscore_ops.h> 231#include <linux/syscore_ops.h>
232#include <linux/i8253.h>
232 233
233#include <asm/system.h> 234#include <asm/system.h>
234#include <asm/uaccess.h> 235#include <asm/uaccess.h>
235#include <asm/desc.h> 236#include <asm/desc.h>
236#include <asm/i8253.h>
237#include <asm/olpc.h> 237#include <asm/olpc.h>
238#include <asm/paravirt.h> 238#include <asm/paravirt.h>
239#include <asm/reboot.h> 239#include <asm/reboot.h>
@@ -1220,11 +1220,11 @@ static void reinit_timer(void)
1220 1220
1221 raw_spin_lock_irqsave(&i8253_lock, flags); 1221 raw_spin_lock_irqsave(&i8253_lock, flags);
1222 /* set the clock to HZ */ 1222 /* set the clock to HZ */
1223 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1223 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1224 udelay(10); 1224 udelay(10);
1225 outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */ 1225 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
1226 udelay(10); 1226 udelay(10);
1227 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ 1227 outb_p(LATCH >> 8, PIT_CH0); /* MSB */
1228 udelay(10); 1228 udelay(10);
1229 raw_spin_unlock_irqrestore(&i8253_lock, flags); 1229 raw_spin_unlock_irqrestore(&i8253_lock, flags);
1230#endif 1230#endif
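
With the i8253 constants now coming from <linux/i8253.h>, reinit_timer() issues the same mode-2 programming sequence through plain outb_p(): control word 0x34, then the latch value LSB first. For reference, the latch arithmetic works out as below (PIT_TICK_RATE is the standard 1.193182 MHz i8254 input clock; HZ of 100 is only an example):

#include <stdio.h>

#define PIT_TICK_RATE	1193182u
#define HZ		100u
#define LATCH		((PIT_TICK_RATE + HZ / 2) / HZ)

int main(void)
{
	/* Mode 2 (rate generator), binary, LSB then MSB on channel 0:
	 * the same 0x34 / LSB / MSB sequence reinit_timer() issues. */
	printf("control word: 0x34\n");
	printf("LATCH = %u -> LSB 0x%02x, MSB 0x%02x\n",
	       LATCH, LATCH & 0xff, LATCH >> 8);
	return 0;
}
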
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6f..395a10e6806 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
63 BLANK(); 63 BLANK();
64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
65 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); 65 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
66 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
67 66
68 BLANK(); 67 BLANK();
69 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 68 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 525514cf33c..46674fbb62b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -62,6 +62,8 @@ static void __init check_fpu(void)
62 return; 62 return;
63 } 63 }
64 64
65 kernel_fpu_begin();
66
65 /* 67 /*
66 * trap_init() enabled FXSR and company _before_ testing for FP 68 * trap_init() enabled FXSR and company _before_ testing for FP
67 * problems here. 69 * problems here.
@@ -80,6 +82,8 @@ static void __init check_fpu(void)
80 : "=m" (*&fdiv_bug) 82 : "=m" (*&fdiv_bug)
81 : "m" (*&x), "m" (*&y)); 83 : "m" (*&x), "m" (*&y));
82 84
85 kernel_fpu_end();
86
83 boot_cpu_data.fdiv_bug = fdiv_bug; 87 boot_cpu_data.fdiv_bug = fdiv_bug;
84 if (boot_cpu_data.fdiv_bug) 88 if (boot_cpu_data.fdiv_bug)
85 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); 89 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
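
kernel_fpu_begin()/kernel_fpu_end() now bracket the FDIV-bug probe so the FPU state it clobbers is saved and restored. Stripped of the x87 asm, the probe amounts to the arithmetic below (userspace sketch; 4195835 and 3145727 are the commonly cited FDIV test operands, and the integer cast stands in for the fistpl that discards rounding noise):

#include <stdio.h>

int main(void)
{
	volatile double x = 4195835.0, y = 3145727.0;
	long fdiv_bug = (long)(x - (x / y) * y);	/* ~0 on a good FPU, ~256 with the bug */

	printf(fdiv_bug ? "Hmm, FPU with FDIV bug.\n" : "fdiv ok\n");
	return 0;
}
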
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22a073d7fbf..62184390a60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,7 +21,7 @@
21#include <linux/topology.h> 21#include <linux/topology.h>
22#include <linux/cpumask.h> 22#include <linux/cpumask.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/atomic.h> 24#include <linux/atomic.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8..755f64fb074 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
32 */ 32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] = 33static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM 35#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm, 36 &x86_hyper_xen_hvm,
39#endif 37#endif
38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv,
40}; 40};
41 41
42const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
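
Reordering the hypervisors[] array matters because detection is first-match, so the array order is a priority order and the Xen HVM check now runs before the generic VMware/Hyper-V ones. A userspace sketch of that first-match loop (the detect functions are stubs):

#include <stdio.h>

struct hypervisor {
	const char *name;
	int (*detect)(void);
};

static int detect_xen(void)    { return 0; }	/* pretend: not on Xen */
static int detect_vmware(void) { return 1; }	/* pretend: VMware leaf found */
static int detect_hyperv(void) { return 0; }

static const struct hypervisor xen    = { "Xen HVM",    detect_xen };
static const struct hypervisor vmware = { "VMware",     detect_vmware };
static const struct hypervisor hyperv = { "MS Hyper-V", detect_hyperv };

/* First match wins, so listing Xen first gives it priority. */
static const struct hypervisor *hypervisors[] = { &xen, &vmware, &hyperv };

int main(void)
{
	for (unsigned i = 0; i < sizeof(hypervisors) / sizeof(hypervisors[0]); i++)
		if (hypervisors[i]->detect()) {
			printf("Hypervisor detected: %s\n", hypervisors[i]->name);
			return 0;
		}
	puts("bare metal");
	return 0;
}
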
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1edf5ba4fb2..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
456 456
457 if (cpu_has(c, X86_FEATURE_VMX)) 457 if (cpu_has(c, X86_FEATURE_VMX))
458 detect_vmx_virtcap(c); 458 detect_vmx_virtcap(c);
459
460 /*
461 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
462 * x86_energy_perf_policy(8) is available to change it at run-time
463 */
464 if (cpu_has(c, X86_FEATURE_EPB)) {
465 u64 epb;
466
467 rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
468 if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
469 printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
470 " Set to 'normal', was 'performance'\n"
471 "ENERGY_PERF_BIAS: View and update with"
472 " x86_energy_perf_policy(8)\n");
473 epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
474 wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
475 }
476 }
459} 477}
460 478
461#ifdef CONFIG_X86_32 479#ifdef CONFIG_X86_32
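
The new init_intel() code resets a BIOS-left "performance" energy bias to "normal" by rewriting only the low nibble of MSR_IA32_ENERGY_PERF_BIAS. A userspace sketch of just the bit manipulation; treat the 0/6 encodings as an assumption of the sketch (the kernel takes them from msr-index.h):

#include <stdint.h>
#include <stdio.h>

#define ENERGY_PERF_BIAS_PERFORMANCE	0	/* assumed encoding */
#define ENERGY_PERF_BIAS_NORMAL		6	/* assumed encoding */

static uint64_t normalize_epb(uint64_t epb)
{
	/* Only the low nibble carries the bias; leave the rest alone. */
	if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE)
		epb = (epb & ~0xFULL) | ENERGY_PERF_BIAS_NORMAL;
	return epb;
}

int main(void)
{
	printf("0x%llx -> 0x%llx\n", 0xABC0ULL,
	       (unsigned long long)normalize_epb(0xABC0));
	printf("0x%llx -> 0x%llx\n", 0x6ULL,
	       (unsigned long long)normalize_epb(0x6));
	return 0;
}
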
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
43 unsigned char covered; 43 unsigned char covered;
44 char *msg; 44 char *msg;
45} severities[] = { 45} severities[] = {
46#define KERNEL .context = IN_KERNEL 46#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
47#define USER .context = IN_USER 47#define KERNEL .context = IN_KERNEL
48#define SER .ser = SER_REQUIRED 48#define USER .context = IN_USER
49#define NOSER .ser = NO_SER 49#define SER .ser = SER_REQUIRED
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY 50#define NOSER .ser = NO_SER
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } 51#define BITCLR(x) .mask = x, .result = 0
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } 52#define BITSET(x) .mask = x, .result = x
53#define MCGMASK(x, res, s, m, r...) \ 53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } 54#define MASK(x, y) .mask = x, .result = y
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff 57#define MCACOD 0xffff
60 58
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"), 59 MCESEV(
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"), 60 NO, "Invalid",
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), 61 BITCLR(MCI_STATUS_VAL)
62 ),
63 MCESEV(
64 NO, "Not enabled",
65 BITCLR(MCI_STATUS_EN)
66 ),
67 MCESEV(
68 PANIC, "Processor context corrupt",
69 BITSET(MCI_STATUS_PCC)
70 ),
64 /* When MCIP is not set something is very confused */ 71 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), 72 MCESEV(
73 PANIC, "MCIP not set in MCA handler",
74 MCGMASK(MCG_STATUS_MCIP, 0)
75 ),
66 /* Neither return nor error IP -- no chance to recover -> PANIC */ 76 MCESEV(
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, 77 MCESEV(
68 "Neither restart nor error IP"), 78 PANIC, "Neither restart nor error IP",
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", 79 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
70 KERNEL), 80 ),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), 81 MCESEV(
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, 82 PANIC, "In kernel and no restart IP",
73 "Spurious not enabled", SER), 83 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
84 ),
85 MCESEV(
86 KEEP, "Corrected error",
87 NOSER, BITCLR(MCI_STATUS_UC)
88 ),
74 89
75 /* ignore OVER for UCNA */ 90 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, 91 MCESEV(
77 "Uncorrected no action required", SER), 92 KEEP, "Uncorrected no action required",
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, 93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
79 "Illegal combination (UCNA with AR=1)", SER), 94 ),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), 95 MCESEV(
96 PANIC, "Illegal combination (UCNA with AR=1)",
97 SER,
98 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
99 ),
100 MCESEV(
101 KEEP, "Non signalled machine check",
102 SER, BITCLR(MCI_STATUS_S)
103 ),
81 104
82 /* AR add known MCACODs here */ 105 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, 106 MCESEV(
84 "Action required with lost events", SER), 107 PANIC, "Action required with lost events",
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, 108 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
86 "Action required; unknown MCACOD", SER), 109 ),
110 MCESEV(
111 PANIC, "Action required: unknown MCACOD",
112 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
113 ),
87 114
88 /* known AO MCACODs: */ 115 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, 116 MCESEV(
90 "Action optional: memory scrubbing error", SER), 117 AO, "Action optional: memory scrubbing error",
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, 118 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
92 "Action optional: last level cache writeback error", SER), 119 ),
93 120 MCESEV(
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, 121 AO, "Action optional: last level cache writeback error",
95 "Action optional unknown MCACOD", SER), 122 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, 123 ),
97 "Action optional with lost events", SER), 124 MCESEV(
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), 125 SOME, "Action optional: unknown MCACOD",
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"), 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
100 BITSET(0, SOME, "No match") /* always matches. keep at end */ 127 ),
128 MCESEV(
129 SOME, "Action optional with lost events",
130 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
131 ),
132
133 MCESEV(
134 PANIC, "Overflowed uncorrected",
135 BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
136 ),
137 MCESEV(
138 UC, "Uncorrected",
139 BITSET(MCI_STATUS_UC)
140 ),
141 MCESEV(
142 SOME, "No match",
143 BITSET(0)
144 ) /* always matches. keep at end */
101}; 145};
102 146
103/* 147/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
112 return IN_KERNEL; 156 return IN_KERNEL;
113} 157}
114 158
115int mce_severity(struct mce *a, int tolerant, char **msg) 159int mce_severity(struct mce *m, int tolerant, char **msg)
116{ 160{
117 enum context ctx = error_context(a); 161 enum context ctx = error_context(m);
118 struct severity *s; 162 struct severity *s;
119 163
120 for (s = severities;; s++) { 164 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result) 165 if ((m->status & s->mask) != s->result)
122 continue; 166 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres) 167 if ((m->mcgstatus & s->mcgmask) != s->mcgres)
124 continue; 168 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser) 169 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue; 170 continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
197 241
198static int __init severities_debugfs_init(void) 242static int __init severities_debugfs_init(void)
199{ 243{
200 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 244 struct dentry *dmce, *fsev;
201 245
202 dmce = mce_get_debugfs_dir(); 246 dmce = mce_get_debugfs_dir();
203 if (dmce == NULL) 247 if (!dmce)
204 goto err_out; 248 goto err_out;
205 fseverities_coverage = debugfs_create_file("severities-coverage", 249
206 0444, dmce, NULL, 250 fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
207 &severities_coverage_fops); 251 &severities_coverage_fops);
208 if (fseverities_coverage == NULL) 252 if (!fsev)
209 goto err_out; 253 goto err_out;
210 254
211 return 0; 255 return 0;
@@ -214,4 +258,4 @@ err_out:
214 return -ENOMEM; 258 return -ENOMEM;
215} 259}
216late_initcall(severities_debugfs_init); 260late_initcall(severities_debugfs_init);
217#endif 261#endif /* CONFIG_DEBUG_FS */
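
The rewritten table keeps the same first-match semantics but lets MCESEV() fix the severity and message while each entry lists only the conditions it cares about. A self-contained sketch of that table-driven grading; the status bits and rule set below are illustrative, not the architectural MCi_STATUS layout:

#include <stdint.h>
#include <stdio.h>

#define ST_VAL	(1u << 0)
#define ST_EN	(1u << 1)
#define ST_UC	(1u << 2)
#define ST_PCC	(1u << 3)

enum sev { SEV_NO, SEV_KEEP, SEV_SOME, SEV_UC, SEV_PANIC };

struct rule {
	enum sev	sev;
	const char	*msg;
	uint32_t	mask, result;
};

/* Same trick as MCESEV(): the macro fixes the severity and message and
 * lets each entry supply only its matching conditions. */
#define RULE(s, m, c...)	{ .sev = SEV_ ## s, .msg = m, ## c }
#define BITCLR(x)		.mask = (x), .result = 0
#define BITSET(x)		.mask = (x), .result = (x)

static const struct rule rules[] = {
	RULE(NO,    "Invalid",                   BITCLR(ST_VAL)),
	RULE(NO,    "Not enabled",               BITCLR(ST_EN)),
	RULE(PANIC, "Processor context corrupt", BITSET(ST_PCC)),
	RULE(KEEP,  "Corrected error",           BITCLR(ST_UC)),
	RULE(UC,    "Uncorrected",               BITSET(ST_UC)),
	RULE(SOME,  "No match",                  BITSET(0)),	/* keep at end */
};

static const struct rule *grade(uint32_t status)
{
	const struct rule *r;

	for (r = rules; ; r++)
		if ((status & r->mask) == r->result)
			return r;
}

int main(void)
{
	printf("%s\n", grade(ST_VAL | ST_EN | ST_UC | ST_PCC)->msg);
	printf("%s\n", grade(ST_VAL | ST_EN)->msg);
	return 0;
}

The open-ended loop relies on the catch-all last entry always matching, the same invariant the kernel table keeps.
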
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b6464..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
10#include <linux/thread_info.h> 10#include <linux/thread_info.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/miscdevice.h> 12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h> 13#include <linux/ratelimit.h>
15#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
16#include <linux/rcupdate.h> 15#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
38#include <linux/mm.h> 37#include <linux/mm.h>
39#include <linux/debugfs.h> 38#include <linux/debugfs.h>
40#include <linux/edac_mce.h> 39#include <linux/edac_mce.h>
40#include <linux/irq_work.h>
41 41
42#include <asm/processor.h> 42#include <asm/processor.h>
43#include <asm/hw_irq.h>
44#include <asm/apic.h>
45#include <asm/idle.h>
46#include <asm/ipi.h>
47#include <asm/mce.h> 43#include <asm/mce.h>
48#include <asm/msr.h> 44#include <asm/msr.h>
49 45
50#include "mce-internal.h" 46#include "mce-internal.h"
51 47
52static DEFINE_MUTEX(mce_read_mutex); 48static DEFINE_MUTEX(mce_chrdev_read_mutex);
53 49
54#define rcu_dereference_check_mce(p) \ 50#define rcu_dereference_check_mce(p) \
55 rcu_dereference_index_check((p), \ 51 rcu_dereference_index_check((p), \
56 rcu_read_lock_sched_held() || \ 52 rcu_read_lock_sched_held() || \
57 lockdep_is_held(&mce_read_mutex)) 53 lockdep_is_held(&mce_chrdev_read_mutex))
58 54
59#define CREATE_TRACE_POINTS 55#define CREATE_TRACE_POINTS
60#include <trace/events/mce.h> 56#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
94static char mce_helper[128]; 90static char mce_helper[128];
95static char *mce_helper_argv[2] = { mce_helper, NULL }; 91static char *mce_helper_argv[2] = { mce_helper, NULL };
96 92
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
94
98static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 96static int cpu_missing;
100 97
@@ -373,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
373} 370}
374 371
375/* 372/*
373 * Collect all global (w.r.t. this processor) status about this machine
374 * check into our "mce" struct so that we can use it later to assess
375 * the severity of the problem as we read per-bank specific details.
376 */
377static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
378{
379 mce_setup(m);
380
381 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
382 if (regs) {
383 /*
384 * Get the address of the instruction at the time of
385 * the machine check error.
386 */
387 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
388 m->ip = regs->ip;
389 m->cs = regs->cs;
390 }
391 /* Use accurate RIP reporting if available. */
392 if (rip_msr)
393 m->ip = mce_rdmsrl(rip_msr);
394 }
395}
396
397/*
376 * Simple lockless ring to communicate PFNs from the exception handler with the 398 * Simple lockless ring to communicate PFNs from the exception handler with the
377 * process context work function. This is vastly simplified because there's 399 * process context work function. This is vastly simplified because there's
378 * only a single reader and a single writer. 400 * only a single reader and a single writer.
@@ -443,40 +465,13 @@ static void mce_schedule_work(void)
443 } 465 }
444} 466}
445 467
446/* 468DEFINE_PER_CPU(struct irq_work, mce_irq_work);
447 * Get the address of the instruction at the time of the machine check
448 * error.
449 */
450static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
451{
452
453 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
454 m->ip = regs->ip;
455 m->cs = regs->cs;
456 } else {
457 m->ip = 0;
458 m->cs = 0;
459 }
460 if (rip_msr)
461 m->ip = mce_rdmsrl(rip_msr);
462}
463 469
464#ifdef CONFIG_X86_LOCAL_APIC 470static void mce_irq_work_cb(struct irq_work *entry)
465/*
466 * Called after interrupts have been reenabled again
467 * when a MCE happened during an interrupts off region
468 * in the kernel.
469 */
470asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
471{ 471{
472 ack_APIC_irq();
473 exit_idle();
474 irq_enter();
475 mce_notify_irq(); 472 mce_notify_irq();
476 mce_schedule_work(); 473 mce_schedule_work();
477 irq_exit();
478} 474}
479#endif
480 475
481static void mce_report_event(struct pt_regs *regs) 476static void mce_report_event(struct pt_regs *regs)
482{ 477{
@@ -492,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
492 return; 487 return;
493 } 488 }
494 489
495#ifdef CONFIG_X86_LOCAL_APIC 490 irq_work_queue(&__get_cpu_var(mce_irq_work));
496 /*
497 * Without APIC do not notify. The event will be picked
498 * up eventually.
499 */
500 if (!cpu_has_apic)
501 return;
502
503 /*
504 * When interrupts are disabled we cannot use
505 * kernel services safely. Trigger an self interrupt
506 * through the APIC to instead do the notification
507 * after interrupts are reenabled again.
508 */
509 apic->send_IPI_self(MCE_SELF_VECTOR);
510
511 /*
512 * Wait for idle afterwards again so that we don't leave the
513 * APIC in a non idle state because the normal APIC writes
514 * cannot exclude us.
515 */
516 apic_wait_icr_idle();
517#endif
518} 491}
519 492
520DEFINE_PER_CPU(unsigned, mce_poll_count); 493DEFINE_PER_CPU(unsigned, mce_poll_count);
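
Switching mce_report_event() to irq_work_queue() drops the hand-rolled self-IPI: the exception-time code only marks work pending, and the notification runs later from a safe context. A userspace analogue of that deferral (a signal handler standing in for the restricted context; this is not the kernel API, just the shape of it):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t work_pending;

static void handler(int sig)
{
	(void)sig;
	work_pending = 1;	/* the only thing it is safe to do here */
}

static void mce_notify(void)
{
	puts("processing deferred machine-check notification");
}

int main(void)
{
	signal(SIGALRM, handler);
	alarm(1);

	while (!work_pending)
		;		/* a real program would sleep here */

	mce_notify();
	return 0;
}
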
@@ -541,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
541 514
542 percpu_inc(mce_poll_count); 515 percpu_inc(mce_poll_count);
543 516
544 mce_setup(&m); 517 mce_gather_info(&m, NULL);
545 518
546 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
547 for (i = 0; i < banks; i++) { 519 for (i = 0; i < banks; i++) {
548 if (!mce_banks[i].ctl || !test_bit(i, *b)) 520 if (!mce_banks[i].ctl || !test_bit(i, *b))
549 continue; 521 continue;
@@ -879,9 +851,9 @@ static int mce_usable_address(struct mce *m)
879{ 851{
880 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 852 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
881 return 0; 853 return 0;
882 if ((m->misc & 0x3f) > PAGE_SHIFT) 854 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
883 return 0; 855 return 0;
884 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 856 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
885 return 0; 857 return 0;
886 return 1; 858 return 1;
887} 859}
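
mce_usable_address() now uses the MCI_MISC_ADDR_LSB()/MCI_MISC_ADDR_MODE() helpers in place of the open-coded shifts visible in the removed lines. A userspace sketch of that decode; the physical-address mode value of 2 and PAGE_SHIFT of 12 are assumptions of the sketch:

#include <stdint.h>
#include <stdio.h>

/* Low 6 bits: least significant valid address bit; next 3 bits: mode. */
#define MCI_MISC_ADDR_LSB(m)	((m) & 0x3f)
#define MCI_MISC_ADDR_MODE(m)	(((m) >> 6) & 7)
#define MCI_MISC_ADDR_PHYS	2	/* assumed encoding */
#define PAGE_SHIFT		12	/* assumed */

int main(void)
{
	uint64_t misc = ((uint64_t)MCI_MISC_ADDR_PHYS << 6) | 12;

	printf("lsb=%llu mode=%llu usable=%d\n",
	       (unsigned long long)MCI_MISC_ADDR_LSB(misc),
	       (unsigned long long)MCI_MISC_ADDR_MODE(misc),
	       MCI_MISC_ADDR_MODE(misc) == MCI_MISC_ADDR_PHYS &&
	       MCI_MISC_ADDR_LSB(misc) <= PAGE_SHIFT);
	return 0;
}
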
@@ -942,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
942 if (!banks) 914 if (!banks)
943 goto out; 915 goto out;
944 916
945 mce_setup(&m); 917 mce_gather_info(&m, regs);
946 918
947 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
948 final = &__get_cpu_var(mces_seen); 919 final = &__get_cpu_var(mces_seen);
949 *final = m; 920 *final = m;
950 921
@@ -1028,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1028 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 999 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1029 mce_ring_add(m.addr >> PAGE_SHIFT); 1000 mce_ring_add(m.addr >> PAGE_SHIFT);
1030 1001
1031 mce_get_rip(&m, regs);
1032 mce_log(&m); 1002 mce_log(&m);
1033 1003
1034 if (severity > worst) { 1004 if (severity > worst) {
@@ -1190,7 +1160,8 @@ int mce_notify_irq(void)
1190 clear_thread_flag(TIF_MCE_NOTIFY); 1160 clear_thread_flag(TIF_MCE_NOTIFY);
1191 1161
1192 if (test_and_clear_bit(0, &mce_need_notify)) { 1162 if (test_and_clear_bit(0, &mce_need_notify)) {
1193 wake_up_interruptible(&mce_wait); 1163 /* wake processes polling /dev/mcelog */
1164 wake_up_interruptible(&mce_chrdev_wait);
1194 1165
1195 /* 1166 /*
1196 * There is no risk of missing notifications because 1167 * There is no risk of missing notifications because
@@ -1363,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1363 return 0; 1334 return 0;
1364} 1335}
1365 1336
1366static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1337static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1367{ 1338{
1368 if (c->x86 != 5) 1339 if (c->x86 != 5)
1369 return; 1340 return 0;
1341
1370 switch (c->x86_vendor) { 1342 switch (c->x86_vendor) {
1371 case X86_VENDOR_INTEL: 1343 case X86_VENDOR_INTEL:
1372 intel_p5_mcheck_init(c); 1344 intel_p5_mcheck_init(c);
1345 return 1;
1373 break; 1346 break;
1374 case X86_VENDOR_CENTAUR: 1347 case X86_VENDOR_CENTAUR:
1375 winchip_mcheck_init(c); 1348 winchip_mcheck_init(c);
1349 return 1;
1376 break; 1350 break;
1377 } 1351 }
1352
1353 return 0;
1378} 1354}
1379 1355
1380static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1356static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1428,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1428 if (mce_disabled) 1404 if (mce_disabled)
1429 return; 1405 return;
1430 1406
1431 __mcheck_cpu_ancient_init(c); 1407 if (__mcheck_cpu_ancient_init(c))
1408 return;
1432 1409
1433 if (!mce_available(c)) 1410 if (!mce_available(c))
1434 return; 1411 return;
@@ -1444,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1444 __mcheck_cpu_init_vendor(c); 1421 __mcheck_cpu_init_vendor(c);
1445 __mcheck_cpu_init_timer(); 1422 __mcheck_cpu_init_timer();
1446 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447 1424 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1448} 1425}
1449 1426
1450/* 1427/*
1451 * Character device to read and clear the MCE log. 1428 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1452 */ 1429 */
1453 1430
1454static DEFINE_SPINLOCK(mce_state_lock); 1431static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1455static int open_count; /* #times opened */ 1432static int mce_chrdev_open_count; /* #times opened */
1456static int open_exclu; /* already open exclusive? */ 1433static int mce_chrdev_open_exclu; /* already open exclusive? */
1457 1434
1458static int mce_open(struct inode *inode, struct file *file) 1435static int mce_chrdev_open(struct inode *inode, struct file *file)
1459{ 1436{
1460 spin_lock(&mce_state_lock); 1437 spin_lock(&mce_chrdev_state_lock);
1461 1438
1462 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1439 if (mce_chrdev_open_exclu ||
1463 spin_unlock(&mce_state_lock); 1440 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1441 spin_unlock(&mce_chrdev_state_lock);
1464 1442
1465 return -EBUSY; 1443 return -EBUSY;
1466 } 1444 }
1467 1445
1468 if (file->f_flags & O_EXCL) 1446 if (file->f_flags & O_EXCL)
1469 open_exclu = 1; 1447 mce_chrdev_open_exclu = 1;
1470 open_count++; 1448 mce_chrdev_open_count++;
1471 1449
1472 spin_unlock(&mce_state_lock); 1450 spin_unlock(&mce_chrdev_state_lock);
1473 1451
1474 return nonseekable_open(inode, file); 1452 return nonseekable_open(inode, file);
1475} 1453}
1476 1454
1477static int mce_release(struct inode *inode, struct file *file) 1455static int mce_chrdev_release(struct inode *inode, struct file *file)
1478{ 1456{
1479 spin_lock(&mce_state_lock); 1457 spin_lock(&mce_chrdev_state_lock);
1480 1458
1481 open_count--; 1459 mce_chrdev_open_count--;
1482 open_exclu = 0; 1460 mce_chrdev_open_exclu = 0;
1483 1461
1484 spin_unlock(&mce_state_lock); 1462 spin_unlock(&mce_chrdev_state_lock);
1485 1463
1486 return 0; 1464 return 0;
1487} 1465}
@@ -1530,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1530 return 0; 1508 return 0;
1531} 1509}
1532 1510
1533static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1511static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1534 loff_t *off) 1512 size_t usize, loff_t *off)
1535{ 1513{
1536 char __user *buf = ubuf; 1514 char __user *buf = ubuf;
1537 unsigned long *cpu_tsc; 1515 unsigned long *cpu_tsc;
@@ -1542,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1542 if (!cpu_tsc) 1520 if (!cpu_tsc)
1543 return -ENOMEM; 1521 return -ENOMEM;
1544 1522
1545 mutex_lock(&mce_read_mutex); 1523 mutex_lock(&mce_chrdev_read_mutex);
1546 1524
1547 if (!mce_apei_read_done) { 1525 if (!mce_apei_read_done) {
1548 err = __mce_read_apei(&buf, usize); 1526 err = __mce_read_apei(&buf, usize);
@@ -1562,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1562 do { 1540 do {
1563 for (i = prev; i < next; i++) { 1541 for (i = prev; i < next; i++) {
1564 unsigned long start = jiffies; 1542 unsigned long start = jiffies;
1543 struct mce *m = &mcelog.entry[i];
1565 1544
1566 while (!mcelog.entry[i].finished) { 1545 while (!m->finished) {
1567 if (time_after_eq(jiffies, start + 2)) { 1546 if (time_after_eq(jiffies, start + 2)) {
1568 memset(mcelog.entry + i, 0, 1547 memset(m, 0, sizeof(*m));
1569 sizeof(struct mce));
1570 goto timeout; 1548 goto timeout;
1571 } 1549 }
1572 cpu_relax(); 1550 cpu_relax();
1573 } 1551 }
1574 smp_rmb(); 1552 smp_rmb();
1575 err |= copy_to_user(buf, mcelog.entry + i, 1553 err |= copy_to_user(buf, m, sizeof(*m));
1576 sizeof(struct mce)); 1554 buf += sizeof(*m);
1577 buf += sizeof(struct mce);
1578timeout: 1555timeout:
1579 ; 1556 ;
1580 } 1557 }
@@ -1594,13 +1571,13 @@ timeout:
1594 on_each_cpu(collect_tscs, cpu_tsc, 1); 1571 on_each_cpu(collect_tscs, cpu_tsc, 1);
1595 1572
1596 for (i = next; i < MCE_LOG_LEN; i++) { 1573 for (i = next; i < MCE_LOG_LEN; i++) {
1597 if (mcelog.entry[i].finished && 1574 struct mce *m = &mcelog.entry[i];
1598 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1575
1599 err |= copy_to_user(buf, mcelog.entry+i, 1576 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1600 sizeof(struct mce)); 1577 err |= copy_to_user(buf, m, sizeof(*m));
1601 smp_rmb(); 1578 smp_rmb();
1602 buf += sizeof(struct mce); 1579 buf += sizeof(*m);
1603 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1580 memset(m, 0, sizeof(*m));
1604 } 1581 }
1605 } 1582 }
1606 1583
@@ -1608,15 +1585,15 @@ timeout:
1608 err = -EFAULT; 1585 err = -EFAULT;
1609 1586
1610out: 1587out:
1611 mutex_unlock(&mce_read_mutex); 1588 mutex_unlock(&mce_chrdev_read_mutex);
1612 kfree(cpu_tsc); 1589 kfree(cpu_tsc);
1613 1590
1614 return err ? err : buf - ubuf; 1591 return err ? err : buf - ubuf;
1615} 1592}
1616 1593
1617static unsigned int mce_poll(struct file *file, poll_table *wait) 1594static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1618{ 1595{
1619 poll_wait(file, &mce_wait, wait); 1596 poll_wait(file, &mce_chrdev_wait, wait);
1620 if (rcu_access_index(mcelog.next)) 1597 if (rcu_access_index(mcelog.next))
1621 return POLLIN | POLLRDNORM; 1598 return POLLIN | POLLRDNORM;
1622 if (!mce_apei_read_done && apei_check_mce()) 1599 if (!mce_apei_read_done && apei_check_mce())
@@ -1624,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1624 return 0; 1601 return 0;
1625} 1602}
1626 1603
1627static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1604static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1605 unsigned long arg)
1628{ 1606{
1629 int __user *p = (int __user *)arg; 1607 int __user *p = (int __user *)arg;
1630 1608
@@ -1652,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1652 1630
1653/* Modified in mce-inject.c, so not static or const */ 1631/* Modified in mce-inject.c, so not static or const */
1654struct file_operations mce_chrdev_ops = { 1632struct file_operations mce_chrdev_ops = {
1655 .open = mce_open, 1633 .open = mce_chrdev_open,
1656 .release = mce_release, 1634 .release = mce_chrdev_release,
1657 .read = mce_read, 1635 .read = mce_chrdev_read,
1658 .poll = mce_poll, 1636 .poll = mce_chrdev_poll,
1659 .unlocked_ioctl = mce_ioctl, 1637 .unlocked_ioctl = mce_chrdev_ioctl,
1660 .llseek = no_llseek, 1638 .llseek = no_llseek,
1661}; 1639};
1662EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1640EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1663 1641
1664static struct miscdevice mce_log_device = { 1642static struct miscdevice mce_chrdev_device = {
1665 MISC_MCELOG_MINOR, 1643 MISC_MCELOG_MINOR,
1666 "mcelog", 1644 "mcelog",
1667 &mce_chrdev_ops, 1645 &mce_chrdev_ops,
@@ -1719,7 +1697,7 @@ int __init mcheck_init(void)
1719} 1697}
1720 1698
1721/* 1699/*
1722 * Sysfs support 1700 * mce_syscore: PM support
1723 */ 1701 */
1724 1702
1725/* 1703/*
@@ -1739,12 +1717,12 @@ static int mce_disable_error_reporting(void)
1739 return 0; 1717 return 0;
1740} 1718}
1741 1719
1742static int mce_suspend(void) 1720static int mce_syscore_suspend(void)
1743{ 1721{
1744 return mce_disable_error_reporting(); 1722 return mce_disable_error_reporting();
1745} 1723}
1746 1724
1747static void mce_shutdown(void) 1725static void mce_syscore_shutdown(void)
1748{ 1726{
1749 mce_disable_error_reporting(); 1727 mce_disable_error_reporting();
1750} 1728}
@@ -1754,18 +1732,22 @@ static void mce_shutdown(void)
1754 * Only one CPU is active at this time, the others get re-added later using 1732 * Only one CPU is active at this time, the others get re-added later using
1755 * CPU hotplug: 1733 * CPU hotplug:
1756 */ 1734 */
1757static void mce_resume(void) 1735static void mce_syscore_resume(void)
1758{ 1736{
1759 __mcheck_cpu_init_generic(); 1737 __mcheck_cpu_init_generic();
1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1738 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1761} 1739}
1762 1740
1763static struct syscore_ops mce_syscore_ops = { 1741static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend, 1742 .suspend = mce_syscore_suspend,
1765 .shutdown = mce_shutdown, 1743 .shutdown = mce_syscore_shutdown,
1766 .resume = mce_resume, 1744 .resume = mce_syscore_resume,
1767}; 1745};
1768 1746
1747/*
1748 * mce_sysdev: Sysfs support
1749 */
1750
1769static void mce_cpu_restart(void *data) 1751static void mce_cpu_restart(void *data)
1770{ 1752{
1771 del_timer_sync(&__get_cpu_var(mce_timer)); 1753 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1801,11 +1783,11 @@ static void mce_enable_ce(void *all)
1801 __mcheck_cpu_init_timer(); 1783 __mcheck_cpu_init_timer();
1802} 1784}
1803 1785
1804static struct sysdev_class mce_sysclass = { 1786static struct sysdev_class mce_sysdev_class = {
1805 .name = "machinecheck", 1787 .name = "machinecheck",
1806}; 1788};
1807 1789
1808DEFINE_PER_CPU(struct sys_device, mce_dev); 1790DEFINE_PER_CPU(struct sys_device, mce_sysdev);
1809 1791
1810__cpuinitdata 1792__cpuinitdata
1811void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1793void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1934,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
1934 &mce_cmci_disabled 1916 &mce_cmci_disabled
1935}; 1917};
1936 1918
1937static struct sysdev_attribute *mce_attrs[] = { 1919static struct sysdev_attribute *mce_sysdev_attrs[] = {
1938 &attr_tolerant.attr, 1920 &attr_tolerant.attr,
1939 &attr_check_interval.attr, 1921 &attr_check_interval.attr,
1940 &attr_trigger, 1922 &attr_trigger,
@@ -1945,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
1945 NULL 1927 NULL
1946}; 1928};
1947 1929
1948static cpumask_var_t mce_dev_initialized; 1930static cpumask_var_t mce_sysdev_initialized;
1949 1931
1950/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1932/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1951static __cpuinit int mce_create_device(unsigned int cpu) 1933static __cpuinit int mce_sysdev_create(unsigned int cpu)
1952{ 1934{
1935 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1953 int err; 1936 int err;
1954 int i, j; 1937 int i, j;
1955 1938
1956 if (!mce_available(&boot_cpu_data)) 1939 if (!mce_available(&boot_cpu_data))
1957 return -EIO; 1940 return -EIO;
1958 1941
1959 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1942 memset(&sysdev->kobj, 0, sizeof(struct kobject));
1960 per_cpu(mce_dev, cpu).id = cpu; 1943 sysdev->id = cpu;
1961 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1944 sysdev->cls = &mce_sysdev_class;
1962 1945
1963 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1946 err = sysdev_register(sysdev);
1964 if (err) 1947 if (err)
1965 return err; 1948 return err;
1966 1949
1967 for (i = 0; mce_attrs[i]; i++) { 1950 for (i = 0; mce_sysdev_attrs[i]; i++) {
1968 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1951 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
1969 if (err) 1952 if (err)
1970 goto error; 1953 goto error;
1971 } 1954 }
1972 for (j = 0; j < banks; j++) { 1955 for (j = 0; j < banks; j++) {
1973 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1956 err = sysdev_create_file(sysdev, &mce_banks[j].attr);
1974 &mce_banks[j].attr);
1975 if (err) 1957 if (err)
1976 goto error2; 1958 goto error2;
1977 } 1959 }
1978 cpumask_set_cpu(cpu, mce_dev_initialized); 1960 cpumask_set_cpu(cpu, mce_sysdev_initialized);
1979 1961
1980 return 0; 1962 return 0;
1981error2: 1963error2:
1982 while (--j >= 0) 1964 while (--j >= 0)
1983 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1965 sysdev_remove_file(sysdev, &mce_banks[j].attr);
1984error: 1966error:
1985 while (--i >= 0) 1967 while (--i >= 0)
1986 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1968 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
1987 1969
1988 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1970 sysdev_unregister(sysdev);
1989 1971
1990 return err; 1972 return err;
1991} 1973}
1992 1974
1993static __cpuinit void mce_remove_device(unsigned int cpu) 1975static __cpuinit void mce_sysdev_remove(unsigned int cpu)
1994{ 1976{
1977 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1995 int i; 1978 int i;
1996 1979
1997 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1980 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
1998 return; 1981 return;
1999 1982
2000 for (i = 0; mce_attrs[i]; i++) 1983 for (i = 0; mce_sysdev_attrs[i]; i++)
2001 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1984 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
2002 1985
2003 for (i = 0; i < banks; i++) 1986 for (i = 0; i < banks; i++)
2004 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1987 sysdev_remove_file(sysdev, &mce_banks[i].attr);
2005 1988
2006 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1989 sysdev_unregister(sysdev);
2007 cpumask_clear_cpu(cpu, mce_dev_initialized); 1990 cpumask_clear_cpu(cpu, mce_sysdev_initialized);
2008} 1991}
2009 1992
2010/* Make sure there are no machine checks on offlined CPUs. */ 1993/* Make sure there are no machine checks on offlined CPUs. */
@@ -2054,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2054 switch (action) { 2037 switch (action) {
2055 case CPU_ONLINE: 2038 case CPU_ONLINE:
2056 case CPU_ONLINE_FROZEN: 2039 case CPU_ONLINE_FROZEN:
2057 mce_create_device(cpu); 2040 mce_sysdev_create(cpu);
2058 if (threshold_cpu_callback) 2041 if (threshold_cpu_callback)
2059 threshold_cpu_callback(action, cpu); 2042 threshold_cpu_callback(action, cpu);
2060 break; 2043 break;
@@ -2062,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2062 case CPU_DEAD_FROZEN: 2045 case CPU_DEAD_FROZEN:
2063 if (threshold_cpu_callback) 2046 if (threshold_cpu_callback)
2064 threshold_cpu_callback(action, cpu); 2047 threshold_cpu_callback(action, cpu);
2065 mce_remove_device(cpu); 2048 mce_sysdev_remove(cpu);
2066 break; 2049 break;
2067 case CPU_DOWN_PREPARE: 2050 case CPU_DOWN_PREPARE:
2068 case CPU_DOWN_PREPARE_FROZEN: 2051 case CPU_DOWN_PREPARE_FROZEN:
@@ -2116,27 +2099,28 @@ static __init int mcheck_init_device(void)
2116 if (!mce_available(&boot_cpu_data)) 2099 if (!mce_available(&boot_cpu_data))
2117 return -EIO; 2100 return -EIO;
2118 2101
2119 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2102 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
2120 2103
2121 mce_init_banks(); 2104 mce_init_banks();
2122 2105
2123 err = sysdev_class_register(&mce_sysclass); 2106 err = sysdev_class_register(&mce_sysdev_class);
2124 if (err) 2107 if (err)
2125 return err; 2108 return err;
2126 2109
2127 for_each_online_cpu(i) { 2110 for_each_online_cpu(i) {
2128 err = mce_create_device(i); 2111 err = mce_sysdev_create(i);
2129 if (err) 2112 if (err)
2130 return err; 2113 return err;
2131 } 2114 }
2132 2115
2133 register_syscore_ops(&mce_syscore_ops); 2116 register_syscore_ops(&mce_syscore_ops);
2134 register_hotcpu_notifier(&mce_cpu_notifier); 2117 register_hotcpu_notifier(&mce_cpu_notifier);
2135 misc_register(&mce_log_device); 2118
2119 /* register character device /dev/mcelog */
2120 misc_register(&mce_chrdev_device);
2136 2121
2137 return err; 2122 return err;
2138} 2123}
2139
2140device_initcall(mcheck_init_device); 2124device_initcall(mcheck_init_device);
2141 2125
2142/* 2126/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad3514..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
548 if (!b) 548 if (!b)
549 goto out; 549 goto out;
550 550
551 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, 551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
552 b->kobj, name); 552 b->kobj, name);
553 if (err) 553 if (err)
554 goto out; 554 goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571 goto out; 571 goto out;
572 } 572 }
573 573
574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); 574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
575 if (!b->kobj) 575 if (!b->kobj)
576 goto out_free; 576 goto out_free;
577 577
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
591 if (i == cpu) 591 if (i == cpu)
592 continue; 592 continue;
593 593
594 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, 594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
595 b->kobj, name); 595 b->kobj, name);
596 if (err) 596 if (err)
597 goto out; 597 goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
669#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
670 /* sibling symlink */ 670 /* sibling symlink */
671 if (shared_bank[bank] && b->blocks->cpu != cpu) { 671 if (shared_bank[bank] && b->blocks->cpu != cpu) {
672 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); 672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
673 per_cpu(threshold_banks, cpu)[bank] = NULL; 673 per_cpu(threshold_banks, cpu)[bank] = NULL;
674 674
675 return; 675 return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
681 if (i == cpu) 681 if (i == cpu)
682 continue; 682 continue;
683 683
684 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); 684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 685 per_cpu(threshold_banks, i)[bank] = NULL;
686 } 686 }
687 687
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..6b96110bb0c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
79static int have_wrcomb(void) 79static int have_wrcomb(void)
80{ 80{
81 struct pci_dev *dev; 81 struct pci_dev *dev;
82 u8 rev;
83 82
84 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); 83 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
85 if (dev != NULL) { 84 if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
89 * chipsets to be tagged 88 * chipsets to be tagged
90 */ 89 */
91 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 90 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
92 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { 91 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
93 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 92 dev->revision <= 5) {
94 if (rev <= 5) { 93 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
95 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); 94 pci_dev_put(dev);
96 pci_dev_put(dev); 95 return 0;
97 return 0;
98 }
99 } 96 }
100 /* 97 /*
101 * Intel 450NX errata # 23. Non ascending cacheline evictions to 98 * Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,56 +134,42 @@ static void __init init_table(void)
137} 134}
138 135
139struct set_mtrr_data { 136struct set_mtrr_data {
140 atomic_t count;
141 atomic_t gate;
142 unsigned long smp_base; 137 unsigned long smp_base;
143 unsigned long smp_size; 138 unsigned long smp_size;
144 unsigned int smp_reg; 139 unsigned int smp_reg;
145 mtrr_type smp_type; 140 mtrr_type smp_type;
146}; 141};
147 142
148static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
149
150/** 143/**
151 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. 144 * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
145 * by all the CPUs.
152 * @info: pointer to mtrr configuration data 146 * @info: pointer to mtrr configuration data
153 * 147 *
154 * Returns nothing. 148 * Returns nothing.
155 */ 149 */
156static int mtrr_work_handler(void *info) 150static int mtrr_rendezvous_handler(void *info)
157{ 151{
158#ifdef CONFIG_SMP
159 struct set_mtrr_data *data = info; 152 struct set_mtrr_data *data = info;
160 unsigned long flags;
161 153
162 atomic_dec(&data->count); 154 /*
163 while (!atomic_read(&data->gate)) 155 * We use this same function to initialize the mtrrs during boot,
164 cpu_relax(); 156 * resume, runtime cpu online and on an explicit request to set a
165 157 * specific MTRR.
166 local_irq_save(flags); 158 *
167 159 * During boot or suspend, the state of the boot cpu's mtrrs has been
168 atomic_dec(&data->count); 160 * saved, and we want to replicate that across all the cpus that come
169 while (atomic_read(&data->gate)) 161 * online (either at the end of boot or resume or during a runtime cpu
170 cpu_relax(); 162 * online). If we're doing that, @reg is set to something special and on
171 163 * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
172 /* The master has cleared me to execute */ 164 * started the boot/resume sequence, this might be a duplicate
165 * set_all()).
166 */
173 if (data->smp_reg != ~0U) { 167 if (data->smp_reg != ~0U) {
174 mtrr_if->set(data->smp_reg, data->smp_base, 168 mtrr_if->set(data->smp_reg, data->smp_base,
175 data->smp_size, data->smp_type); 169 data->smp_size, data->smp_type);
176 } else if (mtrr_aps_delayed_init) { 170 } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
177 /*
178 * Initialize the MTRRs inaddition to the synchronisation.
179 */
180 mtrr_if->set_all(); 171 mtrr_if->set_all();
181 } 172 }
182
183 atomic_dec(&data->count);
184 while (!atomic_read(&data->gate))
185 cpu_relax();
186
187 atomic_dec(&data->count);
188 local_irq_restore(flags);
189#endif
190 return 0; 173 return 0;
191} 174}
192 175
@@ -223,20 +206,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
223 * 14. Wait for buddies to catch up 206 * 14. Wait for buddies to catch up
224 * 15. Enable interrupts. 207 * 15. Enable interrupts.
225 * 208 *
226 * What does that mean for us? Well, first we set data.count to the number 209 * What does that mean for us? Well, stop_machine() will ensure that
227 * of CPUs. As each CPU announces that it started the rendezvous handler by 210 * the rendezvous handler is started on each CPU. And in lockstep they
228 * decrementing the count, We reset data.count and set the data.gate flag 211 * do the state transition of disabling interrupts, updating MTRR's
229 * allowing all the cpu's to proceed with the work. As each cpu disables 212 * (the CPU vendors may each do it differently, so we call mtrr_if->set()
230 * interrupts, it'll decrement data.count once. We wait until it hits 0 and 213 * callback and let them take care of it.) and enabling interrupts.
231 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
232 * are waiting for that flag to be cleared. Once it's cleared, each
233 * CPU goes through the transition of updating MTRRs.
234 * The CPU vendors may each do it differently,
235 * so we call mtrr_if->set() callback and let them take care of it.
236 * When they're done, they again decrement data->count and wait for data.gate
237 * to be set.
238 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
239 * Everyone then enables interrupts and we all continue on.
240 * 214 *
241 * Note that the mechanism is the same for UP systems, too; all the SMP stuff 215 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
242 * becomes nops. 216 * becomes nops.
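
As a rough sketch of the stop_machine() rendezvous pattern the rewritten comment describes, assuming only the stop_machine() API from <linux/stop_machine.h> (the helper names below are hypothetical, not part of this patch):

	static int example_rendezvous(void *info)
	{
		/* runs on every CPU in the mask, in lockstep, with IRQs disabled */
		apply_per_cpu_update(info);	/* hypothetical per-CPU work */
		return 0;
	}

	static void update_all_cpus(void *cfg)
	{
		/* returns only after every online CPU has executed the handler */
		stop_machine(example_rendezvous, cfg, cpu_online_mask);
	}
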
@@ -244,92 +218,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
244static void 218static void
245set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) 219set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
246{ 220{
247 struct set_mtrr_data data; 221 struct set_mtrr_data data = { .smp_reg = reg,
248 unsigned long flags; 222 .smp_base = base,
249 int cpu; 223 .smp_size = size,
224 .smp_type = type
225 };
250 226
251 preempt_disable(); 227 stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
252 228}
253 data.smp_reg = reg;
254 data.smp_base = base;
255 data.smp_size = size;
256 data.smp_type = type;
257 atomic_set(&data.count, num_booting_cpus() - 1);
258
259 /* Make sure data.count is visible before unleashing other CPUs */
260 smp_wmb();
261 atomic_set(&data.gate, 0);
262
263 /* Start the ball rolling on other CPUs */
264 for_each_online_cpu(cpu) {
265 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
266
267 if (cpu == smp_processor_id())
268 continue;
269
270 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
271 }
272
273
274 while (atomic_read(&data.count))
275 cpu_relax();
276
277 /* Ok, reset count and toggle gate */
278 atomic_set(&data.count, num_booting_cpus() - 1);
279 smp_wmb();
280 atomic_set(&data.gate, 1);
281
282 local_irq_save(flags);
283
284 while (atomic_read(&data.count))
285 cpu_relax();
286
287 /* Ok, reset count and toggle gate */
288 atomic_set(&data.count, num_booting_cpus() - 1);
289 smp_wmb();
290 atomic_set(&data.gate, 0);
291
292 /* Do our MTRR business */
293
294 /*
295 * HACK!
296 *
297 * We use this same function to initialize the mtrrs during boot,
298 * resume, runtime cpu online and on an explicit request to set a
299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
310 */
311 if (reg != ~0U)
312 mtrr_if->set(reg, base, size, type);
313 else
314 mtrr_if->set_all();
315
316 /* Wait for the others */
317 while (atomic_read(&data.count))
318 cpu_relax();
319
320 atomic_set(&data.count, num_booting_cpus() - 1);
321 smp_wmb();
322 atomic_set(&data.gate, 1);
323
324 /*
325 * Wait here for everyone to have seen the gate change
326 * So we're the last ones to touch 'data'
327 */
328 while (atomic_read(&data.count))
329 cpu_relax();
330 229
331 local_irq_restore(flags); 230static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
332 preempt_enable(); 231 unsigned long size, mtrr_type type)
232{
233 struct set_mtrr_data data = { .smp_reg = reg,
234 .smp_base = base,
235 .smp_size = size,
236 .smp_type = type
237 };
238
239 stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
240 cpu_callout_mask);
333} 241}
334 242
335/** 243/**
@@ -783,7 +691,7 @@ void mtrr_ap_init(void)
783 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 691 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
784 * lock to prevent mtrr entry changes 692 * lock to prevent mtrr entry changes
785 */ 693 */
786 set_mtrr(~0U, 0, 0, 0); 694 set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
787} 695}
788 696
789/** 697/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b4b17..cfa62ec090e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/highmem.h>
26#include <linux/cpu.h> 25#include <linux/cpu.h>
27#include <linux/bitops.h> 26#include <linux/bitops.h>
28 27
@@ -45,38 +44,27 @@ do { \
45#endif 44#endif
46 45
47/* 46/*
48 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context 47 * | NHM/WSM | SNB |
48 * register -------------------------------
49 * | HT | no HT | HT | no HT |
50 *-----------------------------------------
51 * offcore | core | core | cpu | core |
52 * lbr_sel | core | core | cpu | core |
53 * ld_lat | cpu | core | cpu | core |
54 *-----------------------------------------
55 *
56 * Given that there is a small number of shared regs,
57 * we can pre-allocate their slot in the per-cpu
58 * per-core reg tables.
49 */ 59 */
50static unsigned long 60enum extra_reg_type {
51copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 61 EXTRA_REG_NONE = -1, /* not used */
52{
53 unsigned long offset, addr = (unsigned long)from;
54 unsigned long size, len = 0;
55 struct page *page;
56 void *map;
57 int ret;
58
59 do {
60 ret = __get_user_pages_fast(addr, 1, 0, &page);
61 if (!ret)
62 break;
63
64 offset = addr & (PAGE_SIZE - 1);
65 size = min(PAGE_SIZE - offset, n - len);
66
67 map = kmap_atomic(page);
68 memcpy(to, map+offset, size);
69 kunmap_atomic(map);
70 put_page(page);
71 62
72 len += size; 63 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
73 to += size; 64 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
74 addr += size;
75 65
76 } while (len < n); 66 EXTRA_REG_MAX /* number of entries needed */
77 67};
78 return len;
79}
80 68
81struct event_constraint { 69struct event_constraint {
82 union { 70 union {
@@ -132,11 +120,10 @@ struct cpu_hw_events {
132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 120 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
133 121
134 /* 122 /*
135 * Intel percore register state. 123 * manage shared (per-core, per-cpu) registers
136 * Coordinate shared resources between HT threads. 124 * used on Intel NHM/WSM/SNB
137 */ 125 */
138 int percore_used; /* Used by this CPU? */ 126 struct intel_shared_regs *shared_regs;
139 struct intel_percore *per_core;
140 127
141 /* 128 /*
142 * AMD specific bits 129 * AMD specific bits
@@ -187,26 +174,45 @@ struct cpu_hw_events {
187 for ((e) = (c); (e)->weight; (e)++) 174 for ((e) = (c); (e)->weight; (e)++)
188 175
189/* 176/*
177 * Per register state.
178 */
179struct er_account {
180 raw_spinlock_t lock; /* per-core: protect structure */
181 u64 config; /* extra MSR config */
182 u64 reg; /* extra MSR number */
183 atomic_t ref; /* reference count */
184};
185
186/*
190 * Extra registers for specific events. 187 * Extra registers for specific events.
188 *
191 * Some events need large masks and require external MSRs. 189 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers. 190 * Those extra MSRs end up being shared for all events on
191 * a PMU and sometimes between PMU of sibling HT threads.
192 * In either case, the kernel needs to handle conflicting
193 * accesses to those extra, shared, regs. The data structure
194 * to manage those registers is stored in cpu_hw_event.
193 */ 195 */
194struct extra_reg { 196struct extra_reg {
195 unsigned int event; 197 unsigned int event;
196 unsigned int msr; 198 unsigned int msr;
197 u64 config_mask; 199 u64 config_mask;
198 u64 valid_mask; 200 u64 valid_mask;
201 int idx; /* per_xxx->regs[] reg index */
199}; 202};
200 203
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \ 204#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
202 .event = (e), \ 205 .event = (e), \
203 .msr = (ms), \ 206 .msr = (ms), \
204 .config_mask = (m), \ 207 .config_mask = (m), \
205 .valid_mask = (vm), \ 208 .valid_mask = (vm), \
209 .idx = EXTRA_REG_##i \
206 } 210 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ 211
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) 212#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) 213 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
214
215#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
210 216
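
For reference, a table entry built with the extended macro looks like the Nehalem/Westmere tables further down in this diff; the new idx argument ties each entry to one of the EXTRA_REG_* slots defined above (the table name here is made up):

	static struct extra_reg example_extra_regs[] __read_mostly = {
		/* event 0xb7 (offcore response 0) steers MSR_OFFCORE_RSP_0 into slot RSP_0 */
		INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
		EVENT_EXTRA_END
	};
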
211union perf_capabilities { 217union perf_capabilities {
212 struct { 218 struct {
@@ -252,7 +258,6 @@ struct x86_pmu {
252 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 258 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
253 struct perf_event *event); 259 struct perf_event *event);
254 struct event_constraint *event_constraints; 260 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
256 void (*quirks)(void); 261 void (*quirks)(void);
257 int perfctr_second_write; 262 int perfctr_second_write;
258 263
@@ -286,8 +291,12 @@ struct x86_pmu {
286 * Extra registers for events 291 * Extra registers for events
287 */ 292 */
288 struct extra_reg *extra_regs; 293 struct extra_reg *extra_regs;
294 unsigned int er_flags;
289}; 295};
290 296
297#define ERF_NO_HT_SHARING 1
298#define ERF_HAS_RSP_1 2
299
291static struct x86_pmu x86_pmu __read_mostly; 300static struct x86_pmu x86_pmu __read_mostly;
292 301
293static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 302static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -393,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
393 */ 402 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event) 403static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{ 404{
405 struct hw_perf_event_extra *reg;
396 struct extra_reg *er; 406 struct extra_reg *er;
397 407
398 event->hw.extra_reg = 0; 408 reg = &event->hw.extra_reg;
399 event->hw.extra_config = 0;
400 409
401 if (!x86_pmu.extra_regs) 410 if (!x86_pmu.extra_regs)
402 return 0; 411 return 0;
@@ -406,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
406 continue; 415 continue;
407 if (event->attr.config1 & ~er->valid_mask) 416 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL; 417 return -EINVAL;
409 event->hw.extra_reg = er->msr; 418
410 event->hw.extra_config = event->attr.config1; 419 reg->idx = er->idx;
420 reg->config = event->attr.config1;
421 reg->reg = er->msr;
411 break; 422 break;
412 } 423 }
413 return 0; 424 return 0;
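
The struct hw_perf_event_extra written to above is defined in the generic perf headers rather than in this file; judging from the fields used here (reg, config, idx, and alloc later on), its shape is roughly:

	/* assumed layout; see include/linux/perf_event.h in the same series */
	struct hw_perf_event_extra {
		u64		config;	/* register value */
		unsigned int	reg;	/* register address or index */
		int		alloc;	/* extra register already allocated */
		int		idx;	/* index into the shared regs array */
	};
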
@@ -706,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
706 event->hw.last_cpu = -1; 717 event->hw.last_cpu = -1;
707 event->hw.last_tag = ~0ULL; 718 event->hw.last_tag = ~0ULL;
708 719
720 /* mark unused */
721 event->hw.extra_reg.idx = EXTRA_REG_NONE;
722
709 return x86_pmu.hw_config(event); 723 return x86_pmu.hw_config(event);
710} 724}
711 725
@@ -747,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, 761static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask) 762 u64 enable_mask)
749{ 763{
750 if (hwc->extra_reg) 764 if (hwc->extra_reg.reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config); 765 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask); 766 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753} 767}
754 768
@@ -1332,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1332 if (!x86_perf_event_set_period(event)) 1346 if (!x86_perf_event_set_period(event))
1333 continue; 1347 continue;
1334 1348
1335 if (perf_event_overflow(event, 1, &data, regs)) 1349 if (perf_event_overflow(event, &data, regs))
1336 x86_pmu_stop(event, 0); 1350 x86_pmu_stop(event, 0);
1337 } 1351 }
1338 1352
@@ -1637,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
1637 perf_pmu_enable(pmu); 1651 perf_pmu_enable(pmu);
1638 return 0; 1652 return 0;
1639} 1653}
1654/*
1655 * a fake_cpuc is used to validate event groups. Due to
1656 * the extra reg logic, we need to also allocate a fake
1657 * per_core and per_cpu structure. Otherwise, group events
1658 * using extra reg may conflict without the kernel being
1659 * able to catch this when the last event gets added to
1660 * the group.
1661 */
1662static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1663{
1664 kfree(cpuc->shared_regs);
1665 kfree(cpuc);
1666}
1667
1668static struct cpu_hw_events *allocate_fake_cpuc(void)
1669{
1670 struct cpu_hw_events *cpuc;
1671 int cpu = raw_smp_processor_id();
1672
1673 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1674 if (!cpuc)
1675 return ERR_PTR(-ENOMEM);
1676
 1677	/* only needed if we have extra_regs */
1678 if (x86_pmu.extra_regs) {
1679 cpuc->shared_regs = allocate_shared_regs(cpu);
1680 if (!cpuc->shared_regs)
1681 goto error;
1682 }
1683 return cpuc;
1684error:
1685 free_fake_cpuc(cpuc);
1686 return ERR_PTR(-ENOMEM);
1687}
1640 1688
1641/* 1689/*
1642 * validate that we can schedule this event 1690 * validate that we can schedule this event
@@ -1647,9 +1695,9 @@ static int validate_event(struct perf_event *event)
1647 struct event_constraint *c; 1695 struct event_constraint *c;
1648 int ret = 0; 1696 int ret = 0;
1649 1697
1650 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); 1698 fake_cpuc = allocate_fake_cpuc();
1651 if (!fake_cpuc) 1699 if (IS_ERR(fake_cpuc))
1652 return -ENOMEM; 1700 return PTR_ERR(fake_cpuc);
1653 1701
1654 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1702 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1655 1703
@@ -1659,7 +1707,7 @@ static int validate_event(struct perf_event *event)
1659 if (x86_pmu.put_event_constraints) 1707 if (x86_pmu.put_event_constraints)
1660 x86_pmu.put_event_constraints(fake_cpuc, event); 1708 x86_pmu.put_event_constraints(fake_cpuc, event);
1661 1709
1662 kfree(fake_cpuc); 1710 free_fake_cpuc(fake_cpuc);
1663 1711
1664 return ret; 1712 return ret;
1665} 1713}
@@ -1679,36 +1727,32 @@ static int validate_group(struct perf_event *event)
1679{ 1727{
1680 struct perf_event *leader = event->group_leader; 1728 struct perf_event *leader = event->group_leader;
1681 struct cpu_hw_events *fake_cpuc; 1729 struct cpu_hw_events *fake_cpuc;
1682 int ret, n; 1730 int ret = -ENOSPC, n;
1683
1684 ret = -ENOMEM;
1685 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1686 if (!fake_cpuc)
1687 goto out;
1688 1731
1732 fake_cpuc = allocate_fake_cpuc();
1733 if (IS_ERR(fake_cpuc))
1734 return PTR_ERR(fake_cpuc);
1689 /* 1735 /*
1690 * the event is not yet connected with its 1736 * the event is not yet connected with its
1691 * siblings therefore we must first collect 1737 * siblings therefore we must first collect
1692 * existing siblings, then add the new event 1738 * existing siblings, then add the new event
1693 * before we can simulate the scheduling 1739 * before we can simulate the scheduling
1694 */ 1740 */
1695 ret = -ENOSPC;
1696 n = collect_events(fake_cpuc, leader, true); 1741 n = collect_events(fake_cpuc, leader, true);
1697 if (n < 0) 1742 if (n < 0)
1698 goto out_free; 1743 goto out;
1699 1744
1700 fake_cpuc->n_events = n; 1745 fake_cpuc->n_events = n;
1701 n = collect_events(fake_cpuc, event, false); 1746 n = collect_events(fake_cpuc, event, false);
1702 if (n < 0) 1747 if (n < 0)
1703 goto out_free; 1748 goto out;
1704 1749
1705 fake_cpuc->n_events = n; 1750 fake_cpuc->n_events = n;
1706 1751
1707 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); 1752 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1708 1753
1709out_free:
1710 kfree(fake_cpuc);
1711out: 1754out:
1755 free_fake_cpuc(fake_cpuc);
1712 return ret; 1756 return ret;
1713} 1757}
1714 1758
@@ -1856,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1856 1900
1857 perf_callchain_store(entry, regs->ip); 1901 perf_callchain_store(entry, regs->ip);
1858 1902
1903 if (!current->mm)
1904 return;
1905
1859 if (perf_callchain_user32(regs, entry)) 1906 if (perf_callchain_user32(regs, entry))
1860 return; 1907 return;
1861 1908
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
89 [ C(RESULT_MISS) ] = -1, 89 [ C(RESULT_MISS) ] = -1,
90 }, 90 },
91 }, 91 },
92 [ C(NODE) ] = {
93 [ C(OP_READ) ] = {
94 [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
95 [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
96 },
97 [ C(OP_WRITE) ] = {
98 [ C(RESULT_ACCESS) ] = -1,
99 [ C(RESULT_MISS) ] = -1,
100 },
101 [ C(OP_PREFETCH) ] = {
102 [ C(RESULT_ACCESS) ] = -1,
103 [ C(RESULT_MISS) ] = -1,
104 },
105 },
92}; 106};
93 107
94/* 108/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c4..f88af2c2a56 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/* 3/*
15 * Per core state 4 * Per core/cpu state
16 * This used to coordinate shared registers for HT threads. 5 *
6 * Used to coordinate shared registers between HT threads or
7 * among events on a single PMU.
17 */ 8 */
18struct intel_percore { 9struct intel_shared_regs {
19 raw_spinlock_t lock; /* protect structure */ 10 struct er_account regs[EXTRA_REG_MAX];
20 struct er_account regs[MAX_EXTRA_REGS]; 11 int refcnt; /* per-core: #HT threads */
21 int refcnt; /* number of threads */ 12 unsigned core_id; /* per-core: core id */
22 unsigned core_id;
23}; 13};
24 14
25/* 15/*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
88 78
89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = 79static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{ 80{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 81 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
92 EVENT_EXTRA_END 82 EVENT_EXTRA_END
93}; 83};
94 84
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly = 85static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
102{ 86{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -116,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END 105 EVENT_CONSTRAINT_END
@@ -125,15 +107,13 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
125 107
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly = 108static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{ 109{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 110 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), 111 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
130 EVENT_EXTRA_END 112 EVENT_EXTRA_END
131}; 113};
132 114
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = 115static struct event_constraint intel_v1_event_constraints[] __read_mostly =
134{ 116{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END 117 EVENT_CONSTRAINT_END
138}; 118};
139 119
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
145 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
146}; 126};
147 127
128static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
129 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
130 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
131 EVENT_EXTRA_END
132};
133
148static u64 intel_pmu_event_map(int hw_event) 134static u64 intel_pmu_event_map(int hw_event)
149{ 135{
150 return intel_perfmon_event_map[hw_event]; 136 return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
245 [ C(RESULT_MISS) ] = -1, 231 [ C(RESULT_MISS) ] = -1,
246 }, 232 },
247 }, 233 },
234 [ C(NODE) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = -1,
237 [ C(RESULT_MISS) ] = -1,
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248
248}; 249};
249 250
250static __initconst const u64 westmere_hw_cache_event_ids 251static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
346 [ C(RESULT_MISS) ] = -1, 347 [ C(RESULT_MISS) ] = -1,
347 }, 348 },
348 }, 349 },
350 [ C(NODE) ] = {
351 [ C(OP_READ) ] = {
352 [ C(RESULT_ACCESS) ] = 0x01b7,
353 [ C(RESULT_MISS) ] = 0x01b7,
354 },
355 [ C(OP_WRITE) ] = {
356 [ C(RESULT_ACCESS) ] = 0x01b7,
357 [ C(RESULT_MISS) ] = 0x01b7,
358 },
359 [ C(OP_PREFETCH) ] = {
360 [ C(RESULT_ACCESS) ] = 0x01b7,
361 [ C(RESULT_MISS) ] = 0x01b7,
362 },
363 },
349}; 364};
350 365
351/* 366/*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, 413 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, 414 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 }, 415 },
401 } 416 },
417 [ C(NODE) ] = {
418 [ C(OP_READ) ] = {
419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
421 },
422 [ C(OP_WRITE) ] = {
423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
425 },
426 [ C(OP_PREFETCH) ] = {
427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
429 },
430 },
402}; 431};
403 432
404static __initconst const u64 nehalem_hw_cache_event_ids 433static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
500 [ C(RESULT_MISS) ] = -1, 529 [ C(RESULT_MISS) ] = -1,
501 }, 530 },
502 }, 531 },
532 [ C(NODE) ] = {
533 [ C(OP_READ) ] = {
534 [ C(RESULT_ACCESS) ] = 0x01b7,
535 [ C(RESULT_MISS) ] = 0x01b7,
536 },
537 [ C(OP_WRITE) ] = {
538 [ C(RESULT_ACCESS) ] = 0x01b7,
539 [ C(RESULT_MISS) ] = 0x01b7,
540 },
541 [ C(OP_PREFETCH) ] = {
542 [ C(RESULT_ACCESS) ] = 0x01b7,
543 [ C(RESULT_MISS) ] = 0x01b7,
544 },
545 },
503}; 546};
504 547
505static __initconst const u64 core2_hw_cache_event_ids 548static __initconst const u64 core2_hw_cache_event_ids
@@ -1003,7 +1046,7 @@ again:
1003 1046
1004 data.period = event->hw.last_period; 1047 data.period = event->hw.last_period;
1005 1048
1006 if (perf_event_overflow(event, 1, &data, regs)) 1049 if (perf_event_overflow(event, &data, regs))
1007 x86_pmu_stop(event, 0); 1050 x86_pmu_stop(event, 0);
1008 } 1051 }
1009 1052
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
1037 return NULL; 1080 return NULL;
1038} 1081}
1039 1082
1083static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1084{
1085 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
1086 return false;
1087
1088 if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
1089 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1090 event->hw.config |= 0x01bb;
1091 event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
1092 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1093 } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
1094 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1095 event->hw.config |= 0x01b7;
1096 event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
1097 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1098 }
1099
1100 if (event->hw.extra_reg.idx == orig_idx)
1101 return false;
1102
1103 return true;
1104}
1105
1106/*
1107 * manage allocation of shared extra msr for certain events
1108 *
1109 * sharing can be:
1110 * per-cpu: to be shared between the various events on a single PMU
1111 * per-core: per-cpu + shared by HT threads
1112 */
1040static struct event_constraint * 1113static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1114__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1115 struct perf_event *event)
1042{ 1116{
1043 struct hw_perf_event *hwc = &event->hw; 1117 struct event_constraint *c = &emptyconstraint;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; 1118 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era; 1119 struct er_account *era;
1048 int i; 1120 unsigned long flags;
1049 int free_slot; 1121 int orig_idx = reg->idx;
1050 int found;
1051 1122
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc) 1123 /* already allocated shared msr */
1053 return NULL; 1124 if (reg->alloc)
1125 return &unconstrained;
1054 1126
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) { 1127again:
1056 if (e != c->code) 1128 era = &cpuc->shared_regs->regs[reg->idx];
1057 continue; 1129 /*
1130 * we use spin_lock_irqsave() to avoid lockdep issues when
1131 * passing a fake cpuc
1132 */
1133 raw_spin_lock_irqsave(&era->lock, flags);
1134
1135 if (!atomic_read(&era->ref) || era->config == reg->config) {
1136
1137 /* lock in msr value */
1138 era->config = reg->config;
1139 era->reg = reg->reg;
1140
1141 /* one more user */
1142 atomic_inc(&era->ref);
1143
1144 /* no need to reallocate during incremental event scheduling */
1145 reg->alloc = 1;
1058 1146
1059 /* 1147 /*
1060 * Allocate resource per core. 1148 * All events using extra_reg are unconstrained.
1149 * Avoids calling x86_get_event_constraints()
1150 *
1151 * Must revisit if extra_reg controlling events
1152 * ever have constraints. Worst case we go through
1153 * the regular event constraint table.
1061 */ 1154 */
1062 pc = cpuc->per_core; 1155 c = &unconstrained;
1063 if (!pc) 1156 } else if (intel_try_alt_er(event, orig_idx)) {
1064 break; 1157 raw_spin_unlock(&era->lock);
1065 c = &emptyconstraint; 1158 goto again;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 } 1159 }
1160 raw_spin_unlock_irqrestore(&era->lock, flags);
1097 1161
1098 return NULL; 1162 return c;
1163}
1164
1165static void
1166__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
1167 struct hw_perf_event_extra *reg)
1168{
1169 struct er_account *era;
1170
1171 /*
1172 * only put constraint if extra reg was actually
 1173	 * allocated. Also takes care of events which do
1174 * not use an extra shared reg
1175 */
1176 if (!reg->alloc)
1177 return;
1178
1179 era = &cpuc->shared_regs->regs[reg->idx];
1180
1181 /* one fewer user */
1182 atomic_dec(&era->ref);
1183
1184 /* allocate again next time */
1185 reg->alloc = 0;
1186}
1187
1188static struct event_constraint *
1189intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1190 struct perf_event *event)
1191{
1192 struct event_constraint *c = NULL;
1193
1194 if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
1195 c = __intel_shared_reg_get_constraints(cpuc, event);
1196
1197 return c;
1099} 1198}
1100 1199
1101static struct event_constraint * 1200static struct event_constraint *
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
1111 if (c) 1210 if (c)
1112 return c; 1211 return c;
1113 1212
1114 c = intel_percore_constraints(cpuc, event); 1213 c = intel_shared_regs_constraints(cpuc, event);
1115 if (c) 1214 if (c)
1116 return c; 1215 return c;
1117 1216
1118 return x86_get_event_constraints(cpuc, event); 1217 return x86_get_event_constraints(cpuc, event);
1119} 1218}
1120 1219
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1220static void
1221intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event) 1222 struct perf_event *event)
1123{ 1223{
1124 struct extra_reg *er; 1224 struct hw_perf_event_extra *reg;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129 1225
1130 if (!cpuc->percore_used) 1226 reg = &event->hw.extra_reg;
1131 return; 1227 if (reg->idx != EXTRA_REG_NONE)
1132 1228 __intel_shared_reg_put_constraints(cpuc, reg);
1133 for (er = x86_pmu.extra_regs; er->msr; er++) { 1229}
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136 1230
1137 pc = cpuc->per_core; 1231static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1138 raw_spin_lock(&pc->lock); 1232 struct perf_event *event)
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) { 1233{
1140 era = &pc->regs[i]; 1234 intel_put_shared_regs_event_constraints(cpuc, event);
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157} 1235}
1158 1236
1159static int intel_pmu_hw_config(struct perf_event *event) 1237static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
1231 .event_constraints = intel_core_event_constraints, 1309 .event_constraints = intel_core_event_constraints,
1232}; 1310};
1233 1311
1312static struct intel_shared_regs *allocate_shared_regs(int cpu)
1313{
1314 struct intel_shared_regs *regs;
1315 int i;
1316
1317 regs = kzalloc_node(sizeof(struct intel_shared_regs),
1318 GFP_KERNEL, cpu_to_node(cpu));
1319 if (regs) {
1320 /*
1321 * initialize the locks to keep lockdep happy
1322 */
1323 for (i = 0; i < EXTRA_REG_MAX; i++)
1324 raw_spin_lock_init(&regs->regs[i].lock);
1325
1326 regs->core_id = -1;
1327 }
1328 return regs;
1329}
1330
1234static int intel_pmu_cpu_prepare(int cpu) 1331static int intel_pmu_cpu_prepare(int cpu)
1235{ 1332{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1333 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237 1334
1238 if (!cpu_has_ht_siblings()) 1335 if (!x86_pmu.extra_regs)
1239 return NOTIFY_OK; 1336 return NOTIFY_OK;
1240 1337
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), 1338 cpuc->shared_regs = allocate_shared_regs(cpu);
1242 GFP_KERNEL, cpu_to_node(cpu)); 1339 if (!cpuc->shared_regs)
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD; 1340 return NOTIFY_BAD;
1245 1341
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK; 1342 return NOTIFY_OK;
1249} 1343}
1250 1344
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
1260 */ 1354 */
1261 intel_pmu_lbr_reset(); 1355 intel_pmu_lbr_reset();
1262 1356
1263 if (!cpu_has_ht_siblings()) 1357 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
1264 return; 1358 return;
1265 1359
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1360 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; 1361 struct intel_shared_regs *pc;
1268 1362
1363 pc = per_cpu(cpu_hw_events, i).shared_regs;
1269 if (pc && pc->core_id == core_id) { 1364 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core); 1365 kfree(cpuc->shared_regs);
1271 cpuc->per_core = pc; 1366 cpuc->shared_regs = pc;
1272 break; 1367 break;
1273 } 1368 }
1274 } 1369 }
1275 1370
1276 cpuc->per_core->core_id = core_id; 1371 cpuc->shared_regs->core_id = core_id;
1277 cpuc->per_core->refcnt++; 1372 cpuc->shared_regs->refcnt++;
1278} 1373}
1279 1374
1280static void intel_pmu_cpu_dying(int cpu) 1375static void intel_pmu_cpu_dying(int cpu)
1281{ 1376{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1377 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core; 1378 struct intel_shared_regs *pc;
1284 1379
1380 pc = cpuc->shared_regs;
1285 if (pc) { 1381 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0) 1382 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc); 1383 kfree(pc);
1288 cpuc->per_core = NULL; 1384 cpuc->shared_regs = NULL;
1289 } 1385 }
1290 1386
1291 fini_debug_store_on_cpu(cpu); 1387 fini_debug_store_on_cpu(cpu);
@@ -1436,7 +1532,6 @@ static __init int intel_pmu_init(void)
1436 1532
1437 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1533 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; 1534 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1535 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1536 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442 1537
@@ -1481,10 +1576,10 @@ static __init int intel_pmu_init(void)
1481 intel_pmu_lbr_init_nhm(); 1576 intel_pmu_lbr_init_nhm();
1482 1577
1483 x86_pmu.event_constraints = intel_westmere_event_constraints; 1578 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1579 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 1580 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs; 1581 x86_pmu.extra_regs = intel_westmere_extra_regs;
1582 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1488 1583
1489 /* UOPS_ISSUED.STALLED_CYCLES */ 1584 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1585 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1495,6 +1590,7 @@ static __init int intel_pmu_init(void)
1495 break; 1590 break;
1496 1591
1497 case 42: /* SandyBridge */ 1592 case 42: /* SandyBridge */
 1593	case 45: /* SandyBridge, "Romley-EP" */
1498 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1594 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1499 sizeof(hw_cache_event_ids)); 1595 sizeof(hw_cache_event_ids));
1500 1596
@@ -1502,6 +1598,10 @@ static __init int intel_pmu_init(void)
1502 1598
1503 x86_pmu.event_constraints = intel_snb_event_constraints; 1599 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events; 1600 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1601 x86_pmu.extra_regs = intel_snb_extra_regs;
1602 /* all extra regs are per-cpu when HT is on */
1603 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1604 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
1505 1605
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1606 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1607 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1512,11 +1612,19 @@ static __init int intel_pmu_init(void)
1512 break; 1612 break;
1513 1613
1514 default: 1614 default:
1515 /* 1615 switch (x86_pmu.version) {
1516 * default constraints for v2 and up 1616 case 1:
1517 */ 1617 x86_pmu.event_constraints = intel_v1_event_constraints;
1518 x86_pmu.event_constraints = intel_gen_event_constraints; 1618 pr_cont("generic architected perfmon v1, ");
1519 pr_cont("generic architected perfmon, "); 1619 break;
1620 default:
1621 /*
1622 * default constraints for v2 and up
1623 */
1624 x86_pmu.event_constraints = intel_gen_event_constraints;
1625 pr_cont("generic architected perfmon, ");
1626 break;
1627 }
1520 } 1628 }
1521 return 0; 1629 return 0;
1522} 1630}
@@ -1528,4 +1636,8 @@ static int intel_pmu_init(void)
1528 return 0; 1636 return 0;
1529} 1637}
1530 1638
1639static struct intel_shared_regs *allocate_shared_regs(int cpu)
1640{
1641 return NULL;
1642}
1531#endif /* CONFIG_CPU_SUP_INTEL */ 1643#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..3213c52db76 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
340 */ 340 */
341 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
342 342
343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at)))
344 return 1; 344 return 1;
345 345
346 for (; at < top; at++) { 346 for (; at < top; at++) {
@@ -508,6 +508,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
508 unsigned long from = cpuc->lbr_entries[0].from; 508 unsigned long from = cpuc->lbr_entries[0].from;
509 unsigned long old_to, to = cpuc->lbr_entries[0].to; 509 unsigned long old_to, to = cpuc->lbr_entries[0].to;
510 unsigned long ip = regs->ip; 510 unsigned long ip = regs->ip;
511 int is_64bit = 0;
511 512
512 /* 513 /*
513 * We don't need to fixup if the PEBS assist is fault like 514 * We don't need to fixup if the PEBS assist is fault like
@@ -559,7 +560,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
559 } else 560 } else
560 kaddr = (void *)to; 561 kaddr = (void *)to;
561 562
562 kernel_insn_init(&insn, kaddr); 563#ifdef CONFIG_X86_64
564 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
565#endif
566 insn_init(&insn, kaddr, is_64bit);
563 insn_get_length(&insn); 567 insn_get_length(&insn);
564 to += insn.length; 568 to += insn.length;
565 } while (to < ip); 569 } while (to < ip);
@@ -616,7 +620,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
616 else 620 else
617 regs.flags &= ~PERF_EFLAGS_EXACT; 621 regs.flags &= ~PERF_EFLAGS_EXACT;
618 622
619 if (perf_event_overflow(event, 1, &data, &regs)) 623 if (perf_event_overflow(event, &data, &regs))
620 x86_pmu_stop(event, 0); 624 x86_pmu_stop(event, 0);
621} 625}
622 626
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ead584fb6a7..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
554 [ C(RESULT_MISS) ] = -1, 554 [ C(RESULT_MISS) ] = -1,
555 }, 555 },
556 }, 556 },
557 [ C(NODE) ] = {
558 [ C(OP_READ) ] = {
559 [ C(RESULT_ACCESS) ] = -1,
560 [ C(RESULT_MISS) ] = -1,
561 },
562 [ C(OP_WRITE) ] = {
563 [ C(RESULT_ACCESS) ] = -1,
564 [ C(RESULT_MISS) ] = -1,
565 },
566 [ C(OP_PREFETCH) ] = {
567 [ C(RESULT_ACCESS) ] = -1,
568 [ C(RESULT_MISS) ] = -1,
569 },
570 },
557}; 571};
558 572
573/*
574 * Because of Netburst being quite restricted in how many
575 * identical events may run simultaneously, we introduce event aliases,
 576 * i.e. the different events which have the same functionality but
577 * utilize non-intersected resources (ESCR/CCCR/counter registers).
578 *
 579 * This allows us to relax restrictions a bit and run two or more
580 * identical events together.
581 *
582 * Never set any custom internal bits such as P4_CONFIG_HT,
583 * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
584 * either up to date automatically or not applicable at all.
585 */
586struct p4_event_alias {
587 u64 original;
588 u64 alternative;
589} p4_event_aliases[] = {
590 {
591 /*
592 * Non-halted cycles can be substituted with non-sleeping cycles (see
593 * Intel SDM Vol3b for details). We need this alias to be able
594 * to run nmi-watchdog and 'perf top' (or any other user space tool
595 * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
596 * simultaneously.
597 */
598 .original =
599 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
600 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
601 .alternative =
602 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
603 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
604 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
605 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
606 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
607 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
608 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
609 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
610 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
611 p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
612 P4_CCCR_COMPARE),
613 },
614};
615
616static u64 p4_get_alias_event(u64 config)
617{
618 u64 config_match;
619 int i;
620
621 /*
622 * Only event with special mark is allowed,
623 * we're to be sure it didn't come as malformed
624 * RAW event.
625 */
626 if (!(config & P4_CONFIG_ALIASABLE))
627 return 0;
628
629 config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
630
631 for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
632 if (config_match == p4_event_aliases[i].original) {
633 config_match = p4_event_aliases[i].alternative;
634 break;
635 } else if (config_match == p4_event_aliases[i].alternative) {
636 config_match = p4_event_aliases[i].original;
637 break;
638 }
639 }
640
641 if (i >= ARRAY_SIZE(p4_event_aliases))
642 return 0;
643
644 return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
645}
646
559static u64 p4_general_events[PERF_COUNT_HW_MAX] = { 647static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
560 /* non-halted CPU clocks */ 648 /* non-halted CPU clocks */
561 [PERF_COUNT_HW_CPU_CYCLES] = 649 [PERF_COUNT_HW_CPU_CYCLES] =
562 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | 650 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
563 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), 651 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
652 P4_CONFIG_ALIASABLE,
564 653
565 /* 654 /*
566 * retired instructions 655 * retired instructions
@@ -945,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
945 1034
946 if (!x86_perf_event_set_period(event)) 1035 if (!x86_perf_event_set_period(event))
947 continue; 1036 continue;
948 if (perf_event_overflow(event, 1, &data, regs)) 1037 if (perf_event_overflow(event, &data, regs))
949 x86_pmu_stop(event, 0); 1038 x86_pmu_stop(event, 0);
950 } 1039 }
951 1040
@@ -1120,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1120 struct p4_event_bind *bind; 1209 struct p4_event_bind *bind;
1121 unsigned int i, thread, num; 1210 unsigned int i, thread, num;
1122 int cntr_idx, escr_idx; 1211 int cntr_idx, escr_idx;
1212 u64 config_alias;
1213 int pass;
1123 1214
1124 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 1215 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1125 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); 1216 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1128,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1128 1219
1129 hwc = &cpuc->event_list[i]->hw; 1220 hwc = &cpuc->event_list[i]->hw;
1130 thread = p4_ht_thread(cpu); 1221 thread = p4_ht_thread(cpu);
1222 pass = 0;
1223
1224again:
1225 /*
 1226			 * It's possible to loop endlessly
1227 * between original and alternative events
1228 * if both are scheduled already.
1229 */
1230 if (pass > 2)
1231 goto done;
1232
1131 bind = p4_config_get_bind(hwc->config); 1233 bind = p4_config_get_bind(hwc->config);
1132 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); 1234 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
1133 if (unlikely(escr_idx == -1)) 1235 if (unlikely(escr_idx == -1))
@@ -1141,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1141 } 1243 }
1142 1244
1143 cntr_idx = p4_next_cntr(thread, used_mask, bind); 1245 cntr_idx = p4_next_cntr(thread, used_mask, bind);
1144 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) 1246 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
1145 goto done; 1247 /*
1248 * Check whether an event alias is still available.
1249 */
1250 config_alias = p4_get_alias_event(hwc->config);
1251 if (!config_alias)
1252 goto done;
1253 hwc->config = config_alias;
1254 pass++;
1255 goto again;
1256 }
1146 1257
1147 p4_pmu_swap_config_ts(hwc, cpu); 1258 p4_pmu_swap_config_ts(hwc, cpu);
1148 if (assign) 1259 if (assign)
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 9aeb78a23de..a621f342768 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -134,6 +134,24 @@ static int __init add_bus_probe(void)
134module_init(add_bus_probe); 134module_init(add_bus_probe);
135 135
136#ifdef CONFIG_PCI 136#ifdef CONFIG_PCI
137struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
138{
139 struct device_node *np;
140
141 for_each_node_by_type(np, "pci") {
142 const void *prop;
143 unsigned int bus_min;
144
145 prop = of_get_property(np, "bus-range", NULL);
146 if (!prop)
147 continue;
148 bus_min = be32_to_cpup(prop);
149 if (bus->number == bus_min)
150 return np;
151 }
152 return NULL;
153}
154
137static int x86_of_pci_irq_enable(struct pci_dev *dev) 155static int x86_of_pci_irq_enable(struct pci_dev *dev)
138{ 156{
139 struct of_irq oirq; 157 struct of_irq oirq;
@@ -165,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
165 183
166void __cpuinit x86_of_pci_init(void) 184void __cpuinit x86_of_pci_init(void)
167{ 185{
168 struct device_node *np;
169
170 pcibios_enable_irq = x86_of_pci_irq_enable; 186 pcibios_enable_irq = x86_of_pci_irq_enable;
171 pcibios_disable_irq = x86_of_pci_irq_disable; 187 pcibios_disable_irq = x86_of_pci_irq_disable;
172
173 for_each_node_by_type(np, "pci") {
174 const void *prop;
175 struct pci_bus *bus;
176 unsigned int bus_min;
177 struct device_node *child;
178
179 prop = of_get_property(np, "bus-range", NULL);
180 if (!prop)
181 continue;
182 bus_min = be32_to_cpup(prop);
183
184 bus = pci_find_bus(0, bus_min);
185 if (!bus) {
186 printk(KERN_ERR "Can't find a node for bus %s.\n",
187 np->full_name);
188 continue;
189 }
190
191 if (bus->self)
192 bus->self->dev.of_node = np;
193 else
194 bus->dev.of_node = np;
195
196 for_each_child_of_node(np, child) {
197 struct pci_dev *dev;
198 u32 devfn;
199
200 prop = of_get_property(child, "reg", NULL);
201 if (!prop)
202 continue;
203
204 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
205 dev = pci_get_slot(bus, devfn);
206 if (!dev)
207 continue;
208 dev->dev.of_node = child;
209 pci_dev_put(dev);
210 }
211 }
212} 188}
213#endif 189#endif
214 190
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d..19853ad8afc 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
105} 105}
106 106
107/* 107/*
108 * We are returning from the irq stack and go to the previous one.
109 * If the previous stack is also in the irq stack, then bp in the first
110 * frame of the irq stack points to the previous, interrupted one.
111 * Otherwise we have another level of indirection: We first save
112 * the bp of the previous stack, then we switch the stack to the irq one
113 * and save a new bp that links to the previous one.
114 * (See save_args())
115 */
116static inline unsigned long
117fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
118 unsigned long *irq_stack, unsigned long *irq_stack_end)
119{
120#ifdef CONFIG_FRAME_POINTER
121 struct stack_frame *frame = (struct stack_frame *)bp;
122 unsigned long next;
123
124 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
125 if (!probe_kernel_address(&frame->next_frame, next))
126 return next;
127 else
128 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
129 "callchain\n", &frame->next_frame);
130 }
131#endif
132 return bp;
133}
134
135/*
136 * x86-64 can have up to three kernel stacks: 108 * x86-64 can have up to three kernel stacks:
137 * process stack 109 * process stack
138 * interrupt stack 110 * interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
155 task = current; 127 task = current;
156 128
157 if (!stack) { 129 if (!stack) {
158 stack = &dummy; 130 if (regs)
159 if (task && task != current) 131 stack = (unsigned long *)regs->sp;
132 else if (task && task != current)
160 stack = (unsigned long *)task->thread.sp; 133 stack = (unsigned long *)task->thread.sp;
134 else
135 stack = &dummy;
161 } 136 }
162 137
163 if (!bp) 138 if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
205 * pointer (index -1 to end) in the IRQ stack: 180 * pointer (index -1 to end) in the IRQ stack:
206 */ 181 */
207 stack = (unsigned long *) (irq_stack_end[-1]); 182 stack = (unsigned long *) (irq_stack_end[-1]);
208 bp = fixup_bp_irq_link(bp, stack, irq_stack,
209 irq_stack_end);
210 irq_stack_end = NULL; 183 irq_stack_end = NULL;
211 ops->stack(data, "EOI"); 184 ops->stack(data, "EOI");
212 continue; 185 continue;
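
A note on the dumpstack_64.c hunk: dump_trace() now prefers the exact interrupted context (regs->sp) as its starting stack, then a sleeping task's saved stack pointer, and only then the dummy local; fixup_bp_irq_link() goes away because the reworked IRQ entry path (see the entry_64.S hunk below) saves %rbp itself, keeping the frame-pointer chain intact across the stack switch. An annotated restatement of the new fallback order, as a sketch:

	/* Illustration only: the selection dump_trace() performs above. */
	static unsigned long *pick_start_stack(struct task_struct *task,
					       struct pt_regs *regs,
					       unsigned long *stack,
					       unsigned long *dummy)
	{
		if (stack)
			return stack;				/* caller supplied one */
		if (regs)
			return (unsigned long *)regs->sp;	/* interrupted frame */
		if (task && task != current)
			return (unsigned long *)task->thread.sp; /* sleeping task's saved sp */
		return dummy;					/* fall back to our own stack */
	}
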
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 5c1a9197491..f3f6f534400 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,7 @@
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56#include <asm/cpufeature.h> 56#include <asm/cpufeature.h>
57#include <asm/alternative-asm.h>
57 58
58/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 59/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
59#include <linux/elf-em.h> 60#include <linux/elf-em.h>
@@ -873,12 +874,7 @@ ENTRY(simd_coprocessor_error)
873661: pushl_cfi $do_general_protection 874661: pushl_cfi $do_general_protection
874662: 875662:
875.section .altinstructions,"a" 876.section .altinstructions,"a"
876 .balign 4 877 altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
877 .long 661b
878 .long 663f
879 .word X86_FEATURE_XMM
880 .byte 662b-661b
881 .byte 664f-663f
882.previous 878.previous
883.section .altinstr_replacement,"ax" 879.section .altinstr_replacement,"ax"
884663: pushl $do_simd_coprocessor_error 880663: pushl $do_simd_coprocessor_error
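
The entry_32.S hunk replaces the open-coded .altinstructions record with the altinstruction_entry macro from the newly included <asm/alternative-asm.h>. Going by the fields the old version emitted (.long, .long, .word, .byte, .byte), each record presumably maps onto a structure along these lines; the name and field names here are illustrative, not the kernel's:

	/* Assumed shape of one alternatives record, matching the old
	 * .long 661b / .long 663f / .word feature / .byte len / .byte len. */
	struct alt_instr_sketch {
		u8  *instr;		/* original instruction (661b)          */
		u8  *replacement;	/* replacement code (663f)              */
		u16  cpuid;		/* feature bit, e.g. X86_FEATURE_XMM    */
		u8   instrlen;		/* 662b - 661b                          */
		u8   replacementlen;	/* 664f - 663f                          */
	};
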
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 47a4bcd2e50..e5d2d3fa7a0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
9/* 9/*
10 * entry.S contains the system-call and fault low-level handling routines. 10 * entry.S contains the system-call and fault low-level handling routines.
11 * 11 *
12 * Some of this is documented in Documentation/x86/entry_64.txt
13 *
12 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
14 * 16 *
@@ -297,27 +299,26 @@ ENDPROC(native_usergs_sysret64)
297 .endm 299 .endm
298 300
299/* save partial stack frame */ 301/* save partial stack frame */
300 .pushsection .kprobes.text, "ax" 302 .macro SAVE_ARGS_IRQ
301ENTRY(save_args)
302 XCPT_FRAME
303 cld 303 cld
304 /* 304 /* start from rbp in pt_regs and jump over */
305 * start from rbp in pt_regs and jump over 305 movq_cfi rdi, RDI-RBP
306 * return address. 306 movq_cfi rsi, RSI-RBP
307 */ 307 movq_cfi rdx, RDX-RBP
308 movq_cfi rdi, RDI+8-RBP 308 movq_cfi rcx, RCX-RBP
309 movq_cfi rsi, RSI+8-RBP 309 movq_cfi rax, RAX-RBP
310 movq_cfi rdx, RDX+8-RBP 310 movq_cfi r8, R8-RBP
311 movq_cfi rcx, RCX+8-RBP 311 movq_cfi r9, R9-RBP
312 movq_cfi rax, RAX+8-RBP 312 movq_cfi r10, R10-RBP
313 movq_cfi r8, R8+8-RBP 313 movq_cfi r11, R11-RBP
314 movq_cfi r9, R9+8-RBP 314
315 movq_cfi r10, R10+8-RBP 315 /* Save rbp so that we can unwind from get_irq_regs() */
316 movq_cfi r11, R11+8-RBP 316 movq_cfi rbp, 0
317 317
318 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ 318 /* Save previous stack value */
319 movq_cfi rbp, 8 /* push %rbp */ 319 movq %rsp, %rsi
320 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 320
321 leaq -RBP(%rsp),%rdi /* arg1 for handler */
321 testl $3, CS(%rdi) 322 testl $3, CS(%rdi)
322 je 1f 323 je 1f
323 SWAPGS 324 SWAPGS
@@ -329,19 +330,14 @@ ENTRY(save_args)
329 */ 330 */
3301: incl PER_CPU_VAR(irq_count) 3311: incl PER_CPU_VAR(irq_count)
331 jne 2f 332 jne 2f
332 popq_cfi %rax /* move return address... */
333 mov PER_CPU_VAR(irq_stack_ptr),%rsp 333 mov PER_CPU_VAR(irq_stack_ptr),%rsp
334 EMPTY_FRAME 0 334 EMPTY_FRAME 0
335 pushq_cfi %rbp /* backlink for unwinder */ 335
336 pushq_cfi %rax /* ... to the new stack */ 3362: /* Store previous stack value */
337 /* 337 pushq %rsi
338 * We entered an interrupt context - irqs are off: 338 /* We entered an interrupt context - irqs are off: */
339 */ 339 TRACE_IRQS_OFF
3402: TRACE_IRQS_OFF 340 .endm
341 ret
342 CFI_ENDPROC
343END(save_args)
344 .popsection
345 341
346ENTRY(save_rest) 342ENTRY(save_rest)
347 PARTIAL_FRAME 1 REST_SKIP+8 343 PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +469,7 @@ ENTRY(system_call_after_swapgs)
473 * and short: 469 * and short:
474 */ 470 */
475 ENABLE_INTERRUPTS(CLBR_NONE) 471 ENABLE_INTERRUPTS(CLBR_NONE)
476 SAVE_ARGS 8,1 472 SAVE_ARGS 8,0
477 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 473 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
478 movq %rcx,RIP-ARGOFFSET(%rsp) 474 movq %rcx,RIP-ARGOFFSET(%rsp)
479 CFI_REL_OFFSET rip,RIP-ARGOFFSET 475 CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +504,7 @@ sysret_check:
508 TRACE_IRQS_ON 504 TRACE_IRQS_ON
509 movq RIP-ARGOFFSET(%rsp),%rcx 505 movq RIP-ARGOFFSET(%rsp),%rcx
510 CFI_REGISTER rip,rcx 506 CFI_REGISTER rip,rcx
511 RESTORE_ARGS 0,-ARG_SKIP,1 507 RESTORE_ARGS 1,-ARG_SKIP,0
512 /*CFI_REGISTER rflags,r11*/ 508 /*CFI_REGISTER rflags,r11*/
513 movq PER_CPU_VAR(old_rsp), %rsp 509 movq PER_CPU_VAR(old_rsp), %rsp
514 USERGS_SYSRET64 510 USERGS_SYSRET64
@@ -791,7 +787,7 @@ END(interrupt)
791 /* reserve pt_regs for scratch regs and rbp */ 787 /* reserve pt_regs for scratch regs and rbp */
792 subq $ORIG_RAX-RBP, %rsp 788 subq $ORIG_RAX-RBP, %rsp
793 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP 789 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
794 call save_args 790 SAVE_ARGS_IRQ
795 PARTIAL_FRAME 0 791 PARTIAL_FRAME 0
796 call \func 792 call \func
797 .endm 793 .endm
@@ -814,15 +810,14 @@ ret_from_intr:
814 DISABLE_INTERRUPTS(CLBR_NONE) 810 DISABLE_INTERRUPTS(CLBR_NONE)
815 TRACE_IRQS_OFF 811 TRACE_IRQS_OFF
816 decl PER_CPU_VAR(irq_count) 812 decl PER_CPU_VAR(irq_count)
817 leaveq
818 813
819 CFI_RESTORE rbp 814 /* Restore saved previous stack */
815 popq %rsi
816 leaq 16(%rsi), %rsp
817
820 CFI_DEF_CFA_REGISTER rsp 818 CFI_DEF_CFA_REGISTER rsp
821 CFI_ADJUST_CFA_OFFSET -8 819 CFI_ADJUST_CFA_OFFSET -16
822 820
823 /* we did not save rbx, restore only from ARGOFFSET */
824 addq $8, %rsp
825 CFI_ADJUST_CFA_OFFSET -8
826exit_intr: 821exit_intr:
827 GET_THREAD_INFO(%rcx) 822 GET_THREAD_INFO(%rcx)
828 testl $3,CS-ARGOFFSET(%rsp) 823 testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +853,7 @@ retint_restore_args: /* return to kernel space */
858 */ 853 */
859 TRACE_IRQS_IRETQ 854 TRACE_IRQS_IRETQ
860restore_args: 855restore_args:
861 RESTORE_ARGS 0,8,0 856 RESTORE_ARGS 1,8,1
862 857
863irq_return: 858irq_return:
864 INTERRUPT_RETURN 859 INTERRUPT_RETURN
@@ -991,11 +986,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
991apicinterrupt THERMAL_APIC_VECTOR \ 986apicinterrupt THERMAL_APIC_VECTOR \
992 thermal_interrupt smp_thermal_interrupt 987 thermal_interrupt smp_thermal_interrupt
993 988
994#ifdef CONFIG_X86_MCE
995apicinterrupt MCE_SELF_VECTOR \
996 mce_self_interrupt smp_mce_self_interrupt
997#endif
998
999#ifdef CONFIG_SMP 989#ifdef CONFIG_SMP
1000apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 990apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1001 call_function_single_interrupt smp_call_function_single_interrupt 991 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1124,6 +1114,7 @@ zeroentry coprocessor_error do_coprocessor_error
1124errorentry alignment_check do_alignment_check 1114errorentry alignment_check do_alignment_check
1125zeroentry simd_coprocessor_error do_simd_coprocessor_error 1115zeroentry simd_coprocessor_error do_simd_coprocessor_error
1126 1116
1117
1127 /* Reload gs selector with exception handling */ 1118 /* Reload gs selector with exception handling */
1128 /* edi: new selector */ 1119 /* edi: new selector */
1129ENTRY(native_load_gs_index) 1120ENTRY(native_load_gs_index)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 6781765b3a0..4d5a1005420 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
4#include <linux/sysdev.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/i8253.h>
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/hpet.h> 9#include <linux/hpet.h>
9#include <linux/init.h> 10#include <linux/init.h>
@@ -12,8 +13,8 @@
12#include <linux/io.h> 13#include <linux/io.h>
13 14
14#include <asm/fixmap.h> 15#include <asm/fixmap.h>
15#include <asm/i8253.h>
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17#include <asm/time.h>
17 18
18#define HPET_MASK CLOCKSOURCE_MASK(32) 19#define HPET_MASK CLOCKSOURCE_MASK(32)
19 20
@@ -71,7 +72,7 @@ static inline void hpet_set_mapping(void)
71{ 72{
72 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); 73 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
73#ifdef CONFIG_X86_64 74#ifdef CONFIG_X86_64
74 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); 75 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
75#endif 76#endif
76} 77}
77 78
@@ -738,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
738 return (cycle_t)hpet_readl(HPET_COUNTER); 739 return (cycle_t)hpet_readl(HPET_COUNTER);
739} 740}
740 741
741#ifdef CONFIG_X86_64
742static cycle_t __vsyscall_fn vread_hpet(void)
743{
744 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
745}
746#endif
747
748static struct clocksource clocksource_hpet = { 742static struct clocksource clocksource_hpet = {
749 .name = "hpet", 743 .name = "hpet",
750 .rating = 250, 744 .rating = 250,
@@ -753,7 +747,7 @@ static struct clocksource clocksource_hpet = {
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 747 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
754 .resume = hpet_resume_counter, 748 .resume = hpet_resume_counter,
755#ifdef CONFIG_X86_64 749#ifdef CONFIG_X86_64
756 .vread = vread_hpet, 750 .archdata = { .vclock_mode = VCLOCK_HPET },
757#endif 751#endif
758}; 752};
759 753
@@ -1054,6 +1048,14 @@ int hpet_rtc_timer_init(void)
1054} 1048}
1055EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); 1049EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
1056 1050
1051static void hpet_disable_rtc_channel(void)
1052{
1053 unsigned long cfg;
1054 cfg = hpet_readl(HPET_T1_CFG);
1055 cfg &= ~HPET_TN_ENABLE;
1056 hpet_writel(cfg, HPET_T1_CFG);
1057}
1058
1057/* 1059/*
1058 * The functions below are called from rtc driver. 1060 * The functions below are called from rtc driver.
1059 * Return 0 if HPET is not being used. 1061 * Return 0 if HPET is not being used.
@@ -1065,6 +1067,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
1065 return 0; 1067 return 0;
1066 1068
1067 hpet_rtc_flags &= ~bit_mask; 1069 hpet_rtc_flags &= ~bit_mask;
1070 if (unlikely(!hpet_rtc_flags))
1071 hpet_disable_rtc_channel();
1072
1068 return 1; 1073 return 1;
1069} 1074}
1070EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); 1075EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
@@ -1130,15 +1135,11 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1130 1135
1131static void hpet_rtc_timer_reinit(void) 1136static void hpet_rtc_timer_reinit(void)
1132{ 1137{
1133 unsigned int cfg, delta; 1138 unsigned int delta;
1134 int lost_ints = -1; 1139 int lost_ints = -1;
1135 1140
1136 if (unlikely(!hpet_rtc_flags)) { 1141 if (unlikely(!hpet_rtc_flags))
1137 cfg = hpet_readl(HPET_T1_CFG); 1142 hpet_disable_rtc_channel();
1138 cfg &= ~HPET_TN_ENABLE;
1139 hpet_writel(cfg, HPET_T1_CFG);
1140 return;
1141 }
1142 1143
1143 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1144 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
1144 delta = hpet_default_delta; 1145 delta = hpet_default_delta;
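
Two separate cleanups land in hpet.c: the RTC-emulation timer channel is now shut off via the shared hpet_disable_rtc_channel() helper as soon as the last RTC flag is cleared, and the clocksource stops exporting a kernel-side vread_hpet() in favour of per-arch data that tells the vsyscall/vDSO code which read path to use. Assuming the new asm/clocksource.h follows the usual shape, that arch data looks roughly like this (values and field name are assumptions):

	/* Assumption: per-arch clocksource data consumed instead of .vread. */
	#define VCLOCK_NONE	0	/* no vDSO fast path, use a syscall */
	#define VCLOCK_TSC	1
	#define VCLOCK_HPET	2

	struct arch_clocksource_data {
		int vclock_mode;	/* one of the VCLOCK_* values above */
	};
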
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 12aff253768..739d8598f78 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -321,7 +321,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
321 return tmp; 321 return tmp;
322} 322}
323 323
324#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); 324#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
325#define FP_EXP_TAG_VALID 0 325#define FP_EXP_TAG_VALID 0
326#define FP_EXP_TAG_ZERO 1 326#define FP_EXP_TAG_ZERO 1
327#define FP_EXP_TAG_SPECIAL 2 327#define FP_EXP_TAG_SPECIAL 2
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index fb66dc9e36c..f2b96de3c7c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,113 +3,24 @@
3 * 3 *
4 */ 4 */
5#include <linux/clockchips.h> 5#include <linux/clockchips.h>
6#include <linux/interrupt.h>
7#include <linux/spinlock.h>
8#include <linux/jiffies.h>
9#include <linux/module.h> 6#include <linux/module.h>
10#include <linux/timex.h> 7#include <linux/timex.h>
11#include <linux/delay.h> 8#include <linux/i8253.h>
12#include <linux/init.h>
13#include <linux/io.h>
14 9
15#include <asm/i8253.h>
16#include <asm/hpet.h> 10#include <asm/hpet.h>
11#include <asm/time.h>
17#include <asm/smp.h> 12#include <asm/smp.h>
18 13
19DEFINE_RAW_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock);
21
22/* 14/*
23 * HPET replaces the PIT, when enabled. So we need to know, which of 15 * HPET replaces the PIT, when enabled. So we need to know, which of
24 * the two timers is used 16 * the two timers is used
25 */ 17 */
26struct clock_event_device *global_clock_event; 18struct clock_event_device *global_clock_event;
27 19
28/*
29 * Initialize the PIT timer.
30 *
31 * This is also called after resume to bring the PIT into operation again.
32 */
33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt)
35{
36 raw_spin_lock(&i8253_lock);
37
38 switch (mode) {
39 case CLOCK_EVT_MODE_PERIODIC:
40 /* binary, mode 2, LSB/MSB, ch 0 */
41 outb_pit(0x34, PIT_MODE);
42 outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
43 outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
44 break;
45
46 case CLOCK_EVT_MODE_SHUTDOWN:
47 case CLOCK_EVT_MODE_UNUSED:
48 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
49 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
50 outb_pit(0x30, PIT_MODE);
51 outb_pit(0, PIT_CH0);
52 outb_pit(0, PIT_CH0);
53 }
54 break;
55
56 case CLOCK_EVT_MODE_ONESHOT:
57 /* One shot setup */
58 outb_pit(0x38, PIT_MODE);
59 break;
60
61 case CLOCK_EVT_MODE_RESUME:
62 /* Nothing to do here */
63 break;
64 }
65 raw_spin_unlock(&i8253_lock);
66}
67
68/*
69 * Program the next event in oneshot mode
70 *
71 * Delta is given in PIT ticks
72 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{
75 raw_spin_lock(&i8253_lock);
76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 raw_spin_unlock(&i8253_lock);
79
80 return 0;
81}
82
83/*
84 * On UP the PIT can serve all of the possible timer functions. On SMP systems
85 * it can be solely used for the global tick.
86 *
87 * The profiling and update capabilities are switched off once the local apic is
88 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
89 * !using_apic_timer decisions in do_timer_interrupt_hook()
90 */
91static struct clock_event_device pit_ce = {
92 .name = "pit",
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event,
96 .irq = 0,
97};
98
99/*
100 * Initialize the conversion factor and the min/max deltas of the clock event
101 * structure and register the clock event source with the framework.
102 */
103void __init setup_pit_timer(void) 20void __init setup_pit_timer(void)
104{ 21{
105 /* 22 clockevent_i8253_init(true);
106 * Start pit with the boot cpu mask and make it global after the 23 global_clock_event = &i8253_clockevent;
107 * IO_APIC has been initialized.
108 */
109 pit_ce.cpumask = cpumask_of(smp_processor_id());
110
111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
112 global_clock_event = &pit_ce;
113} 24}
114 25
115#ifndef CONFIG_X86_64 26#ifndef CONFIG_X86_64
@@ -123,7 +34,7 @@ static int __init init_pit_clocksource(void)
123 * - when local APIC timer is active (PIT is switched off) 34 * - when local APIC timer is active (PIT is switched off)
124 */ 35 */
125 if (num_possible_cpus() > 1 || is_hpet_enabled() || 36 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 37 i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
127 return 0; 38 return 0;
128 39
129 return clocksource_i8253_init(); 40 return clocksource_i8253_init();
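
The i8253.c hunk removes the arch-local PIT programming (init_pit_timer, pit_next_event, pit_ce) in favour of the shared i8253 clockevent driver; setup_pit_timer() now just calls clockevent_i8253_init(true) and publishes the shared i8253_clockevent as global_clock_event. Judging purely from the registration code deleted above, the shared init is expected to amount to something like the following sketch (not the driver's actual body):

	/* Sketch of what clockevent_i8253_init(oneshot) is assumed to do,
	 * based on the x86 copy removed by this hunk. */
	static void clockevent_i8253_init_sketch(bool oneshot)
	{
		if (oneshot)
			i8253_clockevent.features |= CLOCK_EVT_FEAT_ONESHOT;
		i8253_clockevent.cpumask = cpumask_of(smp_processor_id());
		clockevents_config_and_register(&i8253_clockevent,
						CLOCK_TICK_RATE, 0xF, 0x7FFF);
	}
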
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 65b8f5c2eeb..610485223bd 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -14,7 +14,7 @@
14#include <linux/io.h> 14#include <linux/io.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16 16
17#include <asm/atomic.h> 17#include <linux/atomic.h>
18#include <asm/system.h> 18#include <asm/system.h>
19#include <asm/timer.h> 19#include <asm/timer.h>
20#include <asm/hw_irq.h> 20#include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 48acf71c653..f3a90e926f5 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -15,7 +15,7 @@
15#include <linux/io.h> 15#include <linux/io.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17 17
18#include <asm/atomic.h> 18#include <linux/atomic.h>
19#include <asm/system.h> 19#include <asm/system.h>
20#include <asm/timer.h> 20#include <asm/timer.h>
21#include <asm/hw_irq.h> 21#include <asm/hw_irq.h>
@@ -275,9 +275,6 @@ static void __init apic_intr_init(void)
275#ifdef CONFIG_X86_MCE_THRESHOLD 275#ifdef CONFIG_X86_MCE_THRESHOLD
276 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 276 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
277#endif 277#endif
278#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
279 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
280#endif
281 278
282#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 279#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
283 /* self generated IPI for local APIC timer */ 280 /* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b..00354d4919a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
608 return register_die_notifier(&kgdb_notifier); 608 return register_die_notifier(&kgdb_notifier);
609} 609}
610 610
611static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, 611static void kgdb_hw_overflow_handler(struct perf_event *event,
612 struct perf_sample_data *data, struct pt_regs *regs) 612 struct perf_sample_data *data, struct pt_regs *regs)
613{ 613{
614 struct task_struct *tsk = current; 614 struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
638 for (i = 0; i < HBP_NUM; i++) { 638 for (i = 0; i < HBP_NUM; i++) {
639 if (breakinfo[i].pev) 639 if (breakinfo[i].pev)
640 continue; 640 continue;
641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
642 if (IS_ERR((void * __force)breakinfo[i].pev)) { 642 if (IS_ERR((void * __force)breakinfo[i].pev)) {
643 printk(KERN_ERR "kgdb: Could not allocate hw" 643 printk(KERN_ERR "kgdb: Could not allocate hw"
644 "breakpoints\nDisabling the kernel debugger\n"); 644 "breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index f1a6244d7d9..794bc95134c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -75,8 +75,10 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
75 /* 75 /*
76 * Undefined/reserved opcodes, conditional jump, Opcode Extension 76 * Undefined/reserved opcodes, conditional jump, Opcode Extension
77 * Groups, and some special opcodes can not boost. 77 * Groups, and some special opcodes can not boost.
78 * This is non-const to keep gcc from statically optimizing it out, as
79 * variable_test_bit makes gcc think only *(unsigned long*) is used.
78 */ 80 */
79static const u32 twobyte_is_boostable[256 / 32] = { 81static u32 twobyte_is_boostable[256 / 32] = {
80 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 82 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
81 /* ---------------------------------------------- */ 83 /* ---------------------------------------------- */
82 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ 84 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122..a9c2116001d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
51 51
52early_param("no-kvmapf", parse_no_kvmapf); 52early_param("no-kvmapf", parse_no_kvmapf);
53 53
54static int steal_acc = 1;
55static int parse_no_stealacc(char *arg)
56{
57 steal_acc = 0;
58 return 0;
59}
60
61early_param("no-steal-acc", parse_no_stealacc);
62
54struct kvm_para_state { 63struct kvm_para_state {
55 u8 mmu_queue[MMU_QUEUE_SIZE]; 64 u8 mmu_queue[MMU_QUEUE_SIZE];
56 int mmu_queue_len; 65 int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
58 67
59static DEFINE_PER_CPU(struct kvm_para_state, para_state); 68static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 69static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
70static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
71static int has_steal_clock = 0;
61 72
62static struct kvm_para_state *kvm_para_state(void) 73static struct kvm_para_state *kvm_para_state(void)
63{ 74{
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
441#endif 452#endif
442} 453}
443 454
455static void kvm_register_steal_time(void)
456{
457 int cpu = smp_processor_id();
458 struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
459
460 if (!has_steal_clock)
461 return;
462
463 memset(st, 0, sizeof(*st));
464
465 wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
466 printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
467 cpu, __pa(st));
468}
469
444void __cpuinit kvm_guest_cpu_init(void) 470void __cpuinit kvm_guest_cpu_init(void)
445{ 471{
446 if (!kvm_para_available()) 472 if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n", 483 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id()); 484 smp_processor_id());
459 } 485 }
486
487 if (has_steal_clock)
488 kvm_register_steal_time();
460} 489}
461 490
462static void kvm_pv_disable_apf(void *unused) 491static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify, 512 .notifier_call = kvm_pv_reboot_notify,
484}; 513};
485 514
515static u64 kvm_steal_clock(int cpu)
516{
517 u64 steal;
518 struct kvm_steal_time *src;
519 int version;
520
521 src = &per_cpu(steal_time, cpu);
522 do {
523 version = src->version;
524 rmb();
525 steal = src->steal;
526 rmb();
527 } while ((version & 1) || (version != src->version));
528
529 return steal;
530}
531
532void kvm_disable_steal_time(void)
533{
534 if (!has_steal_clock)
535 return;
536
537 wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
538}
539
486#ifdef CONFIG_SMP 540#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void) 541static void __init kvm_smp_prepare_boot_cpu(void)
488{ 542{
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
500 554
501static void kvm_guest_cpu_offline(void *dummy) 555static void kvm_guest_cpu_offline(void *dummy)
502{ 556{
557 kvm_disable_steal_time();
503 kvm_pv_disable_apf(NULL); 558 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all(); 559 apf_task_wake_all();
505} 560}
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) 603 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init; 604 x86_init.irqs.trap_init = kvm_apf_trap_init;
550 605
606 if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
607 has_steal_clock = 1;
608 pv_time_ops.steal_clock = kvm_steal_clock;
609 }
610
551#ifdef CONFIG_SMP 611#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 612 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier); 613 register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
555 kvm_guest_cpu_init(); 615 kvm_guest_cpu_init();
556#endif 616#endif
557} 617}
618
619static __init int activate_jump_labels(void)
620{
621 if (has_steal_clock) {
622 jump_label_inc(&paravirt_steal_enabled);
623 if (steal_acc)
624 jump_label_inc(&paravirt_steal_rq_enabled);
625 }
626
627 return 0;
628}
629arch_initcall(activate_jump_labels);
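
The kvm.c additions wire up paravirtual steal-time accounting: each CPU registers a kvm_steal_time area with the host through MSR_KVM_STEAL_TIME, kvm_steal_clock() is hooked into the new pv_time_ops.steal_clock, and the accounting can be suppressed with the no-steal-acc guest parameter. The shared area is read with the usual pvclock-style even/odd version protocol; stripped of the per-CPU details, the pattern is:

	/* Generic form of the lockless read in kvm_steal_clock() above: the
	 * producer makes 'version' odd while updating the payload, so an odd
	 * or changed value means the reader must retry. */
	struct shared_rec {
		u32 version;
		u64 payload;
	};

	static u64 read_consistent(struct shared_rec *rec)
	{
		u32 ver;
		u64 val;

		do {
			ver = rec->version;
			rmb();		/* read version before payload */
			val = rec->payload;
			rmb();		/* read payload before re-checking */
		} while ((ver & 1) || (ver != rec->version));

		return val;
	}
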
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6389a6bca11..44842d756b2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -74,9 +74,10 @@ static cycle_t kvm_clock_read(void)
74 struct pvclock_vcpu_time_info *src; 74 struct pvclock_vcpu_time_info *src;
75 cycle_t ret; 75 cycle_t ret;
76 76
77 src = &get_cpu_var(hv_clock); 77 preempt_disable_notrace();
78 src = &__get_cpu_var(hv_clock);
78 ret = pvclock_clocksource_read(src); 79 ret = pvclock_clocksource_read(src);
79 put_cpu_var(hv_clock); 80 preempt_enable_notrace();
80 return ret; 81 return ret;
81} 82}
82 83
@@ -160,6 +161,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
160static void kvm_crash_shutdown(struct pt_regs *regs) 161static void kvm_crash_shutdown(struct pt_regs *regs)
161{ 162{
162 native_write_msr(msr_kvm_system_time, 0, 0); 163 native_write_msr(msr_kvm_system_time, 0, 0);
164 kvm_disable_steal_time();
163 native_machine_crash_shutdown(regs); 165 native_machine_crash_shutdown(regs);
164} 166}
165#endif 167#endif
@@ -167,6 +169,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
167static void kvm_shutdown(void) 169static void kvm_shutdown(void)
168{ 170{
169 native_write_msr(msr_kvm_system_time, 0, 0); 171 native_write_msr(msr_kvm_system_time, 0, 0);
172 kvm_disable_steal_time();
170 native_machine_shutdown(); 173 native_machine_shutdown();
171} 174}
172 175
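
In kvmclock.c, kvm_clock_read() trades get_cpu_var()/put_cpu_var() for explicit preempt_disable_notrace()/preempt_enable_notrace() around __get_cpu_var(). The CPU pinning is unchanged; the point is that the _notrace variants do not call back into the function tracer, which matters because the tracer may use this very clocksource for timestamps. For context, the replaced helpers are assumed to expand roughly like this:

	/* Assumed (approximate) expansion of the replaced helpers: the
	 * traceable preempt_disable() inside get_cpu_var() is what could
	 * recurse into tracing. */
	#define get_cpu_var_sketch(var)	(*({ preempt_disable(); &__get_cpu_var(var); }))
	#define put_cpu_var_sketch(var)	do { (void)&(var); preempt_enable(); } while (0)
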
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab1..591be0ee193 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_CONTAINER_SECTION_HDR 8 69#define SECTION_HDR_SIZE 8
70#define UCODE_CONTAINER_HEADER_SIZE 12 70#define CONTAINER_HDR_SZ 12
71 71
72static struct equiv_cpu_entry *equiv_cpu_table; 72static struct equiv_cpu_entry *equiv_cpu_table;
73 73
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
158{ 158{
159 struct cpuinfo_x86 *c = &cpu_data(cpu); 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
160 unsigned int max_size, actual_size; 160 u32 max_size, actual_size;
161 161
162#define F1XH_MPB_MAX_SIZE 2048 162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824 163#define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
175 break; 175 break;
176 } 176 }
177 177
178 actual_size = buf[4] + (buf[5] << 8); 178 actual_size = *(u32 *)(buf + 4);
179 179
180 if (actual_size > size || actual_size > max_size) { 180 if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
181 pr_err("section size mismatch\n"); 181 pr_err("section size mismatch\n");
182 return 0; 182 return 0;
183 } 183 }
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
191 struct microcode_header_amd *mc = NULL; 191 struct microcode_header_amd *mc = NULL;
192 unsigned int actual_size = 0; 192 unsigned int actual_size = 0;
193 193
194 if (buf[0] != UCODE_UCODE_TYPE) { 194 if (*(u32 *)buf != UCODE_UCODE_TYPE) {
195 pr_err("invalid type field in container file section header\n"); 195 pr_err("invalid type field in container file section header\n");
196 goto out; 196 goto out;
197 } 197 }
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
204 if (!mc) 204 if (!mc)
205 goto out; 205 goto out;
206 206
207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); 207 get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; 208 *mc_size = actual_size + SECTION_HDR_SIZE;
209 209
210out: 210out:
211 return mc; 211 return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
229 return -ENOMEM; 229 return -ENOMEM;
230 } 230 }
231 231
232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size); 232 get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
233 233
234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 /* add header length */
235 return size + CONTAINER_HDR_SZ;
235} 236}
236 237
237static void free_equiv_cpu_table(void) 238static void free_equiv_cpu_table(void)
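
The microcode_amd.c changes read the section type and payload size as whole little-endian u32 fields (offsets 0 and 4 of the 8-byte section header) rather than assembling them byte by byte, and the size check now also accounts for the header itself. A small illustration of the header parse the hunk relies on (function and parameter names are made up for the example):

	/* Illustrative parse of one container section header:
	 *   offset 0: u32 type (UCODE_UCODE_TYPE for a patch section)
	 *   offset 4: u32 payload size
	 *   offset 8: payload (SECTION_HDR_SIZE bytes into the section)   */
	static int parse_section_hdr(const u8 *buf, unsigned int left,
				     u32 *type, u32 *payload_size)
	{
		if (left < SECTION_HDR_SIZE)
			return -EINVAL;

		*type = *(const u32 *)buf;
		*payload_size = *(const u32 *)(buf + 4);

		if (*payload_size + SECTION_HDR_SIZE > left)
			return -EINVAL;

		return 0;
	}
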
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 52f256f2cc8..925179f871d 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -45,21 +45,6 @@ void *module_alloc(unsigned long size)
45 -1, __builtin_return_address(0)); 45 -1, __builtin_return_address(0));
46} 46}
47 47
48/* Free memory returned from module_alloc */
49void module_free(struct module *mod, void *module_region)
50{
51 vfree(module_region);
52}
53
54/* We don't need anything special. */
55int module_frob_arch_sections(Elf_Ehdr *hdr,
56 Elf_Shdr *sechdrs,
57 char *secstrings,
58 struct module *mod)
59{
60 return 0;
61}
62
63#ifdef CONFIG_X86_32 48#ifdef CONFIG_X86_32
64int apply_relocate(Elf32_Shdr *sechdrs, 49int apply_relocate(Elf32_Shdr *sechdrs,
65 const char *strtab, 50 const char *strtab,
@@ -100,17 +85,6 @@ int apply_relocate(Elf32_Shdr *sechdrs,
100 } 85 }
101 return 0; 86 return 0;
102} 87}
103
104int apply_relocate_add(Elf32_Shdr *sechdrs,
105 const char *strtab,
106 unsigned int symindex,
107 unsigned int relsec,
108 struct module *me)
109{
110 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
111 me->name);
112 return -ENOEXEC;
113}
114#else /*X86_64*/ 88#else /*X86_64*/
115int apply_relocate_add(Elf64_Shdr *sechdrs, 89int apply_relocate_add(Elf64_Shdr *sechdrs,
116 const char *strtab, 90 const char *strtab,
@@ -181,17 +155,6 @@ overflow:
181 me->name); 155 me->name);
182 return -ENOEXEC; 156 return -ENOEXEC;
183} 157}
184
185int apply_relocate(Elf_Shdr *sechdrs,
186 const char *strtab,
187 unsigned int symindex,
188 unsigned int relsec,
189 struct module *me)
190{
191 printk(KERN_ERR "non add relocation not supported\n");
192 return -ENOSYS;
193}
194
195#endif 158#endif
196 159
197int module_finalize(const Elf_Ehdr *hdr, 160int module_finalize(const Elf_Ehdr *hdr,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 9103b89c145..0741b062a30 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
95 } 95 }
96#endif 96#endif
97 97
98 set_bit(m->busid, mp_bus_not_pci);
98 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { 99 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
99 set_bit(m->busid, mp_bus_not_pci);
100#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 100#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
101 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 101 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
102#endif 102#endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71..d90272e6bc4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
202 __native_flush_tlb_single(addr); 202 __native_flush_tlb_single(addr);
203} 203}
204 204
205struct jump_label_key paravirt_steal_enabled;
206struct jump_label_key paravirt_steal_rq_enabled;
207
208static u64 native_steal_clock(int cpu)
209{
210 return 0;
211}
212
205/* These are in entry.S */ 213/* These are in entry.S */
206extern void native_iret(void); 214extern void native_iret(void);
207extern void native_irq_enable_sysexit(void); 215extern void native_irq_enable_sysexit(void);
@@ -299,6 +307,10 @@ struct pv_info pv_info = {
299 .paravirt_enabled = 0, 307 .paravirt_enabled = 0,
300 .kernel_rpl = 0, 308 .kernel_rpl = 0,
301 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ 309 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
310
311#ifdef CONFIG_X86_64
312 .extra_user_64bit_cs = __USER_CS,
313#endif
302}; 314};
303 315
304struct pv_init_ops pv_init_ops = { 316struct pv_init_ops pv_init_ops = {
@@ -307,6 +319,7 @@ struct pv_init_ops pv_init_ops = {
307 319
308struct pv_time_ops pv_time_ops = { 320struct pv_time_ops pv_time_ops = {
309 .sched_clock = native_sched_clock, 321 .sched_clock = native_sched_clock,
322 .steal_clock = native_steal_clock,
310}; 323};
311 324
312struct pv_irq_ops pv_irq_ops = { 325struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e8c33a30200..726494b5834 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1553,7 +1553,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
1553 continue; 1553 continue;
1554 1554
1555 /* cover the whole region */ 1555 /* cover the whole region */
1556 npages = (r->end - r->start) >> PAGE_SHIFT; 1556 npages = resource_size(r) >> PAGE_SHIFT;
1557 npages++; 1557 npages++;
1558 1558
1559 iommu_range_reserve(tbl, r->start, npages); 1559 iommu_range_reserve(tbl, r->start, npages);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b49d00da2ae..3b730fb1385 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -44,6 +44,15 @@ int iommu_detected __read_mostly = 0;
44 */ 44 */
45int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
46 46
47/*
48 * Group multi-function PCI devices into a single device-group for the
49 * iommu_device_group interface. This tells the iommu driver to pretend
50 * it cannot distinguish between functions of a device, exposing only one
51 * group for the device. Useful for disallowing use of individual PCI
52 * functions from userspace drivers.
53 */
54int iommu_group_mf __read_mostly;
55
47extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 56extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
48 57
49/* Dummy device used for NULL arguments (normally ISA). */ 58/* Dummy device used for NULL arguments (normally ISA). */
@@ -168,6 +177,8 @@ static __init int iommu_setup(char *p)
168#endif 177#endif
169 if (!strncmp(p, "pt", 2)) 178 if (!strncmp(p, "pt", 2))
170 iommu_pass_through = 1; 179 iommu_pass_through = 1;
180 if (!strncmp(p, "group_mf", 8))
181 iommu_group_mf = 1;
171 182
172 gart_parse_options(p); 183 gart_parse_options(p);
173 184
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index ba0a4cce53b..63228035f9d 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -234,7 +234,7 @@ void __init probe_roms(void)
234 /* check for extension rom (ignore length byte!) */ 234 /* check for extension rom (ignore length byte!) */
235 rom = isa_bus_to_virt(extension_rom_resource.start); 235 rom = isa_bus_to_virt(extension_rom_resource.start);
236 if (romsignature(rom)) { 236 if (romsignature(rom)) {
237 length = extension_rom_resource.end - extension_rom_resource.start + 1; 237 length = resource_size(&extension_rom_resource);
238 if (romchecksum(rom, length)) { 238 if (romchecksum(rom, length)) {
239 request_resource(&iomem_resource, &extension_rom_resource); 239 request_resource(&iomem_resource, &extension_rom_resource);
240 upper = extension_rom_resource.start; 240 upper = extension_rom_resource.start;
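
Both this probe_roms.c hunk and the pci-calgary_64.c one above swap open-coded "end - start (+ 1)" arithmetic for resource_size(), which returns the byte length of an inclusive [start, end] resource range. For reference, the helper is assumed to be the standard <linux/ioport.h> definition:

	/* Reference only (not part of this patch): resource ranges are
	 * inclusive, hence the +1. */
	static inline resource_size_t resource_size(const struct resource *res)
	{
		return res->end - res->start + 1;
	}
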
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e1ba8cb24e4..30eb651d1fa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -403,6 +403,14 @@ void default_idle(void)
403EXPORT_SYMBOL(default_idle); 403EXPORT_SYMBOL(default_idle);
404#endif 404#endif
405 405
406bool set_pm_idle_to_default(void)
407{
408 bool ret = !!pm_idle;
409
410 pm_idle = default_idle;
411
412 return ret;
413}
406void stop_this_cpu(void *dummy) 414void stop_this_cpu(void *dummy)
407{ 415{
408 local_irq_disable(); 416 local_irq_disable();
@@ -438,29 +446,6 @@ void cpu_idle_wait(void)
438} 446}
439EXPORT_SYMBOL_GPL(cpu_idle_wait); 447EXPORT_SYMBOL_GPL(cpu_idle_wait);
440 448
441/*
442 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
443 * which can obviate IPI to trigger checking of need_resched.
444 * We execute MONITOR against need_resched and enter optimized wait state
445 * through MWAIT. Whenever someone changes need_resched, we would be woken
446 * up from MWAIT (without an IPI).
447 *
448 * New with Core Duo processors, MWAIT can take some hints based on CPU
449 * capability.
450 */
451void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
452{
453 if (!need_resched()) {
454 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
455 clflush((void *)&current_thread_info()->flags);
456
457 __monitor((void *)&current_thread_info()->flags, 0, 0);
458 smp_mb();
459 if (!need_resched())
460 __mwait(ax, cx);
461 }
462}
463
464/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 449/* Default MONITOR/MWAIT with no hints, used for default C1 state */
465static void mwait_idle(void) 450static void mwait_idle(void)
466{ 451{
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a3d0dc59067..7a3b65107a2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
38#include <linux/uaccess.h> 38#include <linux/uaccess.h>
39#include <linux/io.h> 39#include <linux/io.h>
40#include <linux/kdebug.h> 40#include <linux/kdebug.h>
41#include <linux/cpuidle.h>
41 42
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
@@ -109,7 +110,8 @@ void cpu_idle(void)
109 local_irq_disable(); 110 local_irq_disable();
110 /* Don't trace irqs off for idle */ 111 /* Don't trace irqs off for idle */
111 stop_critical_timings(); 112 stop_critical_timings();
112 pm_idle(); 113 if (cpuidle_idle_call())
114 pm_idle();
113 start_critical_timings(); 115 start_critical_timings();
114 } 116 }
115 tick_nohz_restart_sched_tick(); 117 tick_nohz_restart_sched_tick();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca6f7ab8df3..cbd26458911 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
37#include <linux/uaccess.h> 37#include <linux/uaccess.h>
38#include <linux/io.h> 38#include <linux/io.h>
39#include <linux/ftrace.h> 39#include <linux/ftrace.h>
40#include <linux/cpuidle.h>
40 41
41#include <asm/pgtable.h> 42#include <asm/pgtable.h>
42#include <asm/system.h> 43#include <asm/system.h>
@@ -56,31 +57,17 @@ asmlinkage extern void ret_from_fork(void);
56DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
57static DEFINE_PER_CPU(unsigned char, is_idle); 58static DEFINE_PER_CPU(unsigned char, is_idle);
58 59
59static ATOMIC_NOTIFIER_HEAD(idle_notifier);
60
61void idle_notifier_register(struct notifier_block *n)
62{
63 atomic_notifier_chain_register(&idle_notifier, n);
64}
65EXPORT_SYMBOL_GPL(idle_notifier_register);
66
67void idle_notifier_unregister(struct notifier_block *n)
68{
69 atomic_notifier_chain_unregister(&idle_notifier, n);
70}
71EXPORT_SYMBOL_GPL(idle_notifier_unregister);
72
73void enter_idle(void) 60void enter_idle(void)
74{ 61{
75 percpu_write(is_idle, 1); 62 percpu_write(is_idle, 1);
76 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 63 idle_notifier_call_chain(IDLE_START);
77} 64}
78 65
79static void __exit_idle(void) 66static void __exit_idle(void)
80{ 67{
81 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) 68 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
82 return; 69 return;
83 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 70 idle_notifier_call_chain(IDLE_END);
84} 71}
85 72
86/* Called from interrupts to signify idle end */ 73/* Called from interrupts to signify idle end */
@@ -136,7 +123,8 @@ void cpu_idle(void)
136 enter_idle(); 123 enter_idle();
137 /* Don't trace irqs off for idle */ 124 /* Don't trace irqs off for idle */
138 stop_critical_timings(); 125 stop_critical_timings();
139 pm_idle(); 126 if (cpuidle_idle_call())
127 pm_idle();
140 start_critical_timings(); 128 start_critical_timings();
141 129
142 /* In many cases the interrupt that ended idle 130 /* In many cases the interrupt that ended idle
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 807c2a2b80f..82528799c5d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
528 return ret; 528 return ret;
529} 529}
530 530
531static void ptrace_triggered(struct perf_event *bp, int nmi, 531static void ptrace_triggered(struct perf_event *bp,
532 struct perf_sample_data *data, 532 struct perf_sample_data *data,
533 struct pt_regs *regs) 533 struct pt_regs *regs)
534{ 534{
@@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
715 attr.bp_type = HW_BREAKPOINT_W; 715 attr.bp_type = HW_BREAKPOINT_W;
716 attr.disabled = 1; 716 attr.disabled = 1;
717 717
718 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); 718 bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
719 NULL, tsk);
719 720
720 /* 721 /*
721 * CHECKME: the previous code returned -EIO if the addr wasn't 722 * CHECKME: the previous code returned -EIO if the addr wasn't
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 8bbe8c56916..b78643d0f9a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,7 +10,7 @@
10 10
11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) 11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
12{ 12{
13 u8 config, rev; 13 u8 config;
14 u16 word; 14 u16 word;
15 15
16 /* BIOS may enable hardware IRQ balancing for 16 /* BIOS may enable hardware IRQ balancing for
@@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
18 * based platforms. 18 * based platforms.
19 * Disable SW irqbalance/affinity on those platforms. 19 * Disable SW irqbalance/affinity on those platforms.
20 */ 20 */
21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 21 if (dev->revision > 0x9)
22 if (rev > 0x9)
23 return; 22 return;
24 23
25 /* enable access to config space*/ 24 /* enable access to config space*/
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9242436e993..d4a705f2283 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -124,7 +124,7 @@ __setup("reboot=", reboot_setup);
124 */ 124 */
125 125
126/* 126/*
127 * Some machines require the "reboot=b" commandline option, 127 * Some machines require the "reboot=b" or "reboot=k" commandline options,
128 * this quirk makes that automatic. 128 * this quirk makes that automatic.
129 */ 129 */
130static int __init set_bios_reboot(const struct dmi_system_id *d) 130static int __init set_bios_reboot(const struct dmi_system_id *d)
@@ -136,6 +136,15 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
136 return 0; 136 return 0;
137} 137}
138 138
139static int __init set_kbd_reboot(const struct dmi_system_id *d)
140{
141 if (reboot_type != BOOT_KBD) {
142 reboot_type = BOOT_KBD;
143 printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident);
144 }
145 return 0;
146}
147
139static struct dmi_system_id __initdata reboot_dmi_table[] = { 148static struct dmi_system_id __initdata reboot_dmi_table[] = {
140 { /* Handle problems with rebooting on Dell E520's */ 149 { /* Handle problems with rebooting on Dell E520's */
141 .callback = set_bios_reboot, 150 .callback = set_bios_reboot,
@@ -295,7 +304,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
295 }, 304 },
296 }, 305 },
297 { /* Handle reboot issue on Acer Aspire one */ 306 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot, 307 .callback = set_kbd_reboot,
299 .ident = "Acer Aspire One A110", 308 .ident = "Acer Aspire One A110",
300 .matches = { 309 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"), 310 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11..36818f8ec2b 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -97,6 +97,8 @@ relocate_kernel:
97 ret 97 ret
98 98
99identity_mapped: 99identity_mapped:
100 /* set return address to 0 if not preserving context */
101 pushl $0
100 /* store the start address on the stack */ 102 /* store the start address on the stack */
101 pushl %edx 103 pushl %edx
102 104
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d47..7a6f3b3be3c 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -100,6 +100,8 @@ relocate_kernel:
100 ret 100 ret
101 101
102identity_mapped: 102identity_mapped:
103 /* set return address to 0 if not preserving context */
104 pushq $0
103 /* store the start address on the stack */ 105 /* store the start address on the stack */
104 pushq %rdx 106 pushq %rdx
105 107
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 3f2ad2640d8..ccdbc16b894 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -42,8 +42,11 @@ int mach_set_rtc_mmss(unsigned long nowtime)
42{ 42{
43 int real_seconds, real_minutes, cmos_minutes; 43 int real_seconds, real_minutes, cmos_minutes;
44 unsigned char save_control, save_freq_select; 44 unsigned char save_control, save_freq_select;
45 unsigned long flags;
45 int retval = 0; 46 int retval = 0;
46 47
48 spin_lock_irqsave(&rtc_lock, flags);
49
47 /* tell the clock it's being set */ 50 /* tell the clock it's being set */
48 save_control = CMOS_READ(RTC_CONTROL); 51 save_control = CMOS_READ(RTC_CONTROL);
49 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); 52 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
@@ -93,12 +96,17 @@ int mach_set_rtc_mmss(unsigned long nowtime)
93 CMOS_WRITE(save_control, RTC_CONTROL); 96 CMOS_WRITE(save_control, RTC_CONTROL);
94 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); 97 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
95 98
99 spin_unlock_irqrestore(&rtc_lock, flags);
100
96 return retval; 101 return retval;
97} 102}
98 103
99unsigned long mach_get_cmos_time(void) 104unsigned long mach_get_cmos_time(void)
100{ 105{
101 unsigned int status, year, mon, day, hour, min, sec, century = 0; 106 unsigned int status, year, mon, day, hour, min, sec, century = 0;
107 unsigned long flags;
108
109 spin_lock_irqsave(&rtc_lock, flags);
102 110
103 /* 111 /*
104 * If UIP is clear, then we have >= 244 microseconds before 112 * If UIP is clear, then we have >= 244 microseconds before
@@ -125,6 +133,8 @@ unsigned long mach_get_cmos_time(void)
125 status = CMOS_READ(RTC_CONTROL); 133 status = CMOS_READ(RTC_CONTROL);
126 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); 134 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
127 135
136 spin_unlock_irqrestore(&rtc_lock, flags);
137
128 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { 138 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
129 sec = bcd2bin(sec); 139 sec = bcd2bin(sec);
130 min = bcd2bin(min); 140 min = bcd2bin(min);
@@ -169,24 +179,15 @@ EXPORT_SYMBOL(rtc_cmos_write);
169 179
170int update_persistent_clock(struct timespec now) 180int update_persistent_clock(struct timespec now)
171{ 181{
172 unsigned long flags; 182 return x86_platform.set_wallclock(now.tv_sec);
173 int retval;
174
175 spin_lock_irqsave(&rtc_lock, flags);
176 retval = x86_platform.set_wallclock(now.tv_sec);
177 spin_unlock_irqrestore(&rtc_lock, flags);
178
179 return retval;
180} 183}
181 184
182/* not static: needed by APM */ 185/* not static: needed by APM */
183void read_persistent_clock(struct timespec *ts) 186void read_persistent_clock(struct timespec *ts)
184{ 187{
185 unsigned long retval, flags; 188 unsigned long retval;
186 189
187 spin_lock_irqsave(&rtc_lock, flags);
188 retval = x86_platform.get_wallclock(); 190 retval = x86_platform.get_wallclock();
189 spin_unlock_irqrestore(&rtc_lock, flags);
190 191
191 ts->tv_sec = retval; 192 ts->tv_sec = retval;
192 ts->tv_nsec = 0; 193 ts->tv_nsec = 0;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 40a24932a8a..54ddaeb221c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
485asmlinkage int 485asmlinkage int
486sys_sigsuspend(int history0, int history1, old_sigset_t mask) 486sys_sigsuspend(int history0, int history1, old_sigset_t mask)
487{ 487{
488 mask &= _BLOCKABLE; 488 sigset_t blocked;
489 spin_lock_irq(&current->sighand->siglock); 489
490 current->saved_sigmask = current->blocked; 490 current->saved_sigmask = current->blocked;
491 siginitset(&current->blocked, mask); 491
492 recalc_sigpending(); 492 mask &= _BLOCKABLE;
493 spin_unlock_irq(&current->sighand->siglock); 493 siginitset(&blocked, mask);
494 set_current_blocked(&blocked);
494 495
495 current->state = TASK_INTERRUPTIBLE; 496 current->state = TASK_INTERRUPTIBLE;
496 schedule(); 497 schedule();
497 set_restore_sigmask();
498 498
499 set_restore_sigmask();
499 return -ERESTARTNOHAND; 500 return -ERESTARTNOHAND;
500} 501}
501 502
@@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
572 goto badframe; 573 goto badframe;
573 574
574 sigdelsetmask(&set, ~_BLOCKABLE); 575 sigdelsetmask(&set, ~_BLOCKABLE);
575 spin_lock_irq(&current->sighand->siglock); 576 set_current_blocked(&set);
576 current->blocked = set;
577 recalc_sigpending();
578 spin_unlock_irq(&current->sighand->siglock);
579 577
580 if (restore_sigcontext(regs, &frame->sc, &ax)) 578 if (restore_sigcontext(regs, &frame->sc, &ax))
581 goto badframe; 579 goto badframe;
@@ -653,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
653 651
654static int 652static int
655setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 653setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
656 sigset_t *set, struct pt_regs *regs) 654 struct pt_regs *regs)
657{ 655{
658 int usig = signr_convert(sig); 656 int usig = signr_convert(sig);
657 sigset_t *set = &current->blocked;
659 int ret; 658 int ret;
660 659
660 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
661 set = &current->saved_sigmask;
662
661 /* Set up the stack frame */ 663 /* Set up the stack frame */
662 if (is_ia32) { 664 if (is_ia32) {
663 if (ka->sa.sa_flags & SA_SIGINFO) 665 if (ka->sa.sa_flags & SA_SIGINFO)
@@ -672,12 +674,13 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
672 return -EFAULT; 674 return -EFAULT;
673 } 675 }
674 676
677 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
675 return ret; 678 return ret;
676} 679}
677 680
678static int 681static int
679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
680 sigset_t *oldset, struct pt_regs *regs) 683 struct pt_regs *regs)
681{ 684{
682 sigset_t blocked; 685 sigset_t blocked;
683 int ret; 686 int ret;
@@ -712,20 +715,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
712 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 715 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
713 regs->flags &= ~X86_EFLAGS_TF; 716 regs->flags &= ~X86_EFLAGS_TF;
714 717
715 ret = setup_rt_frame(sig, ka, info, oldset, regs); 718 ret = setup_rt_frame(sig, ka, info, regs);
716 719
717 if (ret) 720 if (ret)
718 return ret; 721 return ret;
719 722
720#ifdef CONFIG_X86_64
721 /*
722 * This has nothing to do with segment registers,
723 * despite the name. This magic affects uaccess.h
724 * macros' behavior. Reset it to the normal setting.
725 */
726 set_fs(USER_DS);
727#endif
728
729 /* 723 /*
730 * Clear the direction flag as per the ABI for function entry. 724 * Clear the direction flag as per the ABI for function entry.
731 */ 725 */
@@ -767,7 +761,6 @@ static void do_signal(struct pt_regs *regs)
767 struct k_sigaction ka; 761 struct k_sigaction ka;
768 siginfo_t info; 762 siginfo_t info;
769 int signr; 763 int signr;
770 sigset_t *oldset;
771 764
772 /* 765 /*
773 * We want the common case to go fast, which is why we may in certain 766 * We want the common case to go fast, which is why we may in certain
@@ -779,23 +772,10 @@ static void do_signal(struct pt_regs *regs)
779 if (!user_mode(regs)) 772 if (!user_mode(regs))
780 return; 773 return;
781 774
782 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
783 oldset = &current->saved_sigmask;
784 else
785 oldset = &current->blocked;
786
787 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 775 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
788 if (signr > 0) { 776 if (signr > 0) {
789 /* Whee! Actually deliver the signal. */ 777 /* Whee! Actually deliver the signal. */
790 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 778 handle_signal(signr, &info, &ka, regs);
791 /*
792 * A signal was successfully delivered; the saved
793 * sigmask will have been stored in the signal frame,
794 * and will be restored by sigreturn, so we can simply
795 * clear the TS_RESTORE_SIGMASK flag.
796 */
797 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
798 }
799 return; 779 return;
800 } 780 }
801 781
@@ -823,7 +803,7 @@ static void do_signal(struct pt_regs *regs)
823 */ 803 */
824 if (current_thread_info()->status & TS_RESTORE_SIGMASK) { 804 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
825 current_thread_info()->status &= ~TS_RESTORE_SIGMASK; 805 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
826 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); 806 set_current_blocked(&current->saved_sigmask);
827 } 807 }
828} 808}
829 809
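
The signal.c rework stops open-coding sigmask updates under siglock and uses set_current_blocked() throughout; setup_rt_frame() now looks up the saved sigmask itself when TS_RESTORE_SIGMASK is set and clears the flag once the frame has been written, so handle_signal() and do_signal() no longer thread an oldset pointer through. The conversion, reduced to its core (illustration only):

	/* What the removed code did by hand on every path ... */
	static void block_sigs_old(const sigset_t *set)
	{
		spin_lock_irq(&current->sighand->siglock);
		current->blocked = *set;
		recalc_sigpending();
		spin_unlock_irq(&current->sighand->siglock);
	}

	/* ... and the helper it becomes, which takes the siglock itself and
	 * also keeps shared pending signals consistent. */
	static void block_sigs_new(sigset_t *set)
	{
		set_current_blocked(set);
	}
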
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9fd3137230d..39e11500b9b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -285,19 +285,6 @@ notrace static void __cpuinit start_secondary(void *unused)
285 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 285 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
286 x86_platform.nmi_init(); 286 x86_platform.nmi_init();
287 287
288 /*
289 * Wait until the cpu which brought this one up marked it
290 * online before enabling interrupts. If we don't do that then
291 * we can end up waking up the softirq thread before this cpu
292 * reached the active state, which makes the scheduler unhappy
293 * and schedule the softirq thread on the wrong cpu. This is
294 * only observable with forced threaded interrupts, but in
295 * theory it could also happen w/o them. It's just way harder
296 * to achieve.
297 */
298 while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
299 cpu_relax();
300
301 /* enable local interrupts */ 288 /* enable local interrupts */
302 local_irq_enable(); 289 local_irq_enable();
303 290
@@ -438,7 +425,7 @@ static void impress_friends(void)
438void __inquire_remote_apic(int apicid) 425void __inquire_remote_apic(int apicid)
439{ 426{
440 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 427 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
441 char *names[] = { "ID", "VERSION", "SPIV" }; 428 const char * const names[] = { "ID", "VERSION", "SPIV" };
442 int timeout; 429 int timeout;
443 u32 status; 430 u32 status;
444 431
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 55d9bc03f69..fdd0c6430e5 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
66} 66}
67EXPORT_SYMBOL_GPL(save_stack_trace); 67EXPORT_SYMBOL_GPL(save_stack_trace);
68 68
69void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) 69void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
70{ 70{
71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); 71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
72 if (trace->nr_entries < trace->max_entries) 72 if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 7977f0cfe33..c346d116148 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -74,7 +74,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
74 74
75#ifdef CONFIG_X86_64 75#ifdef CONFIG_X86_64
76 case 0x40 ... 0x4f: 76 case 0x40 ... 0x4f:
77 if (regs->cs != __USER_CS) 77 if (!user_64bit_mode(regs))
78 /* 32-bit mode: register increment */ 78 /* 32-bit mode: register increment */
79 return 0; 79 return 0;
80 /* 64-bit mode: REX prefix */ 80 /* 64-bit mode: REX prefix */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 652137f34dc..058cac30916 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -168,7 +168,7 @@ ENTRY(sys_call_table)
168 .long ptregs_vm86 168 .long ptregs_vm86
169 .long sys_ni_syscall /* Old sys_query_module */ 169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll 170 .long sys_poll
171 .long sys_nfsservctl 171 .long sys_ni_syscall /* Old nfsservctl */
172 .long sys_setresgid16 /* 170 */ 172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16 173 .long sys_getresgid16
174 .long sys_prctl 174 .long sys_prctl
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 30ac65df7d4..e07a2fc876b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
36#include <asm/bootparam.h> 36#include <asm/bootparam.h>
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
39#include <asm/swiotlb.h>
39#include <asm/fixmap.h> 40#include <asm/fixmap.h>
40#include <asm/proto.h> 41#include <asm/proto.h>
41#include <asm/setup.h> 42#include <asm/setup.h>
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 00cbb272627..5a64d057be5 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,13 +11,13 @@
11 11
12#include <linux/clockchips.h> 12#include <linux/clockchips.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/i8253.h>
14#include <linux/time.h> 15#include <linux/time.h>
15#include <linux/mca.h> 16#include <linux/mca.h>
16 17
17#include <asm/vsyscall.h> 18#include <asm/vsyscall.h>
18#include <asm/x86_init.h> 19#include <asm/x86_init.h>
19#include <asm/i8259.h> 20#include <asm/i8259.h>
20#include <asm/i8253.h>
21#include <asm/timer.h> 21#include <asm/timer.h>
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9d..6913369c234 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -49,7 +49,7 @@
49#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
50#include <asm/processor.h> 50#include <asm/processor.h>
51#include <asm/debugreg.h> 51#include <asm/debugreg.h>
52#include <asm/atomic.h> 52#include <linux/atomic.h>
53#include <asm/system.h> 53#include <asm/system.h>
54#include <asm/traps.h> 54#include <asm/traps.h>
55#include <asm/desc.h> 55#include <asm/desc.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6cc6922262a..db483369f10 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,7 +5,6 @@
5#include <linux/timer.h> 5#include <linux/timer.h>
6#include <linux/acpi_pmtmr.h> 6#include <linux/acpi_pmtmr.h>
7#include <linux/cpufreq.h> 7#include <linux/cpufreq.h>
8#include <linux/dmi.h>
9#include <linux/delay.h> 8#include <linux/delay.h>
10#include <linux/clocksource.h> 9#include <linux/clocksource.h>
11#include <linux/percpu.h> 10#include <linux/percpu.h>
@@ -777,7 +776,7 @@ static struct clocksource clocksource_tsc = {
777 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 776 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
778 CLOCK_SOURCE_MUST_VERIFY, 777 CLOCK_SOURCE_MUST_VERIFY,
779#ifdef CONFIG_X86_64 778#ifdef CONFIG_X86_64
780 .vread = vread_tsc, 779 .archdata = { .vclock_mode = VCLOCK_TSC },
781#endif 780#endif
782}; 781};
783 782
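[ The .vread-to-.archdata switch above is the clocksource side of retiring the old vread function pointer; user-space time reads are presumably selected by vclock_mode instead (see the update_vsyscall() hunk further down). As a sketch, a TSC-style clocksource after this patch would look roughly like the following — only the flags and archdata lines are taken from the hunk; the name, rating and read callback are placeholders: ]

	static struct clocksource clocksource_tsc_sketch = {
		.name		= "tsc",		/* placeholder */
		.rating		= 300,			/* placeholder */
		.read		= read_tsc,		/* assumed read callback */
		.mask		= CLOCKSOURCE_MASK(64),
		.flags		= CLOCK_SOURCE_IS_CONTINUOUS |
				  CLOCK_SOURCE_MUST_VERIFY,
		/* x86-64 only; replaces the old ".vread = vread_tsc" hook */
		.archdata	= { .vclock_mode = VCLOCK_TSC },
	};
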
@@ -800,27 +799,6 @@ void mark_tsc_unstable(char *reason)
800 799
801EXPORT_SYMBOL_GPL(mark_tsc_unstable); 800EXPORT_SYMBOL_GPL(mark_tsc_unstable);
802 801
803static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
804{
805 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
806 d->ident);
807 tsc_unstable = 1;
808 return 0;
809}
810
811/* List of systems that have known TSC problems */
812static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
813 {
814 .callback = dmi_mark_tsc_unstable,
815 .ident = "IBM Thinkpad 380XD",
816 .matches = {
817 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
818 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
819 },
820 },
821 {}
822};
823
824static void __init check_system_tsc_reliable(void) 802static void __init check_system_tsc_reliable(void)
825{ 803{
826#ifdef CONFIG_MGEODE_LX 804#ifdef CONFIG_MGEODE_LX
@@ -1010,8 +988,6 @@ void __init tsc_init(void)
1010 lpj_fine = lpj; 988 lpj_fine = lpj;
1011 989
1012 use_tsc_delay(); 990 use_tsc_delay();
1013 /* Check and install the TSC clocksource */
1014 dmi_check_system(bad_tsc_dmi_table);
1015 991
1016 if (unsynchronized_tsc()) 992 if (unsynchronized_tsc())
1017 mark_tsc_unstable("TSCs unsynchronized"); 993 mark_tsc_unstable("TSCs unsynchronized");
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 89aed99aafc..0f703f10901 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -71,7 +71,6 @@ PHDRS {
71 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
72 data PT_LOAD FLAGS(6); /* RW_ */ 72 data PT_LOAD FLAGS(6); /* RW_ */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 user PT_LOAD FLAGS(5); /* R_E */
75#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
76 percpu PT_LOAD FLAGS(6); /* RW_ */ 75 percpu PT_LOAD FLAGS(6); /* RW_ */
77#endif 76#endif
@@ -154,56 +153,25 @@ SECTIONS
154 153
155#ifdef CONFIG_X86_64 154#ifdef CONFIG_X86_64
156 155
157#define VSYSCALL_ADDR (-10*1024*1024) 156 . = ALIGN(PAGE_SIZE);
158 157 __vvar_page = .;
159#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET)
160#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
161
162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
170
171 . = ALIGN(4096);
172 __vsyscall_0 = .;
173
174 . = VSYSCALL_ADDR;
175 .vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
176 *(.vsyscall_0)
177 } :user
178
179 . = ALIGN(L1_CACHE_BYTES);
180 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
181 *(.vsyscall_fn)
182 }
183
184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
185 *(.vsyscall_1)
186 }
187 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
188 *(.vsyscall_2)
189 }
190 158
191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 159 .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
192 *(.vsyscall_3) 160 /* work around gold bug 13023 */
193 } 161 __vvar_beginning_hack = .;
194 162
163 /* Place all vvars at the offsets in asm/vvar.h. */
164#define EMIT_VVAR(name, offset) \
165 . = __vvar_beginning_hack + offset; \
166 *(.vvar_ ## name)
195#define __VVAR_KERNEL_LDS 167#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h> 168#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS 169#undef __VVAR_KERNEL_LDS
170#undef EMIT_VVAR
198 171
199 . = __vsyscall_0 + PAGE_SIZE; 172 } :data
200 173
201#undef VSYSCALL_ADDR 174 . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
202#undef VLOAD_OFFSET
203#undef VLOAD
204#undef VVIRT_OFFSET
205#undef VVIRT
206#undef EMIT_VVAR
207 175
208#endif /* CONFIG_X86_64 */ 176#endif /* CONFIG_X86_64 */
209 177
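[ To make the new EMIT_VVAR() definition above concrete: for a hypothetical entry EMIT_VVAR(foo, 128) pulled in from asm/vvar.h, the macro expands inside the .vvar output section to ]

	. = __vvar_beginning_hack + 128;
	*(.vvar_foo)

[ so each vvar lands at a fixed offset inside the single page starting at __vvar_page, instead of getting its own .vsyscall_var_* output section tied to the old VSYSCALL_ADDR layout deleted above. ]
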
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
deleted file mode 100644
index a81aa9e9894..00000000000
--- a/arch/x86/kernel/vread_tsc_64.c
+++ /dev/null
@@ -1,36 +0,0 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3e682184d76..b56c65de384 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 * Copyright 2003 Andi Kleen, SuSE Labs. 3 * Copyright 2003 Andi Kleen, SuSE Labs.
4 * 4 *
5 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
6 *
5 * Thanks to hpa@transmeta.com for some useful hint. 7 * Thanks to hpa@transmeta.com for some useful hint.
6 * Special thanks to Ingo Molnar for his early experience with 8 * Special thanks to Ingo Molnar for his early experience with
7 * a different vsyscall implementation for Linux/IA32 and for the name. 9 * a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,15 +13,11 @@
11 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid 13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
12 * jumping out of line if necessary. We cannot add more with this 14 * jumping out of line if necessary. We cannot add more with this
13 * mechanism because older kernels won't return -ENOSYS. 15 * mechanism because older kernels won't return -ENOSYS.
14 * If we want more than four we need a vDSO.
15 * 16 *
16 * Note: the concept clashes with user mode linux. If you use UML and 17 * Note: the concept clashes with user mode linux. UML users should
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 18 * use the vDSO.
18 */ 19 */
19 20
20/* Disable profiling for userspace code: */
21#define DISABLE_BRANCH_PROFILING
22
23#include <linux/time.h> 21#include <linux/time.h>
24#include <linux/init.h> 22#include <linux/init.h>
25#include <linux/kernel.h> 23#include <linux/kernel.h>
@@ -32,9 +30,12 @@
32#include <linux/cpu.h> 30#include <linux/cpu.h>
33#include <linux/smp.h> 31#include <linux/smp.h>
34#include <linux/notifier.h> 32#include <linux/notifier.h>
33#include <linux/syscalls.h>
34#include <linux/ratelimit.h>
35 35
36#include <asm/vsyscall.h> 36#include <asm/vsyscall.h>
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/compat.h>
38#include <asm/page.h> 39#include <asm/page.h>
39#include <asm/unistd.h> 40#include <asm/unistd.h>
40#include <asm/fixmap.h> 41#include <asm/fixmap.h>
@@ -44,18 +45,38 @@
44#include <asm/desc.h> 45#include <asm/desc.h>
45#include <asm/topology.h> 46#include <asm/topology.h>
46#include <asm/vgtod.h> 47#include <asm/vgtod.h>
48#include <asm/traps.h>
47 49
48#define __vsyscall(nr) \ 50#define CREATE_TRACE_POINTS
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace 51#include "vsyscall_trace.h"
50#define __syscall_clobber "r11","cx","memory"
51 52
52DEFINE_VVAR(int, vgetcpu_mode); 53DEFINE_VVAR(int, vgetcpu_mode);
53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = 54DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54{ 55{
55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 56 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
56 .sysctl_enabled = 1,
57}; 57};
58 58
59static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
60
61static int __init vsyscall_setup(char *str)
62{
63 if (str) {
64 if (!strcmp("emulate", str))
65 vsyscall_mode = EMULATE;
66 else if (!strcmp("native", str))
67 vsyscall_mode = NATIVE;
68 else if (!strcmp("none", str))
69 vsyscall_mode = NONE;
70 else
71 return -EINVAL;
72
73 return 0;
74 }
75
76 return -EINVAL;
77}
78early_param("vsyscall", vsyscall_setup);
79
59void update_vsyscall_tz(void) 80void update_vsyscall_tz(void)
60{ 81{
61 unsigned long flags; 82 unsigned long flags;
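[ The vsyscall_setup() early_param above is the only new user-visible knob in this file. Assuming the usual early-parameter syntax on the kernel command line, the three accepted values are: ]

	vsyscall=emulate	map the vsyscall page non-executable and emulate
				each call in the kernel (see emulate_vsyscall()
				and map_vsyscall() below)
	vsyscall=native		keep the page executable with real syscall
				instructions (the default here, per vsyscall_mode)
	vsyscall=none		reject legacy vsyscalls outright and log the attempt
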
@@ -72,179 +93,140 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
72 unsigned long flags; 93 unsigned long flags;
73 94
74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); 95 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
96
75 /* copy vsyscall data */ 97 /* copy vsyscall data */
76 vsyscall_gtod_data.clock.vread = clock->vread; 98 vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
77 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 99 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
78 vsyscall_gtod_data.clock.mask = clock->mask; 100 vsyscall_gtod_data.clock.mask = clock->mask;
79 vsyscall_gtod_data.clock.mult = mult; 101 vsyscall_gtod_data.clock.mult = mult;
80 vsyscall_gtod_data.clock.shift = clock->shift; 102 vsyscall_gtod_data.clock.shift = clock->shift;
81 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 103 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
82 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 104 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
83 vsyscall_gtod_data.wall_to_monotonic = *wtm; 105 vsyscall_gtod_data.wall_to_monotonic = *wtm;
84 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 106 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
107
85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 108 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
86} 109}
87 110
88/* RED-PEN may want to readd seq locking, but then the variable should be 111static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
89 * write-once. 112 const char *message)
90 */
91static __always_inline void do_get_tz(struct timezone * tz)
92{ 113{
93 *tz = VVAR(vsyscall_gtod_data).sys_tz; 114 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
94} 115 struct task_struct *tsk;
95 116
96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 117 if (!show_unhandled_signals || !__ratelimit(&rs))
97{ 118 return;
98 int ret;
99 asm volatile("syscall"
100 : "=a" (ret)
101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
102 : __syscall_clobber );
103 return ret;
104}
105 119
106static __always_inline long time_syscall(long *t) 120 tsk = current;
107{
108 long secs;
109 asm volatile("syscall"
110 : "=a" (secs)
111 : "0" (__NR_time),"D" (t) : __syscall_clobber);
112 return secs;
113}
114 121
115static __always_inline void do_vgettimeofday(struct timeval * tv) 122 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
116{ 123 level, tsk->comm, task_pid_nr(tsk),
117 cycle_t now, base, mask, cycle_delta; 124 message, regs->ip, regs->cs,
118 unsigned seq; 125 regs->sp, regs->ax, regs->si, regs->di);
119 unsigned long mult, shift, nsec;
120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
123
124 vread = VVAR(vsyscall_gtod_data).clock.vread;
125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
127 gettimeofday(tv,NULL);
128 return;
129 }
130
131 now = vread();
132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
133 mask = VVAR(vsyscall_gtod_data).clock.mask;
134 mult = VVAR(vsyscall_gtod_data).clock.mult;
135 shift = VVAR(vsyscall_gtod_data).clock.shift;
136
137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
140
141 /* calculate interval: */
142 cycle_delta = (now - base) & mask;
143 /* convert to nsecs: */
144 nsec += (cycle_delta * mult) >> shift;
145
146 while (nsec >= NSEC_PER_SEC) {
147 tv->tv_sec += 1;
148 nsec -= NSEC_PER_SEC;
149 }
150 tv->tv_usec = nsec / NSEC_PER_USEC;
151} 126}
152 127
153int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 128static int addr_to_vsyscall_nr(unsigned long addr)
154{ 129{
155 if (tv) 130 int nr;
156 do_vgettimeofday(tv); 131
157 if (tz) 132 if ((addr & ~0xC00UL) != VSYSCALL_START)
158 do_get_tz(tz); 133 return -EINVAL;
159 return 0; 134
135 nr = (addr & 0xC00UL) >> 10;
136 if (nr >= 3)
137 return -EINVAL;
138
139 return nr;
160} 140}
161 141
162/* This will break when the xtime seconds get inaccurate, but that is 142bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
163 * unlikely */
164time_t __vsyscall(1) vtime(time_t *t)
165{ 143{
166 unsigned seq; 144 struct task_struct *tsk;
167 time_t result; 145 unsigned long caller;
168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) 146 int vsyscall_nr;
169 return time_syscall(t); 147 long ret;
170 148
171 do { 149 /*
172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); 150 * No point in checking CS -- the only way to get here is a user mode
151 * trap to a high address, which means that we're in 64-bit user code.
152 */
153
154 WARN_ON_ONCE(address != regs->ip);
155
156 if (vsyscall_mode == NONE) {
157 warn_bad_vsyscall(KERN_INFO, regs,
158 "vsyscall attempted with vsyscall=none");
159 return false;
160 }
173 161
174 result = VVAR(vsyscall_gtod_data).wall_time_sec; 162 vsyscall_nr = addr_to_vsyscall_nr(address);
175 163
176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); 164 trace_emulate_vsyscall(vsyscall_nr);
177 165
178 if (t) 166 if (vsyscall_nr < 0) {
179 *t = result; 167 warn_bad_vsyscall(KERN_WARNING, regs,
180 return result; 168 "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
181} 169 goto sigsegv;
170 }
182 171
183/* Fast way to get current CPU and node. 172 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
184 This helps to do per node and per CPU caches in user space. 173 warn_bad_vsyscall(KERN_WARNING, regs,
185 The result is not guaranteed without CPU affinity, but usually 174 "vsyscall with bad stack (exploit attempt?)");
186 works out because the scheduler tries to keep a thread on the same 175 goto sigsegv;
187 CPU. 176 }
188 177
189 tcache must point to a two element sized long array. 178 tsk = current;
190 All arguments can be NULL. */ 179 if (seccomp_mode(&tsk->seccomp))
191long __vsyscall(2) 180 do_exit(SIGKILL);
192vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 181
193{ 182 switch (vsyscall_nr) {
194 unsigned int p; 183 case 0:
195 unsigned long j = 0; 184 ret = sys_gettimeofday(
196 185 (struct timeval __user *)regs->di,
197 /* Fast cache - only recompute value once per jiffies and avoid 186 (struct timezone __user *)regs->si);
198 relatively costly rdtscp/cpuid otherwise. 187 break;
199 This works because the scheduler usually keeps the process 188
200 on the same CPU and this syscall doesn't guarantee its 189 case 1:
201 results anyways. 190 ret = sys_time((time_t __user *)regs->di);
202 We do this here because otherwise user space would do it on 191 break;
203 its own in a likely inferior way (no access to jiffies). 192
204 If you don't like it pass NULL. */ 193 case 2:
205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { 194 ret = sys_getcpu((unsigned __user *)regs->di,
206 p = tcache->blob[1]; 195 (unsigned __user *)regs->si,
207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 196 0);
208 /* Load per CPU data from RDTSCP */ 197 break;
209 native_read_tscp(&p);
210 } else {
211 /* Load per CPU data from GDT */
212 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
213 } 198 }
214 if (tcache) { 199
215 tcache->blob[0] = j; 200 if (ret == -EFAULT) {
216 tcache->blob[1] = p; 201 /*
202 * Bad news -- userspace fed a bad pointer to a vsyscall.
203 *
204 * With a real vsyscall, that would have caused SIGSEGV.
205 * To make writing reliable exploits using the emulated
206 * vsyscalls harder, generate SIGSEGV here as well.
207 */
208 warn_bad_vsyscall(KERN_INFO, regs,
209 "vsyscall fault (exploit attempt?)");
210 goto sigsegv;
217 } 211 }
218 if (cpu)
219 *cpu = p & 0xfff;
220 if (node)
221 *node = p >> 12;
222 return 0;
223}
224 212
225static long __vsyscall(3) venosys_1(void) 213 regs->ax = ret;
226{
227 return -ENOSYS;
228}
229 214
230#ifdef CONFIG_SYSCTL 215 /* Emulate a ret instruction. */
231static ctl_table kernel_table2[] = { 216 regs->ip = caller;
232 { .procname = "vsyscall64", 217 regs->sp += 8;
233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
234 .mode = 0644,
235 .proc_handler = proc_dointvec },
236 {}
237};
238 218
239static ctl_table kernel_root_table2[] = { 219 return true;
240 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 },
242 {}
243};
244#endif
245 220
246/* Assume __initcall executes before all user space. Hopefully kmod 221sigsegv:
247 doesn't violate that. We'll find out if it does. */ 222 force_sig(SIGSEGV, current);
223 return true;
224}
225
226/*
227 * Assume __initcall executes before all user space. Hopefully kmod
228 * doesn't violate that. We'll find out if it does.
229 */
248static void __cpuinit vsyscall_set_cpu(int cpu) 230static void __cpuinit vsyscall_set_cpu(int cpu)
249{ 231{
250 unsigned long d; 232 unsigned long d;
@@ -255,13 +237,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
255 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 237 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
256 write_rdtscp_aux((node << 12) | cpu); 238 write_rdtscp_aux((node << 12) | cpu);
257 239
258 /* Store cpu number in limit so that it can be loaded quickly 240 /*
259 in user space in vgetcpu. 241 * Store cpu number in limit so that it can be loaded quickly
260 12 bits for the CPU and 8 bits for the node. */ 242 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
243 */
261 d = 0x0f40000000000ULL; 244 d = 0x0f40000000000ULL;
262 d |= cpu; 245 d |= cpu;
263 d |= (node & 0xf) << 12; 246 d |= (node & 0xf) << 12;
264 d |= (node >> 4) << 48; 247 d |= (node >> 4) << 48;
248
265 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); 249 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
266} 250}
267 251
@@ -275,34 +259,40 @@ static int __cpuinit
275cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) 259cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
276{ 260{
277 long cpu = (long)arg; 261 long cpu = (long)arg;
262
278 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 263 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
279 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); 264 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
265
280 return NOTIFY_DONE; 266 return NOTIFY_DONE;
281} 267}
282 268
283void __init map_vsyscall(void) 269void __init map_vsyscall(void)
284{ 270{
285 extern char __vsyscall_0; 271 extern char __vsyscall_page;
286 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 272 unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
287 273 extern char __vvar_page;
288 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ 274 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
289 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 275
276 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
277 vsyscall_mode == NATIVE
278 ? PAGE_KERNEL_VSYSCALL
279 : PAGE_KERNEL_VVAR);
280 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
281 (unsigned long)VSYSCALL_START);
282
283 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
284 BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
285 (unsigned long)VVAR_ADDRESS);
290} 286}
291 287
292static int __init vsyscall_init(void) 288static int __init vsyscall_init(void)
293{ 289{
294 BUG_ON(((unsigned long) &vgettimeofday != 290 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
295 VSYSCALL_ADDR(__NR_vgettimeofday))); 291
296 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
297 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
298 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
299#ifdef CONFIG_SYSCTL
300 register_sysctl_table(kernel_root_table2);
301#endif
302 on_each_cpu(cpu_vsyscall_init, NULL, 1); 292 on_each_cpu(cpu_vsyscall_init, NULL, 1);
303 /* notifier priority > KVM */ 293 /* notifier priority > KVM */
304 hotcpu_notifier(cpu_vsyscall_notifier, 30); 294 hotcpu_notifier(cpu_vsyscall_notifier, 30);
295
305 return 0; 296 return 0;
306} 297}
307
308__initcall(vsyscall_init); 298__initcall(vsyscall_init);
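[ From the user side, the point of emulate_vsyscall() above is that old binaries which still call the fixed legacy addresses keep working. A hedged user-space sketch follows; the 0xffffffffff600000 base is the long-standing x86-64 VSYSCALL_START value rather than something introduced by this patch, and the fault-to-emulation wiring is assumed to come from the page-fault path (hence the new <asm/traps.h> include). ]

	/*
	 * Sketch: call the legacy gettimeofday vsyscall (nr 0) directly.
	 * With vsyscall=emulate the page is mapped non-executable, the
	 * resulting fault is presumably routed to emulate_vsyscall(), and
	 * the call still returns 0 with a filled-in timeval.
	 */
	#include <stdio.h>
	#include <sys/time.h>

	typedef int (*vgtod_fn)(struct timeval *tv, struct timezone *tz);

	int main(void)
	{
		vgtod_fn vgettimeofday = (vgtod_fn)0xffffffffff600000UL;
		struct timeval tv;

		if (vgettimeofday(&tv, NULL) == 0)
			printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
		return 0;
	}
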
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 00000000000..c9596a9af15
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,37 @@
1/*
2 * vsyscall_emu_64.S: Vsyscall emulation page
3 *
4 * Copyright (c) 2011 Andy Lutomirski
5 *
6 * Subject to the GNU General Public License, version 2
7 */
8
9#include <linux/linkage.h>
10
11#include <asm/irq_vectors.h>
12#include <asm/page_types.h>
13#include <asm/unistd_64.h>
14
15__PAGE_ALIGNED_DATA
16 .globl __vsyscall_page
17 .balign PAGE_SIZE, 0xcc
18 .type __vsyscall_page, @object
19__vsyscall_page:
20
21 mov $__NR_gettimeofday, %rax
22 syscall
23 ret
24
25 .balign 1024, 0xcc
26 mov $__NR_time, %rax
27 syscall
28 ret
29
30 .balign 1024, 0xcc
31 mov $__NR_getcpu, %rax
32 syscall
33 ret
34
35 .balign 4096, 0xcc
36
37 .size __vsyscall_page, 4096
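[ Read together with addr_to_vsyscall_nr() in vsyscall_64.c above, the .balign directives pin the three stubs at 1024-byte offsets; the 0xcc fill byte is the int3 opcode, so a jump that lands between entries traps instead of sliding into the next stub. The resulting page layout: ]

	__vsyscall_page + 0x000		gettimeofday stub	(vsyscall nr 0)
	__vsyscall_page + 0x400		time stub		(vsyscall nr 1)
	__vsyscall_page + 0x800		getcpu stub		(vsyscall nr 2)
	__vsyscall_page + 0xc00-0xfff	int3 (0xcc) padding
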
diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h
new file mode 100644
index 00000000000..a8b2edec54f
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_trace.h
@@ -0,0 +1,29 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM vsyscall
3
4#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define __VSYSCALL_TRACE_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(emulate_vsyscall,
10
11 TP_PROTO(int nr),
12
13 TP_ARGS(nr),
14
15 TP_STRUCT__entry(__field(int, nr)),
16
17 TP_fast_assign(
18 __entry->nr = nr;
19 ),
20
21 TP_printk("nr = %d", __entry->nr)
22);
23
24#endif
25
26#undef TRACE_INCLUDE_PATH
27#define TRACE_INCLUDE_PATH ../../arch/x86/kernel
28#define TRACE_INCLUDE_FILE vsyscall_trace
29#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 50f63648ce1..ff5790d8e99 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -22,6 +22,8 @@ config KVM
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 # for device assignment: 23 # for device assignment:
24 depends on PCI 24 depends on PCI
25 # for TASKSTATS/TASK_DELAY_ACCT:
26 depends on NET
25 select PREEMPT_NOTIFIERS 27 select PREEMPT_NOTIFIERS
26 select MMU_NOTIFIER 28 select MMU_NOTIFIER
27 select ANON_INODES 29 select ANON_INODES
@@ -31,6 +33,8 @@ config KVM
31 select KVM_ASYNC_PF 33 select KVM_ASYNC_PF
32 select USER_RETURN_NOTIFIER 34 select USER_RETURN_NOTIFIER
33 select KVM_MMIO 35 select KVM_MMIO
36 select TASKSTATS
37 select TASK_DELAY_ACCT
34 ---help--- 38 ---help---
35 Support hosting fully virtualized guest machines using hardware 39 Support hosting fully virtualized guest machines using hardware
36 virtualization extensions. You will need a fairly recent 40 virtualization extensions. You will need a fairly recent
@@ -76,6 +80,5 @@ config KVM_MMU_AUDIT
76# the virtualization menu. 80# the virtualization menu.
77source drivers/vhost/Kconfig 81source drivers/vhost/Kconfig
78source drivers/lguest/Kconfig 82source drivers/lguest/Kconfig
79source drivers/virtio/Kconfig
80 83
81endif # VIRTUALIZATION 84endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index adc98675cda..8b4cc5f067d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -407,76 +407,59 @@ struct gprefix {
407 } \ 407 } \
408 } while (0) 408 } while (0)
409 409
410/* Fetch next part of the instruction being emulated. */
411#define insn_fetch(_type, _size, _eip) \
412({ unsigned long _x; \
413 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
414 if (rc != X86EMUL_CONTINUE) \
415 goto done; \
416 (_eip) += (_size); \
417 (_type)_x; \
418})
419
420#define insn_fetch_arr(_arr, _size, _eip) \
421({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
422 if (rc != X86EMUL_CONTINUE) \
423 goto done; \
424 (_eip) += (_size); \
425})
426
427static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, 410static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
428 enum x86_intercept intercept, 411 enum x86_intercept intercept,
429 enum x86_intercept_stage stage) 412 enum x86_intercept_stage stage)
430{ 413{
431 struct x86_instruction_info info = { 414 struct x86_instruction_info info = {
432 .intercept = intercept, 415 .intercept = intercept,
433 .rep_prefix = ctxt->decode.rep_prefix, 416 .rep_prefix = ctxt->rep_prefix,
434 .modrm_mod = ctxt->decode.modrm_mod, 417 .modrm_mod = ctxt->modrm_mod,
435 .modrm_reg = ctxt->decode.modrm_reg, 418 .modrm_reg = ctxt->modrm_reg,
436 .modrm_rm = ctxt->decode.modrm_rm, 419 .modrm_rm = ctxt->modrm_rm,
437 .src_val = ctxt->decode.src.val64, 420 .src_val = ctxt->src.val64,
438 .src_bytes = ctxt->decode.src.bytes, 421 .src_bytes = ctxt->src.bytes,
439 .dst_bytes = ctxt->decode.dst.bytes, 422 .dst_bytes = ctxt->dst.bytes,
440 .ad_bytes = ctxt->decode.ad_bytes, 423 .ad_bytes = ctxt->ad_bytes,
441 .next_rip = ctxt->eip, 424 .next_rip = ctxt->eip,
442 }; 425 };
443 426
444 return ctxt->ops->intercept(ctxt, &info, stage); 427 return ctxt->ops->intercept(ctxt, &info, stage);
445} 428}
446 429
447static inline unsigned long ad_mask(struct decode_cache *c) 430static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
448{ 431{
449 return (1UL << (c->ad_bytes << 3)) - 1; 432 return (1UL << (ctxt->ad_bytes << 3)) - 1;
450} 433}
451 434
452/* Access/update address held in a register, based on addressing mode. */ 435/* Access/update address held in a register, based on addressing mode. */
453static inline unsigned long 436static inline unsigned long
454address_mask(struct decode_cache *c, unsigned long reg) 437address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
455{ 438{
456 if (c->ad_bytes == sizeof(unsigned long)) 439 if (ctxt->ad_bytes == sizeof(unsigned long))
457 return reg; 440 return reg;
458 else 441 else
459 return reg & ad_mask(c); 442 return reg & ad_mask(ctxt);
460} 443}
461 444
462static inline unsigned long 445static inline unsigned long
463register_address(struct decode_cache *c, unsigned long reg) 446register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
464{ 447{
465 return address_mask(c, reg); 448 return address_mask(ctxt, reg);
466} 449}
467 450
468static inline void 451static inline void
469register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) 452register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
470{ 453{
471 if (c->ad_bytes == sizeof(unsigned long)) 454 if (ctxt->ad_bytes == sizeof(unsigned long))
472 *reg += inc; 455 *reg += inc;
473 else 456 else
474 *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); 457 *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
475} 458}
476 459
477static inline void jmp_rel(struct decode_cache *c, int rel) 460static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
478{ 461{
479 register_address_increment(c, &c->eip, rel); 462 register_address_increment(ctxt, &ctxt->_eip, rel);
480} 463}
481 464
482static u32 desc_limit_scaled(struct desc_struct *desc) 465static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
486 return desc->g ? (limit << 12) | 0xfff : limit; 469 return desc->g ? (limit << 12) | 0xfff : limit;
487} 470}
488 471
489static void set_seg_override(struct decode_cache *c, int seg) 472static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
490{ 473{
491 c->has_seg_override = true; 474 ctxt->has_seg_override = true;
492 c->seg_override = seg; 475 ctxt->seg_override = seg;
493} 476}
494 477
495static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, 478static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
496 struct x86_emulate_ops *ops, int seg)
497{ 479{
498 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 480 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
499 return 0; 481 return 0;
500 482
501 return ops->get_cached_segment_base(ctxt, seg); 483 return ctxt->ops->get_cached_segment_base(ctxt, seg);
502} 484}
503 485
504static unsigned seg_override(struct x86_emulate_ctxt *ctxt, 486static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
505 struct decode_cache *c)
506{ 487{
507 if (!c->has_seg_override) 488 if (!ctxt->has_seg_override)
508 return 0; 489 return 0;
509 490
510 return c->seg_override; 491 return ctxt->seg_override;
511} 492}
512 493
513static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 494static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
579 unsigned size, bool write, bool fetch, 560 unsigned size, bool write, bool fetch,
580 ulong *linear) 561 ulong *linear)
581{ 562{
582 struct decode_cache *c = &ctxt->decode;
583 struct desc_struct desc; 563 struct desc_struct desc;
584 bool usable; 564 bool usable;
585 ulong la; 565 ulong la;
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
587 u16 sel; 567 u16 sel;
588 unsigned cpl, rpl; 568 unsigned cpl, rpl;
589 569
590 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; 570 la = seg_base(ctxt, addr.seg) + addr.ea;
591 switch (ctxt->mode) { 571 switch (ctxt->mode) {
592 case X86EMUL_MODE_REAL: 572 case X86EMUL_MODE_REAL:
593 break; 573 break;
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
637 } 617 }
638 break; 618 break;
639 } 619 }
640 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) 620 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
641 la &= (u32)-1; 621 la &= (u32)-1;
642 *linear = la; 622 *linear = la;
643 return X86EMUL_CONTINUE; 623 return X86EMUL_CONTINUE;
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
671 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); 651 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
672} 652}
673 653
674static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 654static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
675 struct x86_emulate_ops *ops,
676 unsigned long eip, u8 *dest) 655 unsigned long eip, u8 *dest)
677{ 656{
678 struct fetch_cache *fc = &ctxt->decode.fetch; 657 struct fetch_cache *fc = &ctxt->fetch;
679 int rc; 658 int rc;
680 int size, cur_size; 659 int size, cur_size;
681 660
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
687 rc = __linearize(ctxt, addr, size, false, true, &linear); 666 rc = __linearize(ctxt, addr, size, false, true, &linear);
688 if (rc != X86EMUL_CONTINUE) 667 if (rc != X86EMUL_CONTINUE)
689 return rc; 668 return rc;
690 rc = ops->fetch(ctxt, linear, fc->data + cur_size, 669 rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
691 size, &ctxt->exception); 670 size, &ctxt->exception);
692 if (rc != X86EMUL_CONTINUE) 671 if (rc != X86EMUL_CONTINUE)
693 return rc; 672 return rc;
694 fc->end += size; 673 fc->end += size;
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
698} 677}
699 678
700static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 679static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
701 struct x86_emulate_ops *ops,
702 unsigned long eip, void *dest, unsigned size) 680 unsigned long eip, void *dest, unsigned size)
703{ 681{
704 int rc; 682 int rc;
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
707 if (eip + size - ctxt->eip > 15) 685 if (eip + size - ctxt->eip > 15)
708 return X86EMUL_UNHANDLEABLE; 686 return X86EMUL_UNHANDLEABLE;
709 while (size--) { 687 while (size--) {
710 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 688 rc = do_insn_fetch_byte(ctxt, eip++, dest++);
711 if (rc != X86EMUL_CONTINUE) 689 if (rc != X86EMUL_CONTINUE)
712 return rc; 690 return rc;
713 } 691 }
714 return X86EMUL_CONTINUE; 692 return X86EMUL_CONTINUE;
715} 693}
716 694
695/* Fetch next part of the instruction being emulated. */
696#define insn_fetch(_type, _size, _eip) \
697({ unsigned long _x; \
698 rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \
699 if (rc != X86EMUL_CONTINUE) \
700 goto done; \
701 (_eip) += (_size); \
702 (_type)_x; \
703})
704
705#define insn_fetch_arr(_arr, _size, _eip) \
706({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \
707 if (rc != X86EMUL_CONTINUE) \
708 goto done; \
709 (_eip) += (_size); \
710})
711
717/* 712/*
718 * Given the 'reg' portion of a ModRM byte, and a register block, return a 713 * Given the 'reg' portion of a ModRM byte, and a register block, return a
719 * pointer into the block that addresses the relevant register. 714 * pointer into the block that addresses the relevant register.
@@ -857,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
857 852
858static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 853static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
859 struct operand *op, 854 struct operand *op,
860 struct decode_cache *c,
861 int inhibit_bytereg) 855 int inhibit_bytereg)
862{ 856{
863 unsigned reg = c->modrm_reg; 857 unsigned reg = ctxt->modrm_reg;
864 int highbyte_regs = c->rex_prefix == 0; 858 int highbyte_regs = ctxt->rex_prefix == 0;
865 859
866 if (!(c->d & ModRM)) 860 if (!(ctxt->d & ModRM))
867 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 861 reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
868 862
869 if (c->d & Sse) { 863 if (ctxt->d & Sse) {
870 op->type = OP_XMM; 864 op->type = OP_XMM;
871 op->bytes = 16; 865 op->bytes = 16;
872 op->addr.xmm = reg; 866 op->addr.xmm = reg;
@@ -875,49 +869,47 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
875 } 869 }
876 870
877 op->type = OP_REG; 871 op->type = OP_REG;
878 if ((c->d & ByteOp) && !inhibit_bytereg) { 872 if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
879 op->addr.reg = decode_register(reg, c->regs, highbyte_regs); 873 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
880 op->bytes = 1; 874 op->bytes = 1;
881 } else { 875 } else {
882 op->addr.reg = decode_register(reg, c->regs, 0); 876 op->addr.reg = decode_register(reg, ctxt->regs, 0);
883 op->bytes = c->op_bytes; 877 op->bytes = ctxt->op_bytes;
884 } 878 }
885 fetch_register_operand(op); 879 fetch_register_operand(op);
886 op->orig_val = op->val; 880 op->orig_val = op->val;
887} 881}
888 882
889static int decode_modrm(struct x86_emulate_ctxt *ctxt, 883static int decode_modrm(struct x86_emulate_ctxt *ctxt,
890 struct x86_emulate_ops *ops,
891 struct operand *op) 884 struct operand *op)
892{ 885{
893 struct decode_cache *c = &ctxt->decode;
894 u8 sib; 886 u8 sib;
895 int index_reg = 0, base_reg = 0, scale; 887 int index_reg = 0, base_reg = 0, scale;
896 int rc = X86EMUL_CONTINUE; 888 int rc = X86EMUL_CONTINUE;
897 ulong modrm_ea = 0; 889 ulong modrm_ea = 0;
898 890
899 if (c->rex_prefix) { 891 if (ctxt->rex_prefix) {
900 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 892 ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
901 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ 893 index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
902 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ 894 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
903 } 895 }
904 896
905 c->modrm = insn_fetch(u8, 1, c->eip); 897 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
906 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 898 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
907 c->modrm_reg |= (c->modrm & 0x38) >> 3; 899 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
908 c->modrm_rm |= (c->modrm & 0x07); 900 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
909 c->modrm_seg = VCPU_SREG_DS; 901 ctxt->modrm_seg = VCPU_SREG_DS;
910 902
911 if (c->modrm_mod == 3) { 903 if (ctxt->modrm_mod == 3) {
912 op->type = OP_REG; 904 op->type = OP_REG;
913 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 905 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
914 op->addr.reg = decode_register(c->modrm_rm, 906 op->addr.reg = decode_register(ctxt->modrm_rm,
915 c->regs, c->d & ByteOp); 907 ctxt->regs, ctxt->d & ByteOp);
916 if (c->d & Sse) { 908 if (ctxt->d & Sse) {
917 op->type = OP_XMM; 909 op->type = OP_XMM;
918 op->bytes = 16; 910 op->bytes = 16;
919 op->addr.xmm = c->modrm_rm; 911 op->addr.xmm = ctxt->modrm_rm;
920 read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); 912 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
921 return rc; 913 return rc;
922 } 914 }
923 fetch_register_operand(op); 915 fetch_register_operand(op);
@@ -926,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
926 918
927 op->type = OP_MEM; 919 op->type = OP_MEM;
928 920
929 if (c->ad_bytes == 2) { 921 if (ctxt->ad_bytes == 2) {
930 unsigned bx = c->regs[VCPU_REGS_RBX]; 922 unsigned bx = ctxt->regs[VCPU_REGS_RBX];
931 unsigned bp = c->regs[VCPU_REGS_RBP]; 923 unsigned bp = ctxt->regs[VCPU_REGS_RBP];
932 unsigned si = c->regs[VCPU_REGS_RSI]; 924 unsigned si = ctxt->regs[VCPU_REGS_RSI];
933 unsigned di = c->regs[VCPU_REGS_RDI]; 925 unsigned di = ctxt->regs[VCPU_REGS_RDI];
934 926
935 /* 16-bit ModR/M decode. */ 927 /* 16-bit ModR/M decode. */
936 switch (c->modrm_mod) { 928 switch (ctxt->modrm_mod) {
937 case 0: 929 case 0:
938 if (c->modrm_rm == 6) 930 if (ctxt->modrm_rm == 6)
939 modrm_ea += insn_fetch(u16, 2, c->eip); 931 modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
940 break; 932 break;
941 case 1: 933 case 1:
942 modrm_ea += insn_fetch(s8, 1, c->eip); 934 modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
943 break; 935 break;
944 case 2: 936 case 2:
945 modrm_ea += insn_fetch(u16, 2, c->eip); 937 modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
946 break; 938 break;
947 } 939 }
948 switch (c->modrm_rm) { 940 switch (ctxt->modrm_rm) {
949 case 0: 941 case 0:
950 modrm_ea += bx + si; 942 modrm_ea += bx + si;
951 break; 943 break;
@@ -965,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
965 modrm_ea += di; 957 modrm_ea += di;
966 break; 958 break;
967 case 6: 959 case 6:
968 if (c->modrm_mod != 0) 960 if (ctxt->modrm_mod != 0)
969 modrm_ea += bp; 961 modrm_ea += bp;
970 break; 962 break;
971 case 7: 963 case 7:
972 modrm_ea += bx; 964 modrm_ea += bx;
973 break; 965 break;
974 } 966 }
975 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 967 if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
976 (c->modrm_rm == 6 && c->modrm_mod != 0)) 968 (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
977 c->modrm_seg = VCPU_SREG_SS; 969 ctxt->modrm_seg = VCPU_SREG_SS;
978 modrm_ea = (u16)modrm_ea; 970 modrm_ea = (u16)modrm_ea;
979 } else { 971 } else {
980 /* 32/64-bit ModR/M decode. */ 972 /* 32/64-bit ModR/M decode. */
981 if ((c->modrm_rm & 7) == 4) { 973 if ((ctxt->modrm_rm & 7) == 4) {
982 sib = insn_fetch(u8, 1, c->eip); 974 sib = insn_fetch(u8, 1, ctxt->_eip);
983 index_reg |= (sib >> 3) & 7; 975 index_reg |= (sib >> 3) & 7;
984 base_reg |= sib & 7; 976 base_reg |= sib & 7;
985 scale = sib >> 6; 977 scale = sib >> 6;
986 978
987 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 979 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
988 modrm_ea += insn_fetch(s32, 4, c->eip); 980 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
989 else 981 else
990 modrm_ea += c->regs[base_reg]; 982 modrm_ea += ctxt->regs[base_reg];
991 if (index_reg != 4) 983 if (index_reg != 4)
992 modrm_ea += c->regs[index_reg] << scale; 984 modrm_ea += ctxt->regs[index_reg] << scale;
993 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 985 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
994 if (ctxt->mode == X86EMUL_MODE_PROT64) 986 if (ctxt->mode == X86EMUL_MODE_PROT64)
995 c->rip_relative = 1; 987 ctxt->rip_relative = 1;
996 } else 988 } else
997 modrm_ea += c->regs[c->modrm_rm]; 989 modrm_ea += ctxt->regs[ctxt->modrm_rm];
998 switch (c->modrm_mod) { 990 switch (ctxt->modrm_mod) {
999 case 0: 991 case 0:
1000 if (c->modrm_rm == 5) 992 if (ctxt->modrm_rm == 5)
1001 modrm_ea += insn_fetch(s32, 4, c->eip); 993 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
1002 break; 994 break;
1003 case 1: 995 case 1:
1004 modrm_ea += insn_fetch(s8, 1, c->eip); 996 modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
1005 break; 997 break;
1006 case 2: 998 case 2:
1007 modrm_ea += insn_fetch(s32, 4, c->eip); 999 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
1008 break; 1000 break;
1009 } 1001 }
1010 } 1002 }
@@ -1014,53 +1006,50 @@ done:
1014} 1006}
1015 1007
1016static int decode_abs(struct x86_emulate_ctxt *ctxt, 1008static int decode_abs(struct x86_emulate_ctxt *ctxt,
1017 struct x86_emulate_ops *ops,
1018 struct operand *op) 1009 struct operand *op)
1019{ 1010{
1020 struct decode_cache *c = &ctxt->decode;
1021 int rc = X86EMUL_CONTINUE; 1011 int rc = X86EMUL_CONTINUE;
1022 1012
1023 op->type = OP_MEM; 1013 op->type = OP_MEM;
1024 switch (c->ad_bytes) { 1014 switch (ctxt->ad_bytes) {
1025 case 2: 1015 case 2:
1026 op->addr.mem.ea = insn_fetch(u16, 2, c->eip); 1016 op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
1027 break; 1017 break;
1028 case 4: 1018 case 4:
1029 op->addr.mem.ea = insn_fetch(u32, 4, c->eip); 1019 op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
1030 break; 1020 break;
1031 case 8: 1021 case 8:
1032 op->addr.mem.ea = insn_fetch(u64, 8, c->eip); 1022 op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
1033 break; 1023 break;
1034 } 1024 }
1035done: 1025done:
1036 return rc; 1026 return rc;
1037} 1027}
1038 1028
1039static void fetch_bit_operand(struct decode_cache *c) 1029static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
1040{ 1030{
1041 long sv = 0, mask; 1031 long sv = 0, mask;
1042 1032
1043 if (c->dst.type == OP_MEM && c->src.type == OP_REG) { 1033 if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
1044 mask = ~(c->dst.bytes * 8 - 1); 1034 mask = ~(ctxt->dst.bytes * 8 - 1);
1045 1035
1046 if (c->src.bytes == 2) 1036 if (ctxt->src.bytes == 2)
1047 sv = (s16)c->src.val & (s16)mask; 1037 sv = (s16)ctxt->src.val & (s16)mask;
1048 else if (c->src.bytes == 4) 1038 else if (ctxt->src.bytes == 4)
1049 sv = (s32)c->src.val & (s32)mask; 1039 sv = (s32)ctxt->src.val & (s32)mask;
1050 1040
1051 c->dst.addr.mem.ea += (sv >> 3); 1041 ctxt->dst.addr.mem.ea += (sv >> 3);
1052 } 1042 }
1053 1043
1054 /* only subword offset */ 1044 /* only subword offset */
1055 c->src.val &= (c->dst.bytes << 3) - 1; 1045 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
1056} 1046}
1057 1047
1058static int read_emulated(struct x86_emulate_ctxt *ctxt, 1048static int read_emulated(struct x86_emulate_ctxt *ctxt,
1059 struct x86_emulate_ops *ops,
1060 unsigned long addr, void *dest, unsigned size) 1049 unsigned long addr, void *dest, unsigned size)
1061{ 1050{
1062 int rc; 1051 int rc;
1063 struct read_cache *mc = &ctxt->decode.mem_read; 1052 struct read_cache *mc = &ctxt->mem_read;
1064 1053
1065 while (size) { 1054 while (size) {
1066 int n = min(size, 8u); 1055 int n = min(size, 8u);
@@ -1068,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1068 if (mc->pos < mc->end) 1057 if (mc->pos < mc->end)
1069 goto read_cached; 1058 goto read_cached;
1070 1059
1071 rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, 1060 rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
1072 &ctxt->exception); 1061 &ctxt->exception);
1073 if (rc != X86EMUL_CONTINUE) 1062 if (rc != X86EMUL_CONTINUE)
1074 return rc; 1063 return rc;
1075 mc->end += n; 1064 mc->end += n;
@@ -1094,7 +1083,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt,
1094 rc = linearize(ctxt, addr, size, false, &linear); 1083 rc = linearize(ctxt, addr, size, false, &linear);
1095 if (rc != X86EMUL_CONTINUE) 1084 if (rc != X86EMUL_CONTINUE)
1096 return rc; 1085 return rc;
1097 return read_emulated(ctxt, ctxt->ops, linear, data, size); 1086 return read_emulated(ctxt, linear, data, size);
1098} 1087}
1099 1088
1100static int segmented_write(struct x86_emulate_ctxt *ctxt, 1089static int segmented_write(struct x86_emulate_ctxt *ctxt,
@@ -1128,26 +1117,24 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
1128} 1117}
1129 1118
1130static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1119static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1131 struct x86_emulate_ops *ops,
1132 unsigned int size, unsigned short port, 1120 unsigned int size, unsigned short port,
1133 void *dest) 1121 void *dest)
1134{ 1122{
1135 struct read_cache *rc = &ctxt->decode.io_read; 1123 struct read_cache *rc = &ctxt->io_read;
1136 1124
1137 if (rc->pos == rc->end) { /* refill pio read ahead */ 1125 if (rc->pos == rc->end) { /* refill pio read ahead */
1138 struct decode_cache *c = &ctxt->decode;
1139 unsigned int in_page, n; 1126 unsigned int in_page, n;
1140 unsigned int count = c->rep_prefix ? 1127 unsigned int count = ctxt->rep_prefix ?
1141 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; 1128 address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
1142 in_page = (ctxt->eflags & EFLG_DF) ? 1129 in_page = (ctxt->eflags & EFLG_DF) ?
1143 offset_in_page(c->regs[VCPU_REGS_RDI]) : 1130 offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
1144 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); 1131 PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
1145 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, 1132 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1146 count); 1133 count);
1147 if (n == 0) 1134 if (n == 0)
1148 n = 1; 1135 n = 1;
1149 rc->pos = rc->end = 0; 1136 rc->pos = rc->end = 0;
1150 if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) 1137 if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
1151 return 0; 1138 return 0;
1152 rc->end = n * size; 1139 rc->end = n * size;
1153 } 1140 }
@@ -1158,9 +1145,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1158} 1145}
1159 1146
1160static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1147static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1161 struct x86_emulate_ops *ops,
1162 u16 selector, struct desc_ptr *dt) 1148 u16 selector, struct desc_ptr *dt)
1163{ 1149{
1150 struct x86_emulate_ops *ops = ctxt->ops;
1151
1164 if (selector & 1 << 2) { 1152 if (selector & 1 << 2) {
1165 struct desc_struct desc; 1153 struct desc_struct desc;
1166 u16 sel; 1154 u16 sel;
@@ -1177,48 +1165,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1177 1165
1178/* allowed just for 8 bytes segments */ 1166/* allowed just for 8 bytes segments */
1179static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1167static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1180 struct x86_emulate_ops *ops,
1181 u16 selector, struct desc_struct *desc) 1168 u16 selector, struct desc_struct *desc)
1182{ 1169{
1183 struct desc_ptr dt; 1170 struct desc_ptr dt;
1184 u16 index = selector >> 3; 1171 u16 index = selector >> 3;
1185 int ret;
1186 ulong addr; 1172 ulong addr;
1187 1173
1188 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1174 get_descriptor_table_ptr(ctxt, selector, &dt);
1189 1175
1190 if (dt.size < index * 8 + 7) 1176 if (dt.size < index * 8 + 7)
1191 return emulate_gp(ctxt, selector & 0xfffc); 1177 return emulate_gp(ctxt, selector & 0xfffc);
1192 addr = dt.address + index * 8;
1193 ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
1194 1178
1195 return ret; 1179 addr = dt.address + index * 8;
1180 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1181 &ctxt->exception);
1196} 1182}
1197 1183
1198/* allowed just for 8 bytes segments */ 1184/* allowed just for 8 bytes segments */
1199static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1185static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1200 struct x86_emulate_ops *ops,
1201 u16 selector, struct desc_struct *desc) 1186 u16 selector, struct desc_struct *desc)
1202{ 1187{
1203 struct desc_ptr dt; 1188 struct desc_ptr dt;
1204 u16 index = selector >> 3; 1189 u16 index = selector >> 3;
1205 ulong addr; 1190 ulong addr;
1206 int ret;
1207 1191
1208 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1192 get_descriptor_table_ptr(ctxt, selector, &dt);
1209 1193
1210 if (dt.size < index * 8 + 7) 1194 if (dt.size < index * 8 + 7)
1211 return emulate_gp(ctxt, selector & 0xfffc); 1195 return emulate_gp(ctxt, selector & 0xfffc);
1212 1196
1213 addr = dt.address + index * 8; 1197 addr = dt.address + index * 8;
1214 ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); 1198 return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
1215 1199 &ctxt->exception);
1216 return ret;
1217} 1200}
1218 1201
1219/* Does not support long mode */ 1202/* Does not support long mode */
1220static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1203static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1221 struct x86_emulate_ops *ops,
1222 u16 selector, int seg) 1204 u16 selector, int seg)
1223{ 1205{
1224 struct desc_struct seg_desc; 1206 struct desc_struct seg_desc;
@@ -1253,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1253 if (null_selector) /* for NULL selector skip all following checks */ 1235 if (null_selector) /* for NULL selector skip all following checks */
1254 goto load; 1236 goto load;
1255 1237
1256 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); 1238 ret = read_segment_descriptor(ctxt, selector, &seg_desc);
1257 if (ret != X86EMUL_CONTINUE) 1239 if (ret != X86EMUL_CONTINUE)
1258 return ret; 1240 return ret;
1259 1241
@@ -1271,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1271 1253
1272 rpl = selector & 3; 1254 rpl = selector & 3;
1273 dpl = seg_desc.dpl; 1255 dpl = seg_desc.dpl;
1274 cpl = ops->cpl(ctxt); 1256 cpl = ctxt->ops->cpl(ctxt);
1275 1257
1276 switch (seg) { 1258 switch (seg) {
1277 case VCPU_SREG_SS: 1259 case VCPU_SREG_SS:
@@ -1322,12 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1322 if (seg_desc.s) { 1304 if (seg_desc.s) {
1323 /* mark segment as accessed */ 1305 /* mark segment as accessed */
1324 seg_desc.type |= 1; 1306 seg_desc.type |= 1;
1325 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); 1307 ret = write_segment_descriptor(ctxt, selector, &seg_desc);
1326 if (ret != X86EMUL_CONTINUE) 1308 if (ret != X86EMUL_CONTINUE)
1327 return ret; 1309 return ret;
1328 } 1310 }
1329load: 1311load:
1330 ops->set_segment(ctxt, selector, &seg_desc, 0, seg); 1312 ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
1331 return X86EMUL_CONTINUE; 1313 return X86EMUL_CONTINUE;
1332exception: 1314exception:
1333 emulate_exception(ctxt, err_vec, err_code, true); 1315 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1356,29 +1338,28 @@ static void write_register_operand(struct operand *op)
1356static int writeback(struct x86_emulate_ctxt *ctxt) 1338static int writeback(struct x86_emulate_ctxt *ctxt)
1357{ 1339{
1358 int rc; 1340 int rc;
1359 struct decode_cache *c = &ctxt->decode;
1360 1341
1361 switch (c->dst.type) { 1342 switch (ctxt->dst.type) {
1362 case OP_REG: 1343 case OP_REG:
1363 write_register_operand(&c->dst); 1344 write_register_operand(&ctxt->dst);
1364 break; 1345 break;
1365 case OP_MEM: 1346 case OP_MEM:
1366 if (c->lock_prefix) 1347 if (ctxt->lock_prefix)
1367 rc = segmented_cmpxchg(ctxt, 1348 rc = segmented_cmpxchg(ctxt,
1368 c->dst.addr.mem, 1349 ctxt->dst.addr.mem,
1369 &c->dst.orig_val, 1350 &ctxt->dst.orig_val,
1370 &c->dst.val, 1351 &ctxt->dst.val,
1371 c->dst.bytes); 1352 ctxt->dst.bytes);
1372 else 1353 else
1373 rc = segmented_write(ctxt, 1354 rc = segmented_write(ctxt,
1374 c->dst.addr.mem, 1355 ctxt->dst.addr.mem,
1375 &c->dst.val, 1356 &ctxt->dst.val,
1376 c->dst.bytes); 1357 ctxt->dst.bytes);
1377 if (rc != X86EMUL_CONTINUE) 1358 if (rc != X86EMUL_CONTINUE)
1378 return rc; 1359 return rc;
1379 break; 1360 break;
1380 case OP_XMM: 1361 case OP_XMM:
1381 write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); 1362 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1382 break; 1363 break;
1383 case OP_NONE: 1364 case OP_NONE:
1384 /* no writeback */ 1365 /* no writeback */
@@ -1391,50 +1372,45 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1391 1372
1392static int em_push(struct x86_emulate_ctxt *ctxt) 1373static int em_push(struct x86_emulate_ctxt *ctxt)
1393{ 1374{
1394 struct decode_cache *c = &ctxt->decode;
1395 struct segmented_address addr; 1375 struct segmented_address addr;
1396 1376
1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1377 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
1398 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1378 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1399 addr.seg = VCPU_SREG_SS; 1379 addr.seg = VCPU_SREG_SS;
1400 1380
1401 /* Disable writeback. */ 1381 /* Disable writeback. */
1402 c->dst.type = OP_NONE; 1382 ctxt->dst.type = OP_NONE;
1403 return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); 1383 return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
1404} 1384}
1405 1385
1406static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1386static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1407 void *dest, int len) 1387 void *dest, int len)
1408{ 1388{
1409 struct decode_cache *c = &ctxt->decode;
1410 int rc; 1389 int rc;
1411 struct segmented_address addr; 1390 struct segmented_address addr;
1412 1391
1413 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1392 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1414 addr.seg = VCPU_SREG_SS; 1393 addr.seg = VCPU_SREG_SS;
1415 rc = segmented_read(ctxt, addr, dest, len); 1394 rc = segmented_read(ctxt, addr, dest, len);
1416 if (rc != X86EMUL_CONTINUE) 1395 if (rc != X86EMUL_CONTINUE)
1417 return rc; 1396 return rc;
1418 1397
1419 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1398 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
1420 return rc; 1399 return rc;
1421} 1400}
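
em_push() and emulate_pop() above fix the stack convention the rest of the file relies on: a push pre-decrements RSP by the operand size and then writes at SS:RSP, while a pop reads at SS:RSP first and post-increments afterwards. A throwaway user-space model of that ordering, with a plain byte array standing in for guest stack memory (all names below are made up):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define STACK_SIZE 64

static uint8_t stack[STACK_SIZE];       /* stands in for guest SS segment memory */
static uint64_t rsp = STACK_SIZE;       /* stack grows down, like the emulator's RSP */

/* Model of em_push(): pre-decrement RSP, then store op_bytes at SS:RSP. */
static void push(const void *val, int op_bytes)
{
        rsp -= op_bytes;
        memcpy(&stack[rsp], val, op_bytes);
}

/* Model of emulate_pop(): load from SS:RSP first, then post-increment RSP. */
static void pop(void *dest, int len)
{
        memcpy(dest, &stack[rsp], len);
        rsp += len;
}

int main(void)
{
        uint32_t in = 0xdeadbeef, out = 0;

        push(&in, sizeof(in));
        pop(&out, sizeof(out));
        printf("%#x, rsp back to %llu\n", (unsigned)out, (unsigned long long)rsp);
        return 0;
}
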
1422 1401
1423static int em_pop(struct x86_emulate_ctxt *ctxt) 1402static int em_pop(struct x86_emulate_ctxt *ctxt)
1424{ 1403{
1425 struct decode_cache *c = &ctxt->decode; 1404 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1426
1427 return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
1428} 1405}
1429 1406
1430static int emulate_popf(struct x86_emulate_ctxt *ctxt, 1407static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1431 struct x86_emulate_ops *ops, 1408 void *dest, int len)
1432 void *dest, int len)
1433{ 1409{
1434 int rc; 1410 int rc;
1435 unsigned long val, change_mask; 1411 unsigned long val, change_mask;
1436 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1412 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1437 int cpl = ops->cpl(ctxt); 1413 int cpl = ctxt->ops->cpl(ctxt);
1438 1414
1439 rc = emulate_pop(ctxt, &val, len); 1415 rc = emulate_pop(ctxt, &val, len);
1440 if (rc != X86EMUL_CONTINUE) 1416 if (rc != X86EMUL_CONTINUE)
@@ -1470,49 +1446,41 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1470 1446
1471static int em_popf(struct x86_emulate_ctxt *ctxt) 1447static int em_popf(struct x86_emulate_ctxt *ctxt)
1472{ 1448{
1473 struct decode_cache *c = &ctxt->decode; 1449 ctxt->dst.type = OP_REG;
1474 1450 ctxt->dst.addr.reg = &ctxt->eflags;
1475 c->dst.type = OP_REG; 1451 ctxt->dst.bytes = ctxt->op_bytes;
1476 c->dst.addr.reg = &ctxt->eflags; 1452 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1477 c->dst.bytes = c->op_bytes;
1478 return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
1479} 1453}
1480 1454
1481static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1455static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1482 struct x86_emulate_ops *ops, int seg)
1483{ 1456{
1484 struct decode_cache *c = &ctxt->decode; 1457 ctxt->src.val = get_segment_selector(ctxt, seg);
1485
1486 c->src.val = get_segment_selector(ctxt, seg);
1487 1458
1488 return em_push(ctxt); 1459 return em_push(ctxt);
1489} 1460}
1490 1461
1491static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1462static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1492 struct x86_emulate_ops *ops, int seg)
1493{ 1463{
1494 struct decode_cache *c = &ctxt->decode;
1495 unsigned long selector; 1464 unsigned long selector;
1496 int rc; 1465 int rc;
1497 1466
1498 rc = emulate_pop(ctxt, &selector, c->op_bytes); 1467 rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
1499 if (rc != X86EMUL_CONTINUE) 1468 if (rc != X86EMUL_CONTINUE)
1500 return rc; 1469 return rc;
1501 1470
1502 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); 1471 rc = load_segment_descriptor(ctxt, (u16)selector, seg);
1503 return rc; 1472 return rc;
1504} 1473}
1505 1474
1506static int em_pusha(struct x86_emulate_ctxt *ctxt) 1475static int em_pusha(struct x86_emulate_ctxt *ctxt)
1507{ 1476{
1508 struct decode_cache *c = &ctxt->decode; 1477 unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
1509 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1510 int rc = X86EMUL_CONTINUE; 1478 int rc = X86EMUL_CONTINUE;
1511 int reg = VCPU_REGS_RAX; 1479 int reg = VCPU_REGS_RAX;
1512 1480
1513 while (reg <= VCPU_REGS_RDI) { 1481 while (reg <= VCPU_REGS_RDI) {
1514 (reg == VCPU_REGS_RSP) ? 1482 (reg == VCPU_REGS_RSP) ?
1515 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1483 (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
1516 1484
1517 rc = em_push(ctxt); 1485 rc = em_push(ctxt);
1518 if (rc != X86EMUL_CONTINUE) 1486 if (rc != X86EMUL_CONTINUE)
@@ -1526,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
1526 1494
1527static int em_pushf(struct x86_emulate_ctxt *ctxt) 1495static int em_pushf(struct x86_emulate_ctxt *ctxt)
1528{ 1496{
1529 struct decode_cache *c = &ctxt->decode; 1497 ctxt->src.val = (unsigned long)ctxt->eflags;
1530
1531 c->src.val = (unsigned long)ctxt->eflags;
1532 return em_push(ctxt); 1498 return em_push(ctxt);
1533} 1499}
1534 1500
1535static int em_popa(struct x86_emulate_ctxt *ctxt) 1501static int em_popa(struct x86_emulate_ctxt *ctxt)
1536{ 1502{
1537 struct decode_cache *c = &ctxt->decode;
1538 int rc = X86EMUL_CONTINUE; 1503 int rc = X86EMUL_CONTINUE;
1539 int reg = VCPU_REGS_RDI; 1504 int reg = VCPU_REGS_RDI;
1540 1505
1541 while (reg >= VCPU_REGS_RAX) { 1506 while (reg >= VCPU_REGS_RAX) {
1542 if (reg == VCPU_REGS_RSP) { 1507 if (reg == VCPU_REGS_RSP) {
1543 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1508 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
1544 c->op_bytes); 1509 ctxt->op_bytes);
1545 --reg; 1510 --reg;
1546 } 1511 }
1547 1512
1548 rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); 1513 rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
1549 if (rc != X86EMUL_CONTINUE) 1514 if (rc != X86EMUL_CONTINUE)
1550 break; 1515 break;
1551 --reg; 1516 --reg;
@@ -1553,10 +1518,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1553 return rc; 1518 return rc;
1554} 1519}
1555 1520
1556int emulate_int_real(struct x86_emulate_ctxt *ctxt, 1521int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1557 struct x86_emulate_ops *ops, int irq)
1558{ 1522{
1559 struct decode_cache *c = &ctxt->decode; 1523 struct x86_emulate_ops *ops = ctxt->ops;
1560 int rc; 1524 int rc;
1561 struct desc_ptr dt; 1525 struct desc_ptr dt;
1562 gva_t cs_addr; 1526 gva_t cs_addr;
@@ -1564,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1564 u16 cs, eip; 1528 u16 cs, eip;
1565 1529
1566 /* TODO: Add limit checks */ 1530 /* TODO: Add limit checks */
1567 c->src.val = ctxt->eflags; 1531 ctxt->src.val = ctxt->eflags;
1568 rc = em_push(ctxt); 1532 rc = em_push(ctxt);
1569 if (rc != X86EMUL_CONTINUE) 1533 if (rc != X86EMUL_CONTINUE)
1570 return rc; 1534 return rc;
1571 1535
1572 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); 1536 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1573 1537
1574 c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); 1538 ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1575 rc = em_push(ctxt); 1539 rc = em_push(ctxt);
1576 if (rc != X86EMUL_CONTINUE) 1540 if (rc != X86EMUL_CONTINUE)
1577 return rc; 1541 return rc;
1578 1542
1579 c->src.val = c->eip; 1543 ctxt->src.val = ctxt->_eip;
1580 rc = em_push(ctxt); 1544 rc = em_push(ctxt);
1581 if (rc != X86EMUL_CONTINUE) 1545 if (rc != X86EMUL_CONTINUE)
1582 return rc; 1546 return rc;
@@ -1594,21 +1558,20 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1594 if (rc != X86EMUL_CONTINUE) 1558 if (rc != X86EMUL_CONTINUE)
1595 return rc; 1559 return rc;
1596 1560
1597 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); 1561 rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
1598 if (rc != X86EMUL_CONTINUE) 1562 if (rc != X86EMUL_CONTINUE)
1599 return rc; 1563 return rc;
1600 1564
1601 c->eip = eip; 1565 ctxt->_eip = eip;
1602 1566
1603 return rc; 1567 return rc;
1604} 1568}
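
emulate_int_real() above follows the real-mode INT n sequence: push FLAGS, CS and IP, clear IF/TF/AC, then load the new CS:IP pair from the interrupt vector table, where vector n is a 4-byte IP:CS entry at IDT base + n*4. A condensed model of just the vector lookup (guest memory reduced to a flat byte array; this is not the emulator's accessors):

#include <stdint.h>
#include <stdio.h>

/* Real-mode IVT: vector n holds the new IP at n*4 and the new CS at n*4 + 2. */
static void ivt_lookup(const uint8_t *guest_mem, uint32_t idt_base, int irq,
                       uint16_t *cs, uint16_t *ip)
{
        uint32_t entry = idt_base + irq * 4;

        *ip = guest_mem[entry]     | (guest_mem[entry + 1] << 8);
        *cs = guest_mem[entry + 2] | (guest_mem[entry + 3] << 8);
}

int main(void)
{
        /* Fake 1 KiB IVT with vector 0x10 pointing at f000:1234. */
        uint8_t mem[1024] = { 0 };
        uint16_t cs, ip;

        mem[0x10 * 4 + 0] = 0x34; mem[0x10 * 4 + 1] = 0x12;
        mem[0x10 * 4 + 2] = 0x00; mem[0x10 * 4 + 3] = 0xf0;

        ivt_lookup(mem, 0, 0x10, &cs, &ip);
        printf("int 0x10 -> %04x:%04x\n", cs, ip);
        return 0;
}
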
1605 1569
1606static int emulate_int(struct x86_emulate_ctxt *ctxt, 1570static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
1607 struct x86_emulate_ops *ops, int irq)
1608{ 1571{
1609 switch(ctxt->mode) { 1572 switch(ctxt->mode) {
1610 case X86EMUL_MODE_REAL: 1573 case X86EMUL_MODE_REAL:
1611 return emulate_int_real(ctxt, ops, irq); 1574 return emulate_int_real(ctxt, irq);
1612 case X86EMUL_MODE_VM86: 1575 case X86EMUL_MODE_VM86:
1613 case X86EMUL_MODE_PROT16: 1576 case X86EMUL_MODE_PROT16:
1614 case X86EMUL_MODE_PROT32: 1577 case X86EMUL_MODE_PROT32:
@@ -1619,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
1619 } 1582 }
1620} 1583}
1621 1584
1622static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, 1585static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
1623 struct x86_emulate_ops *ops)
1624{ 1586{
1625 struct decode_cache *c = &ctxt->decode;
1626 int rc = X86EMUL_CONTINUE; 1587 int rc = X86EMUL_CONTINUE;
1627 unsigned long temp_eip = 0; 1588 unsigned long temp_eip = 0;
1628 unsigned long temp_eflags = 0; 1589 unsigned long temp_eflags = 0;
@@ -1634,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1634 1595
1635 /* TODO: Add stack limit check */ 1596 /* TODO: Add stack limit check */
1636 1597
1637 rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); 1598 rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
1638 1599
1639 if (rc != X86EMUL_CONTINUE) 1600 if (rc != X86EMUL_CONTINUE)
1640 return rc; 1601 return rc;
@@ -1642,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1642 if (temp_eip & ~0xffff) 1603 if (temp_eip & ~0xffff)
1643 return emulate_gp(ctxt, 0); 1604 return emulate_gp(ctxt, 0);
1644 1605
1645 rc = emulate_pop(ctxt, &cs, c->op_bytes); 1606 rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
1646 1607
1647 if (rc != X86EMUL_CONTINUE) 1608 if (rc != X86EMUL_CONTINUE)
1648 return rc; 1609 return rc;
1649 1610
1650 rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); 1611 rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
1651 1612
1652 if (rc != X86EMUL_CONTINUE) 1613 if (rc != X86EMUL_CONTINUE)
1653 return rc; 1614 return rc;
1654 1615
1655 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1616 rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
1656 1617
1657 if (rc != X86EMUL_CONTINUE) 1618 if (rc != X86EMUL_CONTINUE)
1658 return rc; 1619 return rc;
1659 1620
1660 c->eip = temp_eip; 1621 ctxt->_eip = temp_eip;
1661 1622
1662 1623
1663 if (c->op_bytes == 4) 1624 if (ctxt->op_bytes == 4)
1664 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); 1625 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1665 else if (c->op_bytes == 2) { 1626 else if (ctxt->op_bytes == 2) {
1666 ctxt->eflags &= ~0xffff; 1627 ctxt->eflags &= ~0xffff;
1667 ctxt->eflags |= temp_eflags; 1628 ctxt->eflags |= temp_eflags;
1668 } 1629 }
@@ -1673,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1673 return rc; 1634 return rc;
1674} 1635}
1675 1636
1676static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, 1637static int em_iret(struct x86_emulate_ctxt *ctxt)
1677 struct x86_emulate_ops* ops)
1678{ 1638{
1679 switch(ctxt->mode) { 1639 switch(ctxt->mode) {
1680 case X86EMUL_MODE_REAL: 1640 case X86EMUL_MODE_REAL:
1681 return emulate_iret_real(ctxt, ops); 1641 return emulate_iret_real(ctxt);
1682 case X86EMUL_MODE_VM86: 1642 case X86EMUL_MODE_VM86:
1683 case X86EMUL_MODE_PROT16: 1643 case X86EMUL_MODE_PROT16:
1684 case X86EMUL_MODE_PROT32: 1644 case X86EMUL_MODE_PROT32:
@@ -1691,53 +1651,49 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1691 1651
1692static int em_jmp_far(struct x86_emulate_ctxt *ctxt) 1652static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1693{ 1653{
1694 struct decode_cache *c = &ctxt->decode;
1695 int rc; 1654 int rc;
1696 unsigned short sel; 1655 unsigned short sel;
1697 1656
1698 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 1657 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
1699 1658
1700 rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); 1659 rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
1701 if (rc != X86EMUL_CONTINUE) 1660 if (rc != X86EMUL_CONTINUE)
1702 return rc; 1661 return rc;
1703 1662
1704 c->eip = 0; 1663 ctxt->_eip = 0;
1705 memcpy(&c->eip, c->src.valptr, c->op_bytes); 1664 memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
1706 return X86EMUL_CONTINUE; 1665 return X86EMUL_CONTINUE;
1707} 1666}
1708 1667
1709static int em_grp1a(struct x86_emulate_ctxt *ctxt) 1668static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1710{ 1669{
1711 struct decode_cache *c = &ctxt->decode; 1670 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
1712
1713 return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
1714} 1671}
1715 1672
1716static int em_grp2(struct x86_emulate_ctxt *ctxt) 1673static int em_grp2(struct x86_emulate_ctxt *ctxt)
1717{ 1674{
1718 struct decode_cache *c = &ctxt->decode; 1675 switch (ctxt->modrm_reg) {
1719 switch (c->modrm_reg) {
1720 case 0: /* rol */ 1676 case 0: /* rol */
1721 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); 1677 emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
1722 break; 1678 break;
1723 case 1: /* ror */ 1679 case 1: /* ror */
1724 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); 1680 emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
1725 break; 1681 break;
1726 case 2: /* rcl */ 1682 case 2: /* rcl */
1727 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); 1683 emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
1728 break; 1684 break;
1729 case 3: /* rcr */ 1685 case 3: /* rcr */
1730 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); 1686 emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
1731 break; 1687 break;
1732 case 4: /* sal/shl */ 1688 case 4: /* sal/shl */
1733 case 6: /* sal/shl */ 1689 case 6: /* sal/shl */
1734 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); 1690 emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
1735 break; 1691 break;
1736 case 5: /* shr */ 1692 case 5: /* shr */
1737 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); 1693 emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
1738 break; 1694 break;
1739 case 7: /* sar */ 1695 case 7: /* sar */
1740 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); 1696 emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
1741 break; 1697 break;
1742 } 1698 }
1743 return X86EMUL_CONTINUE; 1699 return X86EMUL_CONTINUE;
@@ -1745,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
1745 1701
1746static int em_grp3(struct x86_emulate_ctxt *ctxt) 1702static int em_grp3(struct x86_emulate_ctxt *ctxt)
1747{ 1703{
1748 struct decode_cache *c = &ctxt->decode; 1704 unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
1749 unsigned long *rax = &c->regs[VCPU_REGS_RAX]; 1705 unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
1750 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1751 u8 de = 0; 1706 u8 de = 0;
1752 1707
1753 switch (c->modrm_reg) { 1708 switch (ctxt->modrm_reg) {
1754 case 0 ... 1: /* test */ 1709 case 0 ... 1: /* test */
1755 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1710 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
1756 break; 1711 break;
1757 case 2: /* not */ 1712 case 2: /* not */
1758 c->dst.val = ~c->dst.val; 1713 ctxt->dst.val = ~ctxt->dst.val;
1759 break; 1714 break;
1760 case 3: /* neg */ 1715 case 3: /* neg */
1761 emulate_1op("neg", c->dst, ctxt->eflags); 1716 emulate_1op("neg", ctxt->dst, ctxt->eflags);
1762 break; 1717 break;
1763 case 4: /* mul */ 1718 case 4: /* mul */
1764 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); 1719 emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
1765 break; 1720 break;
1766 case 5: /* imul */ 1721 case 5: /* imul */
1767 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); 1722 emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
1768 break; 1723 break;
1769 case 6: /* div */ 1724 case 6: /* div */
1770 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, 1725 emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
1771 ctxt->eflags, de); 1726 ctxt->eflags, de);
1772 break; 1727 break;
1773 case 7: /* idiv */ 1728 case 7: /* idiv */
1774 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, 1729 emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
1775 ctxt->eflags, de); 1730 ctxt->eflags, de);
1776 break; 1731 break;
1777 default: 1732 default:
@@ -1784,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
1784 1739
1785static int em_grp45(struct x86_emulate_ctxt *ctxt) 1740static int em_grp45(struct x86_emulate_ctxt *ctxt)
1786{ 1741{
1787 struct decode_cache *c = &ctxt->decode;
1788 int rc = X86EMUL_CONTINUE; 1742 int rc = X86EMUL_CONTINUE;
1789 1743
1790 switch (c->modrm_reg) { 1744 switch (ctxt->modrm_reg) {
1791 case 0: /* inc */ 1745 case 0: /* inc */
1792 emulate_1op("inc", c->dst, ctxt->eflags); 1746 emulate_1op("inc", ctxt->dst, ctxt->eflags);
1793 break; 1747 break;
1794 case 1: /* dec */ 1748 case 1: /* dec */
1795 emulate_1op("dec", c->dst, ctxt->eflags); 1749 emulate_1op("dec", ctxt->dst, ctxt->eflags);
1796 break; 1750 break;
1797 case 2: /* call near abs */ { 1751 case 2: /* call near abs */ {
1798 long int old_eip; 1752 long int old_eip;
1799 old_eip = c->eip; 1753 old_eip = ctxt->_eip;
1800 c->eip = c->src.val; 1754 ctxt->_eip = ctxt->src.val;
1801 c->src.val = old_eip; 1755 ctxt->src.val = old_eip;
1802 rc = em_push(ctxt); 1756 rc = em_push(ctxt);
1803 break; 1757 break;
1804 } 1758 }
1805 case 4: /* jmp abs */ 1759 case 4: /* jmp abs */
1806 c->eip = c->src.val; 1760 ctxt->_eip = ctxt->src.val;
1807 break; 1761 break;
1808 case 5: /* jmp far */ 1762 case 5: /* jmp far */
1809 rc = em_jmp_far(ctxt); 1763 rc = em_jmp_far(ctxt);
@@ -1817,68 +1771,70 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
1817 1771
1818static int em_grp9(struct x86_emulate_ctxt *ctxt) 1772static int em_grp9(struct x86_emulate_ctxt *ctxt)
1819{ 1773{
1820 struct decode_cache *c = &ctxt->decode; 1774 u64 old = ctxt->dst.orig_val64;
1821 u64 old = c->dst.orig_val64;
1822 1775
1823 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1776 if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
1824 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1777 ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
1825 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1778 ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1826 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1779 ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1827 ctxt->eflags &= ~EFLG_ZF; 1780 ctxt->eflags &= ~EFLG_ZF;
1828 } else { 1781 } else {
1829 c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1782 ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
1830 (u32) c->regs[VCPU_REGS_RBX]; 1783 (u32) ctxt->regs[VCPU_REGS_RBX];
1831 1784
1832 ctxt->eflags |= EFLG_ZF; 1785 ctxt->eflags |= EFLG_ZF;
1833 } 1786 }
1834 return X86EMUL_CONTINUE; 1787 return X86EMUL_CONTINUE;
1835} 1788}
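
em_grp9() above is the CMPXCHG8B comparison: EDX:EAX is compared against the 64-bit operand; on a match ECX:EBX is stored and ZF is set, otherwise the operand is loaded into EDX:EAX and ZF is cleared. A compact restatement in plain C (the function below is illustrative, not the emulator's):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the CMPXCHG8B flow in em_grp9(): returns the resulting ZF value. */
static bool cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
                      uint32_t ebx, uint32_t ecx)
{
        uint64_t old = *mem;

        if ((uint32_t)old != *eax || (uint32_t)(old >> 32) != *edx) {
                *eax = (uint32_t)old;           /* mismatch: load m64 into EDX:EAX */
                *edx = (uint32_t)(old >> 32);
                return false;                   /* ZF cleared */
        }
        *mem = ((uint64_t)ecx << 32) | ebx;     /* match: store ECX:EBX */
        return true;                            /* ZF set */
}

int main(void)
{
        uint64_t m = 0x1122334455667788ull;
        uint32_t eax = 0x55667788, edx = 0x11223344;
        bool zf;

        zf = cmpxchg8b(&m, &eax, &edx, 0xaaaaaaaa, 0xbbbbbbbb);
        printf("zf=%d mem=%#llx\n", zf, (unsigned long long)m);
        return 0;
}
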
1836 1789
1837static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1790static int em_ret(struct x86_emulate_ctxt *ctxt)
1838 struct x86_emulate_ops *ops) 1791{
1792 ctxt->dst.type = OP_REG;
1793 ctxt->dst.addr.reg = &ctxt->_eip;
1794 ctxt->dst.bytes = ctxt->op_bytes;
1795 return em_pop(ctxt);
1796}
1797
1798static int em_ret_far(struct x86_emulate_ctxt *ctxt)
1839{ 1799{
1840 struct decode_cache *c = &ctxt->decode;
1841 int rc; 1800 int rc;
1842 unsigned long cs; 1801 unsigned long cs;
1843 1802
1844 rc = emulate_pop(ctxt, &c->eip, c->op_bytes); 1803 rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
1845 if (rc != X86EMUL_CONTINUE) 1804 if (rc != X86EMUL_CONTINUE)
1846 return rc; 1805 return rc;
1847 if (c->op_bytes == 4) 1806 if (ctxt->op_bytes == 4)
1848 c->eip = (u32)c->eip; 1807 ctxt->_eip = (u32)ctxt->_eip;
1849 rc = emulate_pop(ctxt, &cs, c->op_bytes); 1808 rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
1850 if (rc != X86EMUL_CONTINUE) 1809 if (rc != X86EMUL_CONTINUE)
1851 return rc; 1810 return rc;
1852 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1811 rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
1853 return rc; 1812 return rc;
1854} 1813}
1855 1814
1856static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, 1815static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
1857 struct x86_emulate_ops *ops, int seg)
1858{ 1816{
1859 struct decode_cache *c = &ctxt->decode;
1860 unsigned short sel; 1817 unsigned short sel;
1861 int rc; 1818 int rc;
1862 1819
1863 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 1820 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
1864 1821
1865 rc = load_segment_descriptor(ctxt, ops, sel, seg); 1822 rc = load_segment_descriptor(ctxt, sel, seg);
1866 if (rc != X86EMUL_CONTINUE) 1823 if (rc != X86EMUL_CONTINUE)
1867 return rc; 1824 return rc;
1868 1825
1869 c->dst.val = c->src.val; 1826 ctxt->dst.val = ctxt->src.val;
1870 return rc; 1827 return rc;
1871} 1828}
1872 1829
1873static inline void 1830static void
1874setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1831setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1875 struct x86_emulate_ops *ops, struct desc_struct *cs, 1832 struct desc_struct *cs, struct desc_struct *ss)
1876 struct desc_struct *ss)
1877{ 1833{
1878 u16 selector; 1834 u16 selector;
1879 1835
1880 memset(cs, 0, sizeof(struct desc_struct)); 1836 memset(cs, 0, sizeof(struct desc_struct));
1881 ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); 1837 ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
1882 memset(ss, 0, sizeof(struct desc_struct)); 1838 memset(ss, 0, sizeof(struct desc_struct));
1883 1839
1884 cs->l = 0; /* will be adjusted later */ 1840 cs->l = 0; /* will be adjusted later */
@@ -1901,10 +1857,9 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1901 ss->p = 1; 1857 ss->p = 1;
1902} 1858}
1903 1859
1904static int 1860static int em_syscall(struct x86_emulate_ctxt *ctxt)
1905emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1906{ 1861{
1907 struct decode_cache *c = &ctxt->decode; 1862 struct x86_emulate_ops *ops = ctxt->ops;
1908 struct desc_struct cs, ss; 1863 struct desc_struct cs, ss;
1909 u64 msr_data; 1864 u64 msr_data;
1910 u16 cs_sel, ss_sel; 1865 u16 cs_sel, ss_sel;
@@ -1916,7 +1871,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1916 return emulate_ud(ctxt); 1871 return emulate_ud(ctxt);
1917 1872
1918 ops->get_msr(ctxt, MSR_EFER, &efer); 1873 ops->get_msr(ctxt, MSR_EFER, &efer);
1919 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1874 setup_syscalls_segments(ctxt, &cs, &ss);
1920 1875
1921 ops->get_msr(ctxt, MSR_STAR, &msr_data); 1876 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1922 msr_data >>= 32; 1877 msr_data >>= 32;
@@ -1930,15 +1885,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1930 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 1885 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1931 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 1886 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1932 1887
1933 c->regs[VCPU_REGS_RCX] = c->eip; 1888 ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
1934 if (efer & EFER_LMA) { 1889 if (efer & EFER_LMA) {
1935#ifdef CONFIG_X86_64 1890#ifdef CONFIG_X86_64
1936 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1891 ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1937 1892
1938 ops->get_msr(ctxt, 1893 ops->get_msr(ctxt,
1939 ctxt->mode == X86EMUL_MODE_PROT64 ? 1894 ctxt->mode == X86EMUL_MODE_PROT64 ?
1940 MSR_LSTAR : MSR_CSTAR, &msr_data); 1895 MSR_LSTAR : MSR_CSTAR, &msr_data);
1941 c->eip = msr_data; 1896 ctxt->_eip = msr_data;
1942 1897
1943 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); 1898 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
1944 ctxt->eflags &= ~(msr_data | EFLG_RF); 1899 ctxt->eflags &= ~(msr_data | EFLG_RF);
@@ -1946,7 +1901,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1946 } else { 1901 } else {
1947 /* legacy mode */ 1902 /* legacy mode */
1948 ops->get_msr(ctxt, MSR_STAR, &msr_data); 1903 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1949 c->eip = (u32)msr_data; 1904 ctxt->_eip = (u32)msr_data;
1950 1905
1951 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1906 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1952 } 1907 }
@@ -1954,16 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1954 return X86EMUL_CONTINUE; 1909 return X86EMUL_CONTINUE;
1955} 1910}
1956 1911
1957static int 1912static int em_sysenter(struct x86_emulate_ctxt *ctxt)
1958emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1959{ 1913{
1960 struct decode_cache *c = &ctxt->decode; 1914 struct x86_emulate_ops *ops = ctxt->ops;
1961 struct desc_struct cs, ss; 1915 struct desc_struct cs, ss;
1962 u64 msr_data; 1916 u64 msr_data;
1963 u16 cs_sel, ss_sel; 1917 u16 cs_sel, ss_sel;
1964 u64 efer = 0; 1918 u64 efer = 0;
1965 1919
1966 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 1920 ops->get_msr(ctxt, MSR_EFER, &efer);
1967 /* inject #GP if in real mode */ 1921 /* inject #GP if in real mode */
1968 if (ctxt->mode == X86EMUL_MODE_REAL) 1922 if (ctxt->mode == X86EMUL_MODE_REAL)
1969 return emulate_gp(ctxt, 0); 1923 return emulate_gp(ctxt, 0);
@@ -1974,7 +1928,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1974 if (ctxt->mode == X86EMUL_MODE_PROT64) 1928 if (ctxt->mode == X86EMUL_MODE_PROT64)
1975 return emulate_ud(ctxt); 1929 return emulate_ud(ctxt);
1976 1930
1977 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1931 setup_syscalls_segments(ctxt, &cs, &ss);
1978 1932
1979 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); 1933 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
1980 switch (ctxt->mode) { 1934 switch (ctxt->mode) {
@@ -2002,31 +1956,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2002 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 1956 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2003 1957
2004 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); 1958 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
2005 c->eip = msr_data; 1959 ctxt->_eip = msr_data;
2006 1960
2007 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); 1961 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
2008 c->regs[VCPU_REGS_RSP] = msr_data; 1962 ctxt->regs[VCPU_REGS_RSP] = msr_data;
2009 1963
2010 return X86EMUL_CONTINUE; 1964 return X86EMUL_CONTINUE;
2011} 1965}
2012 1966
2013static int 1967static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2014emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2015{ 1968{
2016 struct decode_cache *c = &ctxt->decode; 1969 struct x86_emulate_ops *ops = ctxt->ops;
2017 struct desc_struct cs, ss; 1970 struct desc_struct cs, ss;
2018 u64 msr_data; 1971 u64 msr_data;
2019 int usermode; 1972 int usermode;
2020 u16 cs_sel, ss_sel; 1973 u16 cs_sel = 0, ss_sel = 0;
2021 1974
2022 /* inject #GP if in real mode or Virtual 8086 mode */ 1975 /* inject #GP if in real mode or Virtual 8086 mode */
2023 if (ctxt->mode == X86EMUL_MODE_REAL || 1976 if (ctxt->mode == X86EMUL_MODE_REAL ||
2024 ctxt->mode == X86EMUL_MODE_VM86) 1977 ctxt->mode == X86EMUL_MODE_VM86)
2025 return emulate_gp(ctxt, 0); 1978 return emulate_gp(ctxt, 0);
2026 1979
2027 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1980 setup_syscalls_segments(ctxt, &cs, &ss);
2028 1981
2029 if ((c->rex_prefix & 0x8) != 0x0) 1982 if ((ctxt->rex_prefix & 0x8) != 0x0)
2030 usermode = X86EMUL_MODE_PROT64; 1983 usermode = X86EMUL_MODE_PROT64;
2031 else 1984 else
2032 usermode = X86EMUL_MODE_PROT32; 1985 usermode = X86EMUL_MODE_PROT32;
@@ -2056,14 +2009,13 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2056 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2009 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2057 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2010 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2058 2011
2059 c->eip = c->regs[VCPU_REGS_RDX]; 2012 ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
2060 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2013 ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
2061 2014
2062 return X86EMUL_CONTINUE; 2015 return X86EMUL_CONTINUE;
2063} 2016}
2064 2017
2065static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, 2018static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2066 struct x86_emulate_ops *ops)
2067{ 2019{
2068 int iopl; 2020 int iopl;
2069 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -2071,13 +2023,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2071 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
2072 return true; 2024 return true;
2073 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
2074 return ops->cpl(ctxt) > iopl; 2026 return ctxt->ops->cpl(ctxt) > iopl;
2075} 2027}
2076 2028
2077static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2078 struct x86_emulate_ops *ops,
2079 u16 port, u16 len) 2030 u16 port, u16 len)
2080{ 2031{
2032 struct x86_emulate_ops *ops = ctxt->ops;
2081 struct desc_struct tr_seg; 2033 struct desc_struct tr_seg;
2082 u32 base3; 2034 u32 base3;
2083 int r; 2035 int r;
@@ -2108,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2108} 2060}
2109 2061
2110static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, 2062static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2111 struct x86_emulate_ops *ops,
2112 u16 port, u16 len) 2063 u16 port, u16 len)
2113{ 2064{
2114 if (ctxt->perm_ok) 2065 if (ctxt->perm_ok)
2115 return true; 2066 return true;
2116 2067
2117 if (emulator_bad_iopl(ctxt, ops)) 2068 if (emulator_bad_iopl(ctxt))
2118 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2069 if (!emulator_io_port_access_allowed(ctxt, port, len))
2119 return false; 2070 return false;
2120 2071
2121 ctxt->perm_ok = true; 2072 ctxt->perm_ok = true;
@@ -2124,21 +2075,18 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2124} 2075}
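
emulator_io_permited() above is a small fast path: once a port access has been validated, ctxt->perm_ok short-circuits the next check, and only when the IOPL test fails does it consult the TSS I/O permission bitmap. The shape of that logic, with the bitmap walk stubbed out and every structure name invented for the sketch:

#include <stdbool.h>
#include <stdio.h>

struct io_ctxt {
        bool perm_ok;           /* cached "already validated" flag */
        int cpl, iopl;
        bool vm86;
};

/* Stub for the TSS I/O-bitmap walk done by emulator_io_port_access_allowed(). */
static bool io_bitmap_allows(struct io_ctxt *c, unsigned port, unsigned len)
{
        (void)c; (void)port; (void)len;
        return false;           /* assume the bitmap denies the port in this sketch */
}

static bool bad_iopl(struct io_ctxt *c)
{
        if (c->vm86)
                return true;            /* VM86 always defers to the bitmap */
        return c->cpl > c->iopl;        /* protected mode: not privileged enough */
}

static bool io_permitted(struct io_ctxt *c, unsigned port, unsigned len)
{
        if (c->perm_ok)
                return true;
        if (bad_iopl(c) && !io_bitmap_allows(c, port, len))
                return false;
        c->perm_ok = true;              /* remember the positive result */
        return true;
}

int main(void)
{
        struct io_ctxt c = { .cpl = 3, .iopl = 3 };     /* CPL <= IOPL: allowed */

        printf("%d\n", io_permitted(&c, 0x60, 1));
        return 0;
}
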
2125 2076
2126static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2077static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2127 struct x86_emulate_ops *ops,
2128 struct tss_segment_16 *tss) 2078 struct tss_segment_16 *tss)
2129{ 2079{
2130 struct decode_cache *c = &ctxt->decode; 2080 tss->ip = ctxt->_eip;
2131
2132 tss->ip = c->eip;
2133 tss->flag = ctxt->eflags; 2081 tss->flag = ctxt->eflags;
2134 tss->ax = c->regs[VCPU_REGS_RAX]; 2082 tss->ax = ctxt->regs[VCPU_REGS_RAX];
2135 tss->cx = c->regs[VCPU_REGS_RCX]; 2083 tss->cx = ctxt->regs[VCPU_REGS_RCX];
2136 tss->dx = c->regs[VCPU_REGS_RDX]; 2084 tss->dx = ctxt->regs[VCPU_REGS_RDX];
2137 tss->bx = c->regs[VCPU_REGS_RBX]; 2085 tss->bx = ctxt->regs[VCPU_REGS_RBX];
2138 tss->sp = c->regs[VCPU_REGS_RSP]; 2086 tss->sp = ctxt->regs[VCPU_REGS_RSP];
2139 tss->bp = c->regs[VCPU_REGS_RBP]; 2087 tss->bp = ctxt->regs[VCPU_REGS_RBP];
2140 tss->si = c->regs[VCPU_REGS_RSI]; 2088 tss->si = ctxt->regs[VCPU_REGS_RSI];
2141 tss->di = c->regs[VCPU_REGS_RDI]; 2089 tss->di = ctxt->regs[VCPU_REGS_RDI];
2142 2090
2143 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2091 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2144 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2092 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2148,22 +2096,20 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2148} 2096}
2149 2097
2150static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, 2098static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2151 struct x86_emulate_ops *ops,
2152 struct tss_segment_16 *tss) 2099 struct tss_segment_16 *tss)
2153{ 2100{
2154 struct decode_cache *c = &ctxt->decode;
2155 int ret; 2101 int ret;
2156 2102
2157 c->eip = tss->ip; 2103 ctxt->_eip = tss->ip;
2158 ctxt->eflags = tss->flag | 2; 2104 ctxt->eflags = tss->flag | 2;
2159 c->regs[VCPU_REGS_RAX] = tss->ax; 2105 ctxt->regs[VCPU_REGS_RAX] = tss->ax;
2160 c->regs[VCPU_REGS_RCX] = tss->cx; 2106 ctxt->regs[VCPU_REGS_RCX] = tss->cx;
2161 c->regs[VCPU_REGS_RDX] = tss->dx; 2107 ctxt->regs[VCPU_REGS_RDX] = tss->dx;
2162 c->regs[VCPU_REGS_RBX] = tss->bx; 2108 ctxt->regs[VCPU_REGS_RBX] = tss->bx;
2163 c->regs[VCPU_REGS_RSP] = tss->sp; 2109 ctxt->regs[VCPU_REGS_RSP] = tss->sp;
2164 c->regs[VCPU_REGS_RBP] = tss->bp; 2110 ctxt->regs[VCPU_REGS_RBP] = tss->bp;
2165 c->regs[VCPU_REGS_RSI] = tss->si; 2111 ctxt->regs[VCPU_REGS_RSI] = tss->si;
2166 c->regs[VCPU_REGS_RDI] = tss->di; 2112 ctxt->regs[VCPU_REGS_RDI] = tss->di;
2167 2113
2168 /* 2114 /*
2169 * SDM says that segment selectors are loaded before segment 2115 * SDM says that segment selectors are loaded before segment
@@ -2179,19 +2125,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2179 * Now load segment descriptors. If a fault happens at this stage 2125 * Now load segment descriptors. If a fault happens at this stage
2180 * it is handled in the context of the new task 2126 * it is handled in the context of the new task
2181 */ 2127 */
2182 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); 2128 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
2183 if (ret != X86EMUL_CONTINUE) 2129 if (ret != X86EMUL_CONTINUE)
2184 return ret; 2130 return ret;
2185 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); 2131 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
2186 if (ret != X86EMUL_CONTINUE) 2132 if (ret != X86EMUL_CONTINUE)
2187 return ret; 2133 return ret;
2188 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); 2134 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
2189 if (ret != X86EMUL_CONTINUE) 2135 if (ret != X86EMUL_CONTINUE)
2190 return ret; 2136 return ret;
2191 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); 2137 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
2192 if (ret != X86EMUL_CONTINUE) 2138 if (ret != X86EMUL_CONTINUE)
2193 return ret; 2139 return ret;
2194 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); 2140 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
2195 if (ret != X86EMUL_CONTINUE) 2141 if (ret != X86EMUL_CONTINUE)
2196 return ret; 2142 return ret;
2197 2143
@@ -2199,10 +2145,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2199} 2145}
2200 2146
2201static int task_switch_16(struct x86_emulate_ctxt *ctxt, 2147static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2202 struct x86_emulate_ops *ops,
2203 u16 tss_selector, u16 old_tss_sel, 2148 u16 tss_selector, u16 old_tss_sel,
2204 ulong old_tss_base, struct desc_struct *new_desc) 2149 ulong old_tss_base, struct desc_struct *new_desc)
2205{ 2150{
2151 struct x86_emulate_ops *ops = ctxt->ops;
2206 struct tss_segment_16 tss_seg; 2152 struct tss_segment_16 tss_seg;
2207 int ret; 2153 int ret;
2208 u32 new_tss_base = get_desc_base(new_desc); 2154 u32 new_tss_base = get_desc_base(new_desc);
@@ -2213,7 +2159,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2213 /* FIXME: need to provide precise fault address */ 2159 /* FIXME: need to provide precise fault address */
2214 return ret; 2160 return ret;
2215 2161
2216 save_state_to_tss16(ctxt, ops, &tss_seg); 2162 save_state_to_tss16(ctxt, &tss_seg);
2217 2163
2218 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2164 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2219 &ctxt->exception); 2165 &ctxt->exception);
@@ -2239,26 +2185,23 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2239 return ret; 2185 return ret;
2240 } 2186 }
2241 2187
2242 return load_state_from_tss16(ctxt, ops, &tss_seg); 2188 return load_state_from_tss16(ctxt, &tss_seg);
2243} 2189}
2244 2190
2245static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, 2191static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2246 struct x86_emulate_ops *ops,
2247 struct tss_segment_32 *tss) 2192 struct tss_segment_32 *tss)
2248{ 2193{
2249 struct decode_cache *c = &ctxt->decode; 2194 tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
2250 2195 tss->eip = ctxt->_eip;
2251 tss->cr3 = ops->get_cr(ctxt, 3);
2252 tss->eip = c->eip;
2253 tss->eflags = ctxt->eflags; 2196 tss->eflags = ctxt->eflags;
2254 tss->eax = c->regs[VCPU_REGS_RAX]; 2197 tss->eax = ctxt->regs[VCPU_REGS_RAX];
2255 tss->ecx = c->regs[VCPU_REGS_RCX]; 2198 tss->ecx = ctxt->regs[VCPU_REGS_RCX];
2256 tss->edx = c->regs[VCPU_REGS_RDX]; 2199 tss->edx = ctxt->regs[VCPU_REGS_RDX];
2257 tss->ebx = c->regs[VCPU_REGS_RBX]; 2200 tss->ebx = ctxt->regs[VCPU_REGS_RBX];
2258 tss->esp = c->regs[VCPU_REGS_RSP]; 2201 tss->esp = ctxt->regs[VCPU_REGS_RSP];
2259 tss->ebp = c->regs[VCPU_REGS_RBP]; 2202 tss->ebp = ctxt->regs[VCPU_REGS_RBP];
2260 tss->esi = c->regs[VCPU_REGS_RSI]; 2203 tss->esi = ctxt->regs[VCPU_REGS_RSI];
2261 tss->edi = c->regs[VCPU_REGS_RDI]; 2204 tss->edi = ctxt->regs[VCPU_REGS_RDI];
2262 2205
2263 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2206 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2264 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2207 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2270,24 +2213,22 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2270} 2213}
2271 2214
2272static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2215static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2273 struct x86_emulate_ops *ops,
2274 struct tss_segment_32 *tss) 2216 struct tss_segment_32 *tss)
2275{ 2217{
2276 struct decode_cache *c = &ctxt->decode;
2277 int ret; 2218 int ret;
2278 2219
2279 if (ops->set_cr(ctxt, 3, tss->cr3)) 2220 if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
2280 return emulate_gp(ctxt, 0); 2221 return emulate_gp(ctxt, 0);
2281 c->eip = tss->eip; 2222 ctxt->_eip = tss->eip;
2282 ctxt->eflags = tss->eflags | 2; 2223 ctxt->eflags = tss->eflags | 2;
2283 c->regs[VCPU_REGS_RAX] = tss->eax; 2224 ctxt->regs[VCPU_REGS_RAX] = tss->eax;
2284 c->regs[VCPU_REGS_RCX] = tss->ecx; 2225 ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
2285 c->regs[VCPU_REGS_RDX] = tss->edx; 2226 ctxt->regs[VCPU_REGS_RDX] = tss->edx;
2286 c->regs[VCPU_REGS_RBX] = tss->ebx; 2227 ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
2287 c->regs[VCPU_REGS_RSP] = tss->esp; 2228 ctxt->regs[VCPU_REGS_RSP] = tss->esp;
2288 c->regs[VCPU_REGS_RBP] = tss->ebp; 2229 ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
2289 c->regs[VCPU_REGS_RSI] = tss->esi; 2230 ctxt->regs[VCPU_REGS_RSI] = tss->esi;
2290 c->regs[VCPU_REGS_RDI] = tss->edi; 2231 ctxt->regs[VCPU_REGS_RDI] = tss->edi;
2291 2232
2292 /* 2233 /*
2293 * SDM says that segment selectors are loaded before segment 2234 * SDM says that segment selectors are loaded before segment
@@ -2305,25 +2246,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2305 * Now load segment descriptors. If a fault happens at this stage 2246 * Now load segment descriptors. If a fault happens at this stage
2306 * it is handled in the context of the new task 2247 * it is handled in the context of the new task
2307 */ 2248 */
2308 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); 2249 ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
2309 if (ret != X86EMUL_CONTINUE) 2250 if (ret != X86EMUL_CONTINUE)
2310 return ret; 2251 return ret;
2311 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); 2252 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
2312 if (ret != X86EMUL_CONTINUE) 2253 if (ret != X86EMUL_CONTINUE)
2313 return ret; 2254 return ret;
2314 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); 2255 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
2315 if (ret != X86EMUL_CONTINUE) 2256 if (ret != X86EMUL_CONTINUE)
2316 return ret; 2257 return ret;
2317 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); 2258 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
2318 if (ret != X86EMUL_CONTINUE) 2259 if (ret != X86EMUL_CONTINUE)
2319 return ret; 2260 return ret;
2320 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); 2261 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
2321 if (ret != X86EMUL_CONTINUE) 2262 if (ret != X86EMUL_CONTINUE)
2322 return ret; 2263 return ret;
2323 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); 2264 ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
2324 if (ret != X86EMUL_CONTINUE) 2265 if (ret != X86EMUL_CONTINUE)
2325 return ret; 2266 return ret;
2326 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); 2267 ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
2327 if (ret != X86EMUL_CONTINUE) 2268 if (ret != X86EMUL_CONTINUE)
2328 return ret; 2269 return ret;
2329 2270
@@ -2331,10 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2331} 2272}
2332 2273
2333static int task_switch_32(struct x86_emulate_ctxt *ctxt, 2274static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2334 struct x86_emulate_ops *ops,
2335 u16 tss_selector, u16 old_tss_sel, 2275 u16 tss_selector, u16 old_tss_sel,
2336 ulong old_tss_base, struct desc_struct *new_desc) 2276 ulong old_tss_base, struct desc_struct *new_desc)
2337{ 2277{
2278 struct x86_emulate_ops *ops = ctxt->ops;
2338 struct tss_segment_32 tss_seg; 2279 struct tss_segment_32 tss_seg;
2339 int ret; 2280 int ret;
2340 u32 new_tss_base = get_desc_base(new_desc); 2281 u32 new_tss_base = get_desc_base(new_desc);
@@ -2345,7 +2286,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2345 /* FIXME: need to provide precise fault address */ 2286 /* FIXME: need to provide precise fault address */
2346 return ret; 2287 return ret;
2347 2288
2348 save_state_to_tss32(ctxt, ops, &tss_seg); 2289 save_state_to_tss32(ctxt, &tss_seg);
2349 2290
2350 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2291 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2351 &ctxt->exception); 2292 &ctxt->exception);
@@ -2371,14 +2312,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2371 return ret; 2312 return ret;
2372 } 2313 }
2373 2314
2374 return load_state_from_tss32(ctxt, ops, &tss_seg); 2315 return load_state_from_tss32(ctxt, &tss_seg);
2375} 2316}
2376 2317
2377static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, 2318static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 struct x86_emulate_ops *ops,
2379 u16 tss_selector, int reason, 2319 u16 tss_selector, int reason,
2380 bool has_error_code, u32 error_code) 2320 bool has_error_code, u32 error_code)
2381{ 2321{
2322 struct x86_emulate_ops *ops = ctxt->ops;
2382 struct desc_struct curr_tss_desc, next_tss_desc; 2323 struct desc_struct curr_tss_desc, next_tss_desc;
2383 int ret; 2324 int ret;
2384 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); 2325 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2388,10 +2329,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2388 2329
2389 /* FIXME: old_tss_base == ~0 ? */ 2330 /* FIXME: old_tss_base == ~0 ? */
2390 2331
2391 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); 2332 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
2392 if (ret != X86EMUL_CONTINUE) 2333 if (ret != X86EMUL_CONTINUE)
2393 return ret; 2334 return ret;
2394 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); 2335 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
2395 if (ret != X86EMUL_CONTINUE) 2336 if (ret != X86EMUL_CONTINUE)
2396 return ret; 2337 return ret;
2397 2338
@@ -2413,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2413 2354
2414 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 2355 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2415 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ 2356 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2416 write_segment_descriptor(ctxt, ops, old_tss_sel, 2357 write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
2417 &curr_tss_desc);
2418 } 2358 }
2419 2359
2420 if (reason == TASK_SWITCH_IRET) 2360 if (reason == TASK_SWITCH_IRET)
@@ -2426,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2426 old_tss_sel = 0xffff; 2366 old_tss_sel = 0xffff;
2427 2367
2428 if (next_tss_desc.type & 8) 2368 if (next_tss_desc.type & 8)
2429 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, 2369 ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
2430 old_tss_base, &next_tss_desc); 2370 old_tss_base, &next_tss_desc);
2431 else 2371 else
2432 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, 2372 ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
2433 old_tss_base, &next_tss_desc); 2373 old_tss_base, &next_tss_desc);
2434 if (ret != X86EMUL_CONTINUE) 2374 if (ret != X86EMUL_CONTINUE)
2435 return ret; 2375 return ret;
@@ -2439,19 +2379,16 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2439 2379
2440 if (reason != TASK_SWITCH_IRET) { 2380 if (reason != TASK_SWITCH_IRET) {
2441 next_tss_desc.type |= (1 << 1); /* set busy flag */ 2381 next_tss_desc.type |= (1 << 1); /* set busy flag */
2442 write_segment_descriptor(ctxt, ops, tss_selector, 2382 write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
2443 &next_tss_desc);
2444 } 2383 }
2445 2384
2446 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); 2385 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
2447 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); 2386 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
2448 2387
2449 if (has_error_code) { 2388 if (has_error_code) {
2450 struct decode_cache *c = &ctxt->decode; 2389 ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2451 2390 ctxt->lock_prefix = 0;
2452 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2391 ctxt->src.val = (unsigned long) error_code;
2453 c->lock_prefix = 0;
2454 c->src.val = (unsigned long) error_code;
2455 ret = em_push(ctxt); 2392 ret = em_push(ctxt);
2456 } 2393 }
2457 2394
@@ -2462,18 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2462 u16 tss_selector, int reason, 2399 u16 tss_selector, int reason,
2463 bool has_error_code, u32 error_code) 2400 bool has_error_code, u32 error_code)
2464{ 2401{
2465 struct x86_emulate_ops *ops = ctxt->ops;
2466 struct decode_cache *c = &ctxt->decode;
2467 int rc; 2402 int rc;
2468 2403
2469 c->eip = ctxt->eip; 2404 ctxt->_eip = ctxt->eip;
2470 c->dst.type = OP_NONE; 2405 ctxt->dst.type = OP_NONE;
2471 2406
2472 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2407 rc = emulator_do_task_switch(ctxt, tss_selector, reason,
2473 has_error_code, error_code); 2408 has_error_code, error_code);
2474 2409
2475 if (rc == X86EMUL_CONTINUE) 2410 if (rc == X86EMUL_CONTINUE)
2476 ctxt->eip = c->eip; 2411 ctxt->eip = ctxt->_eip;
2477 2412
2478 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 2413 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2479} 2414}
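
Two bit-level details in emulator_do_task_switch() above are easy to miss in the noise: bit 3 of the TSS descriptor type (type & 8) selects the 32-bit versus 16-bit format, and bit 1 is the busy flag, cleared on the outgoing task for IRET/JMP and set on the incoming task for anything but IRET. A restatement of just that bookkeeping, with the descriptor reduced to its type byte and illustrative enum names:

#include <stdio.h>

#define TSS_TYPE_32BIT  0x8     /* type & 8: 32-bit TSS, else 16-bit */
#define TSS_TYPE_BUSY   0x2     /* bit 1: busy flag */

enum sw_reason { SW_CALL, SW_JMP, SW_IRET };

/* Restate the busy-flag bookkeeping from emulator_do_task_switch(). */
static void switch_tasks(unsigned char *old_type, unsigned char *new_type,
                         enum sw_reason reason)
{
        if (reason == SW_IRET || reason == SW_JMP)
                *old_type &= ~TSS_TYPE_BUSY;    /* outgoing task no longer busy */
        if (reason != SW_IRET)
                *new_type |= TSS_TYPE_BUSY;     /* incoming task marked busy */
}

int main(void)
{
        unsigned char old_tss = 0x9 | TSS_TYPE_BUSY;    /* busy 32-bit TSS */
        unsigned char new_tss = 0x9;                    /* available 32-bit TSS */

        switch_tasks(&old_tss, &new_tss, SW_JMP);
        printf("old=%#x new=%#x 32-bit=%d\n",
               old_tss, new_tss, (new_tss & TSS_TYPE_32BIT) != 0);
        return 0;
}
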
@@ -2481,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2481static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, 2416static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2482 int reg, struct operand *op) 2417 int reg, struct operand *op)
2483{ 2418{
2484 struct decode_cache *c = &ctxt->decode;
2485 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2419 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2486 2420
2487 register_address_increment(c, &c->regs[reg], df * op->bytes); 2421 register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
2488 op->addr.mem.ea = register_address(c, c->regs[reg]); 2422 op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
2489 op->addr.mem.seg = seg; 2423 op->addr.mem.seg = seg;
2490} 2424}
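
string_addr_inc() above is the whole of string-instruction addressing: the index register moves by the operand size, forward when EFLAGS.DF is clear and backward when it is set, and the new effective address follows the register. A throwaway model with invented names (the emulator additionally masks the result by the address size, which is omitted here):

#include <stdio.h>
#include <stdbool.h>

/* Model of string_addr_inc(): advance an index register by +/- op_bytes
 * depending on the direction flag, and report the new effective address. */
static unsigned long string_addr_inc(unsigned long *reg, int op_bytes, bool df)
{
        *reg += (df ? -1 : 1) * op_bytes;
        return *reg;
}

int main(void)
{
        unsigned long rsi = 0x1000;

        printf("%#lx\n", string_addr_inc(&rsi, 4, false));      /* 0x1004 */
        printf("%#lx\n", string_addr_inc(&rsi, 4, true));       /* back to 0x1000 */
        return 0;
}
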
2491 2425
2492static int em_das(struct x86_emulate_ctxt *ctxt) 2426static int em_das(struct x86_emulate_ctxt *ctxt)
2493{ 2427{
2494 struct decode_cache *c = &ctxt->decode;
2495 u8 al, old_al; 2428 u8 al, old_al;
2496 bool af, cf, old_cf; 2429 bool af, cf, old_cf;
2497 2430
2498 cf = ctxt->eflags & X86_EFLAGS_CF; 2431 cf = ctxt->eflags & X86_EFLAGS_CF;
2499 al = c->dst.val; 2432 al = ctxt->dst.val;
2500 2433
2501 old_al = al; 2434 old_al = al;
2502 old_cf = cf; 2435 old_cf = cf;
@@ -2514,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2514 cf = true; 2447 cf = true;
2515 } 2448 }
2516 2449
2517 c->dst.val = al; 2450 ctxt->dst.val = al;
2518 /* Set PF, ZF, SF */ 2451 /* Set PF, ZF, SF */
2519 c->src.type = OP_IMM; 2452 ctxt->src.type = OP_IMM;
2520 c->src.val = 0; 2453 ctxt->src.val = 0;
2521 c->src.bytes = 1; 2454 ctxt->src.bytes = 1;
2522 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2455 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
2523 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2456 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2524 if (cf) 2457 if (cf)
2525 ctxt->eflags |= X86_EFLAGS_CF; 2458 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2530,175 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2530 2463
2531static int em_call_far(struct x86_emulate_ctxt *ctxt) 2464static int em_call_far(struct x86_emulate_ctxt *ctxt)
2532{ 2465{
2533 struct decode_cache *c = &ctxt->decode;
2534 u16 sel, old_cs; 2466 u16 sel, old_cs;
2535 ulong old_eip; 2467 ulong old_eip;
2536 int rc; 2468 int rc;
2537 2469
2538 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2470 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2539 old_eip = c->eip; 2471 old_eip = ctxt->_eip;
2540 2472
2541 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 2473 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
2542 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) 2474 if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
2543 return X86EMUL_CONTINUE; 2475 return X86EMUL_CONTINUE;
2544 2476
2545 c->eip = 0; 2477 ctxt->_eip = 0;
2546 memcpy(&c->eip, c->src.valptr, c->op_bytes); 2478 memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
2547 2479
2548 c->src.val = old_cs; 2480 ctxt->src.val = old_cs;
2549 rc = em_push(ctxt); 2481 rc = em_push(ctxt);
2550 if (rc != X86EMUL_CONTINUE) 2482 if (rc != X86EMUL_CONTINUE)
2551 return rc; 2483 return rc;
2552 2484
2553 c->src.val = old_eip; 2485 ctxt->src.val = old_eip;
2554 return em_push(ctxt); 2486 return em_push(ctxt);
2555} 2487}
2556 2488
2557static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) 2489static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2558{ 2490{
2559 struct decode_cache *c = &ctxt->decode;
2560 int rc; 2491 int rc;
2561 2492
2562 c->dst.type = OP_REG; 2493 ctxt->dst.type = OP_REG;
2563 c->dst.addr.reg = &c->eip; 2494 ctxt->dst.addr.reg = &ctxt->_eip;
2564 c->dst.bytes = c->op_bytes; 2495 ctxt->dst.bytes = ctxt->op_bytes;
2565 rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); 2496 rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
2566 if (rc != X86EMUL_CONTINUE) 2497 if (rc != X86EMUL_CONTINUE)
2567 return rc; 2498 return rc;
2568 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); 2499 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
2569 return X86EMUL_CONTINUE; 2500 return X86EMUL_CONTINUE;
2570} 2501}
2571 2502
2572static int em_add(struct x86_emulate_ctxt *ctxt) 2503static int em_add(struct x86_emulate_ctxt *ctxt)
2573{ 2504{
2574 struct decode_cache *c = &ctxt->decode; 2505 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
2575
2576 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2577 return X86EMUL_CONTINUE; 2506 return X86EMUL_CONTINUE;
2578} 2507}
2579 2508
2580static int em_or(struct x86_emulate_ctxt *ctxt) 2509static int em_or(struct x86_emulate_ctxt *ctxt)
2581{ 2510{
2582 struct decode_cache *c = &ctxt->decode; 2511 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
2583
2584 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2585 return X86EMUL_CONTINUE; 2512 return X86EMUL_CONTINUE;
2586} 2513}
2587 2514
2588static int em_adc(struct x86_emulate_ctxt *ctxt) 2515static int em_adc(struct x86_emulate_ctxt *ctxt)
2589{ 2516{
2590 struct decode_cache *c = &ctxt->decode; 2517 emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
2591
2592 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2593 return X86EMUL_CONTINUE; 2518 return X86EMUL_CONTINUE;
2594} 2519}
2595 2520
2596static int em_sbb(struct x86_emulate_ctxt *ctxt) 2521static int em_sbb(struct x86_emulate_ctxt *ctxt)
2597{ 2522{
2598 struct decode_cache *c = &ctxt->decode; 2523 emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
2599
2600 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2601 return X86EMUL_CONTINUE; 2524 return X86EMUL_CONTINUE;
2602} 2525}
2603 2526
2604static int em_and(struct x86_emulate_ctxt *ctxt) 2527static int em_and(struct x86_emulate_ctxt *ctxt)
2605{ 2528{
2606 struct decode_cache *c = &ctxt->decode; 2529 emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
2607
2608 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2609 return X86EMUL_CONTINUE; 2530 return X86EMUL_CONTINUE;
2610} 2531}
2611 2532
2612static int em_sub(struct x86_emulate_ctxt *ctxt) 2533static int em_sub(struct x86_emulate_ctxt *ctxt)
2613{ 2534{
2614 struct decode_cache *c = &ctxt->decode; 2535 emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
2615
2616 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2617 return X86EMUL_CONTINUE; 2536 return X86EMUL_CONTINUE;
2618} 2537}
2619 2538
2620static int em_xor(struct x86_emulate_ctxt *ctxt) 2539static int em_xor(struct x86_emulate_ctxt *ctxt)
2621{ 2540{
2622 struct decode_cache *c = &ctxt->decode; 2541 emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
2623
2624 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2625 return X86EMUL_CONTINUE; 2542 return X86EMUL_CONTINUE;
2626} 2543}
2627 2544
2628static int em_cmp(struct x86_emulate_ctxt *ctxt) 2545static int em_cmp(struct x86_emulate_ctxt *ctxt)
2629{ 2546{
2630 struct decode_cache *c = &ctxt->decode; 2547 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
2631
2632 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2633 /* Disable writeback. */ 2548 /* Disable writeback. */
2634 c->dst.type = OP_NONE; 2549 ctxt->dst.type = OP_NONE;
2635 return X86EMUL_CONTINUE; 2550 return X86EMUL_CONTINUE;
2636} 2551}
2637 2552
2638static int em_imul(struct x86_emulate_ctxt *ctxt) 2553static int em_test(struct x86_emulate_ctxt *ctxt)
2554{
2555 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
2556 return X86EMUL_CONTINUE;
2557}
2558
2559static int em_xchg(struct x86_emulate_ctxt *ctxt)
2639{ 2560{
2640 struct decode_cache *c = &ctxt->decode; 2561 /* Write back the register source. */
2562 ctxt->src.val = ctxt->dst.val;
2563 write_register_operand(&ctxt->src);
2641 2564
2642 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); 2565 /* Write back the memory destination with implicit LOCK prefix. */
2566 ctxt->dst.val = ctxt->src.orig_val;
2567 ctxt->lock_prefix = 1;
2643 return X86EMUL_CONTINUE; 2568 return X86EMUL_CONTINUE;
2644} 2569}
2645 2570
2646static int em_imul_3op(struct x86_emulate_ctxt *ctxt) 2571static int em_imul(struct x86_emulate_ctxt *ctxt)
2647{ 2572{
2648 struct decode_cache *c = &ctxt->decode; 2573 emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
2574 return X86EMUL_CONTINUE;
2575}
2649 2576
2650 c->dst.val = c->src2.val; 2577static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2578{
2579 ctxt->dst.val = ctxt->src2.val;
2651 return em_imul(ctxt); 2580 return em_imul(ctxt);
2652} 2581}
2653 2582
2654static int em_cwd(struct x86_emulate_ctxt *ctxt) 2583static int em_cwd(struct x86_emulate_ctxt *ctxt)
2655{ 2584{
2656 struct decode_cache *c = &ctxt->decode; 2585 ctxt->dst.type = OP_REG;
2657 2586 ctxt->dst.bytes = ctxt->src.bytes;
2658 c->dst.type = OP_REG; 2587 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
2659 c->dst.bytes = c->src.bytes; 2588 ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
2660 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2661 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2662 2589
2663 return X86EMUL_CONTINUE; 2590 return X86EMUL_CONTINUE;
2664} 2591}
2665 2592
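The expression in em_cwd() above, ~((src >> (bytes * 8 - 1)) - 1), evaluates to 0 when the source's sign bit is clear and to all ones when it is set, which is exactly the CWD/CDQ/CQO behaviour of filling (R/E)DX with the sign of (R/E)AX. A small self-contained illustration of the trick in plain C (not emulator code):

#include <stdint.h>
#include <stdio.h>

/* 0 if the sign bit for the given operand size is clear,
 * ~0 (all ones) if it is set -- the mask em_cwd() stores into RDX. */
static uint64_t sign_fill(uint64_t val, unsigned int bytes)
{
	return ~((val >> (bytes * 8 - 1)) - 1);
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)sign_fill(0x7fff, 2));     /* 0: positive s16  */
	printf("%#llx\n", (unsigned long long)sign_fill(0x8000, 2));     /* ~0: negative s16 */
	printf("%#llx\n", (unsigned long long)sign_fill(1ull << 63, 8)); /* ~0: negative s64 */
	return 0;
}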
2666static int em_rdtsc(struct x86_emulate_ctxt *ctxt) 2593static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2667{ 2594{
2668 struct decode_cache *c = &ctxt->decode;
2669 u64 tsc = 0; 2595 u64 tsc = 0;
2670 2596
2671 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); 2597 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2672 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2598 ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
2673 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2599 ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
2674 return X86EMUL_CONTINUE; 2600 return X86EMUL_CONTINUE;
2675} 2601}
2676 2602
2677static int em_mov(struct x86_emulate_ctxt *ctxt) 2603static int em_mov(struct x86_emulate_ctxt *ctxt)
2678{ 2604{
2679 struct decode_cache *c = &ctxt->decode; 2605 ctxt->dst.val = ctxt->src.val;
2680 c->dst.val = c->src.val;
2681 return X86EMUL_CONTINUE; 2606 return X86EMUL_CONTINUE;
2682} 2607}
2683 2608
2609static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
2610{
2611 if (ctxt->modrm_reg > VCPU_SREG_GS)
2612 return emulate_ud(ctxt);
2613
2614 ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
2615 return X86EMUL_CONTINUE;
2616}
2617
2618static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2619{
2620 u16 sel = ctxt->src.val;
2621
2622 if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
2623 return emulate_ud(ctxt);
2624
2625 if (ctxt->modrm_reg == VCPU_SREG_SS)
2626 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2627
2628 /* Disable writeback. */
2629 ctxt->dst.type = OP_NONE;
2630 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2631}
2632
2684static int em_movdqu(struct x86_emulate_ctxt *ctxt) 2633static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2685{ 2634{
2686 struct decode_cache *c = &ctxt->decode; 2635 memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
2687 memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
2688 return X86EMUL_CONTINUE; 2636 return X86EMUL_CONTINUE;
2689} 2637}
2690 2638
2691static int em_invlpg(struct x86_emulate_ctxt *ctxt) 2639static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2692{ 2640{
2693 struct decode_cache *c = &ctxt->decode;
2694 int rc; 2641 int rc;
2695 ulong linear; 2642 ulong linear;
2696 2643
2697 rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); 2644 rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
2698 if (rc == X86EMUL_CONTINUE) 2645 if (rc == X86EMUL_CONTINUE)
2699 ctxt->ops->invlpg(ctxt, linear); 2646 ctxt->ops->invlpg(ctxt, linear);
2700 /* Disable writeback. */ 2647 /* Disable writeback. */
2701 c->dst.type = OP_NONE; 2648 ctxt->dst.type = OP_NONE;
2702 return X86EMUL_CONTINUE; 2649 return X86EMUL_CONTINUE;
2703} 2650}
2704 2651
@@ -2714,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
2714 2661
2715static int em_vmcall(struct x86_emulate_ctxt *ctxt) 2662static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2716{ 2663{
2717 struct decode_cache *c = &ctxt->decode;
2718 int rc; 2664 int rc;
2719 2665
2720 if (c->modrm_mod != 3 || c->modrm_rm != 1) 2666 if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
2721 return X86EMUL_UNHANDLEABLE; 2667 return X86EMUL_UNHANDLEABLE;
2722 2668
2723 rc = ctxt->ops->fix_hypercall(ctxt); 2669 rc = ctxt->ops->fix_hypercall(ctxt);
@@ -2725,73 +2671,104 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2725 return rc; 2671 return rc;
2726 2672
2727 /* Let the processor re-execute the fixed hypercall */ 2673 /* Let the processor re-execute the fixed hypercall */
2728 c->eip = ctxt->eip; 2674 ctxt->_eip = ctxt->eip;
2729 /* Disable writeback. */ 2675 /* Disable writeback. */
2730 c->dst.type = OP_NONE; 2676 ctxt->dst.type = OP_NONE;
2731 return X86EMUL_CONTINUE; 2677 return X86EMUL_CONTINUE;
2732} 2678}
2733 2679
2734static int em_lgdt(struct x86_emulate_ctxt *ctxt) 2680static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2735{ 2681{
2736 struct decode_cache *c = &ctxt->decode;
2737 struct desc_ptr desc_ptr; 2682 struct desc_ptr desc_ptr;
2738 int rc; 2683 int rc;
2739 2684
2740 rc = read_descriptor(ctxt, c->src.addr.mem, 2685 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2741 &desc_ptr.size, &desc_ptr.address, 2686 &desc_ptr.size, &desc_ptr.address,
2742 c->op_bytes); 2687 ctxt->op_bytes);
2743 if (rc != X86EMUL_CONTINUE) 2688 if (rc != X86EMUL_CONTINUE)
2744 return rc; 2689 return rc;
2745 ctxt->ops->set_gdt(ctxt, &desc_ptr); 2690 ctxt->ops->set_gdt(ctxt, &desc_ptr);
2746 /* Disable writeback. */ 2691 /* Disable writeback. */
2747 c->dst.type = OP_NONE; 2692 ctxt->dst.type = OP_NONE;
2748 return X86EMUL_CONTINUE; 2693 return X86EMUL_CONTINUE;
2749} 2694}
2750 2695
2751static int em_vmmcall(struct x86_emulate_ctxt *ctxt) 2696static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
2752{ 2697{
2753 struct decode_cache *c = &ctxt->decode;
2754 int rc; 2698 int rc;
2755 2699
2756 rc = ctxt->ops->fix_hypercall(ctxt); 2700 rc = ctxt->ops->fix_hypercall(ctxt);
2757 2701
2758 /* Disable writeback. */ 2702 /* Disable writeback. */
2759 c->dst.type = OP_NONE; 2703 ctxt->dst.type = OP_NONE;
2760 return rc; 2704 return rc;
2761} 2705}
2762 2706
2763static int em_lidt(struct x86_emulate_ctxt *ctxt) 2707static int em_lidt(struct x86_emulate_ctxt *ctxt)
2764{ 2708{
2765 struct decode_cache *c = &ctxt->decode;
2766 struct desc_ptr desc_ptr; 2709 struct desc_ptr desc_ptr;
2767 int rc; 2710 int rc;
2768 2711
2769 rc = read_descriptor(ctxt, c->src.addr.mem, 2712 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2770 &desc_ptr.size, &desc_ptr.address, 2713 &desc_ptr.size, &desc_ptr.address,
2771 c->op_bytes); 2714 ctxt->op_bytes);
2772 if (rc != X86EMUL_CONTINUE) 2715 if (rc != X86EMUL_CONTINUE)
2773 return rc; 2716 return rc;
2774 ctxt->ops->set_idt(ctxt, &desc_ptr); 2717 ctxt->ops->set_idt(ctxt, &desc_ptr);
2775 /* Disable writeback. */ 2718 /* Disable writeback. */
2776 c->dst.type = OP_NONE; 2719 ctxt->dst.type = OP_NONE;
2777 return X86EMUL_CONTINUE; 2720 return X86EMUL_CONTINUE;
2778} 2721}
2779 2722
2780static int em_smsw(struct x86_emulate_ctxt *ctxt) 2723static int em_smsw(struct x86_emulate_ctxt *ctxt)
2781{ 2724{
2782 struct decode_cache *c = &ctxt->decode; 2725 ctxt->dst.bytes = 2;
2783 2726 ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
2784 c->dst.bytes = 2;
2785 c->dst.val = ctxt->ops->get_cr(ctxt, 0);
2786 return X86EMUL_CONTINUE; 2727 return X86EMUL_CONTINUE;
2787} 2728}
2788 2729
2789static int em_lmsw(struct x86_emulate_ctxt *ctxt) 2730static int em_lmsw(struct x86_emulate_ctxt *ctxt)
2790{ 2731{
2791 struct decode_cache *c = &ctxt->decode;
2792 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) 2732 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
2793 | (c->src.val & 0x0f)); 2733 | (ctxt->src.val & 0x0f));
2794 c->dst.type = OP_NONE; 2734 ctxt->dst.type = OP_NONE;
2735 return X86EMUL_CONTINUE;
2736}
2737
2738static int em_loop(struct x86_emulate_ctxt *ctxt)
2739{
2740 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
2741 if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
2742 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
2743 jmp_rel(ctxt, ctxt->src.val);
2744
2745 return X86EMUL_CONTINUE;
2746}
2747
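The branch condition in em_loop() above leans on the opcode encoding: 0xe0 is LOOPNE/LOOPNZ, 0xe1 is LOOPE/LOOPZ and 0xe2 is plain LOOP. XOR-ing the opcode with 0x5 maps 0xe0/0xe1 onto the standard condition codes 5 (not equal) and 4 (equal) that test_cc() evaluates, while 0xe2 short-circuits and ignores ZF. A standalone sketch of that mapping; sketch_test_cc() below is a cut-down stand-in, not the emulator's test_cc():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ZF_BIT (1u << 6) /* EFLAGS.ZF */

/* Only the two condition codes the LOOPcc trick relies on. */
static bool sketch_test_cc(unsigned int cc, uint32_t eflags)
{
	switch (cc & 0xf) {
	case 4: return (eflags & ZF_BIT) != 0; /* "equal": ZF set       */
	case 5: return (eflags & ZF_BIT) == 0; /* "not equal": ZF clear */
	default: return false;
	}
}

/* Is the LOOP/LOOPE/LOOPNE branch (opcode b) taken for this RCX and EFLAGS? */
static bool loop_taken(uint8_t b, uint64_t rcx, uint32_t eflags)
{
	rcx--; /* all three forms decrement the count first */
	return rcx != 0 && (b == 0xe2 || sketch_test_cc(b ^ 0x5, eflags));
}

int main(void)
{
	printf("%d\n", loop_taken(0xe2, 2, 0));      /* LOOP:   1 (taken)  */
	printf("%d\n", loop_taken(0xe1, 2, ZF_BIT)); /* LOOPE:  1 (ZF set) */
	printf("%d\n", loop_taken(0xe0, 2, ZF_BIT)); /* LOOPNE: 0 (ZF set) */
	return 0;
}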
2748static int em_jcxz(struct x86_emulate_ctxt *ctxt)
2749{
2750 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
2751 jmp_rel(ctxt, ctxt->src.val);
2752
2753 return X86EMUL_CONTINUE;
2754}
2755
2756static int em_cli(struct x86_emulate_ctxt *ctxt)
2757{
2758 if (emulator_bad_iopl(ctxt))
2759 return emulate_gp(ctxt, 0);
2760
2761 ctxt->eflags &= ~X86_EFLAGS_IF;
2762 return X86EMUL_CONTINUE;
2763}
2764
2765static int em_sti(struct x86_emulate_ctxt *ctxt)
2766{
2767 if (emulator_bad_iopl(ctxt))
2768 return emulate_gp(ctxt, 0);
2769
2770 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2771 ctxt->eflags |= X86_EFLAGS_IF;
2795 return X86EMUL_CONTINUE; 2772 return X86EMUL_CONTINUE;
2796} 2773}
2797 2774
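em_cli() and em_sti() above both gate on emulator_bad_iopl(): in protected mode, CLI and STI are only permitted when the current privilege level does not exceed the IOPL field in EFLAGS (bits 12-13), otherwise they raise #GP(0); STI additionally arms the interrupt shadow so one more instruction executes before an injected interrupt. A rough sketch of that privilege test, under the assumed (simplified) semantics of emulator_bad_iopl() rather than its actual body:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IOPL_SHIFT 12
#define IOPL_MASK  (3u << IOPL_SHIFT)

/* Assumed check: CLI/STI fault in protected mode unless CPL <= IOPL. */
static bool sketch_bad_iopl(uint32_t eflags, unsigned int cpl)
{
	unsigned int iopl = (eflags & IOPL_MASK) >> IOPL_SHIFT;

	return cpl > iopl;
}

int main(void)
{
	printf("%d\n", sketch_bad_iopl(0u << IOPL_SHIFT, 0)); /* ring 0, IOPL 0: ok  */
	printf("%d\n", sketch_bad_iopl(0u << IOPL_SHIFT, 3)); /* ring 3, IOPL 0: #GP */
	printf("%d\n", sketch_bad_iopl(3u << IOPL_SHIFT, 3)); /* ring 3, IOPL 3: ok  */
	return 0;
}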
@@ -2809,9 +2786,7 @@ static bool valid_cr(int nr)
2809 2786
2810static int check_cr_read(struct x86_emulate_ctxt *ctxt) 2787static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2811{ 2788{
2812 struct decode_cache *c = &ctxt->decode; 2789 if (!valid_cr(ctxt->modrm_reg))
2813
2814 if (!valid_cr(c->modrm_reg))
2815 return emulate_ud(ctxt); 2790 return emulate_ud(ctxt);
2816 2791
2817 return X86EMUL_CONTINUE; 2792 return X86EMUL_CONTINUE;
@@ -2819,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2819 2794
2820static int check_cr_write(struct x86_emulate_ctxt *ctxt) 2795static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2821{ 2796{
2822 struct decode_cache *c = &ctxt->decode; 2797 u64 new_val = ctxt->src.val64;
2823 u64 new_val = c->src.val64; 2798 int cr = ctxt->modrm_reg;
2824 int cr = c->modrm_reg;
2825 u64 efer = 0; 2799 u64 efer = 0;
2826 2800
2827 static u64 cr_reserved_bits[] = { 2801 static u64 cr_reserved_bits[] = {
@@ -2898,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
2898 2872
2899static int check_dr_read(struct x86_emulate_ctxt *ctxt) 2873static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2900{ 2874{
2901 struct decode_cache *c = &ctxt->decode; 2875 int dr = ctxt->modrm_reg;
2902 int dr = c->modrm_reg;
2903 u64 cr4; 2876 u64 cr4;
2904 2877
2905 if (dr > 7) 2878 if (dr > 7)
@@ -2917,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2917 2890
2918static int check_dr_write(struct x86_emulate_ctxt *ctxt) 2891static int check_dr_write(struct x86_emulate_ctxt *ctxt)
2919{ 2892{
2920 struct decode_cache *c = &ctxt->decode; 2893 u64 new_val = ctxt->src.val64;
2921 u64 new_val = c->src.val64; 2894 int dr = ctxt->modrm_reg;
2922 int dr = c->modrm_reg;
2923 2895
2924 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) 2896 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
2925 return emulate_gp(ctxt, 0); 2897 return emulate_gp(ctxt, 0);
@@ -2941,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
2941 2913
2942static int check_svme_pa(struct x86_emulate_ctxt *ctxt) 2914static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
2943{ 2915{
2944 u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; 2916 u64 rax = ctxt->regs[VCPU_REGS_RAX];
2945 2917
2946 /* Valid physical address? */ 2918 /* Valid physical address? */
2947 if (rax & 0xffff000000000000ULL) 2919 if (rax & 0xffff000000000000ULL)
@@ -2963,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
2963static int check_rdpmc(struct x86_emulate_ctxt *ctxt) 2935static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2964{ 2936{
2965 u64 cr4 = ctxt->ops->get_cr(ctxt, 4); 2937 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2966 u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; 2938 u64 rcx = ctxt->regs[VCPU_REGS_RCX];
2967 2939
2968 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || 2940 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
2969 (rcx > 3)) 2941 (rcx > 3))
@@ -2974,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2974 2946
2975static int check_perm_in(struct x86_emulate_ctxt *ctxt) 2947static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2976{ 2948{
2977 struct decode_cache *c = &ctxt->decode; 2949 ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
2978 2950 if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
2979 c->dst.bytes = min(c->dst.bytes, 4u);
2980 if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
2981 return emulate_gp(ctxt, 0); 2951 return emulate_gp(ctxt, 0);
2982 2952
2983 return X86EMUL_CONTINUE; 2953 return X86EMUL_CONTINUE;
@@ -2985,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2985 2955
2986static int check_perm_out(struct x86_emulate_ctxt *ctxt) 2956static int check_perm_out(struct x86_emulate_ctxt *ctxt)
2987{ 2957{
2988 struct decode_cache *c = &ctxt->decode; 2958 ctxt->src.bytes = min(ctxt->src.bytes, 4u);
2989 2959 if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
2990 c->src.bytes = min(c->src.bytes, 4u);
2991 if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
2992 return emulate_gp(ctxt, 0); 2960 return emulate_gp(ctxt, 0);
2993 2961
2994 return X86EMUL_CONTINUE; 2962 return X86EMUL_CONTINUE;
@@ -3165,12 +3133,15 @@ static struct opcode opcode_table[256] = {
3165 G(DstMem | SrcImm | ModRM | Group, group1), 3133 G(DstMem | SrcImm | ModRM | Group, group1),
3166 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), 3134 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
3167 G(DstMem | SrcImmByte | ModRM | Group, group1), 3135 G(DstMem | SrcImmByte | ModRM | Group, group1),
3168 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), 3136 I2bv(DstMem | SrcReg | ModRM, em_test),
3137 I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
3169 /* 0x88 - 0x8F */ 3138 /* 0x88 - 0x8F */
3170 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), 3139 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
3171 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), 3140 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
3172 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), 3141 I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
3173 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), 3142 D(ModRM | SrcMem | NoAccess | DstReg),
3143 I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
3144 G(0, group1A),
3174 /* 0x90 - 0x97 */ 3145 /* 0x90 - 0x97 */
3175 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), 3146 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
3176 /* 0x98 - 0x9F */ 3147 /* 0x98 - 0x9F */
@@ -3184,7 +3155,7 @@ static struct opcode opcode_table[256] = {
3184 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3155 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3185 I2bv(SrcSI | DstDI | String, em_cmp), 3156 I2bv(SrcSI | DstDI | String, em_cmp),
3186 /* 0xA8 - 0xAF */ 3157 /* 0xA8 - 0xAF */
3187 D2bv(DstAcc | SrcImm), 3158 I2bv(DstAcc | SrcImm, em_test),
3188 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3159 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3189 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3160 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3190 I2bv(SrcAcc | DstDI | String, em_cmp), 3161 I2bv(SrcAcc | DstDI | String, em_cmp),
@@ -3195,25 +3166,26 @@ static struct opcode opcode_table[256] = {
3195 /* 0xC0 - 0xC7 */ 3166 /* 0xC0 - 0xC7 */
3196 D2bv(DstMem | SrcImmByte | ModRM), 3167 D2bv(DstMem | SrcImmByte | ModRM),
3197 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3168 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3198 D(ImplicitOps | Stack), 3169 I(ImplicitOps | Stack, em_ret),
3199 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), 3170 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
3200 G(ByteOp, group11), G(0, group11), 3171 G(ByteOp, group11), G(0, group11),
3201 /* 0xC8 - 0xCF */ 3172 /* 0xC8 - 0xCF */
3202 N, N, N, D(ImplicitOps | Stack), 3173 N, N, N, I(ImplicitOps | Stack, em_ret_far),
3203 D(ImplicitOps), DI(SrcImmByte, intn), 3174 D(ImplicitOps), DI(SrcImmByte, intn),
3204 D(ImplicitOps | No64), DI(ImplicitOps, iret), 3175 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3205 /* 0xD0 - 0xD7 */ 3176 /* 0xD0 - 0xD7 */
3206 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3177 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
3207 N, N, N, N, 3178 N, N, N, N,
3208 /* 0xD8 - 0xDF */ 3179 /* 0xD8 - 0xDF */
3209 N, N, N, N, N, N, N, N, 3180 N, N, N, N, N, N, N, N,
3210 /* 0xE0 - 0xE7 */ 3181 /* 0xE0 - 0xE7 */
3211 X4(D(SrcImmByte)), 3182 X3(I(SrcImmByte, em_loop)),
3183 I(SrcImmByte, em_jcxz),
3212 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), 3184 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
3213 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), 3185 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
3214 /* 0xE8 - 0xEF */ 3186 /* 0xE8 - 0xEF */
3215 D(SrcImm | Stack), D(SrcImm | ImplicitOps), 3187 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
3216 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), 3188 I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
3217 D2bvIP(SrcDX | DstAcc, in, check_perm_in), 3189 D2bvIP(SrcDX | DstAcc, in, check_perm_in),
3218 D2bvIP(SrcAcc | DstDX, out, check_perm_out), 3190 D2bvIP(SrcAcc | DstDX, out, check_perm_out),
3219 /* 0xF0 - 0xF7 */ 3191 /* 0xF0 - 0xF7 */
@@ -3221,14 +3193,16 @@ static struct opcode opcode_table[256] = {
3221 DI(ImplicitOps | Priv, hlt), D(ImplicitOps), 3193 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
3222 G(ByteOp, group3), G(0, group3), 3194 G(ByteOp, group3), G(0, group3),
3223 /* 0xF8 - 0xFF */ 3195 /* 0xF8 - 0xFF */
3224 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), 3196 D(ImplicitOps), D(ImplicitOps),
3197 I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
3225 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3198 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3226}; 3199};
3227 3200
3228static struct opcode twobyte_table[256] = { 3201static struct opcode twobyte_table[256] = {
3229 /* 0x00 - 0x0F */ 3202 /* 0x00 - 0x0F */
3230 G(0, group6), GD(0, &group7), N, N, 3203 G(0, group6), GD(0, &group7), N, N,
3231 N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, 3204 N, I(ImplicitOps | VendorSpecific, em_syscall),
3205 II(ImplicitOps | Priv, em_clts, clts), N,
3232 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 3206 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
3233 N, D(ImplicitOps | ModRM), N, N, 3207 N, D(ImplicitOps | ModRM), N, N,
3234 /* 0x10 - 0x1F */ 3208 /* 0x10 - 0x1F */
@@ -3245,7 +3219,8 @@ static struct opcode twobyte_table[256] = {
3245 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3219 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3246 DI(ImplicitOps | Priv, rdmsr), 3220 DI(ImplicitOps | Priv, rdmsr),
3247 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), 3221 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
3248 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), 3222 I(ImplicitOps | VendorSpecific, em_sysenter),
3223 I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
3249 N, N, 3224 N, N,
3250 N, N, N, N, N, N, N, N, 3225 N, N, N, N, N, N, N, N,
3251 /* 0x40 - 0x4F */ 3226 /* 0x40 - 0x4F */
@@ -3313,11 +3288,11 @@ static struct opcode twobyte_table[256] = {
3313#undef I2bv 3288#undef I2bv
3314#undef I6ALU 3289#undef I6ALU
3315 3290
3316static unsigned imm_size(struct decode_cache *c) 3291static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
3317{ 3292{
3318 unsigned size; 3293 unsigned size;
3319 3294
3320 size = (c->d & ByteOp) ? 1 : c->op_bytes; 3295 size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3321 if (size == 8) 3296 if (size == 8)
3322 size = 4; 3297 size = 4;
3323 return size; 3298 return size;
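imm_size() above clamps an 8-byte operand size back to 4 because, with the notable exception of MOV r64, imm64, x86-64 instructions with a 64-bit operand size still encode at most a 32-bit immediate that the CPU sign-extends to 64 bits; decode_imm() below matches that by fetching the value as a signed 32-bit quantity. A tiny illustration of the widening, independent of the emulator:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t imm32 = -2;             /* encoded in the instruction as 0xfffffffe */
	int64_t wide  = (int64_t)imm32; /* sign-extended for the 64-bit operation   */

	printf("%#llx\n", (unsigned long long)(uint64_t)wide); /* 0xfffffffffffffffe */
	return 0;
}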
@@ -3326,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c)
3326static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, 3301static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3327 unsigned size, bool sign_extension) 3302 unsigned size, bool sign_extension)
3328{ 3303{
3329 struct decode_cache *c = &ctxt->decode;
3330 struct x86_emulate_ops *ops = ctxt->ops;
3331 int rc = X86EMUL_CONTINUE; 3304 int rc = X86EMUL_CONTINUE;
3332 3305
3333 op->type = OP_IMM; 3306 op->type = OP_IMM;
3334 op->bytes = size; 3307 op->bytes = size;
3335 op->addr.mem.ea = c->eip; 3308 op->addr.mem.ea = ctxt->_eip;
3336 /* NB. Immediates are sign-extended as necessary. */ 3309 /* NB. Immediates are sign-extended as necessary. */
3337 switch (op->bytes) { 3310 switch (op->bytes) {
3338 case 1: 3311 case 1:
3339 op->val = insn_fetch(s8, 1, c->eip); 3312 op->val = insn_fetch(s8, 1, ctxt->_eip);
3340 break; 3313 break;
3341 case 2: 3314 case 2:
3342 op->val = insn_fetch(s16, 2, c->eip); 3315 op->val = insn_fetch(s16, 2, ctxt->_eip);
3343 break; 3316 break;
3344 case 4: 3317 case 4:
3345 op->val = insn_fetch(s32, 4, c->eip); 3318 op->val = insn_fetch(s32, 4, ctxt->_eip);
3346 break; 3319 break;
3347 } 3320 }
3348 if (!sign_extension) { 3321 if (!sign_extension) {
@@ -3362,11 +3335,8 @@ done:
3362 return rc; 3335 return rc;
3363} 3336}
3364 3337
3365int 3338int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3366x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3367{ 3339{
3368 struct x86_emulate_ops *ops = ctxt->ops;
3369 struct decode_cache *c = &ctxt->decode;
3370 int rc = X86EMUL_CONTINUE; 3340 int rc = X86EMUL_CONTINUE;
3371 int mode = ctxt->mode; 3341 int mode = ctxt->mode;
3372 int def_op_bytes, def_ad_bytes, goffset, simd_prefix; 3342 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
@@ -3374,11 +3344,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3374 struct opcode opcode; 3344 struct opcode opcode;
3375 struct operand memop = { .type = OP_NONE }, *memopp = NULL; 3345 struct operand memop = { .type = OP_NONE }, *memopp = NULL;
3376 3346
3377 c->eip = ctxt->eip; 3347 ctxt->_eip = ctxt->eip;
3378 c->fetch.start = c->eip; 3348 ctxt->fetch.start = ctxt->_eip;
3379 c->fetch.end = c->fetch.start + insn_len; 3349 ctxt->fetch.end = ctxt->fetch.start + insn_len;
3380 if (insn_len > 0) 3350 if (insn_len > 0)
3381 memcpy(c->fetch.data, insn, insn_len); 3351 memcpy(ctxt->fetch.data, insn, insn_len);
3382 3352
3383 switch (mode) { 3353 switch (mode) {
3384 case X86EMUL_MODE_REAL: 3354 case X86EMUL_MODE_REAL:
@@ -3399,46 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3399 return -1; 3369 return -1;
3400 } 3370 }
3401 3371
3402 c->op_bytes = def_op_bytes; 3372 ctxt->op_bytes = def_op_bytes;
3403 c->ad_bytes = def_ad_bytes; 3373 ctxt->ad_bytes = def_ad_bytes;
3404 3374
3405 /* Legacy prefixes. */ 3375 /* Legacy prefixes. */
3406 for (;;) { 3376 for (;;) {
3407 switch (c->b = insn_fetch(u8, 1, c->eip)) { 3377 switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
3408 case 0x66: /* operand-size override */ 3378 case 0x66: /* operand-size override */
3409 op_prefix = true; 3379 op_prefix = true;
3410 /* switch between 2/4 bytes */ 3380 /* switch between 2/4 bytes */
3411 c->op_bytes = def_op_bytes ^ 6; 3381 ctxt->op_bytes = def_op_bytes ^ 6;
3412 break; 3382 break;
3413 case 0x67: /* address-size override */ 3383 case 0x67: /* address-size override */
3414 if (mode == X86EMUL_MODE_PROT64) 3384 if (mode == X86EMUL_MODE_PROT64)
3415 /* switch between 4/8 bytes */ 3385 /* switch between 4/8 bytes */
3416 c->ad_bytes = def_ad_bytes ^ 12; 3386 ctxt->ad_bytes = def_ad_bytes ^ 12;
3417 else 3387 else
3418 /* switch between 2/4 bytes */ 3388 /* switch between 2/4 bytes */
3419 c->ad_bytes = def_ad_bytes ^ 6; 3389 ctxt->ad_bytes = def_ad_bytes ^ 6;
3420 break; 3390 break;
3421 case 0x26: /* ES override */ 3391 case 0x26: /* ES override */
3422 case 0x2e: /* CS override */ 3392 case 0x2e: /* CS override */
3423 case 0x36: /* SS override */ 3393 case 0x36: /* SS override */
3424 case 0x3e: /* DS override */ 3394 case 0x3e: /* DS override */
3425 set_seg_override(c, (c->b >> 3) & 3); 3395 set_seg_override(ctxt, (ctxt->b >> 3) & 3);
3426 break; 3396 break;
3427 case 0x64: /* FS override */ 3397 case 0x64: /* FS override */
3428 case 0x65: /* GS override */ 3398 case 0x65: /* GS override */
3429 set_seg_override(c, c->b & 7); 3399 set_seg_override(ctxt, ctxt->b & 7);
3430 break; 3400 break;
3431 case 0x40 ... 0x4f: /* REX */ 3401 case 0x40 ... 0x4f: /* REX */
3432 if (mode != X86EMUL_MODE_PROT64) 3402 if (mode != X86EMUL_MODE_PROT64)
3433 goto done_prefixes; 3403 goto done_prefixes;
3434 c->rex_prefix = c->b; 3404 ctxt->rex_prefix = ctxt->b;
3435 continue; 3405 continue;
3436 case 0xf0: /* LOCK */ 3406 case 0xf0: /* LOCK */
3437 c->lock_prefix = 1; 3407 ctxt->lock_prefix = 1;
3438 break; 3408 break;
3439 case 0xf2: /* REPNE/REPNZ */ 3409 case 0xf2: /* REPNE/REPNZ */
3440 case 0xf3: /* REP/REPE/REPZ */ 3410 case 0xf3: /* REP/REPE/REPZ */
3441 c->rep_prefix = c->b; 3411 ctxt->rep_prefix = ctxt->b;
3442 break; 3412 break;
3443 default: 3413 default:
3444 goto done_prefixes; 3414 goto done_prefixes;
@@ -3446,50 +3416,50 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3446 3416
3447 /* Any legacy prefix after a REX prefix nullifies its effect. */ 3417 /* Any legacy prefix after a REX prefix nullifies its effect. */
3448 3418
3449 c->rex_prefix = 0; 3419 ctxt->rex_prefix = 0;
3450 } 3420 }
3451 3421
3452done_prefixes: 3422done_prefixes:
3453 3423
3454 /* REX prefix. */ 3424 /* REX prefix. */
3455 if (c->rex_prefix & 8) 3425 if (ctxt->rex_prefix & 8)
3456 c->op_bytes = 8; /* REX.W */ 3426 ctxt->op_bytes = 8; /* REX.W */
3457 3427
3458 /* Opcode byte(s). */ 3428 /* Opcode byte(s). */
3459 opcode = opcode_table[c->b]; 3429 opcode = opcode_table[ctxt->b];
3460 /* Two-byte opcode? */ 3430 /* Two-byte opcode? */
3461 if (c->b == 0x0f) { 3431 if (ctxt->b == 0x0f) {
3462 c->twobyte = 1; 3432 ctxt->twobyte = 1;
3463 c->b = insn_fetch(u8, 1, c->eip); 3433 ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
3464 opcode = twobyte_table[c->b]; 3434 opcode = twobyte_table[ctxt->b];
3465 } 3435 }
3466 c->d = opcode.flags; 3436 ctxt->d = opcode.flags;
3467 3437
3468 while (c->d & GroupMask) { 3438 while (ctxt->d & GroupMask) {
3469 switch (c->d & GroupMask) { 3439 switch (ctxt->d & GroupMask) {
3470 case Group: 3440 case Group:
3471 c->modrm = insn_fetch(u8, 1, c->eip); 3441 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
3472 --c->eip; 3442 --ctxt->_eip;
3473 goffset = (c->modrm >> 3) & 7; 3443 goffset = (ctxt->modrm >> 3) & 7;
3474 opcode = opcode.u.group[goffset]; 3444 opcode = opcode.u.group[goffset];
3475 break; 3445 break;
3476 case GroupDual: 3446 case GroupDual:
3477 c->modrm = insn_fetch(u8, 1, c->eip); 3447 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
3478 --c->eip; 3448 --ctxt->_eip;
3479 goffset = (c->modrm >> 3) & 7; 3449 goffset = (ctxt->modrm >> 3) & 7;
3480 if ((c->modrm >> 6) == 3) 3450 if ((ctxt->modrm >> 6) == 3)
3481 opcode = opcode.u.gdual->mod3[goffset]; 3451 opcode = opcode.u.gdual->mod3[goffset];
3482 else 3452 else
3483 opcode = opcode.u.gdual->mod012[goffset]; 3453 opcode = opcode.u.gdual->mod012[goffset];
3484 break; 3454 break;
3485 case RMExt: 3455 case RMExt:
3486 goffset = c->modrm & 7; 3456 goffset = ctxt->modrm & 7;
3487 opcode = opcode.u.group[goffset]; 3457 opcode = opcode.u.group[goffset];
3488 break; 3458 break;
3489 case Prefix: 3459 case Prefix:
3490 if (c->rep_prefix && op_prefix) 3460 if (ctxt->rep_prefix && op_prefix)
3491 return X86EMUL_UNHANDLEABLE; 3461 return X86EMUL_UNHANDLEABLE;
3492 simd_prefix = op_prefix ? 0x66 : c->rep_prefix; 3462 simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
3493 switch (simd_prefix) { 3463 switch (simd_prefix) {
3494 case 0x00: opcode = opcode.u.gprefix->pfx_no; break; 3464 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
3495 case 0x66: opcode = opcode.u.gprefix->pfx_66; break; 3465 case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -3501,61 +3471,61 @@ done_prefixes:
3501 return X86EMUL_UNHANDLEABLE; 3471 return X86EMUL_UNHANDLEABLE;
3502 } 3472 }
3503 3473
3504 c->d &= ~GroupMask; 3474 ctxt->d &= ~GroupMask;
3505 c->d |= opcode.flags; 3475 ctxt->d |= opcode.flags;
3506 } 3476 }
3507 3477
3508 c->execute = opcode.u.execute; 3478 ctxt->execute = opcode.u.execute;
3509 c->check_perm = opcode.check_perm; 3479 ctxt->check_perm = opcode.check_perm;
3510 c->intercept = opcode.intercept; 3480 ctxt->intercept = opcode.intercept;
3511 3481
3512 /* Unrecognised? */ 3482 /* Unrecognised? */
3513 if (c->d == 0 || (c->d & Undefined)) 3483 if (ctxt->d == 0 || (ctxt->d & Undefined))
3514 return -1; 3484 return -1;
3515 3485
3516 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 3486 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
3517 return -1; 3487 return -1;
3518 3488
3519 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 3489 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
3520 c->op_bytes = 8; 3490 ctxt->op_bytes = 8;
3521 3491
3522 if (c->d & Op3264) { 3492 if (ctxt->d & Op3264) {
3523 if (mode == X86EMUL_MODE_PROT64) 3493 if (mode == X86EMUL_MODE_PROT64)
3524 c->op_bytes = 8; 3494 ctxt->op_bytes = 8;
3525 else 3495 else
3526 c->op_bytes = 4; 3496 ctxt->op_bytes = 4;
3527 } 3497 }
3528 3498
3529 if (c->d & Sse) 3499 if (ctxt->d & Sse)
3530 c->op_bytes = 16; 3500 ctxt->op_bytes = 16;
3531 3501
3532 /* ModRM and SIB bytes. */ 3502 /* ModRM and SIB bytes. */
3533 if (c->d & ModRM) { 3503 if (ctxt->d & ModRM) {
3534 rc = decode_modrm(ctxt, ops, &memop); 3504 rc = decode_modrm(ctxt, &memop);
3535 if (!c->has_seg_override) 3505 if (!ctxt->has_seg_override)
3536 set_seg_override(c, c->modrm_seg); 3506 set_seg_override(ctxt, ctxt->modrm_seg);
3537 } else if (c->d & MemAbs) 3507 } else if (ctxt->d & MemAbs)
3538 rc = decode_abs(ctxt, ops, &memop); 3508 rc = decode_abs(ctxt, &memop);
3539 if (rc != X86EMUL_CONTINUE) 3509 if (rc != X86EMUL_CONTINUE)
3540 goto done; 3510 goto done;
3541 3511
3542 if (!c->has_seg_override) 3512 if (!ctxt->has_seg_override)
3543 set_seg_override(c, VCPU_SREG_DS); 3513 set_seg_override(ctxt, VCPU_SREG_DS);
3544 3514
3545 memop.addr.mem.seg = seg_override(ctxt, c); 3515 memop.addr.mem.seg = seg_override(ctxt);
3546 3516
3547 if (memop.type == OP_MEM && c->ad_bytes != 8) 3517 if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
3548 memop.addr.mem.ea = (u32)memop.addr.mem.ea; 3518 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
3549 3519
3550 /* 3520 /*
3551 * Decode and fetch the source operand: register, memory 3521 * Decode and fetch the source operand: register, memory
3552 * or immediate. 3522 * or immediate.
3553 */ 3523 */
3554 switch (c->d & SrcMask) { 3524 switch (ctxt->d & SrcMask) {
3555 case SrcNone: 3525 case SrcNone:
3556 break; 3526 break;
3557 case SrcReg: 3527 case SrcReg:
3558 decode_register_operand(ctxt, &c->src, c, 0); 3528 decode_register_operand(ctxt, &ctxt->src, 0);
3559 break; 3529 break;
3560 case SrcMem16: 3530 case SrcMem16:
3561 memop.bytes = 2; 3531 memop.bytes = 2;
@@ -3564,60 +3534,60 @@ done_prefixes:
3564 memop.bytes = 4; 3534 memop.bytes = 4;
3565 goto srcmem_common; 3535 goto srcmem_common;
3566 case SrcMem: 3536 case SrcMem:
3567 memop.bytes = (c->d & ByteOp) ? 1 : 3537 memop.bytes = (ctxt->d & ByteOp) ? 1 :
3568 c->op_bytes; 3538 ctxt->op_bytes;
3569 srcmem_common: 3539 srcmem_common:
3570 c->src = memop; 3540 ctxt->src = memop;
3571 memopp = &c->src; 3541 memopp = &ctxt->src;
3572 break; 3542 break;
3573 case SrcImmU16: 3543 case SrcImmU16:
3574 rc = decode_imm(ctxt, &c->src, 2, false); 3544 rc = decode_imm(ctxt, &ctxt->src, 2, false);
3575 break; 3545 break;
3576 case SrcImm: 3546 case SrcImm:
3577 rc = decode_imm(ctxt, &c->src, imm_size(c), true); 3547 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
3578 break; 3548 break;
3579 case SrcImmU: 3549 case SrcImmU:
3580 rc = decode_imm(ctxt, &c->src, imm_size(c), false); 3550 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
3581 break; 3551 break;
3582 case SrcImmByte: 3552 case SrcImmByte:
3583 rc = decode_imm(ctxt, &c->src, 1, true); 3553 rc = decode_imm(ctxt, &ctxt->src, 1, true);
3584 break; 3554 break;
3585 case SrcImmUByte: 3555 case SrcImmUByte:
3586 rc = decode_imm(ctxt, &c->src, 1, false); 3556 rc = decode_imm(ctxt, &ctxt->src, 1, false);
3587 break; 3557 break;
3588 case SrcAcc: 3558 case SrcAcc:
3589 c->src.type = OP_REG; 3559 ctxt->src.type = OP_REG;
3590 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3560 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3591 c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; 3561 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3592 fetch_register_operand(&c->src); 3562 fetch_register_operand(&ctxt->src);
3593 break; 3563 break;
3594 case SrcOne: 3564 case SrcOne:
3595 c->src.bytes = 1; 3565 ctxt->src.bytes = 1;
3596 c->src.val = 1; 3566 ctxt->src.val = 1;
3597 break; 3567 break;
3598 case SrcSI: 3568 case SrcSI:
3599 c->src.type = OP_MEM; 3569 ctxt->src.type = OP_MEM;
3600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3570 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3601 c->src.addr.mem.ea = 3571 ctxt->src.addr.mem.ea =
3602 register_address(c, c->regs[VCPU_REGS_RSI]); 3572 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
3603 c->src.addr.mem.seg = seg_override(ctxt, c); 3573 ctxt->src.addr.mem.seg = seg_override(ctxt);
3604 c->src.val = 0; 3574 ctxt->src.val = 0;
3605 break; 3575 break;
3606 case SrcImmFAddr: 3576 case SrcImmFAddr:
3607 c->src.type = OP_IMM; 3577 ctxt->src.type = OP_IMM;
3608 c->src.addr.mem.ea = c->eip; 3578 ctxt->src.addr.mem.ea = ctxt->_eip;
3609 c->src.bytes = c->op_bytes + 2; 3579 ctxt->src.bytes = ctxt->op_bytes + 2;
3610 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 3580 insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
3611 break; 3581 break;
3612 case SrcMemFAddr: 3582 case SrcMemFAddr:
3613 memop.bytes = c->op_bytes + 2; 3583 memop.bytes = ctxt->op_bytes + 2;
3614 goto srcmem_common; 3584 goto srcmem_common;
3615 break; 3585 break;
3616 case SrcDX: 3586 case SrcDX:
3617 c->src.type = OP_REG; 3587 ctxt->src.type = OP_REG;
3618 c->src.bytes = 2; 3588 ctxt->src.bytes = 2;
3619 c->src.addr.reg = &c->regs[VCPU_REGS_RDX]; 3589 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3620 fetch_register_operand(&c->src); 3590 fetch_register_operand(&ctxt->src);
3621 break; 3591 break;
3622 } 3592 }
3623 3593
@@ -3628,22 +3598,22 @@ done_prefixes:
3628 * Decode and fetch the second source operand: register, memory 3598 * Decode and fetch the second source operand: register, memory
3629 * or immediate. 3599 * or immediate.
3630 */ 3600 */
3631 switch (c->d & Src2Mask) { 3601 switch (ctxt->d & Src2Mask) {
3632 case Src2None: 3602 case Src2None:
3633 break; 3603 break;
3634 case Src2CL: 3604 case Src2CL:
3635 c->src2.bytes = 1; 3605 ctxt->src2.bytes = 1;
3636 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; 3606 ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
3637 break; 3607 break;
3638 case Src2ImmByte: 3608 case Src2ImmByte:
3639 rc = decode_imm(ctxt, &c->src2, 1, true); 3609 rc = decode_imm(ctxt, &ctxt->src2, 1, true);
3640 break; 3610 break;
3641 case Src2One: 3611 case Src2One:
3642 c->src2.bytes = 1; 3612 ctxt->src2.bytes = 1;
3643 c->src2.val = 1; 3613 ctxt->src2.val = 1;
3644 break; 3614 break;
3645 case Src2Imm: 3615 case Src2Imm:
3646 rc = decode_imm(ctxt, &c->src2, imm_size(c), true); 3616 rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
3647 break; 3617 break;
3648 } 3618 }
3649 3619
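One behavioural change is folded into the hunk above: the Src2CL case used to mask RCX with 0x8, keeping only bit 3 of the count, whereas the new code masks with 0xff so that src2 really carries CL, the low byte of RCX that CL-count instructions (the SHLD/SHRD-style encodings decoded with Src2CL) consume. A small sketch of why the old mask loses information (plain C):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rcx = 0x1234567890abcd05ull; /* guest RCX; CL == 0x05 */

	uint64_t old_src2 = rcx & 0x8;  /* old code: 0x0 -- only bit 3 survives */
	uint64_t new_src2 = rcx & 0xff; /* new code: 0x5 -- the real CL value   */

	printf("old=%#llx new=%#llx\n",
	       (unsigned long long)old_src2, (unsigned long long)new_src2);
	return 0;
}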
@@ -3651,68 +3621,66 @@ done_prefixes:
3651 goto done; 3621 goto done;
3652 3622
3653 /* Decode and fetch the destination operand: register or memory. */ 3623 /* Decode and fetch the destination operand: register or memory. */
3654 switch (c->d & DstMask) { 3624 switch (ctxt->d & DstMask) {
3655 case DstReg: 3625 case DstReg:
3656 decode_register_operand(ctxt, &c->dst, c, 3626 decode_register_operand(ctxt, &ctxt->dst,
3657 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 3627 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3658 break; 3628 break;
3659 case DstImmUByte: 3629 case DstImmUByte:
3660 c->dst.type = OP_IMM; 3630 ctxt->dst.type = OP_IMM;
3661 c->dst.addr.mem.ea = c->eip; 3631 ctxt->dst.addr.mem.ea = ctxt->_eip;
3662 c->dst.bytes = 1; 3632 ctxt->dst.bytes = 1;
3663 c->dst.val = insn_fetch(u8, 1, c->eip); 3633 ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
3664 break; 3634 break;
3665 case DstMem: 3635 case DstMem:
3666 case DstMem64: 3636 case DstMem64:
3667 c->dst = memop; 3637 ctxt->dst = memop;
3668 memopp = &c->dst; 3638 memopp = &ctxt->dst;
3669 if ((c->d & DstMask) == DstMem64) 3639 if ((ctxt->d & DstMask) == DstMem64)
3670 c->dst.bytes = 8; 3640 ctxt->dst.bytes = 8;
3671 else 3641 else
3672 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3642 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3673 if (c->d & BitOp) 3643 if (ctxt->d & BitOp)
3674 fetch_bit_operand(c); 3644 fetch_bit_operand(ctxt);
3675 c->dst.orig_val = c->dst.val; 3645 ctxt->dst.orig_val = ctxt->dst.val;
3676 break; 3646 break;
3677 case DstAcc: 3647 case DstAcc:
3678 c->dst.type = OP_REG; 3648 ctxt->dst.type = OP_REG;
3679 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3649 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3680 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; 3650 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3681 fetch_register_operand(&c->dst); 3651 fetch_register_operand(&ctxt->dst);
3682 c->dst.orig_val = c->dst.val; 3652 ctxt->dst.orig_val = ctxt->dst.val;
3683 break; 3653 break;
3684 case DstDI: 3654 case DstDI:
3685 c->dst.type = OP_MEM; 3655 ctxt->dst.type = OP_MEM;
3686 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3656 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3687 c->dst.addr.mem.ea = 3657 ctxt->dst.addr.mem.ea =
3688 register_address(c, c->regs[VCPU_REGS_RDI]); 3658 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
3689 c->dst.addr.mem.seg = VCPU_SREG_ES; 3659 ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
3690 c->dst.val = 0; 3660 ctxt->dst.val = 0;
3691 break; 3661 break;
3692 case DstDX: 3662 case DstDX:
3693 c->dst.type = OP_REG; 3663 ctxt->dst.type = OP_REG;
3694 c->dst.bytes = 2; 3664 ctxt->dst.bytes = 2;
3695 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; 3665 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3696 fetch_register_operand(&c->dst); 3666 fetch_register_operand(&ctxt->dst);
3697 break; 3667 break;
3698 case ImplicitOps: 3668 case ImplicitOps:
3699 /* Special instructions do their own operand decoding. */ 3669 /* Special instructions do their own operand decoding. */
3700 default: 3670 default:
3701 c->dst.type = OP_NONE; /* Disable writeback. */ 3671 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3702 break; 3672 break;
3703 } 3673 }
3704 3674
3705done: 3675done:
3706 if (memopp && memopp->type == OP_MEM && c->rip_relative) 3676 if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
3707 memopp->addr.mem.ea += c->eip; 3677 memopp->addr.mem.ea += ctxt->_eip;
3708 3678
3709 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3679 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3710} 3680}
3711 3681
3712static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3682static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3713{ 3683{
3714 struct decode_cache *c = &ctxt->decode;
3715
3716 /* The second termination condition only applies for REPE 3684 /* The second termination condition only applies for REPE
3717 * and REPNE. Test if the repeat string operation prefix is 3685 * and REPNE. Test if the repeat string operation prefix is
3718 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the 3686 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
@@ -3720,304 +3688,232 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3720 * - if REPE/REPZ and ZF = 0 then done 3688 * - if REPE/REPZ and ZF = 0 then done
3721 * - if REPNE/REPNZ and ZF = 1 then done 3689 * - if REPNE/REPNZ and ZF = 1 then done
3722 */ 3690 */
3723 if (((c->b == 0xa6) || (c->b == 0xa7) || 3691 if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
3724 (c->b == 0xae) || (c->b == 0xaf)) 3692 (ctxt->b == 0xae) || (ctxt->b == 0xaf))
3725 && (((c->rep_prefix == REPE_PREFIX) && 3693 && (((ctxt->rep_prefix == REPE_PREFIX) &&
3726 ((ctxt->eflags & EFLG_ZF) == 0)) 3694 ((ctxt->eflags & EFLG_ZF) == 0))
3727 || ((c->rep_prefix == REPNE_PREFIX) && 3695 || ((ctxt->rep_prefix == REPNE_PREFIX) &&
3728 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) 3696 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3729 return true; 3697 return true;
3730 3698
3731 return false; 3699 return false;
3732} 3700}
3733 3701
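string_insn_completed() above encodes the extra termination rule for CMPS (opcodes 0xa6/0xa7) and SCAS (0xae/0xaf): besides the count in RCX reaching zero, REPE/REPZ stops as soon as ZF is cleared and REPNE/REPNZ stops as soon as ZF is set. A compact sketch of that rule; the constants below are illustrative stand-ins for the emulator's REPE_PREFIX, REPNE_PREFIX and EFLG_ZF:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_REPE  0xf3 /* REP/REPE/REPZ prefix byte */
#define SKETCH_REPNE 0xf2 /* REPNE/REPNZ prefix byte   */
#define SKETCH_ZF    (1u << 6)

/* Second termination condition for REP CMPS/SCAS. */
static bool rep_string_done(uint8_t rep_prefix, uint32_t eflags)
{
	bool zf = (eflags & SKETCH_ZF) != 0;

	if (rep_prefix == SKETCH_REPE)
		return !zf; /* REPE/REPZ: stop on the first mismatch */
	if (rep_prefix == SKETCH_REPNE)
		return zf;  /* REPNE/REPNZ: stop on the first match  */
	return false;
}

int main(void)
{
	printf("%d\n", rep_string_done(SKETCH_REPE, 0));          /* 1: mismatch ends REPE */
	printf("%d\n", rep_string_done(SKETCH_REPE, SKETCH_ZF));  /* 0: keep comparing     */
	printf("%d\n", rep_string_done(SKETCH_REPNE, SKETCH_ZF)); /* 1: match ends REPNE   */
	return 0;
}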
3734int 3702int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3735x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3736{ 3703{
3737 struct x86_emulate_ops *ops = ctxt->ops; 3704 struct x86_emulate_ops *ops = ctxt->ops;
3738 u64 msr_data; 3705 u64 msr_data;
3739 struct decode_cache *c = &ctxt->decode;
3740 int rc = X86EMUL_CONTINUE; 3706 int rc = X86EMUL_CONTINUE;
3741 int saved_dst_type = c->dst.type; 3707 int saved_dst_type = ctxt->dst.type;
3742 int irq; /* Used for int 3, int, and into */
3743 3708
3744 ctxt->decode.mem_read.pos = 0; 3709 ctxt->mem_read.pos = 0;
3745 3710
3746 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 3711 if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
3747 rc = emulate_ud(ctxt); 3712 rc = emulate_ud(ctxt);
3748 goto done; 3713 goto done;
3749 } 3714 }
3750 3715
3751 /* LOCK prefix is allowed only with some instructions */ 3716 /* LOCK prefix is allowed only with some instructions */
3752 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 3717 if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
3753 rc = emulate_ud(ctxt); 3718 rc = emulate_ud(ctxt);
3754 goto done; 3719 goto done;
3755 } 3720 }
3756 3721
3757 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 3722 if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
3758 rc = emulate_ud(ctxt); 3723 rc = emulate_ud(ctxt);
3759 goto done; 3724 goto done;
3760 } 3725 }
3761 3726
3762 if ((c->d & Sse) 3727 if ((ctxt->d & Sse)
3763 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) 3728 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
3764 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { 3729 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
3765 rc = emulate_ud(ctxt); 3730 rc = emulate_ud(ctxt);
3766 goto done; 3731 goto done;
3767 } 3732 }
3768 3733
3769 if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 3734 if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
3770 rc = emulate_nm(ctxt); 3735 rc = emulate_nm(ctxt);
3771 goto done; 3736 goto done;
3772 } 3737 }
3773 3738
3774 if (unlikely(ctxt->guest_mode) && c->intercept) { 3739 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3775 rc = emulator_check_intercept(ctxt, c->intercept, 3740 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3776 X86_ICPT_PRE_EXCEPT); 3741 X86_ICPT_PRE_EXCEPT);
3777 if (rc != X86EMUL_CONTINUE) 3742 if (rc != X86EMUL_CONTINUE)
3778 goto done; 3743 goto done;
3779 } 3744 }
3780 3745
 3781 /* Privileged instructions can be executed only in CPL=0 */ 3746 /* Privileged instructions can be executed only in CPL=0 */
3782 if ((c->d & Priv) && ops->cpl(ctxt)) { 3747 if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
3783 rc = emulate_gp(ctxt, 0); 3748 rc = emulate_gp(ctxt, 0);
3784 goto done; 3749 goto done;
3785 } 3750 }
3786 3751
3787 /* Instruction can only be executed in protected mode */ 3752 /* Instruction can only be executed in protected mode */
3788 if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { 3753 if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
3789 rc = emulate_ud(ctxt); 3754 rc = emulate_ud(ctxt);
3790 goto done; 3755 goto done;
3791 } 3756 }
3792 3757
3793 /* Do instruction specific permission checks */ 3758 /* Do instruction specific permission checks */
3794 if (c->check_perm) { 3759 if (ctxt->check_perm) {
3795 rc = c->check_perm(ctxt); 3760 rc = ctxt->check_perm(ctxt);
3796 if (rc != X86EMUL_CONTINUE) 3761 if (rc != X86EMUL_CONTINUE)
3797 goto done; 3762 goto done;
3798 } 3763 }
3799 3764
3800 if (unlikely(ctxt->guest_mode) && c->intercept) { 3765 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3801 rc = emulator_check_intercept(ctxt, c->intercept, 3766 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3802 X86_ICPT_POST_EXCEPT); 3767 X86_ICPT_POST_EXCEPT);
3803 if (rc != X86EMUL_CONTINUE) 3768 if (rc != X86EMUL_CONTINUE)
3804 goto done; 3769 goto done;
3805 } 3770 }
3806 3771
3807 if (c->rep_prefix && (c->d & String)) { 3772 if (ctxt->rep_prefix && (ctxt->d & String)) {
3808 /* All REP prefixes have the same first termination condition */ 3773 /* All REP prefixes have the same first termination condition */
3809 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3774 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
3810 ctxt->eip = c->eip; 3775 ctxt->eip = ctxt->_eip;
3811 goto done; 3776 goto done;
3812 } 3777 }
3813 } 3778 }
3814 3779
3815 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 3780 if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
3816 rc = segmented_read(ctxt, c->src.addr.mem, 3781 rc = segmented_read(ctxt, ctxt->src.addr.mem,
3817 c->src.valptr, c->src.bytes); 3782 ctxt->src.valptr, ctxt->src.bytes);
3818 if (rc != X86EMUL_CONTINUE) 3783 if (rc != X86EMUL_CONTINUE)
3819 goto done; 3784 goto done;
3820 c->src.orig_val64 = c->src.val64; 3785 ctxt->src.orig_val64 = ctxt->src.val64;
3821 } 3786 }
3822 3787
3823 if (c->src2.type == OP_MEM) { 3788 if (ctxt->src2.type == OP_MEM) {
3824 rc = segmented_read(ctxt, c->src2.addr.mem, 3789 rc = segmented_read(ctxt, ctxt->src2.addr.mem,
3825 &c->src2.val, c->src2.bytes); 3790 &ctxt->src2.val, ctxt->src2.bytes);
3826 if (rc != X86EMUL_CONTINUE) 3791 if (rc != X86EMUL_CONTINUE)
3827 goto done; 3792 goto done;
3828 } 3793 }
3829 3794
3830 if ((c->d & DstMask) == ImplicitOps) 3795 if ((ctxt->d & DstMask) == ImplicitOps)
3831 goto special_insn; 3796 goto special_insn;
3832 3797
3833 3798
3834 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3799 if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
3835 /* optimisation - avoid slow emulated read if Mov */ 3800 /* optimisation - avoid slow emulated read if Mov */
3836 rc = segmented_read(ctxt, c->dst.addr.mem, 3801 rc = segmented_read(ctxt, ctxt->dst.addr.mem,
3837 &c->dst.val, c->dst.bytes); 3802 &ctxt->dst.val, ctxt->dst.bytes);
3838 if (rc != X86EMUL_CONTINUE) 3803 if (rc != X86EMUL_CONTINUE)
3839 goto done; 3804 goto done;
3840 } 3805 }
3841 c->dst.orig_val = c->dst.val; 3806 ctxt->dst.orig_val = ctxt->dst.val;
3842 3807
3843special_insn: 3808special_insn:
3844 3809
3845 if (unlikely(ctxt->guest_mode) && c->intercept) { 3810 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3846 rc = emulator_check_intercept(ctxt, c->intercept, 3811 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3847 X86_ICPT_POST_MEMACCESS); 3812 X86_ICPT_POST_MEMACCESS);
3848 if (rc != X86EMUL_CONTINUE) 3813 if (rc != X86EMUL_CONTINUE)
3849 goto done; 3814 goto done;
3850 } 3815 }
3851 3816
3852 if (c->execute) { 3817 if (ctxt->execute) {
3853 rc = c->execute(ctxt); 3818 rc = ctxt->execute(ctxt);
3854 if (rc != X86EMUL_CONTINUE) 3819 if (rc != X86EMUL_CONTINUE)
3855 goto done; 3820 goto done;
3856 goto writeback; 3821 goto writeback;
3857 } 3822 }
3858 3823
3859 if (c->twobyte) 3824 if (ctxt->twobyte)
3860 goto twobyte_insn; 3825 goto twobyte_insn;
3861 3826
3862 switch (c->b) { 3827 switch (ctxt->b) {
3863 case 0x06: /* push es */ 3828 case 0x06: /* push es */
3864 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 3829 rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
3865 break; 3830 break;
3866 case 0x07: /* pop es */ 3831 case 0x07: /* pop es */
3867 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3832 rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
3868 break; 3833 break;
3869 case 0x0e: /* push cs */ 3834 case 0x0e: /* push cs */
3870 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 3835 rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
3871 break; 3836 break;
3872 case 0x16: /* push ss */ 3837 case 0x16: /* push ss */
3873 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 3838 rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
3874 break; 3839 break;
3875 case 0x17: /* pop ss */ 3840 case 0x17: /* pop ss */
3876 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3841 rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
3877 break; 3842 break;
3878 case 0x1e: /* push ds */ 3843 case 0x1e: /* push ds */
3879 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 3844 rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
3880 break; 3845 break;
3881 case 0x1f: /* pop ds */ 3846 case 0x1f: /* pop ds */
3882 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3847 rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
3883 break; 3848 break;
3884 case 0x40 ... 0x47: /* inc r16/r32 */ 3849 case 0x40 ... 0x47: /* inc r16/r32 */
3885 emulate_1op("inc", c->dst, ctxt->eflags); 3850 emulate_1op("inc", ctxt->dst, ctxt->eflags);
3886 break; 3851 break;
3887 case 0x48 ... 0x4f: /* dec r16/r32 */ 3852 case 0x48 ... 0x4f: /* dec r16/r32 */
3888 emulate_1op("dec", c->dst, ctxt->eflags); 3853 emulate_1op("dec", ctxt->dst, ctxt->eflags);
3889 break; 3854 break;
3890 case 0x63: /* movsxd */ 3855 case 0x63: /* movsxd */
3891 if (ctxt->mode != X86EMUL_MODE_PROT64) 3856 if (ctxt->mode != X86EMUL_MODE_PROT64)
3892 goto cannot_emulate; 3857 goto cannot_emulate;
3893 c->dst.val = (s32) c->src.val; 3858 ctxt->dst.val = (s32) ctxt->src.val;
3894 break; 3859 break;
3895 case 0x6c: /* insb */ 3860 case 0x6c: /* insb */
3896 case 0x6d: /* insw/insd */ 3861 case 0x6d: /* insw/insd */
3897 c->src.val = c->regs[VCPU_REGS_RDX]; 3862 ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
3898 goto do_io_in; 3863 goto do_io_in;
3899 case 0x6e: /* outsb */ 3864 case 0x6e: /* outsb */
3900 case 0x6f: /* outsw/outsd */ 3865 case 0x6f: /* outsw/outsd */
3901 c->dst.val = c->regs[VCPU_REGS_RDX]; 3866 ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
3902 goto do_io_out; 3867 goto do_io_out;
3903 break; 3868 break;
3904 case 0x70 ... 0x7f: /* jcc (short) */ 3869 case 0x70 ... 0x7f: /* jcc (short) */
3905 if (test_cc(c->b, ctxt->eflags)) 3870 if (test_cc(ctxt->b, ctxt->eflags))
3906 jmp_rel(c, c->src.val); 3871 jmp_rel(ctxt, ctxt->src.val);
3907 break;
3908 case 0x84 ... 0x85:
3909 test:
3910 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
3911 break;
3912 case 0x86 ... 0x87: /* xchg */
3913 xchg:
3914 /* Write back the register source. */
3915 c->src.val = c->dst.val;
3916 write_register_operand(&c->src);
3917 /*
3918 * Write back the memory destination with implicit LOCK
3919 * prefix.
3920 */
3921 c->dst.val = c->src.orig_val;
3922 c->lock_prefix = 1;
3923 break;
3924 case 0x8c: /* mov r/m, sreg */
3925 if (c->modrm_reg > VCPU_SREG_GS) {
3926 rc = emulate_ud(ctxt);
3927 goto done;
3928 }
3929 c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
3930 break; 3872 break;
3931 case 0x8d: /* lea r16/r32, m */ 3873 case 0x8d: /* lea r16/r32, m */
3932 c->dst.val = c->src.addr.mem.ea; 3874 ctxt->dst.val = ctxt->src.addr.mem.ea;
3933 break; 3875 break;
3934 case 0x8e: { /* mov seg, r/m16 */
3935 uint16_t sel;
3936
3937 sel = c->src.val;
3938
3939 if (c->modrm_reg == VCPU_SREG_CS ||
3940 c->modrm_reg > VCPU_SREG_GS) {
3941 rc = emulate_ud(ctxt);
3942 goto done;
3943 }
3944
3945 if (c->modrm_reg == VCPU_SREG_SS)
3946 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
3947
3948 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
3949
3950 c->dst.type = OP_NONE; /* Disable writeback. */
3951 break;
3952 }
3953 case 0x8f: /* pop (sole member of Grp1a) */ 3876 case 0x8f: /* pop (sole member of Grp1a) */
3954 rc = em_grp1a(ctxt); 3877 rc = em_grp1a(ctxt);
3955 break; 3878 break;
3956 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 3879 case 0x90 ... 0x97: /* nop / xchg reg, rax */
3957 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) 3880 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
3958 break; 3881 break;
3959 goto xchg; 3882 rc = em_xchg(ctxt);
3883 break;
3960 case 0x98: /* cbw/cwde/cdqe */ 3884 case 0x98: /* cbw/cwde/cdqe */
3961 switch (c->op_bytes) { 3885 switch (ctxt->op_bytes) {
3962 case 2: c->dst.val = (s8)c->dst.val; break; 3886 case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
3963 case 4: c->dst.val = (s16)c->dst.val; break; 3887 case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
3964 case 8: c->dst.val = (s32)c->dst.val; break; 3888 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
3965 } 3889 }
3966 break; 3890 break;
3967 case 0xa8 ... 0xa9: /* test ax, imm */
3968 goto test;
3969 case 0xc0 ... 0xc1: 3891 case 0xc0 ... 0xc1:
3970 rc = em_grp2(ctxt); 3892 rc = em_grp2(ctxt);
3971 break; 3893 break;
3972 case 0xc3: /* ret */
3973 c->dst.type = OP_REG;
3974 c->dst.addr.reg = &c->eip;
3975 c->dst.bytes = c->op_bytes;
3976 rc = em_pop(ctxt);
3977 break;
3978 case 0xc4: /* les */ 3894 case 0xc4: /* les */
3979 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); 3895 rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
3980 break; 3896 break;
3981 case 0xc5: /* lds */ 3897 case 0xc5: /* lds */
3982 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); 3898 rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
3983 break;
3984 case 0xcb: /* ret far */
3985 rc = emulate_ret_far(ctxt, ops);
3986 break; 3899 break;
3987 case 0xcc: /* int3 */ 3900 case 0xcc: /* int3 */
3988 irq = 3; 3901 rc = emulate_int(ctxt, 3);
3989 goto do_interrupt; 3902 break;
3990 case 0xcd: /* int n */ 3903 case 0xcd: /* int n */
3991 irq = c->src.val; 3904 rc = emulate_int(ctxt, ctxt->src.val);
3992 do_interrupt:
3993 rc = emulate_int(ctxt, ops, irq);
3994 break; 3905 break;
3995 case 0xce: /* into */ 3906 case 0xce: /* into */
3996 if (ctxt->eflags & EFLG_OF) { 3907 if (ctxt->eflags & EFLG_OF)
3997 irq = 4; 3908 rc = emulate_int(ctxt, 4);
3998 goto do_interrupt;
3999 }
4000 break;
4001 case 0xcf: /* iret */
4002 rc = emulate_iret(ctxt, ops);
4003 break; 3909 break;
4004 case 0xd0 ... 0xd1: /* Grp2 */ 3910 case 0xd0 ... 0xd1: /* Grp2 */
4005 rc = em_grp2(ctxt); 3911 rc = em_grp2(ctxt);
4006 break; 3912 break;
4007 case 0xd2 ... 0xd3: /* Grp2 */ 3913 case 0xd2 ... 0xd3: /* Grp2 */
4008 c->src.val = c->regs[VCPU_REGS_RCX]; 3914 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
4009 rc = em_grp2(ctxt); 3915 rc = em_grp2(ctxt);
4010 break; 3916 break;
4011 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
4012 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
4013 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
4014 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
4015 jmp_rel(c, c->src.val);
4016 break;
4017 case 0xe3: /* jcxz/jecxz/jrcxz */
4018 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
4019 jmp_rel(c, c->src.val);
4020 break;
4021 case 0xe4: /* inb */ 3917 case 0xe4: /* inb */
4022 case 0xe5: /* in */ 3918 case 0xe5: /* in */
4023 goto do_io_in; 3919 goto do_io_in;
@@ -4025,35 +3921,30 @@ special_insn:
4025 case 0xe7: /* out */ 3921 case 0xe7: /* out */
4026 goto do_io_out; 3922 goto do_io_out;
4027 case 0xe8: /* call (near) */ { 3923 case 0xe8: /* call (near) */ {
4028 long int rel = c->src.val; 3924 long int rel = ctxt->src.val;
4029 c->src.val = (unsigned long) c->eip; 3925 ctxt->src.val = (unsigned long) ctxt->_eip;
4030 jmp_rel(c, rel); 3926 jmp_rel(ctxt, rel);
4031 rc = em_push(ctxt); 3927 rc = em_push(ctxt);
4032 break; 3928 break;
4033 } 3929 }
4034 case 0xe9: /* jmp rel */ 3930 case 0xe9: /* jmp rel */
4035 goto jmp; 3931 case 0xeb: /* jmp rel short */
4036 case 0xea: /* jmp far */ 3932 jmp_rel(ctxt, ctxt->src.val);
4037 rc = em_jmp_far(ctxt); 3933 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4038 break;
4039 case 0xeb:
4040 jmp: /* jmp rel short */
4041 jmp_rel(c, c->src.val);
4042 c->dst.type = OP_NONE; /* Disable writeback. */
4043 break; 3934 break;
4044 case 0xec: /* in al,dx */ 3935 case 0xec: /* in al,dx */
4045 case 0xed: /* in (e/r)ax,dx */ 3936 case 0xed: /* in (e/r)ax,dx */
4046 do_io_in: 3937 do_io_in:
4047 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3938 if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
4048 &c->dst.val)) 3939 &ctxt->dst.val))
4049 goto done; /* IO is needed */ 3940 goto done; /* IO is needed */
4050 break; 3941 break;
4051 case 0xee: /* out dx,al */ 3942 case 0xee: /* out dx,al */
4052 case 0xef: /* out dx,(e/r)ax */ 3943 case 0xef: /* out dx,(e/r)ax */
4053 do_io_out: 3944 do_io_out:
4054 ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, 3945 ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
4055 &c->src.val, 1); 3946 &ctxt->src.val, 1);
4056 c->dst.type = OP_NONE; /* Disable writeback. */ 3947 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4057 break; 3948 break;
4058 case 0xf4: /* hlt */ 3949 case 0xf4: /* hlt */
4059 ctxt->ops->halt(ctxt); 3950 ctxt->ops->halt(ctxt);
@@ -4071,22 +3962,6 @@ special_insn:
4071 case 0xf9: /* stc */ 3962 case 0xf9: /* stc */
4072 ctxt->eflags |= EFLG_CF; 3963 ctxt->eflags |= EFLG_CF;
4073 break; 3964 break;
4074 case 0xfa: /* cli */
4075 if (emulator_bad_iopl(ctxt, ops)) {
4076 rc = emulate_gp(ctxt, 0);
4077 goto done;
4078 } else
4079 ctxt->eflags &= ~X86_EFLAGS_IF;
4080 break;
4081 case 0xfb: /* sti */
4082 if (emulator_bad_iopl(ctxt, ops)) {
4083 rc = emulate_gp(ctxt, 0);
4084 goto done;
4085 } else {
4086 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
4087 ctxt->eflags |= X86_EFLAGS_IF;
4088 }
4089 break;
4090 case 0xfc: /* cld */ 3965 case 0xfc: /* cld */
4091 ctxt->eflags &= ~EFLG_DF; 3966 ctxt->eflags &= ~EFLG_DF;
4092 break; 3967 break;
@@ -4115,40 +3990,40 @@ writeback:
4115 * restore dst type in case the decoding will be reused 3990 * restore dst type in case the decoding will be reused
4116 	 * (happens for string instructions) 			 3991 	 * (happens for string instructions)
4117 */ 3992 */
4118 c->dst.type = saved_dst_type; 3993 ctxt->dst.type = saved_dst_type;
4119 3994
4120 if ((c->d & SrcMask) == SrcSI) 3995 if ((ctxt->d & SrcMask) == SrcSI)
4121 string_addr_inc(ctxt, seg_override(ctxt, c), 3996 string_addr_inc(ctxt, seg_override(ctxt),
4122 VCPU_REGS_RSI, &c->src); 3997 VCPU_REGS_RSI, &ctxt->src);
4123 3998
4124 if ((c->d & DstMask) == DstDI) 3999 if ((ctxt->d & DstMask) == DstDI)
4125 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, 4000 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
4126 &c->dst); 4001 &ctxt->dst);
4127 4002
4128 if (c->rep_prefix && (c->d & String)) { 4003 if (ctxt->rep_prefix && (ctxt->d & String)) {
4129 struct read_cache *r = &ctxt->decode.io_read; 4004 struct read_cache *r = &ctxt->io_read;
4130 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 4005 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
4131 4006
4132 if (!string_insn_completed(ctxt)) { 4007 if (!string_insn_completed(ctxt)) {
4133 /* 4008 /*
4134 * Re-enter guest when pio read ahead buffer is empty 4009 * Re-enter guest when pio read ahead buffer is empty
4135 	 * or, if it is not used, after every 1024 iterations. 	 4010 	 * or, if it is not used, after every 1024 iterations.
4136 */ 4011 */
4137 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && 4012 if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
4138 (r->end == 0 || r->end != r->pos)) { 4013 (r->end == 0 || r->end != r->pos)) {
4139 /* 4014 /*
4140 * Reset read cache. Usually happens before 4015 * Reset read cache. Usually happens before
4141 	 * decode, but since the instruction is restarted 		 4016 	 * decode, but since the instruction is restarted
4142 * we have to do it here. 4017 * we have to do it here.
4143 */ 4018 */
4144 ctxt->decode.mem_read.end = 0; 4019 ctxt->mem_read.end = 0;
4145 return EMULATION_RESTART; 4020 return EMULATION_RESTART;
4146 } 4021 }
4147 goto done; /* skip rip writeback */ 4022 goto done; /* skip rip writeback */
4148 } 4023 }
4149 } 4024 }
4150 4025
4151 ctxt->eip = c->eip; 4026 ctxt->eip = ctxt->_eip;
4152 4027
4153done: 4028done:
4154 if (rc == X86EMUL_PROPAGATE_FAULT) 4029 if (rc == X86EMUL_PROPAGATE_FAULT)
@@ -4159,13 +4034,7 @@ done:
4159 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4034 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
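
The writeback path above only re-enters the guest for a REP-prefixed string instruction when the pio read-ahead buffer has been drained or, if no buffer is in use, once every 1024 iterations; the RCX & 0x3ff test implements that throttle. A minimal userspace sketch of the same condition follows (the struct and names are illustrative, not the emulator's own):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the emulator's pio read-ahead cache. */
struct read_cache_sketch {
    unsigned end;   /* 0 means no read-ahead buffer is in use */
    unsigned pos;
};

/* true: keep emulating iterations in the host (EMULATION_RESTART),
 * false: re-enter the guest.  Mirrors the writeback-path condition. */
static bool keep_emulating(uint64_t rcx, const struct read_cache_sketch *r)
{
    return (r->end != 0 || (rcx & 0x3ff)) &&
           (r->end == 0 || r->end != r->pos);
}

int main(void)
{
    struct read_cache_sketch r = { 0, 0 };
    printf("rcx=5:    %d\n", keep_emulating(5, &r));    /* 1: keep emulating */
    printf("rcx=1024: %d\n", keep_emulating(1024, &r)); /* 0: re-enter guest */
    return 0;
}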
4160 4035
4161twobyte_insn: 4036twobyte_insn:
4162 switch (c->b) { 4037 switch (ctxt->b) {
4163 case 0x05: /* syscall */
4164 rc = emulate_syscall(ctxt, ops);
4165 break;
4166 case 0x06:
4167 rc = em_clts(ctxt);
4168 break;
4169 case 0x09: /* wbinvd */ 4038 case 0x09: /* wbinvd */
4170 (ctxt->ops->wbinvd)(ctxt); 4039 (ctxt->ops->wbinvd)(ctxt);
4171 break; 4040 break;
@@ -4174,21 +4043,21 @@ twobyte_insn:
4174 case 0x18: /* Grp16 (prefetch/nop) */ 4043 case 0x18: /* Grp16 (prefetch/nop) */
4175 break; 4044 break;
4176 case 0x20: /* mov cr, reg */ 4045 case 0x20: /* mov cr, reg */
4177 c->dst.val = ops->get_cr(ctxt, c->modrm_reg); 4046 ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
4178 break; 4047 break;
4179 case 0x21: /* mov from dr to reg */ 4048 case 0x21: /* mov from dr to reg */
4180 ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); 4049 ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
4181 break; 4050 break;
4182 case 0x22: /* mov reg, cr */ 4051 case 0x22: /* mov reg, cr */
4183 if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { 4052 if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
4184 emulate_gp(ctxt, 0); 4053 emulate_gp(ctxt, 0);
4185 rc = X86EMUL_PROPAGATE_FAULT; 4054 rc = X86EMUL_PROPAGATE_FAULT;
4186 goto done; 4055 goto done;
4187 } 4056 }
4188 c->dst.type = OP_NONE; 4057 ctxt->dst.type = OP_NONE;
4189 break; 4058 break;
4190 case 0x23: /* mov from reg to dr */ 4059 case 0x23: /* mov from reg to dr */
4191 if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & 4060 if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
4192 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 4061 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
4193 ~0ULL : ~0U)) < 0) { 4062 ~0ULL : ~0U)) < 0) {
4194 /* #UD condition is already handled by the code above */ 4063 /* #UD condition is already handled by the code above */
@@ -4197,13 +4066,13 @@ twobyte_insn:
4197 goto done; 4066 goto done;
4198 } 4067 }
4199 4068
4200 c->dst.type = OP_NONE; /* no writeback */ 4069 ctxt->dst.type = OP_NONE; /* no writeback */
4201 break; 4070 break;
4202 case 0x30: 4071 case 0x30:
4203 /* wrmsr */ 4072 /* wrmsr */
4204 msr_data = (u32)c->regs[VCPU_REGS_RAX] 4073 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
4205 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 4074 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
4206 if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { 4075 if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
4207 emulate_gp(ctxt, 0); 4076 emulate_gp(ctxt, 0);
4208 rc = X86EMUL_PROPAGATE_FAULT; 4077 rc = X86EMUL_PROPAGATE_FAULT;
4209 goto done; 4078 goto done;
@@ -4212,64 +4081,58 @@ twobyte_insn:
4212 break; 4081 break;
4213 case 0x32: 4082 case 0x32:
4214 /* rdmsr */ 4083 /* rdmsr */
4215 if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { 4084 if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
4216 emulate_gp(ctxt, 0); 4085 emulate_gp(ctxt, 0);
4217 rc = X86EMUL_PROPAGATE_FAULT; 4086 rc = X86EMUL_PROPAGATE_FAULT;
4218 goto done; 4087 goto done;
4219 } else { 4088 } else {
4220 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 4089 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
4221 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 4090 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
4222 } 4091 }
4223 rc = X86EMUL_CONTINUE; 4092 rc = X86EMUL_CONTINUE;
4224 break; 4093 break;
4225 case 0x34: /* sysenter */
4226 rc = emulate_sysenter(ctxt, ops);
4227 break;
4228 case 0x35: /* sysexit */
4229 rc = emulate_sysexit(ctxt, ops);
4230 break;
4231 case 0x40 ... 0x4f: /* cmov */ 4094 case 0x40 ... 0x4f: /* cmov */
4232 c->dst.val = c->dst.orig_val = c->src.val; 4095 ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
4233 if (!test_cc(c->b, ctxt->eflags)) 4096 if (!test_cc(ctxt->b, ctxt->eflags))
4234 c->dst.type = OP_NONE; /* no writeback */ 4097 ctxt->dst.type = OP_NONE; /* no writeback */
4235 break; 4098 break;
4236 case 0x80 ... 0x8f: /* jnz rel, etc*/ 4099 case 0x80 ... 0x8f: /* jnz rel, etc*/
4237 if (test_cc(c->b, ctxt->eflags)) 4100 if (test_cc(ctxt->b, ctxt->eflags))
4238 jmp_rel(c, c->src.val); 4101 jmp_rel(ctxt, ctxt->src.val);
4239 break; 4102 break;
4240 case 0x90 ... 0x9f: /* setcc r/m8 */ 4103 case 0x90 ... 0x9f: /* setcc r/m8 */
4241 c->dst.val = test_cc(c->b, ctxt->eflags); 4104 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4242 break; 4105 break;
4243 case 0xa0: /* push fs */ 4106 case 0xa0: /* push fs */
4244 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 4107 rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
4245 break; 4108 break;
4246 case 0xa1: /* pop fs */ 4109 case 0xa1: /* pop fs */
4247 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 4110 rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
4248 break; 4111 break;
4249 case 0xa3: 4112 case 0xa3:
4250 bt: /* bt */ 4113 bt: /* bt */
4251 c->dst.type = OP_NONE; 4114 ctxt->dst.type = OP_NONE;
4252 /* only subword offset */ 4115 /* only subword offset */
4253 c->src.val &= (c->dst.bytes << 3) - 1; 4116 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
4254 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); 4117 emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
4255 break; 4118 break;
4256 case 0xa4: /* shld imm8, r, r/m */ 4119 case 0xa4: /* shld imm8, r, r/m */
4257 case 0xa5: /* shld cl, r, r/m */ 4120 case 0xa5: /* shld cl, r, r/m */
4258 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 4121 emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
4259 break; 4122 break;
4260 case 0xa8: /* push gs */ 4123 case 0xa8: /* push gs */
4261 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 4124 rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
4262 break; 4125 break;
4263 case 0xa9: /* pop gs */ 4126 case 0xa9: /* pop gs */
4264 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 4127 rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
4265 break; 4128 break;
4266 case 0xab: 4129 case 0xab:
4267 bts: /* bts */ 4130 bts: /* bts */
4268 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 4131 emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
4269 break; 4132 break;
4270 case 0xac: /* shrd imm8, r, r/m */ 4133 case 0xac: /* shrd imm8, r, r/m */
4271 case 0xad: /* shrd cl, r, r/m */ 4134 case 0xad: /* shrd cl, r, r/m */
4272 emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); 4135 emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
4273 break; 4136 break;
4274 case 0xae: /* clflush */ 4137 case 0xae: /* clflush */
4275 break; 4138 break;
@@ -4278,38 +4141,38 @@ twobyte_insn:
4278 * Save real source value, then compare EAX against 4141 * Save real source value, then compare EAX against
4279 * destination. 4142 * destination.
4280 */ 4143 */
4281 c->src.orig_val = c->src.val; 4144 ctxt->src.orig_val = ctxt->src.val;
4282 c->src.val = c->regs[VCPU_REGS_RAX]; 4145 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
4283 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); 4146 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
4284 if (ctxt->eflags & EFLG_ZF) { 4147 if (ctxt->eflags & EFLG_ZF) {
4285 /* Success: write back to memory. */ 4148 /* Success: write back to memory. */
4286 c->dst.val = c->src.orig_val; 4149 ctxt->dst.val = ctxt->src.orig_val;
4287 } else { 4150 } else {
4288 /* Failure: write the value we saw to EAX. */ 4151 /* Failure: write the value we saw to EAX. */
4289 c->dst.type = OP_REG; 4152 ctxt->dst.type = OP_REG;
4290 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 4153 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
4291 } 4154 }
4292 break; 4155 break;
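
The 0xb0/0xb1 handler above follows the architectural CMPXCHG contract: compare EAX with the destination, store the source on a match (ZF set), otherwise load the destination value into EAX. A hedged plain-C model of those semantics, separate from the emulator code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Software model of CMPXCHG r/m64, r64: returns the resulting ZF. */
static bool cmpxchg_model(uint64_t *rax, uint64_t *dst, uint64_t src)
{
    if (*rax == *dst) {     /* Success: write the source back to memory. */
        *dst = src;
        return true;        /* ZF set */
    }
    *rax = *dst;            /* Failure: write the value we saw to EAX. */
    return false;           /* ZF clear */
}

int main(void)
{
    uint64_t rax = 1, mem = 1;
    bool zf = cmpxchg_model(&rax, &mem, 42);
    printf("zf=%d mem=%llu rax=%llu\n", zf,
           (unsigned long long)mem, (unsigned long long)rax); /* zf=1 mem=42 rax=1 */
    return 0;
}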
4293 case 0xb2: /* lss */ 4156 case 0xb2: /* lss */
4294 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); 4157 rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
4295 break; 4158 break;
4296 case 0xb3: 4159 case 0xb3:
4297 btr: /* btr */ 4160 btr: /* btr */
4298 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 4161 emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
4299 break; 4162 break;
4300 case 0xb4: /* lfs */ 4163 case 0xb4: /* lfs */
4301 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); 4164 rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
4302 break; 4165 break;
4303 case 0xb5: /* lgs */ 4166 case 0xb5: /* lgs */
4304 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); 4167 rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
4305 break; 4168 break;
4306 case 0xb6 ... 0xb7: /* movzx */ 4169 case 0xb6 ... 0xb7: /* movzx */
4307 c->dst.bytes = c->op_bytes; 4170 ctxt->dst.bytes = ctxt->op_bytes;
4308 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 4171 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
4309 : (u16) c->src.val; 4172 : (u16) ctxt->src.val;
4310 break; 4173 break;
4311 case 0xba: /* Grp8 */ 4174 case 0xba: /* Grp8 */
4312 switch (c->modrm_reg & 3) { 4175 switch (ctxt->modrm_reg & 3) {
4313 case 0: 4176 case 0:
4314 goto bt; 4177 goto bt;
4315 case 1: 4178 case 1:
@@ -4322,47 +4185,47 @@ twobyte_insn:
4322 break; 4185 break;
4323 case 0xbb: 4186 case 0xbb:
4324 btc: /* btc */ 4187 btc: /* btc */
4325 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 4188 emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
4326 break; 4189 break;
4327 case 0xbc: { /* bsf */ 4190 case 0xbc: { /* bsf */
4328 u8 zf; 4191 u8 zf;
4329 __asm__ ("bsf %2, %0; setz %1" 4192 __asm__ ("bsf %2, %0; setz %1"
4330 : "=r"(c->dst.val), "=q"(zf) 4193 : "=r"(ctxt->dst.val), "=q"(zf)
4331 : "r"(c->src.val)); 4194 : "r"(ctxt->src.val));
4332 ctxt->eflags &= ~X86_EFLAGS_ZF; 4195 ctxt->eflags &= ~X86_EFLAGS_ZF;
4333 if (zf) { 4196 if (zf) {
4334 ctxt->eflags |= X86_EFLAGS_ZF; 4197 ctxt->eflags |= X86_EFLAGS_ZF;
4335 c->dst.type = OP_NONE; /* Disable writeback. */ 4198 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4336 } 4199 }
4337 break; 4200 break;
4338 } 4201 }
4339 case 0xbd: { /* bsr */ 4202 case 0xbd: { /* bsr */
4340 u8 zf; 4203 u8 zf;
4341 __asm__ ("bsr %2, %0; setz %1" 4204 __asm__ ("bsr %2, %0; setz %1"
4342 : "=r"(c->dst.val), "=q"(zf) 4205 : "=r"(ctxt->dst.val), "=q"(zf)
4343 : "r"(c->src.val)); 4206 : "r"(ctxt->src.val));
4344 ctxt->eflags &= ~X86_EFLAGS_ZF; 4207 ctxt->eflags &= ~X86_EFLAGS_ZF;
4345 if (zf) { 4208 if (zf) {
4346 ctxt->eflags |= X86_EFLAGS_ZF; 4209 ctxt->eflags |= X86_EFLAGS_ZF;
4347 c->dst.type = OP_NONE; /* Disable writeback. */ 4210 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4348 } 4211 }
4349 break; 4212 break;
4350 } 4213 }
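
Both bsf and bsr are emulated with inline asm plus setz so the guest's ZF can be reproduced, and writeback is suppressed when the source was zero. The same behaviour can be modelled portably; the sketch below uses a GCC/Clang builtin and is illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Model of BSF: returns false (ZF set) when src is 0 and leaves *dst alone,
 * otherwise stores the index of the lowest set bit and returns true. */
static bool bsf_model(uint64_t src, uint64_t *dst)
{
    if (src == 0)
        return false;                           /* ZF=1, destination untouched */
    *dst = (uint64_t)__builtin_ctzll(src);      /* lowest set bit */
    return true;                                /* ZF=0 */
}

int main(void)
{
    uint64_t d = 0;
    bool nz = bsf_model(0x28, &d);
    printf("bsf(0x28) -> zf=%d idx=%llu\n", !nz, (unsigned long long)d); /* idx=3 */
    printf("bsf(0)    -> zf=%d\n", !bsf_model(0, &d));
    return 0;
}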
4351 case 0xbe ... 0xbf: /* movsx */ 4214 case 0xbe ... 0xbf: /* movsx */
4352 c->dst.bytes = c->op_bytes; 4215 ctxt->dst.bytes = ctxt->op_bytes;
4353 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 4216 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
4354 (s16) c->src.val; 4217 (s16) ctxt->src.val;
4355 break; 4218 break;
4356 case 0xc0 ... 0xc1: /* xadd */ 4219 case 0xc0 ... 0xc1: /* xadd */
4357 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 4220 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
4358 /* Write back the register source. */ 4221 /* Write back the register source. */
4359 c->src.val = c->dst.orig_val; 4222 ctxt->src.val = ctxt->dst.orig_val;
4360 write_register_operand(&c->src); 4223 write_register_operand(&ctxt->src);
4361 break; 4224 break;
4362 case 0xc3: /* movnti */ 4225 case 0xc3: /* movnti */
4363 c->dst.bytes = c->op_bytes; 4226 ctxt->dst.bytes = ctxt->op_bytes;
4364 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 4227 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
4365 (u64) c->src.val; 4228 (u64) ctxt->src.val;
4366 break; 4229 break;
4367 case 0xc7: /* Grp9 (cmpxchg8b) */ 4230 case 0xc7: /* Grp9 (cmpxchg8b) */
4368 rc = em_grp9(ctxt); 4231 rc = em_grp9(ctxt);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad7238505..43e04d128af 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -338,11 +338,15 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
338 return HRTIMER_NORESTART; 338 return HRTIMER_NORESTART;
339} 339}
340 340
341static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 341static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
342{ 342{
343 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
343 struct kvm_timer *pt = &ps->pit_timer; 344 struct kvm_timer *pt = &ps->pit_timer;
344 s64 interval; 345 s64 interval;
345 346
347 if (!irqchip_in_kernel(kvm))
348 return;
349
346 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 350 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
347 351
348 pr_debug("create pit timer, interval is %llu nsec\n", interval); 352 pr_debug("create pit timer, interval is %llu nsec\n", interval);
@@ -394,13 +398,13 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
394 /* FIXME: enhance mode 4 precision */ 398 /* FIXME: enhance mode 4 precision */
395 case 4: 399 case 4:
396 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { 400 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
397 create_pit_timer(ps, val, 0); 401 create_pit_timer(kvm, val, 0);
398 } 402 }
399 break; 403 break;
400 case 2: 404 case 2:
401 case 3: 405 case 3:
402 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ 406 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
403 create_pit_timer(ps, val, 1); 407 create_pit_timer(kvm, val, 1);
404 } 408 }
405 break; 409 break;
406 default: 410 default:
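
create_pit_timer() now receives the struct kvm pointer, returns early when no in-kernel irqchip exists, and converts the programmed count to nanoseconds with muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ). A rough worked example of that conversion (standalone arithmetic only; the helper below is not the kernel's muldiv64):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define PIT_FREQ     1193182ULL   /* 8254 input clock, Hz */

/* Same result as muldiv64(count, NSEC_PER_SEC, PIT_FREQ) for the
 * 16-bit reload values a PIT channel can be programmed with. */
static uint64_t pit_interval_ns(uint32_t count)
{
    return (uint64_t)count * NSEC_PER_SEC / PIT_FREQ;
}

int main(void)
{
    /* Reload value 0x10000 (65536) gives the classic ~18.2 Hz tick. */
    printf("count=65536 -> %llu ns (~54.9 ms)\n",
           (unsigned long long)pit_interval_ns(65536));
    /* A 1 kHz guest timer programs roughly count = 1193. */
    printf("count=1193  -> %llu ns (~1 ms)\n",
           (unsigned long long)pit_interval_ns(1193));
    return 0;
}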
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2b2255b1f04..57dcbd4308f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,7 +33,7 @@
33#include <asm/page.h> 33#include <asm/page.h>
34#include <asm/current.h> 34#include <asm/current.h>
35#include <asm/apicdef.h> 35#include <asm/apicdef.h>
36#include <asm/atomic.h> 36#include <linux/atomic.h>
37#include "kvm_cache_regs.h" 37#include "kvm_cache_regs.h"
38#include "irq.h" 38#include "irq.h"
39#include "trace.h" 39#include "trace.h"
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee38623b76..8e8da7960db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,7 +22,6 @@
22#include "mmu.h" 22#include "mmu.h"
23#include "x86.h" 23#include "x86.h"
24#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
26 25
27#include <linux/kvm_host.h> 26#include <linux/kvm_host.h>
28#include <linux/types.h> 27#include <linux/types.h>
@@ -148,7 +147,7 @@ module_param(oos_shadow, bool, 0644);
148#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 147#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
149 | PT64_NX_MASK) 148 | PT64_NX_MASK)
150 149
151#define RMAP_EXT 4 150#define PTE_LIST_EXT 4
152 151
153#define ACC_EXEC_MASK 1 152#define ACC_EXEC_MASK 1
154#define ACC_WRITE_MASK PT_WRITABLE_MASK 153#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -164,16 +163,16 @@ module_param(oos_shadow, bool, 0644);
164 163
165#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 164#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
166 165
167struct kvm_rmap_desc { 166struct pte_list_desc {
168 u64 *sptes[RMAP_EXT]; 167 u64 *sptes[PTE_LIST_EXT];
169 struct kvm_rmap_desc *more; 168 struct pte_list_desc *more;
170}; 169};
171 170
172struct kvm_shadow_walk_iterator { 171struct kvm_shadow_walk_iterator {
173 u64 addr; 172 u64 addr;
174 hpa_t shadow_addr; 173 hpa_t shadow_addr;
175 int level;
176 u64 *sptep; 174 u64 *sptep;
175 int level;
177 unsigned index; 176 unsigned index;
178}; 177};
179 178
@@ -182,32 +181,68 @@ struct kvm_shadow_walk_iterator {
182 shadow_walk_okay(&(_walker)); \ 181 shadow_walk_okay(&(_walker)); \
183 shadow_walk_next(&(_walker))) 182 shadow_walk_next(&(_walker)))
184 183
185typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); 184#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
185 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
186 shadow_walk_okay(&(_walker)) && \
187 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
188 __shadow_walk_next(&(_walker), spte))
186 189
187static struct kmem_cache *pte_chain_cache; 190static struct kmem_cache *pte_list_desc_cache;
188static struct kmem_cache *rmap_desc_cache;
189static struct kmem_cache *mmu_page_header_cache; 191static struct kmem_cache *mmu_page_header_cache;
190static struct percpu_counter kvm_total_used_mmu_pages; 192static struct percpu_counter kvm_total_used_mmu_pages;
191 193
192static u64 __read_mostly shadow_trap_nonpresent_pte;
193static u64 __read_mostly shadow_notrap_nonpresent_pte;
194static u64 __read_mostly shadow_nx_mask; 194static u64 __read_mostly shadow_nx_mask;
195static u64 __read_mostly shadow_x_mask;	/* mutually exclusive with nx_mask */ 195static u64 __read_mostly shadow_x_mask;	/* mutually exclusive with nx_mask */
196static u64 __read_mostly shadow_user_mask; 196static u64 __read_mostly shadow_user_mask;
197static u64 __read_mostly shadow_accessed_mask; 197static u64 __read_mostly shadow_accessed_mask;
198static u64 __read_mostly shadow_dirty_mask; 198static u64 __read_mostly shadow_dirty_mask;
199static u64 __read_mostly shadow_mmio_mask;
199 200
200static inline u64 rsvd_bits(int s, int e) 201static void mmu_spte_set(u64 *sptep, u64 spte);
202
203void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
201{ 204{
202 return ((1ULL << (e - s + 1)) - 1) << s; 205 shadow_mmio_mask = mmio_mask;
203} 206}
207EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
204 208
205void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 209static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
206{ 210{
207 shadow_trap_nonpresent_pte = trap_pte; 211 access &= ACC_WRITE_MASK | ACC_USER_MASK;
208 shadow_notrap_nonpresent_pte = notrap_pte; 212
213 trace_mark_mmio_spte(sptep, gfn, access);
214 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
215}
216
217static bool is_mmio_spte(u64 spte)
218{
219 return (spte & shadow_mmio_mask) == shadow_mmio_mask;
220}
221
222static gfn_t get_mmio_spte_gfn(u64 spte)
223{
224 return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
225}
226
227static unsigned get_mmio_spte_access(u64 spte)
228{
229 return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
230}
231
232static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
233{
234 if (unlikely(is_noslot_pfn(pfn))) {
235 mark_mmio_spte(sptep, gfn, access);
236 return true;
237 }
238
239 return false;
240}
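
mark_mmio_spte() caches an MMIO translation directly in the spte by OR-ing the reserved MMIO mask, the write/user access bits and the gfn shifted by PAGE_SHIFT; the accessors above simply undo that packing. A userspace round-trip sketch with a made-up mask value (the real mask is whatever kvm_mmu_set_mmio_spte_mask() installs):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1ULL << PAGE_SHIFT) - 1))
/* Illustrative mask only. */
#define MMIO_MASK  0xffc0000000000000ULL

static uint64_t make_mmio_spte(uint64_t gfn, unsigned access)
{
    return MMIO_MASK | access | (gfn << PAGE_SHIFT);
}

static int is_mmio(uint64_t spte)        { return (spte & MMIO_MASK) == MMIO_MASK; }
static uint64_t mmio_gfn(uint64_t spte)  { return (spte & ~MMIO_MASK) >> PAGE_SHIFT; }
static unsigned mmio_access(uint64_t s)  { return (unsigned)((s & ~MMIO_MASK) & ~PAGE_MASK); }

int main(void)
{
    uint64_t spte = make_mmio_spte(0xfee00, 0x2 /* write access, illustrative */);
    printf("mmio=%d gfn=%#llx access=%#x\n", is_mmio(spte),
           (unsigned long long)mmio_gfn(spte), mmio_access(spte));
    return 0;
}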
241
242static inline u64 rsvd_bits(int s, int e)
243{
244 return ((1ULL << (e - s + 1)) - 1) << s;
209} 245}
210EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
211 246
212void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 247void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
213 u64 dirty_mask, u64 nx_mask, u64 x_mask) 248 u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -220,11 +255,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
220} 255}
221EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 256EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
222 257
223static bool is_write_protection(struct kvm_vcpu *vcpu)
224{
225 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
226}
227
228static int is_cpuid_PSE36(void) 258static int is_cpuid_PSE36(void)
229{ 259{
230 return 1; 260 return 1;
@@ -237,8 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
237 267
238static int is_shadow_present_pte(u64 pte) 268static int is_shadow_present_pte(u64 pte)
239{ 269{
240 return pte != shadow_trap_nonpresent_pte 270 return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
241 && pte != shadow_notrap_nonpresent_pte;
242} 271}
243 272
244static int is_large_pte(u64 pte) 273static int is_large_pte(u64 pte)
@@ -246,11 +275,6 @@ static int is_large_pte(u64 pte)
246 return pte & PT_PAGE_SIZE_MASK; 275 return pte & PT_PAGE_SIZE_MASK;
247} 276}
248 277
249static int is_writable_pte(unsigned long pte)
250{
251 return pte & PT_WRITABLE_MASK;
252}
253
254static int is_dirty_gpte(unsigned long pte) 278static int is_dirty_gpte(unsigned long pte)
255{ 279{
256 return pte & PT_DIRTY_MASK; 280 return pte & PT_DIRTY_MASK;
@@ -282,26 +306,155 @@ static gfn_t pse36_gfn_delta(u32 gpte)
282 return (gpte & PT32_DIR_PSE36_MASK) << shift; 306 return (gpte & PT32_DIR_PSE36_MASK) << shift;
283} 307}
284 308
309#ifdef CONFIG_X86_64
285static void __set_spte(u64 *sptep, u64 spte) 310static void __set_spte(u64 *sptep, u64 spte)
286{ 311{
287 set_64bit(sptep, spte); 312 *sptep = spte;
288} 313}
289 314
290static u64 __xchg_spte(u64 *sptep, u64 new_spte) 315static void __update_clear_spte_fast(u64 *sptep, u64 spte)
291{ 316{
292#ifdef CONFIG_X86_64 317 *sptep = spte;
293 return xchg(sptep, new_spte); 318}
319
320static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
321{
322 return xchg(sptep, spte);
323}
324
325static u64 __get_spte_lockless(u64 *sptep)
326{
327 return ACCESS_ONCE(*sptep);
328}
329
330static bool __check_direct_spte_mmio_pf(u64 spte)
331{
332 /* It is valid if the spte is zapped. */
333 return spte == 0ull;
334}
294#else 335#else
295 u64 old_spte; 336union split_spte {
337 struct {
338 u32 spte_low;
339 u32 spte_high;
340 };
341 u64 spte;
342};
296 343
297 do { 344static void count_spte_clear(u64 *sptep, u64 spte)
298 old_spte = *sptep; 345{
299 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); 346 struct kvm_mmu_page *sp = page_header(__pa(sptep));
300 347
301 return old_spte; 348 if (is_shadow_present_pte(spte))
302#endif 349 return;
350
351 /* Ensure the spte is completely set before we increase the count */
352 smp_wmb();
353 sp->clear_spte_count++;
354}
355
356static void __set_spte(u64 *sptep, u64 spte)
357{
358 union split_spte *ssptep, sspte;
359
360 ssptep = (union split_spte *)sptep;
361 sspte = (union split_spte)spte;
362
363 ssptep->spte_high = sspte.spte_high;
364
365 /*
 366 	 * If we map the spte from nonpresent to present, we should store
 367 	 * the high bits first, then set the present bit, so the CPU cannot
 368 	 * fetch this spte while we are setting it.
369 */
370 smp_wmb();
371
372 ssptep->spte_low = sspte.spte_low;
303} 373}
304 374
375static void __update_clear_spte_fast(u64 *sptep, u64 spte)
376{
377 union split_spte *ssptep, sspte;
378
379 ssptep = (union split_spte *)sptep;
380 sspte = (union split_spte)spte;
381
382 ssptep->spte_low = sspte.spte_low;
383
384 /*
385 * If we map the spte from present to nonpresent, we should clear
 386 	 * the present bit first so that the vcpu cannot fetch the old high bits.
387 */
388 smp_wmb();
389
390 ssptep->spte_high = sspte.spte_high;
391 count_spte_clear(sptep, spte);
392}
393
394static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
395{
396 union split_spte *ssptep, sspte, orig;
397
398 ssptep = (union split_spte *)sptep;
399 sspte = (union split_spte)spte;
400
401 /* xchg acts as a barrier before the setting of the high bits */
402 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
403 orig.spte_high = ssptep->spte_high;
404 ssptep->spte_high = sspte.spte_high;
405 count_spte_clear(sptep, spte);
406
407 return orig.spte;
408}
409
410/*
 411 * The idea of using this lightweight way to get the spte on an x86_32 guest
 412 * comes from gup_get_pte(arch/x86/mm/gup.c).
 413 * The difference is that we cannot catch the spte TLB flush if we leave
 414 * guest mode, so we emulate it by increasing clear_spte_count whenever an
 415 * spte is cleared.
416 */
417static u64 __get_spte_lockless(u64 *sptep)
418{
419 struct kvm_mmu_page *sp = page_header(__pa(sptep));
420 union split_spte spte, *orig = (union split_spte *)sptep;
421 int count;
422
423retry:
424 count = sp->clear_spte_count;
425 smp_rmb();
426
427 spte.spte_low = orig->spte_low;
428 smp_rmb();
429
430 spte.spte_high = orig->spte_high;
431 smp_rmb();
432
433 if (unlikely(spte.spte_low != orig->spte_low ||
434 count != sp->clear_spte_count))
435 goto retry;
436
437 return spte.spte;
438}
439
440static bool __check_direct_spte_mmio_pf(u64 spte)
441{
442 union split_spte sspte = (union split_spte)spte;
443 u32 high_mmio_mask = shadow_mmio_mask >> 32;
444
445 /* It is valid if the spte is zapped. */
446 if (spte == 0ull)
447 return true;
448
449 /* It is valid if the spte is being zapped. */
450 if (sspte.spte_low == 0ull &&
451 (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
452 return true;
453
454 return false;
455}
456#endif
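
On 32-bit hosts the spte cannot be loaded in one access, so __get_spte_lockless() reads low then high and retries if either the low word or sp->clear_spte_count moved underneath it, while writers bump the count after clearing a present spte. A compressed userspace model of that retry discipline using C11 atomics (illustrative only, not the kernel code or its exact barriers):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct split_spte {
    _Atomic uint32_t low;
    _Atomic uint32_t high;
    _Atomic unsigned clear_count;   /* stands in for sp->clear_spte_count */
};

/* Writer: clear a present spte (low word first, count bumped afterwards). */
static void spte_clear(struct split_spte *s)
{
    atomic_store_explicit(&s->low, 0, memory_order_release);
    atomic_store_explicit(&s->high, 0, memory_order_release);
    atomic_fetch_add_explicit(&s->clear_count, 1, memory_order_release);
}

/* Reader: lockless 64-bit snapshot with retry, as in __get_spte_lockless(). */
static uint64_t spte_read_lockless(struct split_spte *s)
{
    uint32_t lo, hi;
    unsigned count;

    do {
        count = atomic_load_explicit(&s->clear_count, memory_order_acquire);
        lo    = atomic_load_explicit(&s->low, memory_order_acquire);
        hi    = atomic_load_explicit(&s->high, memory_order_acquire);
    } while (lo != atomic_load_explicit(&s->low, memory_order_acquire) ||
             count != atomic_load_explicit(&s->clear_count, memory_order_acquire));

    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    struct split_spte s = { 0x1025, 0x8000, 0 };
    printf("spte=%#llx\n", (unsigned long long)spte_read_lockless(&s));
    spte_clear(&s);
    printf("spte=%#llx\n", (unsigned long long)spte_read_lockless(&s));
    return 0;
}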
457
305static bool spte_has_volatile_bits(u64 spte) 458static bool spte_has_volatile_bits(u64 spte)
306{ 459{
307 if (!shadow_accessed_mask) 460 if (!shadow_accessed_mask)
@@ -322,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
322 return (old_spte & bit_mask) && !(new_spte & bit_mask); 475 return (old_spte & bit_mask) && !(new_spte & bit_mask);
323} 476}
324 477
325static void update_spte(u64 *sptep, u64 new_spte) 478/* Rules for using mmu_spte_set:
479 * Set the sptep from nonpresent to present.
480 * Note: the sptep being assigned *must* be either not present
481 * or in a state where the hardware will not attempt to update
482 * the spte.
483 */
484static void mmu_spte_set(u64 *sptep, u64 new_spte)
485{
486 WARN_ON(is_shadow_present_pte(*sptep));
487 __set_spte(sptep, new_spte);
488}
489
490/* Rules for using mmu_spte_update:
 491 * Update the state bits; the mapped pfn is not changed.
492 */
493static void mmu_spte_update(u64 *sptep, u64 new_spte)
326{ 494{
327 u64 mask, old_spte = *sptep; 495 u64 mask, old_spte = *sptep;
328 496
329 WARN_ON(!is_rmap_spte(new_spte)); 497 WARN_ON(!is_rmap_spte(new_spte));
330 498
499 if (!is_shadow_present_pte(old_spte))
500 return mmu_spte_set(sptep, new_spte);
501
331 new_spte |= old_spte & shadow_dirty_mask; 502 new_spte |= old_spte & shadow_dirty_mask;
332 503
333 mask = shadow_accessed_mask; 504 mask = shadow_accessed_mask;
@@ -335,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
335 mask |= shadow_dirty_mask; 506 mask |= shadow_dirty_mask;
336 507
337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 508 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
338 __set_spte(sptep, new_spte); 509 __update_clear_spte_fast(sptep, new_spte);
339 else 510 else
340 old_spte = __xchg_spte(sptep, new_spte); 511 old_spte = __update_clear_spte_slow(sptep, new_spte);
341 512
342 if (!shadow_accessed_mask) 513 if (!shadow_accessed_mask)
343 return; 514 return;
@@ -348,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 519 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
349} 520}
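
mmu_spte_update() picks the fast non-atomic store when no accessed/dirty update can be lost, and falls back to an atomic exchange otherwise. A much-simplified model of that decision, with invented bit positions (the real test also depends on which shadow A/D masks the hardware provides):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT  (1ULL << 0)
#define SPTE_ACCESSED (1ULL << 5)   /* illustrative bit positions */
#define SPTE_DIRTY    (1ULL << 6)

/* Could a concurrently-set A/D bit be lost by a plain store? */
static bool need_atomic_update(uint64_t old_spte, uint64_t new_spte)
{
    uint64_t ad = SPTE_ACCESSED | SPTE_DIRTY;

    if (!(old_spte & SPTE_PRESENT))
        return false;               /* hardware cannot touch a nonpresent spte */
    if ((old_spte & ad) == ad)
        return false;               /* nothing left for hardware to set */
    return (new_spte & ad) != ad;   /* an A/D update could still be lost */
}

int main(void)
{
    uint64_t old = SPTE_PRESENT;    /* A/D clear: hardware may still set them */
    printf("%d\n", need_atomic_update(old, SPTE_PRESENT));                          /* 1 */
    printf("%d\n", need_atomic_update(old, SPTE_PRESENT | SPTE_ACCESSED | SPTE_DIRTY)); /* 0 */
    return 0;
}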
350 521
522/*
523 * Rules for using mmu_spte_clear_track_bits:
 524 * It sets the sptep from present to nonpresent and tracks the
 525 * state bits; it is used to clear the last-level sptep.
526 */
527static int mmu_spte_clear_track_bits(u64 *sptep)
528{
529 pfn_t pfn;
530 u64 old_spte = *sptep;
531
532 if (!spte_has_volatile_bits(old_spte))
533 __update_clear_spte_fast(sptep, 0ull);
534 else
535 old_spte = __update_clear_spte_slow(sptep, 0ull);
536
537 if (!is_rmap_spte(old_spte))
538 return 0;
539
540 pfn = spte_to_pfn(old_spte);
541 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
542 kvm_set_pfn_accessed(pfn);
543 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
544 kvm_set_pfn_dirty(pfn);
545 return 1;
546}
547
548/*
549 * Rules for using mmu_spte_clear_no_track:
 550 * Directly clear the spte without caring about the state bits of the sptep;
 551 * it is used for upper-level sptes.
552 */
553static void mmu_spte_clear_no_track(u64 *sptep)
554{
555 __update_clear_spte_fast(sptep, 0ull);
556}
557
558static u64 mmu_spte_get_lockless(u64 *sptep)
559{
560 return __get_spte_lockless(sptep);
561}
562
563static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
564{
565 rcu_read_lock();
566 atomic_inc(&vcpu->kvm->arch.reader_counter);
567
 568 	/* Increase the counter before walking the shadow page table */
569 smp_mb__after_atomic_inc();
570}
571
572static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
573{
 574 	/* Decrease the counter after the shadow page table walk has finished */
575 smp_mb__before_atomic_dec();
576 atomic_dec(&vcpu->kvm->arch.reader_counter);
577 rcu_read_unlock();
578}
579
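
walk_shadow_page_lockless_begin/end() bracket a lockless walk: rcu_read_lock plus an atomic reader counter (with full barriers) lets the zap path know when walkers may still be traversing page tables it wants to free. A toy model of just the counter handshake (the RCU and barrier details of the real code are deliberately omitted):

#include <stdatomic.h>
#include <stdio.h>

/* Readers bump the counter before touching the shadow page tables; a
 * freeing side can wait for it to drain before really releasing pages. */
static _Atomic int reader_counter;

static void walk_lockless_begin(void)
{
    atomic_fetch_add_explicit(&reader_counter, 1, memory_order_acquire);
}

static void walk_lockless_end(void)
{
    atomic_fetch_sub_explicit(&reader_counter, 1, memory_order_release);
}

int main(void)
{
    walk_lockless_begin();
    /* ... walk shadow entries with lockless 64-bit spte reads here ... */
    walk_lockless_end();
    printf("readers now: %d\n", atomic_load(&reader_counter));  /* 0 */
    return 0;
}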
351static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 580static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
352 struct kmem_cache *base_cache, int min) 581 struct kmem_cache *base_cache, int min)
353{ 582{
@@ -397,12 +626,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
397{ 626{
398 int r; 627 int r;
399 628
400 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, 629 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
401 pte_chain_cache, 4); 630 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
402 if (r)
403 goto out;
404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
406 if (r) 631 if (r)
407 goto out; 632 goto out;
408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 633 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +641,8 @@ out:
416 641
417static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 642static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
418{ 643{
419 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); 644 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
420 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); 645 pte_list_desc_cache);
421 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 646 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
422 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, 647 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
423 mmu_page_header_cache); 648 mmu_page_header_cache);
@@ -433,26 +658,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
433 return p; 658 return p;
434} 659}
435 660
436static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) 661static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
437{ 662{
438 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, 663 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
439 sizeof(struct kvm_pte_chain)); 664 sizeof(struct pte_list_desc));
440} 665}
441 666
442static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 667static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
443{ 668{
444 kmem_cache_free(pte_chain_cache, pc); 669 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
445}
446
447static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
448{
449 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
450 sizeof(struct kvm_rmap_desc));
451}
452
453static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
454{
455 kmem_cache_free(rmap_desc_cache, rd);
456} 670}
457 671
458static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 672static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +712,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
498 linfo = lpage_info_slot(gfn, slot, i); 712 linfo = lpage_info_slot(gfn, slot, i);
499 linfo->write_count += 1; 713 linfo->write_count += 1;
500 } 714 }
715 kvm->arch.indirect_shadow_pages++;
501} 716}
502 717
503static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 718static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +728,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
513 linfo->write_count -= 1; 728 linfo->write_count -= 1;
514 WARN_ON(linfo->write_count < 0); 729 WARN_ON(linfo->write_count < 0);
515 } 730 }
731 kvm->arch.indirect_shadow_pages--;
516} 732}
517 733
518static int has_wrprotected_page(struct kvm *kvm, 734static int has_wrprotected_page(struct kvm *kvm,
@@ -588,67 +804,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
588} 804}
589 805
590/* 806/*
591 * Take gfn and return the reverse mapping to it. 807 * Pte mapping structures:
592 */
593
594static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
595{
596 struct kvm_memory_slot *slot;
597 struct kvm_lpage_info *linfo;
598
599 slot = gfn_to_memslot(kvm, gfn);
600 if (likely(level == PT_PAGE_TABLE_LEVEL))
601 return &slot->rmap[gfn - slot->base_gfn];
602
603 linfo = lpage_info_slot(gfn, slot, level);
604
605 return &linfo->rmap_pde;
606}
607
608/*
609 * Reverse mapping data structures:
610 * 808 *
611 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry 809 * If pte_list bit zero is zero, then pte_list points to the spte.
612 * that points to page_address(page).
613 * 810 *
614 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc 811 * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
615 * containing more mappings. 812 * pte_list_desc containing more mappings.
616 * 813 *
617 * Returns the number of rmap entries before the spte was added or zero if 814 * Returns the number of pte entries before the spte was added or zero if
618 * the spte was not added. 815 * the spte was not added.
619 * 816 *
620 */ 817 */
621static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 818static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
819 unsigned long *pte_list)
622{ 820{
623 struct kvm_mmu_page *sp; 821 struct pte_list_desc *desc;
624 struct kvm_rmap_desc *desc;
625 unsigned long *rmapp;
626 int i, count = 0; 822 int i, count = 0;
627 823
628 if (!is_rmap_spte(*spte)) 824 if (!*pte_list) {
629 return count; 825 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
630 sp = page_header(__pa(spte)); 826 *pte_list = (unsigned long)spte;
631 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 827 } else if (!(*pte_list & 1)) {
632 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 828 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
633 if (!*rmapp) { 829 desc = mmu_alloc_pte_list_desc(vcpu);
634 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 830 desc->sptes[0] = (u64 *)*pte_list;
635 *rmapp = (unsigned long)spte;
636 } else if (!(*rmapp & 1)) {
637 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
638 desc = mmu_alloc_rmap_desc(vcpu);
639 desc->sptes[0] = (u64 *)*rmapp;
640 desc->sptes[1] = spte; 831 desc->sptes[1] = spte;
641 *rmapp = (unsigned long)desc | 1; 832 *pte_list = (unsigned long)desc | 1;
642 ++count; 833 ++count;
643 } else { 834 } else {
644 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 835 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
645 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 836 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
646 while (desc->sptes[RMAP_EXT-1] && desc->more) { 837 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
647 desc = desc->more; 838 desc = desc->more;
648 count += RMAP_EXT; 839 count += PTE_LIST_EXT;
649 } 840 }
650 if (desc->sptes[RMAP_EXT-1]) { 841 if (desc->sptes[PTE_LIST_EXT-1]) {
651 desc->more = mmu_alloc_rmap_desc(vcpu); 842 desc->more = mmu_alloc_pte_list_desc(vcpu);
652 desc = desc->more; 843 desc = desc->more;
653 } 844 }
654 for (i = 0; desc->sptes[i]; ++i) 845 for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +849,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
658 return count; 849 return count;
659} 850}
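
pte_list_add() implements the tagged-pointer scheme described in the comment above it: bit 0 of the head word says whether it points directly at a single spte or at a pte_list_desc holding up to PTE_LIST_EXT entries plus an overflow chain. A self-contained toy version of the same insert logic (allocation, error handling and naming are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define LIST_EXT 4   /* mirrors PTE_LIST_EXT */

struct list_desc {
    uint64_t *sptes[LIST_EXT];
    struct list_desc *more;
};

static void list_add(unsigned long *head, uint64_t *spte)
{
    struct list_desc *desc;

    if (!*head) {                               /* 0 -> 1: store the spte directly */
        *head = (unsigned long)spte;
        return;
    }
    if (!(*head & 1)) {                         /* 1 -> many: promote to a descriptor */
        desc = calloc(1, sizeof(*desc));
        desc->sptes[0] = (uint64_t *)*head;
        desc->sptes[1] = spte;
        *head = (unsigned long)desc | 1;
        return;
    }
    desc = (struct list_desc *)(*head & ~1ul);  /* many -> many: walk/extend the chain */
    while (desc->sptes[LIST_EXT - 1] && desc->more)
        desc = desc->more;
    if (desc->sptes[LIST_EXT - 1]) {
        desc->more = calloc(1, sizeof(*desc));
        desc = desc->more;
    }
    for (int i = 0; ; i++)
        if (!desc->sptes[i]) { desc->sptes[i] = spte; break; }
}

int main(void)
{
    unsigned long head = 0;
    uint64_t sptes[6];

    for (int i = 0; i < 6; i++)
        list_add(&head, &sptes[i]);
    printf("head is %s\n", (head & 1) ? "a descriptor chain" : "a single spte");
    return 0;
}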
660 851
661static void rmap_desc_remove_entry(unsigned long *rmapp, 852static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
662 struct kvm_rmap_desc *desc, 853{
663 int i, 854 struct pte_list_desc *desc;
664 struct kvm_rmap_desc *prev_desc) 855 u64 *prev_spte;
856 int i;
857
858 if (!*pte_list)
859 return NULL;
860 else if (!(*pte_list & 1)) {
861 if (!spte)
862 return (u64 *)*pte_list;
863 return NULL;
864 }
865 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
866 prev_spte = NULL;
867 while (desc) {
868 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
869 if (prev_spte == spte)
870 return desc->sptes[i];
871 prev_spte = desc->sptes[i];
872 }
873 desc = desc->more;
874 }
875 return NULL;
876}
877
878static void
879pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
880 int i, struct pte_list_desc *prev_desc)
665{ 881{
666 int j; 882 int j;
667 883
668 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) 884 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
669 ; 885 ;
670 desc->sptes[i] = desc->sptes[j]; 886 desc->sptes[i] = desc->sptes[j];
671 desc->sptes[j] = NULL; 887 desc->sptes[j] = NULL;
672 if (j != 0) 888 if (j != 0)
673 return; 889 return;
674 if (!prev_desc && !desc->more) 890 if (!prev_desc && !desc->more)
675 *rmapp = (unsigned long)desc->sptes[0]; 891 *pte_list = (unsigned long)desc->sptes[0];
676 else 892 else
677 if (prev_desc) 893 if (prev_desc)
678 prev_desc->more = desc->more; 894 prev_desc->more = desc->more;
679 else 895 else
680 *rmapp = (unsigned long)desc->more | 1; 896 *pte_list = (unsigned long)desc->more | 1;
681 mmu_free_rmap_desc(desc); 897 mmu_free_pte_list_desc(desc);
682} 898}
683 899
684static void rmap_remove(struct kvm *kvm, u64 *spte) 900static void pte_list_remove(u64 *spte, unsigned long *pte_list)
685{ 901{
686 struct kvm_rmap_desc *desc; 902 struct pte_list_desc *desc;
687 struct kvm_rmap_desc *prev_desc; 903 struct pte_list_desc *prev_desc;
688 struct kvm_mmu_page *sp;
689 gfn_t gfn;
690 unsigned long *rmapp;
691 int i; 904 int i;
692 905
693 sp = page_header(__pa(spte)); 906 if (!*pte_list) {
694 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 907 printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
695 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
696 if (!*rmapp) {
697 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
698 BUG(); 908 BUG();
699 } else if (!(*rmapp & 1)) { 909 } else if (!(*pte_list & 1)) {
700 rmap_printk("rmap_remove: %p 1->0\n", spte); 910 rmap_printk("pte_list_remove: %p 1->0\n", spte);
701 if ((u64 *)*rmapp != spte) { 911 if ((u64 *)*pte_list != spte) {
702 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); 912 printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
703 BUG(); 913 BUG();
704 } 914 }
705 *rmapp = 0; 915 *pte_list = 0;
706 } else { 916 } else {
707 rmap_printk("rmap_remove: %p many->many\n", spte); 917 rmap_printk("pte_list_remove: %p many->many\n", spte);
708 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 918 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
709 prev_desc = NULL; 919 prev_desc = NULL;
710 while (desc) { 920 while (desc) {
711 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) 921 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
712 if (desc->sptes[i] == spte) { 922 if (desc->sptes[i] == spte) {
713 rmap_desc_remove_entry(rmapp, 923 pte_list_desc_remove_entry(pte_list,
714 desc, i, 924 desc, i,
715 prev_desc); 925 prev_desc);
716 return; 926 return;
@@ -718,62 +928,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
718 prev_desc = desc; 928 prev_desc = desc;
719 desc = desc->more; 929 desc = desc->more;
720 } 930 }
721 pr_err("rmap_remove: %p many->many\n", spte); 931 pr_err("pte_list_remove: %p many->many\n", spte);
722 BUG(); 932 BUG();
723 } 933 }
724} 934}
725 935
726static int set_spte_track_bits(u64 *sptep, u64 new_spte) 936typedef void (*pte_list_walk_fn) (u64 *spte);
937static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
727{ 938{
728 pfn_t pfn; 939 struct pte_list_desc *desc;
729 u64 old_spte = *sptep; 940 int i;
730 941
731 if (!spte_has_volatile_bits(old_spte)) 942 if (!*pte_list)
732 __set_spte(sptep, new_spte); 943 return;
733 else
734 old_spte = __xchg_spte(sptep, new_spte);
735 944
736 if (!is_rmap_spte(old_spte)) 945 if (!(*pte_list & 1))
737 return 0; 946 return fn((u64 *)*pte_list);
738 947
739 pfn = spte_to_pfn(old_spte); 948 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
740 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 949 while (desc) {
741 kvm_set_pfn_accessed(pfn); 950 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
742 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 951 fn(desc->sptes[i]);
743 kvm_set_pfn_dirty(pfn); 952 desc = desc->more;
744 return 1; 953 }
745} 954}
746 955
747static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 956/*
957 * Take gfn and return the reverse mapping to it.
958 */
959static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
748{ 960{
749 if (set_spte_track_bits(sptep, new_spte)) 961 struct kvm_memory_slot *slot;
750 rmap_remove(kvm, sptep); 962 struct kvm_lpage_info *linfo;
963
964 slot = gfn_to_memslot(kvm, gfn);
965 if (likely(level == PT_PAGE_TABLE_LEVEL))
966 return &slot->rmap[gfn - slot->base_gfn];
967
968 linfo = lpage_info_slot(gfn, slot, level);
969
970 return &linfo->rmap_pde;
971}
972
973static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
974{
975 struct kvm_mmu_page *sp;
976 unsigned long *rmapp;
977
978 sp = page_header(__pa(spte));
979 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
980 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
981 return pte_list_add(vcpu, spte, rmapp);
751} 982}
752 983
753static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 984static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
754{ 985{
755 struct kvm_rmap_desc *desc; 986 return pte_list_next(rmapp, spte);
756 u64 *prev_spte; 987}
757 int i;
758 988
759 if (!*rmapp) 989static void rmap_remove(struct kvm *kvm, u64 *spte)
760 return NULL; 990{
761 else if (!(*rmapp & 1)) { 991 struct kvm_mmu_page *sp;
762 if (!spte) 992 gfn_t gfn;
763 return (u64 *)*rmapp; 993 unsigned long *rmapp;
764 return NULL; 994
765 } 995 sp = page_header(__pa(spte));
766 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 996 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
767 prev_spte = NULL; 997 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
768 while (desc) { 998 pte_list_remove(spte, rmapp);
769 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 999}
770 if (prev_spte == spte) 1000
771 return desc->sptes[i]; 1001static void drop_spte(struct kvm *kvm, u64 *sptep)
772 prev_spte = desc->sptes[i]; 1002{
773 } 1003 if (mmu_spte_clear_track_bits(sptep))
774 desc = desc->more; 1004 rmap_remove(kvm, sptep);
775 }
776 return NULL;
777} 1005}
778 1006
779static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1007static int rmap_write_protect(struct kvm *kvm, u64 gfn)
@@ -790,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
790 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1018 BUG_ON(!(*spte & PT_PRESENT_MASK));
791 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 1019 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
792 if (is_writable_pte(*spte)) { 1020 if (is_writable_pte(*spte)) {
793 update_spte(spte, *spte & ~PT_WRITABLE_MASK); 1021 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
794 write_protected = 1; 1022 write_protected = 1;
795 } 1023 }
796 spte = rmap_next(kvm, rmapp, spte); 1024 spte = rmap_next(kvm, rmapp, spte);
@@ -807,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
807 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 1035 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
808 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 1036 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
809 if (is_writable_pte(*spte)) { 1037 if (is_writable_pte(*spte)) {
810 drop_spte(kvm, spte, 1038 drop_spte(kvm, spte);
811 shadow_trap_nonpresent_pte);
812 --kvm->stat.lpages; 1039 --kvm->stat.lpages;
813 spte = NULL; 1040 spte = NULL;
814 write_protected = 1; 1041 write_protected = 1;
@@ -829,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
829 while ((spte = rmap_next(kvm, rmapp, NULL))) { 1056 while ((spte = rmap_next(kvm, rmapp, NULL))) {
830 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1057 BUG_ON(!(*spte & PT_PRESENT_MASK));
831 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1058 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
832 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1059 drop_spte(kvm, spte);
833 need_tlb_flush = 1; 1060 need_tlb_flush = 1;
834 } 1061 }
835 return need_tlb_flush; 1062 return need_tlb_flush;
@@ -851,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
851 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1078 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
852 need_flush = 1; 1079 need_flush = 1;
853 if (pte_write(*ptep)) { 1080 if (pte_write(*ptep)) {
854 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1081 drop_spte(kvm, spte);
855 spte = rmap_next(kvm, rmapp, NULL); 1082 spte = rmap_next(kvm, rmapp, NULL);
856 } else { 1083 } else {
857 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1084 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -860,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
860 new_spte &= ~PT_WRITABLE_MASK; 1087 new_spte &= ~PT_WRITABLE_MASK;
861 new_spte &= ~SPTE_HOST_WRITEABLE; 1088 new_spte &= ~SPTE_HOST_WRITEABLE;
862 new_spte &= ~shadow_accessed_mask; 1089 new_spte &= ~shadow_accessed_mask;
863 set_spte_track_bits(spte, new_spte); 1090 mmu_spte_clear_track_bits(spte);
1091 mmu_spte_set(spte, new_spte);
864 spte = rmap_next(kvm, rmapp, spte); 1092 spte = rmap_next(kvm, rmapp, spte);
865 } 1093 }
866 } 1094 }
@@ -1032,151 +1260,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1032 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1260 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033} 1261}
1034 1262
1035static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1263/*
 1264 * Remove the sp from the shadow page cache; after calling this,
 1265 * the sp can no longer be found in the cache, but the shadow
 1266 * page table is still valid.
 1267 * It should be called under the protection of the mmu lock.
1268 */
1269static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
1036{ 1270{
1037 ASSERT(is_empty_shadow_page(sp->spt)); 1271 ASSERT(is_empty_shadow_page(sp->spt));
1038 hlist_del(&sp->hash_link); 1272 hlist_del(&sp->hash_link);
1039 list_del(&sp->link);
1040 free_page((unsigned long)sp->spt);
1041 if (!sp->role.direct) 1273 if (!sp->role.direct)
1042 free_page((unsigned long)sp->gfns); 1274 free_page((unsigned long)sp->gfns);
1043 kmem_cache_free(mmu_page_header_cache, sp);
1044 kvm_mod_used_mmu_pages(kvm, -1);
1045} 1275}
1046 1276
1047static unsigned kvm_page_table_hashfn(gfn_t gfn) 1277/*
 1278 * Free the shadow page table and the sp; this can be done
 1279 * outside the protection of the mmu lock.
1280 */
1281static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1048{ 1282{
1049 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 1283 list_del(&sp->link);
1284 free_page((unsigned long)sp->spt);
1285 kmem_cache_free(mmu_page_header_cache, sp);
1050} 1286}
1051 1287
1052static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 1288static unsigned kvm_page_table_hashfn(gfn_t gfn)
1053 u64 *parent_pte, int direct)
1054{ 1289{
1055 struct kvm_mmu_page *sp; 1290 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1056
1057 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1058 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1059 if (!direct)
1060 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1061 PAGE_SIZE);
1062 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1063 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1064 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1065 sp->multimapped = 0;
1066 sp->parent_pte = parent_pte;
1067 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1068 return sp;
1069} 1291}
1070 1292
1071static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 1293static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1072 struct kvm_mmu_page *sp, u64 *parent_pte) 1294 struct kvm_mmu_page *sp, u64 *parent_pte)
1073{ 1295{
1074 struct kvm_pte_chain *pte_chain;
1075 struct hlist_node *node;
1076 int i;
1077
1078 if (!parent_pte) 1296 if (!parent_pte)
1079 return; 1297 return;
1080 if (!sp->multimapped) {
1081 u64 *old = sp->parent_pte;
1082 1298
1083 if (!old) { 1299 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1084 sp->parent_pte = parent_pte;
1085 return;
1086 }
1087 sp->multimapped = 1;
1088 pte_chain = mmu_alloc_pte_chain(vcpu);
1089 INIT_HLIST_HEAD(&sp->parent_ptes);
1090 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1091 pte_chain->parent_ptes[0] = old;
1092 }
1093 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1094 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1095 continue;
1096 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1097 if (!pte_chain->parent_ptes[i]) {
1098 pte_chain->parent_ptes[i] = parent_pte;
1099 return;
1100 }
1101 }
1102 pte_chain = mmu_alloc_pte_chain(vcpu);
1103 BUG_ON(!pte_chain);
1104 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1105 pte_chain->parent_ptes[0] = parent_pte;
1106} 1300}
1107 1301
1108static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1302static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1109 u64 *parent_pte) 1303 u64 *parent_pte)
1110{ 1304{
1111 struct kvm_pte_chain *pte_chain; 1305 pte_list_remove(parent_pte, &sp->parent_ptes);
1112 struct hlist_node *node;
1113 int i;
1114
1115 if (!sp->multimapped) {
1116 BUG_ON(sp->parent_pte != parent_pte);
1117 sp->parent_pte = NULL;
1118 return;
1119 }
1120 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1121 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1122 if (!pte_chain->parent_ptes[i])
1123 break;
1124 if (pte_chain->parent_ptes[i] != parent_pte)
1125 continue;
1126 while (i + 1 < NR_PTE_CHAIN_ENTRIES
1127 && pte_chain->parent_ptes[i + 1]) {
1128 pte_chain->parent_ptes[i]
1129 = pte_chain->parent_ptes[i + 1];
1130 ++i;
1131 }
1132 pte_chain->parent_ptes[i] = NULL;
1133 if (i == 0) {
1134 hlist_del(&pte_chain->link);
1135 mmu_free_pte_chain(pte_chain);
1136 if (hlist_empty(&sp->parent_ptes)) {
1137 sp->multimapped = 0;
1138 sp->parent_pte = NULL;
1139 }
1140 }
1141 return;
1142 }
1143 BUG();
1144} 1306}
1145 1307
1146static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1308static void drop_parent_pte(struct kvm_mmu_page *sp,
1309 u64 *parent_pte)
1147{ 1310{
1148 struct kvm_pte_chain *pte_chain; 1311 mmu_page_remove_parent_pte(sp, parent_pte);
1149 struct hlist_node *node; 1312 mmu_spte_clear_no_track(parent_pte);
1150 struct kvm_mmu_page *parent_sp; 1313}
1151 int i;
1152
1153 if (!sp->multimapped && sp->parent_pte) {
1154 parent_sp = page_header(__pa(sp->parent_pte));
1155 fn(parent_sp, sp->parent_pte);
1156 return;
1157 }
1158
1159 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1160 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1161 u64 *spte = pte_chain->parent_ptes[i];
1162 1314
1163 if (!spte) 1315static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1164 break; 1316 u64 *parent_pte, int direct)
1165 parent_sp = page_header(__pa(spte)); 1317{
1166 fn(parent_sp, spte); 1318 struct kvm_mmu_page *sp;
1167 } 1319 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
1320 sizeof *sp);
1321 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1322 if (!direct)
1323 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1324 PAGE_SIZE);
1325 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1326 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1327 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1328 sp->parent_ptes = 0;
1329 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1330 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1331 return sp;
1168} 1332}
1169 1333
1170static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); 1334static void mark_unsync(u64 *spte);
1171static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1335static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1172{ 1336{
1173 mmu_parent_walk(sp, mark_unsync); 1337 pte_list_walk(&sp->parent_ptes, mark_unsync);
1174} 1338}
1175 1339
1176static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) 1340static void mark_unsync(u64 *spte)
1177{ 1341{
1342 struct kvm_mmu_page *sp;
1178 unsigned int index; 1343 unsigned int index;
1179 1344
1345 sp = page_header(__pa(spte));
1180 index = spte - sp->spt; 1346 index = spte - sp->spt;
1181 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1347 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1182 return; 1348 return;
@@ -1185,15 +1351,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1185 kvm_mmu_mark_parents_unsync(sp); 1351 kvm_mmu_mark_parents_unsync(sp);
1186} 1352}
1187 1353
1188static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1189 struct kvm_mmu_page *sp)
1190{
1191 int i;
1192
1193 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1194 sp->spt[i] = shadow_trap_nonpresent_pte;
1195}
1196
1197static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1354static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1198 struct kvm_mmu_page *sp) 1355 struct kvm_mmu_page *sp)
1199{ 1356{
@@ -1475,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1475 } 1632 }
1476} 1633}
1477 1634
1635static void init_shadow_page_table(struct kvm_mmu_page *sp)
1636{
1637 int i;
1638
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 sp->spt[i] = 0ull;
1641}
1642
1478static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1643static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1479 gfn_t gfn, 1644 gfn_t gfn,
1480 gva_t gaddr, 1645 gva_t gaddr,
@@ -1537,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1537 1702
1538 account_shadowed(vcpu->kvm, gfn); 1703 account_shadowed(vcpu->kvm, gfn);
1539 } 1704 }
1540 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1705 init_shadow_page_table(sp);
1541 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1542 else
1543 nonpaging_prefetch_page(vcpu, sp);
1544 trace_kvm_mmu_get_page(sp, true); 1706 trace_kvm_mmu_get_page(sp, true);
1545 return sp; 1707 return sp;
1546} 1708}
@@ -1572,21 +1734,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1572 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1734 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1573 return false; 1735 return false;
1574 1736
1575 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1576 if (is_large_pte(*iterator->sptep))
1577 return false;
1578
1579 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1737 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1580 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1738 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1581 return true; 1739 return true;
1582} 1740}
1583 1741
1584static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1585{
1586 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1742static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
1743 u64 spte)
1744{
1745 if (is_last_spte(spte, iterator->level)) {
1746 iterator->level = 0;
1747 return;
1748 }
1749
1750 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
1587 --iterator->level; 1751 --iterator->level;
1588} 1752}
1589 1753
1754static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1755{
1756 return __shadow_walk_next(iterator, *iterator->sptep);
1757}
1758
1590static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 1759static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1591{ 1760{
1592 u64 spte; 1761 u64 spte;
@@ -1594,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1594 spte = __pa(sp->spt) 1763 spte = __pa(sp->spt)
1595 | PT_PRESENT_MASK | PT_ACCESSED_MASK 1764 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1596 | PT_WRITABLE_MASK | PT_USER_MASK; 1765 | PT_WRITABLE_MASK | PT_USER_MASK;
1597 __set_spte(sptep, spte); 1766 mmu_spte_set(sptep, spte);
1598} 1767}
1599 1768
1600static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1769static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1601{ 1770{
1602 if (is_large_pte(*sptep)) { 1771 if (is_large_pte(*sptep)) {
1603 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1772 drop_spte(vcpu->kvm, sptep);
1604 kvm_flush_remote_tlbs(vcpu->kvm); 1773 kvm_flush_remote_tlbs(vcpu->kvm);
1605 } 1774 }
1606} 1775}
@@ -1622,38 +1791,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1622 if (child->role.access == direct_access) 1791 if (child->role.access == direct_access)
1623 return; 1792 return;
1624 1793
1625 mmu_page_remove_parent_pte(child, sptep);
1626 __set_spte(sptep, shadow_trap_nonpresent_pte);
1794 drop_parent_pte(child, sptep);
1627 kvm_flush_remote_tlbs(vcpu->kvm); 1795 kvm_flush_remote_tlbs(vcpu->kvm);
1628 } 1796 }
1629} 1797}
1630 1798
1799static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1800 u64 *spte)
1801{
1802 u64 pte;
1803 struct kvm_mmu_page *child;
1804
1805 pte = *spte;
1806 if (is_shadow_present_pte(pte)) {
1807 if (is_last_spte(pte, sp->role.level))
1808 drop_spte(kvm, spte);
1809 else {
1810 child = page_header(pte & PT64_BASE_ADDR_MASK);
1811 drop_parent_pte(child, spte);
1812 }
1813 } else if (is_mmio_spte(pte))
1814 mmu_spte_clear_no_track(spte);
1815
1816 if (is_large_pte(pte))
1817 --kvm->stat.lpages;
1818}
1819
1631static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1820static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1632 struct kvm_mmu_page *sp) 1821 struct kvm_mmu_page *sp)
1633{ 1822{
1634 unsigned i; 1823 unsigned i;
1635 u64 *pt;
1636 u64 ent;
1637
1638 pt = sp->spt;
1639
1640 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1641 ent = pt[i];
1642
1643 if (is_shadow_present_pte(ent)) {
1644 if (!is_last_spte(ent, sp->role.level)) {
1645 ent &= PT64_BASE_ADDR_MASK;
1646 mmu_page_remove_parent_pte(page_header(ent),
1647 &pt[i]);
1648 } else {
1649 if (is_large_pte(ent))
1650 --kvm->stat.lpages;
1651 drop_spte(kvm, &pt[i],
1652 shadow_trap_nonpresent_pte);
1653 }
1654 }
1655 pt[i] = shadow_trap_nonpresent_pte;
1656 }
1824
1825 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1826 mmu_page_zap_pte(kvm, sp, sp->spt + i);
1657} 1827}
1658 1828
1659static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1829static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1844,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1674{ 1844{
1675 u64 *parent_pte; 1845 u64 *parent_pte;
1676 1846
1677 while (sp->multimapped || sp->parent_pte) {
1678 if (!sp->multimapped)
1679 parent_pte = sp->parent_pte;
1680 else {
1681 struct kvm_pte_chain *chain;
1682
1683 chain = container_of(sp->parent_ptes.first,
1684 struct kvm_pte_chain, link);
1685 parent_pte = chain->parent_ptes[0];
1686 }
1687 BUG_ON(!parent_pte);
1688 kvm_mmu_put_page(sp, parent_pte);
1689 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1690 }
1847 while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
1848 drop_parent_pte(sp, parent_pte);
1691} 1849}
1692 1850
1693static int mmu_zap_unsync_children(struct kvm *kvm, 1851static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1734 /* Count self */ 1892 /* Count self */
1735 ret++; 1893 ret++;
1736 list_move(&sp->link, invalid_list); 1894 list_move(&sp->link, invalid_list);
1895 kvm_mod_used_mmu_pages(kvm, -1);
1737 } else { 1896 } else {
1738 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1897 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1739 kvm_reload_remote_mmus(kvm); 1898 kvm_reload_remote_mmus(kvm);
@@ -1744,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1744 return ret; 1903 return ret;
1745} 1904}
1746 1905
1906static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
1907{
1908 struct kvm_mmu_page *sp;
1909
1910 list_for_each_entry(sp, invalid_list, link)
1911 kvm_mmu_isolate_page(sp);
1912}
1913
1914static void free_pages_rcu(struct rcu_head *head)
1915{
1916 struct kvm_mmu_page *next, *sp;
1917
1918 sp = container_of(head, struct kvm_mmu_page, rcu);
1919 while (sp) {
1920 if (!list_empty(&sp->link))
1921 next = list_first_entry(&sp->link,
1922 struct kvm_mmu_page, link);
1923 else
1924 next = NULL;
1925 kvm_mmu_free_page(sp);
1926 sp = next;
1927 }
1928}
1929
1747static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1930static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1748 struct list_head *invalid_list) 1931 struct list_head *invalid_list)
1749{ 1932{
@@ -1754,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1754 1937
1755 kvm_flush_remote_tlbs(kvm); 1938 kvm_flush_remote_tlbs(kvm);
1756 1939
1940 if (atomic_read(&kvm->arch.reader_counter)) {
1941 kvm_mmu_isolate_pages(invalid_list);
1942 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1943 list_del_init(invalid_list);
1944
1945 trace_kvm_mmu_delay_free_pages(sp);
1946 call_rcu(&sp->rcu, free_pages_rcu);
1947 return;
1948 }
1949
1757 do { 1950 do {
1758 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 1951 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1759 WARN_ON(!sp->role.invalid || sp->root_count); 1952 WARN_ON(!sp->role.invalid || sp->root_count);
1760 kvm_mmu_free_page(kvm, sp); 1953 kvm_mmu_isolate_page(sp);
1954 kvm_mmu_free_page(sp);
1761 } while (!list_empty(invalid_list)); 1955 } while (!list_empty(invalid_list));
1762 1956
1763} 1957}
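
The commit path above now distinguishes two cases: when kvm->arch.reader_counter shows that lockless shadow-page walkers may still be running, the zapped pages are isolated and handed to call_rcu() so they are only freed after those readers finish; otherwise they are freed immediately. Below is a minimal user-space sketch of that decision (not kernel code): a plain deferred list and an explicit quiescent() step stand in for RCU, and commit_zap(), free_page_now() and quiescent() are names invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct page {
    int id;
    struct page *next;              /* singly linked "invalid list" */
};

static int reader_counter;          /* models kvm->arch.reader_counter */
static struct page *deferred;       /* pages whose free was postponed */

static void free_page_now(struct page *p)
{
    printf("freeing page %d\n", p->id);
    free(p);
}

/* models kvm_mmu_commit_zap_page(): the invalid list is consumed either way */
static void commit_zap(struct page *invalid_list)
{
    if (reader_counter) {
        /* defer: lockless readers may still be walking these pages */
        while (invalid_list) {
            struct page *p = invalid_list;
            invalid_list = p->next;
            p->next = deferred;
            deferred = p;
            printf("deferring free of page %d\n", p->id);
        }
        return;
    }
    while (invalid_list) {
        struct page *p = invalid_list;
        invalid_list = p->next;
        free_page_now(p);
    }
}

/* stand-in for the RCU callback that runs once all readers are done */
static void quiescent(void)
{
    while (deferred) {
        struct page *p = deferred;
        deferred = p->next;
        free_page_now(p);
    }
}

int main(void)
{
    struct page *a = malloc(sizeof *a), *b = malloc(sizeof *b);

    a->id = 1; a->next = NULL;
    b->id = 2; b->next = a;

    reader_counter = 1;             /* a lockless walker is active */
    commit_zap(b);                  /* both pages are deferred */
    reader_counter = 0;
    quiescent();                    /* now they are actually freed */
    return 0;
}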
@@ -1783,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1783 page = container_of(kvm->arch.active_mmu_pages.prev, 1977 page = container_of(kvm->arch.active_mmu_pages.prev,
1784 struct kvm_mmu_page, link); 1978 struct kvm_mmu_page, link);
1785 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); 1979 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1786 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787 } 1980 }
1981 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1788 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 1982 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1789 } 1983 }
1790 1984
@@ -1833,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1833 __set_bit(slot, sp->slot_bitmap); 2027 __set_bit(slot, sp->slot_bitmap);
1834} 2028}
1835 2029
1836static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1837{
1838 int i;
1839 u64 *pt = sp->spt;
1840
1841 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1842 return;
1843
1844 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1845 if (pt[i] == shadow_notrap_nonpresent_pte)
1846 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1847 }
1848}
1849
1850/* 2030/*
1851 * The function is based on mtrr_type_lookup() in 2031 * The function is based on mtrr_type_lookup() in
1852 * arch/x86/kernel/cpu/mtrr/generic.c 2032 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1959 sp->unsync = 1; 2139 sp->unsync = 1;
1960 2140
1961 kvm_mmu_mark_parents_unsync(sp); 2141 kvm_mmu_mark_parents_unsync(sp);
1962 mmu_convert_notrap(sp);
1963} 2142}
1964 2143
1965static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 2144static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -2002,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2002 2181
2003static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2182static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 unsigned pte_access, int user_fault, 2183 unsigned pte_access, int user_fault,
2005 int write_fault, int dirty, int level, 2184 int write_fault, int level,
2006 gfn_t gfn, pfn_t pfn, bool speculative, 2185 gfn_t gfn, pfn_t pfn, bool speculative,
2007 bool can_unsync, bool host_writable) 2186 bool can_unsync, bool host_writable)
2008{ 2187{
2009 u64 spte, entry = *sptep; 2188 u64 spte, entry = *sptep;
2010 int ret = 0; 2189 int ret = 0;
2011 2190
2191 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
2192 return 0;
2193
2012 /* 2194 /*
2013 * We don't set the accessed bit, since we sometimes want to see 2195 * We don't set the accessed bit, since we sometimes want to see
2014 * whether the guest actually used the pte (in order to detect 2196 * whether the guest actually used the pte (in order to detect
@@ -2017,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2017 spte = PT_PRESENT_MASK; 2199 spte = PT_PRESENT_MASK;
2018 if (!speculative) 2200 if (!speculative)
2019 spte |= shadow_accessed_mask; 2201 spte |= shadow_accessed_mask;
2020 if (!dirty)
2021 pte_access &= ~ACC_WRITE_MASK;
2202
2022 if (pte_access & ACC_EXEC_MASK) 2203 if (pte_access & ACC_EXEC_MASK)
2023 spte |= shadow_x_mask; 2204 spte |= shadow_x_mask;
2024 else 2205 else
@@ -2045,15 +2226,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 if (level > PT_PAGE_TABLE_LEVEL && 2226 if (level > PT_PAGE_TABLE_LEVEL &&
2046 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2227 has_wrprotected_page(vcpu->kvm, gfn, level)) {
2047 ret = 1; 2228 ret = 1;
2048 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2229 drop_spte(vcpu->kvm, sptep);
2049 goto done; 2230 goto done;
2050 } 2231 }
2051 2232
2052 spte |= PT_WRITABLE_MASK; 2233 spte |= PT_WRITABLE_MASK;
2053 2234
2054 if (!vcpu->arch.mmu.direct_map 2235 if (!vcpu->arch.mmu.direct_map
2055 && !(pte_access & ACC_WRITE_MASK)) 2236 && !(pte_access & ACC_WRITE_MASK)) {
2056 spte &= ~PT_USER_MASK; 2237 spte &= ~PT_USER_MASK;
2238 /*
2239 * If we converted a user page to a kernel page so that the
2240 * kernel can write to it when cr0.wp=0, we should also prevent
2241 * the kernel from executing it if SMEP is enabled.
2243 */
2244 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
2245 spte |= PT64_NX_MASK;
2246 }
2057 2247
2058 /* 2248 /*
2059 * Optimization: for pte sync, if spte was writable the hash 2249 * Optimization: for pte sync, if spte was writable the hash
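
The hunk above handles the cr0.wp=0 case: when the guest pte does not grant write access but the shadow pte has to (because the guest kernel may still write with cr0.wp=0), the user bit is cleared so guest user mode keeps its read-only view, and with CR4.SMEP set the NX bit is added so the guest kernel cannot execute from that page. A small stand-alone sketch of the bit manipulation, using the architectural x86 PTE bit positions; make_writable_spte() is a name invented for the example, not a kvm function.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define PT_PRESENT_MASK   (1ull << 0)
#define PT_WRITABLE_MASK  (1ull << 1)
#define PT_USER_MASK      (1ull << 2)
#define PT64_NX_MASK      (1ull << 63)

#define ACC_WRITE_MASK    PT_WRITABLE_MASK

static uint64_t make_writable_spte(uint64_t spte, unsigned pte_access,
                                   bool direct_map, bool cr4_smep)
{
    spte |= PT_WRITABLE_MASK;

    if (!direct_map && !(pte_access & ACC_WRITE_MASK)) {
        /* guest pte was read-only: hide this mapping from user mode ... */
        spte &= ~PT_USER_MASK;
        /* ... and from kernel-mode instruction fetches if SMEP is on */
        if (cr4_smep)
            spte |= PT64_NX_MASK;
    }
    return spte;
}

int main(void)
{
    uint64_t spte = PT_PRESENT_MASK | PT_USER_MASK;

    spte = make_writable_spte(spte, 0 /* no ACC_WRITE */, false, true);
    printf("spte = %#llx (user=%d nx=%d)\n",
           (unsigned long long)spte,
           !!(spte & PT_USER_MASK), !!(spte & PT64_NX_MASK));
    return 0;
}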
@@ -2078,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2078 mark_page_dirty(vcpu->kvm, gfn); 2268 mark_page_dirty(vcpu->kvm, gfn);
2079 2269
2080set_pte: 2270set_pte:
2081 update_spte(sptep, spte); 2271 mmu_spte_update(sptep, spte);
2082 /* 2272 /*
2083 * If we overwrite a writable spte with a read-only one we 2273 * If we overwrite a writable spte with a read-only one we
2084 * should flush remote TLBs. Otherwise rmap_write_protect 2274 * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2093,8 +2283,8 @@ done:
2093 2283
2094static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2284static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2095 unsigned pt_access, unsigned pte_access, 2285 unsigned pt_access, unsigned pte_access,
2096 int user_fault, int write_fault, int dirty, 2286 int user_fault, int write_fault,
2097 int *ptwrite, int level, gfn_t gfn, 2287 int *emulate, int level, gfn_t gfn,
2098 pfn_t pfn, bool speculative, 2288 pfn_t pfn, bool speculative,
2099 bool host_writable) 2289 bool host_writable)
2100{ 2290{
@@ -2117,26 +2307,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2117 u64 pte = *sptep; 2307 u64 pte = *sptep;
2118 2308
2119 child = page_header(pte & PT64_BASE_ADDR_MASK); 2309 child = page_header(pte & PT64_BASE_ADDR_MASK);
2120 mmu_page_remove_parent_pte(child, sptep);
2121 __set_spte(sptep, shadow_trap_nonpresent_pte);
2310 drop_parent_pte(child, sptep);
2122 kvm_flush_remote_tlbs(vcpu->kvm); 2311 kvm_flush_remote_tlbs(vcpu->kvm);
2123 } else if (pfn != spte_to_pfn(*sptep)) { 2312 } else if (pfn != spte_to_pfn(*sptep)) {
2124 pgprintk("hfn old %llx new %llx\n", 2313 pgprintk("hfn old %llx new %llx\n",
2125 spte_to_pfn(*sptep), pfn); 2314 spte_to_pfn(*sptep), pfn);
2126 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2315 drop_spte(vcpu->kvm, sptep);
2127 kvm_flush_remote_tlbs(vcpu->kvm); 2316 kvm_flush_remote_tlbs(vcpu->kvm);
2128 } else 2317 } else
2129 was_rmapped = 1; 2318 was_rmapped = 1;
2130 } 2319 }
2131 2320
2132 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2321 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2133 dirty, level, gfn, pfn, speculative, true, 2322 level, gfn, pfn, speculative, true,
2134 host_writable)) { 2323 host_writable)) {
2135 if (write_fault) 2324 if (write_fault)
2136 *ptwrite = 1; 2325 *emulate = 1;
2137 kvm_mmu_flush_tlb(vcpu); 2326 kvm_mmu_flush_tlb(vcpu);
2138 } 2327 }
2139 2328
2329 if (unlikely(is_mmio_spte(*sptep) && emulate))
2330 *emulate = 1;
2331
2140 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2332 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2141 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2333 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2142 is_large_pte(*sptep)? "2MB" : "4kB", 2334 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2145,11 +2337,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2145 if (!was_rmapped && is_large_pte(*sptep)) 2337 if (!was_rmapped && is_large_pte(*sptep))
2146 ++vcpu->kvm->stat.lpages; 2338 ++vcpu->kvm->stat.lpages;
2147 2339
2148 page_header_update_slot(vcpu->kvm, sptep, gfn);
2149 if (!was_rmapped) {
2150 rmap_count = rmap_add(vcpu, sptep, gfn);
2151 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2152 rmap_recycle(vcpu, sptep, gfn);
2340 if (is_shadow_present_pte(*sptep)) {
2341 page_header_update_slot(vcpu->kvm, sptep, gfn);
2342 if (!was_rmapped) {
2343 rmap_count = rmap_add(vcpu, sptep, gfn);
2344 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2345 rmap_recycle(vcpu, sptep, gfn);
2346 }
2153 } 2347 }
2154 kvm_release_pfn_clean(pfn); 2348 kvm_release_pfn_clean(pfn);
2155 if (speculative) { 2349 if (speculative) {
@@ -2170,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2170 2364
2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2365 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 if (!slot) { 2366 if (!slot) {
2173 get_page(bad_page); 2367 get_page(fault_page);
2174 return page_to_pfn(bad_page); 2368 return page_to_pfn(fault_page);
2175 } 2369 }
2176 2370
2177 hva = gfn_to_hva_memslot(slot, gfn); 2371 hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2198 2392
2199 for (i = 0; i < ret; i++, gfn++, start++) 2393 for (i = 0; i < ret; i++, gfn++, start++)
2200 mmu_set_spte(vcpu, start, ACC_ALL, 2394 mmu_set_spte(vcpu, start, ACC_ALL,
2201 access, 0, 0, 1, NULL, 2395 access, 0, 0, NULL,
2202 sp->role.level, gfn, 2396 sp->role.level, gfn,
2203 page_to_pfn(pages[i]), true, true); 2397 page_to_pfn(pages[i]), true, true);
2204 2398
@@ -2217,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2217 spte = sp->spt + i; 2411 spte = sp->spt + i;
2218 2412
2219 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2413 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { 2414 if (is_shadow_present_pte(*spte) || spte == sptep) {
2221 if (!start) 2415 if (!start)
2222 continue; 2416 continue;
2223 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2417 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2254{ 2448{
2255 struct kvm_shadow_walk_iterator iterator; 2449 struct kvm_shadow_walk_iterator iterator;
2256 struct kvm_mmu_page *sp; 2450 struct kvm_mmu_page *sp;
2257 int pt_write = 0; 2451 int emulate = 0;
2258 gfn_t pseudo_gfn; 2452 gfn_t pseudo_gfn;
2259 2453
2260 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2454 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2262 unsigned pte_access = ACC_ALL; 2456 unsigned pte_access = ACC_ALL;
2263 2457
2264 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 2458 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2265 0, write, 1, &pt_write, 2459 0, write, &emulate,
2266 level, gfn, pfn, prefault, map_writable); 2460 level, gfn, pfn, prefault, map_writable);
2267 direct_pte_prefetch(vcpu, iterator.sptep); 2461 direct_pte_prefetch(vcpu, iterator.sptep);
2268 ++vcpu->stat.pf_fixed; 2462 ++vcpu->stat.pf_fixed;
2269 break; 2463 break;
2270 } 2464 }
2271 2465
2272 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2466 if (!is_shadow_present_pte(*iterator.sptep)) {
2273 u64 base_addr = iterator.addr; 2467 u64 base_addr = iterator.addr;
2274 2468
2275 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2469 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2283,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2283 return -ENOMEM; 2477 return -ENOMEM;
2284 } 2478 }
2285 2479
2286 __set_spte(iterator.sptep, 2480 mmu_spte_set(iterator.sptep,
2287 __pa(sp->spt) 2481 __pa(sp->spt)
2288 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2482 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2289 | shadow_user_mask | shadow_x_mask 2483 | shadow_user_mask | shadow_x_mask
2290 | shadow_accessed_mask); 2484 | shadow_accessed_mask);
2291 } 2485 }
2292 } 2486 }
2293 return pt_write; 2487 return emulate;
2294} 2488}
2295 2489
2296static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 2490static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,16 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2306 send_sig_info(SIGBUS, &info, tsk); 2500 send_sig_info(SIGBUS, &info, tsk);
2307} 2501}
2308 2502
2309static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2503static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2310{ 2504{
2311 kvm_release_pfn_clean(pfn); 2505 kvm_release_pfn_clean(pfn);
2312 if (is_hwpoison_pfn(pfn)) { 2506 if (is_hwpoison_pfn(pfn)) {
2313 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); 2507 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2314 return 0; 2508 return 0;
2315 } else if (is_fault_pfn(pfn)) 2509 }
2316 return -EFAULT;
2317 2510
2318 return 1; 2511 return -EFAULT;
2319} 2512}
2320 2513
2321static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2514static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2360,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2360 } 2553 }
2361} 2554}
2362 2555
2556static bool mmu_invalid_pfn(pfn_t pfn)
2557{
2558 return unlikely(is_invalid_pfn(pfn));
2559}
2560
2561static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2562 pfn_t pfn, unsigned access, int *ret_val)
2563{
2564 bool ret = true;
2565
2566 /* The pfn is invalid, report the error! */
2567 if (unlikely(is_invalid_pfn(pfn))) {
2568 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
2569 goto exit;
2570 }
2571
2572 if (unlikely(is_noslot_pfn(pfn)))
2573 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
2574
2575 ret = false;
2576exit:
2577 return ret;
2578}
2579
2363static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2580static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2581 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365 2582
@@ -2394,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2394 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 2611 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 return 0; 2612 return 0;
2396 2613
2397 /* mmio */
2398 if (is_error_pfn(pfn))
2399 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2614 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
2615 return r;
2400 2616
2401 spin_lock(&vcpu->kvm->mmu_lock); 2617 spin_lock(&vcpu->kvm->mmu_lock);
2402 if (mmu_notifier_retry(vcpu, mmu_seq)) 2618 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2839,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2623 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2839 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2624 return; 2840 return;
2625 2841
2842 vcpu_clear_mmio_info(vcpu, ~0ul);
2626 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 2843 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 2844 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2628 hpa_t root = vcpu->arch.mmu.root_hpa; 2845 hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2667,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2667 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2884 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668} 2885}
2669 2886
2887static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2888{
2889 if (direct)
2890 return vcpu_match_mmio_gpa(vcpu, addr);
2891
2892 return vcpu_match_mmio_gva(vcpu, addr);
2893}
2894
2895
2896/*
2897 * On direct hosts, the last spte only allows two states
2898 * for an mmio page fault:
2899 * - It is the mmio spte
2900 * - It is zapped or it is being zapped.
2901 *
2902 * This function completely checks the spte when the last spte
2903 * is not the mmio spte.
2904 */
2905static bool check_direct_spte_mmio_pf(u64 spte)
2906{
2907 return __check_direct_spte_mmio_pf(spte);
2908}
2909
2910static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
2911{
2912 struct kvm_shadow_walk_iterator iterator;
2913 u64 spte = 0ull;
2914
2915 walk_shadow_page_lockless_begin(vcpu);
2916 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
2917 if (!is_shadow_present_pte(spte))
2918 break;
2919 walk_shadow_page_lockless_end(vcpu);
2920
2921 return spte;
2922}
2923
2924/*
2925 * If it is a real mmio page fault, return 1 and emulate the
2926 * instruction directly; return 0 to let the CPU fault again on
2927 * the address; -1 is returned if a bug is detected.
2928 */
2929int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2930{
2931 u64 spte;
2932
2933 if (quickly_check_mmio_pf(vcpu, addr, direct))
2934 return 1;
2935
2936 spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
2937
2938 if (is_mmio_spte(spte)) {
2939 gfn_t gfn = get_mmio_spte_gfn(spte);
2940 unsigned access = get_mmio_spte_access(spte);
2941
2942 if (direct)
2943 addr = 0;
2944
2945 trace_handle_mmio_page_fault(addr, gfn, access);
2946 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
2947 return 1;
2948 }
2949
2950 /*
2951 * It's ok if the gva is remapped by other cpus on a guest using
2952 * shadow paging, but it's a BUG if the gfn is not an mmio page.
2953 */
2954 if (direct && !check_direct_spte_mmio_pf(spte))
2955 return -1;
2956
2957 /*
2958 * If the page table was zapped by other cpus, let the CPU fault
2959 * again on the address.
2960 */
2961 return 0;
2962}
2963EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
2964
2965static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
2966 u32 error_code, bool direct)
2967{
2968 int ret;
2969
2970 ret = handle_mmio_page_fault_common(vcpu, addr, direct);
2971 WARN_ON(ret < 0);
2972 return ret;
2973}
2974
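handle_mmio_page_fault_common() above returns 1 for a genuine MMIO fault (the access should be emulated), 0 when the walk found nothing and the CPU should simply fault again, and a negative value when a bug is detected. The sketch below shows how a caller is expected to dispatch on that convention; it is a user-space model with stubbed helpers, and handle_fault() plus both stubs are names invented for illustration.

#include <stdio.h>

static int handle_mmio_page_fault_common_stub(unsigned long long addr)
{
    /* pretend every address below 1 MiB hit an MMIO spte */
    return addr < 0x100000 ? 1 : 0;
}

static void emulate_instruction(unsigned long long addr)
{
    printf("emulating MMIO access at %#llx\n", addr);
}

static int handle_fault(unsigned long long addr)
{
    int ret = handle_mmio_page_fault_common_stub(addr);

    if (ret < 0)        /* a bug: neither MMIO nor a sane shadow entry */
        return -1;
    if (ret == 1)       /* real MMIO fault: emulate instead of mapping */
        emulate_instruction(addr);
    /* ret == 0: the page table changed under us, just resume the guest */
    return ret;
}

int main(void)
{
    handle_fault(0xb8000);      /* treated as MMIO by the stub */
    handle_fault(0x200000);     /* retried by the guest */
    return 0;
}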
2670static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2975static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2671 u32 error_code, bool prefault) 2976 u32 error_code, bool prefault)
2672{ 2977{
@@ -2674,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2674 int r; 2979 int r;
2675 2980
2676 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 2981 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2982
2983 if (unlikely(error_code & PFERR_RSVD_MASK))
2984 return handle_mmio_page_fault(vcpu, gva, error_code, true);
2985
2677 r = mmu_topup_memory_caches(vcpu); 2986 r = mmu_topup_memory_caches(vcpu);
2678 if (r) 2987 if (r)
2679 return r; 2988 return r;
@@ -2750,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2750 ASSERT(vcpu); 3059 ASSERT(vcpu);
2751 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3060 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2752 3061
3062 if (unlikely(error_code & PFERR_RSVD_MASK))
3063 return handle_mmio_page_fault(vcpu, gpa, error_code, true);
3064
2753 r = mmu_topup_memory_caches(vcpu); 3065 r = mmu_topup_memory_caches(vcpu);
2754 if (r) 3066 if (r)
2755 return r; 3067 return r;
@@ -2767,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2767 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3079 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 return 0; 3080 return 0;
2769 3081
2770 /* mmio */
2771 if (is_error_pfn(pfn))
2772 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
3082 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
3083 return r;
3084
2773 spin_lock(&vcpu->kvm->mmu_lock); 3085 spin_lock(&vcpu->kvm->mmu_lock);
2774 if (mmu_notifier_retry(vcpu, mmu_seq)) 3086 if (mmu_notifier_retry(vcpu, mmu_seq))
2775 goto out_unlock; 3087 goto out_unlock;
@@ -2800,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2800 context->page_fault = nonpaging_page_fault; 3112 context->page_fault = nonpaging_page_fault;
2801 context->gva_to_gpa = nonpaging_gva_to_gpa; 3113 context->gva_to_gpa = nonpaging_gva_to_gpa;
2802 context->free = nonpaging_free; 3114 context->free = nonpaging_free;
2803 context->prefetch_page = nonpaging_prefetch_page;
2804 context->sync_page = nonpaging_sync_page; 3115 context->sync_page = nonpaging_sync_page;
2805 context->invlpg = nonpaging_invlpg; 3116 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte; 3117 context->update_pte = nonpaging_update_pte;
@@ -2848,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2848 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3159 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2849} 3160}
2850 3161
3162static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3163 int *nr_present)
3164{
3165 if (unlikely(is_mmio_spte(*sptep))) {
3166 if (gfn != get_mmio_spte_gfn(*sptep)) {
3167 mmu_spte_clear_no_track(sptep);
3168 return true;
3169 }
3170
3171 (*nr_present)++;
3172 mark_mmio_spte(sptep, gfn, access);
3173 return true;
3174 }
3175
3176 return false;
3177}
3178
2851#define PTTYPE 64 3179#define PTTYPE 64
2852#include "paging_tmpl.h" 3180#include "paging_tmpl.h"
2853#undef PTTYPE 3181#undef PTTYPE
@@ -2930,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2930 context->new_cr3 = paging_new_cr3; 3258 context->new_cr3 = paging_new_cr3;
2931 context->page_fault = paging64_page_fault; 3259 context->page_fault = paging64_page_fault;
2932 context->gva_to_gpa = paging64_gva_to_gpa; 3260 context->gva_to_gpa = paging64_gva_to_gpa;
2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 3261 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 3262 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte; 3263 context->update_pte = paging64_update_pte;
@@ -2959,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2959 context->page_fault = paging32_page_fault; 3286 context->page_fault = paging32_page_fault;
2960 context->gva_to_gpa = paging32_gva_to_gpa; 3287 context->gva_to_gpa = paging32_gva_to_gpa;
2961 context->free = paging_free; 3288 context->free = paging_free;
2962 context->prefetch_page = paging32_prefetch_page;
2963 context->sync_page = paging32_sync_page; 3289 context->sync_page = paging32_sync_page;
2964 context->invlpg = paging32_invlpg; 3290 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte; 3291 context->update_pte = paging32_update_pte;
@@ -2984,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2984 context->new_cr3 = nonpaging_new_cr3; 3310 context->new_cr3 = nonpaging_new_cr3;
2985 context->page_fault = tdp_page_fault; 3311 context->page_fault = tdp_page_fault;
2986 context->free = nonpaging_free; 3312 context->free = nonpaging_free;
2987 context->prefetch_page = nonpaging_prefetch_page;
2988 context->sync_page = nonpaging_sync_page; 3313 context->sync_page = nonpaging_sync_page;
2989 context->invlpg = nonpaging_invlpg; 3314 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte; 3315 context->update_pte = nonpaging_update_pte;
@@ -3023,6 +3348,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3023int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3348int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3024{ 3349{
3025 int r; 3350 int r;
3351 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3026 ASSERT(vcpu); 3352 ASSERT(vcpu);
3027 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3353 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3028 3354
@@ -3037,6 +3363,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3037 3363
3038 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3364 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3039 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3365 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3366 vcpu->arch.mmu.base_role.smep_andnot_wp
3367 = smep && !is_write_protection(vcpu);
3040 3368
3041 return r; 3369 return r;
3042} 3370}
@@ -3141,27 +3469,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3141} 3469}
3142EXPORT_SYMBOL_GPL(kvm_mmu_unload); 3470EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3143 3471
3144static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3145 struct kvm_mmu_page *sp,
3146 u64 *spte)
3147{
3148 u64 pte;
3149 struct kvm_mmu_page *child;
3150
3151 pte = *spte;
3152 if (is_shadow_present_pte(pte)) {
3153 if (is_last_spte(pte, sp->role.level))
3154 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3155 else {
3156 child = page_header(pte & PT64_BASE_ADDR_MASK);
3157 mmu_page_remove_parent_pte(child, spte);
3158 }
3159 }
3160 __set_spte(spte, shadow_trap_nonpresent_pte);
3161 if (is_large_pte(pte))
3162 --vcpu->kvm->stat.lpages;
3163}
3164
3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3472static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3166 struct kvm_mmu_page *sp, u64 *spte, 3473 struct kvm_mmu_page *sp, u64 *spte,
3167 const void *new) 3474 const void *new)
@@ -3233,6 +3540,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3233 int level, npte, invlpg_counter, r, flooded = 0; 3540 int level, npte, invlpg_counter, r, flooded = 0;
3234 bool remote_flush, local_flush, zap_page; 3541 bool remote_flush, local_flush, zap_page;
3235 3542
3543 /*
3544 * If we don't have indirect shadow pages, it means no page is
3545 * write-protected, so we can exit simply.
3546 */
3547 if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
3548 return;
3549
3236 zap_page = remote_flush = local_flush = false; 3550 zap_page = remote_flush = local_flush = false;
3237 offset = offset_in_page(gpa); 3551 offset = offset_in_page(gpa);
3238 3552
@@ -3336,7 +3650,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3336 spte = &sp->spt[page_offset / sizeof(*spte)]; 3650 spte = &sp->spt[page_offset / sizeof(*spte)];
3337 while (npte--) { 3651 while (npte--) {
3338 entry = *spte; 3652 entry = *spte;
3339 mmu_pte_write_zap_pte(vcpu, sp, spte); 3653 mmu_page_zap_pte(vcpu->kvm, sp, spte);
3340 if (gentry && 3654 if (gentry &&
3341 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3655 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3342 & mask.word)) 3656 & mask.word))
@@ -3380,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3380 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3694 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3381 struct kvm_mmu_page, link); 3695 struct kvm_mmu_page, link);
3382 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 3696 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3383 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3384 ++vcpu->kvm->stat.mmu_recycled; 3697 ++vcpu->kvm->stat.mmu_recycled;
3385 } 3698 }
3699 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386} 3700}
3387 3701
3388int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, 3702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3506,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3506 continue; 3820 continue;
3507 3821
3508 if (is_large_pte(pt[i])) { 3822 if (is_large_pte(pt[i])) {
3509 drop_spte(kvm, &pt[i], 3823 drop_spte(kvm, &pt[i]);
3510 shadow_trap_nonpresent_pte);
3511 --kvm->stat.lpages; 3824 --kvm->stat.lpages;
3512 continue; 3825 continue;
3513 } 3826 }
3514 3827
3515 /* avoid RMW */ 3828 /* avoid RMW */
3516 if (is_writable_pte(pt[i])) 3829 if (is_writable_pte(pt[i]))
3517 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3830 mmu_spte_update(&pt[i],
3831 pt[i] & ~PT_WRITABLE_MASK);
3518 } 3832 }
3519 } 3833 }
3520 kvm_flush_remote_tlbs(kvm); 3834 kvm_flush_remote_tlbs(kvm);
@@ -3590,25 +3904,18 @@ static struct shrinker mmu_shrinker = {
3590 3904
3591static void mmu_destroy_caches(void) 3905static void mmu_destroy_caches(void)
3592{ 3906{
3593 if (pte_chain_cache) 3907 if (pte_list_desc_cache)
3594 kmem_cache_destroy(pte_chain_cache); 3908 kmem_cache_destroy(pte_list_desc_cache);
3595 if (rmap_desc_cache)
3596 kmem_cache_destroy(rmap_desc_cache);
3597 if (mmu_page_header_cache) 3909 if (mmu_page_header_cache)
3598 kmem_cache_destroy(mmu_page_header_cache); 3910 kmem_cache_destroy(mmu_page_header_cache);
3599} 3911}
3600 3912
3601int kvm_mmu_module_init(void) 3913int kvm_mmu_module_init(void)
3602{ 3914{
3603 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3915 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
3604 sizeof(struct kvm_pte_chain), 3916 sizeof(struct pte_list_desc),
3605 0, 0, NULL);
3606 if (!pte_chain_cache)
3607 goto nomem;
3608 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3609 sizeof(struct kvm_rmap_desc),
3610 0, 0, NULL); 3917 0, 0, NULL);
3611 if (!rmap_desc_cache) 3918 if (!pte_list_desc_cache)
3612 goto nomem; 3919 goto nomem;
3613 3920
3614 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 3921 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
@@ -3775,16 +4082,17 @@ out:
3775int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) 4082int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3776{ 4083{
3777 struct kvm_shadow_walk_iterator iterator; 4084 struct kvm_shadow_walk_iterator iterator;
4085 u64 spte;
3778 int nr_sptes = 0; 4086 int nr_sptes = 0;
3779 4087
3780 spin_lock(&vcpu->kvm->mmu_lock); 4088 walk_shadow_page_lockless_begin(vcpu);
3781 for_each_shadow_entry(vcpu, addr, iterator) { 4089 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3782 sptes[iterator.level-1] = *iterator.sptep; 4090 sptes[iterator.level-1] = spte;
3783 nr_sptes++; 4091 nr_sptes++;
3784 if (!is_shadow_present_pte(*iterator.sptep)) 4092 if (!is_shadow_present_pte(spte))
3785 break; 4093 break;
3786 } 4094 }
3787 spin_unlock(&vcpu->kvm->mmu_lock); 4095 walk_shadow_page_lockless_end(vcpu);
3788 4096
3789 return nr_sptes; 4097 return nr_sptes;
3790} 4098}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca85d3e..e374db9af02 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
53int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 54int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53 55
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 56static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte)
76 return pte & PT_PRESENT_MASK; 78 return pte & PT_PRESENT_MASK;
77} 79}
78 80
81static inline int is_writable_pte(unsigned long pte)
82{
83 return pte & PT_WRITABLE_MASK;
84}
85
86static inline bool is_write_protection(struct kvm_vcpu *vcpu)
87{
88 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
89}
90
91static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
92 bool write_fault, bool user_fault,
93 unsigned long pte)
94{
95 if (unlikely(write_fault && !is_writable_pte(pte)
96 && (user_fault || is_write_protection(vcpu))))
97 return false;
98
99 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
100 return false;
101
102 return true;
103}
79#endif 104#endif
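
check_write_user_access() added above folds together the two permission checks that walk_addr_generic() used to open-code. The following user-space rendering of the same predicate, with cr0.wp passed as a plain flag instead of is_write_protection(vcpu), prints a small truth table; it is an illustration, not the kernel helper itself.

#include <stdio.h>
#include <stdbool.h>

#define PT_WRITABLE_MASK  (1ul << 1)
#define PT_USER_MASK      (1ul << 2)

static bool check_write_user_access(bool cr0_wp, bool write_fault,
                                    bool user_fault, unsigned long pte)
{
    /* a write is refused if the gpte is read-only and either the access came
     * from user mode or CR0.WP forbids supervisor writes to read-only pages */
    if (write_fault && !(pte & PT_WRITABLE_MASK) && (user_fault || cr0_wp))
        return false;

    /* a user-mode access is refused if the gpte is supervisor-only */
    if (user_fault && !(pte & PT_USER_MASK))
        return false;

    return true;
}

int main(void)
{
    unsigned long ro_user = PT_USER_MASK;        /* read-only, user page */
    unsigned long rw_kern = PT_WRITABLE_MASK;    /* writable, kernel-only page */

    printf("user write to ro page:    %d\n",
           check_write_user_access(true, true, true, ro_user));   /* 0 */
    printf("kernel write, cr0.wp=0:   %d\n",
           check_write_user_access(false, true, false, ro_user)); /* 1 */
    printf("user read of kernel page: %d\n",
           check_write_user_access(true, false, true, rw_kern));  /* 0 */
    return 0;
}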
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5f6223b8bcf..2460a265be2 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
99 "level = %d\n", sp, level); 99 "level = %d\n", sp, level);
100 return; 100 return;
101 } 101 }
102
103 if (*sptep == shadow_notrap_nonpresent_pte) {
104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
113 return;
114 } 102 }
115 103
116 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) 104 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fdb3ed..eed67f34146 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198 198
199DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
200 TP_PROTO(struct kvm_mmu_page *sp),
201
202 TP_ARGS(sp)
203);
204
205TRACE_EVENT(
206 mark_mmio_spte,
207 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
208 TP_ARGS(sptep, gfn, access),
209
210 TP_STRUCT__entry(
211 __field(void *, sptep)
212 __field(gfn_t, gfn)
213 __field(unsigned, access)
214 ),
215
216 TP_fast_assign(
217 __entry->sptep = sptep;
218 __entry->gfn = gfn;
219 __entry->access = access;
220 ),
221
222 TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
223 __entry->access)
224);
225
226TRACE_EVENT(
227 handle_mmio_page_fault,
228 TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
229 TP_ARGS(addr, gfn, access),
230
231 TP_STRUCT__entry(
232 __field(u64, addr)
233 __field(gfn_t, gfn)
234 __field(unsigned, access)
235 ),
236
237 TP_fast_assign(
238 __entry->addr = addr;
239 __entry->gfn = gfn;
240 __entry->access = access;
241 ),
242
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access)
245);
246
199TRACE_EVENT( 247TRACE_EVENT(
200 kvm_mmu_audit, 248 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), 249 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9d03ad4dd5e..507e2b844cf 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
101 return (ret != orig_pte); 101 return (ret != orig_pte);
102} 102}
103 103
104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) 104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
105 bool last)
105{ 106{
106 unsigned access; 107 unsigned access;
107 108
108 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 109 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
110 if (last && !is_dirty_gpte(gpte))
111 access &= ~ACC_WRITE_MASK;
112
109#if PTTYPE == 64 113#if PTTYPE == 64
110 if (vcpu->arch.mmu.nx) 114 if (vcpu->arch.mmu.nx)
111 access &= ~(gpte >> PT64_NX_SHIFT); 115 access &= ~(gpte >> PT64_NX_SHIFT);
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113 return access; 117 return access;
114} 118}
115 119
120static bool FNAME(is_last_gpte)(struct guest_walker *walker,
121 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
122 pt_element_t gpte)
123{
124 if (walker->level == PT_PAGE_TABLE_LEVEL)
125 return true;
126
127 if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
128 (PTTYPE == 64 || is_pse(vcpu)))
129 return true;
130
131 if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
132 (mmu->root_level == PT64_ROOT_LEVEL))
133 return true;
134
135 return false;
136}
137
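Together, FNAME(is_last_gpte)() and the new "last" argument of FNAME(gpte_access)() mean the dirty bit only gates write permission on the pte that actually maps the data page. A compact user-space model of the two helpers follows; it ignores the NX/exec bits, uses the x86 bit positions directly, and all local names are specific to this example rather than taken from kvm.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define PT_WRITABLE_MASK   (1ull << 1)
#define PT_USER_MASK       (1ull << 2)
#define PT_DIRTY_MASK      (1ull << 6)
#define PT_PAGE_SIZE_MASK  (1ull << 7)

enum { PT_PAGE_TABLE_LEVEL = 1, PT_DIRECTORY_LEVEL = 2, PT_PDPE_LEVEL = 3 };

static bool is_last_gpte(int level, uint64_t gpte, bool pae64_root)
{
    if (level == PT_PAGE_TABLE_LEVEL)
        return true;
    if (level == PT_DIRECTORY_LEVEL && (gpte & PT_PAGE_SIZE_MASK))
        return true;                        /* 2M/4M page */
    if (level == PT_PDPE_LEVEL && (gpte & PT_PAGE_SIZE_MASK) && pae64_root)
        return true;                        /* 1G page, 64-bit paging only */
    return false;
}

static unsigned gpte_access(uint64_t gpte, bool last)
{
    unsigned access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK);

    if (last && !(gpte & PT_DIRTY_MASK))
        access &= ~PT_WRITABLE_MASK;        /* defer write until dirty is set */
    return access;
}

int main(void)
{
    uint64_t clean = PT_WRITABLE_MASK | PT_USER_MASK;            /* D=0 */
    uint64_t large = clean | PT_PAGE_SIZE_MASK | PT_DIRTY_MASK;  /* 2M, D=1 */

    printf("non-last, clean: %#x\n", gpte_access(clean, false)); /* keeps W */
    printf("last, clean:     %#x\n", gpte_access(clean,
           is_last_gpte(PT_PAGE_TABLE_LEVEL, clean, true)));     /* drops W */
    printf("last, dirty 2M:  %#x\n", gpte_access(large,
           is_last_gpte(PT_DIRECTORY_LEVEL, large, true)));      /* keeps W */
    return 0;
}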
116/* 138/*
117 * Fetch a guest pte for a guest virtual address 139 * Fetch a guest pte for a guest virtual address
118 */ 140 */
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
125 gfn_t table_gfn; 147 gfn_t table_gfn;
126 unsigned index, pt_access, uninitialized_var(pte_access); 148 unsigned index, pt_access, uninitialized_var(pte_access);
127 gpa_t pte_gpa; 149 gpa_t pte_gpa;
128 bool eperm, present, rsvd_fault; 150 bool eperm;
129 int offset, write_fault, user_fault, fetch_fault; 151 int offset;
130 152 const int write_fault = access & PFERR_WRITE_MASK;
131 write_fault = access & PFERR_WRITE_MASK; 153 const int user_fault = access & PFERR_USER_MASK;
132 user_fault = access & PFERR_USER_MASK; 154 const int fetch_fault = access & PFERR_FETCH_MASK;
133 fetch_fault = access & PFERR_FETCH_MASK; 155 u16 errcode = 0;
134 156
135 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 157 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
136 fetch_fault); 158 fetch_fault);
137walk: 159retry_walk:
138 present = true; 160 eperm = false;
139 eperm = rsvd_fault = false;
140 walker->level = mmu->root_level; 161 walker->level = mmu->root_level;
141 pte = mmu->get_cr3(vcpu); 162 pte = mmu->get_cr3(vcpu);
142 163
@@ -144,10 +165,8 @@ walk:
144 if (walker->level == PT32E_ROOT_LEVEL) { 165 if (walker->level == PT32E_ROOT_LEVEL) {
145 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); 166 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
146 trace_kvm_mmu_paging_element(pte, walker->level); 167 trace_kvm_mmu_paging_element(pte, walker->level);
147 if (!is_present_gpte(pte)) { 168 if (!is_present_gpte(pte))
148 present = false;
149 goto error; 169 goto error;
150 }
151 --walker->level; 170 --walker->level;
152 } 171 }
153#endif 172#endif
@@ -170,42 +189,31 @@ walk:
170 189
171 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), 190 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
172 PFERR_USER_MASK|PFERR_WRITE_MASK); 191 PFERR_USER_MASK|PFERR_WRITE_MASK);
173 if (unlikely(real_gfn == UNMAPPED_GVA)) {
174 present = false;
175 break;
176 }
192 if (unlikely(real_gfn == UNMAPPED_GVA))
193 goto error;
177 real_gfn = gpa_to_gfn(real_gfn); 194 real_gfn = gpa_to_gfn(real_gfn);
178 195
179 host_addr = gfn_to_hva(vcpu->kvm, real_gfn); 196 host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
180 if (unlikely(kvm_is_error_hva(host_addr))) {
181 present = false;
182 break;
183 }
197 if (unlikely(kvm_is_error_hva(host_addr)))
198 goto error;
184 199
185 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 200 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
186 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
187 present = false;
188 break;
189 }
201 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
202 goto error;
190 203
191 trace_kvm_mmu_paging_element(pte, walker->level); 204 trace_kvm_mmu_paging_element(pte, walker->level);
192 205
193 if (unlikely(!is_present_gpte(pte))) {
194 present = false;
195 break;
196 }
206 if (unlikely(!is_present_gpte(pte)))
207 goto error;
197 208
198 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 209 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
199 walker->level))) { 210 walker->level))) {
200 rsvd_fault = true;
201 break;
211 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
212 goto error;
202 } 213 }
203 214
204 if (unlikely(write_fault && !is_writable_pte(pte) 215 if (!check_write_user_access(vcpu, write_fault, user_fault,
205 && (user_fault || is_write_protection(vcpu)))) 216 pte))
206 eperm = true;
207
208 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
209 eperm = true; 217 eperm = true;
210 218
211#if PTTYPE == 64 219#if PTTYPE == 64
@@ -213,39 +221,35 @@ walk:
213 eperm = true; 221 eperm = true;
214#endif 222#endif
215 223
216 if (!eperm && !rsvd_fault
217 && unlikely(!(pte & PT_ACCESSED_MASK))) {
224 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
218 int ret; 225 int ret;
219 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 226 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
220 sizeof(pte)); 227 sizeof(pte));
221 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 228 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
222 pte, pte|PT_ACCESSED_MASK); 229 pte, pte|PT_ACCESSED_MASK);
223 if (unlikely(ret < 0)) {
224 present = false;
225 break;
226 } else if (ret)
227 goto walk;
230 if (unlikely(ret < 0))
231 goto error;
232 else if (ret)
233 goto retry_walk;
228 234
229 mark_page_dirty(vcpu->kvm, table_gfn); 235 mark_page_dirty(vcpu->kvm, table_gfn);
230 pte |= PT_ACCESSED_MASK; 236 pte |= PT_ACCESSED_MASK;
231 } 237 }
232 238
233 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
234
235 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
236 240
237 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
238 ((walker->level == PT_DIRECTORY_LEVEL) &&
239 is_large_pte(pte) &&
240 (PTTYPE == 64 || is_pse(vcpu))) ||
241 ((walker->level == PT_PDPE_LEVEL) &&
242 is_large_pte(pte) &&
243 mmu->root_level == PT64_ROOT_LEVEL)) {
241 if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
244 int lvl = walker->level; 242 int lvl = walker->level;
245 gpa_t real_gpa; 243 gpa_t real_gpa;
246 gfn_t gfn; 244 gfn_t gfn;
247 u32 ac; 245 u32 ac;
248 246
247 /* check if the kernel is fetching from a user page */
248 if (unlikely(pte_access & PT_USER_MASK) &&
249 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
250 if (fetch_fault && !user_fault)
251 eperm = true;
252
249 gfn = gpte_to_gfn_lvl(pte, lvl); 253 gfn = gpte_to_gfn_lvl(pte, lvl);
250 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; 254 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
251 255
@@ -266,12 +270,14 @@ walk:
266 break; 270 break;
267 } 271 }
268 272
269 pt_access = pte_access; 273 pt_access &= FNAME(gpte_access)(vcpu, pte, false);
270 --walker->level; 274 --walker->level;
271 } 275 }
272 276
273 if (unlikely(!present || eperm || rsvd_fault))
274 goto error;
277 if (unlikely(eperm)) {
278 errcode |= PFERR_PRESENT_MASK;
279 goto error;
280 }
275 281
276 if (write_fault && unlikely(!is_dirty_gpte(pte))) { 282 if (write_fault && unlikely(!is_dirty_gpte(pte))) {
277 int ret; 283 int ret;
@@ -279,17 +285,17 @@ walk:
279 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 285 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
280 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 286 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
281 pte, pte|PT_DIRTY_MASK); 287 pte, pte|PT_DIRTY_MASK);
282 if (unlikely(ret < 0)) {
283 present = false;
284 goto error;
285 } else if (ret)
286 goto walk;
288 if (unlikely(ret < 0))
289 goto error;
290 else if (ret)
291 goto retry_walk;
287 292
288 mark_page_dirty(vcpu->kvm, table_gfn); 293 mark_page_dirty(vcpu->kvm, table_gfn);
289 pte |= PT_DIRTY_MASK; 294 pte |= PT_DIRTY_MASK;
290 walker->ptes[walker->level - 1] = pte; 295 walker->ptes[walker->level - 1] = pte;
291 } 296 }
292 297
298 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
293 walker->pt_access = pt_access; 299 walker->pt_access = pt_access;
294 walker->pte_access = pte_access; 300 walker->pte_access = pte_access;
295 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 301 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -297,19 +303,14 @@ walk:
297 return 1; 303 return 1;
298 304
299error: 305error:
306 errcode |= write_fault | user_fault;
307 if (fetch_fault && (mmu->nx ||
308 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
309 errcode |= PFERR_FETCH_MASK;
310
300 walker->fault.vector = PF_VECTOR; 311 walker->fault.vector = PF_VECTOR;
301 walker->fault.error_code_valid = true; 312 walker->fault.error_code_valid = true;
302 walker->fault.error_code = 0; 313 walker->fault.error_code = errcode;
303 if (present)
304 walker->fault.error_code |= PFERR_PRESENT_MASK;
305
306 walker->fault.error_code |= write_fault | user_fault;
307
308 if (fetch_fault && mmu->nx)
309 walker->fault.error_code |= PFERR_FETCH_MASK;
310 if (rsvd_fault)
311 walker->fault.error_code |= PFERR_RSVD_MASK;
312
313 walker->fault.address = addr; 314 walker->fault.address = addr;
314 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 315 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
315 316
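The rewritten error path above accumulates the #PF error code in errcode instead of patching walker->fault.error_code bit by bit: a reserved-bit fault sets RSVD|P, a permission fault sets P, the write/user bits mirror the access type, and the fetch bit is only reported when NX or SMEP could have caused the fault. A short stand-alone sketch of that assembly; build_pf_error_code() is an invented name, while the PFERR_* values are the architectural x86 error-code bits.

#include <stdio.h>
#include <stdbool.h>

#define PFERR_PRESENT_MASK (1u << 0)
#define PFERR_WRITE_MASK   (1u << 1)
#define PFERR_USER_MASK    (1u << 2)
#define PFERR_RSVD_MASK    (1u << 3)
#define PFERR_FETCH_MASK   (1u << 4)

static unsigned build_pf_error_code(bool write, bool user, bool fetch,
                                    bool eperm, bool rsvd, bool nx_or_smep)
{
    unsigned errcode = 0;

    if (rsvd)
        errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
    if (eperm)
        errcode |= PFERR_PRESENT_MASK;   /* pte was present, access denied */

    if (write)
        errcode |= PFERR_WRITE_MASK;
    if (user)
        errcode |= PFERR_USER_MASK;
    if (fetch && nx_or_smep)
        errcode |= PFERR_FETCH_MASK;
    return errcode;
}

int main(void)
{
    /* user-mode write denied by a read-only, present gpte */
    printf("errcode = %#x\n",
           build_pf_error_code(true, true, false, true, false, false));
    return 0;
}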
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
336 struct kvm_mmu_page *sp, u64 *spte, 337 struct kvm_mmu_page *sp, u64 *spte,
337 pt_element_t gpte) 338 pt_element_t gpte)
338{ 339{
339 u64 nonpresent = shadow_trap_nonpresent_pte;
340
341 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) 340 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
342 goto no_present; 341 goto no_present;
343 342
344 if (!is_present_gpte(gpte)) { 343 if (!is_present_gpte(gpte))
345 if (!sp->unsync)
346 nonpresent = shadow_notrap_nonpresent_pte;
347 goto no_present; 344 goto no_present;
348 }
349 345
350 if (!(gpte & PT_ACCESSED_MASK)) 346 if (!(gpte & PT_ACCESSED_MASK))
351 goto no_present; 347 goto no_present;
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
353 return false; 349 return false;
354 350
355no_present: 351no_present:
356 drop_spte(vcpu->kvm, spte, nonpresent); 352 drop_spte(vcpu->kvm, spte);
357 return true; 353 return true;
358} 354}
359 355
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
369 return; 365 return;
370 366
371 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 367 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
372 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 368 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
373 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 369 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
374 if (is_error_pfn(pfn)) { 370 if (mmu_invalid_pfn(pfn)) {
375 kvm_release_pfn_clean(pfn); 371 kvm_release_pfn_clean(pfn);
376 return; 372 return;
377 } 373 }
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
381 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 377 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
382 */ 378 */
383 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 379 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
384 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, 380 NULL, PT_PAGE_TABLE_LEVEL,
385 gpte_to_gfn(gpte), pfn, true, true); 381 gpte_to_gfn(gpte), pfn, true, true);
386} 382}
387 383
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
432 unsigned pte_access; 428 unsigned pte_access;
433 gfn_t gfn; 429 gfn_t gfn;
434 pfn_t pfn; 430 pfn_t pfn;
435 bool dirty;
436 431
437 if (spte == sptep) 432 if (spte == sptep)
438 continue; 433 continue;
439 434
440 if (*spte != shadow_trap_nonpresent_pte) 435 if (is_shadow_present_pte(*spte))
441 continue; 436 continue;
442 437
443 gpte = gptep[i]; 438 gpte = gptep[i];
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
445 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
446 continue; 441 continue;
447 442
448 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 443 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
444 true);
449 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
450 dirty = is_dirty_gpte(gpte);
451 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 446 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
452 (pte_access & ACC_WRITE_MASK) && dirty); 447 pte_access & ACC_WRITE_MASK);
453 if (is_error_pfn(pfn)) { 448 if (mmu_invalid_pfn(pfn)) {
454 kvm_release_pfn_clean(pfn); 449 kvm_release_pfn_clean(pfn);
455 break; 450 break;
456 } 451 }
457 452
458 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 453 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
459 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, 454 NULL, PT_PAGE_TABLE_LEVEL, gfn,
460 pfn, true, true); 455 pfn, true, true);
461 } 456 }
462} 457}
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
467static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 462static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
468 struct guest_walker *gw, 463 struct guest_walker *gw,
469 int user_fault, int write_fault, int hlevel, 464 int user_fault, int write_fault, int hlevel,
470 int *ptwrite, pfn_t pfn, bool map_writable, 465 int *emulate, pfn_t pfn, bool map_writable,
471 bool prefault) 466 bool prefault)
472{ 467{
473 unsigned access = gw->pt_access; 468 unsigned access = gw->pt_access;
474 struct kvm_mmu_page *sp = NULL; 469 struct kvm_mmu_page *sp = NULL;
475 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
476 int top_level; 470 int top_level;
477 unsigned direct_access; 471 unsigned direct_access;
478 struct kvm_shadow_walk_iterator it; 472 struct kvm_shadow_walk_iterator it;
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
480 if (!is_present_gpte(gw->ptes[gw->level - 1])) 474 if (!is_present_gpte(gw->ptes[gw->level - 1]))
481 return NULL; 475 return NULL;
482 476
483 direct_access = gw->pt_access & gw->pte_access; 477 direct_access = gw->pte_access;
484 if (!dirty)
485 direct_access &= ~ACC_WRITE_MASK;
486 478
487 top_level = vcpu->arch.mmu.root_level; 479 top_level = vcpu->arch.mmu.root_level;
488 if (top_level == PT32E_ROOT_LEVEL) 480 if (top_level == PT32E_ROOT_LEVEL)
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
540 link_shadow_page(it.sptep, sp); 532 link_shadow_page(it.sptep, sp);
541 } 533 }
542 534
543 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 535 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
544 user_fault, write_fault, dirty, ptwrite, it.level, 536 user_fault, write_fault, emulate, it.level,
545 gw->gfn, pfn, prefault, map_writable); 537 gw->gfn, pfn, prefault, map_writable);
546 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 538 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
547 539
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
575 int user_fault = error_code & PFERR_USER_MASK; 567 int user_fault = error_code & PFERR_USER_MASK;
576 struct guest_walker walker; 568 struct guest_walker walker;
577 u64 *sptep; 569 u64 *sptep;
578 int write_pt = 0; 570 int emulate = 0;
579 int r; 571 int r;
580 pfn_t pfn; 572 pfn_t pfn;
581 int level = PT_PAGE_TABLE_LEVEL; 573 int level = PT_PAGE_TABLE_LEVEL;
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
585 577
586 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 578 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
587 579
580 if (unlikely(error_code & PFERR_RSVD_MASK))
581 return handle_mmio_page_fault(vcpu, addr, error_code,
582 mmu_is_nested(vcpu));
583
588 r = mmu_topup_memory_caches(vcpu); 584 r = mmu_topup_memory_caches(vcpu);
589 if (r) 585 if (r)
590 return r; 586 return r;
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
623 &map_writable)) 619 &map_writable))
624 return 0; 620 return 0;
625 621
626 /* mmio */ 622 if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
627 if (is_error_pfn(pfn)) 623 walker.gfn, pfn, walker.pte_access, &r))
628 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); 624 return r;
629 625
630 spin_lock(&vcpu->kvm->mmu_lock); 626 spin_lock(&vcpu->kvm->mmu_lock);
631 if (mmu_notifier_retry(vcpu, mmu_seq)) 627 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
636 if (!force_pt_level) 632 if (!force_pt_level)
637 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 633 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
638 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 634 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
639 level, &write_pt, pfn, map_writable, prefault); 635 level, &emulate, pfn, map_writable, prefault);
640 (void)sptep; 636 (void)sptep;
641 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 637 pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
642 sptep, *sptep, write_pt); 638 sptep, *sptep, emulate);
643 639
644 if (!write_pt) 640 if (!emulate)
645 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 641 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
646 642
647 ++vcpu->stat.pf_fixed; 643 ++vcpu->stat.pf_fixed;
648 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 644 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
649 spin_unlock(&vcpu->kvm->mmu_lock); 645 spin_unlock(&vcpu->kvm->mmu_lock);
650 646
651 return write_pt; 647 return emulate;
652 648
653out_unlock: 649out_unlock:
654 spin_unlock(&vcpu->kvm->mmu_lock); 650 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
665 u64 *sptep; 661 u64 *sptep;
666 int need_flush = 0; 662 int need_flush = 0;
667 663
664 vcpu_clear_mmio_info(vcpu, gva);
665
668 spin_lock(&vcpu->kvm->mmu_lock); 666 spin_lock(&vcpu->kvm->mmu_lock);
669 667
670 for_each_shadow_entry(vcpu, gva, iterator) { 668 for_each_shadow_entry(vcpu, gva, iterator) {
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
688 if (is_shadow_present_pte(*sptep)) { 686 if (is_shadow_present_pte(*sptep)) {
689 if (is_large_pte(*sptep)) 687 if (is_large_pte(*sptep))
690 --vcpu->kvm->stat.lpages; 688 --vcpu->kvm->stat.lpages;
691 drop_spte(vcpu->kvm, sptep, 689 drop_spte(vcpu->kvm, sptep);
692 shadow_trap_nonpresent_pte);
693 need_flush = 1; 690 need_flush = 1;
694 } else 691 } else if (is_mmio_spte(*sptep))
695 __set_spte(sptep, shadow_trap_nonpresent_pte); 692 mmu_spte_clear_no_track(sptep);
693
696 break; 694 break;
697 } 695 }
698 696
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
752 return gpa; 750 return gpa;
753} 751}
754 752
755static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
756 struct kvm_mmu_page *sp)
757{
758 int i, j, offset, r;
759 pt_element_t pt[256 / sizeof(pt_element_t)];
760 gpa_t pte_gpa;
761
762 if (sp->role.direct
763 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
764 nonpaging_prefetch_page(vcpu, sp);
765 return;
766 }
767
768 pte_gpa = gfn_to_gpa(sp->gfn);
769 if (PTTYPE == 32) {
770 offset = sp->role.quadrant << PT64_LEVEL_BITS;
771 pte_gpa += offset * sizeof(pt_element_t);
772 }
773
774 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
775 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
776 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
777 for (j = 0; j < ARRAY_SIZE(pt); ++j)
778 if (r || is_present_gpte(pt[j]))
779 sp->spt[i+j] = shadow_trap_nonpresent_pte;
780 else
781 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
782 }
783}
784
785/* 753/*
786 * Using the cached information from sp->gfns is safe because: 754 * Using the cached information from sp->gfns is safe because:
787 * - The spte has a reference to the struct page, so the pfn for a given gfn 755 * - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
817 gpa_t pte_gpa; 785 gpa_t pte_gpa;
818 gfn_t gfn; 786 gfn_t gfn;
819 787
820 if (!is_shadow_present_pte(sp->spt[i])) 788 if (!sp->spt[i])
821 continue; 789 continue;
822 790
823 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); 791 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
826 sizeof(pt_element_t))) 794 sizeof(pt_element_t)))
827 return -EINVAL; 795 return -EINVAL;
828 796
829 gfn = gpte_to_gfn(gpte);
830
831 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 797 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
832 vcpu->kvm->tlbs_dirty++; 798 vcpu->kvm->tlbs_dirty++;
833 continue; 799 continue;
834 } 800 }
835 801
802 gfn = gpte_to_gfn(gpte);
803 pte_access = sp->role.access;
804 pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
805
806 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
807 continue;
808
836 if (gfn != sp->gfns[i]) { 809 if (gfn != sp->gfns[i]) {
837 drop_spte(vcpu->kvm, &sp->spt[i], 810 drop_spte(vcpu->kvm, &sp->spt[i]);
838 shadow_trap_nonpresent_pte);
839 vcpu->kvm->tlbs_dirty++; 811 vcpu->kvm->tlbs_dirty++;
840 continue; 812 continue;
841 } 813 }
842 814
843 nr_present++; 815 nr_present++;
844 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 816
845 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; 817 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
846 818
847 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 819 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
848 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 820 PT_PAGE_TABLE_LEVEL, gfn,
849 spte_to_pfn(sp->spt[i]), true, false, 821 spte_to_pfn(sp->spt[i]), true, false,
850 host_writable); 822 host_writable);
851 } 823 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 506e4fe23ad..475d1c94850 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1496 update_cr0_intercept(svm); 1496 update_cr0_intercept(svm);
1497} 1497}
1498 1498
1499static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1499static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1500{ 1500{
1501 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 1501 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1503 1503
1504 if (cr4 & X86_CR4_VMXE)
1505 return 1;
1506
1504 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1507 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1505 svm_flush_tlb(vcpu); 1508 svm_flush_tlb(vcpu);
1506 1509
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1510 cr4 |= host_cr4_mce; 1513 cr4 |= host_cr4_mce;
1511 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1514 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1512 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 1515 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1516 return 0;
1513} 1517}
1514 1518
1515static void svm_set_segment(struct kvm_vcpu *vcpu, 1519static void svm_set_segment(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index abd86e865be..ae432ea1cd8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -15,7 +15,7 @@
15#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
16#include <linux/kvm.h> 16#include <linux/kvm.h>
17#include <linux/hrtimer.h> 17#include <linux/hrtimer.h>
18#include <asm/atomic.h> 18#include <linux/atomic.h>
19#include "kvm_timer.h" 19#include "kvm_timer.h"
20 20
21static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) 21static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index db932760ea8..3ff898c104f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn,
675 ), 675 ),
676 676
677 TP_fast_assign( 677 TP_fast_assign(
678 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; 678 __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
679 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); 679 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
680 __entry->len = vcpu->arch.emulate_ctxt.decode.eip 680 __entry->len = vcpu->arch.emulate_ctxt._eip
681 - vcpu->arch.emulate_ctxt.decode.fetch.start; 681 - vcpu->arch.emulate_ctxt.fetch.start;
682 memcpy(__entry->insn, 682 memcpy(__entry->insn,
683 vcpu->arch.emulate_ctxt.decode.fetch.data, 683 vcpu->arch.emulate_ctxt.fetch.data,
684 15); 684 15);
685 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); 685 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
686 __entry->failed = failed; 686 __entry->failed = failed;
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
698#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) 698#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
699#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) 699#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
700 700
701TRACE_EVENT(
702 vcpu_match_mmio,
703 TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
704 TP_ARGS(gva, gpa, write, gpa_match),
705
706 TP_STRUCT__entry(
707 __field(gva_t, gva)
708 __field(gpa_t, gpa)
709 __field(bool, write)
710 __field(bool, gpa_match)
711 ),
712
713 TP_fast_assign(
714 __entry->gva = gva;
715 __entry->gpa = gpa;
716 __entry->write = write;
717 __entry->gpa_match = gpa_match
718 ),
719
720 TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
721 __entry->write ? "Write" : "Read",
722 __entry->gpa_match ? "GPA" : "GVA")
723);
701#endif /* _TRACE_KVM_H */ 724#endif /* _TRACE_KVM_H */
702 725
703#undef TRACE_INCLUDE_PATH 726#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60ea42..e65a158dee6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,13 +43,12 @@
43#include "trace.h" 43#include "trace.h"
44 44
45#define __ex(x) __kvm_handle_fault_on_reboot(x) 45#define __ex(x) __kvm_handle_fault_on_reboot(x)
46#define __ex_clear(x, reg) \
47 ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
46 48
47MODULE_AUTHOR("Qumranet"); 49MODULE_AUTHOR("Qumranet");
48MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
49 51
50static int __read_mostly bypass_guest_pf = 1;
51module_param(bypass_guest_pf, bool, S_IRUGO);
52
53static int __read_mostly enable_vpid = 1; 52static int __read_mostly enable_vpid = 1;
54module_param_named(vpid, enable_vpid, bool, 0444); 53module_param_named(vpid, enable_vpid, bool, 0444);
55 54
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
72static int __read_mostly yield_on_hlt = 1; 71static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO); 72module_param(yield_on_hlt, bool, S_IRUGO);
74 73
74/*
75 * If nested=1, nested virtualization is supported, i.e., guests may use
 76 * VMX and be hypervisors for their own guests. If nested=0, guests may not
77 * use VMX instructions.
78 */
79static int __read_mostly nested = 0;
80module_param(nested, bool, S_IRUGO);
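(Usage note, not part of the patch: with the default of nested=0 shown above, nested VMX stays off; an administrator would have to load the module with the option enabled, e.g. "modprobe kvm-intel nested=1", before an L1 guest is allowed to execute VMXON.)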
81
75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 82#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 83 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
77#define KVM_GUEST_CR0_MASK \ 84#define KVM_GUEST_CR0_MASK \
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
109module_param(ple_window, int, S_IRUGO); 116module_param(ple_window, int, S_IRUGO);
110 117
111#define NR_AUTOLOAD_MSRS 1 118#define NR_AUTOLOAD_MSRS 1
119#define VMCS02_POOL_SIZE 1
112 120
113struct vmcs { 121struct vmcs {
114 u32 revision_id; 122 u32 revision_id;
@@ -116,17 +124,237 @@ struct vmcs {
116 char data[0]; 124 char data[0];
117}; 125};
118 126
127/*
128 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
129 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
130 * loaded on this CPU (so we can clear them if the CPU goes down).
131 */
132struct loaded_vmcs {
133 struct vmcs *vmcs;
134 int cpu;
135 int launched;
136 struct list_head loaded_vmcss_on_cpu_link;
137};
138
119struct shared_msr_entry { 139struct shared_msr_entry {
120 unsigned index; 140 unsigned index;
121 u64 data; 141 u64 data;
122 u64 mask; 142 u64 mask;
123}; 143};
124 144
145/*
146 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
147 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
148 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
149 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
150 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
151 * More than one of these structures may exist, if L1 runs multiple L2 guests.
152 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
153 * underlying hardware which will be used to run L2.
154 * This structure is packed to ensure that its layout is identical across
155 * machines (necessary for live migration).
156 * If there are changes in this struct, VMCS12_REVISION must be changed.
157 */
158typedef u64 natural_width;
159struct __packed vmcs12 {
160 /* According to the Intel spec, a VMCS region must start with the
161 * following two fields. Then follow implementation-specific data.
162 */
163 u32 revision_id;
164 u32 abort;
165
166 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
167 u32 padding[7]; /* room for future expansion */
168
169 u64 io_bitmap_a;
170 u64 io_bitmap_b;
171 u64 msr_bitmap;
172 u64 vm_exit_msr_store_addr;
173 u64 vm_exit_msr_load_addr;
174 u64 vm_entry_msr_load_addr;
175 u64 tsc_offset;
176 u64 virtual_apic_page_addr;
177 u64 apic_access_addr;
178 u64 ept_pointer;
179 u64 guest_physical_address;
180 u64 vmcs_link_pointer;
181 u64 guest_ia32_debugctl;
182 u64 guest_ia32_pat;
183 u64 guest_ia32_efer;
184 u64 guest_ia32_perf_global_ctrl;
185 u64 guest_pdptr0;
186 u64 guest_pdptr1;
187 u64 guest_pdptr2;
188 u64 guest_pdptr3;
189 u64 host_ia32_pat;
190 u64 host_ia32_efer;
191 u64 host_ia32_perf_global_ctrl;
192 u64 padding64[8]; /* room for future expansion */
193 /*
194 * To allow migration of L1 (complete with its L2 guests) between
195 * machines of different natural widths (32 or 64 bit), we cannot have
 196 * unsigned long fields with no explicit size. We use u64 (aliased
197 * natural_width) instead. Luckily, x86 is little-endian.
198 */
199 natural_width cr0_guest_host_mask;
200 natural_width cr4_guest_host_mask;
201 natural_width cr0_read_shadow;
202 natural_width cr4_read_shadow;
203 natural_width cr3_target_value0;
204 natural_width cr3_target_value1;
205 natural_width cr3_target_value2;
206 natural_width cr3_target_value3;
207 natural_width exit_qualification;
208 natural_width guest_linear_address;
209 natural_width guest_cr0;
210 natural_width guest_cr3;
211 natural_width guest_cr4;
212 natural_width guest_es_base;
213 natural_width guest_cs_base;
214 natural_width guest_ss_base;
215 natural_width guest_ds_base;
216 natural_width guest_fs_base;
217 natural_width guest_gs_base;
218 natural_width guest_ldtr_base;
219 natural_width guest_tr_base;
220 natural_width guest_gdtr_base;
221 natural_width guest_idtr_base;
222 natural_width guest_dr7;
223 natural_width guest_rsp;
224 natural_width guest_rip;
225 natural_width guest_rflags;
226 natural_width guest_pending_dbg_exceptions;
227 natural_width guest_sysenter_esp;
228 natural_width guest_sysenter_eip;
229 natural_width host_cr0;
230 natural_width host_cr3;
231 natural_width host_cr4;
232 natural_width host_fs_base;
233 natural_width host_gs_base;
234 natural_width host_tr_base;
235 natural_width host_gdtr_base;
236 natural_width host_idtr_base;
237 natural_width host_ia32_sysenter_esp;
238 natural_width host_ia32_sysenter_eip;
239 natural_width host_rsp;
240 natural_width host_rip;
241 natural_width paddingl[8]; /* room for future expansion */
242 u32 pin_based_vm_exec_control;
243 u32 cpu_based_vm_exec_control;
244 u32 exception_bitmap;
245 u32 page_fault_error_code_mask;
246 u32 page_fault_error_code_match;
247 u32 cr3_target_count;
248 u32 vm_exit_controls;
249 u32 vm_exit_msr_store_count;
250 u32 vm_exit_msr_load_count;
251 u32 vm_entry_controls;
252 u32 vm_entry_msr_load_count;
253 u32 vm_entry_intr_info_field;
254 u32 vm_entry_exception_error_code;
255 u32 vm_entry_instruction_len;
256 u32 tpr_threshold;
257 u32 secondary_vm_exec_control;
258 u32 vm_instruction_error;
259 u32 vm_exit_reason;
260 u32 vm_exit_intr_info;
261 u32 vm_exit_intr_error_code;
262 u32 idt_vectoring_info_field;
263 u32 idt_vectoring_error_code;
264 u32 vm_exit_instruction_len;
265 u32 vmx_instruction_info;
266 u32 guest_es_limit;
267 u32 guest_cs_limit;
268 u32 guest_ss_limit;
269 u32 guest_ds_limit;
270 u32 guest_fs_limit;
271 u32 guest_gs_limit;
272 u32 guest_ldtr_limit;
273 u32 guest_tr_limit;
274 u32 guest_gdtr_limit;
275 u32 guest_idtr_limit;
276 u32 guest_es_ar_bytes;
277 u32 guest_cs_ar_bytes;
278 u32 guest_ss_ar_bytes;
279 u32 guest_ds_ar_bytes;
280 u32 guest_fs_ar_bytes;
281 u32 guest_gs_ar_bytes;
282 u32 guest_ldtr_ar_bytes;
283 u32 guest_tr_ar_bytes;
284 u32 guest_interruptibility_info;
285 u32 guest_activity_state;
286 u32 guest_sysenter_cs;
287 u32 host_ia32_sysenter_cs;
288 u32 padding32[8]; /* room for future expansion */
289 u16 virtual_processor_id;
290 u16 guest_es_selector;
291 u16 guest_cs_selector;
292 u16 guest_ss_selector;
293 u16 guest_ds_selector;
294 u16 guest_fs_selector;
295 u16 guest_gs_selector;
296 u16 guest_ldtr_selector;
297 u16 guest_tr_selector;
298 u16 host_es_selector;
299 u16 host_cs_selector;
300 u16 host_ss_selector;
301 u16 host_ds_selector;
302 u16 host_fs_selector;
303 u16 host_gs_selector;
304 u16 host_tr_selector;
305};
306
307/*
308 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
309 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
310 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
311 */
312#define VMCS12_REVISION 0x11e57ed0
313
314/*
315 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 316 * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by the
 317 * current implementation, 4K is reserved to avoid future complications.
318 */
319#define VMCS12_SIZE 0x1000
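To make the intent of these two constants concrete, here is a guest-side sketch (illustration only, not part of the patch; the allocation call is just one plausible way an L1 kernel could do it) of how L1 is expected to prepare a region before handing it to VMPTRLD:

	/* L1 (guest hypervisor) side: allocate one zeroed, page-sized region. */
	u32 rev, high;
	u32 *region = (u32 *)get_zeroed_page(GFP_KERNEL);	/* VMCS12_SIZE == 4K */

	if (region) {
		/* Word 0 must hold the revision id read from MSR_IA32_VMX_BASIC,
		 * which this KVM reports as VMCS12_REVISION. */
		rdmsr(MSR_IA32_VMX_BASIC, rev, high);
		region[0] = rev;
		/* ... then execute VMPTRLD with the region's guest-physical address ... */
	}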
320
321/* Used to remember the last vmcs02 used for some recently used vmcs12s */
322struct vmcs02_list {
323 struct list_head list;
324 gpa_t vmptr;
325 struct loaded_vmcs vmcs02;
326};
327
328/*
329 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
330 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
331 */
332struct nested_vmx {
333 /* Has the level1 guest done vmxon? */
334 bool vmxon;
335
336 /* The guest-physical address of the current VMCS L1 keeps for L2 */
337 gpa_t current_vmptr;
338 /* The host-usable pointer to the above */
339 struct page *current_vmcs12_page;
340 struct vmcs12 *current_vmcs12;
341
342 /* vmcs02_list cache of VMCSs recently used to run L2 guests */
343 struct list_head vmcs02_pool;
344 int vmcs02_num;
345 u64 vmcs01_tsc_offset;
346 /* L2 must run next, and mustn't decide to exit to L1. */
347 bool nested_run_pending;
348 /*
349 * Guest pages referred to in vmcs02 with host-physical pointers, so
350 * we must keep them pinned while L2 runs.
351 */
352 struct page *apic_access_page;
353};
354
125struct vcpu_vmx { 355struct vcpu_vmx {
126 struct kvm_vcpu vcpu; 356 struct kvm_vcpu vcpu;
127 struct list_head local_vcpus_link;
128 unsigned long host_rsp; 357 unsigned long host_rsp;
129 int launched;
130 u8 fail; 358 u8 fail;
131 u8 cpl; 359 u8 cpl;
132 bool nmi_known_unmasked; 360 bool nmi_known_unmasked;
@@ -140,7 +368,14 @@ struct vcpu_vmx {
140 u64 msr_host_kernel_gs_base; 368 u64 msr_host_kernel_gs_base;
141 u64 msr_guest_kernel_gs_base; 369 u64 msr_guest_kernel_gs_base;
142#endif 370#endif
143 struct vmcs *vmcs; 371 /*
372 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
373 * non-nested (L1) guest, it always points to vmcs01. For a nested
374 * guest (L2), it points to a different VMCS.
375 */
376 struct loaded_vmcs vmcs01;
377 struct loaded_vmcs *loaded_vmcs;
378 bool __launched; /* temporary, used in vmx_vcpu_run */
144 struct msr_autoload { 379 struct msr_autoload {
145 unsigned nr; 380 unsigned nr;
146 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; 381 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -176,6 +411,9 @@ struct vcpu_vmx {
176 u32 exit_reason; 411 u32 exit_reason;
177 412
178 bool rdtscp_enabled; 413 bool rdtscp_enabled;
414
415 /* Support for a guest hypervisor (nested VMX) */
416 struct nested_vmx nested;
179}; 417};
180 418
181enum segment_cache_field { 419enum segment_cache_field {
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
192 return container_of(vcpu, struct vcpu_vmx, vcpu); 430 return container_of(vcpu, struct vcpu_vmx, vcpu);
193} 431}
194 432
433#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
434#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
435#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
436 [number##_HIGH] = VMCS12_OFFSET(name)+4
437
438static unsigned short vmcs_field_to_offset_table[] = {
439 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
440 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
441 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
442 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
443 FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
444 FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
445 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
446 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
447 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
448 FIELD(HOST_ES_SELECTOR, host_es_selector),
449 FIELD(HOST_CS_SELECTOR, host_cs_selector),
450 FIELD(HOST_SS_SELECTOR, host_ss_selector),
451 FIELD(HOST_DS_SELECTOR, host_ds_selector),
452 FIELD(HOST_FS_SELECTOR, host_fs_selector),
453 FIELD(HOST_GS_SELECTOR, host_gs_selector),
454 FIELD(HOST_TR_SELECTOR, host_tr_selector),
455 FIELD64(IO_BITMAP_A, io_bitmap_a),
456 FIELD64(IO_BITMAP_B, io_bitmap_b),
457 FIELD64(MSR_BITMAP, msr_bitmap),
458 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
459 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
460 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
461 FIELD64(TSC_OFFSET, tsc_offset),
462 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
463 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
464 FIELD64(EPT_POINTER, ept_pointer),
465 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
466 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
467 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
468 FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
469 FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
470 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
471 FIELD64(GUEST_PDPTR0, guest_pdptr0),
472 FIELD64(GUEST_PDPTR1, guest_pdptr1),
473 FIELD64(GUEST_PDPTR2, guest_pdptr2),
474 FIELD64(GUEST_PDPTR3, guest_pdptr3),
475 FIELD64(HOST_IA32_PAT, host_ia32_pat),
476 FIELD64(HOST_IA32_EFER, host_ia32_efer),
477 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
478 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
479 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
480 FIELD(EXCEPTION_BITMAP, exception_bitmap),
481 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
482 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
483 FIELD(CR3_TARGET_COUNT, cr3_target_count),
484 FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
485 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
486 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
487 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
488 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
489 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
490 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
491 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
492 FIELD(TPR_THRESHOLD, tpr_threshold),
493 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
494 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
495 FIELD(VM_EXIT_REASON, vm_exit_reason),
496 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
497 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
498 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
499 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
500 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
501 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
502 FIELD(GUEST_ES_LIMIT, guest_es_limit),
503 FIELD(GUEST_CS_LIMIT, guest_cs_limit),
504 FIELD(GUEST_SS_LIMIT, guest_ss_limit),
505 FIELD(GUEST_DS_LIMIT, guest_ds_limit),
506 FIELD(GUEST_FS_LIMIT, guest_fs_limit),
507 FIELD(GUEST_GS_LIMIT, guest_gs_limit),
508 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
509 FIELD(GUEST_TR_LIMIT, guest_tr_limit),
510 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
511 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
512 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
513 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
514 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
515 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
516 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
517 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
518 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
519 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
520 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
521 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
522 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
523 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
524 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
525 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
526 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
527 FIELD(CR4_READ_SHADOW, cr4_read_shadow),
528 FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
529 FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
530 FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
531 FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
532 FIELD(EXIT_QUALIFICATION, exit_qualification),
533 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
534 FIELD(GUEST_CR0, guest_cr0),
535 FIELD(GUEST_CR3, guest_cr3),
536 FIELD(GUEST_CR4, guest_cr4),
537 FIELD(GUEST_ES_BASE, guest_es_base),
538 FIELD(GUEST_CS_BASE, guest_cs_base),
539 FIELD(GUEST_SS_BASE, guest_ss_base),
540 FIELD(GUEST_DS_BASE, guest_ds_base),
541 FIELD(GUEST_FS_BASE, guest_fs_base),
542 FIELD(GUEST_GS_BASE, guest_gs_base),
543 FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
544 FIELD(GUEST_TR_BASE, guest_tr_base),
545 FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
546 FIELD(GUEST_IDTR_BASE, guest_idtr_base),
547 FIELD(GUEST_DR7, guest_dr7),
548 FIELD(GUEST_RSP, guest_rsp),
549 FIELD(GUEST_RIP, guest_rip),
550 FIELD(GUEST_RFLAGS, guest_rflags),
551 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
552 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
553 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
554 FIELD(HOST_CR0, host_cr0),
555 FIELD(HOST_CR3, host_cr3),
556 FIELD(HOST_CR4, host_cr4),
557 FIELD(HOST_FS_BASE, host_fs_base),
558 FIELD(HOST_GS_BASE, host_gs_base),
559 FIELD(HOST_TR_BASE, host_tr_base),
560 FIELD(HOST_GDTR_BASE, host_gdtr_base),
561 FIELD(HOST_IDTR_BASE, host_idtr_base),
562 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
563 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
564 FIELD(HOST_RSP, host_rsp),
565 FIELD(HOST_RIP, host_rip),
566};
567static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
568
569static inline short vmcs_field_to_offset(unsigned long field)
570{
571 if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
572 return -1;
573 return vmcs_field_to_offset_table[field];
574}
575
576static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
577{
578 return to_vmx(vcpu)->nested.current_vmcs12;
579}
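To show how the offset table and get_vmcs12() are meant to be used together, here is a minimal read helper (a sketch only: the function name is made up for illustration, and real VMREAD/VMWRITE emulation also has to dispatch on the field width):

	static u64 vmcs12_read_field_sketch(struct kvm_vcpu *vcpu, unsigned long field)
	{
		short offset = vmcs_field_to_offset(field);

		if (offset < 0)
			return 0;	/* the caller should treat this as an unsupported field */

		/* All fields live inside the software vmcs12 kept in host memory. */
		return *(u64 *)((char *)get_vmcs12(vcpu) + offset);
	}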
580
581static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
582{
583 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
584 if (is_error_page(page)) {
585 kvm_release_page_clean(page);
586 return NULL;
587 }
588 return page;
589}
590
591static void nested_release_page(struct page *page)
592{
593 kvm_release_page_dirty(page);
594}
595
596static void nested_release_page_clean(struct page *page)
597{
598 kvm_release_page_clean(page);
599}
600
195static u64 construct_eptp(unsigned long root_hpa); 601static u64 construct_eptp(unsigned long root_hpa);
196static void kvm_cpu_vmxon(u64 addr); 602static void kvm_cpu_vmxon(u64 addr);
197static void kvm_cpu_vmxoff(void); 603static void kvm_cpu_vmxoff(void);
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
200 606
201static DEFINE_PER_CPU(struct vmcs *, vmxarea); 607static DEFINE_PER_CPU(struct vmcs *, vmxarea);
202static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 608static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
203static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 609/*
610 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
611 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
612 */
613static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
204static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 614static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
205 615
206static unsigned long *vmx_io_bitmap_a; 616static unsigned long *vmx_io_bitmap_a;
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void)
442 return flexpriority_enabled; 852 return flexpriority_enabled;
443} 853}
444 854
855static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
856{
857 return vmcs12->cpu_based_vm_exec_control & bit;
858}
859
860static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
861{
862 return (vmcs12->cpu_based_vm_exec_control &
863 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
864 (vmcs12->secondary_vm_exec_control & bit);
865}
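A hedged usage sketch (this exit-handler fragment is illustrative and not taken from the patch): when an exit happens while L2 is running, L0 can use these helpers to decide whether L1 asked for the feature and therefore should see the exit, e.g. for APIC accesses:

	if (is_guest_mode(vcpu) &&
	    nested_cpu_has2(get_vmcs12(vcpu),
			    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		/* L1 enabled the control, so reflect the exit to L1 (simplified). */
		nested_vmx_vmexit(vcpu);
		return 1;
	}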
866
867static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
868 struct kvm_vcpu *vcpu)
869{
870 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
871}
872
873static inline bool is_exception(u32 intr_info)
874{
875 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
876 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
877}
878
879static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
880static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
881 struct vmcs12 *vmcs12,
882 u32 reason, unsigned long qualification);
883
445static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 884static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
446{ 885{
447 int i; 886 int i;
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs)
501 vmcs, phys_addr); 940 vmcs, phys_addr);
502} 941}
503 942
943static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
944{
945 vmcs_clear(loaded_vmcs->vmcs);
946 loaded_vmcs->cpu = -1;
947 loaded_vmcs->launched = 0;
948}
949
504static void vmcs_load(struct vmcs *vmcs) 950static void vmcs_load(struct vmcs *vmcs)
505{ 951{
506 u64 phys_addr = __pa(vmcs); 952 u64 phys_addr = __pa(vmcs);
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs)
510 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 956 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
511 : "cc", "memory"); 957 : "cc", "memory");
512 if (error) 958 if (error)
513 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 959 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
514 vmcs, phys_addr); 960 vmcs, phys_addr);
515} 961}
516 962
517static void __vcpu_clear(void *arg) 963static void __loaded_vmcs_clear(void *arg)
518{ 964{
519 struct vcpu_vmx *vmx = arg; 965 struct loaded_vmcs *loaded_vmcs = arg;
520 int cpu = raw_smp_processor_id(); 966 int cpu = raw_smp_processor_id();
521 967
522 if (vmx->vcpu.cpu == cpu) 968 if (loaded_vmcs->cpu != cpu)
523 vmcs_clear(vmx->vmcs); 969 return; /* vcpu migration can race with cpu offline */
524 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 970 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
525 per_cpu(current_vmcs, cpu) = NULL; 971 per_cpu(current_vmcs, cpu) = NULL;
526 list_del(&vmx->local_vcpus_link); 972 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
527 vmx->vcpu.cpu = -1; 973 loaded_vmcs_init(loaded_vmcs);
528 vmx->launched = 0;
529} 974}
530 975
531static void vcpu_clear(struct vcpu_vmx *vmx) 976static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
532{ 977{
533 if (vmx->vcpu.cpu == -1) 978 if (loaded_vmcs->cpu != -1)
534 return; 979 smp_call_function_single(
535 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 980 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
536} 981}
537 982
538static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 983static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
585 } 1030 }
586} 1031}
587 1032
588static unsigned long vmcs_readl(unsigned long field) 1033static __always_inline unsigned long vmcs_readl(unsigned long field)
589{ 1034{
590 unsigned long value = 0; 1035 unsigned long value;
591 1036
592 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 1037 asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
593 : "+a"(value) : "d"(field) : "cc"); 1038 : "=a"(value) : "d"(field) : "cc");
594 return value; 1039 return value;
595} 1040}
596 1041
597static u16 vmcs_read16(unsigned long field) 1042static __always_inline u16 vmcs_read16(unsigned long field)
598{ 1043{
599 return vmcs_readl(field); 1044 return vmcs_readl(field);
600} 1045}
601 1046
602static u32 vmcs_read32(unsigned long field) 1047static __always_inline u32 vmcs_read32(unsigned long field)
603{ 1048{
604 return vmcs_readl(field); 1049 return vmcs_readl(field);
605} 1050}
606 1051
607static u64 vmcs_read64(unsigned long field) 1052static __always_inline u64 vmcs_read64(unsigned long field)
608{ 1053{
609#ifdef CONFIG_X86_64 1054#ifdef CONFIG_X86_64
610 return vmcs_readl(field); 1055 return vmcs_readl(field);
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
731 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1176 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
732 if (vcpu->fpu_active) 1177 if (vcpu->fpu_active)
733 eb &= ~(1u << NM_VECTOR); 1178 eb &= ~(1u << NM_VECTOR);
1179
1180 /* When we are running a nested L2 guest and L1 specified for it a
1181 * certain exception bitmap, we must trap the same exceptions and pass
1182 * them to L1. When running L2, we will only handle the exceptions
1183 * specified above if L1 did not want them.
1184 */
1185 if (is_guest_mode(vcpu))
1186 eb |= get_vmcs12(vcpu)->exception_bitmap;
1187
734 vmcs_write32(EXCEPTION_BITMAP, eb); 1188 vmcs_write32(EXCEPTION_BITMAP, eb);
735} 1189}
736 1190
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
971 1425
972 if (!vmm_exclusive) 1426 if (!vmm_exclusive)
973 kvm_cpu_vmxon(phys_addr); 1427 kvm_cpu_vmxon(phys_addr);
974 else if (vcpu->cpu != cpu) 1428 else if (vmx->loaded_vmcs->cpu != cpu)
975 vcpu_clear(vmx); 1429 loaded_vmcs_clear(vmx->loaded_vmcs);
976 1430
977 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 1431 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
978 per_cpu(current_vmcs, cpu) = vmx->vmcs; 1432 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
979 vmcs_load(vmx->vmcs); 1433 vmcs_load(vmx->loaded_vmcs->vmcs);
980 } 1434 }
981 1435
982 if (vcpu->cpu != cpu) { 1436 if (vmx->loaded_vmcs->cpu != cpu) {
983 struct desc_ptr *gdt = &__get_cpu_var(host_gdt); 1437 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
984 unsigned long sysenter_esp; 1438 unsigned long sysenter_esp;
985 1439
986 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1440 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
987 local_irq_disable(); 1441 local_irq_disable();
988 list_add(&vmx->local_vcpus_link, 1442 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
989 &per_cpu(vcpus_on_cpu, cpu)); 1443 &per_cpu(loaded_vmcss_on_cpu, cpu));
990 local_irq_enable(); 1444 local_irq_enable();
991 1445
992 /* 1446 /*
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
998 1452
999 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 1453 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1000 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1454 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1455 vmx->loaded_vmcs->cpu = cpu;
1001 } 1456 }
1002} 1457}
1003 1458
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1005{ 1460{
1006 __vmx_load_host_state(to_vmx(vcpu)); 1461 __vmx_load_host_state(to_vmx(vcpu));
1007 if (!vmm_exclusive) { 1462 if (!vmm_exclusive) {
1008 __vcpu_clear(to_vmx(vcpu)); 1463 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1464 vcpu->cpu = -1;
1009 kvm_cpu_vmxoff(); 1465 kvm_cpu_vmxoff();
1010 } 1466 }
1011} 1467}
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1023 vmcs_writel(GUEST_CR0, cr0); 1479 vmcs_writel(GUEST_CR0, cr0);
1024 update_exception_bitmap(vcpu); 1480 update_exception_bitmap(vcpu);
1025 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 1481 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1482 if (is_guest_mode(vcpu))
1483 vcpu->arch.cr0_guest_owned_bits &=
1484 ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1026 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1485 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1027} 1486}
1028 1487
1029static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 1488static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1030 1489
1490/*
1491 * Return the cr0 value that a nested guest would read. This is a combination
1492 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1493 * its hypervisor (cr0_read_shadow).
1494 */
1495static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1496{
1497 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1498 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1499}
1500static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1501{
1502 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1503 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1504}
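A small worked example (the values are made up): assume L1 asked to own only CR0.TS, hardware runs L2 with TS clear, and L1's read shadow has TS set. Every bit then comes from the real guest_cr0 except TS, which comes from the shadow:

	struct vmcs12 f = {
		.guest_cr0           = X86_CR0_PE | X86_CR0_PG,	/* TS clear while L2 runs */
		.cr0_read_shadow     = X86_CR0_TS,		/* L1 wants L2 to observe TS=1 */
		.cr0_guest_host_mask = X86_CR0_TS,		/* L1 owns only the TS bit */
	};
	/* nested_read_cr0(&f) == X86_CR0_PE | X86_CR0_PG | X86_CR0_TS */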
1505
1031static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 1506static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1032{ 1507{
1508 /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1509 * set this *before* calling this function.
1510 */
1033 vmx_decache_cr0_guest_bits(vcpu); 1511 vmx_decache_cr0_guest_bits(vcpu);
1034 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 1512 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1035 update_exception_bitmap(vcpu); 1513 update_exception_bitmap(vcpu);
1036 vcpu->arch.cr0_guest_owned_bits = 0; 1514 vcpu->arch.cr0_guest_owned_bits = 0;
1037 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1515 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1038 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 1516 if (is_guest_mode(vcpu)) {
1517 /*
1518 * L1's specified read shadow might not contain the TS bit,
1519 * so now that we turned on shadowing of this bit, we need to
1520 * set this bit of the shadow. Like in nested_vmx_run we need
1521 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1522 * up-to-date here because we just decached cr0.TS (and we'll
1523 * only update vmcs12->guest_cr0 on nested exit).
1524 */
1525 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1526 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1527 (vcpu->arch.cr0 & X86_CR0_TS);
1528 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1529 } else
1530 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1039} 1531}
1040 1532
1041static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1533static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1119 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1611 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1120} 1612}
1121 1613
1614/*
1615 * KVM wants to inject page-faults which it got to the guest. This function
1616 * checks whether in a nested guest, we need to inject them to L1 or L2.
1617 * This function assumes it is called with the exit reason in vmcs02 being
1618 * a #PF exception (this is the only case in which KVM injects a #PF when L2
1619 * is running).
1620 */
1621static int nested_pf_handled(struct kvm_vcpu *vcpu)
1622{
1623 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1624
1625 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
1626 if (!(vmcs12->exception_bitmap & PF_VECTOR))
1627 return 0;
1628
1629 nested_vmx_vmexit(vcpu);
1630 return 1;
1631}
1632
1122static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1633static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1123 bool has_error_code, u32 error_code, 1634 bool has_error_code, u32 error_code,
1124 bool reinject) 1635 bool reinject)
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1126 struct vcpu_vmx *vmx = to_vmx(vcpu); 1637 struct vcpu_vmx *vmx = to_vmx(vcpu);
1127 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1638 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1128 1639
1640 if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
1641 nested_pf_handled(vcpu))
1642 return;
1643
1129 if (has_error_code) { 1644 if (has_error_code) {
1130 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 1645 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1131 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1646 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1763static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1249{ 1764{
1250 vmcs_write64(TSC_OFFSET, offset); 1765 vmcs_write64(TSC_OFFSET, offset);
1766 if (is_guest_mode(vcpu))
1767 /*
1768 * We're here if L1 chose not to trap the TSC MSR. Since
1769 * prepare_vmcs12() does not copy tsc_offset, we need to also
1770 * set the vmcs12 field here.
1771 */
1772 get_vmcs12(vcpu)->tsc_offset = offset -
1773 to_vmx(vcpu)->nested.vmcs01_tsc_offset;
1251} 1774}
1252 1775
1253static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1776static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1254{ 1777{
1255 u64 offset = vmcs_read64(TSC_OFFSET); 1778 u64 offset = vmcs_read64(TSC_OFFSET);
1256 vmcs_write64(TSC_OFFSET, offset + adjustment); 1779 vmcs_write64(TSC_OFFSET, offset + adjustment);
1780 if (is_guest_mode(vcpu)) {
1781 /* Even when running L2, the adjustment needs to apply to L1 */
1782 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
1783 }
1257} 1784}
1258 1785
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 1786static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1261 return target_tsc - native_read_tsc(); 1788 return target_tsc - native_read_tsc();
1262} 1789}
1263 1790
1791static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
1792{
1793 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
1794 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
1795}
1796
1797/*
1798 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1799 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1800 * all guests if the "nested" module option is off, and can also be disabled
1801 * for a single guest by disabling its VMX cpuid bit.
1802 */
1803static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1804{
1805 return nested && guest_cpuid_has_vmx(vcpu);
1806}
1807
1808/*
1809 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
1810 * returned for the various VMX controls MSRs when nested VMX is enabled.
1811 * The same values should also be used to verify that vmcs12 control fields are
1812 * valid during nested entry from L1 to L2.
1813 * Each of these control msrs has a low and high 32-bit half: A low bit is on
1814 * if the corresponding bit in the (32-bit) control field *must* be on, and a
1815 * bit in the high half is on if the corresponding bit in the control field
1816 * may be on. See also vmx_control_verify().
1817 * TODO: allow these variables to be modified (downgraded) by module options
1818 * or other means.
1819 */
1820static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
1821static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
1822static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
1823static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
1824static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
1825static __init void nested_vmx_setup_ctls_msrs(void)
1826{
1827 /*
1828 * Note that as a general rule, the high half of the MSRs (bits in
1829 * the control fields which may be 1) should be initialized by the
1830 * intersection of the underlying hardware's MSR (i.e., features which
1831 * can be supported) and the list of features we want to expose -
1832 * because they are known to be properly supported in our code.
1833 * Also, usually, the low half of the MSRs (bits which must be 1) can
1834 * be set to 0, meaning that L1 may turn off any of these bits. The
1835 * reason is that if one of these bits is necessary, it will appear
1836 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
1837 * fields of vmcs01 and vmcs02, will turn these bits off - and
1838 * nested_vmx_exit_handled() will not pass related exits to L1.
1839 * These rules have exceptions below.
1840 */
1841
1842 /* pin-based controls */
1843 /*
1844 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
1845 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
1846 */
 1847 nested_vmx_pinbased_ctls_low = 0x16;
1848 nested_vmx_pinbased_ctls_high = 0x16 |
1849 PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
1850 PIN_BASED_VIRTUAL_NMIS;
1851
1852 /* exit controls */
1853 nested_vmx_exit_ctls_low = 0;
1854 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
1855#ifdef CONFIG_X86_64
1856 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
1857#else
1858 nested_vmx_exit_ctls_high = 0;
1859#endif
1860
1861 /* entry controls */
1862 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
1863 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
1864 nested_vmx_entry_ctls_low = 0;
1865 nested_vmx_entry_ctls_high &=
1866 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
1867
1868 /* cpu-based controls */
1869 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
1870 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
1871 nested_vmx_procbased_ctls_low = 0;
1872 nested_vmx_procbased_ctls_high &=
1873 CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
1874 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
1875 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
1876 CPU_BASED_CR3_STORE_EXITING |
1877#ifdef CONFIG_X86_64
1878 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
1879#endif
1880 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1881 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1882 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1883 /*
1884 * We can allow some features even when not supported by the
1885 * hardware. For example, L1 can specify an MSR bitmap - and we
1886 * can use it to avoid exits to L1 - even when L0 runs L2
1887 * without MSR bitmaps.
1888 */
1889 nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
1890
1891 /* secondary cpu-based controls */
1892 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
1893 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
1894 nested_vmx_secondary_ctls_low = 0;
1895 nested_vmx_secondary_ctls_high &=
1896 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1897}
1898
1899static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
1900{
1901 /*
 1902 * Bits that are 0 in high must be 0, and bits that are 1 in low must be 1.
1903 */
1904 return ((control & high) | low) == control;
1905}
1906
1907static inline u64 vmx_control_msr(u32 low, u32 high)
1908{
1909 return low | ((u64)high << 32);
1910}
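For concreteness, a small example of the verification rule, reusing the pin-based values set up earlier in nested_vmx_setup_ctls_msrs() (illustrative calls, not code from the patch):

	u32 low  = 0x16;	/* the pin-based must-be-1 bits */
	u32 high = 0x16 | PIN_BASED_EXT_INTR_MASK |
		   PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;

	vmx_control_verify(0x16 | PIN_BASED_EXT_INTR_MASK, low, high);	/* true */
	vmx_control_verify(0x14, low, high);	/* false: bit 1, a must-be-1 bit, is clear */
	vmx_control_verify(high | (1u << 30), low, high);	/* false: a bit outside high is set */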
1911
1912/*
1913 * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
1914 * also let it use VMX-specific MSRs.
1915 * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
1916 * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
1917 * like all other MSRs).
1918 */
1919static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1920{
1921 if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
1922 msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
1923 /*
1924 * According to the spec, processors which do not support VMX
1925 * should throw a #GP(0) when VMX capability MSRs are read.
1926 */
1927 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
1928 return 1;
1929 }
1930
1931 switch (msr_index) {
1932 case MSR_IA32_FEATURE_CONTROL:
1933 *pdata = 0;
1934 break;
1935 case MSR_IA32_VMX_BASIC:
1936 /*
1937 * This MSR reports some information about VMX support. We
1938 * should return information about the VMX we emulate for the
1939 * guest, and the VMCS structure we give it - not about the
1940 * VMX support of the underlying hardware.
1941 */
1942 *pdata = VMCS12_REVISION |
1943 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
1944 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
1945 break;
1946 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1947 case MSR_IA32_VMX_PINBASED_CTLS:
1948 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
1949 nested_vmx_pinbased_ctls_high);
1950 break;
1951 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1952 case MSR_IA32_VMX_PROCBASED_CTLS:
1953 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
1954 nested_vmx_procbased_ctls_high);
1955 break;
1956 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1957 case MSR_IA32_VMX_EXIT_CTLS:
1958 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
1959 nested_vmx_exit_ctls_high);
1960 break;
1961 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1962 case MSR_IA32_VMX_ENTRY_CTLS:
1963 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
1964 nested_vmx_entry_ctls_high);
1965 break;
1966 case MSR_IA32_VMX_MISC:
1967 *pdata = 0;
1968 break;
1969 /*
1970 * These MSRs specify bits which the guest must keep fixed (on or off)
1971 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
1972 * We picked the standard core2 setting.
1973 */
1974#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
1975#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
1976 case MSR_IA32_VMX_CR0_FIXED0:
1977 *pdata = VMXON_CR0_ALWAYSON;
1978 break;
1979 case MSR_IA32_VMX_CR0_FIXED1:
1980 *pdata = -1ULL;
1981 break;
1982 case MSR_IA32_VMX_CR4_FIXED0:
1983 *pdata = VMXON_CR4_ALWAYSON;
1984 break;
1985 case MSR_IA32_VMX_CR4_FIXED1:
1986 *pdata = -1ULL;
1987 break;
1988 case MSR_IA32_VMX_VMCS_ENUM:
1989 *pdata = 0x1f;
1990 break;
1991 case MSR_IA32_VMX_PROCBASED_CTLS2:
1992 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
1993 nested_vmx_secondary_ctls_high);
1994 break;
1995 case MSR_IA32_VMX_EPT_VPID_CAP:
1996 /* Currently, no nested ept or nested vpid */
1997 *pdata = 0;
1998 break;
1999 default:
2000 return 0;
2001 }
2002
2003 return 1;
2004}
2005
2006static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2007{
2008 if (!nested_vmx_allowed(vcpu))
2009 return 0;
2010
2011 if (msr_index == MSR_IA32_FEATURE_CONTROL)
2012 /* TODO: the right thing. */
2013 return 1;
2014 /*
2015 * No need to treat VMX capability MSRs specially: If we don't handle
2016 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
2017 */
2018 return 0;
2019}
2020
1264/* 2021/*
1265 * Reads an msr value (of 'msr_index') into 'pdata'. 2022 * Reads an msr value (of 'msr_index') into 'pdata'.
1266 * Returns 0 on success, non-0 otherwise. 2023 * Returns 0 on success, non-0 otherwise.
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1309 /* Otherwise falls through */ 2066 /* Otherwise falls through */
1310 default: 2067 default:
1311 vmx_load_host_state(to_vmx(vcpu)); 2068 vmx_load_host_state(to_vmx(vcpu));
2069 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
2070 return 0;
1312 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2071 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1313 if (msr) { 2072 if (msr) {
1314 vmx_load_host_state(to_vmx(vcpu)); 2073 vmx_load_host_state(to_vmx(vcpu));
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1380 return 1; 2139 return 1;
1381 /* Otherwise falls through */ 2140 /* Otherwise falls through */
1382 default: 2141 default:
2142 if (vmx_set_vmx_msr(vcpu, msr_index, data))
2143 break;
1383 msr = find_msr_entry(vmx, msr_index); 2144 msr = find_msr_entry(vmx, msr_index);
1384 if (msr) { 2145 if (msr) {
1385 vmx_load_host_state(vmx); 2146 vmx_load_host_state(vmx);
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage)
1469 if (read_cr4() & X86_CR4_VMXE) 2230 if (read_cr4() & X86_CR4_VMXE)
1470 return -EBUSY; 2231 return -EBUSY;
1471 2232
1472 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 2233 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
1473 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2234 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1474 2235
1475 test_bits = FEATURE_CONTROL_LOCKED; 2236 test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage)
1493 return 0; 2254 return 0;
1494} 2255}
1495 2256
1496static void vmclear_local_vcpus(void) 2257static void vmclear_local_loaded_vmcss(void)
1497{ 2258{
1498 int cpu = raw_smp_processor_id(); 2259 int cpu = raw_smp_processor_id();
1499 struct vcpu_vmx *vmx, *n; 2260 struct loaded_vmcs *v, *n;
1500 2261
1501 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), 2262 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
1502 local_vcpus_link) 2263 loaded_vmcss_on_cpu_link)
1503 __vcpu_clear(vmx); 2264 __loaded_vmcs_clear(v);
1504} 2265}
1505 2266
1506 2267
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void)
1515static void hardware_disable(void *garbage) 2276static void hardware_disable(void *garbage)
1516{ 2277{
1517 if (vmm_exclusive) { 2278 if (vmm_exclusive) {
1518 vmclear_local_vcpus(); 2279 vmclear_local_loaded_vmcss();
1519 kvm_cpu_vmxoff(); 2280 kvm_cpu_vmxoff();
1520 } 2281 }
1521 write_cr4(read_cr4() & ~X86_CR4_VMXE); 2282 write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs)
1696 free_pages((unsigned long)vmcs, vmcs_config.order); 2457 free_pages((unsigned long)vmcs, vmcs_config.order);
1697} 2458}
1698 2459
2460/*
2461 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2462 */
2463static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2464{
2465 if (!loaded_vmcs->vmcs)
2466 return;
2467 loaded_vmcs_clear(loaded_vmcs);
2468 free_vmcs(loaded_vmcs->vmcs);
2469 loaded_vmcs->vmcs = NULL;
2470}
2471
1699static void free_kvm_area(void) 2472static void free_kvm_area(void)
1700{ 2473{
1701 int cpu; 2474 int cpu;
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void)
1756 if (!cpu_has_vmx_ple()) 2529 if (!cpu_has_vmx_ple())
1757 ple_gap = 0; 2530 ple_gap = 0;
1758 2531
2532 if (nested)
2533 nested_vmx_setup_ctls_msrs();
2534
1759 return alloc_kvm_area(); 2535 return alloc_kvm_area();
1760} 2536}
1761 2537
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2041 (unsigned long *)&vcpu->arch.regs_dirty); 2817 (unsigned long *)&vcpu->arch.regs_dirty);
2042} 2818}
2043 2819
2044static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 2820static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
2045 2821
2046static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 2822static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2047 unsigned long cr0, 2823 unsigned long cr0,
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
2139 vmcs_writel(GUEST_CR3, guest_cr3); 2915 vmcs_writel(GUEST_CR3, guest_cr3);
2140} 2916}
2141 2917
2142static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 2918static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2143{ 2919{
2144 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? 2920 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
2145 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 2921 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
2146 2922
2923 if (cr4 & X86_CR4_VMXE) {
2924 /*
2925 * To use VMXON (and later other VMX instructions), a guest
2926 * must first be able to turn on cr4.VMXE (see handle_vmon()).
2927 * So basically the check on whether to allow nested VMX
2928 * is here.
2929 */
2930 if (!nested_vmx_allowed(vcpu))
2931 return 1;
2932 } else if (to_vmx(vcpu)->nested.vmxon)
2933 return 1;
2934
2147 vcpu->arch.cr4 = cr4; 2935 vcpu->arch.cr4 = cr4;
2148 if (enable_ept) { 2936 if (enable_ept) {
2149 if (!is_paging(vcpu)) { 2937 if (!is_paging(vcpu)) {
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2156 2944
2157 vmcs_writel(CR4_READ_SHADOW, cr4); 2945 vmcs_writel(CR4_READ_SHADOW, cr4);
2158 vmcs_writel(GUEST_CR4, hw_cr4); 2946 vmcs_writel(GUEST_CR4, hw_cr4);
2947 return 0;
2159} 2948}
2160 2949
2161static void vmx_get_segment(struct kvm_vcpu *vcpu, 2950static void vmx_get_segment(struct kvm_vcpu *vcpu,
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2721} 3510}
2722 3511
2723/* 3512/*
3513 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3514 * will not change in the lifetime of the guest.
3515 * Note that host-state that does change is set elsewhere. E.g., host-state
3516 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3517 */
3518static void vmx_set_constant_host_state(void)
3519{
3520 u32 low32, high32;
3521 unsigned long tmpl;
3522 struct desc_ptr dt;
3523
3524 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
3525 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
3526 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
3527
3528 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
3529 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3530 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3531 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3532 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3533
3534 native_store_idt(&dt);
3535 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
3536
3537 asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
3538 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
3539
3540 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3541 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3542 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3543 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
3544
3545 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3546 rdmsr(MSR_IA32_CR_PAT, low32, high32);
3547 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3548 }
3549}
3550
3551static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3552{
3553 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3554 if (enable_ept)
3555 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
3556 if (is_guest_mode(&vmx->vcpu))
3557 vmx->vcpu.arch.cr4_guest_owned_bits &=
3558 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
3559 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3560}
3561
3562static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3563{
3564 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3565 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
3566 exec_control &= ~CPU_BASED_TPR_SHADOW;
3567#ifdef CONFIG_X86_64
3568 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3569 CPU_BASED_CR8_LOAD_EXITING;
3570#endif
3571 }
3572 if (!enable_ept)
3573 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3574 CPU_BASED_CR3_LOAD_EXITING |
3575 CPU_BASED_INVLPG_EXITING;
3576 return exec_control;
3577}
3578
3579static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3580{
3581 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
3582 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
3583 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3584 if (vmx->vpid == 0)
3585 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
3586 if (!enable_ept) {
3587 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3588 enable_unrestricted_guest = 0;
3589 }
3590 if (!enable_unrestricted_guest)
3591 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3592 if (!ple_gap)
3593 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3594 return exec_control;
3595}
3596
3597static void ept_set_mmio_spte_mask(void)
3598{
3599 /*
3600 * EPT Misconfigurations can be generated if the value of bits 2:0
3601 * of an EPT paging-structure entry is 110b (write/execute).
3602 * Also, magic bits (0xffull << 49) is set to quickly identify mmio
3603 * spte.
3604 */
3605 kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
3606}
3607
3608/*
2724 * Sets up the vmcs for emulated real mode. 3609 * Sets up the vmcs for emulated real mode.
2725 */ 3610 */
2726static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 3611static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2727{ 3612{
2728 u32 host_sysenter_cs, msr_low, msr_high; 3613#ifdef CONFIG_X86_64
2729 u32 junk;
2730 u64 host_pat;
2731 unsigned long a; 3614 unsigned long a;
2732 struct desc_ptr dt; 3615#endif
2733 int i; 3616 int i;
2734 unsigned long kvm_vmx_return;
2735 u32 exec_control;
2736 3617
2737 /* I/O */ 3618 /* I/O */
2738 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 3619 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2747 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 3628 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2748 vmcs_config.pin_based_exec_ctrl); 3629 vmcs_config.pin_based_exec_ctrl);
2749 3630
2750 exec_control = vmcs_config.cpu_based_exec_ctrl; 3631 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
2751 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2752 exec_control &= ~CPU_BASED_TPR_SHADOW;
2753#ifdef CONFIG_X86_64
2754 exec_control |= CPU_BASED_CR8_STORE_EXITING |
2755 CPU_BASED_CR8_LOAD_EXITING;
2756#endif
2757 }
2758 if (!enable_ept)
2759 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2760 CPU_BASED_CR3_LOAD_EXITING |
2761 CPU_BASED_INVLPG_EXITING;
2762 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2763 3632
2764 if (cpu_has_secondary_exec_ctrls()) { 3633 if (cpu_has_secondary_exec_ctrls()) {
2765 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 3634 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
2766 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 3635 vmx_secondary_exec_control(vmx));
2767 exec_control &=
2768 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2769 if (vmx->vpid == 0)
2770 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2771 if (!enable_ept) {
2772 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2773 enable_unrestricted_guest = 0;
2774 }
2775 if (!enable_unrestricted_guest)
2776 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2777 if (!ple_gap)
2778 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2779 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2780 } 3636 }
2781 3637
2782 if (ple_gap) { 3638 if (ple_gap) {
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2784 vmcs_write32(PLE_WINDOW, ple_window); 3640 vmcs_write32(PLE_WINDOW, ple_window);
2785 } 3641 }
2786 3642
2787 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 3643 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2788 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 3644 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2789 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 3645 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2790 3646
2791 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2792 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2793 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2794
2795 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
2796 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2797 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2798 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 3647 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
2799 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 3648 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
2800 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3649 vmx_set_constant_host_state();
2801#ifdef CONFIG_X86_64 3650#ifdef CONFIG_X86_64
2802 rdmsrl(MSR_FS_BASE, a); 3651 rdmsrl(MSR_FS_BASE, a);
2803 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 3652 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2808 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 3657 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2809#endif 3658#endif
2810 3659
2811 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2812
2813 native_store_idt(&dt);
2814 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2815
2816 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2817 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2818 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 3660 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2819 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3661 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2820 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 3662 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2821 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3663 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2822 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 3664 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2823 3665
2824 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2825 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2826 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2827 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
2828 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2829 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2830
2831 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2832 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2833 host_pat = msr_low | ((u64) msr_high << 32);
2834 vmcs_write64(HOST_IA32_PAT, host_pat);
2835 }
2836 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 3666 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3667 u32 msr_low, msr_high;
3668 u64 host_pat;
2837 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); 3669 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2838 host_pat = msr_low | ((u64) msr_high << 32); 3670 host_pat = msr_low | ((u64) msr_high << 32);
2839 /* Write the default value follow host pat */ 3671 /* Write the default value follow host pat */
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2863 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 3695 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2864 3696
2865 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3697 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2866 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 3698 set_cr4_guest_host_mask(vmx);
2867 if (enable_ept)
2868 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2869 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2870 3699
2871 kvm_write_tsc(&vmx->vcpu, 0); 3700 kvm_write_tsc(&vmx->vcpu, 0);
2872 3701
@@ -2990,9 +3819,25 @@ out:
2990 return ret; 3819 return ret;
2991} 3820}
2992 3821
3822/*
3823 * In nested virtualization, check if L1 asked to exit on external interrupts.
3824 * For most existing hypervisors, this will always return true.
3825 */
3826static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
3827{
3828 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
3829 PIN_BASED_EXT_INTR_MASK;
3830}
3831
2993static void enable_irq_window(struct kvm_vcpu *vcpu) 3832static void enable_irq_window(struct kvm_vcpu *vcpu)
2994{ 3833{
2995 u32 cpu_based_vm_exec_control; 3834 u32 cpu_based_vm_exec_control;
3835 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
3836 /* We can get here when nested_run_pending caused
3837 * vmx_interrupt_allowed() to return false. In this case, do
3838 * nothing - the interrupt will be injected later.
3839 */
3840 return;
2996 3841
2997 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 3842 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2998 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 3843 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
3049{ 3894{
3050 struct vcpu_vmx *vmx = to_vmx(vcpu); 3895 struct vcpu_vmx *vmx = to_vmx(vcpu);
3051 3896
3897 if (is_guest_mode(vcpu))
3898 return;
3899
3052 if (!cpu_has_virtual_nmis()) { 3900 if (!cpu_has_virtual_nmis()) {
3053 /* 3901 /*
3054 * Tracking the NMI-blocked state in software is built upon 3902 * Tracking the NMI-blocked state in software is built upon
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3115 3963
3116static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 3964static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
3117{ 3965{
3966 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
3967 struct vmcs12 *vmcs12;
3968 if (to_vmx(vcpu)->nested.nested_run_pending)
3969 return 0;
3970 nested_vmx_vmexit(vcpu);
3971 vmcs12 = get_vmcs12(vcpu);
3972 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
3973 vmcs12->vm_exit_intr_info = 0;
3974 /* fall through to normal code, but now in L1, not L2 */
3975 }
3976
3118 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 3977 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
3119 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3978 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3120 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 3979 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3356 hypercall[2] = 0xc1; 4215 hypercall[2] = 0xc1;
3357} 4216}
3358 4217
 4218/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4219static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4220{
4221 if (to_vmx(vcpu)->nested.vmxon &&
4222 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4223 return 1;
4224
4225 if (is_guest_mode(vcpu)) {
4226 /*
4227 * We get here when L2 changed cr0 in a way that did not change
4228 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4229 * but did change L0 shadowed bits. This can currently happen
4230 * with the TS bit: L0 may want to leave TS on (for lazy fpu
4231 * loading) while pretending to allow the guest to change it.
4232 */
4233 if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
4234 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
4235 return 1;
4236 vmcs_writel(CR0_READ_SHADOW, val);
4237 return 0;
4238 } else
4239 return kvm_set_cr0(vcpu, val);
4240}
4241
4242static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4243{
4244 if (is_guest_mode(vcpu)) {
4245 if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
4246 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
4247 return 1;
4248 vmcs_writel(CR4_READ_SHADOW, val);
4249 return 0;
4250 } else
4251 return kvm_set_cr4(vcpu, val);
4252}
4253
 4254/* called to set cr0 as appropriate for clts instruction exit. */
4255static void handle_clts(struct kvm_vcpu *vcpu)
4256{
4257 if (is_guest_mode(vcpu)) {
4258 /*
4259 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
4260 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
4261 * just pretend it's off (also in arch.cr0 for fpu_activate).
4262 */
4263 vmcs_writel(CR0_READ_SHADOW,
4264 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
4265 vcpu->arch.cr0 &= ~X86_CR0_TS;
4266 } else
4267 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4268}
4269
3359static int handle_cr(struct kvm_vcpu *vcpu) 4270static int handle_cr(struct kvm_vcpu *vcpu)
3360{ 4271{
3361 unsigned long exit_qualification, val; 4272 unsigned long exit_qualification, val;
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3372 trace_kvm_cr_write(cr, val); 4283 trace_kvm_cr_write(cr, val);
3373 switch (cr) { 4284 switch (cr) {
3374 case 0: 4285 case 0:
3375 err = kvm_set_cr0(vcpu, val); 4286 err = handle_set_cr0(vcpu, val);
3376 kvm_complete_insn_gp(vcpu, err); 4287 kvm_complete_insn_gp(vcpu, err);
3377 return 1; 4288 return 1;
3378 case 3: 4289 case 3:
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3380 kvm_complete_insn_gp(vcpu, err); 4291 kvm_complete_insn_gp(vcpu, err);
3381 return 1; 4292 return 1;
3382 case 4: 4293 case 4:
3383 err = kvm_set_cr4(vcpu, val); 4294 err = handle_set_cr4(vcpu, val);
3384 kvm_complete_insn_gp(vcpu, err); 4295 kvm_complete_insn_gp(vcpu, err);
3385 return 1; 4296 return 1;
3386 case 8: { 4297 case 8: {
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3398 }; 4309 };
3399 break; 4310 break;
3400 case 2: /* clts */ 4311 case 2: /* clts */
3401 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4312 handle_clts(vcpu);
3402 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 4313 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
3403 skip_emulated_instruction(vcpu); 4314 skip_emulated_instruction(vcpu);
3404 vmx_fpu_activate(vcpu); 4315 vmx_fpu_activate(vcpu);
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
3574 return 1; 4485 return 1;
3575} 4486}
3576 4487
3577static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3578{
3579 kvm_queue_exception(vcpu, UD_VECTOR);
3580 return 1;
3581}
3582
3583static int handle_invd(struct kvm_vcpu *vcpu) 4488static int handle_invd(struct kvm_vcpu *vcpu)
3584{ 4489{
3585 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4490 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3777static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 4682static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3778{ 4683{
3779 u64 sptes[4]; 4684 u64 sptes[4];
3780 int nr_sptes, i; 4685 int nr_sptes, i, ret;
3781 gpa_t gpa; 4686 gpa_t gpa;
3782 4687
3783 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4688 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3784 4689
4690 ret = handle_mmio_page_fault_common(vcpu, gpa, true);
4691 if (likely(ret == 1))
4692 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
4693 EMULATE_DONE;
4694 if (unlikely(!ret))
4695 return 1;
4696
4697 /* It is the real ept misconfig */
3785 printk(KERN_ERR "EPT: Misconfiguration.\n"); 4698 printk(KERN_ERR "EPT: Misconfiguration.\n");
3786 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); 4699 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3787 4700
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
3866} 4779}
3867 4780
3868/* 4781/*
4782 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
4783 * We could reuse a single VMCS for all the L2 guests, but we also want the
4784 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
4785 * allows keeping them loaded on the processor, and in the future will allow
4786 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
4787 * every entry if they never change.
4788 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
 4789 * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
4790 *
4791 * The following functions allocate and free a vmcs02 in this pool.
4792 */
4793
4794/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
4795static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
4796{
4797 struct vmcs02_list *item;
4798 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4799 if (item->vmptr == vmx->nested.current_vmptr) {
4800 list_move(&item->list, &vmx->nested.vmcs02_pool);
4801 return &item->vmcs02;
4802 }
4803
4804 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
4805 /* Recycle the least recently used VMCS. */
4806 item = list_entry(vmx->nested.vmcs02_pool.prev,
4807 struct vmcs02_list, list);
4808 item->vmptr = vmx->nested.current_vmptr;
4809 list_move(&item->list, &vmx->nested.vmcs02_pool);
4810 return &item->vmcs02;
4811 }
4812
4813 /* Create a new VMCS */
4814 item = (struct vmcs02_list *)
4815 kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
4816 if (!item)
4817 return NULL;
4818 item->vmcs02.vmcs = alloc_vmcs();
4819 if (!item->vmcs02.vmcs) {
4820 kfree(item);
4821 return NULL;
4822 }
4823 loaded_vmcs_init(&item->vmcs02);
4824 item->vmptr = vmx->nested.current_vmptr;
4825 list_add(&(item->list), &(vmx->nested.vmcs02_pool));
4826 vmx->nested.vmcs02_num++;
4827 return &item->vmcs02;
4828}
4829
4830/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
4831static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
4832{
4833 struct vmcs02_list *item;
4834 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4835 if (item->vmptr == vmptr) {
4836 free_loaded_vmcs(&item->vmcs02);
4837 list_del(&item->list);
4838 kfree(item);
4839 vmx->nested.vmcs02_num--;
4840 return;
4841 }
4842}
4843
4844/*
 4845 * Free all VMCSs saved for this vcpu, except the one pointed to by
4846 * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
4847 * currently used, if running L2), and vmcs01 when running L2.
4848 */
4849static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
4850{
4851 struct vmcs02_list *item, *n;
4852 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
4853 if (vmx->loaded_vmcs != &item->vmcs02)
4854 free_loaded_vmcs(&item->vmcs02);
4855 list_del(&item->list);
4856 kfree(item);
4857 }
4858 vmx->nested.vmcs02_num = 0;
4859
4860 if (vmx->loaded_vmcs != &vmx->vmcs01)
4861 free_loaded_vmcs(&vmx->vmcs01);
4862}
4863
4864/*
4865 * Emulate the VMXON instruction.
4866 * Currently, we just remember that VMX is active, and do not save or even
4867 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4868 * do not currently need to store anything in that guest-allocated memory
 4869 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4870 * argument is different from the VMXON pointer (which the spec says they do).
4871 */
4872static int handle_vmon(struct kvm_vcpu *vcpu)
4873{
4874 struct kvm_segment cs;
4875 struct vcpu_vmx *vmx = to_vmx(vcpu);
4876
4877 /* The Intel VMX Instruction Reference lists a bunch of bits that
4878 * are prerequisite to running VMXON, most notably cr4.VMXE must be
4879 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
4880 * Otherwise, we should fail with #UD. We test these now:
4881 */
4882 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
4883 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
4884 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4885 kvm_queue_exception(vcpu, UD_VECTOR);
4886 return 1;
4887 }
4888
4889 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4890 if (is_long_mode(vcpu) && !cs.l) {
4891 kvm_queue_exception(vcpu, UD_VECTOR);
4892 return 1;
4893 }
4894
4895 if (vmx_get_cpl(vcpu)) {
4896 kvm_inject_gp(vcpu, 0);
4897 return 1;
4898 }
4899
4900 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
4901 vmx->nested.vmcs02_num = 0;
4902
4903 vmx->nested.vmxon = true;
4904
4905 skip_emulated_instruction(vcpu);
4906 return 1;
4907}
4908
4909/*
4910 * Intel's VMX Instruction Reference specifies a common set of prerequisites
4911 * for running VMX instructions (except VMXON, whose prerequisites are
4912 * slightly different). It also specifies what exception to inject otherwise.
4913 */
4914static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
4915{
4916 struct kvm_segment cs;
4917 struct vcpu_vmx *vmx = to_vmx(vcpu);
4918
4919 if (!vmx->nested.vmxon) {
4920 kvm_queue_exception(vcpu, UD_VECTOR);
4921 return 0;
4922 }
4923
4924 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4925 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
4926 (is_long_mode(vcpu) && !cs.l)) {
4927 kvm_queue_exception(vcpu, UD_VECTOR);
4928 return 0;
4929 }
4930
4931 if (vmx_get_cpl(vcpu)) {
4932 kvm_inject_gp(vcpu, 0);
4933 return 0;
4934 }
4935
4936 return 1;
4937}
4938
4939/*
4940 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
4941 * just stops using VMX.
4942 */
4943static void free_nested(struct vcpu_vmx *vmx)
4944{
4945 if (!vmx->nested.vmxon)
4946 return;
4947 vmx->nested.vmxon = false;
4948 if (vmx->nested.current_vmptr != -1ull) {
4949 kunmap(vmx->nested.current_vmcs12_page);
4950 nested_release_page(vmx->nested.current_vmcs12_page);
4951 vmx->nested.current_vmptr = -1ull;
4952 vmx->nested.current_vmcs12 = NULL;
4953 }
4954 /* Unpin physical memory we referred to in current vmcs02 */
4955 if (vmx->nested.apic_access_page) {
4956 nested_release_page(vmx->nested.apic_access_page);
4957 vmx->nested.apic_access_page = 0;
4958 }
4959
4960 nested_free_all_saved_vmcss(vmx);
4961}
4962
4963/* Emulate the VMXOFF instruction */
4964static int handle_vmoff(struct kvm_vcpu *vcpu)
4965{
4966 if (!nested_vmx_check_permission(vcpu))
4967 return 1;
4968 free_nested(to_vmx(vcpu));
4969 skip_emulated_instruction(vcpu);
4970 return 1;
4971}
4972
4973/*
4974 * Decode the memory-address operand of a vmx instruction, as recorded on an
4975 * exit caused by such an instruction (run by a guest hypervisor).
4976 * On success, returns 0. When the operand is invalid, returns 1 and throws
4977 * #UD or #GP.
4978 */
4979static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
4980 unsigned long exit_qualification,
4981 u32 vmx_instruction_info, gva_t *ret)
4982{
4983 /*
4984 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4985 * Execution", on an exit, vmx_instruction_info holds most of the
4986 * addressing components of the operand. Only the displacement part
4987 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4988 * For how an actual address is calculated from all these components,
4989 * refer to Vol. 1, "Operand Addressing".
4990 */
4991 int scaling = vmx_instruction_info & 3;
4992 int addr_size = (vmx_instruction_info >> 7) & 7;
4993 bool is_reg = vmx_instruction_info & (1u << 10);
4994 int seg_reg = (vmx_instruction_info >> 15) & 7;
4995 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4996 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4997 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4998 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4999
5000 if (is_reg) {
5001 kvm_queue_exception(vcpu, UD_VECTOR);
5002 return 1;
5003 }
5004
5005 /* Addr = segment_base + offset */
5006 /* offset = base + [index * scale] + displacement */
5007 *ret = vmx_get_segment_base(vcpu, seg_reg);
5008 if (base_is_valid)
5009 *ret += kvm_register_read(vcpu, base_reg);
5010 if (index_is_valid)
 5011 *ret += kvm_register_read(vcpu, index_reg) << scaling;
5012 *ret += exit_qualification; /* holds the displacement */
5013
5014 if (addr_size == 1) /* 32 bit */
5015 *ret &= 0xffffffff;
5016
5017 /*
5018 * TODO: throw #GP (and return 1) in various cases that the VM*
5019 * instructions require it - e.g., offset beyond segment limit,
5020 * unusable or unreadable/unwritable segment, non-canonical 64-bit
5021 * address, and so on. Currently these are not checked.
5022 */
5023 return 0;
5024}
5025
5026/*
5027 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5028 * set the success or error code of an emulated VMX instruction, as specified
5029 * by Vol 2B, VMX Instruction Reference, "Conventions".
5030 */
5031static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5032{
5033 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5034 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5035 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5036}
5037
5038static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5039{
5040 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5041 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5042 X86_EFLAGS_SF | X86_EFLAGS_OF))
5043 | X86_EFLAGS_CF);
5044}
5045
5046static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5047 u32 vm_instruction_error)
5048{
5049 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5050 /*
5051 * failValid writes the error number to the current VMCS, which
 5052 * can't be done if there isn't a current VMCS.
5053 */
5054 nested_vmx_failInvalid(vcpu);
5055 return;
5056 }
5057 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5058 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5059 X86_EFLAGS_SF | X86_EFLAGS_OF))
5060 | X86_EFLAGS_ZF);
5061 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5062}
5063
5064/* Emulate the VMCLEAR instruction */
5065static int handle_vmclear(struct kvm_vcpu *vcpu)
5066{
5067 struct vcpu_vmx *vmx = to_vmx(vcpu);
5068 gva_t gva;
5069 gpa_t vmptr;
5070 struct vmcs12 *vmcs12;
5071 struct page *page;
5072 struct x86_exception e;
5073
5074 if (!nested_vmx_check_permission(vcpu))
5075 return 1;
5076
5077 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5078 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5079 return 1;
5080
5081 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5082 sizeof(vmptr), &e)) {
5083 kvm_inject_page_fault(vcpu, &e);
5084 return 1;
5085 }
5086
5087 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5088 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5089 skip_emulated_instruction(vcpu);
5090 return 1;
5091 }
5092
5093 if (vmptr == vmx->nested.current_vmptr) {
5094 kunmap(vmx->nested.current_vmcs12_page);
5095 nested_release_page(vmx->nested.current_vmcs12_page);
5096 vmx->nested.current_vmptr = -1ull;
5097 vmx->nested.current_vmcs12 = NULL;
5098 }
5099
5100 page = nested_get_page(vcpu, vmptr);
5101 if (page == NULL) {
5102 /*
5103 * For accurate processor emulation, VMCLEAR beyond available
5104 * physical memory should do nothing at all. However, it is
5105 * possible that a nested vmx bug, not a guest hypervisor bug,
5106 * resulted in this case, so let's shut down before doing any
5107 * more damage:
5108 */
5109 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5110 return 1;
5111 }
5112 vmcs12 = kmap(page);
5113 vmcs12->launch_state = 0;
5114 kunmap(page);
5115 nested_release_page(page);
5116
5117 nested_free_vmcs02(vmx, vmptr);
5118
5119 skip_emulated_instruction(vcpu);
5120 nested_vmx_succeed(vcpu);
5121 return 1;
5122}
5123
5124static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
5125
5126/* Emulate the VMLAUNCH instruction */
5127static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5128{
5129 return nested_vmx_run(vcpu, true);
5130}
5131
5132/* Emulate the VMRESUME instruction */
5133static int handle_vmresume(struct kvm_vcpu *vcpu)
5134{
5135
5136 return nested_vmx_run(vcpu, false);
5137}
5138
5139enum vmcs_field_type {
5140 VMCS_FIELD_TYPE_U16 = 0,
5141 VMCS_FIELD_TYPE_U64 = 1,
5142 VMCS_FIELD_TYPE_U32 = 2,
5143 VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
5144};
5145
5146static inline int vmcs_field_type(unsigned long field)
5147{
5148 if (0x1 & field) /* the *_HIGH fields are all 32 bit */
5149 return VMCS_FIELD_TYPE_U32;
 5150 return (field >> 13) & 0x3;
5151}
5152
5153static inline int vmcs_field_readonly(unsigned long field)
5154{
5155 return (((field >> 10) & 0x3) == 1);
5156}
5157
5158/*
5159 * Read a vmcs12 field. Since these can have varying lengths and we return
5160 * one type, we chose the biggest type (u64) and zero-extend the return value
5161 * to that size. Note that the caller, handle_vmread, might need to use only
5162 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
5163 * 64-bit fields are to be returned).
5164 */
5165static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
5166 unsigned long field, u64 *ret)
5167{
5168 short offset = vmcs_field_to_offset(field);
5169 char *p;
5170
5171 if (offset < 0)
5172 return 0;
5173
5174 p = ((char *)(get_vmcs12(vcpu))) + offset;
5175
5176 switch (vmcs_field_type(field)) {
5177 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5178 *ret = *((natural_width *)p);
5179 return 1;
5180 case VMCS_FIELD_TYPE_U16:
5181 *ret = *((u16 *)p);
5182 return 1;
5183 case VMCS_FIELD_TYPE_U32:
5184 *ret = *((u32 *)p);
5185 return 1;
5186 case VMCS_FIELD_TYPE_U64:
5187 *ret = *((u64 *)p);
5188 return 1;
5189 default:
5190 return 0; /* can never happen. */
5191 }
5192}
5193
5194/*
5195 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
5196 * used before) all generate the same failure when it is missing.
5197 */
5198static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
5199{
5200 struct vcpu_vmx *vmx = to_vmx(vcpu);
5201 if (vmx->nested.current_vmptr == -1ull) {
5202 nested_vmx_failInvalid(vcpu);
5203 skip_emulated_instruction(vcpu);
5204 return 0;
5205 }
5206 return 1;
5207}
5208
5209static int handle_vmread(struct kvm_vcpu *vcpu)
5210{
5211 unsigned long field;
5212 u64 field_value;
5213 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5214 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5215 gva_t gva = 0;
5216
5217 if (!nested_vmx_check_permission(vcpu) ||
5218 !nested_vmx_check_vmcs12(vcpu))
5219 return 1;
5220
5221 /* Decode instruction info and find the field to read */
5222 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5223 /* Read the field, zero-extended to a u64 field_value */
5224 if (!vmcs12_read_any(vcpu, field, &field_value)) {
5225 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5226 skip_emulated_instruction(vcpu);
5227 return 1;
5228 }
5229 /*
5230 * Now copy part of this value to register or memory, as requested.
5231 * Note that the number of bits actually copied is 32 or 64 depending
5232 * on the guest's mode (32 or 64 bit), not on the given field's length.
5233 */
5234 if (vmx_instruction_info & (1u << 10)) {
5235 kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
5236 field_value);
5237 } else {
5238 if (get_vmx_mem_address(vcpu, exit_qualification,
5239 vmx_instruction_info, &gva))
5240 return 1;
5241 /* _system ok, as nested_vmx_check_permission verified cpl=0 */
5242 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
5243 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
5244 }
5245
5246 nested_vmx_succeed(vcpu);
5247 skip_emulated_instruction(vcpu);
5248 return 1;
5249}
5250
5251
5252static int handle_vmwrite(struct kvm_vcpu *vcpu)
5253{
5254 unsigned long field;
5255 gva_t gva;
5256 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5257 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5258 char *p;
5259 short offset;
5260 /* The value to write might be 32 or 64 bits, depending on L1's long
5261 * mode, and eventually we need to write that into a field of several
5262 * possible lengths. The code below first zero-extends the value to 64
 5263 * bit (field_value), and then copies only the appropriate number of
5264 * bits into the vmcs12 field.
5265 */
5266 u64 field_value = 0;
5267 struct x86_exception e;
5268
5269 if (!nested_vmx_check_permission(vcpu) ||
5270 !nested_vmx_check_vmcs12(vcpu))
5271 return 1;
5272
5273 if (vmx_instruction_info & (1u << 10))
5274 field_value = kvm_register_read(vcpu,
5275 (((vmx_instruction_info) >> 3) & 0xf));
5276 else {
5277 if (get_vmx_mem_address(vcpu, exit_qualification,
5278 vmx_instruction_info, &gva))
5279 return 1;
5280 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
5281 &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
5282 kvm_inject_page_fault(vcpu, &e);
5283 return 1;
5284 }
5285 }
5286
5287
5288 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5289 if (vmcs_field_readonly(field)) {
5290 nested_vmx_failValid(vcpu,
5291 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5292 skip_emulated_instruction(vcpu);
5293 return 1;
5294 }
5295
5296 offset = vmcs_field_to_offset(field);
5297 if (offset < 0) {
5298 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5299 skip_emulated_instruction(vcpu);
5300 return 1;
5301 }
5302 p = ((char *) get_vmcs12(vcpu)) + offset;
5303
5304 switch (vmcs_field_type(field)) {
5305 case VMCS_FIELD_TYPE_U16:
5306 *(u16 *)p = field_value;
5307 break;
5308 case VMCS_FIELD_TYPE_U32:
5309 *(u32 *)p = field_value;
5310 break;
5311 case VMCS_FIELD_TYPE_U64:
5312 *(u64 *)p = field_value;
5313 break;
5314 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5315 *(natural_width *)p = field_value;
5316 break;
5317 default:
5318 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5319 skip_emulated_instruction(vcpu);
5320 return 1;
5321 }
5322
5323 nested_vmx_succeed(vcpu);
5324 skip_emulated_instruction(vcpu);
5325 return 1;
5326}
5327
5328/* Emulate the VMPTRLD instruction */
5329static int handle_vmptrld(struct kvm_vcpu *vcpu)
5330{
5331 struct vcpu_vmx *vmx = to_vmx(vcpu);
5332 gva_t gva;
5333 gpa_t vmptr;
5334 struct x86_exception e;
5335
5336 if (!nested_vmx_check_permission(vcpu))
5337 return 1;
5338
5339 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5340 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5341 return 1;
5342
5343 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5344 sizeof(vmptr), &e)) {
5345 kvm_inject_page_fault(vcpu, &e);
5346 return 1;
5347 }
5348
5349 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5350 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5351 skip_emulated_instruction(vcpu);
5352 return 1;
5353 }
5354
5355 if (vmx->nested.current_vmptr != vmptr) {
5356 struct vmcs12 *new_vmcs12;
5357 struct page *page;
5358 page = nested_get_page(vcpu, vmptr);
5359 if (page == NULL) {
5360 nested_vmx_failInvalid(vcpu);
5361 skip_emulated_instruction(vcpu);
5362 return 1;
5363 }
5364 new_vmcs12 = kmap(page);
5365 if (new_vmcs12->revision_id != VMCS12_REVISION) {
5366 kunmap(page);
5367 nested_release_page_clean(page);
5368 nested_vmx_failValid(vcpu,
5369 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5370 skip_emulated_instruction(vcpu);
5371 return 1;
5372 }
5373 if (vmx->nested.current_vmptr != -1ull) {
5374 kunmap(vmx->nested.current_vmcs12_page);
5375 nested_release_page(vmx->nested.current_vmcs12_page);
5376 }
5377
5378 vmx->nested.current_vmptr = vmptr;
5379 vmx->nested.current_vmcs12 = new_vmcs12;
5380 vmx->nested.current_vmcs12_page = page;
5381 }
5382
5383 nested_vmx_succeed(vcpu);
5384 skip_emulated_instruction(vcpu);
5385 return 1;
5386}
5387
5388/* Emulate the VMPTRST instruction */
5389static int handle_vmptrst(struct kvm_vcpu *vcpu)
5390{
5391 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5392 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5393 gva_t vmcs_gva;
5394 struct x86_exception e;
5395
5396 if (!nested_vmx_check_permission(vcpu))
5397 return 1;
5398
5399 if (get_vmx_mem_address(vcpu, exit_qualification,
5400 vmx_instruction_info, &vmcs_gva))
5401 return 1;
5402 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
5403 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
5404 (void *)&to_vmx(vcpu)->nested.current_vmptr,
5405 sizeof(u64), &e)) {
5406 kvm_inject_page_fault(vcpu, &e);
5407 return 1;
5408 }
5409 nested_vmx_succeed(vcpu);
5410 skip_emulated_instruction(vcpu);
5411 return 1;
5412}
5413
5414/*
3869 * The exit handlers return 1 if the exit was handled fully and guest execution 5415 * The exit handlers return 1 if the exit was handled fully and guest execution
3870 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5416 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3871 * to be done to userspace and return 0. 5417 * to be done to userspace and return 0.
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3886 [EXIT_REASON_INVD] = handle_invd, 5432 [EXIT_REASON_INVD] = handle_invd,
3887 [EXIT_REASON_INVLPG] = handle_invlpg, 5433 [EXIT_REASON_INVLPG] = handle_invlpg,
3888 [EXIT_REASON_VMCALL] = handle_vmcall, 5434 [EXIT_REASON_VMCALL] = handle_vmcall,
3889 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 5435 [EXIT_REASON_VMCLEAR] = handle_vmclear,
3890 [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, 5436 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
3891 [EXIT_REASON_VMPTRLD] = handle_vmx_insn, 5437 [EXIT_REASON_VMPTRLD] = handle_vmptrld,
3892 [EXIT_REASON_VMPTRST] = handle_vmx_insn, 5438 [EXIT_REASON_VMPTRST] = handle_vmptrst,
3893 [EXIT_REASON_VMREAD] = handle_vmx_insn, 5439 [EXIT_REASON_VMREAD] = handle_vmread,
3894 [EXIT_REASON_VMRESUME] = handle_vmx_insn, 5440 [EXIT_REASON_VMRESUME] = handle_vmresume,
3895 [EXIT_REASON_VMWRITE] = handle_vmx_insn, 5441 [EXIT_REASON_VMWRITE] = handle_vmwrite,
3896 [EXIT_REASON_VMOFF] = handle_vmx_insn, 5442 [EXIT_REASON_VMOFF] = handle_vmoff,
3897 [EXIT_REASON_VMON] = handle_vmx_insn, 5443 [EXIT_REASON_VMON] = handle_vmon,
3898 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5444 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3899 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5445 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3900 [EXIT_REASON_WBINVD] = handle_wbinvd, 5446 [EXIT_REASON_WBINVD] = handle_wbinvd,
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3911static const int kvm_vmx_max_exit_handlers = 5457static const int kvm_vmx_max_exit_handlers =
3912 ARRAY_SIZE(kvm_vmx_exit_handlers); 5458 ARRAY_SIZE(kvm_vmx_exit_handlers);
3913 5459
5460/*
 5461 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5462 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5463 * disinterest in the current event (read or write a specific MSR) by using an
5464 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5465 */
5466static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5467 struct vmcs12 *vmcs12, u32 exit_reason)
5468{
5469 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5470 gpa_t bitmap;
5471
5472 if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
5473 return 1;
5474
5475 /*
5476 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5477 * for the four combinations of read/write and low/high MSR numbers.
5478 * First we need to figure out which of the four to use:
5479 */
5480 bitmap = vmcs12->msr_bitmap;
5481 if (exit_reason == EXIT_REASON_MSR_WRITE)
5482 bitmap += 2048;
5483 if (msr_index >= 0xc0000000) {
5484 msr_index -= 0xc0000000;
5485 bitmap += 1024;
5486 }
5487
5488 /* Then read the msr_index'th bit from this bitmap: */
5489 if (msr_index < 1024*8) {
5490 unsigned char b;
5491 kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
5492 return 1 & (b >> (msr_index & 7));
5493 } else
5494 return 1; /* let L1 handle the wrong parameter */
5495}
5496
5497/*
5498 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5499 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5500 * intercept (via guest_host_mask etc.) the current event.
5501 */
5502static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5503 struct vmcs12 *vmcs12)
5504{
5505 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5506 int cr = exit_qualification & 15;
5507 int reg = (exit_qualification >> 8) & 15;
5508 unsigned long val = kvm_register_read(vcpu, reg);
5509
5510 switch ((exit_qualification >> 4) & 3) {
5511 case 0: /* mov to cr */
5512 switch (cr) {
5513 case 0:
5514 if (vmcs12->cr0_guest_host_mask &
5515 (val ^ vmcs12->cr0_read_shadow))
5516 return 1;
5517 break;
5518 case 3:
5519 if ((vmcs12->cr3_target_count >= 1 &&
5520 vmcs12->cr3_target_value0 == val) ||
5521 (vmcs12->cr3_target_count >= 2 &&
5522 vmcs12->cr3_target_value1 == val) ||
5523 (vmcs12->cr3_target_count >= 3 &&
5524 vmcs12->cr3_target_value2 == val) ||
5525 (vmcs12->cr3_target_count >= 4 &&
5526 vmcs12->cr3_target_value3 == val))
5527 return 0;
5528 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5529 return 1;
5530 break;
5531 case 4:
5532 if (vmcs12->cr4_guest_host_mask &
5533 (vmcs12->cr4_read_shadow ^ val))
5534 return 1;
5535 break;
5536 case 8:
5537 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5538 return 1;
5539 break;
5540 }
5541 break;
5542 case 2: /* clts */
5543 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5544 (vmcs12->cr0_read_shadow & X86_CR0_TS))
5545 return 1;
5546 break;
5547 case 1: /* mov from cr */
5548 switch (cr) {
5549 case 3:
5550 if (vmcs12->cpu_based_vm_exec_control &
5551 CPU_BASED_CR3_STORE_EXITING)
5552 return 1;
5553 break;
5554 case 8:
5555 if (vmcs12->cpu_based_vm_exec_control &
5556 CPU_BASED_CR8_STORE_EXITING)
5557 return 1;
5558 break;
5559 }
5560 break;
5561 case 3: /* lmsw */
5562 /*
5563 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5564 * cr0. Other attempted changes are ignored, with no exit.
5565 */
5566 if (vmcs12->cr0_guest_host_mask & 0xe &
5567 (val ^ vmcs12->cr0_read_shadow))
5568 return 1;
5569 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5570 !(vmcs12->cr0_read_shadow & 0x1) &&
5571 (val & 0x1))
5572 return 1;
5573 break;
5574 }
5575 return 0;
5576}
5577
5578/*
5579 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5580 * should handle it ourselves in L0 (and then continue L2). Only call this
5581 * when in is_guest_mode (L2).
5582 */
5583static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
5584{
5585 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
5586 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5587 struct vcpu_vmx *vmx = to_vmx(vcpu);
5588 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5589
5590 if (vmx->nested.nested_run_pending)
5591 return 0;
5592
5593 if (unlikely(vmx->fail)) {
5594 printk(KERN_INFO "%s failed vm entry %x\n",
5595 __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
5596 return 1;
5597 }
5598
5599 switch (exit_reason) {
5600 case EXIT_REASON_EXCEPTION_NMI:
5601 if (!is_exception(intr_info))
5602 return 0;
5603 else if (is_page_fault(intr_info))
5604 return enable_ept;
5605 return vmcs12->exception_bitmap &
5606 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5607 case EXIT_REASON_EXTERNAL_INTERRUPT:
5608 return 0;
5609 case EXIT_REASON_TRIPLE_FAULT:
5610 return 1;
5611 case EXIT_REASON_PENDING_INTERRUPT:
5612 case EXIT_REASON_NMI_WINDOW:
5613 /*
 5614 * prepare_vmcs02() sets the CPU_BASED_VIRTUAL_INTR_PENDING bit
5615 * (aka Interrupt Window Exiting) only when L1 turned it on,
5616 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
5617 * Same for NMI Window Exiting.
5618 */
5619 return 1;
5620 case EXIT_REASON_TASK_SWITCH:
5621 return 1;
5622 case EXIT_REASON_CPUID:
5623 return 1;
5624 case EXIT_REASON_HLT:
5625 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5626 case EXIT_REASON_INVD:
5627 return 1;
5628 case EXIT_REASON_INVLPG:
5629 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5630 case EXIT_REASON_RDPMC:
5631 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5632 case EXIT_REASON_RDTSC:
5633 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5634 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5635 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5636 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
5637 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
5638 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5639 /*
5640 * VMX instructions trap unconditionally. This allows L1 to
5641 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5642 */
5643 return 1;
5644 case EXIT_REASON_CR_ACCESS:
5645 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5646 case EXIT_REASON_DR_ACCESS:
5647 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5648 case EXIT_REASON_IO_INSTRUCTION:
5649 /* TODO: support IO bitmaps */
5650 return 1;
5651 case EXIT_REASON_MSR_READ:
5652 case EXIT_REASON_MSR_WRITE:
5653 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5654 case EXIT_REASON_INVALID_STATE:
5655 return 1;
5656 case EXIT_REASON_MWAIT_INSTRUCTION:
5657 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5658 case EXIT_REASON_MONITOR_INSTRUCTION:
5659 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5660 case EXIT_REASON_PAUSE_INSTRUCTION:
5661 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5662 nested_cpu_has2(vmcs12,
5663 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5664 case EXIT_REASON_MCE_DURING_VMENTRY:
5665 return 0;
5666 case EXIT_REASON_TPR_BELOW_THRESHOLD:
5667 return 1;
5668 case EXIT_REASON_APIC_ACCESS:
5669 return nested_cpu_has2(vmcs12,
5670 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
5671 case EXIT_REASON_EPT_VIOLATION:
5672 case EXIT_REASON_EPT_MISCONFIG:
5673 return 0;
5674 case EXIT_REASON_WBINVD:
5675 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5676 case EXIT_REASON_XSETBV:
5677 return 1;
5678 default:
5679 return 1;
5680 }
5681}
5682
3914static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 5683static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3915{ 5684{
3916 *info1 = vmcs_readl(EXIT_QUALIFICATION); 5685 *info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3933 if (vmx->emulation_required && emulate_invalid_guest_state) 5702 if (vmx->emulation_required && emulate_invalid_guest_state)
3934 return handle_invalid_guest_state(vcpu); 5703 return handle_invalid_guest_state(vcpu);
3935 5704
5705 /*
5706 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
5707 * we did not inject a still-pending event to L1 now because of
5708 * nested_run_pending, we need to re-enable this bit.
5709 */
5710 if (vmx->nested.nested_run_pending)
5711 kvm_make_request(KVM_REQ_EVENT, vcpu);
5712
5713 if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
5714 exit_reason == EXIT_REASON_VMRESUME))
5715 vmx->nested.nested_run_pending = 1;
5716 else
5717 vmx->nested.nested_run_pending = 0;
5718
5719 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
5720 nested_vmx_vmexit(vcpu);
5721 return 1;
5722 }
5723
3936 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 5724 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3937 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 5725 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3938 vcpu->run->fail_entry.hardware_entry_failure_reason 5726 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3955 "(0x%x) and exit reason is 0x%x\n", 5743 "(0x%x) and exit reason is 0x%x\n",
3956 __func__, vectoring_info, exit_reason); 5744 __func__, vectoring_info, exit_reason);
3957 5745
3958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { 5746 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5747 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
5748 get_vmcs12(vcpu), vcpu)))) {
3959 if (vmx_interrupt_allowed(vcpu)) { 5749 if (vmx_interrupt_allowed(vcpu)) {
3960 vmx->soft_vnmi_blocked = 0; 5750 vmx->soft_vnmi_blocked = 0;
3961 } else if (vmx->vnmi_blocked_time > 1000000000LL && 5751 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
4118 5908
4119static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 5909static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4120{ 5910{
5911 if (is_guest_mode(&vmx->vcpu))
5912 return;
4121 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, 5913 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
4122 VM_EXIT_INSTRUCTION_LEN, 5914 VM_EXIT_INSTRUCTION_LEN,
4123 IDT_VECTORING_ERROR_CODE); 5915 IDT_VECTORING_ERROR_CODE);
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4125 5917
4126static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 5918static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
4127{ 5919{
5920 if (is_guest_mode(vcpu))
5921 return;
4128 __vmx_complete_interrupts(to_vmx(vcpu), 5922 __vmx_complete_interrupts(to_vmx(vcpu),
4129 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 5923 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
4130 VM_ENTRY_INSTRUCTION_LEN, 5924 VM_ENTRY_INSTRUCTION_LEN,
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4145{ 5939{
4146 struct vcpu_vmx *vmx = to_vmx(vcpu); 5940 struct vcpu_vmx *vmx = to_vmx(vcpu);
4147 5941
5942 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
5943 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5944 if (vmcs12->idt_vectoring_info_field &
5945 VECTORING_INFO_VALID_MASK) {
5946 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5947 vmcs12->idt_vectoring_info_field);
5948 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5949 vmcs12->vm_exit_instruction_len);
5950 if (vmcs12->idt_vectoring_info_field &
5951 VECTORING_INFO_DELIVER_CODE_MASK)
5952 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
5953 vmcs12->idt_vectoring_error_code);
5954 }
5955 }
5956
4148 /* Record the guest's net vcpu time for enforced NMI injections. */ 5957 /* Record the guest's net vcpu time for enforced NMI injections. */
4149 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 5958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
4150 vmx->entry_time = ktime_get(); 5959 vmx->entry_time = ktime_get();
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4167 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5976 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
4168 vmx_set_interrupt_shadow(vcpu, 0); 5977 vmx_set_interrupt_shadow(vcpu, 0);
4169 5978
5979 vmx->__launched = vmx->loaded_vmcs->launched;
4170 asm( 5980 asm(
4171 /* Store host registers */ 5981 /* Store host registers */
4172 "push %%"R"dx; push %%"R"bp;" 5982 "push %%"R"dx; push %%"R"bp;"
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4237 "pop %%"R"bp; pop %%"R"dx \n\t" 6047 "pop %%"R"bp; pop %%"R"dx \n\t"
4238 "setbe %c[fail](%0) \n\t" 6048 "setbe %c[fail](%0) \n\t"
4239 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 6049 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4240 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 6050 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
4241 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 6051 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
4242 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 6052 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
4243 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 6053 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4276 6086
4277 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 6087 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4278 6088
6089 if (is_guest_mode(vcpu)) {
6090 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6091 vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
6092 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
6093 vmcs12->idt_vectoring_error_code =
6094 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6095 vmcs12->vm_exit_instruction_len =
6096 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6097 }
6098 }
6099
4279 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 6100 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4280 vmx->launched = 1; 6101 vmx->loaded_vmcs->launched = 1;
4281 6102
4282 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 6103 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4283 6104
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4289#undef R 6110#undef R
4290#undef Q 6111#undef Q
4291 6112
4292static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
4293{
4294 struct vcpu_vmx *vmx = to_vmx(vcpu);
4295
4296 if (vmx->vmcs) {
4297 vcpu_clear(vmx);
4298 free_vmcs(vmx->vmcs);
4299 vmx->vmcs = NULL;
4300 }
4301}
4302
4303static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6113static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4304{ 6114{
4305 struct vcpu_vmx *vmx = to_vmx(vcpu); 6115 struct vcpu_vmx *vmx = to_vmx(vcpu);
4306 6116
4307 free_vpid(vmx); 6117 free_vpid(vmx);
4308 vmx_free_vmcs(vcpu); 6118 free_nested(vmx);
6119 free_loaded_vmcs(vmx->loaded_vmcs);
4309 kfree(vmx->guest_msrs); 6120 kfree(vmx->guest_msrs);
4310 kvm_vcpu_uninit(vcpu); 6121 kvm_vcpu_uninit(vcpu);
4311 kmem_cache_free(kvm_vcpu_cache, vmx); 6122 kmem_cache_free(kvm_vcpu_cache, vmx);
4312} 6123}
4313 6124
4314static inline void vmcs_init(struct vmcs *vmcs)
4315{
4316 u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4317
4318 if (!vmm_exclusive)
4319 kvm_cpu_vmxon(phys_addr);
4320
4321 vmcs_clear(vmcs);
4322
4323 if (!vmm_exclusive)
4324 kvm_cpu_vmxoff();
4325}
4326
4327static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 6125static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4328{ 6126{
4329 int err; 6127 int err;
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4345 goto uninit_vcpu; 6143 goto uninit_vcpu;
4346 } 6144 }
4347 6145
4348 vmx->vmcs = alloc_vmcs(); 6146 vmx->loaded_vmcs = &vmx->vmcs01;
4349 if (!vmx->vmcs) 6147 vmx->loaded_vmcs->vmcs = alloc_vmcs();
6148 if (!vmx->loaded_vmcs->vmcs)
4350 goto free_msrs; 6149 goto free_msrs;
4351 6150 if (!vmm_exclusive)
4352 vmcs_init(vmx->vmcs); 6151 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
6152 loaded_vmcs_init(vmx->loaded_vmcs);
6153 if (!vmm_exclusive)
6154 kvm_cpu_vmxoff();
4353 6155
4354 cpu = get_cpu(); 6156 cpu = get_cpu();
4355 vmx_vcpu_load(&vmx->vcpu, cpu); 6157 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4375 goto free_vmcs; 6177 goto free_vmcs;
4376 } 6178 }
4377 6179
6180 vmx->nested.current_vmptr = -1ull;
6181 vmx->nested.current_vmcs12 = NULL;
6182
4378 return &vmx->vcpu; 6183 return &vmx->vcpu;
4379 6184
4380free_vmcs: 6185free_vmcs:
4381 free_vmcs(vmx->vmcs); 6186 free_vmcs(vmx->loaded_vmcs->vmcs);
4382free_msrs: 6187free_msrs:
4383 kfree(vmx->guest_msrs); 6188 kfree(vmx->guest_msrs);
4384uninit_vcpu: 6189uninit_vcpu:
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4512 6317
4513static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6318static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4514{ 6319{
6320 if (func == 1 && nested)
6321 entry->ecx |= bit(X86_FEATURE_VMX);
6322}
6323
6324/*
6325 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
6326 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 6327 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 6328 * guest in a way that is appropriate both to L1's requests and to our own
 6329 * needs. In addition to modifying the active vmcs (which is vmcs02), this
6330 * function also has additional necessary side-effects, like setting various
6331 * vcpu->arch fields.
6332 */
6333static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6334{
6335 struct vcpu_vmx *vmx = to_vmx(vcpu);
6336 u32 exec_control;
6337
6338 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
6339 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
6340 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
6341 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
6342 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
6343 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
6344 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
6345 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
6346 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
6347 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
6348 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
6349 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
6350 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
6351 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
6352 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
6353 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
6354 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
6355 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
6356 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
6357 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
6358 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
6359 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
6360 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
6361 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
6362 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
6363 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
6364 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
6365 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
6366 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
6367 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
6368 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
6369 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
6370 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
6371 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
6372 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
6373 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
6374
6375 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
6376 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6377 vmcs12->vm_entry_intr_info_field);
6378 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
6379 vmcs12->vm_entry_exception_error_code);
6380 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6381 vmcs12->vm_entry_instruction_len);
6382 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
6383 vmcs12->guest_interruptibility_info);
6384 vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
6385 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
6386 vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
6387 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
6388 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
6389 vmcs12->guest_pending_dbg_exceptions);
6390 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
6391 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
6392
6393 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6394
6395 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
6396 (vmcs_config.pin_based_exec_ctrl |
6397 vmcs12->pin_based_vm_exec_control));
6398
6399 /*
6400 * Whether page-faults are trapped is determined by a combination of
6401 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
6402 * If enable_ept, L0 doesn't care about page faults and we should
6403 * set all of these to L1's desires. However, if !enable_ept, L0 does
6404 * care about (at least some) page faults, and because it is not easy
6405 * (if at all possible?) to merge L0 and L1's desires, we simply ask
6406 * to exit on each and every L2 page fault. This is done by setting
6407 * MASK=MATCH=0 and (see below) EB.PF=1.
6408 * Note that below we don't need special code to set EB.PF beyond the
6409 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
6410 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
6411 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
6412 *
6413 * A problem with this approach (when !enable_ept) is that L1 may be
6414 * injected with more page faults than it asked for. This could have
6415 * caused problems, but in practice existing hypervisors don't care.
6416 * To fix this, we will need to emulate the PFEC checking (on the L1
6417 * page tables), using walk_addr(), when injecting PFs to L1.
6418 */
6419 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
6420 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
6421 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
6422 enable_ept ? vmcs12->page_fault_error_code_match : 0);
6423
6424 if (cpu_has_secondary_exec_ctrls()) {
6425 u32 exec_control = vmx_secondary_exec_control(vmx);
6426 if (!vmx->rdtscp_enabled)
6427 exec_control &= ~SECONDARY_EXEC_RDTSCP;
6428 /* Take the following fields only from vmcs12 */
6429 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6430 if (nested_cpu_has(vmcs12,
6431 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
6432 exec_control |= vmcs12->secondary_vm_exec_control;
6433
6434 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
6435 /*
6436 * Translate L1 physical address to host physical
6437 * address for vmcs02. Keep the page pinned, so this
6438 * physical address remains valid. We keep a reference
6439 * to it so we can release it later.
6440 */
6441 if (vmx->nested.apic_access_page) /* shouldn't happen */
6442 nested_release_page(vmx->nested.apic_access_page);
6443 vmx->nested.apic_access_page =
6444 nested_get_page(vcpu, vmcs12->apic_access_addr);
6445 /*
6446 * If translation failed, no matter: This feature asks
6447 * to exit when accessing the given address, and if it
6448 * can never be accessed, this feature won't do
6449 * anything anyway.
6450 */
6451 if (!vmx->nested.apic_access_page)
6452 exec_control &=
6453 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6454 else
6455 vmcs_write64(APIC_ACCESS_ADDR,
6456 page_to_phys(vmx->nested.apic_access_page));
6457 }
6458
6459 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6460 }
6461
6462
6463 /*
6464 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
6465 * Some constant fields are set here by vmx_set_constant_host_state().
6466 * Other fields are different per CPU, and will be set later when
6467 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
6468 */
6469 vmx_set_constant_host_state();
6470
6471 /*
6472 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
6473 * entry, but only if the current (host) sp changed from the value
6474 * we wrote last (vmx->host_rsp). This cache is no longer relevant
6475 * if we switch vmcs, and rather than hold a separate cache per vmcs,
6476 * here we just force the write to happen on entry.
6477 */
6478 vmx->host_rsp = 0;
6479
6480 exec_control = vmx_exec_control(vmx); /* L0's desires */
6481 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
6482 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
6483 exec_control &= ~CPU_BASED_TPR_SHADOW;
6484 exec_control |= vmcs12->cpu_based_vm_exec_control;
6485 /*
6486 * Merging of IO and MSR bitmaps not currently supported.
6487 * Rather, exit every time.
6488 */
6489 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
6490 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
6491 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
6492
6493 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
6494
6495 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
6496 * bitwise-or of what L1 wants to trap for L2, and what we want to
6497 * trap. Note that CR0.TS also needs updating - we do this later.
6498 */
6499 update_exception_bitmap(vcpu);
6500 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
6501 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
6502
 6503	/* Note: IA32E_MODE and LOAD_IA32_EFER are modified by vmx_set_efer() below */
6504 vmcs_write32(VM_EXIT_CONTROLS,
6505 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
6506 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
6507 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
6508
6509 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
6510 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
6511 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6512 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
6513
6514
6515 set_cr4_guest_host_mask(vmx);
6516
6517 vmcs_write64(TSC_OFFSET,
6518 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
6519
6520 if (enable_vpid) {
6521 /*
6522 * Trivially support vpid by letting L2s share their parent
6523 * L1's vpid. TODO: move to a more elaborate solution, giving
6524 * each L2 its own vpid and exposing the vpid feature to L1.
6525 */
6526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6527 vmx_flush_tlb(vcpu);
6528 }
6529
6530 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
6531 vcpu->arch.efer = vmcs12->guest_ia32_efer;
6532 if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
6533 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
6534 else
6535 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
6536 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
6537 vmx_set_efer(vcpu, vcpu->arch.efer);
6538
6539 /*
6540 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
6541 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
6542 * The CR0_READ_SHADOW is what L2 should have expected to read given
6543 * the specifications by L1; It's not enough to take
6544 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
6545 * have more bits than L1 expected.
6546 */
6547 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
6548 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
6549
6550 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
6551 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
6552
 6553	/* Load the guest cr3: this works for both the EPT and the shadow-paging case */
6554 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
6555 kvm_mmu_reset_context(vcpu);
6556
6557 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
6558 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
6559}
6560
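
Editor's note on the control merging above: prepare_vmcs02() starts from L0's own CPU-based execution controls, strips the bits it cannot honour while running L2, OR's in what vmcs12 requested, and finally forces unconditional I/O exiting because bitmap merging is unsupported. A minimal userspace sketch of that sequence (illustration only, not kernel code; the bit values below are stand-ins, not the architectural encodings) might look like this:

#include <stdint.h>
#include <stdio.h>

#define CTRL_USE_IO_BITMAPS   (1u << 0)  /* stand-in for CPU_BASED_USE_IO_BITMAPS */
#define CTRL_USE_MSR_BITMAPS  (1u << 1)  /* stand-in for CPU_BASED_USE_MSR_BITMAPS */
#define CTRL_UNCOND_IO_EXIT   (1u << 2)  /* stand-in for CPU_BASED_UNCOND_IO_EXITING */
#define CTRL_TPR_SHADOW       (1u << 3)  /* stand-in for CPU_BASED_TPR_SHADOW */

static uint32_t merge_exec_control(uint32_t l0_wants, uint32_t l1_wants)
{
	uint32_t exec = l0_wants;

	/* Features L0 cannot honour while running L2 are dropped up front. */
	exec &= ~CTRL_TPR_SHADOW;
	/* Everything L1 asked for is OR'ed in on top of L0's needs. */
	exec |= l1_wants;
	/* Bitmap merging is unsupported, so force an exit on every access. */
	exec &= ~(CTRL_USE_IO_BITMAPS | CTRL_USE_MSR_BITMAPS);
	exec |= CTRL_UNCOND_IO_EXIT;
	return exec;
}

int main(void)
{
	uint32_t merged = merge_exec_control(CTRL_USE_MSR_BITMAPS | CTRL_TPR_SHADOW,
					     CTRL_USE_IO_BITMAPS);
	printf("merged controls: %#x\n", merged);
	return 0;
}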
6561/*
6562 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
6563 * for running an L2 nested guest.
6564 */
6565static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
6566{
6567 struct vmcs12 *vmcs12;
6568 struct vcpu_vmx *vmx = to_vmx(vcpu);
6569 int cpu;
6570 struct loaded_vmcs *vmcs02;
6571
6572 if (!nested_vmx_check_permission(vcpu) ||
6573 !nested_vmx_check_vmcs12(vcpu))
6574 return 1;
6575
6576 skip_emulated_instruction(vcpu);
6577 vmcs12 = get_vmcs12(vcpu);
6578
6579 /*
6580 * The nested entry process starts with enforcing various prerequisites
 6581	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
6582 * they fail: As the SDM explains, some conditions should cause the
6583 * instruction to fail, while others will cause the instruction to seem
6584 * to succeed, but return an EXIT_REASON_INVALID_STATE.
6585 * To speed up the normal (success) code path, we should avoid checking
6586 * for misconfigurations which will anyway be caught by the processor
6587 * when using the merged vmcs02.
6588 */
6589 if (vmcs12->launch_state == launch) {
6590 nested_vmx_failValid(vcpu,
6591 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
6592 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
6593 return 1;
6594 }
6595
6596 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
6597 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
6598 /*TODO: Also verify bits beyond physical address width are 0*/
6599 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6600 return 1;
6601 }
6602
6603 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
6604 !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
6605 /*TODO: Also verify bits beyond physical address width are 0*/
6606 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6607 return 1;
6608 }
6609
6610 if (vmcs12->vm_entry_msr_load_count > 0 ||
6611 vmcs12->vm_exit_msr_load_count > 0 ||
6612 vmcs12->vm_exit_msr_store_count > 0) {
6613 if (printk_ratelimit())
6614 printk(KERN_WARNING
6615 "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
6616 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6617 return 1;
6618 }
6619
6620 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
6621 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
6622 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
6623 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
6624 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
6625 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
6626 !vmx_control_verify(vmcs12->vm_exit_controls,
6627 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
6628 !vmx_control_verify(vmcs12->vm_entry_controls,
6629 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
6630 {
6631 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6632 return 1;
6633 }
6634
6635 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6636 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6637 nested_vmx_failValid(vcpu,
6638 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
6639 return 1;
6640 }
6641
6642 if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6643 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6644 nested_vmx_entry_failure(vcpu, vmcs12,
6645 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
6646 return 1;
6647 }
6648 if (vmcs12->vmcs_link_pointer != -1ull) {
6649 nested_vmx_entry_failure(vcpu, vmcs12,
6650 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
6651 return 1;
6652 }
6653
6654 /*
6655 * We're finally done with prerequisite checking, and can start with
6656 * the nested entry.
6657 */
6658
6659 vmcs02 = nested_get_current_vmcs02(vmx);
6660 if (!vmcs02)
6661 return -ENOMEM;
6662
6663 enter_guest_mode(vcpu);
6664
6665 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
6666
6667 cpu = get_cpu();
6668 vmx->loaded_vmcs = vmcs02;
6669 vmx_vcpu_put(vcpu);
6670 vmx_vcpu_load(vcpu, cpu);
6671 vcpu->cpu = cpu;
6672 put_cpu();
6673
6674 vmcs12->launch_state = 1;
6675
6676 prepare_vmcs02(vcpu, vmcs12);
6677
6678 /*
6679 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
6680 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
6681 * returned as far as L1 is concerned. It will only return (and set
6682 * the success flag) when L2 exits (see nested_vmx_vmexit()).
6683 */
6684 return 1;
6685}
6686
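
Editor's note: nested_vmx_run() leans on vmx_control_verify() for the low/high checks; that helper is defined earlier in the patch and is not shown here. Assuming the conventional VMX capability-MSR semantics - a "low" word of bits that must be 1 and a "high" word of bits that are allowed to be 1 - a standalone re-statement of the check could look like this sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical restatement: 'low' holds the bits that must be 1 in the
 * control field, 'high' holds the bits that are allowed to be 1. */
static bool control_verify(uint32_t control, uint32_t low, uint32_t high)
{
	return ((control & low) == low) && ((control & ~high) == 0);
}

int main(void)
{
	uint32_t low = 0x0e, high = 0x1f;

	printf("%d\n", control_verify(0x1e, low, high)); /* 1: within bounds */
	printf("%d\n", control_verify(0x16, low, high)); /* 0: a required bit is clear */
	printf("%d\n", control_verify(0x3e, low, high)); /* 0: a disallowed bit is set */
	return 0;
}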
6687/*
6688 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 6689 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
6690 * This function returns the new value we should put in vmcs12.guest_cr0.
6691 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 6692 * 1. Bits that neither L0 nor L1 trapped were set directly by L2 and are now
6693 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
6694 * didn't trap the bit, because if L1 did, so would L0).
6695 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
6696 * been modified by L2, and L1 knows it. So just leave the old value of
6697 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
6698 * isn't relevant, because if L0 traps this bit it can set it to anything.
6699 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
6700 * changed these bits, and therefore they need to be updated, but L0
6701 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
6702 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
6703 */
6704static inline unsigned long
6705vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6706{
6707 return
6708 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
6709 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
6710 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
6711 vcpu->arch.cr0_guest_owned_bits));
6712}
6713
6714static inline unsigned long
6715vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6716{
6717 return
6718 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
6719 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
6720 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
6721 vcpu->arch.cr4_guest_owned_bits));
6722}
6723
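
Editor's note: the three-way merge described in the comment above vmcs12_guest_cr0() is plain bit arithmetic and is easy to check in isolation. The sketch below mirrors the expression with made-up mask values (CR0.TS guest-owned, CR0.NE trapped by L1); it is an illustration, not the kernel function:

#include <stdint.h>
#include <stdio.h>

/* guest_owned: bits neither L0 nor L1 traps (vcpu->arch.cr0_guest_owned_bits);
 * l1_mask: bits L1 asked to trap (vmcs12->cr0_guest_host_mask). */
static uint64_t merge_guest_cr0(uint64_t vmcs02_guest_cr0,
				uint64_t vmcs02_read_shadow,
				uint64_t vmcs12_guest_cr0,
				uint64_t guest_owned,
				uint64_t l1_mask)
{
	return (vmcs02_guest_cr0 & guest_owned) |                  /* 1: neither trapped */
	       (vmcs12_guest_cr0 & l1_mask) |                      /* 2: L1 trapped       */
	       (vmcs02_read_shadow & ~(l1_mask | guest_owned));    /* 3: only L0 trapped  */
}

int main(void)
{
	uint64_t cr0 = merge_guest_cr0(0x8005003b, 0x8005002b, 0x80050033,
				       0x00000008 /* e.g. CR0.TS guest-owned */,
				       0x00000020 /* e.g. L1 traps CR0.NE    */);
	printf("reconstructed guest_cr0 = %#llx\n", (unsigned long long)cr0);
	return 0;
}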
6724/*
6725 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
6726 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
6727 * and this function updates it to reflect the changes to the guest state while
6728 * L2 was running (and perhaps made some exits which were handled directly by L0
6729 * without going back to L1), and to reflect the exit reason.
 6730 * Note that we do not have to copy all VMCS fields here, just those that
6731 * could have changed by the L2 guest or the exit - i.e., the guest-state and
6732 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
6733 * which already writes to vmcs12 directly.
6734 */
6735void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6736{
6737 /* update guest state fields: */
6738 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
6739 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
6740
6741 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
6742 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6743 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
6744 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
6745
6746 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
6747 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
6748 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
6749 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
6750 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
6751 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
6752 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
6753 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
6754 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
6755 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
6756 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
6757 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
6758 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
6759 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
6760 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
6761 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
6762 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
6763 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
6764 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
6765 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
6766 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
6767 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
6768 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
6769 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
6770 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
6771 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
6772 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
6773 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
6774 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
6775 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
6776 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
6777 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
6778 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
6779 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
6780 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
6781 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
6782
6783 vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
6784 vmcs12->guest_interruptibility_info =
6785 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
6786 vmcs12->guest_pending_dbg_exceptions =
6787 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
6788
6789 /* TODO: These cannot have changed unless we have MSR bitmaps and
6790 * the relevant bit asks not to trap the change */
6791 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 6792	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
6793 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
6794 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
6795 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
6796 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
6797
6798 /* update exit information fields: */
6799
6800 vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
6801 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6802
6803 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6804 vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6805 vmcs12->idt_vectoring_info_field =
6806 vmcs_read32(IDT_VECTORING_INFO_FIELD);
6807 vmcs12->idt_vectoring_error_code =
6808 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6809 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6810 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6811
6812 /* clear vm-entry fields which are to be cleared on exit */
6813 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
6814 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
6815}
6816
6817/*
 6818 * A part of what we need to do when the nested L2 guest exits and we want to
 6819 * run its L1 parent is to reset L1's guest state to the host state specified
6820 * in vmcs12.
6821 * This function is to be called not only on normal nested exit, but also on
6822 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
6823 * Failures During or After Loading Guest State").
6824 * This function should be called when the active VMCS is L1's (vmcs01).
6825 */
6826void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6827{
6828 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
6829 vcpu->arch.efer = vmcs12->host_ia32_efer;
6830 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
6831 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
6832 else
6833 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
6834 vmx_set_efer(vcpu, vcpu->arch.efer);
6835
6836 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
6837 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
6838 /*
6839 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
6840 * actually changed, because it depends on the current state of
6841 * fpu_active (which may have changed).
6842 * Note that vmx_set_cr0 refers to efer set above.
6843 */
6844 kvm_set_cr0(vcpu, vmcs12->host_cr0);
6845 /*
6846 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
6847 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
6848 * but we also need to update cr0_guest_host_mask and exception_bitmap.
6849 */
6850 update_exception_bitmap(vcpu);
6851 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
6852 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
6853
6854 /*
6855 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
 6856	 * (KVM doesn't change it) - no reason to call set_cr4_guest_host_mask();
6857 */
6858 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
6859 kvm_set_cr4(vcpu, vmcs12->host_cr4);
6860
 6861	/* Load the L1 host cr3: this works for both the EPT and the shadow-paging case */
6862 kvm_set_cr3(vcpu, vmcs12->host_cr3);
6863 kvm_mmu_reset_context(vcpu);
6864
6865 if (enable_vpid) {
6866 /*
6867 * Trivially support vpid by letting L2s share their parent
6868 * L1's vpid. TODO: move to a more elaborate solution, giving
6869 * each L2 its own vpid and exposing the vpid feature to L1.
6870 */
6871 vmx_flush_tlb(vcpu);
6872 }
6873
6874
6875 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
6876 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
6877 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
6878 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
6879 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
6880 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
6881 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
6882 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
6883 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
6884 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
6885 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
6886 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
6887 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
6888 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
6889 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
6890
6891 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
6892 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
6893 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6894 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
6895 vmcs12->host_ia32_perf_global_ctrl);
6896}
6897
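
Editor's note: the first few lines of load_vmcs12_host_state() decide which EFER L1 resumes with - optionally take the EFER from vmcs12's host area, then force LMA/LME on or off according to the "host address-space size" exit control. A compact model of that decision is sketched below; the exit-control flag values are stand-ins, while the EFER bit positions are the architectural ones:

#include <stdint.h>
#include <stdio.h>

#define EXIT_LOAD_IA32_EFER        (1u << 0)   /* stand-in flag value */
#define EXIT_HOST_ADDR_SPACE_SIZE  (1u << 1)   /* stand-in flag value */
#define EFER_LME  (1ull << 8)
#define EFER_LMA  (1ull << 10)

static uint64_t host_efer_on_exit(uint32_t vm_exit_controls,
				  uint64_t current_efer,
				  uint64_t vmcs12_host_efer)
{
	uint64_t efer = current_efer;

	if (vm_exit_controls & EXIT_LOAD_IA32_EFER)
		efer = vmcs12_host_efer;
	if (vm_exit_controls & EXIT_HOST_ADDR_SPACE_SIZE)
		efer |= EFER_LMA | EFER_LME;        /* return to a 64-bit L1 */
	else
		efer &= ~(EFER_LMA | EFER_LME);     /* return to a 32-bit L1 */
	return efer;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)
	       host_efer_on_exit(EXIT_HOST_ADDR_SPACE_SIZE, 0, 0));
	return 0;
}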
6898/*
6899 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
6900 * and modify vmcs12 to make it see what it would expect to see there if
6901 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
6902 */
6903static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
6904{
6905 struct vcpu_vmx *vmx = to_vmx(vcpu);
6906 int cpu;
6907 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6908
6909 leave_guest_mode(vcpu);
6910 prepare_vmcs12(vcpu, vmcs12);
6911
6912 cpu = get_cpu();
6913 vmx->loaded_vmcs = &vmx->vmcs01;
6914 vmx_vcpu_put(vcpu);
6915 vmx_vcpu_load(vcpu, cpu);
6916 vcpu->cpu = cpu;
6917 put_cpu();
6918
6919 /* if no vmcs02 cache requested, remove the one we used */
6920 if (VMCS02_POOL_SIZE == 0)
6921 nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
6922
6923 load_vmcs12_host_state(vcpu, vmcs12);
6924
6925 /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
6926 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
6927
 6928	/* This is needed for the same reason it was needed in prepare_vmcs02 */
6929 vmx->host_rsp = 0;
6930
6931 /* Unpin physical memory we referred to in vmcs02 */
6932 if (vmx->nested.apic_access_page) {
6933 nested_release_page(vmx->nested.apic_access_page);
 6934		vmx->nested.apic_access_page = NULL;
6935 }
6936
6937 /*
6938 * Exiting from L2 to L1, we're now back to L1 which thinks it just
6939 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
6940 * success or failure flag accordingly.
6941 */
6942 if (unlikely(vmx->fail)) {
6943 vmx->fail = 0;
6944 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
6945 } else
6946 nested_vmx_succeed(vcpu);
6947}
6948
6949/*
6950 * L1's failure to enter L2 is a subset of a normal exit, as explained in
6951 * 23.7 "VM-entry failures during or after loading guest state" (this also
6952 * lists the acceptable exit-reason and exit-qualification parameters).
 6953 * It should only be called before L2 has actually run, and when
 6954 * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
6955 */
6956static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
6957 struct vmcs12 *vmcs12,
6958 u32 reason, unsigned long qualification)
6959{
6960 load_vmcs12_host_state(vcpu, vmcs12);
6961 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
6962 vmcs12->exit_qualification = qualification;
6963 nested_vmx_succeed(vcpu);
4515} 6964}
4516 6965
4517static int vmx_check_intercept(struct kvm_vcpu *vcpu, 6966static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void)
4670 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7119 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
4671 7120
4672 if (enable_ept) { 7121 if (enable_ept) {
4673 bypass_guest_pf = 0;
4674 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7122 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4675 VMX_EPT_EXECUTABLE_MASK); 7123 VMX_EPT_EXECUTABLE_MASK);
7124 ept_set_mmio_spte_mask();
4676 kvm_enable_tdp(); 7125 kvm_enable_tdp();
4677 } else 7126 } else
4678 kvm_disable_tdp(); 7127 kvm_disable_tdp();
4679 7128
4680 if (bypass_guest_pf)
4681 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4682
4683 return 0; 7129 return 0;
4684 7130
4685out3: 7131out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77c9d8673dc..73c6a4268bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -44,6 +44,7 @@
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h> 46#include <linux/hash.h>
47#include <linux/pci.h>
47#include <trace/events/kvm.h> 48#include <trace/events/kvm.h>
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
@@ -347,6 +348,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347 vcpu->arch.cr2 = fault->address; 348 vcpu->arch.cr2 = fault->address;
348 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 349 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
349} 350}
351EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
350 352
351void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 353void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
352{ 354{
@@ -579,6 +581,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
579 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 581 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
580} 582}
581 583
584static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
585{
586 struct kvm_cpuid_entry2 *best;
587
588 best = kvm_find_cpuid_entry(vcpu, 7, 0);
589 return best && (best->ebx & bit(X86_FEATURE_SMEP));
590}
591
592static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
593{
594 struct kvm_cpuid_entry2 *best;
595
596 best = kvm_find_cpuid_entry(vcpu, 7, 0);
597 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
598}
599
582static void update_cpuid(struct kvm_vcpu *vcpu) 600static void update_cpuid(struct kvm_vcpu *vcpu)
583{ 601{
584 struct kvm_cpuid_entry2 *best; 602 struct kvm_cpuid_entry2 *best;
@@ -598,14 +616,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
598int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 616int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
599{ 617{
600 unsigned long old_cr4 = kvm_read_cr4(vcpu); 618 unsigned long old_cr4 = kvm_read_cr4(vcpu);
601 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 619 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
602 620 X86_CR4_PAE | X86_CR4_SMEP;
603 if (cr4 & CR4_RESERVED_BITS) 621 if (cr4 & CR4_RESERVED_BITS)
604 return 1; 622 return 1;
605 623
606 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 624 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
607 return 1; 625 return 1;
608 626
627 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
628 return 1;
629
630 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
631 return 1;
632
609 if (is_long_mode(vcpu)) { 633 if (is_long_mode(vcpu)) {
610 if (!(cr4 & X86_CR4_PAE)) 634 if (!(cr4 & X86_CR4_PAE))
611 return 1; 635 return 1;
@@ -615,11 +639,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
615 kvm_read_cr3(vcpu))) 639 kvm_read_cr3(vcpu)))
616 return 1; 640 return 1;
617 641
618 if (cr4 & X86_CR4_VMXE) 642 if (kvm_x86_ops->set_cr4(vcpu, cr4))
619 return 1; 643 return 1;
620 644
621 kvm_x86_ops->set_cr4(vcpu, cr4);
622
623 if ((cr4 ^ old_cr4) & pdptr_bits) 645 if ((cr4 ^ old_cr4) & pdptr_bits)
624 kvm_mmu_reset_context(vcpu); 646 kvm_mmu_reset_context(vcpu);
625 647
@@ -787,12 +809,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
787 * kvm-specific. Those are put in the beginning of the list. 809 * kvm-specific. Those are put in the beginning of the list.
788 */ 810 */
789 811
790#define KVM_SAVE_MSRS_BEGIN 8 812#define KVM_SAVE_MSRS_BEGIN 9
791static u32 msrs_to_save[] = { 813static u32 msrs_to_save[] = {
792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 814 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 815 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 816 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, 817 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 818 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
797 MSR_STAR, 819 MSR_STAR,
798#ifdef CONFIG_X86_64 820#ifdef CONFIG_X86_64
@@ -1388,7 +1410,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1388 return 1; 1410 return 1;
1389 kvm_x86_ops->patch_hypercall(vcpu, instructions); 1411 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1390 ((unsigned char *)instructions)[3] = 0xc3; /* ret */ 1412 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1391 if (copy_to_user((void __user *)addr, instructions, 4)) 1413 if (__copy_to_user((void __user *)addr, instructions, 4))
1392 return 1; 1414 return 1;
1393 kvm->arch.hv_hypercall = data; 1415 kvm->arch.hv_hypercall = data;
1394 break; 1416 break;
@@ -1415,7 +1437,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1415 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); 1437 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1416 if (kvm_is_error_hva(addr)) 1438 if (kvm_is_error_hva(addr))
1417 return 1; 1439 return 1;
1418 if (clear_user((void __user *)addr, PAGE_SIZE)) 1440 if (__clear_user((void __user *)addr, PAGE_SIZE))
1419 return 1; 1441 return 1;
1420 vcpu->arch.hv_vapic = data; 1442 vcpu->arch.hv_vapic = data;
1421 break; 1443 break;
@@ -1467,6 +1489,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
1467 } 1489 }
1468} 1490}
1469 1491
1492static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1493{
1494 u64 delta;
1495
1496 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1497 return;
1498
1499 delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1500 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1501 vcpu->arch.st.accum_steal = delta;
1502}
1503
1504static void record_steal_time(struct kvm_vcpu *vcpu)
1505{
1506 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1507 return;
1508
1509 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1510 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1511 return;
1512
1513 vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1514 vcpu->arch.st.steal.version += 2;
1515 vcpu->arch.st.accum_steal = 0;
1516
1517 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1518 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1519}
1520
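
Editor's note: record_steal_time() folds the accumulated run_delay delta into the shared kvm_steal_time area and bumps the version field by 2, presumably so a guest-side reader can treat it like a sequence counter (an even value meaning the record is stable). The toy below models just that update path in userspace; the structure layout and the seqlock-style interpretation are assumptions made for illustration:

#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for the record shared with the guest. */
struct steal_time {
	uint64_t steal;    /* nanoseconds of run_delay accumulated so far */
	uint32_t version;  /* kept even when the record is stable */
};

/* Host side: fold the accumulated delta into the shared record and bump
 * the version by 2, keeping it even for a seqlock-style reader. */
static void publish_steal(struct steal_time *st, uint64_t accum_steal)
{
	st->steal += accum_steal;
	st->version += 2;
}

int main(void)
{
	struct steal_time st = { 0, 0 };

	publish_steal(&st, 1500000);   /* 1.5 ms stolen while preempted */
	publish_steal(&st, 250000);
	printf("steal=%lluns version=%u\n",
	       (unsigned long long)st.steal, st.version);
	return 0;
}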
1470int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1521int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1471{ 1522{
1472 switch (msr) { 1523 switch (msr) {
@@ -1549,6 +1600,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1549 if (kvm_pv_enable_async_pf(vcpu, data)) 1600 if (kvm_pv_enable_async_pf(vcpu, data))
1550 return 1; 1601 return 1;
1551 break; 1602 break;
1603 case MSR_KVM_STEAL_TIME:
1604
1605 if (unlikely(!sched_info_on()))
1606 return 1;
1607
1608 if (data & KVM_STEAL_RESERVED_MASK)
1609 return 1;
1610
1611 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
1612 data & KVM_STEAL_VALID_BITS))
1613 return 1;
1614
1615 vcpu->arch.st.msr_val = data;
1616
1617 if (!(data & KVM_MSR_ENABLED))
1618 break;
1619
1620 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1621
1622 preempt_disable();
1623 accumulate_steal_time(vcpu);
1624 preempt_enable();
1625
1626 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1627
1628 break;
1629
1552 case MSR_IA32_MCG_CTL: 1630 case MSR_IA32_MCG_CTL:
1553 case MSR_IA32_MCG_STATUS: 1631 case MSR_IA32_MCG_STATUS:
1554 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1632 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1834,6 +1912,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1834 case MSR_KVM_ASYNC_PF_EN: 1912 case MSR_KVM_ASYNC_PF_EN:
1835 data = vcpu->arch.apf.msr_val; 1913 data = vcpu->arch.apf.msr_val;
1836 break; 1914 break;
1915 case MSR_KVM_STEAL_TIME:
1916 data = vcpu->arch.st.msr_val;
1917 break;
1837 case MSR_IA32_P5_MC_ADDR: 1918 case MSR_IA32_P5_MC_ADDR:
1838 case MSR_IA32_P5_MC_TYPE: 1919 case MSR_IA32_P5_MC_TYPE:
1839 case MSR_IA32_MCG_CAP: 1920 case MSR_IA32_MCG_CAP:
@@ -2015,7 +2096,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2015 r = 0; 2096 r = 0;
2016 break; 2097 break;
2017 case KVM_CAP_IOMMU: 2098 case KVM_CAP_IOMMU:
2018 r = iommu_found(); 2099 r = iommu_present(&pci_bus_type);
2019 break; 2100 break;
2020 case KVM_CAP_MCE: 2101 case KVM_CAP_MCE:
2021 r = KVM_MAX_MCE_BANKS; 2102 r = KVM_MAX_MCE_BANKS;
@@ -2145,6 +2226,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2145 kvm_migrate_timers(vcpu); 2226 kvm_migrate_timers(vcpu);
2146 vcpu->cpu = cpu; 2227 vcpu->cpu = cpu;
2147 } 2228 }
2229
2230 accumulate_steal_time(vcpu);
2231 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2148} 2232}
2149 2233
2150void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2234void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2283,6 +2367,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2283 entry->flags = 0; 2367 entry->flags = 0;
2284} 2368}
2285 2369
2370static bool supported_xcr0_bit(unsigned bit)
2371{
2372 u64 mask = ((u64)1 << bit);
2373
2374 return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
2375}
2376
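
Editor's note: supported_xcr0_bit() only lets through state components that KVM understands (x87, SSE, YMM) and that the host itself enabled in XCR0. The small standalone program below reproduces the filter with a hypothetical host_xcr0 value so the effect on CPUID.0xD subleaf enumeration can be seen directly:

#include <stdint.h>
#include <stdio.h>

/* XCR0 state-component bits (architectural: x87=0, SSE=1, AVX/YMM=2). */
#define XSTATE_FP   (1ull << 0)
#define XSTATE_SSE  (1ull << 1)
#define XSTATE_YMM  (1ull << 2)

/* Hypothetical host_xcr0: pretend the host enabled x87 and SSE only. */
static const uint64_t host_xcr0 = XSTATE_FP | XSTATE_SSE;

/* Same filter as supported_xcr0_bit(): a subleaf is exposed only if KVM
 * knows the state component and the host actually enabled it. */
static int supported_xcr0_bit(unsigned bit)
{
	uint64_t mask = (uint64_t)1 << bit;
	return (mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0) != 0;
}

int main(void)
{
	for (unsigned bit = 0; bit < 5; bit++)
		printf("xcr0 bit %u: %s\n", bit,
		       supported_xcr0_bit(bit) ? "exposed" : "filtered out");
	return 0;
}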
2286#define F(x) bit(X86_FEATURE_##x) 2377#define F(x) bit(X86_FEATURE_##x)
2287 2378
2288static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2379static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
@@ -2328,7 +2419,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2328 0 /* Reserved, DCA */ | F(XMM4_1) | 2419 0 /* Reserved, DCA */ | F(XMM4_1) |
2329 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 2420 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
2330 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | 2421 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2331 F(F16C); 2422 F(F16C) | F(RDRAND);
2332 /* cpuid 0x80000001.ecx */ 2423 /* cpuid 0x80000001.ecx */
2333 const u32 kvm_supported_word6_x86_features = 2424 const u32 kvm_supported_word6_x86_features =
2334 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | 2425 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
@@ -2342,6 +2433,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2342 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | 2433 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2343 F(PMM) | F(PMM_EN); 2434 F(PMM) | F(PMM_EN);
2344 2435
2436 /* cpuid 7.0.ebx */
2437 const u32 kvm_supported_word9_x86_features =
2438 F(SMEP) | F(FSGSBASE) | F(ERMS);
2439
2345 /* all calls to cpuid_count() should be made on the same cpu */ 2440 /* all calls to cpuid_count() should be made on the same cpu */
2346 get_cpu(); 2441 get_cpu();
2347 do_cpuid_1_ent(entry, function, index); 2442 do_cpuid_1_ent(entry, function, index);
@@ -2376,7 +2471,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2376 } 2471 }
2377 break; 2472 break;
2378 } 2473 }
2379 /* function 4 and 0xb have additional index. */ 2474 /* function 4 has additional index. */
2380 case 4: { 2475 case 4: {
2381 int i, cache_type; 2476 int i, cache_type;
2382 2477
@@ -2393,6 +2488,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2393 } 2488 }
2394 break; 2489 break;
2395 } 2490 }
2491 case 7: {
2492 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 2493		/* Mask ebx against host capability word 9 */
2494 if (index == 0) {
2495 entry->ebx &= kvm_supported_word9_x86_features;
2496 cpuid_mask(&entry->ebx, 9);
2497 } else
2498 entry->ebx = 0;
2499 entry->eax = 0;
2500 entry->ecx = 0;
2501 entry->edx = 0;
2502 break;
2503 }
2504 case 9:
2505 break;
2506 /* function 0xb has additional index. */
2396 case 0xb: { 2507 case 0xb: {
2397 int i, level_type; 2508 int i, level_type;
2398 2509
@@ -2410,16 +2521,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2410 break; 2521 break;
2411 } 2522 }
2412 case 0xd: { 2523 case 0xd: {
2413 int i; 2524 int idx, i;
2414 2525
2415 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2526 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2416 for (i = 1; *nent < maxnent && i < 64; ++i) { 2527 for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
2417 if (entry[i].eax == 0) 2528 do_cpuid_1_ent(&entry[i], function, idx);
2529 if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
2418 continue; 2530 continue;
2419 do_cpuid_1_ent(&entry[i], function, i);
2420 entry[i].flags |= 2531 entry[i].flags |=
2421 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2532 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2422 ++*nent; 2533 ++*nent;
2534 ++i;
2423 } 2535 }
2424 break; 2536 break;
2425 } 2537 }
@@ -2438,6 +2550,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2438 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2550 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2439 (1 << KVM_FEATURE_ASYNC_PF) | 2551 (1 << KVM_FEATURE_ASYNC_PF) |
2440 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2552 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2553
2554 if (sched_info_on())
2555 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
2556
2441 entry->ebx = 0; 2557 entry->ebx = 0;
2442 entry->ecx = 0; 2558 entry->ecx = 0;
2443 entry->edx = 0; 2559 entry->edx = 0;
@@ -2451,6 +2567,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2451 entry->ecx &= kvm_supported_word6_x86_features; 2567 entry->ecx &= kvm_supported_word6_x86_features;
2452 cpuid_mask(&entry->ecx, 6); 2568 cpuid_mask(&entry->ecx, 6);
2453 break; 2569 break;
2570 case 0x80000008: {
2571 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
2572 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
2573 unsigned phys_as = entry->eax & 0xff;
2574
2575 if (!g_phys_as)
2576 g_phys_as = phys_as;
2577 entry->eax = g_phys_as | (virt_as << 8);
2578 entry->ebx = entry->edx = 0;
2579 break;
2580 }
2581 case 0x80000019:
2582 entry->ecx = entry->edx = 0;
2583 break;
2584 case 0x8000001a:
2585 break;
2586 case 0x8000001d:
2587 break;
2454 /*Add support for Centaur's CPUID instruction*/ 2588 /*Add support for Centaur's CPUID instruction*/
2455 case 0xC0000000: 2589 case 0xC0000000:
2456 /*Just support up to 0xC0000004 now*/ 2590 /*Just support up to 0xC0000004 now*/
@@ -2460,10 +2594,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2460 entry->edx &= kvm_supported_word5_x86_features; 2594 entry->edx &= kvm_supported_word5_x86_features;
2461 cpuid_mask(&entry->edx, 5); 2595 cpuid_mask(&entry->edx, 5);
2462 break; 2596 break;
2597 case 3: /* Processor serial number */
2598 case 5: /* MONITOR/MWAIT */
2599 case 6: /* Thermal management */
2600 case 0xA: /* Architectural Performance Monitoring */
2601 case 0x80000007: /* Advanced power management */
2463 case 0xC0000002: 2602 case 0xC0000002:
2464 case 0xC0000003: 2603 case 0xC0000003:
2465 case 0xC0000004: 2604 case 0xC0000004:
2466 /*Now nothing to do, reserved for the future*/ 2605 default:
2606 entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
2467 break; 2607 break;
2468 } 2608 }
2469 2609
@@ -3817,7 +3957,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3817 exception); 3957 exception);
3818} 3958}
3819 3959
3820static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, 3960int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3821 gva_t addr, void *val, unsigned int bytes, 3961 gva_t addr, void *val, unsigned int bytes,
3822 struct x86_exception *exception) 3962 struct x86_exception *exception)
3823{ 3963{
@@ -3827,6 +3967,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3827 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3967 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3828 exception); 3968 exception);
3829} 3969}
3970EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
3830 3971
3831static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3972static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3832 gva_t addr, void *val, unsigned int bytes, 3973 gva_t addr, void *val, unsigned int bytes,
@@ -3836,7 +3977,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3836 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 3977 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3837} 3978}
3838 3979
3839static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3980int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3840 gva_t addr, void *val, 3981 gva_t addr, void *val,
3841 unsigned int bytes, 3982 unsigned int bytes,
3842 struct x86_exception *exception) 3983 struct x86_exception *exception)
@@ -3868,6 +4009,42 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3868out: 4009out:
3869 return r; 4010 return r;
3870} 4011}
4012EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4013
4014static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4015 gpa_t *gpa, struct x86_exception *exception,
4016 bool write)
4017{
4018 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4019
4020 if (vcpu_match_mmio_gva(vcpu, gva) &&
4021 check_write_user_access(vcpu, write, access,
4022 vcpu->arch.access)) {
4023 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4024 (gva & (PAGE_SIZE - 1));
4025 trace_vcpu_match_mmio(gva, *gpa, write, false);
4026 return 1;
4027 }
4028
4029 if (write)
4030 access |= PFERR_WRITE_MASK;
4031
4032 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4033
4034 if (*gpa == UNMAPPED_GVA)
4035 return -1;
4036
4037 /* For APIC access vmexit */
4038 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4039 return 1;
4040
4041 if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4042 trace_vcpu_match_mmio(gva, *gpa, write, true);
4043 return 1;
4044 }
4045
4046 return 0;
4047}
3871 4048
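
Editor's note: vcpu_mmio_gva_to_gpa() establishes a three-way return convention - negative for a translation fault, 0 for ordinary guest memory, positive for MMIO (including a cached match and the APIC page) - and the emulator read/write paths below switch on it. The toy dispatcher here only illustrates that convention; the translation itself is faked:

#include <stdio.h>

enum gva_result { GVA_FAULT = -1, GVA_RAM = 0, GVA_MMIO = 1 };

/* Fake translation: pretend address 0 is unmapped and one page is MMIO. */
static enum gva_result translate(unsigned long gva)
{
	if (gva == 0)
		return GVA_FAULT;
	if (gva >= 0xfee00000ul && gva < 0xfee01000ul)
		return GVA_MMIO;
	return GVA_RAM;
}

static const char *dispatch(unsigned long gva)
{
	switch (translate(gva)) {
	case GVA_FAULT: return "propagate #PF";
	case GVA_MMIO:  return "take the MMIO path";
	default:        return "access guest memory directly";
	}
}

int main(void)
{
	printf("%s\n", dispatch(0));
	printf("%s\n", dispatch(0xfee00030ul));
	printf("%s\n", dispatch(0x1000ul));
	return 0;
}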
3872static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 4049static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3873 unsigned long addr, 4050 unsigned long addr,
@@ -3876,8 +4053,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3876 struct x86_exception *exception) 4053 struct x86_exception *exception)
3877{ 4054{
3878 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4055 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3879 gpa_t gpa; 4056 gpa_t gpa;
3880 int handled; 4057 int handled, ret;
3881 4058
3882 if (vcpu->mmio_read_completed) { 4059 if (vcpu->mmio_read_completed) {
3883 memcpy(val, vcpu->mmio_data, bytes); 4060 memcpy(val, vcpu->mmio_data, bytes);
@@ -3887,13 +4064,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3887 return X86EMUL_CONTINUE; 4064 return X86EMUL_CONTINUE;
3888 } 4065 }
3889 4066
3890 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); 4067 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
3891 4068
3892 if (gpa == UNMAPPED_GVA) 4069 if (ret < 0)
3893 return X86EMUL_PROPAGATE_FAULT; 4070 return X86EMUL_PROPAGATE_FAULT;
3894 4071
3895 /* For APIC access vmexit */ 4072 if (ret)
3896 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3897 goto mmio; 4073 goto mmio;
3898 4074
3899 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) 4075 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -3944,16 +4120,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
3944 struct x86_exception *exception, 4120 struct x86_exception *exception,
3945 struct kvm_vcpu *vcpu) 4121 struct kvm_vcpu *vcpu)
3946{ 4122{
3947 gpa_t gpa; 4123 gpa_t gpa;
3948 int handled; 4124 int handled, ret;
3949 4125
3950 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); 4126 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
3951 4127
3952 if (gpa == UNMAPPED_GVA) 4128 if (ret < 0)
3953 return X86EMUL_PROPAGATE_FAULT; 4129 return X86EMUL_PROPAGATE_FAULT;
3954 4130
3955 /* For APIC access vmexit */ 4131 /* For APIC access vmexit */
3956 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 4132 if (ret)
3957 goto mmio; 4133 goto mmio;
3958 4134
3959 if (emulator_write_phys(vcpu, gpa, val, bytes)) 4135 if (emulator_write_phys(vcpu, gpa, val, bytes))
@@ -4473,9 +4649,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4473 kvm_queue_exception(vcpu, ctxt->exception.vector); 4649 kvm_queue_exception(vcpu, ctxt->exception.vector);
4474} 4650}
4475 4651
4652static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
4653 const unsigned long *regs)
4654{
4655 memset(&ctxt->twobyte, 0,
4656 (void *)&ctxt->regs - (void *)&ctxt->twobyte);
4657 memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
4658
4659 ctxt->fetch.start = 0;
4660 ctxt->fetch.end = 0;
4661 ctxt->io_read.pos = 0;
4662 ctxt->io_read.end = 0;
4663 ctxt->mem_read.pos = 0;
4664 ctxt->mem_read.end = 0;
4665}
4666
4476static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4667static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4477{ 4668{
4478 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4669 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4479 int cs_db, cs_l; 4670 int cs_db, cs_l;
4480 4671
4481 /* 4672 /*
@@ -4488,40 +4679,38 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4488 4679
4489 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4680 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4490 4681
4491 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 4682 ctxt->eflags = kvm_get_rflags(vcpu);
4492 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 4683 ctxt->eip = kvm_rip_read(vcpu);
4493 vcpu->arch.emulate_ctxt.mode = 4684 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4494 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4685 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
4495 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 4686 cs_l ? X86EMUL_MODE_PROT64 :
4496 ? X86EMUL_MODE_VM86 : cs_l 4687 cs_db ? X86EMUL_MODE_PROT32 :
4497 ? X86EMUL_MODE_PROT64 : cs_db 4688 X86EMUL_MODE_PROT16;
4498 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 4689 ctxt->guest_mode = is_guest_mode(vcpu);
4499 vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); 4690
4500 memset(c, 0, sizeof(struct decode_cache)); 4691 init_decode_cache(ctxt, vcpu->arch.regs);
4501 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4502 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4692 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4503} 4693}
4504 4694
4505int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) 4695int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4506{ 4696{
4507 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4697 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4508 int ret; 4698 int ret;
4509 4699
4510 init_emulate_ctxt(vcpu); 4700 init_emulate_ctxt(vcpu);
4511 4701
4512 vcpu->arch.emulate_ctxt.decode.op_bytes = 2; 4702 ctxt->op_bytes = 2;
4513 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; 4703 ctxt->ad_bytes = 2;
4514 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + 4704 ctxt->_eip = ctxt->eip + inc_eip;
4515 inc_eip; 4705 ret = emulate_int_real(ctxt, irq);
4516 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4517 4706
4518 if (ret != X86EMUL_CONTINUE) 4707 if (ret != X86EMUL_CONTINUE)
4519 return EMULATE_FAIL; 4708 return EMULATE_FAIL;
4520 4709
4521 vcpu->arch.emulate_ctxt.eip = c->eip; 4710 ctxt->eip = ctxt->_eip;
4522 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4711 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4523 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4712 kvm_rip_write(vcpu, ctxt->eip);
4524 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4713 kvm_set_rflags(vcpu, ctxt->eflags);
4525 4714
4526 if (irq == NMI_VECTOR) 4715 if (irq == NMI_VECTOR)
4527 vcpu->arch.nmi_pending = false; 4716 vcpu->arch.nmi_pending = false;
@@ -4582,21 +4771,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4582 int insn_len) 4771 int insn_len)
4583{ 4772{
4584 int r; 4773 int r;
4585 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4774 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4586 bool writeback = true; 4775 bool writeback = true;
4587 4776
4588 kvm_clear_exception_queue(vcpu); 4777 kvm_clear_exception_queue(vcpu);
4589 4778
4590 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4779 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4591 init_emulate_ctxt(vcpu); 4780 init_emulate_ctxt(vcpu);
4592 vcpu->arch.emulate_ctxt.interruptibility = 0; 4781 ctxt->interruptibility = 0;
4593 vcpu->arch.emulate_ctxt.have_exception = false; 4782 ctxt->have_exception = false;
4594 vcpu->arch.emulate_ctxt.perm_ok = false; 4783 ctxt->perm_ok = false;
4595 4784
4596 vcpu->arch.emulate_ctxt.only_vendor_specific_insn 4785 ctxt->only_vendor_specific_insn
4597 = emulation_type & EMULTYPE_TRAP_UD; 4786 = emulation_type & EMULTYPE_TRAP_UD;
4598 4787
4599 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4788 r = x86_decode_insn(ctxt, insn, insn_len);
4600 4789
4601 trace_kvm_emulate_insn_start(vcpu); 4790 trace_kvm_emulate_insn_start(vcpu);
4602 ++vcpu->stat.insn_emulation; 4791 ++vcpu->stat.insn_emulation;
@@ -4612,7 +4801,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4612 } 4801 }
4613 4802
4614 if (emulation_type & EMULTYPE_SKIP) { 4803 if (emulation_type & EMULTYPE_SKIP) {
4615 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 4804 kvm_rip_write(vcpu, ctxt->_eip);
4616 return EMULATE_DONE; 4805 return EMULATE_DONE;
4617 } 4806 }
4618 4807
@@ -4620,11 +4809,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4620 changes registers values during IO operation */ 4809 changes registers values during IO operation */
4621 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4810 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4622 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4811 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4623 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4812 memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
4624 } 4813 }
4625 4814
4626restart: 4815restart:
4627 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); 4816 r = x86_emulate_insn(ctxt);
4628 4817
4629 if (r == EMULATION_INTERCEPTED) 4818 if (r == EMULATION_INTERCEPTED)
4630 return EMULATE_DONE; 4819 return EMULATE_DONE;
@@ -4636,7 +4825,7 @@ restart:
4636 return handle_emulation_failure(vcpu); 4825 return handle_emulation_failure(vcpu);
4637 } 4826 }
4638 4827
4639 if (vcpu->arch.emulate_ctxt.have_exception) { 4828 if (ctxt->have_exception) {
4640 inject_emulated_exception(vcpu); 4829 inject_emulated_exception(vcpu);
4641 r = EMULATE_DONE; 4830 r = EMULATE_DONE;
4642 } else if (vcpu->arch.pio.count) { 4831 } else if (vcpu->arch.pio.count) {
@@ -4655,13 +4844,12 @@ restart:
4655 r = EMULATE_DONE; 4844 r = EMULATE_DONE;
4656 4845
4657 if (writeback) { 4846 if (writeback) {
4658 toggle_interruptibility(vcpu, 4847 toggle_interruptibility(vcpu, ctxt->interruptibility);
4659 vcpu->arch.emulate_ctxt.interruptibility); 4848 kvm_set_rflags(vcpu, ctxt->eflags);
4660 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4661 kvm_make_request(KVM_REQ_EVENT, vcpu); 4849 kvm_make_request(KVM_REQ_EVENT, vcpu);
4662 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4850 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4663 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4851 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4664 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4852 kvm_rip_write(vcpu, ctxt->eip);
4665 } else 4853 } else
4666 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 4854 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4667 4855
@@ -4878,6 +5066,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
4878} 5066}
4879EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 5067EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
4880 5068
5069static void kvm_set_mmio_spte_mask(void)
5070{
5071 u64 mask;
5072 int maxphyaddr = boot_cpu_data.x86_phys_bits;
5073
5074 /*
 5075 * Set the reserved bits and the present bit of a paging-structure
 5076 * entry to generate a page fault with PFER.RSV = 1.
5077 */
5078 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
5079 mask |= 1ull;
5080
5081#ifdef CONFIG_X86_64
5082 /*
 5083 * If the reserved bit is not supported, clear the present bit to
 5084 * disable the mmio page fault.
5085 */
5086 if (maxphyaddr == 52)
5087 mask &= ~1ull;
5088#endif
5089
5090 kvm_mmu_set_mmio_spte_mask(mask);
5091}
5092
4881int kvm_arch_init(void *opaque) 5093int kvm_arch_init(void *opaque)
4882{ 5094{
4883 int r; 5095 int r;
@@ -4904,10 +5116,10 @@ int kvm_arch_init(void *opaque)
4904 if (r) 5116 if (r)
4905 goto out; 5117 goto out;
4906 5118
5119 kvm_set_mmio_spte_mask();
4907 kvm_init_msr_list(); 5120 kvm_init_msr_list();
4908 5121
4909 kvm_x86_ops = ops; 5122 kvm_x86_ops = ops;
4910 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4911 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5123 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4912 PT_DIRTY_MASK, PT64_NX_MASK, 0); 5124 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4913 5125
@@ -5082,8 +5294,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5082 5294
5083 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5295 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5084 5296
5085 return emulator_write_emulated(&vcpu->arch.emulate_ctxt, 5297 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5086 rip, instruction, 3, NULL);
5087} 5298}
5088 5299
5089static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 5300static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5384,6 +5595,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5384 r = 1; 5595 r = 1;
5385 goto out; 5596 goto out;
5386 } 5597 }
5598 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5599 record_steal_time(vcpu);
5600
5387 } 5601 }
5388 5602
5389 r = kvm_mmu_reload(vcpu); 5603 r = kvm_mmu_reload(vcpu);
@@ -5671,8 +5885,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5671 * that usually, but some bad designed PV devices (vmware 5885 * that usually, but some bad designed PV devices (vmware
5672 * backdoor interface) need this to work 5886 * backdoor interface) need this to work
5673 */ 5887 */
5674 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5888 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5675 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5889 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5676 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5890 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5677 } 5891 }
5678 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5892 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5801,21 +6015,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5801int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 6015int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5802 bool has_error_code, u32 error_code) 6016 bool has_error_code, u32 error_code)
5803{ 6017{
5804 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 6018 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5805 int ret; 6019 int ret;
5806 6020
5807 init_emulate_ctxt(vcpu); 6021 init_emulate_ctxt(vcpu);
5808 6022
5809 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, 6023 ret = emulator_task_switch(ctxt, tss_selector, reason,
5810 tss_selector, reason, has_error_code, 6024 has_error_code, error_code);
5811 error_code);
5812 6025
5813 if (ret) 6026 if (ret)
5814 return EMULATE_FAIL; 6027 return EMULATE_FAIL;
5815 6028
5816 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 6029 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5817 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 6030 kvm_rip_write(vcpu, ctxt->eip);
5818 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 6031 kvm_set_rflags(vcpu, ctxt->eflags);
5819 kvm_make_request(KVM_REQ_EVENT, vcpu); 6032 kvm_make_request(KVM_REQ_EVENT, vcpu);
5820 return EMULATE_DONE; 6033 return EMULATE_DONE;
5821} 6034}
@@ -6093,12 +6306,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6093 if (r == 0) 6306 if (r == 0)
6094 r = kvm_mmu_setup(vcpu); 6307 r = kvm_mmu_setup(vcpu);
6095 vcpu_put(vcpu); 6308 vcpu_put(vcpu);
6096 if (r < 0)
6097 goto free_vcpu;
6098 6309
6099 return 0;
6100free_vcpu:
6101 kvm_x86_ops->vcpu_free(vcpu);
6102 return r; 6310 return r;
6103} 6311}
6104 6312
@@ -6126,6 +6334,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6126 6334
6127 kvm_make_request(KVM_REQ_EVENT, vcpu); 6335 kvm_make_request(KVM_REQ_EVENT, vcpu);
6128 vcpu->arch.apf.msr_val = 0; 6336 vcpu->arch.apf.msr_val = 0;
6337 vcpu->arch.st.msr_val = 0;
6129 6338
6130 kvmclock_reset(vcpu); 6339 kvmclock_reset(vcpu);
6131 6340
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e407ed3df81..d36fe237c66 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno)
75 return 1 << (bitno & 31); 75 return 1 << (bitno & 31);
76} 76}
77 77
78static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
79 gva_t gva, gfn_t gfn, unsigned access)
80{
81 vcpu->arch.mmio_gva = gva & PAGE_MASK;
82 vcpu->arch.access = access;
83 vcpu->arch.mmio_gfn = gfn;
84}
85
86/*
 87 * Clear the mmio cache info for the given gva;
 88 * in particular, if gva is ~0ul, we clear all mmio cache info.
89 */
90static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
91{
92 if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
93 return;
94
95 vcpu->arch.mmio_gva = 0;
96}
97
98static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
99{
100 if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
101 return true;
102
103 return false;
104}
105
106static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
107{
108 if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
109 return true;
110
111 return false;
112}
113
78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 114void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 115void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 116int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
81 117
82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); 118void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
83 119
120int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
121 gva_t addr, void *val, unsigned int bytes,
122 struct x86_exception *exception);
123
124int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
125 gva_t addr, void *val, unsigned int bytes,
126 struct x86_exception *exception);
127
84#endif 128#endif
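
The helpers added to x86.h above keep a one-entry cache of the last MMIO access (page-aligned gva, gfn and access bits) so the next fault on the same page can skip the translation. The following stand-alone sketch shows the same pattern with an illustrative struct instead of the kvm_vcpu fields.

/*
 * Stand-alone sketch of the per-vcpu MMIO info cache added in the x86.h
 * hunk above; the struct and PAGE_* constants here are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_MASK       (~((1ul << PAGE_SHIFT) - 1))

struct mmio_cache {
        unsigned long gva;      /* guest virtual page of the last MMIO access */
        unsigned long gfn;      /* guest frame number backing it */
        unsigned access;        /* access bits recorded for that access */
};

static void cache_mmio_info(struct mmio_cache *c, unsigned long gva,
                            unsigned long gfn, unsigned access)
{
        c->gva = gva & PAGE_MASK;
        c->access = access;
        c->gfn = gfn;
}

/* Clearing with gva == ~0ul drops the cache unconditionally. */
static void clear_mmio_info(struct mmio_cache *c, unsigned long gva)
{
        if (gva != ~0ul && c->gva != (gva & PAGE_MASK))
                return;
        c->gva = 0;
}

static bool match_mmio_gva(struct mmio_cache *c, unsigned long gva)
{
        return c->gva && c->gva == (gva & PAGE_MASK);
}

int main(void)
{
        struct mmio_cache c = { 0 };

        cache_mmio_info(&c, 0xfee00040ul, 0xfee00ul, 0x3);
        printf("same page: %d\n", match_mmio_gva(&c, 0xfee00ff0ul));    /* 1 */
        clear_mmio_info(&c, 0xfee00000ul);
        printf("after clear: %d\n", match_mmio_gva(&c, 0xfee00040ul));  /* 0 */
        return 0;
}
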
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index db832fd65ec..13ee258442a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -71,7 +71,8 @@
71#include <asm/stackprotector.h> 71#include <asm/stackprotector.h>
72#include <asm/reboot.h> /* for struct machine_ops */ 72#include <asm/reboot.h> /* for struct machine_ops */
73 73
74/*G:010 Welcome to the Guest! 74/*G:010
75 * Welcome to the Guest!
75 * 76 *
76 * The Guest in our tale is a simple creature: identical to the Host but 77 * The Guest in our tale is a simple creature: identical to the Host but
77 * behaving in simplified but equivalent ways. In particular, the Guest is the 78 * behaving in simplified but equivalent ways. In particular, the Guest is the
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
190#endif 191#endif
191 192
192/*G:036 193/*G:036
193 * When lazy mode is turned off reset the per-cpu lazy mode variable and then 194 * When lazy mode is turned off, we issue the do-nothing hypercall to
194 * issue the do-nothing hypercall to flush any stored calls. 195 * flush any stored calls, and call the generic helper to reset the
195:*/ 196 * per-cpu lazy mode variable.
197 */
196static void lguest_leave_lazy_mmu_mode(void) 198static void lguest_leave_lazy_mmu_mode(void)
197{ 199{
198 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); 200 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
199 paravirt_leave_lazy_mmu(); 201 paravirt_leave_lazy_mmu();
200} 202}
201 203
204/*
205 * We also catch the end of context switch; we enter lazy mode for much of
206 * that too, so again we need to flush here.
207 *
208 * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
209 * mode, but unlike Xen, lguest doesn't care about the difference).
210 */
202static void lguest_end_context_switch(struct task_struct *next) 211static void lguest_end_context_switch(struct task_struct *next)
203{ 212{
204 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); 213 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
391 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 400 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
392 * 401 *
393 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 402 * This instruction even it has its own Wikipedia entry. The Wikipedia entry
394 * has been translated into 5 languages. I am not making this up! 403 * has been translated into 6 languages. I am not making this up!
395 * 404 *
396 * We could get funky here and identify ourselves as "GenuineLguest", but 405 * We could get funky here and identify ourselves as "GenuineLguest", but
397 * instead we just use the real "cpuid" instruction. Then I pretty much turned 406 * instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
458 /* 467 /*
459 * PAE systems can mark pages as non-executable. Linux calls this the 468 * PAE systems can mark pages as non-executable. Linux calls this the
460 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced 469 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
461 * Virus Protection). We just switch turn if off here, since we don't 470 * Virus Protection). We just switch it off here, since we don't
462 * support it. 471 * support it.
463 */ 472 */
464 case 0x80000001: 473 case 0x80000001:
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)
520 529
521/* See lguest_set_pte() below. */ 530/* See lguest_set_pte() below. */
522static bool cr3_changed = false; 531static bool cr3_changed = false;
532static unsigned long current_cr3;
523 533
524/* 534/*
525 * cr3 is the current toplevel pagetable page: the principle is the same as 535 * cr3 is the current toplevel pagetable page: the principle is the same as
526 * cr0. Keep a local copy, and tell the Host when it changes. The only 536 * cr0. Keep a local copy, and tell the Host when it changes.
527 * difference is that our local copy is in lguest_data because the Host needs
528 * to set it upon our initial hypercall.
529 */ 537 */
530static void lguest_write_cr3(unsigned long cr3) 538static void lguest_write_cr3(unsigned long cr3)
531{ 539{
532 lguest_data.pgdir = cr3;
533 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); 540 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
541 current_cr3 = cr3;
534 542
535 /* These two page tables are simple, linear, and used during boot */ 543 /* These two page tables are simple, linear, and used during boot */
536 if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) 544 if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)
539 547
540static unsigned long lguest_read_cr3(void) 548static unsigned long lguest_read_cr3(void)
541{ 549{
542 return lguest_data.pgdir; 550 return current_cr3;
543} 551}
544 552
545/* cr4 is used to enable and disable PGE, but we don't care. */ 553/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)
641 649
642/* 650/*
643 * The Guest calls this after it has set a second-level entry (pte), ie. to map 651 * The Guest calls this after it has set a second-level entry (pte), ie. to map
644 * a page into a process' address space. Wetell the Host the toplevel and 652 * a page into a process' address space. We tell the Host the toplevel and
645 * address this corresponds to. The Guest uses one pagetable per process, so 653 * address this corresponds to. The Guest uses one pagetable per process, so
646 * we need to tell the Host which one we're changing (mm->pgd). 654 * we need to tell the Host which one we're changing (mm->pgd).
647 */ 655 */
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
758static void lguest_flush_tlb_single(unsigned long addr) 766static void lguest_flush_tlb_single(unsigned long addr)
759{ 767{
760 /* Simply set it to zero: if it was not, it will fault back in. */ 768 /* Simply set it to zero: if it was not, it will fault back in. */
761 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 769 lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
762} 770}
763 771
764/* 772/*
@@ -1140,7 +1148,7 @@ static struct notifier_block paniced = {
1140static __init char *lguest_memory_setup(void) 1148static __init char *lguest_memory_setup(void)
1141{ 1149{
1142 /* 1150 /*
1143 *The Linux bootloader header contains an "e820" memory map: the 1151 * The Linux bootloader header contains an "e820" memory map: the
1144 * Launcher populated the first entry with our memory limit. 1152 * Launcher populated the first entry with our memory limit.
1145 */ 1153 */
1146 e820_add_region(boot_params.e820_map[0].addr, 1154 e820_add_region(boot_params.e820_map[0].addr,
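
The boot.c hunks above stop reading the toplevel pagetable back out of lguest_data and instead keep a guest-local current_cr3 shadow: writes tell the Host via the (lazy) LHCALL_NEW_PGTABLE hypercall and update the shadow, while reads are served purely from the shadow. A rough sketch of that shadow-copy pattern follows; notify_host() is just a stand-in for the hypercall.

/*
 * Sketch of the "shadow copy + notify" pattern adopted by the
 * lguest_write_cr3()/lguest_read_cr3() change above; notify_host() stands
 * in for the LHCALL_NEW_PGTABLE hypercall and is purely illustrative.
 */
#include <stdio.h>

static unsigned long current_cr3;       /* guest-local shadow of %cr3 */

static void notify_host(unsigned long cr3)
{
        /* In lguest this would be lazy_hcall1(LHCALL_NEW_PGTABLE, cr3). */
        printf("host told about new toplevel pagetable 0x%lx\n", cr3);
}

static void write_cr3(unsigned long cr3)
{
        notify_host(cr3);       /* the Host must know which pagetable is live */
        current_cr3 = cr3;      /* ...but reads are served from the shadow */
}

static unsigned long read_cr3(void)
{
        return current_cr3;     /* no hypercall needed on the read side */
}

int main(void)
{
        write_cr3(0x1000);
        printf("read_cr3() = 0x%lx\n", read_cr3());
        return 0;
}
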
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d5..6ddfe4fc23c 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,18 +6,22 @@
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 8/*G:020
9 * Our story starts with the kernel booting into startup_32 in 9
10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 * Our story starts with the bzImage: booting starts at startup_32 in
11 * the bootloader (the Launcher in our case). 11 * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
12 * kernel in place and then jumps into it: startup_32 in
 13 * arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi
14 * register, which is created by the bootloader (the Launcher in our case).
12 * 15 *
13 * The startup_32 function does very little: it clears the uninitialized global 16 * The startup_32 function does very little: it clears the uninitialized global
14 * C variables which we expect to be zero (ie. BSS) and then copies the boot 17 * C variables which we expect to be zero (ie. BSS) and then copies the boot
15 * header and kernel command line somewhere safe. Finally it checks the 18 * header and kernel command line somewhere safe, and populates some initial
16 * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: 19 * page tables. Finally it checks the 'hardware_subarch' field. This was
17 * if it's set to '1' (lguest's assigned number), then it calls us here. 20 * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
21 * assigned number), then it calls us here.
18 * 22 *
19 * WARNING: be very careful here! We're running at addresses equal to physical 23 * WARNING: be very careful here! We're running at addresses equal to physical
20 * addesses (around 0), not above PAGE_OFFSET as most code expectes 24 * addresses (around 0), not above PAGE_OFFSET as most code expects
21 * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any 25 * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
22 * data without remembering to subtract __PAGE_OFFSET! 26 * data without remembering to subtract __PAGE_OFFSET!
23 * 27 *
@@ -27,13 +31,18 @@
27.section .init.text, "ax", @progbits 31.section .init.text, "ax", @progbits
28ENTRY(lguest_entry) 32ENTRY(lguest_entry)
29 /* 33 /*
30 * We make the "initialization" hypercall now to tell the Host about 34 * We make the "initialization" hypercall now to tell the Host where
31 * us, and also find out where it put our page tables. 35 * our lguest_data struct is.
32 */ 36 */
33 movl $LHCALL_LGUEST_INIT, %eax 37 movl $LHCALL_LGUEST_INIT, %eax
34 movl $lguest_data - __PAGE_OFFSET, %ebx 38 movl $lguest_data - __PAGE_OFFSET, %ebx
35 int $LGUEST_TRAP_ENTRY 39 int $LGUEST_TRAP_ENTRY
36 40
 41 /* Now turn our pagetables on; set up by arch/x86/kernel/head_32.S. */
42 movl $LHCALL_NEW_PGTABLE, %eax
43 movl $(initial_page_table - __PAGE_OFFSET), %ebx
44 int $LGUEST_TRAP_ENTRY
45
37 /* Set up the initial stack so we can run C code. */ 46 /* Set up the initial stack so we can run C code. */
38 movl $(init_thread_union+THREAD_SIZE),%esp 47 movl $(init_thread_union+THREAD_SIZE),%esp
39 48
@@ -96,12 +105,8 @@ send_interrupts:
96 */ 105 */
97 pushl %eax 106 pushl %eax
98 movl $LHCALL_SEND_INTERRUPTS, %eax 107 movl $LHCALL_SEND_INTERRUPTS, %eax
99 /* 108 /* This is the actual hypercall trap. */
100 * This is a vmcall instruction (same thing that KVM uses). Older 109 int $LGUEST_TRAP_ENTRY
101 * assembler versions might not know the "vmcall" instruction, so we
102 * create one manually here.
103 */
104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
105 /* Put eax back the way we found it. */ 110 /* Put eax back the way we found it. */
106 popl %eax 111 popl %eax
107 ret 112 ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2479f19ddd..b00f6785da7 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -18,8 +18,10 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
18 18
19lib-y := delay.o 19lib-y := delay.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
21lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
22lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-$(CONFIG_SMP) += rwlock.o
24lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
23lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o 25lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
24 26
25obj-y += msr.o msr-reg.o msr-reg-export.o 27obj-y += msr.o msr-reg.o msr-reg-export.o
@@ -29,7 +31,7 @@ ifeq ($(CONFIG_X86_32),y)
29 lib-y += atomic64_cx8_32.o 31 lib-y += atomic64_cx8_32.o
30 lib-y += checksum_32.o 32 lib-y += checksum_32.o
31 lib-y += strstr_32.o 33 lib-y += strstr_32.o
32 lib-y += semaphore_32.o string_32.o 34 lib-y += string_32.o
33 lib-y += cmpxchg.o 35 lib-y += cmpxchg.o
34ifneq ($(CONFIG_X86_CMPXCHG64),y) 36ifneq ($(CONFIG_X86_CMPXCHG64),y)
35 lib-y += cmpxchg8b_emu.o atomic64_386_32.o 37 lib-y += cmpxchg8b_emu.o atomic64_386_32.o
@@ -40,7 +42,6 @@ else
40 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 42 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
41 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 43 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
42 lib-y += memmove_64.o memset_64.o 44 lib-y += memmove_64.o memset_64.o
43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 45 lib-y += copy_user_64.o copy_user_nocache_64.o
44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
45 lib-y += cmpxchg16b_emu.o 46 lib-y += cmpxchg16b_emu.o
46endif 47endif
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 540179e8e9f..042f6826bf5 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -4,7 +4,7 @@
4 4
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/cmpxchg.h> 6#include <asm/cmpxchg.h>
7#include <asm/atomic.h> 7#include <linux/atomic.h>
8 8
9long long atomic64_read_cx8(long long, const atomic64_t *v); 9long long atomic64_read_cx8(long long, const atomic64_t *v);
10EXPORT_SYMBOL(atomic64_read_cx8); 10EXPORT_SYMBOL(atomic64_read_cx8);
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1cebe..01c805ba535 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,6 +2,7 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/alternative-asm.h>
5 6
6 ALIGN 7 ALIGN
7copy_page_c: 8copy_page_c:
@@ -110,10 +111,6 @@ ENDPROC(copy_page)
1102: 1112:
111 .previous 112 .previous
112 .section .altinstructions,"a" 113 .section .altinstructions,"a"
113 .align 8 114 altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
114 .quad copy_page 115 .Lcopy_page_end-copy_page, 2b-1b
115 .quad 1b
116 .word X86_FEATURE_REP_GOOD
117 .byte .Lcopy_page_end - copy_page
118 .byte 2b - 1b
119 .previous 116 .previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index d0ec9c2936d..ee164610ec4 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -9,6 +9,7 @@
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
12#include <asm/alternative-asm.h>
12 13
13#undef memmove 14#undef memmove
14 15
@@ -214,11 +215,9 @@ ENTRY(memmove)
214 .previous 215 .previous
215 216
216 .section .altinstructions,"a" 217 .section .altinstructions,"a"
217 .align 8 218 altinstruction_entry .Lmemmove_begin_forward, \
218 .quad .Lmemmove_begin_forward 219 .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
219 .quad .Lmemmove_begin_forward_efs 220 .Lmemmove_end_forward-.Lmemmove_begin_forward, \
220 .word X86_FEATURE_ERMS 221 .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
221 .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
222 .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
223 .previous 222 .previous
224ENDPROC(memmove) 223ENDPROC(memmove)
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
new file mode 100644
index 00000000000..1cad22139c8
--- /dev/null
+++ b/arch/x86/lib/rwlock.S
@@ -0,0 +1,44 @@
1/* Slow paths of read/write spinlocks. */
2
3#include <linux/linkage.h>
4#include <asm/alternative-asm.h>
5#include <asm/frame.h>
6#include <asm/rwlock.h>
7
8#ifdef CONFIG_X86_32
9# define __lock_ptr eax
10#else
11# define __lock_ptr rdi
12#endif
13
14ENTRY(__write_lock_failed)
15 CFI_STARTPROC
16 FRAME
170: LOCK_PREFIX
18 WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
191: rep; nop
20 cmpl $WRITE_LOCK_CMP, (%__lock_ptr)
21 jne 1b
22 LOCK_PREFIX
23 WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
24 jnz 0b
25 ENDFRAME
26 ret
27 CFI_ENDPROC
28END(__write_lock_failed)
29
30ENTRY(__read_lock_failed)
31 CFI_STARTPROC
32 FRAME
330: LOCK_PREFIX
34 READ_LOCK_SIZE(inc) (%__lock_ptr)
351: rep; nop
36 READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
37 js 1b
38 LOCK_PREFIX
39 READ_LOCK_SIZE(dec) (%__lock_ptr)
40 js 0b
41 ENDFRAME
42 ret
43 CFI_ENDPROC
44END(__read_lock_failed)
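
The new rwlock.S above carries the contended paths: the inline fast path has already adjusted the lock word, so the slow path undoes that adjustment, spins with rep; nop until the lock looks free, and retries. Below is a C11-atomics sketch of the write-lock retry loop; RW_LOCK_BIAS and the cpu_relax() stand-in are illustrative, not lifted from the headers.

/*
 * C11-atomics sketch of the __write_lock_failed slow path in the new
 * arch/x86/lib/rwlock.S above; RW_LOCK_BIAS is an illustrative constant
 * and cpu_relax() is only a compiler barrier here.
 */
#include <stdatomic.h>
#include <stdio.h>

#define RW_LOCK_BIAS    0x01000000

static void cpu_relax(void)
{
        atomic_signal_fence(memory_order_seq_cst);      /* rep; nop in the asm */
}

/* Fast path: subtract the bias; hitting zero means we own it exclusively. */
static int write_trylock(atomic_int *lock)
{
        return atomic_fetch_sub(lock, RW_LOCK_BIAS) == RW_LOCK_BIAS;
}

/* Slow path: give the bias back, wait until the lock looks free, retry. */
static void write_lock_failed(atomic_int *lock)
{
        do {
                atomic_fetch_add(lock, RW_LOCK_BIAS);
                while (atomic_load(lock) != RW_LOCK_BIAS)
                        cpu_relax();
        } while (atomic_fetch_sub(lock, RW_LOCK_BIAS) != RW_LOCK_BIAS);
}

static void write_lock(atomic_int *lock)
{
        if (!write_trylock(lock))
                write_lock_failed(lock);
}

int main(void)
{
        atomic_int lock = RW_LOCK_BIAS;         /* unlocked state */

        write_lock(&lock);
        printf("lock value while write-held: 0x%x\n", atomic_load(&lock));
        atomic_fetch_add(&lock, RW_LOCK_BIAS);  /* write_unlock */
        return 0;
}
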
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
deleted file mode 100644
index 05ea55f7140..00000000000
--- a/arch/x86/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
1/* Slow paths of read/write spinlocks. */
2
3#include <linux/linkage.h>
4#include <asm/rwlock.h>
5#include <asm/alternative-asm.h>
6#include <asm/dwarf2.h>
7
8/* rdi: pointer to rwlock_t */
9ENTRY(__write_lock_failed)
10 CFI_STARTPROC
11 LOCK_PREFIX
12 addl $RW_LOCK_BIAS,(%rdi)
131: rep
14 nop
15 cmpl $RW_LOCK_BIAS,(%rdi)
16 jne 1b
17 LOCK_PREFIX
18 subl $RW_LOCK_BIAS,(%rdi)
19 jnz __write_lock_failed
20 ret
21 CFI_ENDPROC
22END(__write_lock_failed)
23
24/* rdi: pointer to rwlock_t */
25ENTRY(__read_lock_failed)
26 CFI_STARTPROC
27 LOCK_PREFIX
28 incl (%rdi)
291: rep
30 nop
31 cmpl $1,(%rdi)
32 js 1b
33 LOCK_PREFIX
34 decl (%rdi)
35 js __read_lock_failed
36 ret
37 CFI_ENDPROC
38END(__read_lock_failed)
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem.S
index 67743977398..5dff5f04246 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem.S
@@ -1,4 +1,51 @@
1/* 1/*
2 * x86 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15
16#include <linux/linkage.h>
17#include <asm/alternative-asm.h>
18#include <asm/dwarf2.h>
19
20#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
21#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
22
23#ifdef CONFIG_X86_32
24
25/*
26 * The semaphore operations have a special calling sequence that
27 * allow us to do a simpler in-line version of them. These routines
28 * need to convert that sequence back into the C sequence when
29 * there is contention on the semaphore.
30 *
31 * %eax contains the semaphore pointer on entry. Save the C-clobbered
 32 * registers (%eax, %edx and %ecx) except %eax which is either a return
 33 * value or just clobbered.
34 */
35
36#define save_common_regs \
37 pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
38
39#define restore_common_regs \
40 popl_cfi %ecx; CFI_RESTORE ecx
41
42 /* Avoid uglifying the argument copying x86-64 needs to do. */
43 .macro movq src, dst
44 .endm
45
46#else
47
48/*
2 * x86-64 rwsem wrappers 49 * x86-64 rwsem wrappers
3 * 50 *
4 * This interfaces the inline asm code to the slow-path 51 * This interfaces the inline asm code to the slow-path
@@ -16,12 +63,6 @@
16 * but %rdi, %rsi, %rcx, %r8-r11 always need saving. 63 * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
17 */ 64 */
18 65
19#include <linux/linkage.h>
20#include <asm/rwlock.h>
21#include <asm/alternative-asm.h>
22#include <asm/frame.h>
23#include <asm/dwarf2.h>
24
25#define save_common_regs \ 66#define save_common_regs \
26 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ 67 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
27 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ 68 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
@@ -40,16 +81,18 @@
40 popq_cfi %rsi; CFI_RESTORE rsi; \ 81 popq_cfi %rsi; CFI_RESTORE rsi; \
41 popq_cfi %rdi; CFI_RESTORE rdi 82 popq_cfi %rdi; CFI_RESTORE rdi
42 83
84#endif
85
43/* Fix up special calling conventions */ 86/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed) 87ENTRY(call_rwsem_down_read_failed)
45 CFI_STARTPROC 88 CFI_STARTPROC
46 save_common_regs 89 save_common_regs
47 pushq_cfi %rdx 90 __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
48 CFI_REL_OFFSET rdx, 0 91 CFI_REL_OFFSET __ASM_REG(dx), 0
49 movq %rax,%rdi 92 movq %rax,%rdi
50 call rwsem_down_read_failed 93 call rwsem_down_read_failed
51 popq_cfi %rdx 94 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
52 CFI_RESTORE rdx 95 CFI_RESTORE __ASM_REG(dx)
53 restore_common_regs 96 restore_common_regs
54 ret 97 ret
55 CFI_ENDPROC 98 CFI_ENDPROC
@@ -67,7 +110,8 @@ ENDPROC(call_rwsem_down_write_failed)
67 110
68ENTRY(call_rwsem_wake) 111ENTRY(call_rwsem_wake)
69 CFI_STARTPROC 112 CFI_STARTPROC
70 decl %edx /* do nothing if still outstanding active readers */ 113 /* do nothing if still outstanding active readers */
114 __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
71 jnz 1f 115 jnz 1f
72 save_common_regs 116 save_common_regs
73 movq %rax,%rdi 117 movq %rax,%rdi
@@ -77,16 +121,15 @@ ENTRY(call_rwsem_wake)
77 CFI_ENDPROC 121 CFI_ENDPROC
78ENDPROC(call_rwsem_wake) 122ENDPROC(call_rwsem_wake)
79 123
80/* Fix up special calling conventions */
81ENTRY(call_rwsem_downgrade_wake) 124ENTRY(call_rwsem_downgrade_wake)
82 CFI_STARTPROC 125 CFI_STARTPROC
83 save_common_regs 126 save_common_regs
84 pushq_cfi %rdx 127 __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
85 CFI_REL_OFFSET rdx, 0 128 CFI_REL_OFFSET __ASM_REG(dx), 0
86 movq %rax,%rdi 129 movq %rax,%rdi
87 call rwsem_downgrade_wake 130 call rwsem_downgrade_wake
88 popq_cfi %rdx 131 __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
89 CFI_RESTORE rdx 132 CFI_RESTORE __ASM_REG(dx)
90 restore_common_regs 133 restore_common_regs
91 ret 134 ret
92 CFI_ENDPROC 135 CFI_ENDPROC
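
The unified rwsem.S above leans on __ASM_HALF_REG()/__ASM_HALF_SIZE() (built on __ASM_SEL()) so the very same source assembles to %dx/decw on 32-bit and %edx/decl on 64-bit. The small preprocessor sketch below shows that selection trick; CONFIG_64BIT and the macro names are demo stand-ins, not the kernel's definitions.

/*
 * Sketch of the token-selection trick used by the unified rwsem.S above:
 * one macro picks the 32- or 64-bit spelling of a register/instruction.
 * CONFIG_64BIT here is just a demo switch.
 */
#include <stdio.h>

#define CONFIG_64BIT 1

#if CONFIG_64BIT
# define SEL(a, b)      #b      /* 64-bit build: use the second spelling */
#else
# define SEL(a, b)      #a      /* 32-bit build: use the first spelling */
#endif

#define HALF_REG(reg)   SEL(reg, e##reg)        /* dx  vs. edx  */
#define HALF_SIZE(op)   SEL(op##w, op##l)       /* opw vs. opl  */

int main(void)
{
        /* Prints "decl %edx" here; a 32-bit build would print "decw %dx". */
        printf("%s %%%s\n", HALF_SIZE(dec), HALF_REG(dx));
        return 0;
}
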
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
deleted file mode 100644
index 06691daa410..00000000000
--- a/arch/x86/lib/semaphore_32.S
+++ /dev/null
@@ -1,124 +0,0 @@
1/*
2 * i386 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15
16#include <linux/linkage.h>
17#include <asm/rwlock.h>
18#include <asm/alternative-asm.h>
19#include <asm/frame.h>
20#include <asm/dwarf2.h>
21
22/*
23 * The semaphore operations have a special calling sequence that
24 * allow us to do a simpler in-line version of them. These routines
25 * need to convert that sequence back into the C sequence when
26 * there is contention on the semaphore.
27 *
28 * %eax contains the semaphore pointer on entry. Save the C-clobbered
29 * registers (%eax, %edx and %ecx) except %eax whish is either a return
30 * value or just clobbered..
31 */
32 .section .sched.text, "ax"
33
34/*
35 * rw spinlock fallbacks
36 */
37#ifdef CONFIG_SMP
38ENTRY(__write_lock_failed)
39 CFI_STARTPROC
40 FRAME
412: LOCK_PREFIX
42 addl $ RW_LOCK_BIAS,(%eax)
431: rep; nop
44 cmpl $ RW_LOCK_BIAS,(%eax)
45 jne 1b
46 LOCK_PREFIX
47 subl $ RW_LOCK_BIAS,(%eax)
48 jnz 2b
49 ENDFRAME
50 ret
51 CFI_ENDPROC
52 ENDPROC(__write_lock_failed)
53
54ENTRY(__read_lock_failed)
55 CFI_STARTPROC
56 FRAME
572: LOCK_PREFIX
58 incl (%eax)
591: rep; nop
60 cmpl $1,(%eax)
61 js 1b
62 LOCK_PREFIX
63 decl (%eax)
64 js 2b
65 ENDFRAME
66 ret
67 CFI_ENDPROC
68 ENDPROC(__read_lock_failed)
69
70#endif
71
72#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
73
74/* Fix up special calling conventions */
75ENTRY(call_rwsem_down_read_failed)
76 CFI_STARTPROC
77 pushl_cfi %ecx
78 CFI_REL_OFFSET ecx,0
79 pushl_cfi %edx
80 CFI_REL_OFFSET edx,0
81 call rwsem_down_read_failed
82 popl_cfi %edx
83 popl_cfi %ecx
84 ret
85 CFI_ENDPROC
86 ENDPROC(call_rwsem_down_read_failed)
87
88ENTRY(call_rwsem_down_write_failed)
89 CFI_STARTPROC
90 pushl_cfi %ecx
91 CFI_REL_OFFSET ecx,0
92 calll rwsem_down_write_failed
93 popl_cfi %ecx
94 ret
95 CFI_ENDPROC
96 ENDPROC(call_rwsem_down_write_failed)
97
98ENTRY(call_rwsem_wake)
99 CFI_STARTPROC
100 decw %dx /* do nothing if still outstanding active readers */
101 jnz 1f
102 pushl_cfi %ecx
103 CFI_REL_OFFSET ecx,0
104 call rwsem_wake
105 popl_cfi %ecx
1061: ret
107 CFI_ENDPROC
108 ENDPROC(call_rwsem_wake)
109
110/* Fix up special calling conventions */
111ENTRY(call_rwsem_downgrade_wake)
112 CFI_STARTPROC
113 pushl_cfi %ecx
114 CFI_REL_OFFSET ecx,0
115 pushl_cfi %edx
116 CFI_REL_OFFSET edx,0
117 call rwsem_downgrade_wake
118 popl_cfi %edx
119 popl_cfi %ecx
120 ret
121 CFI_ENDPROC
122 ENDPROC(call_rwsem_downgrade_wake)
123
124#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 782b082c9ff..a63efd6bb6a 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -5,50 +5,41 @@
5 * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. 5 * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
6 * Subject to the GNU public license, v.2. No warranty of any kind. 6 * Subject to the GNU public license, v.2. No warranty of any kind.
7 */ 7 */
8#include <linux/linkage.h>
9#include <asm/dwarf2.h>
10#include <asm/calling.h>
8 11
9 #include <linux/linkage.h> 12 /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
10 #include <asm/dwarf2.h> 13 .macro THUNK name, func, put_ret_addr_in_rdi=0
11 #include <asm/calling.h>
12 #include <asm/rwlock.h>
13
14 /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
15 .macro thunk name,func
16 .globl \name
17\name:
18 CFI_STARTPROC
19 SAVE_ARGS
20 call \func
21 jmp restore
22 CFI_ENDPROC
23 .endm
24
25#ifdef CONFIG_TRACE_IRQFLAGS
26 /* put return address in rdi (arg1) */
27 .macro thunk_ra name,func
28 .globl \name 14 .globl \name
29\name: 15\name:
30 CFI_STARTPROC 16 CFI_STARTPROC
17
18 /* this one pushes 9 elems, the next one would be %rIP */
31 SAVE_ARGS 19 SAVE_ARGS
32 /* SAVE_ARGS pushs 9 elements */ 20
33 /* the next element would be the rip */ 21 .if \put_ret_addr_in_rdi
34 movq 9*8(%rsp), %rdi 22 movq_cfi_restore 9*8, rdi
23 .endif
24
35 call \func 25 call \func
36 jmp restore 26 jmp restore
37 CFI_ENDPROC 27 CFI_ENDPROC
38 .endm 28 .endm
39 29
40 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller 30#ifdef CONFIG_TRACE_IRQFLAGS
41 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller 31 THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
32 THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
42#endif 33#endif
43 34
44#ifdef CONFIG_DEBUG_LOCK_ALLOC 35#ifdef CONFIG_DEBUG_LOCK_ALLOC
45 thunk lockdep_sys_exit_thunk,lockdep_sys_exit 36 THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
46#endif 37#endif
47 38
48 /* SAVE_ARGS below is used only for the .cfi directives it contains. */ 39 /* SAVE_ARGS below is used only for the .cfi directives it contains. */
49 CFI_STARTPROC 40 CFI_STARTPROC
50 SAVE_ARGS 41 SAVE_ARGS
51restore: 42restore:
52 RESTORE_ARGS 43 RESTORE_ARGS
53 ret 44 ret
54 CFI_ENDPROC 45 CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 00000000000..97be9cb5448
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,43 @@
1/*
2 * User address space access functions.
3 *
4 * For licencing details see kernel-base/COPYING
5 */
6
7#include <linux/highmem.h>
8#include <linux/module.h>
9
10/*
11 * best effort, GUP based copy_from_user() that is NMI-safe
12 */
13unsigned long
14copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
15{
16 unsigned long offset, addr = (unsigned long)from;
17 unsigned long size, len = 0;
18 struct page *page;
19 void *map;
20 int ret;
21
22 do {
23 ret = __get_user_pages_fast(addr, 1, 0, &page);
24 if (!ret)
25 break;
26
27 offset = addr & (PAGE_SIZE - 1);
28 size = min(PAGE_SIZE - offset, n - len);
29
30 map = kmap_atomic(page);
31 memcpy(to, map+offset, size);
32 kunmap_atomic(map);
33 put_page(page);
34
35 len += size;
36 to += size;
37 addr += size;
38
39 } while (len < n);
40
41 return len;
42}
43EXPORT_SYMBOL_GPL(copy_from_user_nmi);
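
copy_from_user_nmi() above has to work from NMI context, so instead of the ordinary user-copy path it pins one user page at a time with __get_user_pages_fast(), maps it with kmap_atomic() and copies at most up to the next page boundary per iteration. The user-space sketch below exercises just the chunking arithmetic, with memcpy() standing in for the pin-and-map step.

/*
 * User-space sketch of the per-page chunking loop in copy_from_user_nmi()
 * above; memcpy() replaces the __get_user_pages_fast()/kmap_atomic() step,
 * so only the offset/size arithmetic is demonstrated.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE       4096ul

static unsigned long copy_in_page_chunks(void *to, const void *from,
                                         unsigned long n)
{
        unsigned long addr = (unsigned long)from;
        unsigned long len = 0;

        while (len < n) {
                unsigned long offset = addr & (PAGE_SIZE - 1);
                unsigned long size = PAGE_SIZE - offset;

                if (size > n - len)
                        size = n - len;

                /*
                 * In the kernel this chunk is pinned with
                 * __get_user_pages_fast() and mapped with kmap_atomic().
                 */
                memcpy((char *)to + len, (const char *)from + len, size);

                len += size;
                addr += size;
        }
        return len;
}

int main(void)
{
        static char src[3 * 4096], dst[3 * 4096];
        unsigned long copied;

        memset(src, 0x5a, sizeof(src));
        /* Start 100 bytes before a page boundary to force a split copy. */
        copied = copy_in_page_chunks(dst, src + 4096 - 100, 8192);

        printf("copied %lu bytes, first byte 0x%x\n", copied, dst[0] & 0xff);
        return 0;
}
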
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2dbf6bf4c7e..0d17c8c50ac 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -17,6 +17,7 @@
17#include <asm/traps.h> /* dotraplinkage, ... */ 17#include <asm/traps.h> /* dotraplinkage, ... */
18#include <asm/pgalloc.h> /* pgd_*(), ... */ 18#include <asm/pgalloc.h> /* pgd_*(), ... */
19#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 19#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
20#include <asm/vsyscall.h>
20 21
21/* 22/*
22 * Page fault error code bits: 23 * Page fault error code bits:
@@ -105,7 +106,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
105 * but for now it's good enough to assume that long 106 * but for now it's good enough to assume that long
106 * mode only uses well known segments or kernel. 107 * mode only uses well known segments or kernel.
107 */ 108 */
108 return (!user_mode(regs)) || (regs->cs == __USER_CS); 109 return (!user_mode(regs) || user_64bit_mode(regs));
109#endif 110#endif
110 case 0x60: 111 case 0x60:
111 /* 0x64 thru 0x67 are valid prefixes in all modes. */ 112 /* 0x64 thru 0x67 are valid prefixes in all modes. */
@@ -720,6 +721,18 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
720 if (is_errata100(regs, address)) 721 if (is_errata100(regs, address))
721 return; 722 return;
722 723
724#ifdef CONFIG_X86_64
725 /*
726 * Instruction fetch faults in the vsyscall page might need
727 * emulation.
728 */
729 if (unlikely((error_code & PF_INSTR) &&
730 ((address & ~0xfff) == VSYSCALL_START))) {
731 if (emulate_vsyscall(regs, address))
732 return;
733 }
734#endif
735
723 if (unlikely(show_unhandled_signals)) 736 if (unlikely(show_unhandled_signals))
724 show_signal_msg(regs, error_code, address, tsk); 737 show_signal_msg(regs, error_code, address, tsk);
725 738
@@ -1059,7 +1072,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1059 if (unlikely(error_code & PF_RSVD)) 1072 if (unlikely(error_code & PF_RSVD))
1060 pgtable_bad(regs, error_code, address); 1073 pgtable_bad(regs, error_code, address);
1061 1074
1062 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); 1075 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1063 1076
1064 /* 1077 /*
1065 * If we're in an interrupt, have no user context or are running 1078 * If we're in an interrupt, have no user context or are running
@@ -1161,11 +1174,11 @@ good_area:
1161 if (flags & FAULT_FLAG_ALLOW_RETRY) { 1174 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1162 if (fault & VM_FAULT_MAJOR) { 1175 if (fault & VM_FAULT_MAJOR) {
1163 tsk->maj_flt++; 1176 tsk->maj_flt++;
1164 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1177 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
1165 regs, address); 1178 regs, address);
1166 } else { 1179 } else {
1167 tsk->min_flt++; 1180 tsk->min_flt++;
1168 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1181 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
1169 regs, address); 1182 regs, address);
1170 } 1183 }
1171 if (fault & VM_FAULT_RETRY) { 1184 if (fault & VM_FAULT_RETRY) {
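
The fault.c hunk above forwards instruction-fetch faults whose page-aligned address equals VSYSCALL_START to the vsyscall emulation code. A tiny sketch of that page-masking test follows; the VSYSCALL_START value is quoted only for illustration.

/*
 * Tiny sketch of the page-masking test in the vsyscall hunk above;
 * VSYSCALL_START is the historical fixed address, used illustratively.
 */
#include <stdbool.h>
#include <stdio.h>

#define VSYSCALL_START  0xffffffffff600000ul

static bool in_vsyscall_page(unsigned long address)
{
        /* Mask off the low 12 bits so any offset within the page matches. */
        return (address & ~0xffful) == VSYSCALL_START;
}

int main(void)
{
        printf("%d %d\n", in_vsyscall_page(VSYSCALL_START + 0x400),
               in_vsyscall_page(0x400000ul));   /* 1 0 */
        return 0;
}
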
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dbe34b93137..dd74e46828c 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -108,16 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr)
108 SetPageReferenced(page); 108 SetPageReferenced(page);
109} 109}
110 110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
119}
120
121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 111static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
122 unsigned long end, int write, struct page **pages, int *nr) 112 unsigned long end, int write, struct page **pages, int *nr)
123{ 113{
@@ -211,6 +201,8 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
211 do { 201 do {
212 VM_BUG_ON(compound_head(page) != head); 202 VM_BUG_ON(compound_head(page) != head);
213 pages[*nr] = page; 203 pages[*nr] = page;
204 if (PageTail(page))
205 get_huge_page_tail(page);
214 (*nr)++; 206 (*nr)++;
215 page++; 207 page++;
216 refs++; 208 refs++;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index b4996266210..f4f29b19fac 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -45,6 +45,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
45 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 45 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
46 BUG_ON(!pte_none(*(kmap_pte-idx))); 46 BUG_ON(!pte_none(*(kmap_pte-idx)));
47 set_pte(kmap_pte-idx, mk_pte(page, prot)); 47 set_pte(kmap_pte-idx, mk_pte(page, prot));
48 arch_flush_lazy_mmu_mode();
48 49
49 return (void *)vaddr; 50 return (void *)vaddr;
50} 51}
@@ -88,6 +89,7 @@ void __kunmap_atomic(void *kvaddr)
88 */ 89 */
89 kpte_clear_flush(kmap_pte-idx, vaddr); 90 kpte_clear_flush(kmap_pte-idx, vaddr);
90 kmap_atomic_idx_pop(); 91 kmap_atomic_idx_pop();
92 arch_flush_lazy_mmu_mode();
91 } 93 }
92#ifdef CONFIG_DEBUG_HIGHMEM 94#ifdef CONFIG_DEBUG_HIGHMEM
93 else { 95 else {
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 30326443ab8..87488b93a65 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -63,9 +63,8 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
63#ifdef CONFIG_X86_32 63#ifdef CONFIG_X86_32
64 /* for fixmap */ 64 /* for fixmap */
65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
66
67 good_end = max_pfn_mapped << PAGE_SHIFT;
68#endif 66#endif
67 good_end = max_pfn_mapped << PAGE_SHIFT;
69 68
70 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); 69 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
71 if (base == MEMBLOCK_ERROR) 70 if (base == MEMBLOCK_ERROR)
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 704a37ceddd..dab41876cdd 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
185 e->trace.entries = e->trace_entries; 185 e->trace.entries = e->trace_entries;
186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries); 186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
187 e->trace.skip = 0; 187 e->trace.skip = 0;
188 save_stack_trace_regs(&e->trace, regs); 188 save_stack_trace_regs(regs, &e->trace);
189 189
190 /* Round address down to nearest 16 bytes */ 190 /* Round address down to nearest 16 bytes */
191 shadow_copy = kmemcheck_shadow_lookup(address 191 shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab5194fd9..f927429d07c 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -87,9 +87,9 @@ static unsigned long mmap_rnd(void)
87 */ 87 */
88 if (current->flags & PF_RANDOMIZE) { 88 if (current->flags & PF_RANDOMIZE) {
89 if (mmap_is_ia32()) 89 if (mmap_is_ia32())
90 rnd = (long)get_random_int() % (1<<8); 90 rnd = get_random_int() % (1<<8);
91 else 91 else
92 rnd = (long)(get_random_int() % (1<<28)); 92 rnd = get_random_int() % (1<<28);
93 } 93 }
94 return rnd << PAGE_SHIFT; 94 return rnd << PAGE_SHIFT;
95} 95}
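
mmap_rnd() above draws 8 random bits worth of pages for 32-bit tasks and 28 bits for 64-bit tasks, then scales by PAGE_SHIFT; the patch itself only drops a redundant cast. A quick arithmetic check of the resulting randomization spans:

/* Quick check of the mmap_rnd() spans touched above: 8 vs. 28 random bits
 * of pages, both scaled by PAGE_SHIFT. */
#include <stdio.h>

#define PAGE_SHIFT      12

int main(void)
{
        unsigned long long span32 = (1ull << 8) << PAGE_SHIFT;  /* 1 MiB */
        unsigned long long span64 = (1ull << 28) << PAGE_SHIFT; /* 1 TiB */

        printf("ia32 mmap randomization span: %llu KiB\n", span32 >> 10);
        printf("64-bit mmap randomization span: %llu GiB\n", span64 >> 30);
        return 0;
}
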
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 3adff7dcc14..67421f38a21 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -34,7 +34,7 @@
34#include <asm/pgtable.h> 34#include <asm/pgtable.h>
35#include <linux/mmiotrace.h> 35#include <linux/mmiotrace.h>
36#include <asm/e820.h> /* for ISA_START_ADDRESS */ 36#include <asm/e820.h> /* for ISA_START_ADDRESS */
37#include <asm/atomic.h> 37#include <linux/atomic.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/cpu.h> 39#include <linux/cpu.h>
40 40
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f5510d889a2..fbeaaf41661 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -496,6 +496,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
496 496
497static int __init numa_register_memblks(struct numa_meminfo *mi) 497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{ 498{
499 unsigned long uninitialized_var(pfn_align);
499 int i, nid; 500 int i, nid;
500 501
501 /* Account for nodes with cpus and no memory */ 502 /* Account for nodes with cpus and no memory */
@@ -511,6 +512,20 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
511 512
512 /* for out of order entries */ 513 /* for out of order entries */
513 sort_node_map(); 514 sort_node_map();
515
516 /*
517 * If sections array is gonna be used for pfn -> nid mapping, check
518 * whether its granularity is fine enough.
519 */
520#ifdef NODE_NOT_IN_PAGE_FLAGS
521 pfn_align = node_map_pfn_alignment();
522 if (pfn_align && pfn_align < PAGES_PER_SECTION) {
523 printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
524 PFN_PHYS(pfn_align) >> 20,
525 PFN_PHYS(PAGES_PER_SECTION) >> 20);
526 return -EINVAL;
527 }
528#endif
514 if (!numa_meminfo_cover_memory(mi)) 529 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL; 530 return -EINVAL;
516 531
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 849a975d3fa..3adebe7e536 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -41,7 +41,7 @@
41 * physnode_map[16-31] = 1; 41 * physnode_map[16-31] = 1;
42 * physnode_map[32- ] = -1; 42 * physnode_map[32- ] = -1;
43 */ 43 */
44s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 44s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
45EXPORT_SYMBOL(physnode_map); 45EXPORT_SYMBOL(physnode_map);
46 46
47void memory_present(int nid, unsigned long start, unsigned long end) 47void memory_present(int nid, unsigned long start, unsigned long end)
@@ -52,8 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
52 nid, start, end); 52 nid, start, end);
53 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 53 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
54 printk(KERN_DEBUG " "); 54 printk(KERN_DEBUG " ");
55 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 55 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
56 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 56 physnode_map[pfn / PAGES_PER_SECTION] = nid;
57 printk(KERN_CONT "%lx ", pfn); 57 printk(KERN_CONT "%lx ", pfn);
58 } 58 }
59 printk(KERN_CONT "\n"); 59 printk(KERN_CONT "\n");
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index e1d10690921..b0086567271 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -123,12 +123,11 @@ static int pageattr_test(void)
123 if (print) 123 if (print)
124 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
125 125
126 bm = vmalloc((max_pfn_mapped + 7) / 8); 126 bm = vzalloc((max_pfn_mapped + 7) / 8);
127 if (!bm) { 127 if (!bm) {
128 printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); 128 printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
129 return -ENOMEM; 129 return -ENOMEM;
130 } 130 }
131 memset(bm, 0, (max_pfn_mapped + 7) / 8);
132 131
133 failed += print_split(&sa); 132 failed += print_split(&sa);
134 srandom32(100); 133 srandom32(100);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bfab3fa10ed..7c1b765ecc5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -151,17 +151,18 @@ void bpf_jit_compile(struct sk_filter *fp)
151 cleanup_addr = proglen; /* epilogue address */ 151 cleanup_addr = proglen; /* epilogue address */
152 152
153 for (pass = 0; pass < 10; pass++) { 153 for (pass = 0; pass < 10; pass++) {
154 u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen;
154 /* no prologue/epilogue for trivial filters (RET something) */ 155 /* no prologue/epilogue for trivial filters (RET something) */
155 proglen = 0; 156 proglen = 0;
156 prog = temp; 157 prog = temp;
157 158
158 if (seen) { 159 if (seen_or_pass0) {
159 EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ 160 EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
160 EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ 161 EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
161 /* note : must save %rbx in case bpf_error is hit */ 162 /* note : must save %rbx in case bpf_error is hit */
162 if (seen & (SEEN_XREG | SEEN_DATAREF)) 163 if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF))
163 EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ 164 EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
164 if (seen & SEEN_XREG) 165 if (seen_or_pass0 & SEEN_XREG)
165 CLEAR_X(); /* make sure we dont leek kernel memory */ 166 CLEAR_X(); /* make sure we dont leek kernel memory */
166 167
167 /* 168 /*
@@ -170,7 +171,7 @@ void bpf_jit_compile(struct sk_filter *fp)
170 * r9 = skb->len - skb->data_len 171 * r9 = skb->len - skb->data_len
171 * r8 = skb->data 172 * r8 = skb->data
172 */ 173 */
173 if (seen & SEEN_DATAREF) { 174 if (seen_or_pass0 & SEEN_DATAREF) {
174 if (offsetof(struct sk_buff, len) <= 127) 175 if (offsetof(struct sk_buff, len) <= 127)
175 /* mov off8(%rdi),%r9d */ 176 /* mov off8(%rdi),%r9d */
176 EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); 177 EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
@@ -260,9 +261,14 @@ void bpf_jit_compile(struct sk_filter *fp)
260 case BPF_S_ALU_DIV_X: /* A /= X; */ 261 case BPF_S_ALU_DIV_X: /* A /= X; */
261 seen |= SEEN_XREG; 262 seen |= SEEN_XREG;
262 EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ 263 EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
263 if (pc_ret0 != -1) 264 if (pc_ret0 > 0) {
264 EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4)); 265 /* addrs[pc_ret0 - 1] is start address of target
265 else { 266 * (addrs[i] - 4) is the address following this jmp
267 * ("xor %edx,%edx; div %ebx" being 4 bytes long)
268 */
269 EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] -
270 (addrs[i] - 4));
271 } else {
266 EMIT_COND_JMP(X86_JNE, 2 + 5); 272 EMIT_COND_JMP(X86_JNE, 2 + 5);
267 CLEAR_A(); 273 CLEAR_A();
268 EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */ 274 EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
@@ -335,12 +341,12 @@ void bpf_jit_compile(struct sk_filter *fp)
335 } 341 }
336 /* fallinto */ 342 /* fallinto */
337 case BPF_S_RET_A: 343 case BPF_S_RET_A:
338 if (seen) { 344 if (seen_or_pass0) {
339 if (i != flen - 1) { 345 if (i != flen - 1) {
340 EMIT_JMP(cleanup_addr - addrs[i]); 346 EMIT_JMP(cleanup_addr - addrs[i]);
341 break; 347 break;
342 } 348 }
343 if (seen & SEEN_XREG) 349 if (seen_or_pass0 & SEEN_XREG)
344 EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ 350 EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
345 EMIT1(0xc9); /* leaveq */ 351 EMIT1(0xc9); /* leaveq */
346 } 352 }
@@ -483,8 +489,9 @@ common_load: seen |= SEEN_DATAREF;
483 goto common_load; 489 goto common_load;
484 case BPF_S_LDX_B_MSH: 490 case BPF_S_LDX_B_MSH:
485 if ((int)K < 0) { 491 if ((int)K < 0) {
486 if (pc_ret0 != -1) { 492 if (pc_ret0 > 0) {
487 EMIT_JMP(addrs[pc_ret0] - addrs[i]); 493 /* addrs[pc_ret0 - 1] is the start address */
494 EMIT_JMP(addrs[pc_ret0 - 1] - addrs[i]);
488 break; 495 break;
489 } 496 }
490 CLEAR_A(); 497 CLEAR_A();
@@ -568,8 +575,8 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
568 break; 575 break;
569 } 576 }
570 if (filter[i].jt != 0) { 577 if (filter[i].jt != 0) {
571 if (filter[i].jf) 578 if (filter[i].jf && f_offset)
572 t_offset += is_near(f_offset) ? 2 : 6; 579 t_offset += is_near(f_offset) ? 2 : 5;
573 EMIT_COND_JMP(t_op, t_offset); 580 EMIT_COND_JMP(t_op, t_offset);
574 if (filter[i].jf) 581 if (filter[i].jf)
575 EMIT_JMP(f_offset); 582 EMIT_JMP(f_offset);
@@ -599,13 +606,14 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
599 * use it to give the cleanup instruction(s) addr 606 * use it to give the cleanup instruction(s) addr
600 */ 607 */
601 cleanup_addr = proglen - 1; /* ret */ 608 cleanup_addr = proglen - 1; /* ret */
602 if (seen) 609 if (seen_or_pass0)
603 cleanup_addr -= 1; /* leaveq */ 610 cleanup_addr -= 1; /* leaveq */
604 if (seen & SEEN_XREG) 611 if (seen_or_pass0 & SEEN_XREG)
605 cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ 612 cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
606 613
607 if (image) { 614 if (image) {
608 WARN_ON(proglen != oldproglen); 615 if (proglen != oldproglen)
 616 pr_err("bpf_jit_compile proglen=%u != oldproglen=%u\n", proglen, oldproglen);
609 break; 617 break;
610 } 618 }
611 if (proglen == oldproglen) { 619 if (proglen == oldproglen) {
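
The BPF JIT fixes above hinge on how x86 near branches are encoded: the 32-bit displacement is relative to the byte after the branch, so an in-program target has to be computed as target minus the address of the following instruction, and the addrs[] entries record the end of each translated filter instruction (hence the pc_ret0 - 1 indexing). A toy illustration of that displacement arithmetic:

/*
 * Toy illustration of the relative-offset arithmetic the BPF JIT fix above
 * adjusts: a near branch encodes (target - end_of_branch_insn), so the
 * emitter must subtract the address *after* the branch, not before it.
 */
#include <stdint.h>
#include <stdio.h>

static int32_t rel32(uint32_t target, uint32_t insn_start, uint32_t insn_len)
{
        return (int32_t)(target - (insn_start + insn_len));
}

int main(void)
{
        /* A 6-byte jcc rel32 at offset 0x40 targeting code at offset 0x10. */
        printf("displacement = %d\n", rel32(0x10, 0x40, 6));    /* -54 */
        return 0;
}
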
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index a5b64ab4cd6..bff89dfe361 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -11,10 +11,11 @@
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/compat.h>
15#include <linux/uaccess.h>
16
14#include <asm/ptrace.h> 17#include <asm/ptrace.h>
15#include <asm/uaccess.h>
16#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
17#include <linux/compat.h>
18 19
19static int backtrace_stack(void *data, char *name) 20static int backtrace_stack(void *data, char *name)
20{ 21{
@@ -40,13 +41,13 @@ static struct stacktrace_ops backtrace_ops = {
40static struct stack_frame_ia32 * 41static struct stack_frame_ia32 *
41dump_user_backtrace_32(struct stack_frame_ia32 *head) 42dump_user_backtrace_32(struct stack_frame_ia32 *head)
42{ 43{
44 /* Also check accessibility of one struct frame_head beyond: */
43 struct stack_frame_ia32 bufhead[2]; 45 struct stack_frame_ia32 bufhead[2];
44 struct stack_frame_ia32 *fp; 46 struct stack_frame_ia32 *fp;
47 unsigned long bytes;
45 48
46 /* Also check accessibility of one struct frame_head beyond */ 49 bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
47 if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) 50 if (bytes != sizeof(bufhead))
48 return NULL;
49 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
50 return NULL; 51 return NULL;
51 52
52 fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); 53 fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
@@ -87,12 +88,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
87 88
88static struct stack_frame *dump_user_backtrace(struct stack_frame *head) 89static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
89{ 90{
91 /* Also check accessibility of one struct frame_head beyond: */
90 struct stack_frame bufhead[2]; 92 struct stack_frame bufhead[2];
93 unsigned long bytes;
91 94
92 /* Also check accessibility of one struct stack_frame beyond */ 95 bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
93 if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) 96 if (bytes != sizeof(bufhead))
94 return NULL;
95 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
96 return NULL; 97 return NULL;
97 98
98 oprofile_add_trace(bufhead[0].return_address); 99 oprofile_add_trace(bufhead[0].return_address);
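
dump_user_backtrace() above now fetches each user stack frame with copy_from_user_nmi() and treats anything short of a full struct as the end of the walk, replacing the access_ok()/__copy_from_user_inatomic() pair. Below is a stand-alone sketch of that frame-pointer walk over a fake in-memory stack.

/*
 * Stand-alone sketch of the frame-pointer walk performed by
 * dump_user_backtrace() above; a local chain fakes the user stack, and a
 * full-length "copy" is required before a frame is trusted, mirroring the
 * copy_from_user_nmi() return-value check.
 */
#include <stdio.h>
#include <string.h>

struct stack_frame {
        struct stack_frame *next_frame;
        unsigned long return_address;
};

static unsigned long copy_frame(struct stack_frame *dst,
                                const struct stack_frame *src)
{
        memcpy(dst, src, sizeof(*dst));         /* copy_from_user_nmi() stand-in */
        return sizeof(*dst);
}

static void walk(const struct stack_frame *head, unsigned int depth)
{
        struct stack_frame buf;

        while (head && depth--) {
                if (copy_frame(&buf, head) != sizeof(buf))
                        break;                  /* partial copy: stop walking */
                printf("return address: 0x%lx\n", buf.return_address);
                head = buf.next_frame;
        }
}

int main(void)
{
        struct stack_frame f2 = { NULL, 0x400200 };
        struct stack_frame f1 = { &f2, 0x400100 };

        walk(&f1, 8);
        return 0;
}
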
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
index cdfe4c54dec..f148cf65267 100644
--- a/arch/x86/oprofile/init.c
+++ b/arch/x86/oprofile/init.c
@@ -21,6 +21,7 @@ extern int op_nmi_timer_init(struct oprofile_operations *ops);
21extern void op_nmi_exit(void); 21extern void op_nmi_exit(void);
22extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); 22extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
23 23
24static int nmi_timer;
24 25
25int __init oprofile_arch_init(struct oprofile_operations *ops) 26int __init oprofile_arch_init(struct oprofile_operations *ops)
26{ 27{
@@ -31,8 +32,9 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
31#ifdef CONFIG_X86_LOCAL_APIC 32#ifdef CONFIG_X86_LOCAL_APIC
32 ret = op_nmi_init(ops); 33 ret = op_nmi_init(ops);
33#endif 34#endif
35 nmi_timer = (ret != 0);
34#ifdef CONFIG_X86_IO_APIC 36#ifdef CONFIG_X86_IO_APIC
35 if (ret < 0) 37 if (nmi_timer)
36 ret = op_nmi_timer_init(ops); 38 ret = op_nmi_timer_init(ops);
37#endif 39#endif
38 ops->backtrace = x86_backtrace; 40 ops->backtrace = x86_backtrace;
@@ -44,6 +46,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
44void oprofile_arch_exit(void) 46void oprofile_arch_exit(void)
45{ 47{
46#ifdef CONFIG_X86_LOCAL_APIC 48#ifdef CONFIG_X86_LOCAL_APIC
47 op_nmi_exit(); 49 if (!nmi_timer)
50 op_nmi_exit();
48#endif 51#endif
49} 52}
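Note on the oprofile init hunk: the new nmi_timer flag records whether the NMI backend failed and the timer fallback was used, so oprofile_arch_exit() only calls op_nmi_exit() for a backend that was actually initialized. A small stand-alone model of that init/exit pairing (the backend functions are dummies):

#include <stdio.h>

static int nmi_timer;                       /* nonzero when the timer fallback was chosen */

static int nmi_backend_init(void)   { return -1; }  /* pretend the NMI backend is unavailable */
static int timer_backend_init(void) { return 0; }
static void nmi_backend_exit(void)  { puts("nmi backend torn down"); }

static int arch_init(void)
{
    int ret = nmi_backend_init();

    nmi_timer = (ret != 0);                 /* remember which backend we ended up with */
    if (nmi_timer)
        ret = timer_backend_init();
    return ret;
}

static void arch_exit(void)
{
    if (!nmi_timer)                         /* only undo what arch_init() actually set up */
        nmi_backend_exit();
}

int main(void)
{
    if (arch_init() == 0)
        puts("init ok (timer fallback)");
    arch_exit();                            /* prints nothing: the NMI backend was never set up */
    return 0;
}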
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f7634..d24d3da7292 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,8 +18,9 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
18obj-$(CONFIG_X86_MRST) += mrst.o 18obj-$(CONFIG_X86_MRST) += mrst.o
19 19
20obj-y += common.o early.o 20obj-y += common.o early.o
21obj-y += amd_bus.o bus_numa.o 21obj-y += bus_numa.o
22 22
23obj-$(CONFIG_AMD_NB) += amd_bus.o
23obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o 24obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
24 25
25ifeq ($(CONFIG_PCI_DEBUG),y) 26ifeq ($(CONFIG_PCI_DEBUG),y)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 68c3c139520..f8348ab1032 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -43,6 +43,17 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
43 DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), 43 DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"),
44 }, 44 },
45 }, 45 },
46 /* https://bugzilla.kernel.org/show_bug.cgi?id=30552 */
47 /* 2006 AMD HT/VIA system with two host bridges */
48 {
49 .callback = set_use_crs,
50 .ident = "ASUS M2V-MX SE",
51 .matches = {
52 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
53 DMI_MATCH(DMI_BOARD_NAME, "M2V-MX SE"),
54 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
55 },
56 },
46 {} 57 {}
47}; 58};
48 59
@@ -138,7 +149,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
138 struct acpi_resource_address64 addr; 149 struct acpi_resource_address64 addr;
139 acpi_status status; 150 acpi_status status;
140 unsigned long flags; 151 unsigned long flags;
141 u64 start, end; 152 u64 start, orig_end, end;
142 153
143 status = resource_to_addr(acpi_res, &addr); 154 status = resource_to_addr(acpi_res, &addr);
144 if (!ACPI_SUCCESS(status)) 155 if (!ACPI_SUCCESS(status))
@@ -154,7 +165,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
154 return AE_OK; 165 return AE_OK;
155 166
156 start = addr.minimum + addr.translation_offset; 167 start = addr.minimum + addr.translation_offset;
157 end = addr.maximum + addr.translation_offset; 168 orig_end = end = addr.maximum + addr.translation_offset;
169
170 /* Exclude non-addressable range or non-addressable portion of range */
171 end = min(end, (u64)iomem_resource.end);
172 if (end <= start) {
173 dev_info(&info->bridge->dev,
174 "host bridge window [%#llx-%#llx] "
175 "(ignored, not CPU addressable)\n", start, orig_end);
176 return AE_OK;
177 } else if (orig_end != end) {
178 dev_info(&info->bridge->dev,
179 "host bridge window [%#llx-%#llx] "
180 "([%#llx-%#llx] ignored, not CPU addressable)\n",
181 start, orig_end, end + 1, orig_end);
182 }
158 183
159 res = &info->res[info->res_num]; 184 res = &info->res[info->res_num];
160 res->name = info->name; 185 res->name = info->name;
@@ -246,10 +271,9 @@ static void add_resources(struct pci_root_info *info)
246 271
247 conflict = insert_resource_conflict(root, res); 272 conflict = insert_resource_conflict(root, res);
248 if (conflict) 273 if (conflict)
249 dev_err(&info->bridge->dev, 274 dev_info(&info->bridge->dev,
250 "address space collision: host bridge window %pR " 275 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
251 "conflicts with %s %pR\n", 276 res, conflict->name, conflict);
252 res, conflict->name, conflict);
253 else 277 else
254 pci_bus_add_resource(info->bus, res, 0); 278 pci_bus_add_resource(info->bus, res, 0);
255 } 279 }
@@ -361,6 +385,20 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
361 } 385 }
362 } 386 }
363 387
388 /* After the PCI-E bus has been walked and all devices discovered,
389 * configure any settings of the fabric that might be necessary.
390 */
391 if (bus) {
392 struct pci_bus *child;
393 list_for_each_entry(child, &bus->children, node) {
394 struct pci_dev *self = child->self;
395 if (!self)
396 continue;
397
398 pcie_bus_configure_settings(child, self->pcie_mpss);
399 }
400 }
401
364 if (!bus) 402 if (!bus)
365 kfree(sd); 403 kfree(sd);
366 404
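Note on the acpi.c hunk: setup_resource() now clamps each host bridge window to the CPU-addressable limit (iomem_resource.end), dropping windows that end up empty and logging the trimmed tail otherwise, and add_resources() downgrades a conflict from dev_err to dev_info since the window is simply ignored. A user-space sketch of the clamping rule (iomem_end stands in for iomem_resource.end):

#include <stdint.h>
#include <stdio.h>

/* Clamp the window [start, end] to the CPU-addressable limit; return 0 if the
 * whole window must be ignored. */
static int clamp_window(uint64_t start, uint64_t *end, uint64_t iomem_end)
{
    uint64_t orig_end = *end;

    if (*end > iomem_end)
        *end = iomem_end;

    if (*end <= start) {
        printf("window [%#llx-%#llx] ignored, not CPU addressable\n",
               (unsigned long long)start, (unsigned long long)orig_end);
        return 0;
    }
    if (orig_end != *end)
        printf("window tail [%#llx-%#llx] ignored, not CPU addressable\n",
               (unsigned long long)(*end + 1), (unsigned long long)orig_end);
    return 1;
}

int main(void)
{
    uint64_t end = 0x11000000000ULL;                 /* reaches above the limit */

    if (clamp_window(0xc0000000ULL, &end, 0xfffffffffULL))
        printf("using window up to %#llx\n", (unsigned long long)end);
    return 0;
}

As in the hunk, the ignored tail is reported as [end + 1, orig_end].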
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 026e4931d16..385a940b542 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
31}; 31};
32 32
33static u64 __initdata fam10h_mmconf_start;
34static u64 __initdata fam10h_mmconf_end;
35static void __init get_pci_mmcfg_amd_fam10h_range(void)
36{
37 u32 address;
38 u64 base, msr;
39 unsigned segn_busn_bits;
40
41 /* assume all cpus from fam10h have mmconf */
42 if (boot_cpu_data.x86 < 0x10)
43 return;
44
45 address = MSR_FAM10H_MMIO_CONF_BASE;
46 rdmsrl(address, msr);
47
48 /* mmconfig is not enable */
49 if (!(msr & FAM10H_MMIO_CONF_ENABLE))
50 return;
51
52 base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
53
54 segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
55 FAM10H_MMIO_CONF_BUSRANGE_MASK;
56
57 fam10h_mmconf_start = base;
58 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
59}
60
61#define RANGE_NUM 16 33#define RANGE_NUM 16
62 34
63/** 35/**
@@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void)
85 u64 val; 57 u64 val;
86 u32 address; 58 u32 address;
87 bool found; 59 bool found;
60 struct resource fam10h_mmconf_res, *fam10h_mmconf;
61 u64 fam10h_mmconf_start;
62 u64 fam10h_mmconf_end;
88 63
89 if (!early_pci_allowed()) 64 if (!early_pci_allowed())
90 return -1; 65 return -1;
@@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void)
211 subtract_range(range, RANGE_NUM, 0, end); 186 subtract_range(range, RANGE_NUM, 0, end);
212 187
213 /* get mmconfig */ 188 /* get mmconfig */
214 get_pci_mmcfg_amd_fam10h_range(); 189 fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
215 /* need to take out mmconf range */ 190 /* need to take out mmconf range */
216 if (fam10h_mmconf_end) { 191 if (fam10h_mmconf) {
217 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); 192 printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
193 fam10h_mmconf_start = fam10h_mmconf->start;
194 fam10h_mmconf_end = fam10h_mmconf->end;
218 subtract_range(range, RANGE_NUM, fam10h_mmconf_start, 195 subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
219 fam10h_mmconf_end + 1); 196 fam10h_mmconf_end + 1);
197 } else {
198 fam10h_mmconf_start = 0;
199 fam10h_mmconf_end = 0;
220 } 200 }
221 201
222 /* mmio resource */ 202 /* mmio resource */
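Note on the amd_bus.c hunk: early_fill_mp_bus_info() no longer parses the Fam10h MMCONF MSR itself; it asks amd_get_mmconfig_range() for a struct resource and, when one is returned, removes [start, end + 1) from the candidate MMIO ranges via subtract_range(). The sketch below is not the kernel helper (it handles a single span rather than an array of ranges); it only illustrates the interval subtraction behind the end + 1 argument:

#include <stdint.h>
#include <stdio.h>

struct span { uint64_t start, end; };       /* half-open [start, end) */

/* Remove [cut.start, cut.end) from s, writing what remains into out[].
 * Returns how many spans are left (0, 1 or 2). */
static int span_subtract(struct span s, struct span cut, struct span out[2])
{
    int n = 0;

    if (cut.end <= s.start || cut.start >= s.end) {
        out[n++] = s;                       /* no overlap, span unchanged */
        return n;
    }
    if (cut.start > s.start)
        out[n++] = (struct span){ s.start, cut.start };   /* piece below the cut */
    if (cut.end < s.end)
        out[n++] = (struct span){ cut.end, s.end };       /* piece above the cut */
    return n;
}

int main(void)
{
    struct span mmio = { 0x80000000ULL, 0x100000000ULL };
    struct span mmconf = { 0xe0000000ULL, 0xf0000000ULL + 1 };  /* mirrors the end + 1 argument */
    struct span out[2];
    int i, n = span_subtract(mmio, mmconf, out);

    for (i = 0; i < n; i++)
        printf("mmio piece: [%#llx-%#llx)\n",
               (unsigned long long)out[i].start, (unsigned long long)out[i].end);
    return 0;
}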
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 67858be4b52..99176094500 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -257,6 +257,7 @@ static int ce4100_conf_read(unsigned int seg, unsigned int bus,
257{ 257{
258 int i; 258 int i;
259 259
260 WARN_ON(seg);
260 if (bus == 1) { 261 if (bus == 1) {
261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { 262 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
262 if (bus1_fixups[i].dev_func == devfn && 263 if (bus1_fixups[i].dev_func == devfn &&
@@ -282,6 +283,7 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus,
282{ 283{
283 int i; 284 int i;
284 285
286 WARN_ON(seg);
285 if (bus == 1) { 287 if (bus == 1) {
286 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { 288 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
287 if (bus1_fixups[i].dev_func == devfn && 289 if (bus1_fixups[i].dev_func == devfn &&
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 5fe75026ecc..92df322e0b5 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -247,13 +247,6 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
247 }, 247 },
248#endif /* __i386__ */ 248#endif /* __i386__ */
249 { 249 {
250 .callback = find_sort_method,
251 .ident = "Dell System",
252 .matches = {
253 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
254 },
255 },
256 {
257 .callback = set_bf_sort, 250 .callback = set_bf_sort,
258 .ident = "Dell PowerEdge 1950", 251 .ident = "Dell PowerEdge 1950",
259 .matches = { 252 .matches = {
@@ -294,6 +287,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
294 }, 287 },
295 }, 288 },
296 { 289 {
290 .callback = find_sort_method,
291 .ident = "Dell System",
292 .matches = {
293 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
294 },
295 },
296 {
297 .callback = set_bf_sort, 297 .callback = set_bf_sort,
298 .ident = "HP ProLiant BL20p G3", 298 .ident = "HP ProLiant BL20p G3",
299 .matches = { 299 .matches = {
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index e6fd8473fb7..4f2c70439d7 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -22,7 +22,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
22{ 22{
23 unsigned long flags; 23 unsigned long flags;
24 24
25 if ((bus > 255) || (devfn > 255) || (reg > 4095)) { 25 if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) {
26 *value = -1; 26 *value = -1;
27 return -EINVAL; 27 return -EINVAL;
28 } 28 }
@@ -53,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
53{ 53{
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if ((bus > 255) || (devfn > 255) || (reg > 4095)) 56 if (seg || (bus > 255) || (devfn > 255) || (reg > 4095))
57 return -EINVAL; 57 return -EINVAL;
58 58
59 raw_spin_lock_irqsave(&pci_config_lock, flags); 59 raw_spin_lock_irqsave(&pci_config_lock, flags);
@@ -97,6 +97,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
97 unsigned long flags; 97 unsigned long flags;
98 int dev, fn; 98 int dev, fn;
99 99
100 WARN_ON(seg);
100 if ((bus > 255) || (devfn > 255) || (reg > 255)) { 101 if ((bus > 255) || (devfn > 255) || (reg > 255)) {
101 *value = -1; 102 *value = -1;
102 return -EINVAL; 103 return -EINVAL;
@@ -138,6 +139,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
138 unsigned long flags; 139 unsigned long flags;
139 int dev, fn; 140 int dev, fn;
140 141
142 WARN_ON(seg);
141 if ((bus > 255) || (devfn > 255) || (reg > 255)) 143 if ((bus > 255) || (devfn > 255) || (reg > 255))
142 return -EINVAL; 144 return -EINVAL;
143 145
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 750c346ef50..301e325992f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -519,7 +519,8 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
519 if (cfg->address < 0xFFFFFFFF) 519 if (cfg->address < 0xFFFFFFFF)
520 return 0; 520 return 0;
521 521
522 if (!strcmp(mcfg->header.oem_id, "SGI")) 522 if (!strcmp(mcfg->header.oem_id, "SGI") ||
523 !strcmp(mcfg->header.oem_id, "SGI2"))
523 return 0; 524 return 0;
524 525
525 if (mcfg->header.revision >= 1) { 526 if (mcfg->header.revision >= 1) {
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 5c9e2458df4..512a88c4150 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -34,6 +34,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
34 unsigned long flags; 34 unsigned long flags;
35 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); 35 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
36 36
37 WARN_ON(seg);
37 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 38 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
38 return -EINVAL; 39 return -EINVAL;
39 40
@@ -73,6 +74,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
73 unsigned long flags; 74 unsigned long flags;
74 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); 75 void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
75 76
77 WARN_ON(seg);
76 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 78 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
77 return -EINVAL; 79 return -EINVAL;
78 80
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index 13700ec8e2e..5262603b04d 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -206,6 +206,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus,
206{ 206{
207 uint32_t *addr; 207 uint32_t *addr;
208 208
209 WARN_ON(seg);
210
209 /* Use the hardware mechanism for non-simulated devices */ 211 /* Use the hardware mechanism for non-simulated devices */
210 if (!is_simulated(bus, devfn)) 212 if (!is_simulated(bus, devfn))
211 return pci_direct_conf1.read(seg, bus, devfn, reg, len, value); 213 return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
@@ -264,6 +266,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus,
264static int pci_olpc_write(unsigned int seg, unsigned int bus, 266static int pci_olpc_write(unsigned int seg, unsigned int bus,
265 unsigned int devfn, int reg, int len, uint32_t value) 267 unsigned int devfn, int reg, int len, uint32_t value)
266{ 268{
269 WARN_ON(seg);
270
267 /* Use the hardware mechanism for non-simulated devices */ 271 /* Use the hardware mechanism for non-simulated devices */
268 if (!is_simulated(bus, devfn)) 272 if (!is_simulated(bus, devfn))
269 return pci_direct_conf1.write(seg, bus, devfn, reg, len, value); 273 return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index a5f7d0d63de..f6855355146 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -181,6 +181,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
181 unsigned long flags; 181 unsigned long flags;
182 unsigned long bx = (bus << 8) | devfn; 182 unsigned long bx = (bus << 8) | devfn;
183 183
184 WARN_ON(seg);
184 if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) 185 if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
185 return -EINVAL; 186 return -EINVAL;
186 187
@@ -247,6 +248,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
247 unsigned long flags; 248 unsigned long flags;
248 unsigned long bx = (bus << 8) | devfn; 249 unsigned long bx = (bus << 8) | devfn;
249 250
251 WARN_ON(seg);
250 if ((bus > 255) || (devfn > 255) || (reg > 255)) 252 if ((bus > 255) || (devfn > 255) || (reg > 255))
251 return -EINVAL; 253 return -EINVAL;
252 254
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 03008f72eb0..6f2f8eeed17 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -24,7 +24,7 @@ static void pci_visws_disable_irq(struct pci_dev *dev) { }
24 24
25unsigned int pci_bus0, pci_bus1; 25unsigned int pci_bus0, pci_bus1;
26 26
27static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin) 27static int __init visws_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
28{ 28{
29 int irq, bus = dev->bus->number; 29 int irq, bus = dev->bus->number;
30 30
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index f567965c062..1017c7bee38 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -1,8 +1,13 @@
1/* 1/*
2 * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux 2 * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and
3 * x86 PCI core to support the Xen PCI Frontend 3 * initial domain support. We also handle the DSDT _PRT callbacks for GSI's
4 * used in HVM and initial domain mode (PV does not parse ACPI, so it has no
5 * concept of GSIs). Under PV we hook under the pcibios API for IRQs and
6 * 0xcf8 PCI configuration read/write.
4 * 7 *
5 * Author: Ryan Wilson <hap9@epoch.ncsc.mil> 8 * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
9 * Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
10 * Stefano Stabellini <stefano.stabellini@eu.citrix.com>
6 */ 11 */
7#include <linux/module.h> 12#include <linux/module.h>
8#include <linux/init.h> 13#include <linux/init.h>
@@ -19,22 +24,53 @@
19#include <xen/events.h> 24#include <xen/events.h>
20#include <asm/xen/pci.h> 25#include <asm/xen/pci.h>
21 26
27static int xen_pcifront_enable_irq(struct pci_dev *dev)
28{
29 int rc;
30 int share = 1;
31 int pirq;
32 u8 gsi;
33
34 rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
35 if (rc < 0) {
36 dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
37 rc);
38 return rc;
39 }
40 /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
41 pirq = gsi;
42
43 if (gsi < NR_IRQS_LEGACY)
44 share = 0;
45
46 rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
47 if (rc < 0) {
48 dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
49 gsi, pirq, rc);
50 return rc;
51 }
52
53 dev->irq = rc;
54 dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
55 return 0;
56}
57
22#ifdef CONFIG_ACPI 58#ifdef CONFIG_ACPI
23static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, 59static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
24 int trigger, int polarity) 60 bool set_pirq)
25{ 61{
26 int rc, irq; 62 int rc, pirq = -1, irq = -1;
27 struct physdev_map_pirq map_irq; 63 struct physdev_map_pirq map_irq;
28 int shareable = 0; 64 int shareable = 0;
29 char *name; 65 char *name;
30 66
31 if (!xen_hvm_domain()) 67 if (set_pirq)
32 return -1; 68 pirq = gsi;
33 69
34 map_irq.domid = DOMID_SELF; 70 map_irq.domid = DOMID_SELF;
35 map_irq.type = MAP_PIRQ_TYPE_GSI; 71 map_irq.type = MAP_PIRQ_TYPE_GSI;
36 map_irq.index = gsi; 72 map_irq.index = gsi;
37 map_irq.pirq = -1; 73 map_irq.pirq = pirq;
38 74
39 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); 75 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
40 if (rc) { 76 if (rc) {
@@ -42,7 +78,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
42 return -1; 78 return -1;
43 } 79 }
44 80
45 if (trigger == ACPI_EDGE_SENSITIVE) { 81 if (triggering == ACPI_EDGE_SENSITIVE) {
46 shareable = 0; 82 shareable = 0;
47 name = "ioapic-edge"; 83 name = "ioapic-edge";
48 } else { 84 } else {
@@ -50,12 +86,63 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
50 name = "ioapic-level"; 86 name = "ioapic-level";
51 } 87 }
52 88
89 if (gsi_override >= 0)
90 gsi = gsi_override;
91
53 irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); 92 irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
93 if (irq < 0)
94 goto out;
95
96 printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi);
97out:
98 return irq;
99}
100
101static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
102 int trigger, int polarity)
103{
104 if (!xen_hvm_domain())
105 return -1;
54 106
55 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); 107 return xen_register_pirq(gsi, -1 /* no GSI override */, trigger,
108 false /* no mapping of GSI to PIRQ */);
109}
110
111#ifdef CONFIG_XEN_DOM0
112static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
113{
114 int rc, irq;
115 struct physdev_setup_gsi setup_gsi;
116
117 if (!xen_pv_domain())
118 return -1;
119
120 printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
121 gsi, triggering, polarity);
122
123 irq = xen_register_pirq(gsi, gsi_override, triggering, true);
124
125 setup_gsi.gsi = gsi;
126 setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
127 setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
128
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
130 if (rc == -EEXIST)
131 printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
132 else if (rc) {
133 printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
134 gsi, rc);
135 }
56 136
57 return irq; 137 return irq;
58} 138}
139
140static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
141 int trigger, int polarity)
142{
143 return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
144}
145#endif
59#endif 146#endif
60 147
61#if defined(CONFIG_PCI_MSI) 148#if defined(CONFIG_PCI_MSI)
@@ -65,6 +152,43 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
65struct xen_pci_frontend_ops *xen_pci_frontend; 152struct xen_pci_frontend_ops *xen_pci_frontend;
66EXPORT_SYMBOL_GPL(xen_pci_frontend); 153EXPORT_SYMBOL_GPL(xen_pci_frontend);
67 154
155static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
156{
157 int irq, ret, i;
158 struct msi_desc *msidesc;
159 int *v;
160
161 v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
162 if (!v)
163 return -ENOMEM;
164
165 if (type == PCI_CAP_ID_MSIX)
166 ret = xen_pci_frontend_enable_msix(dev, v, nvec);
167 else
168 ret = xen_pci_frontend_enable_msi(dev, v);
169 if (ret)
170 goto error;
171 i = 0;
172 list_for_each_entry(msidesc, &dev->msi_list, list) {
173 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
174 (type == PCI_CAP_ID_MSIX) ?
175 "pcifront-msi-x" :
176 "pcifront-msi",
177 DOMID_SELF);
178 if (irq < 0)
179 goto free;
180 i++;
181 }
182 kfree(v);
183 return 0;
184
185error:
186 dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
187free:
188 kfree(v);
189 return ret;
190}
191
68#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ 192#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
69 MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) 193 MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
70 194
@@ -123,67 +247,6 @@ error:
123 return -ENODEV; 247 return -ENODEV;
124} 248}
125 249
126/*
127 * For MSI interrupts we have to use drivers/xen/event.s functions to
128 * allocate an irq_desc and setup the right */
129
130
131static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
132{
133 int irq, ret, i;
134 struct msi_desc *msidesc;
135 int *v;
136
137 v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
138 if (!v)
139 return -ENOMEM;
140
141 if (type == PCI_CAP_ID_MSIX)
142 ret = xen_pci_frontend_enable_msix(dev, v, nvec);
143 else
144 ret = xen_pci_frontend_enable_msi(dev, v);
145 if (ret)
146 goto error;
147 i = 0;
148 list_for_each_entry(msidesc, &dev->msi_list, list) {
149 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
150 (type == PCI_CAP_ID_MSIX) ?
151 "pcifront-msi-x" :
152 "pcifront-msi",
153 DOMID_SELF);
154 if (irq < 0)
155 goto free;
156 i++;
157 }
158 kfree(v);
159 return 0;
160
161error:
162 dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
163free:
164 kfree(v);
165 return ret;
166}
167
168static void xen_teardown_msi_irqs(struct pci_dev *dev)
169{
170 struct msi_desc *msidesc;
171
172 msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
173 if (msidesc->msi_attrib.is_msix)
174 xen_pci_frontend_disable_msix(dev);
175 else
176 xen_pci_frontend_disable_msi(dev);
177
178 /* Free the IRQ's and the msidesc using the generic code. */
179 default_teardown_msi_irqs(dev);
180}
181
182static void xen_teardown_msi_irq(unsigned int irq)
183{
184 xen_destroy_irq(irq);
185}
186
187#ifdef CONFIG_XEN_DOM0 250#ifdef CONFIG_XEN_DOM0
188static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 251static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
189{ 252{
@@ -242,45 +305,28 @@ out:
242 return ret; 305 return ret;
243} 306}
244#endif 307#endif
245#endif
246 308
247static int xen_pcifront_enable_irq(struct pci_dev *dev) 309static void xen_teardown_msi_irqs(struct pci_dev *dev)
248{ 310{
249 int rc; 311 struct msi_desc *msidesc;
250 int share = 1;
251 int pirq;
252 u8 gsi;
253
254 rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
255 if (rc < 0) {
256 dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
257 rc);
258 return rc;
259 }
260
261 rc = xen_allocate_pirq_gsi(gsi);
262 if (rc < 0) {
263 dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
264 gsi, rc);
265 return rc;
266 }
267 pirq = rc;
268 312
269 if (gsi < NR_IRQS_LEGACY) 313 msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
270 share = 0; 314 if (msidesc->msi_attrib.is_msix)
315 xen_pci_frontend_disable_msix(dev);
316 else
317 xen_pci_frontend_disable_msi(dev);
271 318
272 rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); 319 /* Free the IRQ's and the msidesc using the generic code. */
273 if (rc < 0) { 320 default_teardown_msi_irqs(dev);
274 dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", 321}
275 gsi, pirq, rc);
276 return rc;
277 }
278 322
279 dev->irq = rc; 323static void xen_teardown_msi_irq(unsigned int irq)
280 dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); 324{
281 return 0; 325 xen_destroy_irq(irq);
282} 326}
283 327
328#endif
329
284int __init pci_xen_init(void) 330int __init pci_xen_init(void)
285{ 331{
286 if (!xen_pv_domain() || xen_initial_domain()) 332 if (!xen_pv_domain() || xen_initial_domain())
@@ -327,79 +373,6 @@ int __init pci_xen_hvm_init(void)
327} 373}
328 374
329#ifdef CONFIG_XEN_DOM0 375#ifdef CONFIG_XEN_DOM0
330static int xen_register_pirq(u32 gsi, int gsi_override, int triggering)
331{
332 int rc, pirq, irq = -1;
333 struct physdev_map_pirq map_irq;
334 int shareable = 0;
335 char *name;
336
337 if (!xen_pv_domain())
338 return -1;
339
340 if (triggering == ACPI_EDGE_SENSITIVE) {
341 shareable = 0;
342 name = "ioapic-edge";
343 } else {
344 shareable = 1;
345 name = "ioapic-level";
346 }
347 pirq = xen_allocate_pirq_gsi(gsi);
348 if (pirq < 0)
349 goto out;
350
351 if (gsi_override >= 0)
352 irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name);
353 else
354 irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
355 if (irq < 0)
356 goto out;
357
358 printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi);
359
360 map_irq.domid = DOMID_SELF;
361 map_irq.type = MAP_PIRQ_TYPE_GSI;
362 map_irq.index = gsi;
363 map_irq.pirq = pirq;
364
365 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
366 if (rc) {
367 printk(KERN_WARNING "xen map irq failed %d\n", rc);
368 return -1;
369 }
370
371out:
372 return irq;
373}
374
375static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
376{
377 int rc, irq;
378 struct physdev_setup_gsi setup_gsi;
379
380 if (!xen_pv_domain())
381 return -1;
382
383 printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
384 gsi, triggering, polarity);
385
386 irq = xen_register_pirq(gsi, gsi_override, triggering);
387
388 setup_gsi.gsi = gsi;
389 setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
390 setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
391
392 rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
393 if (rc == -EEXIST)
394 printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
395 else if (rc) {
396 printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
397 gsi, rc);
398 }
399
400 return irq;
401}
402
403static __init void xen_setup_acpi_sci(void) 376static __init void xen_setup_acpi_sci(void)
404{ 377{
405 int rc; 378 int rc;
@@ -419,7 +392,7 @@ static __init void xen_setup_acpi_sci(void)
419 } 392 }
420 trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; 393 trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
421 polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; 394 polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
422 395
423 printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d " 396 printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
424 "polarity=%d\n", gsi, trigger, polarity); 397 "polarity=%d\n", gsi, trigger, polarity);
425 398
@@ -434,10 +407,9 @@ static __init void xen_setup_acpi_sci(void)
434 * the ACPI interpreter and keels over since IRQ 9 has not been 407 * the ACPI interpreter and keels over since IRQ 9 has not been
435 * setup as we had setup IRQ 20 for it). 408 * setup as we had setup IRQ 20 for it).
436 */ 409 */
437 /* Check whether the GSI != IRQ */
438 if (acpi_gsi_to_irq(gsi, &irq) == 0) { 410 if (acpi_gsi_to_irq(gsi, &irq) == 0) {
439 if (irq >= 0 && irq != gsi) 411 /* Use the provided value if it's valid. */
440 /* Bugger, we MUST have that IRQ. */ 412 if (irq >= 0)
441 gsi_override = irq; 413 gsi_override = irq;
442 } 414 }
443 415
@@ -447,41 +419,16 @@ static __init void xen_setup_acpi_sci(void)
447 return; 419 return;
448} 420}
449 421
450static int acpi_register_gsi_xen(struct device *dev, u32 gsi, 422int __init pci_xen_initial_domain(void)
451 int trigger, int polarity)
452{ 423{
453 return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); 424 int irq;
454}
455 425
456static int __init pci_xen_initial_domain(void)
457{
458#ifdef CONFIG_PCI_MSI 426#ifdef CONFIG_PCI_MSI
459 x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; 427 x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
460 x86_msi.teardown_msi_irq = xen_teardown_msi_irq; 428 x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
461#endif 429#endif
462 xen_setup_acpi_sci(); 430 xen_setup_acpi_sci();
463 __acpi_register_gsi = acpi_register_gsi_xen; 431 __acpi_register_gsi = acpi_register_gsi_xen;
464
465 return 0;
466}
467
468void __init xen_setup_pirqs(void)
469{
470 int pirq, irq;
471
472 pci_xen_initial_domain();
473
474 if (0 == nr_ioapics) {
475 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
476 pirq = xen_allocate_pirq_gsi(irq);
477 if (WARN(pirq < 0,
478 "Could not allocate PIRQ for legacy interrupt\n"))
479 break;
480 irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
481 }
482 return;
483 }
484
485 /* Pre-allocate legacy irqs */ 432 /* Pre-allocate legacy irqs */
486 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { 433 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
487 int trigger, polarity; 434 int trigger, polarity;
@@ -490,12 +437,16 @@ void __init xen_setup_pirqs(void)
490 continue; 437 continue;
491 438
492 xen_register_pirq(irq, -1 /* no GSI override */, 439 xen_register_pirq(irq, -1 /* no GSI override */,
493 trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); 440 trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
441 true /* Map GSI to PIRQ */);
494 } 442 }
443 if (0 == nr_ioapics) {
444 for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
445 xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
446 }
447 return 0;
495} 448}
496#endif
497 449
498#ifdef CONFIG_XEN_DOM0
499struct xen_device_domain_owner { 450struct xen_device_domain_owner {
500 domid_t domain; 451 domid_t domain;
501 struct pci_dev *dev; 452 struct pci_dev *dev;
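Note on the xen.c rework above: the HVM and Dom0 paths are folded into one xen_register_pirq() helper. The PHYSDEVOP_map_pirq hypercall runs first (optionally pre-seeding pirq = gsi when set_pirq is true), the trigger mode picks the "ioapic-edge"/"ioapic-level" binding, and an optional gsi_override is applied before xen_bind_pirq_gsi_to_irq(); pcifront likewise reuses the PIRQ the backend placed in the interrupt line instead of allocating one. A user-space sketch of that decision flow (constants and the stubbed calls are illustrative only):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative trigger values; the real ones come from the ACPI headers. */
enum { EDGE_SENSITIVE = 1, LEVEL_SENSITIVE = 3 };

/* Decision flow of the shared helper: pick sharing and the binding name from
 * the trigger mode, optionally pre-seed the PIRQ with the GSI, and honour a
 * GSI override before binding.  The hypercall and the bind are stubbed out. */
static int register_pirq(unsigned int gsi, int gsi_override, int triggering, bool set_pirq)
{
    int pirq = set_pirq ? (int)gsi : -1;    /* -1 lets the hypervisor choose */
    int shareable = (triggering == EDGE_SENSITIVE) ? 0 : 1;
    const char *name = (triggering == EDGE_SENSITIVE) ? "ioapic-edge" : "ioapic-level";

    /* the map_pirq hypercall would run here and fill in the final pirq value */
    if (gsi_override >= 0)
        gsi = (unsigned int)gsi_override;

    printf("bind gsi=%u pirq=%d shareable=%d as %s\n", gsi, pirq, shareable, name);
    return 0;
}

int main(void)
{
    register_pirq(9, -1, LEVEL_SENSITIVE, true);    /* Dom0 caller: map GSI to PIRQ */
    register_pirq(16, -1, EDGE_SENSITIVE, false);   /* HVM caller: no pre-seeded PIRQ */
    return 0;
}

The two calls in main() mirror the Dom0 (set_pirq = true) and HVM (set_pirq = false) callers in the hunk.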
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 899e393d8e7..3ae4128013e 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -51,7 +51,17 @@
51int efi_enabled; 51int efi_enabled;
52EXPORT_SYMBOL(efi_enabled); 52EXPORT_SYMBOL(efi_enabled);
53 53
54struct efi efi; 54struct efi __read_mostly efi = {
55 .mps = EFI_INVALID_TABLE_ADDR,
56 .acpi = EFI_INVALID_TABLE_ADDR,
57 .acpi20 = EFI_INVALID_TABLE_ADDR,
58 .smbios = EFI_INVALID_TABLE_ADDR,
59 .sal_systab = EFI_INVALID_TABLE_ADDR,
60 .boot_info = EFI_INVALID_TABLE_ADDR,
61 .hcdp = EFI_INVALID_TABLE_ADDR,
62 .uga = EFI_INVALID_TABLE_ADDR,
63 .uv_systab = EFI_INVALID_TABLE_ADDR,
64};
55EXPORT_SYMBOL(efi); 65EXPORT_SYMBOL(efi);
56 66
57struct efi_memory_map memmap; 67struct efi_memory_map memmap;
@@ -79,26 +89,50 @@ early_param("add_efi_memmap", setup_add_efi_memmap);
79 89
80static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) 90static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
81{ 91{
82 return efi_call_virt2(get_time, tm, tc); 92 unsigned long flags;
93 efi_status_t status;
94
95 spin_lock_irqsave(&rtc_lock, flags);
96 status = efi_call_virt2(get_time, tm, tc);
97 spin_unlock_irqrestore(&rtc_lock, flags);
98 return status;
83} 99}
84 100
85static efi_status_t virt_efi_set_time(efi_time_t *tm) 101static efi_status_t virt_efi_set_time(efi_time_t *tm)
86{ 102{
87 return efi_call_virt1(set_time, tm); 103 unsigned long flags;
104 efi_status_t status;
105
106 spin_lock_irqsave(&rtc_lock, flags);
107 status = efi_call_virt1(set_time, tm);
108 spin_unlock_irqrestore(&rtc_lock, flags);
109 return status;
88} 110}
89 111
90static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, 112static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
91 efi_bool_t *pending, 113 efi_bool_t *pending,
92 efi_time_t *tm) 114 efi_time_t *tm)
93{ 115{
94 return efi_call_virt3(get_wakeup_time, 116 unsigned long flags;
95 enabled, pending, tm); 117 efi_status_t status;
118
119 spin_lock_irqsave(&rtc_lock, flags);
120 status = efi_call_virt3(get_wakeup_time,
121 enabled, pending, tm);
122 spin_unlock_irqrestore(&rtc_lock, flags);
123 return status;
96} 124}
97 125
98static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) 126static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
99{ 127{
100 return efi_call_virt2(set_wakeup_time, 128 unsigned long flags;
101 enabled, tm); 129 efi_status_t status;
130
131 spin_lock_irqsave(&rtc_lock, flags);
132 status = efi_call_virt2(set_wakeup_time,
133 enabled, tm);
134 spin_unlock_irqrestore(&rtc_lock, flags);
135 return status;
102} 136}
103 137
104static efi_status_t virt_efi_get_variable(efi_char16_t *name, 138static efi_status_t virt_efi_get_variable(efi_char16_t *name,
@@ -122,7 +156,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
122 156
123static efi_status_t virt_efi_set_variable(efi_char16_t *name, 157static efi_status_t virt_efi_set_variable(efi_char16_t *name,
124 efi_guid_t *vendor, 158 efi_guid_t *vendor,
125 unsigned long attr, 159 u32 attr,
126 unsigned long data_size, 160 unsigned long data_size,
127 void *data) 161 void *data)
128{ 162{
@@ -131,6 +165,18 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
131 data_size, data); 165 data_size, data);
132} 166}
133 167
168static efi_status_t virt_efi_query_variable_info(u32 attr,
169 u64 *storage_space,
170 u64 *remaining_space,
171 u64 *max_variable_size)
172{
173 if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
174 return EFI_UNSUPPORTED;
175
176 return efi_call_virt4(query_variable_info, attr, storage_space,
177 remaining_space, max_variable_size);
178}
179
134static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) 180static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
135{ 181{
136 return efi_call_virt1(get_next_high_mono_count, count); 182 return efi_call_virt1(get_next_high_mono_count, count);
@@ -145,6 +191,28 @@ static void virt_efi_reset_system(int reset_type,
145 data_size, data); 191 data_size, data);
146} 192}
147 193
194static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
195 unsigned long count,
196 unsigned long sg_list)
197{
198 if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
199 return EFI_UNSUPPORTED;
200
201 return efi_call_virt3(update_capsule, capsules, count, sg_list);
202}
203
204static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
205 unsigned long count,
206 u64 *max_size,
207 int *reset_type)
208{
209 if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
210 return EFI_UNSUPPORTED;
211
212 return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
213 reset_type);
214}
215
148static efi_status_t __init phys_efi_set_virtual_address_map( 216static efi_status_t __init phys_efi_set_virtual_address_map(
149 unsigned long memory_map_size, 217 unsigned long memory_map_size,
150 unsigned long descriptor_size, 218 unsigned long descriptor_size,
@@ -164,11 +232,14 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
164static efi_status_t __init phys_efi_get_time(efi_time_t *tm, 232static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
165 efi_time_cap_t *tc) 233 efi_time_cap_t *tc)
166{ 234{
235 unsigned long flags;
167 efi_status_t status; 236 efi_status_t status;
168 237
238 spin_lock_irqsave(&rtc_lock, flags);
169 efi_call_phys_prelog(); 239 efi_call_phys_prelog();
170 status = efi_call_phys2(efi_phys.get_time, tm, tc); 240 status = efi_call_phys2(efi_phys.get_time, tm, tc);
171 efi_call_phys_epilog(); 241 efi_call_phys_epilog();
242 spin_unlock_irqrestore(&rtc_lock, flags);
172 return status; 243 return status;
173} 244}
174 245
@@ -669,6 +740,9 @@ void __init efi_enter_virtual_mode(void)
669 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; 740 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
670 efi.reset_system = virt_efi_reset_system; 741 efi.reset_system = virt_efi_reset_system;
671 efi.set_virtual_address_map = NULL; 742 efi.set_virtual_address_map = NULL;
743 efi.query_variable_info = virt_efi_query_variable_info;
744 efi.update_capsule = virt_efi_update_capsule;
745 efi.query_capsule_caps = virt_efi_query_capsule_caps;
672 if (__supported_pte_mask & _PAGE_NX) 746 if (__supported_pte_mask & _PAGE_NX)
673 runtime_code_page_mkexec(); 747 runtime_code_page_mkexec();
674 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); 748 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
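Note on the efi.c hunk: every runtime time service is now serialized against rtc_lock, since the firmware and the legacy CMOS RTC code touch the same hardware, and UEFI 2.00-only services (query_variable_info, update_capsule, query_capsule_caps) are added that return EFI_UNSUPPORTED on older runtime revisions. A user-space sketch of the lock-around-the-firmware-call pattern, using a pthread mutex as a stand-in for rtc_lock and a dummy firmware service:

#include <pthread.h>
#include <stdio.h>

/* Stand-in for rtc_lock: the firmware time services and the legacy CMOS RTC
 * code must not touch the RTC hardware at the same time. */
static pthread_mutex_t rtc_lock = PTHREAD_MUTEX_INITIALIZER;

struct efi_time { int hour, minute, second; };

static int firmware_get_time(struct efi_time *tm)   /* pretend firmware service */
{
    *tm = (struct efi_time){ 12, 34, 56 };
    return 0;
}

/* Pattern used by each wrapper: serialize around the firmware call and hand
 * the status back unchanged. */
static int locked_get_time(struct efi_time *tm)
{
    int status;

    pthread_mutex_lock(&rtc_lock);
    status = firmware_get_time(tm);
    pthread_mutex_unlock(&rtc_lock);
    return status;
}

int main(void)
{
    struct efi_time tm;

    if (locked_get_time(&tm) == 0)
        printf("%02d:%02d:%02d\n", tm.hour, tm.minute, tm.second);
    return 0;
}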
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index f61ccdd4934..1ea38775a6d 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_X86_MRST) += mrst.o 1obj-$(CONFIG_X86_MRST) += mrst.o
2obj-$(CONFIG_X86_MRST) += vrtc.o 2obj-$(CONFIG_X86_MRST) += vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o 3obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
4obj-$(CONFIG_X86_MRST) += pmu.o
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 7000e74b308..fe73276e026 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -678,36 +678,40 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
678 pentry = (struct sfi_device_table_entry *)sb->pentry; 678 pentry = (struct sfi_device_table_entry *)sb->pentry;
679 679
680 for (i = 0; i < num; i++, pentry++) { 680 for (i = 0; i < num; i++, pentry++) {
681 if (pentry->irq != (u8)0xff) { /* native RTE case */ 681 int irq = pentry->irq;
682
683 if (irq != (u8)0xff) { /* native RTE case */
682 /* these SPI2 devices are not exposed to system as PCI 684 /* these SPI2 devices are not exposed to system as PCI
683 * devices, but they have separate RTE entry in IOAPIC 685 * devices, but they have separate RTE entry in IOAPIC
684 * so we have to enable them one by one here 686 * so we have to enable them one by one here
685 */ 687 */
686 ioapic = mp_find_ioapic(pentry->irq); 688 ioapic = mp_find_ioapic(irq);
687 irq_attr.ioapic = ioapic; 689 irq_attr.ioapic = ioapic;
688 irq_attr.ioapic_pin = pentry->irq; 690 irq_attr.ioapic_pin = irq;
689 irq_attr.trigger = 1; 691 irq_attr.trigger = 1;
690 irq_attr.polarity = 1; 692 irq_attr.polarity = 1;
691 io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); 693 io_apic_set_pci_routing(NULL, irq, &irq_attr);
692 } 694 } else
695 irq = 0; /* No irq */
696
693 switch (pentry->type) { 697 switch (pentry->type) {
694 case SFI_DEV_TYPE_IPC: 698 case SFI_DEV_TYPE_IPC:
695 /* ID as IRQ is a hack that will go away */ 699 /* ID as IRQ is a hack that will go away */
696 pdev = platform_device_alloc(pentry->name, pentry->irq); 700 pdev = platform_device_alloc(pentry->name, irq);
697 if (pdev == NULL) { 701 if (pdev == NULL) {
698 pr_err("out of memory for SFI platform device '%s'.\n", 702 pr_err("out of memory for SFI platform device '%s'.\n",
699 pentry->name); 703 pentry->name);
700 continue; 704 continue;
701 } 705 }
702 install_irq_resource(pdev, pentry->irq); 706 install_irq_resource(pdev, irq);
703 pr_debug("info[%2d]: IPC bus, name = %16.16s, " 707 pr_debug("info[%2d]: IPC bus, name = %16.16s, "
704 "irq = 0x%2x\n", i, pentry->name, pentry->irq); 708 "irq = 0x%2x\n", i, pentry->name, irq);
705 sfi_handle_ipc_dev(pdev); 709 sfi_handle_ipc_dev(pdev);
706 break; 710 break;
707 case SFI_DEV_TYPE_SPI: 711 case SFI_DEV_TYPE_SPI:
708 memset(&spi_info, 0, sizeof(spi_info)); 712 memset(&spi_info, 0, sizeof(spi_info));
709 strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); 713 strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
710 spi_info.irq = pentry->irq; 714 spi_info.irq = irq;
711 spi_info.bus_num = pentry->host_num; 715 spi_info.bus_num = pentry->host_num;
712 spi_info.chip_select = pentry->addr; 716 spi_info.chip_select = pentry->addr;
713 spi_info.max_speed_hz = pentry->max_freq; 717 spi_info.max_speed_hz = pentry->max_freq;
@@ -724,7 +728,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
724 memset(&i2c_info, 0, sizeof(i2c_info)); 728 memset(&i2c_info, 0, sizeof(i2c_info));
725 bus = pentry->host_num; 729 bus = pentry->host_num;
726 strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); 730 strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
727 i2c_info.irq = pentry->irq; 731 i2c_info.irq = irq;
728 i2c_info.addr = pentry->addr; 732 i2c_info.addr = pentry->addr;
729 pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, " 733 pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
730 "irq = 0x%2x, addr = 0x%x\n", i, bus, 734 "irq = 0x%2x, addr = 0x%x\n", i, bus,
diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c
new file mode 100644
index 00000000000..9281da7d91b
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.c
@@ -0,0 +1,817 @@
1/*
2 * mrst/pmu.c - driver for MRST Power Management Unit
3 *
4 * Copyright (c) 2011, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20#include <linux/cpuidle.h>
21#include <linux/debugfs.h>
22#include <linux/delay.h>
23#include <linux/interrupt.h>
24#include <linux/module.h>
25#include <linux/pci.h>
26#include <linux/seq_file.h>
27#include <linux/sfi.h>
28#include <asm/intel_scu_ipc.h>
29#include "pmu.h"
30
31#define IPCMSG_FW_REVISION 0xF4
32
33struct mrst_device {
34 u16 pci_dev_num; /* DEBUG only */
35 u16 lss;
36 u16 latest_request;
37 unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */
38};
39
40/*
 41 * complete list of MRST PCI devices
42 */
43static struct mrst_device mrst_devs[] = {
44/* 0 */ { 0x0800, LSS_SPI0 }, /* Moorestown SPI Ctrl 0 */
45/* 1 */ { 0x0801, LSS_SPI1 }, /* Moorestown SPI Ctrl 1 */
46/* 2 */ { 0x0802, LSS_I2C0 }, /* Moorestown I2C 0 */
47/* 3 */ { 0x0803, LSS_I2C1 }, /* Moorestown I2C 1 */
48/* 4 */ { 0x0804, LSS_I2C2 }, /* Moorestown I2C 2 */
49/* 5 */ { 0x0805, LSS_KBD }, /* Moorestown Keyboard Ctrl */
50/* 6 */ { 0x0806, LSS_USB_HC }, /* Moorestown USB Ctrl */
51/* 7 */ { 0x0807, LSS_SD_HC0 }, /* Moorestown SD Host Ctrl 0 */
52/* 8 */ { 0x0808, LSS_SD_HC1 }, /* Moorestown SD Host Ctrl 1 */
53/* 9 */ { 0x0809, LSS_NAND }, /* Moorestown NAND Ctrl */
54/* 10 */ { 0x080a, LSS_AUDIO }, /* Moorestown Audio Ctrl */
55/* 11 */ { 0x080b, LSS_IMAGING }, /* Moorestown ISP */
56/* 12 */ { 0x080c, LSS_SECURITY }, /* Moorestown Security Controller */
57/* 13 */ { 0x080d, LSS_DISPLAY }, /* Moorestown External Displays */
58/* 14 */ { 0x080e, 0 }, /* Moorestown SCU IPC */
59/* 15 */ { 0x080f, LSS_GPIO }, /* Moorestown GPIO Controller */
60/* 16 */ { 0x0810, 0 }, /* Moorestown Power Management Unit */
61/* 17 */ { 0x0811, LSS_USB_OTG }, /* Moorestown OTG Ctrl */
62/* 18 */ { 0x0812, LSS_SPI2 }, /* Moorestown SPI Ctrl 2 */
63/* 19 */ { 0x0813, 0 }, /* Moorestown SC DMA */
64/* 20 */ { 0x0814, LSS_AUDIO_LPE }, /* Moorestown LPE DMA */
65/* 21 */ { 0x0815, LSS_AUDIO_SSP }, /* Moorestown SSP0 */
66
67/* 22 */ { 0x084F, LSS_SD_HC2 }, /* Moorestown SD Host Ctrl 2 */
68
69/* 23 */ { 0x4102, 0 }, /* Lincroft */
70/* 24 */ { 0x4110, 0 }, /* Lincroft */
71};
72
73/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */
74static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0};
75static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803,
76 0x0804, 0x0805, 0x080f, 0};
77
 78/* handle concurrent SMP invocations of pmu_pci_set_power_state() */
79static spinlock_t mrst_pmu_power_state_lock;
80
81static unsigned int wake_counters[MRST_NUM_LSS]; /* DEBUG only */
82static unsigned int pmu_irq_stats[INT_INVALID + 1]; /* DEBUG only */
83
84static int graphics_is_off;
85static int lss_s0i3_enabled;
86static bool mrst_pmu_s0i3_enable;
87
88/* debug counters */
89static u32 pmu_wait_ready_calls;
90static u32 pmu_wait_ready_udelays;
91static u32 pmu_wait_ready_udelays_max;
92static u32 pmu_wait_done_calls;
93static u32 pmu_wait_done_udelays;
94static u32 pmu_wait_done_udelays_max;
95static u32 pmu_set_power_state_entry;
96static u32 pmu_set_power_state_send_cmd;
97
98static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num)
99{
100 int index = 0;
101
102 if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815))
103 index = pci_dev_num - 0x800;
104 else if (pci_dev_num == 0x084F)
105 index = 22;
106 else if (pci_dev_num == 0x4102)
107 index = 23;
108 else if (pci_dev_num == 0x4110)
109 index = 24;
110
111 if (pci_dev_num != mrst_devs[index].pci_dev_num) {
112 WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num);
113		return NULL;
114 }
115
116 return &mrst_devs[index];
117}
118
119/**
120 * mrst_pmu_invalid_cstates
121 * @dev: cpuidle_device
122 *
123 * Certain states are not appropriate for the governor to pick in some cases.
124 * This function will be called as the cpuidle_device's prepare callback and
125 * thus tells the governor to ignore such states when selecting the next state
126 * to enter.
127 */
128
129#define IDLE_STATE4_IS_C6 4
130#define IDLE_STATE5_IS_S0I3 5
131
132int mrst_pmu_invalid_cstates(void)
133{
134 int cpu = smp_processor_id();
135
136 /*
137 * Demote to C4 if the PMU is busy.
138 * Since LSS changes leave the busy bit clear...
139 * busy means either the PMU is waiting for an ACK-C6 that
140 * isn't coming due to an MWAIT that returned immediately;
141 * or we returned from S0i3 successfully, and the PMU
142 * is not done sending us interrupts.
143 */
144 if (pmu_read_busy_status())
145 return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3;
146
147 /*
148 * Disallow S0i3 if: PMU is not initialized, or CPU1 is active,
149 * or if device LSS is insufficient, or the GPU is active,
150 * or if it has been explicitly disabled.
151 */
152 if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) ||
153 !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable)
154 return 1 << IDLE_STATE5_IS_S0I3;
155 else
156 return 0;
157}
158
159/*
160 * pmu_update_wake_counters(): read PM_WKS, update wake_counters[]
161 * DEBUG only.
162 */
163static void pmu_update_wake_counters(void)
164{
165 int lss;
166 u32 wake_status;
167
168 wake_status = pmu_read_wks();
169
170 for (lss = 0; lss < MRST_NUM_LSS; ++lss) {
171 if (wake_status & (1 << lss))
172 wake_counters[lss]++;
173 }
174}
175
176int mrst_pmu_s0i3_entry(void)
177{
178 int status;
179
180 /* Clear any possible error conditions */
181 pmu_write_ics(0x300);
182
183 /* set wake control to current D-states */
184 pmu_write_wssc(S0I3_SSS_TARGET);
185
186 status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd);
187 pmu_update_wake_counters();
188 return status;
189}
190
191/* poll for maximum of 5ms for busy bit to clear */
192static int pmu_wait_ready(void)
193{
194 int udelays;
195
196 pmu_wait_ready_calls++;
197
198 for (udelays = 0; udelays < 500; ++udelays) {
199 if (udelays > pmu_wait_ready_udelays_max)
200 pmu_wait_ready_udelays_max = udelays;
201
202 if (pmu_read_busy_status() == 0)
203 return 0;
204
205 udelay(10);
206 pmu_wait_ready_udelays++;
207 }
208
209 /*
210 * if this fires, observe
211 * /sys/kernel/debug/mrst_pmu_wait_ready_calls
212 * /sys/kernel/debug/mrst_pmu_wait_ready_udelays
213 */
214 WARN_ONCE(1, "SCU not ready for 5ms");
215 return -EBUSY;
216}
217/* poll for maximum of 50ms for busy bit to clear */
218static int pmu_wait_done(void)
219{
220 int udelays;
221
222 pmu_wait_done_calls++;
223
224 for (udelays = 0; udelays < 500; ++udelays) {
225 if (udelays > pmu_wait_done_udelays_max)
226 pmu_wait_done_udelays_max = udelays;
227
228 if (pmu_read_busy_status() == 0)
229 return 0;
230
231 udelay(100);
232 pmu_wait_done_udelays++;
233 }
234
235 /*
236 * if this fires, observe
237 * /sys/kernel/debug/mrst_pmu_wait_done_calls
238 * /sys/kernel/debug/mrst_pmu_wait_done_udelays
239 */
240 WARN_ONCE(1, "SCU not done for 50ms");
241 return -EBUSY;
242}
243
244u32 mrst_pmu_msi_is_disabled(void)
245{
246 return pmu_msi_is_disabled();
247}
248
249void mrst_pmu_enable_msi(void)
250{
251 pmu_msi_enable();
252}
253
254/**
255 * pmu_irq - pmu driver interrupt handler
256 * Context: interrupt context
257 */
258static irqreturn_t pmu_irq(int irq, void *dummy)
259{
260 union pmu_pm_ics pmu_ics;
261
262 pmu_ics.value = pmu_read_ics();
263
264 if (!pmu_ics.bits.pending)
265 return IRQ_NONE;
266
267 switch (pmu_ics.bits.cause) {
268 case INT_SPURIOUS:
269 case INT_CMD_DONE:
270 case INT_CMD_ERR:
271 case INT_WAKE_RX:
272 case INT_SS_ERROR:
273 case INT_S0IX_MISS:
274 case INT_NO_ACKC6:
275 pmu_irq_stats[pmu_ics.bits.cause]++;
276 break;
277 default:
278 pmu_irq_stats[INT_INVALID]++;
279 }
280
281 pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */
282
283 return IRQ_HANDLED;
284}
285
286/*
287 * Translate PCI power management to MRST LSS D-states
288 */
289static int pci_2_mrst_state(int lss, pci_power_t pci_state)
290{
291 switch (pci_state) {
292 case PCI_D0:
293 if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET)
294 return D0i1;
295 else
296 return D0;
297 case PCI_D1:
298 return D0i1;
299 case PCI_D2:
300 return D0i2;
301 case PCI_D3hot:
302 case PCI_D3cold:
303 return D0i3;
304 default:
305 WARN(1, "pci_state %d\n", pci_state);
306 return 0;
307 }
308}
309
310static int pmu_issue_command(u32 pm_ssc)
311{
312 union pmu_pm_set_cfg_cmd_t command;
313
314 if (pmu_read_busy_status()) {
315 pr_debug("pmu is busy, Operation not permitted\n");
316 return -1;
317 }
318
319 /*
320 * enable interrupts in PMU so that interrupts are
321	 * propagated when the ioc bit for a particular set
322 * command is set
323 */
324
325 pmu_irq_enable();
326
327 /* Configure the sub systems for pmu2 */
328
329 pmu_write_ssc(pm_ssc);
330
331 /*
332	 * Send the set config command to the PMU; it is configured
333	 * for mode CM_IMMEDIATE and hence with no trigger
334 */
335
336 command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE;
337 command.pmu2_params.d_param.cfg_delay = 0;
338 command.pmu2_params.d_param.rsvd = 0;
339
340 /* construct the command to send SET_CFG to particular PMU */
341 command.pmu2_params.d_param.cmd = SET_CFG_CMD;
342 command.pmu2_params.d_param.ioc = 0;
343 command.pmu2_params.d_param.mode_id = 0;
344 command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0;
345
346 /* write the value of PM_CMD into particular PMU */
347 pr_debug("pmu command being written %x\n",
348 command.pmu_pm_set_cfg_cmd_value);
349
350 pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value);
351
352 return 0;
353}
354
355static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state)
356{
357 u16 existing_request;
358 int i;
359
360 for (i = 0; ids[i]; ++i) {
361 struct mrst_device *mrst_dev;
362
363 mrst_dev = pci_id_2_mrst_dev(ids[i]);
364 if (unlikely(!mrst_dev))
365 continue;
366
367 existing_request = mrst_dev->latest_request;
368 if (existing_request < pci_state)
369 pci_state = existing_request;
370 }
371 return pci_state;
372}
373
374/**
375 * pmu_pci_set_power_state - callback used by all PCI devices for
376 * platform-specific device power on/shutdown.
377 */
378
379int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state)
380{
381 u32 old_sss, new_sss;
382 int status = 0;
383 struct mrst_device *mrst_dev;
384
385 pmu_set_power_state_entry++;
386
387 BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL);
388 BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold);
389
390 mrst_dev = pci_id_2_mrst_dev(pdev->device);
391 if (unlikely(!mrst_dev))
392 return -ENODEV;
393
394 mrst_dev->pci_state_counts[pci_state]++; /* count invocations */
395
396 /* PMU driver calls self as part of PCI initialization, ignore */
397 if (pdev->device == PCI_DEV_ID_MRST_PMU)
398 return 0;
399
400 BUG_ON(!pmu_reg); /* SW bug if called before initialized */
401
402 spin_lock(&mrst_pmu_power_state_lock);
403
404 if (pdev->d3_delay) {
405 dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n",
406 pdev->d3_delay);
407 pdev->d3_delay = 0;
408 }
409 /*
410 * If Lincroft graphics, simply remember state
411 */
412 if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY
413 && !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) {
414 if (pci_state == PCI_D0)
415 graphics_is_off = 0;
416 else
417 graphics_is_off = 1;
418 goto ret;
419 }
420
421 if (!mrst_dev->lss)
422 goto ret; /* device with no LSS */
423
424 if (mrst_dev->latest_request == pci_state)
425 goto ret; /* no change */
426
427 mrst_dev->latest_request = pci_state; /* record latest request */
428
429 /*
430 * LSS9 and LSS10 contain multiple PCI devices.
431 * Use the lowest numbered (highest power) state in the LSS
432 */
433 if (mrst_dev->lss == 9)
434 pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state);
435 else if (mrst_dev->lss == 10)
436 pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state);
437
438 status = pmu_wait_ready();
439 if (status)
440 goto ret;
441
442 old_sss = pmu_read_sss();
443 new_sss = old_sss & ~SSMSK(3, mrst_dev->lss);
444 new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state),
445 mrst_dev->lss);
446
447 if (new_sss == old_sss)
448 goto ret; /* nothing to do */
449
450 pmu_set_power_state_send_cmd++;
451
452 status = pmu_issue_command(new_sss);
453
454 if (unlikely(status != 0)) {
455 dev_err(&pdev->dev, "Failed to Issue a PM command\n");
456 goto ret;
457 }
458
459 if (pmu_wait_done())
460 goto ret;
461
462 lss_s0i3_enabled =
463 ((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET);
464ret:
465 spin_unlock(&mrst_pmu_power_state_lock);
466 return status;
467}
468
469#ifdef CONFIG_DEBUG_FS
470static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"};
471
472static inline const char *d0ix_name(int state)
473{
474 return d0ix_names[(int) state];
475}
476
477static int debug_mrst_pmu_show(struct seq_file *s, void *unused)
478{
479 struct pci_dev *pdev = NULL;
480 u32 cur_pmsss;
481 int lss;
482
483 seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET);
484
485 cur_pmsss = pmu_read_sss();
486
487 seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET);
488
489 seq_printf(s, "0x%08X Current SSS ", cur_pmsss);
490 seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n");
491
492 if (cpumask_equal(cpu_online_mask, cpumask_of(0)))
493 seq_printf(s, "cpu0 is only cpu online\n");
494 else
495 seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n");
496
497 seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]");
498
499
500 for_each_pci_dev(pdev) {
501 int pos;
502 u16 pmcsr;
503 struct mrst_device *mrst_dev;
504 int i;
505
506 mrst_dev = pci_id_2_mrst_dev(pdev->device);
507
508 seq_printf(s, "%s %04x/%04X %-16.16s ",
509 dev_name(&pdev->dev),
510 pdev->vendor, pdev->device,
511 dev_driver_string(&pdev->dev));
512
513		if (unlikely(!mrst_dev)) {
514 seq_printf(s, " UNKNOWN\n");
515 continue;
516 }
517
518 if (mrst_dev->lss)
519 seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss,
520 d0ix_name(((cur_pmsss >>
521 (mrst_dev->lss * 2)) & 0x3)));
522 else
523 seq_printf(s, " ");
524
525 /* PCI PM config space setting */
526 pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
527 if (pos != 0) {
528 pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
529 seq_printf(s, "PCI-%-4s",
530 pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK));
531 } else {
532 seq_printf(s, " ");
533 }
534
535 seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request));
536 for (i = 0; i <= PCI_D3cold; ++i)
537 seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]);
538
539 if (mrst_dev->lss) {
540 unsigned int lssmask;
541
542 lssmask = SSMSK(D0i3, mrst_dev->lss);
543
544 if ((lssmask & S0I3_SSS_TARGET) &&
545 ((lssmask & cur_pmsss) !=
546 (lssmask & S0I3_SSS_TARGET)))
547				seq_printf(s, "[BLOCKS s0i3]");
548 }
549
550 seq_printf(s, "\n");
551 }
552 seq_printf(s, "Wake Counters:\n");
553 for (lss = 0; lss < MRST_NUM_LSS; ++lss)
554 seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]);
555
556 seq_printf(s, "Interrupt Counters:\n");
557 seq_printf(s,
558 "INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n"
559 "INT_CMD_ERR \t%8u\n" "INT_WAKE_RX \t%8u\n"
560 "INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n"
561 "INT_NO_ACKC6 \t%8u\n" "INT_INVALID \t%8u\n",
562 pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE],
563 pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX],
564 pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS],
565 pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]);
566
567 seq_printf(s, "mrst_pmu_wait_ready_calls %8d\n",
568 pmu_wait_ready_calls);
569 seq_printf(s, "mrst_pmu_wait_ready_udelays %8d\n",
570 pmu_wait_ready_udelays);
571 seq_printf(s, "mrst_pmu_wait_ready_udelays_max %8d\n",
572 pmu_wait_ready_udelays_max);
573 seq_printf(s, "mrst_pmu_wait_done_calls %8d\n",
574 pmu_wait_done_calls);
575 seq_printf(s, "mrst_pmu_wait_done_udelays %8d\n",
576 pmu_wait_done_udelays);
577 seq_printf(s, "mrst_pmu_wait_done_udelays_max %8d\n",
578 pmu_wait_done_udelays_max);
579 seq_printf(s, "mrst_pmu_set_power_state_entry %8d\n",
580 pmu_set_power_state_entry);
581 seq_printf(s, "mrst_pmu_set_power_state_send_cmd %8d\n",
582 pmu_set_power_state_send_cmd);
583 seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status());
584
585 return 0;
586}
587
588static int debug_mrst_pmu_open(struct inode *inode, struct file *file)
589{
590 return single_open(file, debug_mrst_pmu_show, NULL);
591}
592
593static const struct file_operations devices_state_operations = {
594 .open = debug_mrst_pmu_open,
595 .read = seq_read,
596 .llseek = seq_lseek,
597 .release = single_release,
598};
599#endif	/* CONFIG_DEBUG_FS */
600
601/*
602 * Validate SCU PCI shim PCI vendor capability byte
603 * against LSS hard-coded in mrst_devs[] above.
604 * DEBUG only.
605 */
606static void pmu_scu_firmware_debug(void)
607{
608 struct pci_dev *pdev = NULL;
609
610 for_each_pci_dev(pdev) {
611 struct mrst_device *mrst_dev;
612 u8 pci_config_lss;
613 int pos;
614
615 mrst_dev = pci_id_2_mrst_dev(pdev->device);
616 if (unlikely(!mrst_dev)) {
617 printk(KERN_ERR FW_BUG "pmu: Unknown "
618 "PCI device 0x%04X\n", pdev->device);
619 continue;
620 }
621
622 if (mrst_dev->lss == 0)
623 continue; /* no LSS in our table */
624
625 pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
626		if (!pos) {
627 printk(KERN_ERR FW_BUG "pmu: 0x%04X "
628 "missing PCI Vendor Capability\n",
629 pdev->device);
630 continue;
631 }
632 pci_read_config_byte(pdev, pos + 4, &pci_config_lss);
633 if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) {
634 printk(KERN_ERR FW_BUG "pmu: 0x%04X "
635 "invalid PCI Vendor Capability 0x%x "
636 " expected LSS 0x%X\n",
637 pdev->device, pci_config_lss, mrst_dev->lss);
638 continue;
639 }
640 pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK;
641
642 if (mrst_dev->lss == pci_config_lss)
643 continue;
644
645 printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n",
646 pdev->device, pci_config_lss, mrst_dev->lss);
647 }
648}
649
650/**
651 * pmu_probe - initialize and enable the MRST Power Management Unit
652 */
653static int __devinit pmu_probe(struct pci_dev *pdev,
654 const struct pci_device_id *pci_id)
655{
656 int ret;
657 struct mrst_pmu_reg *pmu;
658
659 /* Init the device */
660 ret = pci_enable_device(pdev);
661 if (ret) {
662 dev_err(&pdev->dev, "Unable to Enable PCI device\n");
663 return ret;
664 }
665
666 ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME);
667 if (ret < 0) {
668 dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n");
669 goto out_err1;
670 }
671
672 /* Map the memory of PMU reg base */
673 pmu = pci_iomap(pdev, 0, 0);
674 if (!pmu) {
675 dev_err(&pdev->dev, "Unable to map the PMU address space\n");
676 ret = -ENOMEM;
677 goto out_err2;
678 }
679
680#ifdef CONFIG_DEBUG_FS
681 /* /sys/kernel/debug/mrst_pmu */
682 (void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO,
683 NULL, NULL, &devices_state_operations);
684#endif
685 pmu_reg = pmu; /* success */
686
687 if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) {
688 dev_err(&pdev->dev, "Registering isr has failed\n");
689 ret = -1;
690 goto out_err3;
691 }
692
693 pmu_scu_firmware_debug();
694
695 pmu_write_wkc(S0I3_WAKE_SOURCES); /* Enable S0i3 wakeup sources */
696
697 pmu_wait_ready();
698
699 pmu_write_ssc(D0I1_ACG_SSS_TARGET); /* Enable Auto-Clock_Gating */
700 pmu_write_cmd(0x201);
701
702 spin_lock_init(&mrst_pmu_power_state_lock);
703
704 /* Enable the hardware interrupt */
705 pmu_irq_enable();
706 return 0;
707
708out_err3:
710 pci_iounmap(pdev, pmu_reg);
711 pmu_reg = NULL;
712out_err2:
713 pci_release_region(pdev, 0);
714out_err1:
715 pci_disable_device(pdev);
716 return ret;
717}
718
719static void __devexit pmu_remove(struct pci_dev *pdev)
720{
721 dev_err(&pdev->dev, "Mid PM pmu_remove called\n");
722
723 /* Freeing up the irq */
724 free_irq(pdev->irq, NULL);
725
726 pci_iounmap(pdev, pmu_reg);
727 pmu_reg = NULL;
728
729 /* disable the current PCI device */
730 pci_release_region(pdev, 0);
731 pci_disable_device(pdev);
732}
733
734static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = {
735 { PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 },
736 { }
737};
738
739MODULE_DEVICE_TABLE(pci, pmu_pci_ids);
740
741static struct pci_driver driver = {
742 .name = MRST_PMU_DRV_NAME,
743 .id_table = pmu_pci_ids,
744 .probe = pmu_probe,
745 .remove = __devexit_p(pmu_remove),
746};
747
748/**
749 * pmu_pci_register - register the PMU driver as PCI device
750 */
751static int __init pmu_pci_register(void)
752{
753 return pci_register_driver(&driver);
754}
755
756/* Register and probe via fs_initcall() to precede device_initcall() */
757fs_initcall(pmu_pci_register);
758
759static void __exit mid_pci_cleanup(void)
760{
761 pci_unregister_driver(&driver);
762}
763
764static int ia_major;
765static int ia_minor;
766
767static int pmu_sfi_parse_oem(struct sfi_table_header *table)
768{
769 struct sfi_table_simple *sb;
770
771 sb = (struct sfi_table_simple *)table;
772 ia_major = (sb->pentry[1] >> 0) & 0xFFFF;
773 ia_minor = (sb->pentry[1] >> 16) & 0xFFFF;
774 printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n",
775 ia_major, ia_minor);
776
777 return 0;
778}
779
780static int __init scu_fw_check(void)
781{
782 int ret;
783 u32 fw_version;
784
785 if (!pmu_reg)
786 return 0; /* this driver didn't probe-out */
787
788 sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem);
789
790 if (ia_major < 0x6005 || ia_minor < 0x1525) {
791 WARN(1, "mrst_pmu: IA FW version too old\n");
792 return -1;
793 }
794
795 ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0,
796 &fw_version, 1);
797
798 if (ret) {
799 WARN(1, "mrst_pmu: IPC FW version? %d\n", ret);
800 } else {
801 int scu_major = (fw_version >> 8) & 0xFF;
802 int scu_minor = (fw_version >> 0) & 0xFF;
803
804 printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version);
805
806 if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) {
807 printk(KERN_INFO "mrst_pmu: enabling S0i3\n");
808 mrst_pmu_s0i3_enable = true;
809 } else {
810 WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X",
811 scu_major, scu_minor);
812 }
813 }
814 return 0;
815}
816late_initcall(scu_fw_check);
817module_exit(mid_pci_cleanup);
diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h
new file mode 100644
index 00000000000..bfbfe64b167
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.h
@@ -0,0 +1,234 @@
1/*
2 * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c
3 *
4 * Copyright (c) 2011, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20#ifndef _MRST_PMU_H_
21#define _MRST_PMU_H_
22
23#define PCI_DEV_ID_MRST_PMU 0x0810
24#define MRST_PMU_DRV_NAME "mrst_pmu"
25#define PCI_SUB_CLASS_MASK 0xFF00
26
27#define PCI_VENDOR_CAP_LOG_ID_MASK 0x7F
28#define PCI_VENDOR_CAP_LOG_SS_MASK 0x80
29
30#define SUB_SYS_ALL_D0I1 0x01155555
31#define S0I3_WAKE_SOURCES 0x00001FFF
32
33#define PM_S0I3_COMMAND \
34 ((0 << 31) | /* Reserved */ \
35 (0 << 30) | /* Core must be idle */ \
36 (0xc2 << 22) | /* ACK C6 trigger */ \
37 (3 << 19) | /* Trigger on DMI message */ \
38 (3 << 16) | /* Enter S0i3 */ \
39 (0 << 13) | /* Numeric mode ID (sw) */ \
40 (3 << 9) | /* Trigger mode */ \
41 (0 << 8) | /* Do not interrupt */ \
42 (1 << 0)) /* Set configuration */
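/* For reference: the fields above OR together to the constant 0x309b0601. */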
43
44#define LSS_DMI 0
45#define LSS_SD_HC0 1
46#define LSS_SD_HC1 2
47#define LSS_NAND 3
48#define LSS_IMAGING 4
49#define LSS_SECURITY 5
50#define LSS_DISPLAY 6
51#define LSS_USB_HC 7
52#define LSS_USB_OTG 8
53#define LSS_AUDIO 9
54#define LSS_AUDIO_LPE 9
55#define LSS_AUDIO_SSP 9
56#define LSS_I2C0 10
57#define LSS_I2C1 10
58#define LSS_I2C2 10
59#define LSS_KBD 10
60#define LSS_SPI0 10
61#define LSS_SPI1 10
62#define LSS_SPI2 10
63#define LSS_GPIO 10
64#define LSS_SRAM 11 /* used by SCU, do not touch */
65#define LSS_SD_HC2 12
66/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */
67#define MRST_NUM_LSS 13
68
69#define MIN(a, b) (((a) < (b)) ? (a) : (b))
70
71#define SSMSK(mask, lss) ((mask) << ((lss) * 2))
72#define D0 0
73#define D0i1 1
74#define D0i2 2
75#define D0i3 3
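
/*
 * Each LSS occupies two bits of the SSS/SSC registers, so SSMSK() shifts a
 * two-bit D0ix value into that slot.  As a sketch of the reverse lookup
 * (pmu.c's debugfs code open-codes this when printing per-LSS states):
 *
 *	state = (pmu_read_sss() >> (lss * 2)) & 0x3;	 0=D0 ... 3=D0i3
 */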
76
77#define S0I3_SSS_TARGET ( \
78 SSMSK(D0i1, LSS_DMI) | \
79 SSMSK(D0i3, LSS_SD_HC0) | \
80 SSMSK(D0i3, LSS_SD_HC1) | \
81 SSMSK(D0i3, LSS_NAND) | \
82 SSMSK(D0i3, LSS_SD_HC2) | \
83 SSMSK(D0i3, LSS_IMAGING) | \
84 SSMSK(D0i3, LSS_SECURITY) | \
85 SSMSK(D0i3, LSS_DISPLAY) | \
86 SSMSK(D0i3, LSS_USB_HC) | \
87 SSMSK(D0i3, LSS_USB_OTG) | \
88 SSMSK(D0i3, LSS_AUDIO) | \
89 SSMSK(D0i1, LSS_I2C0))
90
91/*
92 * D0i1 on Langwell is Autonomous Clock Gating (ACG).
93 * Enable ACG on every LSS except camera and audio
94 */
95#define D0I1_ACG_SSS_TARGET \
96 (SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO))
97
98enum cm_mode {
99 CM_NOP, /* ignore the config mode value */
100 CM_IMMEDIATE,
101 CM_DELAY,
102 CM_TRIGGER,
103 CM_INVALID
104};
105
106enum sys_state {
107 SYS_STATE_S0I0,
108 SYS_STATE_S0I1,
109 SYS_STATE_S0I2,
110 SYS_STATE_S0I3,
111 SYS_STATE_S3,
112 SYS_STATE_S5
113};
114
115#define SET_CFG_CMD 1
116
117enum int_status {
118 INT_SPURIOUS = 0,
119 INT_CMD_DONE = 1,
120 INT_CMD_ERR = 2,
121 INT_WAKE_RX = 3,
122 INT_SS_ERROR = 4,
123 INT_S0IX_MISS = 5,
124 INT_NO_ACKC6 = 6,
125 INT_INVALID = 7,
126};
127
128/* PMU register interface */
129static struct mrst_pmu_reg {
130 u32 pm_sts; /* 0x00 */
131 u32 pm_cmd; /* 0x04 */
132 u32 pm_ics; /* 0x08 */
133 u32 _resv1; /* 0x0C */
134 u32 pm_wkc[2]; /* 0x10 */
135 u32 pm_wks[2]; /* 0x18 */
136 u32 pm_ssc[4]; /* 0x20 */
137 u32 pm_sss[4]; /* 0x30 */
138 u32 pm_wssc[4]; /* 0x40 */
139 u32 pm_c3c4; /* 0x50 */
140 u32 pm_c5c6; /* 0x54 */
141 u32 pm_msi_disable; /* 0x58 */
142} *pmu_reg;
143
144static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); }
145static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); }
146static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); }
147static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); }
148
149static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); }
150static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); }
151static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); }
152static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); }
153static inline void pmu_write_wssc(u32 arg)
154 { writel(arg, &pmu_reg->pm_wssc[0]); }
155
156static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); }
157static inline u32 pmu_msi_is_disabled(void)
158 { return readl(&pmu_reg->pm_msi_disable); }
159
160union pmu_pm_ics {
161 struct {
162 u32 cause:8;
163 u32 enable:1;
164 u32 pending:1;
165 u32 reserved:22;
166 } bits;
167 u32 value;
168};
169
170static inline void pmu_irq_enable(void)
171{
172 union pmu_pm_ics pmu_ics;
173
174 pmu_ics.value = pmu_read_ics();
175 pmu_ics.bits.enable = 1;
176 pmu_write_ics(pmu_ics.value);
177}
178
179union pmu_pm_status {
180 struct {
181 u32 pmu_rev:8;
182 u32 pmu_busy:1;
183 u32 mode_id:4;
184 u32 Reserved:19;
185 } pmu_status_parts;
186 u32 pmu_status_value;
187};
188
189static inline int pmu_read_busy_status(void)
190{
191 union pmu_pm_status result;
192
193 result.pmu_status_value = pmu_read_sts();
194
195 return result.pmu_status_parts.pmu_busy;
196}
197
198/* pmu set config parameters */
199struct cfg_delay_param_t {
200 u32 cmd:8;
201 u32 ioc:1;
202 u32 cfg_mode:4;
203 u32 mode_id:3;
204 u32 sys_state:3;
205 u32 cfg_delay:8;
206 u32 rsvd:5;
207};
208
209struct cfg_trig_param_t {
210 u32 cmd:8;
211 u32 ioc:1;
212 u32 cfg_mode:4;
213 u32 mode_id:3;
214 u32 sys_state:3;
215 u32 cfg_trig_type:3;
216 u32 cfg_trig_val:8;
217 u32 cmbi:1;
218 u32 rsvd1:1;
219};
220
221union pmu_pm_set_cfg_cmd_t {
222 union {
223 struct cfg_delay_param_t d_param;
224 struct cfg_trig_param_t t_param;
225 } pmu2_params;
226 u32 pmu_pm_set_cfg_cmd_value;
227};
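
/*
 * Example decode (assuming the usual x86 LSB-first bitfield layout): the
 * 0x201 that pmu_probe() writes via pmu_write_cmd() is cmd = SET_CFG_CMD
 * with cfg_mode = CM_IMMEDIATE and ioc = 0.
 */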
228
229#ifdef FUTURE_PATCH
230extern int mrst_s0i3_entry(u32 regval, u32 *regaddr);
231#else
232static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; }
233#endif
234#endif
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 73d70d65e76..6d5dbcdd444 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -58,8 +58,11 @@ EXPORT_SYMBOL_GPL(vrtc_cmos_write);
 unsigned long vrtc_get_time(void)
 {
 	u8 sec, min, hour, mday, mon;
+	unsigned long flags;
 	u32 year;
 
+	spin_lock_irqsave(&rtc_lock, flags);
+
 	while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
 		cpu_relax();
 
@@ -70,6 +73,8 @@ unsigned long vrtc_get_time(void)
 	mon = vrtc_cmos_read(RTC_MONTH);
 	year = vrtc_cmos_read(RTC_YEAR);
 
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	/* vRTC YEAR reg contains the offset to 1960 */
 	year += 1960;
 
@@ -83,8 +88,10 @@ unsigned long vrtc_get_time(void)
 int vrtc_set_mmss(unsigned long nowtime)
 {
 	int real_sec, real_min;
+	unsigned long flags;
 	int vrtc_min;
 
+	spin_lock_irqsave(&rtc_lock, flags);
 	vrtc_min = vrtc_cmos_read(RTC_MINUTES);
 
 	real_sec = nowtime % 60;
@@ -95,6 +102,8 @@ int vrtc_set_mmss(unsigned long nowtime)
 
 	vrtc_cmos_write(real_sec, RTC_SECONDS);
 	vrtc_cmos_write(real_min, RTC_MINUTES);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	return 0;
 }
 
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index 81c5e2165c2..fd332c53394 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,2 +1,5 @@
 obj-$(CONFIG_OLPC)		+= olpc.o olpc_ofw.o olpc_dt.o
-obj-$(CONFIG_OLPC_XO1)		+= olpc-xo1.o
+obj-$(CONFIG_OLPC_XO1_PM)	+= olpc-xo1-pm.o xo1-wakeup.o
+obj-$(CONFIG_OLPC_XO1_RTC)	+= olpc-xo1-rtc.o
+obj-$(CONFIG_OLPC_XO1_SCI)	+= olpc-xo1-sci.o
+obj-$(CONFIG_OLPC_XO15_SCI)	+= olpc-xo15-sci.o
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
new file mode 100644
index 00000000000..6f3855a5a2f
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -0,0 +1,215 @@
1/*
2 * Support for power management features of the OLPC XO-1 laptop
3 *
4 * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
5 * Copyright (C) 2010 One Laptop per Child
6 * Copyright (C) 2006 Red Hat, Inc.
7 * Copyright (C) 2006 Advanced Micro Devices, Inc.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 */
14
15#include <linux/cs5535.h>
16#include <linux/platform_device.h>
17#include <linux/pm.h>
18#include <linux/mfd/core.h>
19#include <linux/suspend.h>
20
21#include <asm/io.h>
22#include <asm/olpc.h>
23
24#define DRV_NAME "olpc-xo1-pm"
25
26static unsigned long acpi_base;
27static unsigned long pms_base;
28
29static u16 wakeup_mask = CS5536_PM_PWRBTN;
30
31static struct {
32 unsigned long address;
33 unsigned short segment;
34} ofw_bios_entry = { 0xF0000 + PAGE_OFFSET, __KERNEL_CS };
35
36/* Set bits in the wakeup mask */
37void olpc_xo1_pm_wakeup_set(u16 value)
38{
39 wakeup_mask |= value;
40}
41EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_set);
42
43/* Clear bits in the wakeup mask */
44void olpc_xo1_pm_wakeup_clear(u16 value)
45{
46 wakeup_mask &= ~value;
47}
48EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_clear);
49
50static int xo1_power_state_enter(suspend_state_t pm_state)
51{
52 unsigned long saved_sci_mask;
53 int r;
54
55 /* Only STR is supported */
56 if (pm_state != PM_SUSPEND_MEM)
57 return -EINVAL;
58
59 r = olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0);
60 if (r)
61 return r;
62
63 /*
64 * Save SCI mask (this gets lost since PM1_EN is used as a mask for
65 * wakeup events, which is not necessarily the same event set)
66 */
67 saved_sci_mask = inl(acpi_base + CS5536_PM1_STS);
68 saved_sci_mask &= 0xffff0000;
69
70 /* Save CPU state */
71 do_olpc_suspend_lowlevel();
72
73 /* Resume path starts here */
74
75 /* Restore SCI mask (using dword access to CS5536_PM1_EN) */
76 outl(saved_sci_mask, acpi_base + CS5536_PM1_STS);
77
78 /* Tell the EC to stop inhibiting SCIs */
79 olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0);
80
81 /*
82 * Tell the wireless module to restart USB communication.
83 * Must be done twice.
84 */
85 olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
86 olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
87
88 return 0;
89}
90
91asmlinkage int xo1_do_sleep(u8 sleep_state)
92{
93 void *pgd_addr = __va(read_cr3());
94
95 /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
96 outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
97
98 __asm__("movl %0,%%eax" : : "r" (pgd_addr));
99 __asm__("call *(%%edi); cld"
100 : : "D" (&ofw_bios_entry));
101 __asm__("movb $0x34, %al\n\t"
102 "outb %al, $0x70\n\t"
103 "movb $0x30, %al\n\t"
104 "outb %al, $0x71\n\t");
105 return 0;
106}
107
108static void xo1_power_off(void)
109{
110 printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
111
112 /* Enable all of these controls with 0 delay */
113 outl(0x40000000, pms_base + CS5536_PM_SCLK);
114 outl(0x40000000, pms_base + CS5536_PM_IN_SLPCTL);
115 outl(0x40000000, pms_base + CS5536_PM_WKXD);
116 outl(0x40000000, pms_base + CS5536_PM_WKD);
117
118 /* Clear status bits (possibly unnecessary) */
119 outl(0x0002ffff, pms_base + CS5536_PM_SSC);
120 outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
121
122 /* Write SLP_EN bit to start the machinery */
123 outl(0x00002000, acpi_base + CS5536_PM1_CNT);
124}
125
126static int xo1_power_state_valid(suspend_state_t pm_state)
127{
128 /* suspend-to-RAM only */
129 return pm_state == PM_SUSPEND_MEM;
130}
131
132static const struct platform_suspend_ops xo1_suspend_ops = {
133 .valid = xo1_power_state_valid,
134 .enter = xo1_power_state_enter,
135};
136
137static int __devinit xo1_pm_probe(struct platform_device *pdev)
138{
139 struct resource *res;
140 int err;
141
142 /* don't run on non-XOs */
143 if (!machine_is_olpc())
144 return -ENODEV;
145
146 err = mfd_cell_enable(pdev);
147 if (err)
148 return err;
149
150 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
151 if (!res) {
152 dev_err(&pdev->dev, "can't fetch device resource info\n");
153 return -EIO;
154 }
155 if (strcmp(pdev->name, "cs5535-pms") == 0)
156 pms_base = res->start;
157 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
158 acpi_base = res->start;
159
160 /* If we have both addresses, we can override the poweroff hook */
161 if (pms_base && acpi_base) {
162 suspend_set_ops(&xo1_suspend_ops);
163 pm_power_off = xo1_power_off;
164 printk(KERN_INFO "OLPC XO-1 support registered\n");
165 }
166
167 return 0;
168}
169
170static int __devexit xo1_pm_remove(struct platform_device *pdev)
171{
172 mfd_cell_disable(pdev);
173
174 if (strcmp(pdev->name, "cs5535-pms") == 0)
175 pms_base = 0;
176 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
177 acpi_base = 0;
178
179 pm_power_off = NULL;
180 return 0;
181}
182
183static struct platform_driver cs5535_pms_driver = {
184 .driver = {
185 .name = "cs5535-pms",
186 .owner = THIS_MODULE,
187 },
188 .probe = xo1_pm_probe,
189 .remove = __devexit_p(xo1_pm_remove),
190};
191
192static struct platform_driver cs5535_acpi_driver = {
193 .driver = {
194 .name = "olpc-xo1-pm-acpi",
195 .owner = THIS_MODULE,
196 },
197 .probe = xo1_pm_probe,
198 .remove = __devexit_p(xo1_pm_remove),
199};
200
201static int __init xo1_pm_init(void)
202{
203 int r;
204
205 r = platform_driver_register(&cs5535_pms_driver);
206 if (r)
207 return r;
208
209 r = platform_driver_register(&cs5535_acpi_driver);
210 if (r)
211 platform_driver_unregister(&cs5535_pms_driver);
212
213 return r;
214}
215arch_initcall(xo1_pm_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c
new file mode 100644
index 00000000000..a2b4efddd61
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c
@@ -0,0 +1,81 @@
1/*
2 * Support for OLPC XO-1 Real Time Clock (RTC)
3 *
4 * Copyright (C) 2011 One Laptop per Child
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/mc146818rtc.h>
13#include <linux/platform_device.h>
14#include <linux/rtc.h>
15#include <linux/of.h>
16
17#include <asm/msr.h>
18#include <asm/olpc.h>
19
20static void rtc_wake_on(struct device *dev)
21{
22 olpc_xo1_pm_wakeup_set(CS5536_PM_RTC);
23}
24
25static void rtc_wake_off(struct device *dev)
26{
27 olpc_xo1_pm_wakeup_clear(CS5536_PM_RTC);
28}
29
30static struct resource rtc_platform_resource[] = {
31 [0] = {
32 .start = RTC_PORT(0),
33 .end = RTC_PORT(1),
34 .flags = IORESOURCE_IO,
35 },
36 [1] = {
37 .start = RTC_IRQ,
38 .end = RTC_IRQ,
39 .flags = IORESOURCE_IRQ,
40 }
41};
42
43static struct cmos_rtc_board_info rtc_info = {
44 .rtc_day_alarm = 0,
45 .rtc_mon_alarm = 0,
46 .rtc_century = 0,
47 .wake_on = rtc_wake_on,
48 .wake_off = rtc_wake_off,
49};
50
51static struct platform_device xo1_rtc_device = {
52 .name = "rtc_cmos",
53 .id = -1,
54 .num_resources = ARRAY_SIZE(rtc_platform_resource),
55 .dev.platform_data = &rtc_info,
56 .resource = rtc_platform_resource,
57};
58
59static int __init xo1_rtc_init(void)
60{
61 int r;
62 struct device_node *node;
63
64 node = of_find_compatible_node(NULL, NULL, "olpc,xo1-rtc");
65 if (!node)
66 return 0;
67 of_node_put(node);
68
69 pr_info("olpc-xo1-rtc: Initializing OLPC XO-1 RTC\n");
70 rdmsrl(MSR_RTC_DOMA_OFFSET, rtc_info.rtc_day_alarm);
71 rdmsrl(MSR_RTC_MONA_OFFSET, rtc_info.rtc_mon_alarm);
72 rdmsrl(MSR_RTC_CEN_OFFSET, rtc_info.rtc_century);
73
74 r = platform_device_register(&xo1_rtc_device);
75 if (r)
76 return r;
77
78 device_init_wakeup(&xo1_rtc_device.dev, 1);
79 return 0;
80}
81arch_initcall(xo1_rtc_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
new file mode 100644
index 00000000000..1d4c783d732
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -0,0 +1,614 @@
1/*
2 * Support for OLPC XO-1 System Control Interrupts (SCI)
3 *
4 * Copyright (C) 2010 One Laptop per Child
5 * Copyright (C) 2006 Red Hat, Inc.
6 * Copyright (C) 2006 Advanced Micro Devices, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/cs5535.h>
15#include <linux/device.h>
16#include <linux/gpio.h>
17#include <linux/input.h>
18#include <linux/interrupt.h>
19#include <linux/platform_device.h>
20#include <linux/pm.h>
21#include <linux/mfd/core.h>
22#include <linux/power_supply.h>
23#include <linux/suspend.h>
24#include <linux/workqueue.h>
25
26#include <asm/io.h>
27#include <asm/msr.h>
28#include <asm/olpc.h>
29
30#define DRV_NAME "olpc-xo1-sci"
31#define PFX DRV_NAME ": "
32
33static unsigned long acpi_base;
34static struct input_dev *power_button_idev;
35static struct input_dev *ebook_switch_idev;
36static struct input_dev *lid_switch_idev;
37
38static int sci_irq;
39
40static bool lid_open;
41static bool lid_inverted;
42static int lid_wake_mode;
43
44enum lid_wake_modes {
45 LID_WAKE_ALWAYS,
46 LID_WAKE_OPEN,
47 LID_WAKE_CLOSE,
48};
49
50static const char * const lid_wake_mode_names[] = {
51 [LID_WAKE_ALWAYS] = "always",
52 [LID_WAKE_OPEN] = "open",
53 [LID_WAKE_CLOSE] = "close",
54};
55
56static void battery_status_changed(void)
57{
58 struct power_supply *psy = power_supply_get_by_name("olpc-battery");
59
60 if (psy) {
61 power_supply_changed(psy);
62 put_device(psy->dev);
63 }
64}
65
66static void ac_status_changed(void)
67{
68 struct power_supply *psy = power_supply_get_by_name("olpc-ac");
69
70 if (psy) {
71 power_supply_changed(psy);
72 put_device(psy->dev);
73 }
74}
75
76/* Report current ebook switch state through input layer */
77static void send_ebook_state(void)
78{
79 unsigned char state;
80
81 if (olpc_ec_cmd(EC_READ_EB_MODE, NULL, 0, &state, 1)) {
82 pr_err(PFX "failed to get ebook state\n");
83 return;
84 }
85
86 input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state);
87 input_sync(ebook_switch_idev);
88}
89
90static void flip_lid_inverter(void)
91{
92 /* gpio is high; invert so we'll get l->h event interrupt */
93 if (lid_inverted)
94 cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
95 else
96 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
97 lid_inverted = !lid_inverted;
98}
99
100static void detect_lid_state(void)
101{
102 /*
103 * the edge detector hookup on the gpio inputs on the geode is
104 * odd, to say the least. See http://dev.laptop.org/ticket/5703
105 * for details, but in a nutshell: we don't use the edge
106	 * detectors. instead, we make use of an anomaly: with both
107 * edge detectors turned off, we still get an edge event on a
108 * positive edge transition. to take advantage of this, we use the
109 * front-end inverter to ensure that that's the edge we're always
110 * going to see next.
111 */
112
113 int state;
114
115 state = cs5535_gpio_isset(OLPC_GPIO_LID, GPIO_READ_BACK);
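	/*
	 * !state ^ !lid_inverted is a boolean XOR of the two flags: it undoes
	 * the front-end inversion so lid_open tracks the physical lid no
	 * matter which way the inverter is currently set.
	 */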
116 lid_open = !state ^ !lid_inverted; /* x ^^ y */
117 if (!state)
118 return;
119
120 flip_lid_inverter();
121}
122
123/* Report current lid switch state through input layer */
124static void send_lid_state(void)
125{
126 input_report_switch(lid_switch_idev, SW_LID, !lid_open);
127 input_sync(lid_switch_idev);
128}
129
130static ssize_t lid_wake_mode_show(struct device *dev,
131 struct device_attribute *attr, char *buf)
132{
133 const char *mode = lid_wake_mode_names[lid_wake_mode];
134 return sprintf(buf, "%s\n", mode);
135}
136static ssize_t lid_wake_mode_set(struct device *dev,
137 struct device_attribute *attr,
138 const char *buf, size_t count)
139{
140 int i;
141 for (i = 0; i < ARRAY_SIZE(lid_wake_mode_names); i++) {
142 const char *mode = lid_wake_mode_names[i];
143 if (strlen(mode) != count || strncasecmp(mode, buf, count))
144 continue;
145
146 lid_wake_mode = i;
147 return count;
148 }
149 return -EINVAL;
150}
151static DEVICE_ATTR(lid_wake_mode, S_IWUSR | S_IRUGO, lid_wake_mode_show,
152 lid_wake_mode_set);
153
154/*
155 * Process all items in the EC's SCI queue.
156 *
157 * This is handled in a workqueue because olpc_ec_cmd can be slow (and
158 * can even timeout).
159 *
160 * If propagate_events is false, the queue is drained without events being
161 * generated for the interrupts.
162 */
163static void process_sci_queue(bool propagate_events)
164{
165 int r;
166 u16 data;
167
168 do {
169 r = olpc_ec_sci_query(&data);
170 if (r || !data)
171 break;
172
173 pr_debug(PFX "SCI 0x%x received\n", data);
174
175 switch (data) {
176 case EC_SCI_SRC_BATERR:
177 case EC_SCI_SRC_BATSOC:
178 case EC_SCI_SRC_BATTERY:
179 case EC_SCI_SRC_BATCRIT:
180 battery_status_changed();
181 break;
182 case EC_SCI_SRC_ACPWR:
183 ac_status_changed();
184 break;
185 }
186
187 if (data == EC_SCI_SRC_EBOOK && propagate_events)
188 send_ebook_state();
189 } while (data);
190
191 if (r)
192 pr_err(PFX "Failed to clear SCI queue");
193}
194
195static void process_sci_queue_work(struct work_struct *work)
196{
197 process_sci_queue(true);
198}
199
200static DECLARE_WORK(sci_work, process_sci_queue_work);
201
202static irqreturn_t xo1_sci_intr(int irq, void *dev_id)
203{
204 struct platform_device *pdev = dev_id;
205 u32 sts;
206 u32 gpe;
207
208 sts = inl(acpi_base + CS5536_PM1_STS);
209 outl(sts | 0xffff, acpi_base + CS5536_PM1_STS);
210
211 gpe = inl(acpi_base + CS5536_PM_GPE0_STS);
212 outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
213
214 dev_dbg(&pdev->dev, "sts %x gpe %x\n", sts, gpe);
215
216 if (sts & CS5536_PWRBTN_FLAG && !(sts & CS5536_WAK_FLAG)) {
217 input_report_key(power_button_idev, KEY_POWER, 1);
218 input_sync(power_button_idev);
219 input_report_key(power_button_idev, KEY_POWER, 0);
220 input_sync(power_button_idev);
221 }
222
223 if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */
224 cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
225 schedule_work(&sci_work);
226 }
227
228 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
229 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
230 detect_lid_state();
231 send_lid_state();
232
233 return IRQ_HANDLED;
234}
235
236static int xo1_sci_suspend(struct platform_device *pdev, pm_message_t state)
237{
238 if (device_may_wakeup(&power_button_idev->dev))
239 olpc_xo1_pm_wakeup_set(CS5536_PM_PWRBTN);
240 else
241 olpc_xo1_pm_wakeup_clear(CS5536_PM_PWRBTN);
242
243 if (device_may_wakeup(&ebook_switch_idev->dev))
244 olpc_ec_wakeup_set(EC_SCI_SRC_EBOOK);
245 else
246 olpc_ec_wakeup_clear(EC_SCI_SRC_EBOOK);
247
248 if (!device_may_wakeup(&lid_switch_idev->dev)) {
249 cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
250 } else if ((lid_open && lid_wake_mode == LID_WAKE_OPEN) ||
251 (!lid_open && lid_wake_mode == LID_WAKE_CLOSE)) {
252 flip_lid_inverter();
253
254 /* we may have just caused an event */
255 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
256 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
257
258 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
259 }
260
261 return 0;
262}
263
264static int xo1_sci_resume(struct platform_device *pdev)
265{
266 /*
267 * We don't know what may have happened while we were asleep.
268 * Reestablish our lid setup so we're sure to catch all transitions.
269 */
270 detect_lid_state();
271 send_lid_state();
272 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
273
274 /* Enable all EC events */
275 olpc_ec_mask_write(EC_SCI_SRC_ALL);
276
277 /* Power/battery status might have changed too */
278 battery_status_changed();
279 ac_status_changed();
280 return 0;
281}
282
283static int __devinit setup_sci_interrupt(struct platform_device *pdev)
284{
285 u32 lo, hi;
286 u32 sts;
287 int r;
288
289 rdmsr(0x51400020, lo, hi);
290 sci_irq = (lo >> 20) & 15;
291
292 if (sci_irq) {
293 dev_info(&pdev->dev, "SCI is mapped to IRQ %d\n", sci_irq);
294 } else {
295 /* Zero means masked */
296 dev_info(&pdev->dev, "SCI unmapped. Mapping to IRQ 3\n");
297 sci_irq = 3;
298 lo |= 0x00300000;
299 wrmsrl(0x51400020, lo);
300 }
301
302 /* Select level triggered in PIC */
303 if (sci_irq < 8) {
304 lo = inb(CS5536_PIC_INT_SEL1);
305 lo |= 1 << sci_irq;
306 outb(lo, CS5536_PIC_INT_SEL1);
307 } else {
308 lo = inb(CS5536_PIC_INT_SEL2);
309 lo |= 1 << (sci_irq - 8);
310 outb(lo, CS5536_PIC_INT_SEL2);
311 }
312
313 /* Enable SCI from power button, and clear pending interrupts */
314 sts = inl(acpi_base + CS5536_PM1_STS);
315 outl((CS5536_PM_PWRBTN << 16) | 0xffff, acpi_base + CS5536_PM1_STS);
316
317 r = request_irq(sci_irq, xo1_sci_intr, 0, DRV_NAME, pdev);
318 if (r)
319 dev_err(&pdev->dev, "can't request interrupt\n");
320
321 return r;
322}
323
324static int __devinit setup_ec_sci(void)
325{
326 int r;
327
328 r = gpio_request(OLPC_GPIO_ECSCI, "OLPC-ECSCI");
329 if (r)
330 return r;
331
332 gpio_direction_input(OLPC_GPIO_ECSCI);
333
334 /* Clear pending EC SCI events */
335 cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
336 cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_POSITIVE_EDGE_STS);
337
338 /*
339 * Enable EC SCI events, and map them to both a PME and the SCI
340 * interrupt.
341 *
342 * Ordinarily, in addition to functioning as GPIOs, Geode GPIOs can
343 * be mapped to regular interrupts *or* Geode-specific Power
344 * Management Events (PMEs) - events that bring the system out of
345 * suspend. In this case, we want both of those things - the system
346 * wakeup, *and* the ability to get an interrupt when an event occurs.
347 *
348 * To achieve this, we map the GPIO to a PME, and then we use one
349 * of the many generic knobs on the CS5535 PIC to additionally map the
350 * PME to the regular SCI interrupt line.
351 */
352 cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_EVENTS_ENABLE);
353
354 /* Set the SCI to cause a PME event on group 7 */
355 cs5535_gpio_setup_event(OLPC_GPIO_ECSCI, 7, 1);
356
357 /* And have group 7 also fire the SCI interrupt */
358 cs5535_pic_unreqz_select_high(7, sci_irq);
359
360 return 0;
361}
362
363static void free_ec_sci(void)
364{
365 gpio_free(OLPC_GPIO_ECSCI);
366}
367
368static int __devinit setup_lid_events(void)
369{
370 int r;
371
372 r = gpio_request(OLPC_GPIO_LID, "OLPC-LID");
373 if (r)
374 return r;
375
376 gpio_direction_input(OLPC_GPIO_LID);
377
378 cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
379 lid_inverted = 0;
380
381 /* Clear edge detection and event enable for now */
382 cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
383 cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_EN);
384 cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_EN);
385 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
386 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
387
388 /* Set the LID to cause an PME event on group 6 */
389 cs5535_gpio_setup_event(OLPC_GPIO_LID, 6, 1);
390
391 /* Set PME group 6 to fire the SCI interrupt */
392 cs5535_gpio_set_irq(6, sci_irq);
393
394 /* Enable the event */
395 cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
396
397 return 0;
398}
399
400static void free_lid_events(void)
401{
402 gpio_free(OLPC_GPIO_LID);
403}
404
405static int __devinit setup_power_button(struct platform_device *pdev)
406{
407 int r;
408
409 power_button_idev = input_allocate_device();
410 if (!power_button_idev)
411 return -ENOMEM;
412
413 power_button_idev->name = "Power Button";
414 power_button_idev->phys = DRV_NAME "/input0";
415 set_bit(EV_KEY, power_button_idev->evbit);
416 set_bit(KEY_POWER, power_button_idev->keybit);
417
418 power_button_idev->dev.parent = &pdev->dev;
419 device_init_wakeup(&power_button_idev->dev, 1);
420
421 r = input_register_device(power_button_idev);
422 if (r) {
423 dev_err(&pdev->dev, "failed to register power button: %d\n", r);
424 input_free_device(power_button_idev);
425 }
426
427 return r;
428}
429
430static void free_power_button(void)
431{
432 input_unregister_device(power_button_idev);
433 input_free_device(power_button_idev);
434}
435
436static int __devinit setup_ebook_switch(struct platform_device *pdev)
437{
438 int r;
439
440 ebook_switch_idev = input_allocate_device();
441 if (!ebook_switch_idev)
442 return -ENOMEM;
443
444 ebook_switch_idev->name = "EBook Switch";
445 ebook_switch_idev->phys = DRV_NAME "/input1";
446 set_bit(EV_SW, ebook_switch_idev->evbit);
447 set_bit(SW_TABLET_MODE, ebook_switch_idev->swbit);
448
449 ebook_switch_idev->dev.parent = &pdev->dev;
450 device_set_wakeup_capable(&ebook_switch_idev->dev, true);
451
452 r = input_register_device(ebook_switch_idev);
453 if (r) {
454 dev_err(&pdev->dev, "failed to register ebook switch: %d\n", r);
455 input_free_device(ebook_switch_idev);
456 }
457
458 return r;
459}
460
461static void free_ebook_switch(void)
462{
463 input_unregister_device(ebook_switch_idev);
464 input_free_device(ebook_switch_idev);
465}
466
467static int __devinit setup_lid_switch(struct platform_device *pdev)
468{
469 int r;
470
471 lid_switch_idev = input_allocate_device();
472 if (!lid_switch_idev)
473 return -ENOMEM;
474
475 lid_switch_idev->name = "Lid Switch";
476 lid_switch_idev->phys = DRV_NAME "/input2";
477 set_bit(EV_SW, lid_switch_idev->evbit);
478 set_bit(SW_LID, lid_switch_idev->swbit);
479
480 lid_switch_idev->dev.parent = &pdev->dev;
481 device_set_wakeup_capable(&lid_switch_idev->dev, true);
482
483 r = input_register_device(lid_switch_idev);
484 if (r) {
485 dev_err(&pdev->dev, "failed to register lid switch: %d\n", r);
486 goto err_register;
487 }
488
489 r = device_create_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
490 if (r) {
491 dev_err(&pdev->dev, "failed to create wake mode attr: %d\n", r);
492 goto err_create_attr;
493 }
494
495 return 0;
496
497err_create_attr:
498 input_unregister_device(lid_switch_idev);
499err_register:
500 input_free_device(lid_switch_idev);
501 return r;
502}
503
504static void free_lid_switch(void)
505{
506 device_remove_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
507 input_unregister_device(lid_switch_idev);
508 input_free_device(lid_switch_idev);
509}
510
511static int __devinit xo1_sci_probe(struct platform_device *pdev)
512{
513 struct resource *res;
514 int r;
515
516 /* don't run on non-XOs */
517 if (!machine_is_olpc())
518 return -ENODEV;
519
520 r = mfd_cell_enable(pdev);
521 if (r)
522 return r;
523
524 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
525 if (!res) {
526 dev_err(&pdev->dev, "can't fetch device resource info\n");
527 return -EIO;
528 }
529 acpi_base = res->start;
530
531 r = setup_power_button(pdev);
532 if (r)
533 return r;
534
535 r = setup_ebook_switch(pdev);
536 if (r)
537 goto err_ebook;
538
539 r = setup_lid_switch(pdev);
540 if (r)
541 goto err_lid;
542
543 r = setup_lid_events();
544 if (r)
545 goto err_lidevt;
546
547 r = setup_ec_sci();
548 if (r)
549 goto err_ecsci;
550
551 /* Enable PME generation for EC-generated events */
552 outl(CS5536_GPIOM6_PME_EN | CS5536_GPIOM7_PME_EN,
553 acpi_base + CS5536_PM_GPE0_EN);
554
555 /* Clear pending events */
556 outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
557 process_sci_queue(false);
558
559 /* Initial sync */
560 send_ebook_state();
561 detect_lid_state();
562 send_lid_state();
563
564 r = setup_sci_interrupt(pdev);
565 if (r)
566 goto err_sci;
567
568 /* Enable all EC events */
569 olpc_ec_mask_write(EC_SCI_SRC_ALL);
570
571 return r;
572
573err_sci:
574 free_ec_sci();
575err_ecsci:
576 free_lid_events();
577err_lidevt:
578 free_lid_switch();
579err_lid:
580 free_ebook_switch();
581err_ebook:
582 free_power_button();
583 return r;
584}
585
586static int __devexit xo1_sci_remove(struct platform_device *pdev)
587{
588 mfd_cell_disable(pdev);
589 free_irq(sci_irq, pdev);
590 cancel_work_sync(&sci_work);
591 free_ec_sci();
592 free_lid_events();
593 free_lid_switch();
594 free_ebook_switch();
595 free_power_button();
596 acpi_base = 0;
597 return 0;
598}
599
600static struct platform_driver xo1_sci_driver = {
601 .driver = {
602 .name = "olpc-xo1-sci-acpi",
603 },
604 .probe = xo1_sci_probe,
605 .remove = __devexit_p(xo1_sci_remove),
606 .suspend = xo1_sci_suspend,
607 .resume = xo1_sci_resume,
608};
609
610static int __init xo1_sci_init(void)
611{
612 return platform_driver_register(&xo1_sci_driver);
613}
614arch_initcall(xo1_sci_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
deleted file mode 100644
index ab81fb27176..00000000000
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ /dev/null
@@ -1,146 +0,0 @@
1/*
2 * Support for features of the OLPC XO-1 laptop
3 *
4 * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
5 * Copyright (C) 2010 One Laptop per Child
6 * Copyright (C) 2006 Red Hat, Inc.
7 * Copyright (C) 2006 Advanced Micro Devices, Inc.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 */
14
15#include <linux/module.h>
16#include <linux/platform_device.h>
17#include <linux/pm.h>
18#include <linux/mfd/core.h>
19
20#include <asm/io.h>
21#include <asm/olpc.h>
22
23#define DRV_NAME "olpc-xo1"
24
25/* PMC registers (PMS block) */
26#define PM_SCLK 0x10
27#define PM_IN_SLPCTL 0x20
28#define PM_WKXD 0x34
29#define PM_WKD 0x30
30#define PM_SSC 0x54
31
32/* PM registers (ACPI block) */
33#define PM1_CNT 0x08
34#define PM_GPE0_STS 0x18
35
36static unsigned long acpi_base;
37static unsigned long pms_base;
38
39static void xo1_power_off(void)
40{
41 printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
42
43 /* Enable all of these controls with 0 delay */
44 outl(0x40000000, pms_base + PM_SCLK);
45 outl(0x40000000, pms_base + PM_IN_SLPCTL);
46 outl(0x40000000, pms_base + PM_WKXD);
47 outl(0x40000000, pms_base + PM_WKD);
48
49 /* Clear status bits (possibly unnecessary) */
50 outl(0x0002ffff, pms_base + PM_SSC);
51 outl(0xffffffff, acpi_base + PM_GPE0_STS);
52
53 /* Write SLP_EN bit to start the machinery */
54 outl(0x00002000, acpi_base + PM1_CNT);
55}
56
57static int __devinit olpc_xo1_probe(struct platform_device *pdev)
58{
59 struct resource *res;
60 int err;
61
62 /* don't run on non-XOs */
63 if (!machine_is_olpc())
64 return -ENODEV;
65
66 err = mfd_cell_enable(pdev);
67 if (err)
68 return err;
69
70 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
71 if (!res) {
72 dev_err(&pdev->dev, "can't fetch device resource info\n");
73 return -EIO;
74 }
75 if (strcmp(pdev->name, "cs5535-pms") == 0)
76 pms_base = res->start;
77 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
78 acpi_base = res->start;
79
80 /* If we have both addresses, we can override the poweroff hook */
81 if (pms_base && acpi_base) {
82 pm_power_off = xo1_power_off;
83 printk(KERN_INFO "OLPC XO-1 support registered\n");
84 }
85
86 return 0;
87}
88
89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
90{
91 mfd_cell_disable(pdev);
92
93 if (strcmp(pdev->name, "cs5535-pms") == 0)
94 pms_base = 0;
95 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
96 acpi_base = 0;
97
98 pm_power_off = NULL;
99 return 0;
100}
101
102static struct platform_driver cs5535_pms_drv = {
103 .driver = {
104 .name = "cs5535-pms",
105 .owner = THIS_MODULE,
106 },
107 .probe = olpc_xo1_probe,
108 .remove = __devexit_p(olpc_xo1_remove),
109};
110
111static struct platform_driver cs5535_acpi_drv = {
112 .driver = {
113 .name = "olpc-xo1-pm-acpi",
114 .owner = THIS_MODULE,
115 },
116 .probe = olpc_xo1_probe,
117 .remove = __devexit_p(olpc_xo1_remove),
118};
119
120static int __init olpc_xo1_init(void)
121{
122 int r;
123
124 r = platform_driver_register(&cs5535_pms_drv);
125 if (r)
126 return r;
127
128 r = platform_driver_register(&cs5535_acpi_drv);
129 if (r)
130 platform_driver_unregister(&cs5535_pms_drv);
131
132 return r;
133}
134
135static void __exit olpc_xo1_exit(void)
136{
137 platform_driver_unregister(&cs5535_acpi_drv);
138 platform_driver_unregister(&cs5535_pms_drv);
139}
140
141MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
142MODULE_LICENSE("GPL");
143MODULE_ALIAS("platform:cs5535-pms");
144
145module_init(olpc_xo1_init);
146module_exit(olpc_xo1_exit);
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
new file mode 100644
index 00000000000..2b235b77d9a
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -0,0 +1,168 @@
1/*
2 * Support for OLPC XO-1.5 System Control Interrupts (SCI)
3 *
4 * Copyright (C) 2009-2010 One Laptop per Child
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/device.h>
13#include <linux/slab.h>
14#include <linux/workqueue.h>
15#include <linux/power_supply.h>
16
17#include <acpi/acpi_bus.h>
18#include <acpi/acpi_drivers.h>
19#include <asm/olpc.h>
20
21#define DRV_NAME "olpc-xo15-sci"
22#define PFX DRV_NAME ": "
23#define XO15_SCI_CLASS DRV_NAME
24#define XO15_SCI_DEVICE_NAME "OLPC XO-1.5 SCI"
25
26static unsigned long xo15_sci_gpe;
27
28static void battery_status_changed(void)
29{
30 struct power_supply *psy = power_supply_get_by_name("olpc-battery");
31
32 if (psy) {
33 power_supply_changed(psy);
34 put_device(psy->dev);
35 }
36}
37
38static void ac_status_changed(void)
39{
40 struct power_supply *psy = power_supply_get_by_name("olpc-ac");
41
42 if (psy) {
43 power_supply_changed(psy);
44 put_device(psy->dev);
45 }
46}
47
48static void process_sci_queue(void)
49{
50 u16 data;
51 int r;
52
53 do {
54 r = olpc_ec_sci_query(&data);
55 if (r || !data)
56 break;
57
58 pr_debug(PFX "SCI 0x%x received\n", data);
59
60 switch (data) {
61 case EC_SCI_SRC_BATERR:
62 case EC_SCI_SRC_BATSOC:
63 case EC_SCI_SRC_BATTERY:
64 case EC_SCI_SRC_BATCRIT:
65 battery_status_changed();
66 break;
67 case EC_SCI_SRC_ACPWR:
68 ac_status_changed();
69 break;
70 }
71 } while (data);
72
73 if (r)
74 pr_err(PFX "Failed to clear SCI queue");
75}
76
77static void process_sci_queue_work(struct work_struct *work)
78{
79 process_sci_queue();
80}
81
82static DECLARE_WORK(sci_work, process_sci_queue_work);
83
84static u32 xo15_sci_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context)
85{
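	/*
	 * olpc_ec_cmd() can be slow (it may even time out), so the EC queue is
	 * drained from a workqueue; ACPI_REENABLE_GPE in the return value asks
	 * ACPICA to re-arm the GPE once this handler completes.
	 */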
86 schedule_work(&sci_work);
87 return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE;
88}
89
90static int xo15_sci_add(struct acpi_device *device)
91{
92 unsigned long long tmp;
93 acpi_status status;
94
95 if (!device)
96 return -EINVAL;
97
98 strcpy(acpi_device_name(device), XO15_SCI_DEVICE_NAME);
99 strcpy(acpi_device_class(device), XO15_SCI_CLASS);
100
101 /* Get GPE bit assignment (EC events). */
102 status = acpi_evaluate_integer(device->handle, "_GPE", NULL, &tmp);
103 if (ACPI_FAILURE(status))
104 return -EINVAL;
105
106 xo15_sci_gpe = tmp;
107 status = acpi_install_gpe_handler(NULL, xo15_sci_gpe,
108 ACPI_GPE_EDGE_TRIGGERED,
109 xo15_sci_gpe_handler, device);
110 if (ACPI_FAILURE(status))
111 return -ENODEV;
112
113 dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe);
114
115 /* Flush queue, and enable all SCI events */
116 process_sci_queue();
117 olpc_ec_mask_write(EC_SCI_SRC_ALL);
118
119 acpi_enable_gpe(NULL, xo15_sci_gpe);
120
121 /* Enable wake-on-EC */
122 if (device->wakeup.flags.valid)
123 device_init_wakeup(&device->dev, true);
124
125 return 0;
126}
127
128static int xo15_sci_remove(struct acpi_device *device, int type)
129{
130 acpi_disable_gpe(NULL, xo15_sci_gpe);
131 acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
132 cancel_work_sync(&sci_work);
133 return 0;
134}
135
136static int xo15_sci_resume(struct acpi_device *device)
137{
138 /* Enable all EC events */
139 olpc_ec_mask_write(EC_SCI_SRC_ALL);
140
141 /* Power/battery status might have changed */
142 battery_status_changed();
143 ac_status_changed();
144
145 return 0;
146}
147
148static const struct acpi_device_id xo15_sci_device_ids[] = {
149 {"XO15EC", 0},
150 {"", 0},
151};
152
153static struct acpi_driver xo15_sci_drv = {
154 .name = DRV_NAME,
155 .class = XO15_SCI_CLASS,
156 .ids = xo15_sci_device_ids,
157 .ops = {
158 .add = xo15_sci_add,
159 .remove = xo15_sci_remove,
160 .resume = xo15_sci_resume,
161 },
162};
163
164static int __init xo15_sci_init(void)
165{
166 return acpi_bus_register_driver(&xo15_sci_drv);
167}
168device_initcall(xo15_sci_init);
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index 0060fd59ea0..7cce722667b 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -19,6 +19,7 @@
 #include <linux/string.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/geode.h>
 #include <asm/setup.h>
@@ -30,6 +31,9 @@ EXPORT_SYMBOL_GPL(olpc_platform_info);
 
 static DEFINE_SPINLOCK(ec_lock);
 
+/* EC event mask to be applied during suspend (defining wakeup sources). */
+static u16 ec_wakeup_mask;
+
 /* what the timeout *should* be (in ms) */
 #define EC_BASE_TIMEOUT 20
 
@@ -157,13 +161,13 @@ restart:
 	if (inbuf && inlen) {
 		/* write data to EC */
 		for (i = 0; i < inlen; i++) {
+			pr_devel("olpc-ec:  sending cmd arg 0x%x\n", inbuf[i]);
+			outb(inbuf[i], 0x68);
 			if (wait_on_ibf(0x6c, 0)) {
 				printk(KERN_ERR "olpc-ec:  timeout waiting for"
 						" EC accept data!\n");
 				goto err;
 			}
-			pr_devel("olpc-ec:  sending cmd arg 0x%x\n", inbuf[i]);
-			outb(inbuf[i], 0x68);
 		}
 	}
 	if (outbuf && outlen) {
@@ -188,6 +192,88 @@ err:
 }
 EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 
195void olpc_ec_wakeup_set(u16 value)
196{
197 ec_wakeup_mask |= value;
198}
199EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
200
201void olpc_ec_wakeup_clear(u16 value)
202{
203 ec_wakeup_mask &= ~value;
204}
205EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
206
207/*
208 * Returns true if the compile and runtime configurations allow for EC events
209 * to wake the system.
210 */
211bool olpc_ec_wakeup_available(void)
212{
213 if (!machine_is_olpc())
214 return false;
215
216 /*
217 * XO-1 EC wakeups are available when olpc-xo1-sci driver is
218 * compiled in
219 */
220#ifdef CONFIG_OLPC_XO1_SCI
221 if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */
222 return true;
223#endif
224
225 /*
226 * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
227 * compiled in
228 */
229#ifdef CONFIG_OLPC_XO15_SCI
230 if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */
231 return true;
232#endif
233
234 return false;
235}
236EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
237
238int olpc_ec_mask_write(u16 bits)
239{
240 if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
241 __be16 ec_word = cpu_to_be16(bits);
242 return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2,
243 NULL, 0);
244 } else {
245 unsigned char ec_byte = bits & 0xff;
246 return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
247 }
248}
249EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
250
251int olpc_ec_sci_query(u16 *sci_value)
252{
253 int ret;
254
255 if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
256 __be16 ec_word;
257 ret = olpc_ec_cmd(EC_EXT_SCI_QUERY,
258 NULL, 0, (void *) &ec_word, 2);
259 if (ret == 0)
260 *sci_value = be16_to_cpu(ec_word);
261 } else {
262 unsigned char ec_byte;
263 ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
264 if (ret == 0)
265 *sci_value = ec_byte;
266 }
267
268 return ret;
269}
270EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
271
272static int olpc_ec_suspend(void)
273{
274 return olpc_ec_mask_write(ec_wakeup_mask);
275}
276
 static bool __init check_ofw_architecture(struct device_node *root)
 {
 	const char *olpc_arch;
@@ -242,6 +328,10 @@ static int __init add_xo1_platform_devices(void)
 	return 0;
 }
 
+static struct syscore_ops olpc_syscore_ops = {
+	.suspend = olpc_ec_suspend,
+};
+
 static int __init olpc_init(void)
 {
 	int r = 0;
@@ -266,6 +356,9 @@ static int __init olpc_init(void)
 			!cs5535_has_vsa2())
 		x86_init.pci.arch_init = pci_olpc_init;
 #endif
+	/* EC version 0x5f adds support for wide SCI mask */
+	if (olpc_platform_info.ecver >= 0x5f)
+		olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
 
 	printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
 		((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
@@ -278,6 +371,8 @@ static int __init olpc_init(void)
 		return r;
 	}
 
+	register_syscore_ops(&olpc_syscore_ops);
+
 	return 0;
 }
 
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index d39f63d017d..d6ee9298692 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -165,6 +165,107 @@ static struct of_pdt_ops prom_olpc_ops __initdata = {
 	.pkg2path = olpc_dt_pkg2path,
 };
 
168static phandle __init olpc_dt_finddevice(const char *path)
169{
170 phandle node;
171 const void *args[] = { path };
172 void *res[] = { &node };
173
174 if (olpc_ofw("finddevice", args, res)) {
175 pr_err("olpc_dt: finddevice failed!\n");
176 return 0;
177 }
178
179 if ((s32) node == -1)
180 return 0;
181
182 return node;
183}
184
185static int __init olpc_dt_interpret(const char *words)
186{
187 int result;
188 const void *args[] = { words };
189 void *res[] = { &result };
190
191 if (olpc_ofw("interpret", args, res)) {
192 pr_err("olpc_dt: interpret failed!\n");
193 return -1;
194 }
195
196 return result;
197}
198
199/*
200 * Extract board revision directly from OFW device tree.
201 * We can't use olpc_platform_info because that hasn't been set up yet.
202 */
203static u32 __init olpc_dt_get_board_revision(void)
204{
205 phandle node;
206 __be32 rev;
207 int r;
208
209 node = olpc_dt_finddevice("/");
210 if (!node)
211 return 0;
212
213 r = olpc_dt_getproperty(node, "board-revision-int",
214 (char *) &rev, sizeof(rev));
215 if (r < 0)
216 return 0;
217
218 return be32_to_cpu(rev);
219}
220
221void __init olpc_dt_fixup(void)
222{
223 int r;
224 char buf[64];
225 phandle node;
226 u32 board_rev;
227
228 node = olpc_dt_finddevice("/battery@0");
229 if (!node)
230 return;
231
232 /*
233 * If the battery node has a compatible property, we are running a new
234 * enough firmware and don't have fixups to make.
235 */
236 r = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf));
237 if (r > 0)
238 return;
239
240 pr_info("PROM DT: Old firmware detected, applying fixes\n");
241
242 /* Add olpc,xo1-battery compatible marker to battery node */
243 olpc_dt_interpret("\" /battery@0\" find-device"
244 " \" olpc,xo1-battery\" +compatible"
245 " device-end");
246
247 board_rev = olpc_dt_get_board_revision();
248 if (!board_rev)
249 return;
250
251 if (board_rev >= olpc_board_pre(0xd0)) {
252 /* XO-1.5: add dcon device */
253 olpc_dt_interpret("\" /pci/display@1\" find-device"
254 " new-device"
255 " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
256 " finish-device device-end");
257 } else {
258 /* XO-1: add dcon device, mark RTC as olpc,xo1-rtc */
259 olpc_dt_interpret("\" /pci/display@1,1\" find-device"
260 " new-device"
261 " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
262 " finish-device device-end"
263 " \" /rtc\" find-device"
264 " \" olpc,xo1-rtc\" +compatible"
265 " device-end");
266 }
267}
268
168void __init olpc_dt_build_devicetree(void) 269void __init olpc_dt_build_devicetree(void)
169{ 270{
170 phandle root; 271 phandle root;
@@ -172,6 +273,8 @@ void __init olpc_dt_build_devicetree(void)
172 if (!olpc_ofw_is_installed()) 273 if (!olpc_ofw_is_installed())
173 return; 274 return;
174 275
276 olpc_dt_fixup();
277
175 root = olpc_dt_getsibling(0); 278 root = olpc_dt_getsibling(0);
176 if (!root) { 279 if (!root) {
177 pr_err("PROM: unable to get root node from OFW!\n"); 280 pr_err("PROM: unable to get root node from OFW!\n");
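olpc_dt_get_board_revision() above is one instance of a general pattern: OFW hands back property cells in big-endian order, so every 32-bit read is an olpc_dt_getproperty() call followed by be32_to_cpu(). A hedged sketch of that pattern (olpc_dt_get_u32 is a hypothetical helper, and olpc_dt_getproperty() is assumed to have the (node, name, buf, len) signature used by the callers above):

    /* Sketch only: read a 32-bit OFW property into CPU byte order.
     * Returns 0 if the property is missing, mirroring the callers above. */
    static u32 __init olpc_dt_get_u32(phandle node, const char *propname)
    {
            __be32 val;

            if (olpc_dt_getproperty(node, propname, (char *)&val, sizeof(val)) < 0)
                    return 0;
            return be32_to_cpu(val);
    }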
diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S
new file mode 100644
index 00000000000..948deb28975
--- /dev/null
+++ b/arch/x86/platform/olpc/xo1-wakeup.S
@@ -0,0 +1,124 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/page.h>
5#include <asm/pgtable_32.h>
6
7 .macro writepost,value
8 movb $0x34, %al
9 outb %al, $0x70
10 movb $\value, %al
11 outb %al, $0x71
12 .endm
13
14wakeup_start:
15 # OFW lands us here, running in protected mode, with a
16 # kernel-compatible GDT already set up.

17
18 # Clear any dangerous flags
19 pushl $0
20 popfl
21
22 writepost 0x31
23
24 # Set up %cr3
25 movl $initial_page_table - __PAGE_OFFSET, %eax
26 movl %eax, %cr3
27
28 movl saved_cr4, %eax
29 movl %eax, %cr4
30
31 movl saved_cr0, %eax
32 movl %eax, %cr0
33
34 # Control registers were modified, pipeline resync is needed
35 jmp 1f
361:
37
38 movw $__KERNEL_DS, %ax
39 movw %ax, %ss
40 movw %ax, %ds
41 movw %ax, %es
42 movw %ax, %fs
43 movw %ax, %gs
44
45 lgdt saved_gdt
46 lidt saved_idt
47 lldt saved_ldt
48 ljmp $(__KERNEL_CS),$1f
491:
50 movl %cr3, %eax
51 movl %eax, %cr3
52 wbinvd
53
54 # Go back to the return point
55 jmp ret_point
56
57save_registers:
58 sgdt saved_gdt
59 sidt saved_idt
60 sldt saved_ldt
61
62 pushl %edx
63 movl %cr4, %edx
64 movl %edx, saved_cr4
65
66 movl %cr0, %edx
67 movl %edx, saved_cr0
68
69 popl %edx
70
71 movl %ebx, saved_context_ebx
72 movl %ebp, saved_context_ebp
73 movl %esi, saved_context_esi
74 movl %edi, saved_context_edi
75
76 pushfl
77 popl saved_context_eflags
78
79 ret
80
81restore_registers:
82 movl saved_context_ebp, %ebp
83 movl saved_context_ebx, %ebx
84 movl saved_context_esi, %esi
85 movl saved_context_edi, %edi
86
87 pushl saved_context_eflags
88 popfl
89
90 ret
91
92ENTRY(do_olpc_suspend_lowlevel)
93 call save_processor_state
94 call save_registers
95
96 # This is the stack context we want to remember
97 movl %esp, saved_context_esp
98
99 pushl $3
100 call xo1_do_sleep
101
102 jmp wakeup_start
103 .p2align 4,,7
104ret_point:
105 movl saved_context_esp, %esp
106
107 writepost 0x32
108
109 call restore_registers
110 call restore_processor_state
111 ret
112
113.data
114saved_gdt: .long 0,0
115saved_idt: .long 0,0
116saved_ldt: .long 0
117saved_cr4: .long 0
118saved_cr0: .long 0
119saved_context_esp: .long 0
120saved_context_edi: .long 0
121saved_context_esi: .long 0
122saved_context_ebx: .long 0
123saved_context_ebp: .long 0
124saved_context_eflags: .long 0
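The stub above only makes sense together with its C caller: do_olpc_suspend_lowlevel() snapshots processor and register state, goes to sleep via xo1_do_sleep(3), and execution later resumes at wakeup_start, which rebuilds %cr0/%cr3/%cr4 and the descriptor tables before jumping back to ret_point. A sketch of how the C side might invoke it (the xo1_suspend_enter name and the xo1_do_sleep prototype are assumptions; only the two symbols themselves appear in this patch):

    /* Sketch of the C-side caller; not part of this patch. */
    #include <linux/suspend.h>

    extern void do_olpc_suspend_lowlevel(void);
    extern void xo1_do_sleep(int sleep_state);      /* assumed prototype */

    static int xo1_suspend_enter(suspend_state_t pm_state)
    {
            /* Returns only after the wakeup path above has run through ret_point. */
            do_olpc_suspend_lowlevel();
            return 0;
    }

The writepost macro, for its part, appears to stash a progress byte in CMOS offset 0x34 (index port 0x70, data port 0x71), giving a crude breadcrumb when a resume fails partway through.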
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 68e467f69fe..5b552198f77 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -115,9 +115,6 @@ early_param("nobau", setup_nobau);
115 115
116/* base pnode in this partition */ 116/* base pnode in this partition */
117static int uv_base_pnode __read_mostly; 117static int uv_base_pnode __read_mostly;
118/* position of pnode (which is nasid>>1): */
119static int uv_nshift __read_mostly;
120static unsigned long uv_mmask __read_mostly;
121 118
122static DEFINE_PER_CPU(struct ptc_stats, ptcstats); 119static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
123static DEFINE_PER_CPU(struct bau_control, bau_control); 120static DEFINE_PER_CPU(struct bau_control, bau_control);
@@ -296,14 +293,18 @@ static void bau_process_message(struct msg_desc *mdp,
296} 293}
297 294
298/* 295/*
299 * Determine the first cpu on a uvhub. 296 * Determine the first cpu on a pnode.
300 */ 297 */
301static int uvhub_to_first_cpu(int uvhub) 298static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
302{ 299{
303 int cpu; 300 int cpu;
304 for_each_present_cpu(cpu) 301 struct hub_and_pnode *hpp;
305 if (uvhub == uv_cpu_to_blade_id(cpu)) 302
303 for_each_present_cpu(cpu) {
304 hpp = &smaster->thp[cpu];
305 if (pnode == hpp->pnode)
306 return cpu; 306 return cpu;
307 }
307 return -1; 308 return -1;
308} 309}
309 310
@@ -366,28 +367,32 @@ static void do_reset(void *ptr)
366 * Use IPI to get all target uvhubs to release resources held by 367 * Use IPI to get all target uvhubs to release resources held by
367 * a given sending cpu number. 368 * a given sending cpu number.
368 */ 369 */
369static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender) 370static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
370{ 371{
371 int uvhub; 372 int pnode;
373 int apnode;
372 int maskbits; 374 int maskbits;
373 cpumask_t mask; 375 int sender = bcp->cpu;
376 cpumask_t *mask = bcp->uvhub_master->cpumask;
377 struct bau_control *smaster = bcp->socket_master;
374 struct reset_args reset_args; 378 struct reset_args reset_args;
375 379
376 reset_args.sender = sender; 380 reset_args.sender = sender;
377 cpus_clear(mask); 381 cpus_clear(*mask);
378 /* find a single cpu for each uvhub in this distribution mask */ 382 /* find a single cpu for each uvhub in this distribution mask */
379 maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE; 383 maskbits = sizeof(struct pnmask) * BITSPERBYTE;
380 for (uvhub = 0; uvhub < maskbits; uvhub++) { 384 /* each bit is a pnode relative to the partition base pnode */
385 for (pnode = 0; pnode < maskbits; pnode++) {
381 int cpu; 386 int cpu;
382 if (!bau_uvhub_isset(uvhub, distribution)) 387 if (!bau_uvhub_isset(pnode, distribution))
383 continue; 388 continue;
384 /* find a cpu for this uvhub */ 389 apnode = pnode + bcp->partition_base_pnode;
385 cpu = uvhub_to_first_cpu(uvhub); 390 cpu = pnode_to_first_cpu(apnode, smaster);
386 cpu_set(cpu, mask); 391 cpu_set(cpu, *mask);
387 } 392 }
388 393
389 /* IPI all cpus; preemption is already disabled */ 394 /* IPI all cpus; preemption is already disabled */
390 smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1); 395 smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
391 return; 396 return;
392} 397}
393 398
@@ -604,7 +609,7 @@ static void destination_plugged(struct bau_desc *bau_desc,
604 quiesce_local_uvhub(hmaster); 609 quiesce_local_uvhub(hmaster);
605 610
606 spin_lock(&hmaster->queue_lock); 611 spin_lock(&hmaster->queue_lock);
607 reset_with_ipi(&bau_desc->distribution, bcp->cpu); 612 reset_with_ipi(&bau_desc->distribution, bcp);
608 spin_unlock(&hmaster->queue_lock); 613 spin_unlock(&hmaster->queue_lock);
609 614
610 end_uvhub_quiesce(hmaster); 615 end_uvhub_quiesce(hmaster);
@@ -626,7 +631,7 @@ static void destination_timeout(struct bau_desc *bau_desc,
626 quiesce_local_uvhub(hmaster); 631 quiesce_local_uvhub(hmaster);
627 632
628 spin_lock(&hmaster->queue_lock); 633 spin_lock(&hmaster->queue_lock);
629 reset_with_ipi(&bau_desc->distribution, bcp->cpu); 634 reset_with_ipi(&bau_desc->distribution, bcp);
630 spin_unlock(&hmaster->queue_lock); 635 spin_unlock(&hmaster->queue_lock);
631 636
632 end_uvhub_quiesce(hmaster); 637 end_uvhub_quiesce(hmaster);
@@ -1334,9 +1339,10 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
1334 1339
1335 instr[count] = '\0'; 1340 instr[count] = '\0';
1336 1341
1337 bcp = &per_cpu(bau_control, smp_processor_id()); 1342 cpu = get_cpu();
1338 1343 bcp = &per_cpu(bau_control, cpu);
1339 ret = parse_tunables_write(bcp, instr, count); 1344 ret = parse_tunables_write(bcp, instr, count);
1345 put_cpu();
1340 if (ret) 1346 if (ret)
1341 return ret; 1347 return ret;
1342 1348
@@ -1426,7 +1432,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1426{ 1432{
1427 int i; 1433 int i;
1428 int cpu; 1434 int cpu;
1429 unsigned long pa; 1435 unsigned long gpa;
1430 unsigned long m; 1436 unsigned long m;
1431 unsigned long n; 1437 unsigned long n;
1432 size_t dsize; 1438 size_t dsize;
@@ -1442,9 +1448,9 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
1442 bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); 1448 bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
1443 BUG_ON(!bau_desc); 1449 BUG_ON(!bau_desc);
1444 1450
1445 pa = uv_gpa(bau_desc); /* need the real nasid*/ 1451 gpa = uv_gpa(bau_desc);
1446 n = pa >> uv_nshift; 1452 n = uv_gpa_to_gnode(gpa);
1447 m = pa & uv_mmask; 1453 m = uv_gpa_to_offset(gpa);
1448 1454
1449 /* the 14-bit pnode */ 1455 /* the 14-bit pnode */
1450 write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); 1456 write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
@@ -1516,9 +1522,9 @@ static void pq_init(int node, int pnode)
1516 bcp->queue_last = pqp + (DEST_Q_SIZE - 1); 1522 bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
1517 } 1523 }
1518 /* 1524 /*
1519 * need the pnode of where the memory was really allocated 1525 * need the gnode of where the memory was really allocated
1520 */ 1526 */
1521 pn = uv_gpa(pqp) >> uv_nshift; 1527 pn = uv_gpa_to_gnode(uv_gpa(pqp));
1522 first = uv_physnodeaddr(pqp); 1528 first = uv_physnodeaddr(pqp);
1523 pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; 1529 pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
1524 last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); 1530 last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
@@ -1687,6 +1693,16 @@ static void make_per_cpu_thp(struct bau_control *smaster)
1687} 1693}
1688 1694
1689/* 1695/*
1696 * Each uvhub is to get a local cpumask.
1697 */
1698static void make_per_hub_cpumask(struct bau_control *hmaster)
1699{
1700 int sz = sizeof(cpumask_t);
1701
1702 hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
1703}
1704
1705/*
1690 * Initialize all the per_cpu information for the cpu's on a given socket, 1706 * Initialize all the per_cpu information for the cpu's on a given socket,
1691 * given what has been gathered into the socket_desc struct. 1707 * given what has been gathered into the socket_desc struct.
1692 * And reports the chosen hub and socket masters back to the caller. 1708 * And reports the chosen hub and socket masters back to the caller.
@@ -1751,11 +1767,12 @@ static int __init summarize_uvhub_sockets(int nuvhubs,
1751 sdp = &bdp->socket[socket]; 1767 sdp = &bdp->socket[socket];
1752 if (scan_sock(sdp, bdp, &smaster, &hmaster)) 1768 if (scan_sock(sdp, bdp, &smaster, &hmaster))
1753 return 1; 1769 return 1;
1770 make_per_cpu_thp(smaster);
1754 } 1771 }
1755 socket++; 1772 socket++;
1756 socket_mask = (socket_mask >> 1); 1773 socket_mask = (socket_mask >> 1);
1757 make_per_cpu_thp(smaster);
1758 } 1774 }
1775 make_per_hub_cpumask(hmaster);
1759 } 1776 }
1760 return 0; 1777 return 0;
1761} 1778}
@@ -1777,15 +1794,20 @@ static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
1777 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); 1794 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1778 1795
1779 if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) 1796 if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
1780 return 1; 1797 goto fail;
1781 1798
1782 if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask)) 1799 if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
1783 return 1; 1800 goto fail;
1784 1801
1785 kfree(uvhub_descs); 1802 kfree(uvhub_descs);
1786 kfree(uvhub_mask); 1803 kfree(uvhub_mask);
1787 init_per_cpu_tunables(); 1804 init_per_cpu_tunables();
1788 return 0; 1805 return 0;
1806
1807fail:
1808 kfree(uvhub_descs);
1809 kfree(uvhub_mask);
1810 return 1;
1789} 1811}
1790 1812
1791/* 1813/*
@@ -1812,8 +1834,6 @@ static int __init uv_bau_init(void)
1812 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); 1834 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
1813 } 1835 }
1814 1836
1815 uv_nshift = uv_hub_info->m_val;
1816 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1817 nuvhubs = uv_num_possible_blades(); 1837 nuvhubs = uv_num_possible_blades();
1818 spin_lock_init(&disable_lock); 1838 spin_lock_init(&disable_lock);
1819 congested_cycles = usec_2_cycles(congested_respns_us); 1839 congested_cycles = usec_2_cycles(congested_respns_us);
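The deleted uv_nshift/uv_mmask globals existed only to split a UV global physical address into its gnode and node-offset parts; uv_gpa_to_gnode()/uv_gpa_to_offset() encode the same split without per-file state. Written in terms of the removed initialization (m_val being the node-offset width), the equivalence this relies on is roughly the following sketch; the real implementations come from the UV hub headers:

    /* Sketch of what the helpers compute, using the variables the patch removes. */
    static inline unsigned long sketch_gpa_to_gnode(unsigned long gpa)
    {
            return gpa >> uv_hub_info->m_val;               /* was: gpa >> uv_nshift */
    }

    static inline unsigned long sketch_gpa_to_offset(unsigned long gpa)
    {
            return gpa & ((1UL << uv_hub_info->m_val) - 1); /* was: gpa & uv_mmask   */
    }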
diff --git a/arch/x86/realmode/rm/.gitignore b/arch/x86/realmode/rm/.gitignore
new file mode 100644
index 00000000000..b6ed3a2555c
--- /dev/null
+++ b/arch/x86/realmode/rm/.gitignore
@@ -0,0 +1,3 @@
1pasyms.h
2realmode.lds
3realmode.relocs
diff --git a/arch/x86/tools/.gitignore b/arch/x86/tools/.gitignore
new file mode 100644
index 00000000000..be0ed065249
--- /dev/null
+++ b/arch/x86/tools/.gitignore
@@ -0,0 +1 @@
relocs
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore
index 60274d5746e..3282874bc61 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/vdso/.gitignore
@@ -1,5 +1,7 @@
1vdso.lds 1vdso.lds
2vdso-syms.lds 2vdso-syms.lds
3vdsox32.lds
4vdsox32-syms.lds
3vdso32-syms.lds 5vdso32-syms.lds
4vdso32-syscall-syms.lds 6vdso32-syscall-syms.lds
5vdso32-sysenter-syms.lds 7vdso32-sysenter-syms.lds
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index bef0bc96240..5d179502a52 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
26export CPPFLAGS_vdso.lds += -P -C 26export CPPFLAGS_vdso.lds += -P -C
27 27
28VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ 28VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
29 -Wl,--no-undefined \
29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 30 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
30 31
31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so 32$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index a724905fdae..6bc0e723b6e 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -6,7 +6,6 @@
6 * 6 *
7 * The code should have no internal unresolved relocations. 7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing. 8 * Check with readelf after changing.
9 * Also alternative() doesn't work.
10 */ 9 */
11 10
12/* Disable profiling for userspace code: */ 11/* Disable profiling for userspace code: */
@@ -17,6 +16,7 @@
17#include <linux/time.h> 16#include <linux/time.h>
18#include <linux/string.h> 17#include <linux/string.h>
19#include <asm/vsyscall.h> 18#include <asm/vsyscall.h>
19#include <asm/fixmap.h>
20#include <asm/vgtod.h> 20#include <asm/vgtod.h>
21#include <asm/timex.h> 21#include <asm/timex.h>
22#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -25,6 +25,43 @@
25 25
26#define gtod (&VVAR(vsyscall_gtod_data)) 26#define gtod (&VVAR(vsyscall_gtod_data))
27 27
28notrace static cycle_t vread_tsc(void)
29{
30 cycle_t ret;
31 u64 last;
32
33 /*
34 * Empirically, a fence (of type that depends on the CPU)
35 * before rdtsc is enough to ensure that rdtsc is ordered
36 * with respect to loads. The various CPU manuals are unclear
37 * as to whether rdtsc can be reordered with later loads,
38 * but no one has ever seen it happen.
39 */
40 rdtsc_barrier();
41 ret = (cycle_t)vget_cycles();
42
43 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
44
45 if (likely(ret >= last))
46 return ret;
47
48 /*
49 * GCC likes to generate cmov here, but this branch is extremely
49 * predictable (it's just a function of time and the likely is
51 * very likely) and there's a data dependence, so force GCC
52 * to generate a branch instead. I don't barrier() because
53 * we don't actually need a barrier, and if this function
54 * ever gets inlined it will generate worse code.
55 */
56 asm volatile ("");
57 return last;
58}
59
60static notrace cycle_t vread_hpet(void)
61{
62 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
63}
64
28notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 65notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
29{ 66{
30 long ret; 67 long ret;
@@ -36,9 +73,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
36notrace static inline long vgetns(void) 73notrace static inline long vgetns(void)
37{ 74{
38 long v; 75 long v;
39 cycles_t (*vread)(void); 76 cycles_t cycles;
40 vread = gtod->clock.vread; 77 if (gtod->clock.vclock_mode == VCLOCK_TSC)
41 v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; 78 cycles = vread_tsc();
79 else
80 cycles = vread_hpet();
81 v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
42 return (v * gtod->clock.mult) >> gtod->clock.shift; 82 return (v * gtod->clock.mult) >> gtod->clock.shift;
43} 83}
44 84
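vgetns() turns the raw cycle delta into nanoseconds with the clocksource's fixed-point pair: ns = ((cycles - cycle_last) & mask) * mult >> shift. A standalone userspace illustration of that arithmetic, with invented mult/shift values (the real ones come from the timekeeping core):

    /* Userspace illustration of the vgetns() arithmetic; mult/shift are
     * invented for a ~2.8 GHz TSC (mult ~= (1e9 / 2.8e9) * 2^22). */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t cycle_last = 1000000, now = 1002800;   /* 2800 cycles elapsed */
            uint64_t mask  = ~0ULL;                         /* full-width counter  */
            uint32_t mult  = 1497966;
            uint32_t shift = 22;

            uint64_t delta = (now - cycle_last) & mask;
            uint64_t ns    = (delta * mult) >> shift;       /* ~1000 ns */

            printf("%llu ns\n", (unsigned long long)ns);
            return 0;
    }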
@@ -116,21 +156,21 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
116 156
117notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 157notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
118{ 158{
119 if (likely(gtod->sysctl_enabled)) 159 switch (clock) {
120 switch (clock) { 160 case CLOCK_REALTIME:
121 case CLOCK_REALTIME: 161 if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
122 if (likely(gtod->clock.vread)) 162 return do_realtime(ts);
123 return do_realtime(ts); 163 break;
124 break; 164 case CLOCK_MONOTONIC:
125 case CLOCK_MONOTONIC: 165 if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
126 if (likely(gtod->clock.vread)) 166 return do_monotonic(ts);
127 return do_monotonic(ts); 167 break;
128 break; 168 case CLOCK_REALTIME_COARSE:
129 case CLOCK_REALTIME_COARSE: 169 return do_realtime_coarse(ts);
130 return do_realtime_coarse(ts); 170 case CLOCK_MONOTONIC_COARSE:
131 case CLOCK_MONOTONIC_COARSE: 171 return do_monotonic_coarse(ts);
132 return do_monotonic_coarse(ts); 172 }
133 } 173
134 return vdso_fallback_gettime(clock, ts); 174 return vdso_fallback_gettime(clock, ts);
135} 175}
136int clock_gettime(clockid_t, struct timespec *) 176int clock_gettime(clockid_t, struct timespec *)
@@ -139,7 +179,7 @@ int clock_gettime(clockid_t, struct timespec *)
139notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 179notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
140{ 180{
141 long ret; 181 long ret;
142 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { 182 if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
143 if (likely(tv != NULL)) { 183 if (likely(tv != NULL)) {
144 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != 184 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
145 offsetof(struct timespec, tv_nsec) || 185 offsetof(struct timespec, tv_nsec) ||
@@ -161,27 +201,14 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
161int gettimeofday(struct timeval *, struct timezone *) 201int gettimeofday(struct timeval *, struct timezone *)
162 __attribute__((weak, alias("__vdso_gettimeofday"))); 202 __attribute__((weak, alias("__vdso_gettimeofday")));
163 203
164/* This will break when the xtime seconds get inaccurate, but that is 204/*
165 * unlikely */ 205 * This will break when the xtime seconds get inaccurate, but that is
166 206 * unlikely
167static __always_inline long time_syscall(long *t) 207 */
168{
169 long secs;
170 asm volatile("syscall"
171 : "=a" (secs)
172 : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
173 return secs;
174}
175
176notrace time_t __vdso_time(time_t *t) 208notrace time_t __vdso_time(time_t *t)
177{ 209{
178 time_t result;
179
180 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
181 return time_syscall(t);
182
183 /* This is atomic on x86_64 so we don't need any locks. */ 210 /* This is atomic on x86_64 so we don't need any locks. */
184 result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); 211 time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
185 212
186 if (t) 213 if (t)
187 *t = result; 214 *t = result;
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 1d3aa6b8718..01f5e3b4613 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,10 +1,22 @@
1#include <asm/page_types.h>
2#include <linux/linkage.h>
1#include <linux/init.h> 3#include <linux/init.h>
2 4
3__INITDATA 5__PAGE_ALIGNED_DATA
4 6
5 .globl vdso_start, vdso_end 7 .globl vdso_start, vdso_end
8 .align PAGE_SIZE
6vdso_start: 9vdso_start:
7 .incbin "arch/x86/vdso/vdso.so" 10 .incbin "arch/x86/vdso/vdso.so"
8vdso_end: 11vdso_end:
12 .align PAGE_SIZE /* extra data here leaks to userspace. */
9 13
10__FINIT 14.previous
15
16 .globl vdso_pages
17 .bss
18 .align 8
19 .type vdso_pages, @object
20vdso_pages:
21 .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
22 .size vdso_pages, .-vdso_pages
diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/vdso/vdso32/sysenter.S
index e2800affa75..e354bceee0e 100644
--- a/arch/x86/vdso/vdso32/sysenter.S
+++ b/arch/x86/vdso/vdso32/sysenter.S
@@ -43,7 +43,7 @@ __kernel_vsyscall:
43 .space 7,0x90 43 .space 7,0x90
44 44
45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ 45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel 46 int $0x80
47 /* 16: System call normal return point is here! */ 47 /* 16: System call normal return point is here! */
48VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ 48VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
49 pop %ebp 49 pop %ebp
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7abd2be0f9b..316fbca3490 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -14,41 +14,61 @@
14#include <asm/vgtod.h> 14#include <asm/vgtod.h>
15#include <asm/proto.h> 15#include <asm/proto.h>
16#include <asm/vdso.h> 16#include <asm/vdso.h>
17#include <asm/page.h>
17 18
18unsigned int __read_mostly vdso_enabled = 1; 19unsigned int __read_mostly vdso_enabled = 1;
19 20
20extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
21extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
22 23
23static struct page **vdso_pages; 24extern struct page *vdso_pages[];
24static unsigned vdso_size; 25static unsigned vdso_size;
25 26
26static int __init init_vdso_vars(void) 27static void __init patch_vdso(void *vdso, size_t len)
28{
29 Elf64_Ehdr *hdr = vdso;
30 Elf64_Shdr *sechdrs, *alt_sec = 0;
31 char *secstrings;
32 void *alt_data;
33 int i;
34
35 BUG_ON(len < sizeof(Elf64_Ehdr));
36 BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
37
38 sechdrs = (void *)hdr + hdr->e_shoff;
39 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
40
41 for (i = 1; i < hdr->e_shnum; i++) {
42 Elf64_Shdr *shdr = &sechdrs[i];
43 if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
44 alt_sec = shdr;
45 goto found;
46 }
47 }
48
49 /* If we get here, it's probably a bug. */
50 pr_warning("patch_vdso: .altinstructions not found\n");
51 return; /* nothing to patch */
52
53found:
54 alt_data = (void *)hdr + alt_sec->sh_offset;
55 apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
56}
57
58static int __init init_vdso(void)
27{ 59{
28 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; 60 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
29 int i; 61 int i;
30 62
63 patch_vdso(vdso_start, vdso_end - vdso_start);
64
31 vdso_size = npages << PAGE_SHIFT; 65 vdso_size = npages << PAGE_SHIFT;
32 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 66 for (i = 0; i < npages; i++)
33 if (!vdso_pages) 67 vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
34 goto oom;
35 for (i = 0; i < npages; i++) {
36 struct page *p;
37 p = alloc_page(GFP_KERNEL);
38 if (!p)
39 goto oom;
40 vdso_pages[i] = p;
41 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
42 }
43 68
44 return 0; 69 return 0;
45
46 oom:
47 printk("Cannot allocate vdso\n");
48 vdso_enabled = 0;
49 return -ENOMEM;
50} 70}
51subsys_initcall(init_vdso_vars); 71subsys_initcall(init_vdso);
52 72
53struct linux_binprm; 73struct linux_binprm;
54 74
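patch_vdso() is also why the old "alternative() doesn't work" caveat could be dropped from vclock_gettime.c above: the vdso.so image linked into the kernel is scanned for its .altinstructions section at boot and run through apply_alternatives() in place, after which the pages are published through the statically reserved vdso_pages[] array instead of being allocated and copied. The section lookup itself is plain ELF walking; a condensed restatement of the same logic, with the bounds and magic checks dropped for brevity:

    /* Condensed sketch of the lookup inside patch_vdso() above. */
    #include <linux/elf.h>
    #include <linux/string.h>

    static Elf64_Shdr *find_section(void *image, const char *name)
    {
            Elf64_Ehdr *hdr = image;
            Elf64_Shdr *sechdrs = image + hdr->e_shoff;
            char *secstrings = image + sechdrs[hdr->e_shstrndx].sh_offset;
            int i;

            for (i = 1; i < hdr->e_shnum; i++)
                    if (!strcmp(secstrings + sechdrs[i].sh_name, name))
                            return &sechdrs[i];
            return NULL;                    /* not found */
    }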
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 17c565de3d6..add2c2d729c 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -15,8 +15,10 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
15 grant-table.o suspend.o platform-pci-unplug.o \ 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o 16 p2m.o
17 17
18obj-$(CONFIG_EVENT_TRACING) += trace.o
19
18obj-$(CONFIG_SMP) += smp.o 20obj-$(CONFIG_SMP) += smp.o
19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 21obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
20obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o 22obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
21 23obj-$(CONFIG_XEN_DOM0) += vga.o
22obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o 24obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5525163a039..46c8069ae98 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(xen_domain_type);
77 77
78unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; 78unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
79EXPORT_SYMBOL(machine_to_phys_mapping); 79EXPORT_SYMBOL(machine_to_phys_mapping);
80unsigned int machine_to_phys_order; 80unsigned long machine_to_phys_nr;
81EXPORT_SYMBOL(machine_to_phys_order); 81EXPORT_SYMBOL(machine_to_phys_nr);
82 82
83struct start_info *xen_start_info; 83struct start_info *xen_start_info;
84EXPORT_SYMBOL_GPL(xen_start_info); 84EXPORT_SYMBOL_GPL(xen_start_info);
@@ -341,6 +341,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)
341 struct mmuext_op *op; 341 struct mmuext_op *op;
342 struct multicall_space mcs = xen_mc_entry(sizeof(*op)); 342 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
343 343
344 trace_xen_cpu_set_ldt(addr, entries);
345
344 op = mcs.args; 346 op = mcs.args;
345 op->cmd = MMUEXT_SET_LDT; 347 op->cmd = MMUEXT_SET_LDT;
346 op->arg1.linear_addr = (unsigned long)addr; 348 op->arg1.linear_addr = (unsigned long)addr;
@@ -496,6 +498,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
496 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); 498 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
497 u64 entry = *(u64 *)ptr; 499 u64 entry = *(u64 *)ptr;
498 500
501 trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
502
499 preempt_disable(); 503 preempt_disable();
500 504
501 xen_mc_flush(); 505 xen_mc_flush();
@@ -565,6 +569,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
565 unsigned long p = (unsigned long)&dt[entrynum]; 569 unsigned long p = (unsigned long)&dt[entrynum];
566 unsigned long start, end; 570 unsigned long start, end;
567 571
572 trace_xen_cpu_write_idt_entry(dt, entrynum, g);
573
568 preempt_disable(); 574 preempt_disable();
569 575
570 start = __this_cpu_read(idt_desc.address); 576 start = __this_cpu_read(idt_desc.address);
@@ -619,6 +625,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
619 static DEFINE_SPINLOCK(lock); 625 static DEFINE_SPINLOCK(lock);
620 static struct trap_info traps[257]; 626 static struct trap_info traps[257];
621 627
628 trace_xen_cpu_load_idt(desc);
629
622 spin_lock(&lock); 630 spin_lock(&lock);
623 631
624 __get_cpu_var(idt_desc) = *desc; 632 __get_cpu_var(idt_desc) = *desc;
@@ -637,6 +645,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
637static void xen_write_gdt_entry(struct desc_struct *dt, int entry, 645static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
638 const void *desc, int type) 646 const void *desc, int type)
639{ 647{
648 trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
649
640 preempt_disable(); 650 preempt_disable();
641 651
642 switch (type) { 652 switch (type) {
@@ -665,6 +675,8 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
665static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, 675static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
666 const void *desc, int type) 676 const void *desc, int type)
667{ 677{
678 trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
679
668 switch (type) { 680 switch (type) {
669 case DESC_LDT: 681 case DESC_LDT:
670 case DESC_TSS: 682 case DESC_TSS:
@@ -684,7 +696,9 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
684static void xen_load_sp0(struct tss_struct *tss, 696static void xen_load_sp0(struct tss_struct *tss,
685 struct thread_struct *thread) 697 struct thread_struct *thread)
686{ 698{
687 struct multicall_space mcs = xen_mc_entry(0); 699 struct multicall_space mcs;
700
701 mcs = xen_mc_entry(0);
688 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 702 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
689 xen_mc_issue(PARAVIRT_LAZY_CPU); 703 xen_mc_issue(PARAVIRT_LAZY_CPU);
690} 704}
@@ -937,6 +951,10 @@ static const struct pv_info xen_info __initconst = {
937 .paravirt_enabled = 1, 951 .paravirt_enabled = 1,
938 .shared_kernel_pmd = 0, 952 .shared_kernel_pmd = 0,
939 953
954#ifdef CONFIG_X86_64
955 .extra_user_64bit_cs = FLAT_USER_CS64,
956#endif
957
940 .name = "Xen", 958 .name = "Xen",
941}; 959};
942 960
@@ -1248,6 +1266,14 @@ asmlinkage void __init xen_start_kernel(void)
1248 if (pci_xen) 1266 if (pci_xen)
1249 x86_init.pci.arch_init = pci_xen_init; 1267 x86_init.pci.arch_init = pci_xen_init;
1250 } else { 1268 } else {
1269 const struct dom0_vga_console_info *info =
1270 (void *)((char *)xen_start_info +
1271 xen_start_info->console.dom0.info_off);
1272
1273 xen_init_vga(info, xen_start_info->console.dom0.info_size);
1274 xen_start_info->console.domU.mfn = 0;
1275 xen_start_info->console.domU.evtchn = 0;
1276
1251 /* Make sure ACS will be enabled */ 1277 /* Make sure ACS will be enabled */
1252 pci_request_acs(); 1278 pci_request_acs();
1253 } 1279 }
@@ -1329,7 +1355,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1329 int cpu = (long)hcpu; 1355 int cpu = (long)hcpu;
1330 switch (action) { 1356 switch (action) {
1331 case CPU_UP_PREPARE: 1357 case CPU_UP_PREPARE:
1332 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1358 xen_vcpu_setup(cpu);
1333 if (xen_have_vector_callback) 1359 if (xen_have_vector_callback)
1334 xen_init_lock_cpu(cpu); 1360 xen_init_lock_cpu(cpu);
1335 break; 1361 break;
@@ -1359,7 +1385,6 @@ static void __init xen_hvm_guest_init(void)
1359 xen_hvm_smp_init(); 1385 xen_hvm_smp_init();
1360 register_cpu_notifier(&xen_hvm_cpu_notifier); 1386 register_cpu_notifier(&xen_hvm_cpu_notifier);
1361 xen_unplug_emulated_devices(); 1387 xen_unplug_emulated_devices();
1362 have_vcpu_info_placement = 0;
1363 x86_init.irqs.intr_init = xen_init_IRQ; 1388 x86_init.irqs.intr_init = xen_init_IRQ;
1364 xen_hvm_init_time_ops(); 1389 xen_hvm_init_time_ops();
1365 xen_hvm_init_mmu_ops(); 1390 xen_hvm_init_mmu_ops();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0ccccb67a99..3dd53f997b1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -48,6 +48,8 @@
48#include <linux/memblock.h> 48#include <linux/memblock.h>
49#include <linux/seq_file.h> 49#include <linux/seq_file.h>
50 50
51#include <trace/events/xen.h>
52
51#include <asm/pgtable.h> 53#include <asm/pgtable.h>
52#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
53#include <asm/fixmap.h> 55#include <asm/fixmap.h>
@@ -194,6 +196,8 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
194 struct multicall_space mcs; 196 struct multicall_space mcs;
195 struct mmu_update *u; 197 struct mmu_update *u;
196 198
199 trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
200
197 mcs = xen_mc_entry(sizeof(*u)); 201 mcs = xen_mc_entry(sizeof(*u));
198 u = mcs.args; 202 u = mcs.args;
199 203
@@ -225,6 +229,24 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
225 *u = *update; 229 *u = *update;
226} 230}
227 231
232static void xen_extend_mmuext_op(const struct mmuext_op *op)
233{
234 struct multicall_space mcs;
235 struct mmuext_op *u;
236
237 mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
238
239 if (mcs.mc != NULL) {
240 mcs.mc->args[1]++;
241 } else {
242 mcs = __xen_mc_entry(sizeof(*u));
243 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
244 }
245
246 u = mcs.args;
247 *u = *op;
248}
249
228static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 250static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
229{ 251{
230 struct mmu_update u; 252 struct mmu_update u;
@@ -245,6 +267,8 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
245 267
246static void xen_set_pmd(pmd_t *ptr, pmd_t val) 268static void xen_set_pmd(pmd_t *ptr, pmd_t val)
247{ 269{
270 trace_xen_mmu_set_pmd(ptr, val);
271
248 /* If page is not pinned, we can just update the entry 272 /* If page is not pinned, we can just update the entry
249 directly */ 273 directly */
250 if (!xen_page_pinned(ptr)) { 274 if (!xen_page_pinned(ptr)) {
@@ -282,22 +306,30 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
282 return true; 306 return true;
283} 307}
284 308
285static void xen_set_pte(pte_t *ptep, pte_t pteval) 309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
286{ 310{
287 if (!xen_batched_set_pte(ptep, pteval)) 311 if (!xen_batched_set_pte(ptep, pteval))
288 native_set_pte(ptep, pteval); 312 native_set_pte(ptep, pteval);
289} 313}
290 314
315static void xen_set_pte(pte_t *ptep, pte_t pteval)
316{
317 trace_xen_mmu_set_pte(ptep, pteval);
318 __xen_set_pte(ptep, pteval);
319}
320
291static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 321static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
292 pte_t *ptep, pte_t pteval) 322 pte_t *ptep, pte_t pteval)
293{ 323{
294 xen_set_pte(ptep, pteval); 324 trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
325 __xen_set_pte(ptep, pteval);
295} 326}
296 327
297pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 328pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
298 unsigned long addr, pte_t *ptep) 329 unsigned long addr, pte_t *ptep)
299{ 330{
300 /* Just return the pte as-is. We preserve the bits on commit */ 331 /* Just return the pte as-is. We preserve the bits on commit */
332 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
301 return *ptep; 333 return *ptep;
302} 334}
303 335
@@ -306,6 +338,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
306{ 338{
307 struct mmu_update u; 339 struct mmu_update u;
308 340
341 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
309 xen_mc_batch(); 342 xen_mc_batch();
310 343
311 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 344 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
@@ -530,6 +563,8 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
530 563
531static void xen_set_pud(pud_t *ptr, pud_t val) 564static void xen_set_pud(pud_t *ptr, pud_t val)
532{ 565{
566 trace_xen_mmu_set_pud(ptr, val);
567
533 /* If page is not pinned, we can just update the entry 568 /* If page is not pinned, we can just update the entry
534 directly */ 569 directly */
535 if (!xen_page_pinned(ptr)) { 570 if (!xen_page_pinned(ptr)) {
@@ -543,17 +578,20 @@ static void xen_set_pud(pud_t *ptr, pud_t val)
543#ifdef CONFIG_X86_PAE 578#ifdef CONFIG_X86_PAE
544static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 579static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
545{ 580{
581 trace_xen_mmu_set_pte_atomic(ptep, pte);
546 set_64bit((u64 *)ptep, native_pte_val(pte)); 582 set_64bit((u64 *)ptep, native_pte_val(pte));
547} 583}
548 584
549static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 585static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
550{ 586{
587 trace_xen_mmu_pte_clear(mm, addr, ptep);
551 if (!xen_batched_set_pte(ptep, native_make_pte(0))) 588 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
552 native_pte_clear(mm, addr, ptep); 589 native_pte_clear(mm, addr, ptep);
553} 590}
554 591
555static void xen_pmd_clear(pmd_t *pmdp) 592static void xen_pmd_clear(pmd_t *pmdp)
556{ 593{
594 trace_xen_mmu_pmd_clear(pmdp);
557 set_pmd(pmdp, __pmd(0)); 595 set_pmd(pmdp, __pmd(0));
558} 596}
559#endif /* CONFIG_X86_PAE */ 597#endif /* CONFIG_X86_PAE */
@@ -629,6 +667,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
629{ 667{
630 pgd_t *user_ptr = xen_get_user_pgd(ptr); 668 pgd_t *user_ptr = xen_get_user_pgd(ptr);
631 669
670 trace_xen_mmu_set_pgd(ptr, user_ptr, val);
671
632 /* If page is not pinned, we can just update the entry 672 /* If page is not pinned, we can just update the entry
633 directly */ 673 directly */
634 if (!xen_page_pinned(ptr)) { 674 if (!xen_page_pinned(ptr)) {
@@ -788,14 +828,12 @@ static void xen_pte_unlock(void *v)
788 828
789static void xen_do_pin(unsigned level, unsigned long pfn) 829static void xen_do_pin(unsigned level, unsigned long pfn)
790{ 830{
791 struct mmuext_op *op; 831 struct mmuext_op op;
792 struct multicall_space mcs;
793 832
794 mcs = __xen_mc_entry(sizeof(*op)); 833 op.cmd = level;
795 op = mcs.args; 834 op.arg1.mfn = pfn_to_mfn(pfn);
796 op->cmd = level; 835
797 op->arg1.mfn = pfn_to_mfn(pfn); 836 xen_extend_mmuext_op(&op);
798 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
799} 837}
800 838
801static int xen_pin_page(struct mm_struct *mm, struct page *page, 839static int xen_pin_page(struct mm_struct *mm, struct page *page,
@@ -863,6 +901,8 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
863 read-only, and can be pinned. */ 901 read-only, and can be pinned. */
864static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) 902static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
865{ 903{
904 trace_xen_mmu_pgd_pin(mm, pgd);
905
866 xen_mc_batch(); 906 xen_mc_batch();
867 907
868 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { 908 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -988,6 +1028,8 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page,
988/* Release a pagetable's pages back as normal RW */ 1028/* Release a pagetable's pages back as normal RW */
989static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) 1029static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
990{ 1030{
1031 trace_xen_mmu_pgd_unpin(mm, pgd);
1032
991 xen_mc_batch(); 1033 xen_mc_batch();
992 1034
993 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1035 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1196,6 +1238,8 @@ static void xen_flush_tlb(void)
1196 struct mmuext_op *op; 1238 struct mmuext_op *op;
1197 struct multicall_space mcs; 1239 struct multicall_space mcs;
1198 1240
1241 trace_xen_mmu_flush_tlb(0);
1242
1199 preempt_disable(); 1243 preempt_disable();
1200 1244
1201 mcs = xen_mc_entry(sizeof(*op)); 1245 mcs = xen_mc_entry(sizeof(*op));
@@ -1214,6 +1258,8 @@ static void xen_flush_tlb_single(unsigned long addr)
1214 struct mmuext_op *op; 1258 struct mmuext_op *op;
1215 struct multicall_space mcs; 1259 struct multicall_space mcs;
1216 1260
1261 trace_xen_mmu_flush_tlb_single(addr);
1262
1217 preempt_disable(); 1263 preempt_disable();
1218 1264
1219 mcs = xen_mc_entry(sizeof(*op)); 1265 mcs = xen_mc_entry(sizeof(*op));
@@ -1240,6 +1286,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1240 } *args; 1286 } *args;
1241 struct multicall_space mcs; 1287 struct multicall_space mcs;
1242 1288
1289 trace_xen_mmu_flush_tlb_others(cpus, mm, va);
1290
1243 if (cpumask_empty(cpus)) 1291 if (cpumask_empty(cpus))
1244 return; /* nothing to do */ 1292 return; /* nothing to do */
1245 1293
@@ -1275,10 +1323,11 @@ static void set_current_cr3(void *v)
1275 1323
1276static void __xen_write_cr3(bool kernel, unsigned long cr3) 1324static void __xen_write_cr3(bool kernel, unsigned long cr3)
1277{ 1325{
1278 struct mmuext_op *op; 1326 struct mmuext_op op;
1279 struct multicall_space mcs;
1280 unsigned long mfn; 1327 unsigned long mfn;
1281 1328
1329 trace_xen_mmu_write_cr3(kernel, cr3);
1330
1282 if (cr3) 1331 if (cr3)
1283 mfn = pfn_to_mfn(PFN_DOWN(cr3)); 1332 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1284 else 1333 else
@@ -1286,13 +1335,10 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
1286 1335
1287 WARN_ON(mfn == 0 && kernel); 1336 WARN_ON(mfn == 0 && kernel);
1288 1337
1289 mcs = __xen_mc_entry(sizeof(*op)); 1338 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1290 1339 op.arg1.mfn = mfn;
1291 op = mcs.args;
1292 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1293 op->arg1.mfn = mfn;
1294 1340
1295 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1341 xen_extend_mmuext_op(&op);
1296 1342
1297 if (kernel) { 1343 if (kernel) {
1298 percpu_write(xen_cr3, cr3); 1344 percpu_write(xen_cr3, cr3);
@@ -1451,19 +1497,52 @@ static void __init xen_release_pmd_init(unsigned long pfn)
1451 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1497 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1452} 1498}
1453 1499
1500static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1501{
1502 struct multicall_space mcs;
1503 struct mmuext_op *op;
1504
1505 mcs = __xen_mc_entry(sizeof(*op));
1506 op = mcs.args;
1507 op->cmd = cmd;
1508 op->arg1.mfn = pfn_to_mfn(pfn);
1509
1510 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1511}
1512
1513static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1514{
1515 struct multicall_space mcs;
1516 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1517
1518 mcs = __xen_mc_entry(0);
1519 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1520 pfn_pte(pfn, prot), 0);
1521}
1522
1454/* This needs to make sure the new pte page is pinned iff it's being 1523/* This needs to make sure the new pte page is pinned iff it's being
1455 attached to a pinned pagetable. */ 1524 attached to a pinned pagetable. */
1456static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) 1525static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1526 unsigned level)
1457{ 1527{
1458 struct page *page = pfn_to_page(pfn); 1528 bool pinned = PagePinned(virt_to_page(mm->pgd));
1529
1530 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1531
1532 if (pinned) {
1533 struct page *page = pfn_to_page(pfn);
1459 1534
1460 if (PagePinned(virt_to_page(mm->pgd))) {
1461 SetPagePinned(page); 1535 SetPagePinned(page);
1462 1536
1463 if (!PageHighMem(page)) { 1537 if (!PageHighMem(page)) {
1464 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); 1538 xen_mc_batch();
1539
1540 __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1541
1465 if (level == PT_PTE && USE_SPLIT_PTLOCKS) 1542 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1466 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 1543 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1544
1545 xen_mc_issue(PARAVIRT_LAZY_MMU);
1467 } else { 1546 } else {
1468 /* make sure there are no stray mappings of 1547 /* make sure there are no stray mappings of
1469 this page */ 1548 this page */
@@ -1483,15 +1562,23 @@ static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1483} 1562}
1484 1563
1485/* This should never happen until we're OK to use struct page */ 1564/* This should never happen until we're OK to use struct page */
1486static void xen_release_ptpage(unsigned long pfn, unsigned level) 1565static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1487{ 1566{
1488 struct page *page = pfn_to_page(pfn); 1567 struct page *page = pfn_to_page(pfn);
1568 bool pinned = PagePinned(page);
1489 1569
1490 if (PagePinned(page)) { 1570 trace_xen_mmu_release_ptpage(pfn, level, pinned);
1571
1572 if (pinned) {
1491 if (!PageHighMem(page)) { 1573 if (!PageHighMem(page)) {
1574 xen_mc_batch();
1575
1492 if (level == PT_PTE && USE_SPLIT_PTLOCKS) 1576 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1493 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1577 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1494 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1578
1579 __set_pfn_prot(pfn, PAGE_KERNEL);
1580
1581 xen_mc_issue(PARAVIRT_LAZY_MMU);
1495 } 1582 }
1496 ClearPagePinned(page); 1583 ClearPagePinned(page);
1497 } 1584 }
@@ -1626,15 +1713,17 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1626void __init xen_setup_machphys_mapping(void) 1713void __init xen_setup_machphys_mapping(void)
1627{ 1714{
1628 struct xen_machphys_mapping mapping; 1715 struct xen_machphys_mapping mapping;
1629 unsigned long machine_to_phys_nr_ents;
1630 1716
1631 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { 1717 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1632 machine_to_phys_mapping = (unsigned long *)mapping.v_start; 1718 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1633 machine_to_phys_nr_ents = mapping.max_mfn + 1; 1719 machine_to_phys_nr = mapping.max_mfn + 1;
1634 } else { 1720 } else {
1635 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; 1721 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1636 } 1722 }
1637 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); 1723#ifdef CONFIG_X86_32
1724 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1725 < machine_to_phys_mapping);
1726#endif
1638} 1727}
1639 1728
1640#ifdef CONFIG_X86_64 1729#ifdef CONFIG_X86_64
@@ -1829,6 +1918,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1829# endif 1918# endif
1830#else 1919#else
1831 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: 1920 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1921 case VVAR_PAGE:
1832#endif 1922#endif
1833 case FIX_TEXT_POKE0: 1923 case FIX_TEXT_POKE0:
1834 case FIX_TEXT_POKE1: 1924 case FIX_TEXT_POKE1:
@@ -1869,7 +1959,8 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1869#ifdef CONFIG_X86_64 1959#ifdef CONFIG_X86_64
1870 /* Replicate changes to map the vsyscall page into the user 1960 /* Replicate changes to map the vsyscall page into the user
1871 pagetable vsyscall mapping. */ 1961 pagetable vsyscall mapping. */
1872 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { 1962 if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) ||
1963 idx == VVAR_PAGE) {
1873 unsigned long vaddr = __fix_to_virt(idx); 1964 unsigned long vaddr = __fix_to_virt(idx);
1874 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); 1965 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1875 } 1966 }
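The net effect of __set_pfn_prot()/__pin_pagetable_pfn() is that allocating or releasing a pte page now queues its RO/RW remap and its MMUEXT (un)pin together under one batch, so the hypervisor sees at most one flush instead of two immediate hypercalls. Condensed, the pin path that xen_alloc_ptpage() switches to looks like the sketch below (same symbols as above; the real code additionally checks the page level and the split-ptlock configuration before pinning):

    /* Condensed sketch of the batched pin path for a low-mem pte page. */
    static void sketch_pin_pte_page(unsigned long pfn)
    {
            xen_mc_batch();

            __set_pfn_prot(pfn, PAGE_KERNEL_RO);            /* queue the RO remap   */
            __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);  /* queue the MMUEXT pin */

            xen_mc_issue(PARAVIRT_LAZY_MMU);                /* one flush, or deferred */
    }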
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 1b2b73ff0a6..0d82003e76a 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -30,12 +30,13 @@
30 30
31#define MC_BATCH 32 31#define MC_BATCH 32
32 32
33#define MC_DEBUG 1 33#define MC_DEBUG 0
34 34
35#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
36 36
37 37
38struct mc_buffer { 38struct mc_buffer {
39 unsigned mcidx, argidx, cbidx;
39 struct multicall_entry entries[MC_BATCH]; 40 struct multicall_entry entries[MC_BATCH];
40#if MC_DEBUG 41#if MC_DEBUG
41 struct multicall_entry debug[MC_BATCH]; 42 struct multicall_entry debug[MC_BATCH];
@@ -46,85 +47,15 @@ struct mc_buffer {
46 void (*fn)(void *); 47 void (*fn)(void *);
47 void *data; 48 void *data;
48 } callbacks[MC_BATCH]; 49 } callbacks[MC_BATCH];
49 unsigned mcidx, argidx, cbidx;
50}; 50};
51 51
52static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 52static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
53DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 53DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
54 54
55/* flush reasons 0- slots, 1- args, 2- callbacks */
56enum flush_reasons
57{
58 FL_SLOTS,
59 FL_ARGS,
60 FL_CALLBACKS,
61
62 FL_N_REASONS
63};
64
65#ifdef CONFIG_XEN_DEBUG_FS
66#define NHYPERCALLS 40 /* not really */
67
68static struct {
69 unsigned histo[MC_BATCH+1];
70
71 unsigned issued;
72 unsigned arg_total;
73 unsigned hypercalls;
74 unsigned histo_hypercalls[NHYPERCALLS];
75
76 unsigned flush[FL_N_REASONS];
77} mc_stats;
78
79static u8 zero_stats;
80
81static inline void check_zero(void)
82{
83 if (unlikely(zero_stats)) {
84 memset(&mc_stats, 0, sizeof(mc_stats));
85 zero_stats = 0;
86 }
87}
88
89static void mc_add_stats(const struct mc_buffer *mc)
90{
91 int i;
92
93 check_zero();
94
95 mc_stats.issued++;
96 mc_stats.hypercalls += mc->mcidx;
97 mc_stats.arg_total += mc->argidx;
98
99 mc_stats.histo[mc->mcidx]++;
100 for(i = 0; i < mc->mcidx; i++) {
101 unsigned op = mc->entries[i].op;
102 if (op < NHYPERCALLS)
103 mc_stats.histo_hypercalls[op]++;
104 }
105}
106
107static void mc_stats_flush(enum flush_reasons idx)
108{
109 check_zero();
110
111 mc_stats.flush[idx]++;
112}
113
114#else /* !CONFIG_XEN_DEBUG_FS */
115
116static inline void mc_add_stats(const struct mc_buffer *mc)
117{
118}
119
120static inline void mc_stats_flush(enum flush_reasons idx)
121{
122}
123#endif /* CONFIG_XEN_DEBUG_FS */
124
125void xen_mc_flush(void) 55void xen_mc_flush(void)
126{ 56{
127 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 57 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
58 struct multicall_entry *mc;
128 int ret = 0; 59 int ret = 0;
129 unsigned long flags; 60 unsigned long flags;
130 int i; 61 int i;
@@ -135,9 +66,26 @@ void xen_mc_flush(void)
135 something in the middle */ 66 something in the middle */
136 local_irq_save(flags); 67 local_irq_save(flags);
137 68
138 mc_add_stats(b); 69 trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
70
71 switch (b->mcidx) {
72 case 0:
73 /* no-op */
74 BUG_ON(b->argidx != 0);
75 break;
76
77 case 1:
78 /* Singleton multicall - bypass multicall machinery
79 and just do the call directly. */
80 mc = &b->entries[0];
81
82 mc->result = privcmd_call(mc->op,
83 mc->args[0], mc->args[1], mc->args[2],
84 mc->args[3], mc->args[4]);
85 ret = mc->result < 0;
86 break;
139 87
140 if (b->mcidx) { 88 default:
141#if MC_DEBUG 89#if MC_DEBUG
142 memcpy(b->debug, b->entries, 90 memcpy(b->debug, b->entries,
143 b->mcidx * sizeof(struct multicall_entry)); 91 b->mcidx * sizeof(struct multicall_entry));
@@ -164,11 +112,10 @@ void xen_mc_flush(void)
164 } 112 }
165 } 113 }
166#endif 114#endif
115 }
167 116
168 b->mcidx = 0; 117 b->mcidx = 0;
169 b->argidx = 0; 118 b->argidx = 0;
170 } else
171 BUG_ON(b->argidx != 0);
172 119
173 for (i = 0; i < b->cbidx; i++) { 120 for (i = 0; i < b->cbidx; i++) {
174 struct callback *cb = &b->callbacks[i]; 121 struct callback *cb = &b->callbacks[i];
@@ -188,18 +135,21 @@ struct multicall_space __xen_mc_entry(size_t args)
188 struct multicall_space ret; 135 struct multicall_space ret;
189 unsigned argidx = roundup(b->argidx, sizeof(u64)); 136 unsigned argidx = roundup(b->argidx, sizeof(u64));
190 137
138 trace_xen_mc_entry_alloc(args);
139
191 BUG_ON(preemptible()); 140 BUG_ON(preemptible());
192 BUG_ON(b->argidx >= MC_ARGS); 141 BUG_ON(b->argidx >= MC_ARGS);
193 142
194 if (b->mcidx == MC_BATCH || 143 if (unlikely(b->mcidx == MC_BATCH ||
195 (argidx + args) >= MC_ARGS) { 144 (argidx + args) >= MC_ARGS)) {
196 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); 145 trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
146 XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
197 xen_mc_flush(); 147 xen_mc_flush();
198 argidx = roundup(b->argidx, sizeof(u64)); 148 argidx = roundup(b->argidx, sizeof(u64));
199 } 149 }
200 150
201 ret.mc = &b->entries[b->mcidx]; 151 ret.mc = &b->entries[b->mcidx];
202#ifdef MC_DEBUG 152#if MC_DEBUG
203 b->caller[b->mcidx] = __builtin_return_address(0); 153 b->caller[b->mcidx] = __builtin_return_address(0);
204#endif 154#endif
205 b->mcidx++; 155 b->mcidx++;
@@ -218,20 +168,25 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
218 BUG_ON(preemptible()); 168 BUG_ON(preemptible());
219 BUG_ON(b->argidx >= MC_ARGS); 169 BUG_ON(b->argidx >= MC_ARGS);
220 170
221 if (b->mcidx == 0) 171 if (unlikely(b->mcidx == 0 ||
222 return ret; 172 b->entries[b->mcidx - 1].op != op)) {
223 173 trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
224 if (b->entries[b->mcidx - 1].op != op) 174 goto out;
225 return ret; 175 }
226 176
227 if ((b->argidx + size) >= MC_ARGS) 177 if (unlikely((b->argidx + size) >= MC_ARGS)) {
228 return ret; 178 trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
179 goto out;
180 }
229 181
230 ret.mc = &b->entries[b->mcidx - 1]; 182 ret.mc = &b->entries[b->mcidx - 1];
231 ret.args = &b->args[b->argidx]; 183 ret.args = &b->args[b->argidx];
232 b->argidx += size; 184 b->argidx += size;
233 185
234 BUG_ON(b->argidx >= MC_ARGS); 186 BUG_ON(b->argidx >= MC_ARGS);
187
188 trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
189out:
235 return ret; 190 return ret;
236} 191}
237 192
@@ -241,43 +196,13 @@ void xen_mc_callback(void (*fn)(void *), void *data)
241 struct callback *cb; 196 struct callback *cb;
242 197
243 if (b->cbidx == MC_BATCH) { 198 if (b->cbidx == MC_BATCH) {
244 mc_stats_flush(FL_CALLBACKS); 199 trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
245 xen_mc_flush(); 200 xen_mc_flush();
246 } 201 }
247 202
203 trace_xen_mc_callback(fn, data);
204
248 cb = &b->callbacks[b->cbidx++]; 205 cb = &b->callbacks[b->cbidx++];
249 cb->fn = fn; 206 cb->fn = fn;
250 cb->data = data; 207 cb->data = data;
251} 208}
252
253#ifdef CONFIG_XEN_DEBUG_FS
254
255static struct dentry *d_mc_debug;
256
257static int __init xen_mc_debugfs(void)
258{
259 struct dentry *d_xen = xen_init_debugfs();
260
261 if (d_xen == NULL)
262 return -ENOMEM;
263
264 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
265
266 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
267
268 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
269 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
270 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
271
272 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
273 mc_stats.histo, MC_BATCH);
274 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
275 mc_stats.histo_hypercalls, NHYPERCALLS);
276 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
277 mc_stats.flush, FL_N_REASONS);
278
279 return 0;
280}
281fs_initcall(xen_mc_debugfs);
282
283#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 4ec8035e321..dee79b78a90 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -1,6 +1,8 @@
1#ifndef _XEN_MULTICALLS_H 1#ifndef _XEN_MULTICALLS_H
2#define _XEN_MULTICALLS_H 2#define _XEN_MULTICALLS_H
3 3
4#include <trace/events/xen.h>
5
4#include "xen-ops.h" 6#include "xen-ops.h"
5 7
6/* Multicalls */ 8/* Multicalls */
@@ -20,8 +22,10 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
20static inline void xen_mc_batch(void) 22static inline void xen_mc_batch(void)
21{ 23{
22 unsigned long flags; 24 unsigned long flags;
25
23 /* need to disable interrupts until this entry is complete */ 26 /* need to disable interrupts until this entry is complete */
24 local_irq_save(flags); 27 local_irq_save(flags);
28 trace_xen_mc_batch(paravirt_get_lazy_mode());
25 __this_cpu_write(xen_mc_irq_flags, flags); 29 __this_cpu_write(xen_mc_irq_flags, flags);
26} 30}
27 31
@@ -37,6 +41,8 @@ void xen_mc_flush(void);
37/* Issue a multicall if we're not in a lazy mode */ 41/* Issue a multicall if we're not in a lazy mode */
38static inline void xen_mc_issue(unsigned mode) 42static inline void xen_mc_issue(unsigned mode)
39{ 43{
44 trace_xen_mc_issue(mode);
45
40 if ((paravirt_get_lazy_mode() & mode) == 0) 46 if ((paravirt_get_lazy_mode() & mode) == 0)
41 xen_mc_flush(); 47 xen_mc_flush();
42 48
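For reference, the batching API that the new trace points instrument is always used in the same open/queue/issue sequence throughout this series. A minimal sketch of a caller (the function name is illustrative; xen_mc_batch(), xen_mc_entry(), xen_mc_issue() and MULTI_mmu_update() are the real interfaces, and the ptr value is a machine address as in mmu.c above):

    /* Sketch of the canonical multicall usage pattern. */
    static void example_queue_mmu_update(unsigned long maddr, u64 val)
    {
            struct multicall_space mcs;
            struct mmu_update *u;

            xen_mc_batch();                         /* IRQs off, flags saved       */

            mcs = xen_mc_entry(sizeof(*u));         /* slot plus argument space    */
            u = mcs.args;
            u->ptr = maddr;                         /* machine address of the pte  */
            u->val = val;
            MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

            xen_mc_issue(PARAVIRT_LAZY_MMU);        /* flush now unless lazy-MMU   */
    }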
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 25c52f94a27..ffcf2615640 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
35#ifdef CONFIG_XEN_PVHVM 35#ifdef CONFIG_XEN_PVHVM
36static int xen_emul_unplug; 36static int xen_emul_unplug;
37 37
38static int __init check_platform_magic(void) 38static int check_platform_magic(void)
39{ 39{
40 short magic; 40 short magic;
41 char protocol; 41 char protocol;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 60aeeb56948..e1913024687 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/memblock.h> 11#include <linux/memblock.h>
12#include <linux/cpuidle.h>
12 13
13#include <asm/elf.h> 14#include <asm/elf.h>
14#include <asm/vdso.h> 15#include <asm/vdso.h>
@@ -92,8 +93,6 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
92 if (end <= start) 93 if (end <= start)
93 return 0; 94 return 0;
94 95
95 printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
96 start, end);
97 for(pfn = start; pfn < end; pfn++) { 96 for(pfn = start; pfn < end; pfn++) {
98 unsigned long mfn = pfn_to_mfn(pfn); 97 unsigned long mfn = pfn_to_mfn(pfn);
99 98
@@ -106,14 +105,14 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
106 105
107 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, 106 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
108 &reservation); 107 &reservation);
109 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", 108 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
110 start, end, ret);
111 if (ret == 1) { 109 if (ret == 1) {
112 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 110 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
113 len++; 111 len++;
114 } 112 }
115 } 113 }
116 printk(KERN_CONT "%ld pages freed\n", len); 114 printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n",
115 start, end, len);
117 116
118 return len; 117 return len;
119} 118}
@@ -139,7 +138,7 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
139 if (last_end < max_addr) 138 if (last_end < max_addr)
140 released += xen_release_chunk(last_end, max_addr); 139 released += xen_release_chunk(last_end, max_addr);
141 140
142 printk(KERN_INFO "released %ld pages of unused memory\n", released); 141 printk(KERN_INFO "released %lu pages of unused memory\n", released);
143 return released; 142 return released;
144} 143}
145 144
@@ -185,6 +184,31 @@ static unsigned long __init xen_set_identity(const struct e820entry *list,
185 PFN_UP(start_pci), PFN_DOWN(last)); 184 PFN_UP(start_pci), PFN_DOWN(last));
186 return identity; 185 return identity;
187} 186}
187
188static unsigned long __init xen_get_max_pages(void)
189{
190 unsigned long max_pages = MAX_DOMAIN_PAGES;
191 domid_t domid = DOMID_SELF;
192 int ret;
193
194 /*
195 * For the initial domain we use the maximum reservation as
196 * the maximum page.
197 *
198 * For guest domains the current maximum reservation reflects
199 * the current maximum rather than the static maximum. In this
200 * case the e820 map provided to us will cover the static
201 * maximum region.
202 */
203 if (xen_initial_domain()) {
204 ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
205 if (ret > 0)
206 max_pages = ret;
207 }
208
209 return min(max_pages, MAX_DOMAIN_PAGES);
210}
211
188/** 212/**
189 * machine_specific_memory_setup - Hook for machine specific memory setup. 213 * machine_specific_memory_setup - Hook for machine specific memory setup.
190 **/ 214 **/
@@ -293,6 +317,14 @@ char * __init xen_memory_setup(void)
293 317
294 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 318 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
295 319
320 extra_limit = xen_get_max_pages();
321 if (max_pfn + extra_pages > extra_limit) {
322 if (extra_limit > max_pfn)
323 extra_pages = extra_limit - max_pfn;
324 else
325 extra_pages = 0;
326 }
327
296 extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); 328 extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
297 329
298 /* 330 /*
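
The clamp added above keeps max_pfn + extra_pages at or below the cap returned by xen_get_max_pages(). A standalone sketch of just that arithmetic, with made-up page counts (none of these numbers come from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long max_pfn     = 0x40000;	/* pages covered by the e820 map */
	unsigned long extra_pages = 0x20000;	/* candidate extra (ballooned) pages */
	unsigned long extra_limit = 0x50000;	/* pretend xen_get_max_pages() result */

	if (max_pfn + extra_pages > extra_limit) {
		if (extra_limit > max_pfn)
			extra_pages = extra_limit - max_pfn;	/* shrink to fit the cap */
		else
			extra_pages = 0;			/* already at/over the cap */
	}

	printf("extra_pages clamped to %#lx\n", extra_pages);	/* prints 0x10000 */
	return 0;
}
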
@@ -426,8 +458,8 @@ void __init xen_arch_setup(void)
426#ifdef CONFIG_X86_32 458#ifdef CONFIG_X86_32
427 boot_cpu_data.hlt_works_ok = 1; 459 boot_cpu_data.hlt_works_ok = 1;
428#endif 460#endif
429 pm_idle = default_idle; 461 disable_cpuidle();
430 boot_option_idle_override = IDLE_HALT; 462 boot_option_idle_override = IDLE_HALT;
431 463 WARN_ON(set_pm_idle_to_default());
432 fiddle_vdso(); 464 fiddle_vdso();
433} 465}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index b4533a86d7e..041d4fe9dfe 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -32,6 +32,7 @@
32#include <xen/page.h> 32#include <xen/page.h>
33#include <xen/events.h> 33#include <xen/events.h>
34 34
35#include <xen/hvc-console.h>
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
@@ -207,6 +208,15 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
207 unsigned cpu; 208 unsigned cpu;
208 unsigned int i; 209 unsigned int i;
209 210
211 if (skip_ioapic_setup) {
212 char *m = (max_cpus == 0) ?
213 "The nosmp parameter is incompatible with Xen; " \
214 "use Xen dom0_max_vcpus=1 parameter" :
215 "The noapic parameter is incompatible with Xen";
216
217 xen_raw_printk(m);
218 panic(m);
219 }
210 xen_init_lock_cpu(0); 220 xen_init_lock_cpu(0);
211 221
212 smp_store_cpu_info(0); 222 smp_store_cpu_info(0);
@@ -521,10 +531,7 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
521 native_smp_prepare_cpus(max_cpus); 531 native_smp_prepare_cpus(max_cpus);
522 WARN_ON(xen_smp_intr_init(0)); 532 WARN_ON(xen_smp_intr_init(0));
523 533
524 if (!xen_have_vector_callback)
525 return;
526 xen_init_lock_cpu(0); 534 xen_init_lock_cpu(0);
527 xen_init_spinlocks();
528} 535}
529 536
530static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) 537static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
@@ -546,6 +553,8 @@ static void xen_hvm_cpu_die(unsigned int cpu)
546 553
547void __init xen_hvm_smp_init(void) 554void __init xen_hvm_smp_init(void)
548{ 555{
556 if (!xen_have_vector_callback)
557 return;
549 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; 558 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
550 smp_ops.smp_send_reschedule = xen_smp_send_reschedule; 559 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
551 smp_ops.cpu_up = xen_hvm_cpu_up; 560 smp_ops.cpu_up = xen_hvm_cpu_up;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 5158c505bef..163b4679556 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -168,9 +168,10 @@ cycle_t xen_clocksource_read(void)
168 struct pvclock_vcpu_time_info *src; 168 struct pvclock_vcpu_time_info *src;
169 cycle_t ret; 169 cycle_t ret;
170 170
171 src = &get_cpu_var(xen_vcpu)->time; 171 preempt_disable_notrace();
172 src = &__get_cpu_var(xen_vcpu)->time;
172 ret = pvclock_clocksource_read(src); 173 ret = pvclock_clocksource_read(src);
173 put_cpu_var(xen_vcpu); 174 preempt_enable_notrace();
174 return ret; 175 return ret;
175} 176}
176 177
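
The switch from get_cpu_var()/put_cpu_var() to the notrace preempt helpers keeps the clocksource read out of the instrumented preempt_count paths, presumably because the read can itself be reached from tracing code (my reading; the hunk does not say). A minimal sketch of the pattern, using a hypothetical per-cpu variable:

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(u64, demo_counter);	/* hypothetical per-cpu variable */

static u64 demo_read_notrace(void)
{
	u64 val;

	preempt_disable_notrace();	/* pin the CPU without the traced helpers */
	val = __get_cpu_var(demo_counter);
	preempt_enable_notrace();

	return val;
}
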
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
new file mode 100644
index 00000000000..520022d1a18
--- /dev/null
+++ b/arch/x86/xen/trace.c
@@ -0,0 +1,62 @@
1#include <linux/ftrace.h>
2#include <xen/interface/xen.h>
3
4#define N(x) [__HYPERVISOR_##x] = "("#x")"
5static const char *xen_hypercall_names[] = {
6 N(set_trap_table),
7 N(mmu_update),
8 N(set_gdt),
9 N(stack_switch),
10 N(set_callbacks),
11 N(fpu_taskswitch),
12 N(sched_op_compat),
13 N(dom0_op),
14 N(set_debugreg),
15 N(get_debugreg),
16 N(update_descriptor),
17 N(memory_op),
18 N(multicall),
19 N(update_va_mapping),
20 N(set_timer_op),
21 N(event_channel_op_compat),
22 N(xen_version),
23 N(console_io),
24 N(physdev_op_compat),
25 N(grant_table_op),
26 N(vm_assist),
27 N(update_va_mapping_otherdomain),
28 N(iret),
29 N(vcpu_op),
30 N(set_segment_base),
31 N(mmuext_op),
32 N(acm_op),
33 N(nmi_op),
34 N(sched_op),
35 N(callback_op),
36 N(xenoprof_op),
37 N(event_channel_op),
38 N(physdev_op),
39 N(hvm_op),
40
41/* Architecture-specific hypercall definitions. */
42 N(arch_0),
43 N(arch_1),
44 N(arch_2),
45 N(arch_3),
46 N(arch_4),
47 N(arch_5),
48 N(arch_6),
49 N(arch_7),
50};
51#undef N
52
53static const char *xen_hypercall_name(unsigned op)
54{
55 if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
56 return xen_hypercall_names[op];
57
58 return "";
59}
60
61#define CREATE_TRACE_POINTS
62#include <trace/events/xen.h>
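
The table above relies on designated initializers: any slot not named stays NULL, and xen_hypercall_name() falls back to an empty string for gaps and out-of-range ops. A standalone sketch of the same scheme, using stand-in constants rather than the real __HYPERVISOR_* values:

#include <stdio.h>

#define __HYP_mmu_update 1			/* stand-ins for __HYPERVISOR_* */
#define __HYP_multicall  13

#define N(x) [__HYP_##x] = "(" #x ")"
static const char *names[] = {
	N(mmu_update),
	N(multicall),
};
#undef N

static const char *hypercall_name(unsigned op)
{
	if (op < sizeof(names) / sizeof(names[0]) && names[op] != NULL)
		return names[op];
	return "";				/* unnamed or out of range */
}

int main(void)
{
	printf("13 -> %s\n", hypercall_name(13));	/* (multicall)        */
	printf(" 5 -> %s\n", hypercall_name(5));	/* gap -> ""          */
	printf("99 -> %s\n", hypercall_name(99));	/* out of range -> "" */
	return 0;
}
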
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 00000000000..1cd7f4d11e2
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,67 @@
1#include <linux/screen_info.h>
2#include <linux/init.h>
3
4#include <asm/bootparam.h>
5#include <asm/setup.h>
6
7#include <xen/interface/xen.h>
8
9#include "xen-ops.h"
10
11void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
12{
13 struct screen_info *screen_info = &boot_params.screen_info;
14
15 /* This is drawn from a dump from vgacon:startup in
16 * standard Linux. */
17 screen_info->orig_video_mode = 3;
18 screen_info->orig_video_isVGA = 1;
19 screen_info->orig_video_lines = 25;
20 screen_info->orig_video_cols = 80;
21 screen_info->orig_video_ega_bx = 3;
22 screen_info->orig_video_points = 16;
23 screen_info->orig_y = screen_info->orig_video_lines - 1;
24
25 switch (info->video_type) {
26 case XEN_VGATYPE_TEXT_MODE_3:
27 if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
28 + sizeof(info->u.text_mode_3))
29 break;
30 screen_info->orig_video_lines = info->u.text_mode_3.rows;
31 screen_info->orig_video_cols = info->u.text_mode_3.columns;
32 screen_info->orig_x = info->u.text_mode_3.cursor_x;
33 screen_info->orig_y = info->u.text_mode_3.cursor_y;
34 screen_info->orig_video_points =
35 info->u.text_mode_3.font_height;
36 break;
37
38 case XEN_VGATYPE_VESA_LFB:
39 if (size < offsetof(struct dom0_vga_console_info,
40 u.vesa_lfb.gbl_caps))
41 break;
42 screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
43 screen_info->lfb_width = info->u.vesa_lfb.width;
44 screen_info->lfb_height = info->u.vesa_lfb.height;
45 screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
46 screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
47 screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
48 screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
49 screen_info->red_size = info->u.vesa_lfb.red_size;
50 screen_info->red_pos = info->u.vesa_lfb.red_pos;
51 screen_info->green_size = info->u.vesa_lfb.green_size;
52 screen_info->green_pos = info->u.vesa_lfb.green_pos;
53 screen_info->blue_size = info->u.vesa_lfb.blue_size;
54 screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
55 screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
56 screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
57 if (size >= offsetof(struct dom0_vga_console_info,
58 u.vesa_lfb.gbl_caps)
59 + sizeof(info->u.vesa_lfb.gbl_caps))
60 screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
61 if (size >= offsetof(struct dom0_vga_console_info,
62 u.vesa_lfb.mode_attrs)
63 + sizeof(info->u.vesa_lfb.mode_attrs))
64 screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
65 break;
66 }
67}
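
Both branches above guard every optional field with an offsetof()+sizeof() check against the size the hypervisor passed in, so an older dom0_vga_console_info layout is only read as far as it actually extends. A standalone sketch of that idiom with an invented structure (names below are not from the Xen interface):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct demo_console_info {			/* invented for illustration */
	int video_type;
	struct {
		int width;
		int height;
		int gbl_caps;			/* added in a later ABI revision */
	} lfb;
};

static int demo_get_caps(const struct demo_console_info *info, size_t size)
{
	/* Only trust the field if the caller's size says it is present. */
	if (size >= offsetof(struct demo_console_info, lfb.gbl_caps)
			+ sizeof(info->lfb.gbl_caps))
		return info->lfb.gbl_caps;
	return 0;				/* older ABI: safe default */
}

int main(void)
{
	struct demo_console_info info;

	memset(&info, 0, sizeof(info));
	info.lfb.gbl_caps = 0x3;

	printf("new ABI: caps=%d\n", demo_get_caps(&info, sizeof(info)));
	printf("old ABI: caps=%d\n",
	       demo_get_caps(&info, offsetof(struct demo_console_info, lfb.gbl_caps)));
	return 0;
}
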
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 22a2093b586..b040b0e518c 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -113,11 +113,13 @@ xen_iret_start_crit:
113 113
114 /* 114 /*
115 * If there's something pending, mask events again so we can 115 * If there's something pending, mask events again so we can
116 * jump back into xen_hypervisor_callback 116 * jump back into xen_hypervisor_callback. Otherwise do not
117 * touch XEN_vcpu_info_mask.
117 */ 118 */
118 sete XEN_vcpu_info_mask(%eax) 119 jne 1f
120 movb $1, XEN_vcpu_info_mask(%eax)
119 121
120 popl %eax 1221: popl %eax
121 123
122 /* 124 /*
123 * From this point on the registers are restored and the stack 125 * From this point on the registers are restored and the stack
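
Rendered as rough C, the change above stops this exit path from ever clearing the event mask; it now only sets it when an event is pending. The compare that produces the flag sits outside the hunk, so this is a paraphrase of the updated comment, not a literal translation:

#include <xen/interface/xen.h>

/* Paraphrase only -- the real code is the assembly above. */
static void demo_restore_mask(struct vcpu_info *v, int pending)
{
	/* before: v->evtchn_upcall_mask = pending ? 1 : 0;  (could clear it) */
	if (pending)
		v->evtchn_upcall_mask = 1;	/* after: set when pending, else untouched */
}
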
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 97dfdc8757b..b095739ccd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu)
88} 88}
89#endif 89#endif
90 90
91struct dom0_vga_console_info;
92
93#ifdef CONFIG_XEN_DOM0
94void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
95#else
96static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
97 size_t size)
98{
99}
100#endif
101
91/* Declare an asm function, along with symbols needed to make it 102/* Declare an asm function, along with symbols needed to make it
92 inlineable */ 103 inlineable */
93#define DECL_ASM(ret, name, ...) \ 104#define DECL_ASM(ret, name, ...) \