aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild1
-rw-r--r--arch/x86/Kconfig114
-rw-r--r--arch/x86/Kconfig.debug14
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/boot/compressed/misc.c29
-rw-r--r--arch/x86/include/asm/alternative.h11
-rw-r--r--arch/x86/include/asm/amd_iommu.h6
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h23
-rw-r--r--arch/x86/include/asm/amd_nb.h (renamed from arch/x86/include/asm/k8.h)21
-rw-r--r--arch/x86/include/asm/apic.h4
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/bitops.h2
-rw-r--r--arch/x86/include/asm/calgary.h4
-rw-r--r--arch/x86/include/asm/calling.h52
-rw-r--r--arch/x86/include/asm/cpufeature.h13
-rw-r--r--arch/x86/include/asm/dwarf2.h20
-rw-r--r--arch/x86/include/asm/e820.h20
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h23
-rw-r--r--arch/x86/include/asm/fixmap.h15
-rw-r--r--arch/x86/include/asm/gart.h20
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/highmem.h11
-rw-r--r--arch/x86/include/asm/hw_irq.h2
-rw-r--r--arch/x86/include/asm/i387.h185
-rw-r--r--arch/x86/include/asm/io.h2
-rw-r--r--arch/x86/include/asm/iomap.h4
-rw-r--r--arch/x86/include/asm/iommu_table.h100
-rw-r--r--arch/x86/include/asm/irq.h12
-rw-r--r--arch/x86/include/asm/irq_vectors.h4
-rw-r--r--arch/x86/include/asm/irqflags.h32
-rw-r--r--arch/x86/include/asm/jump_label.h37
-rw-r--r--arch/x86/include/asm/kvm_emulate.h30
-rw-r--r--arch/x86/include/asm/kvm_host.h105
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/include/asm/memblock.h23
-rw-r--r--arch/x86/include/asm/module.h7
-rw-r--r--arch/x86/include/asm/mrst.h10
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/mwait.h15
-rw-r--r--arch/x86/include/asm/olpc.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h4
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h21
-rw-r--r--arch/x86/include/asm/paravirt_types.h1
-rw-r--r--arch/x86/include/asm/percpu.h14
-rw-r--r--arch/x86/include/asm/perf_event.h19
-rw-r--r--arch/x86/include/asm/perf_event_p4.h52
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_32.h16
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/processor.h29
-rw-r--r--arch/x86/include/asm/pvclock.h38
-rw-r--r--arch/x86/include/asm/segment.h32
-rw-r--r--arch/x86/include/asm/setup.h5
-rw-r--r--arch/x86/include/asm/smp.h9
-rw-r--r--arch/x86/include/asm/swiotlb.h13
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/trampoline.h3
-rw-r--r--arch/x86/include/asm/vmi.h269
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/include/asm/xen/hypercall.h17
-rw-r--r--arch/x86/include/asm/xen/page.h12
-rw-r--r--arch/x86/kernel/Makefile20
-rw-r--r--arch/x86/kernel/acpi/cstate.c11
-rw-r--r--arch/x86/kernel/acpi/sleep.c15
-rw-r--r--arch/x86/kernel/alternative.c71
-rw-r--r--arch/x86/kernel/amd_iommu.c2
-rw-r--r--arch/x86/kernel/amd_iommu_init.c139
-rw-r--r--arch/x86/kernel/amd_nb.c (renamed from arch/x86/kernel/k8.c)56
-rw-r--r--arch/x86/kernel/apb_timer.c4
-rw-r--r--arch/x86/kernel/aperture_64.c31
-rw-r--r--arch/x86/kernel/apic/apic.c88
-rw-r--r--arch/x86/kernel/apic/io_apic.c3
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apm_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_32.c4
-rw-r--r--arch/x86/kernel/check.c16
-rw-r--r--arch/x86/kernel/cpu/amd.c75
-rw-r--r--arch/x86/kernel/cpu/common.c22
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c4
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c27
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c128
-rw-r--r--arch/x86/kernel/cpu/perf_event.c306
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c8
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c229
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c292
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c9
-rw-r--r--arch/x86/kernel/cpu/scattered.c6
-rw-r--r--arch/x86/kernel/crash_dump_32.c2
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c8
-rw-r--r--arch/x86/kernel/e820.c191
-rw-r--r--arch/x86/kernel/early_printk.c13
-rw-r--r--arch/x86/kernel/early_printk_mrst.c319
-rw-r--r--arch/x86/kernel/entry_32.S310
-rw-r--r--arch/x86/kernel/entry_64.S134
-rw-r--r--arch/x86/kernel/ftrace.c63
-rw-r--r--arch/x86/kernel/head.c3
-rw-r--r--arch/x86/kernel/head32.c11
-rw-r--r--arch/x86/kernel/head64.c9
-rw-r--r--arch/x86/kernel/head_32.S55
-rw-r--r--arch/x86/kernel/hpet.c53
-rw-r--r--arch/x86/kernel/i387.c58
-rw-r--r--arch/x86/kernel/irq.c8
-rw-r--r--arch/x86/kernel/irq_32.c20
-rw-r--r--arch/x86/kernel/irq_work.c30
-rw-r--r--arch/x86/kernel/irqinit.c6
-rw-r--r--arch/x86/kernel/jump_label.c50
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c11
-rw-r--r--arch/x86/kernel/kprobes.c14
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/microcode_core.c3
-rw-r--r--arch/x86/kernel/microcode_intel.c2
-rw-r--r--arch/x86/kernel/module.c3
-rw-r--r--arch/x86/kernel/mpparse.c5
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c44
-rw-r--r--arch/x86/kernel/pci-gart_64.c33
-rw-r--r--arch/x86/kernel/pci-iommu_table.c89
-rw-r--r--arch/x86/kernel/pci-swiotlb.c44
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/ptrace.c17
-rw-r--r--arch/x86/kernel/pvclock.c3
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/setup.c234
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/smp.c15
-rw-r--r--arch/x86/kernel/smpboot.c129
-rw-r--r--arch/x86/kernel/sys_i386_32.c4
-rw-r--r--arch/x86/kernel/trampoline.c26
-rw-r--r--arch/x86/kernel/traps.c36
-rw-r--r--arch/x86/kernel/tsc.c66
-rw-r--r--arch/x86/kernel/vm86_32.c10
-rw-r--r--arch/x86/kernel/vmi_32.c893
-rw-r--r--arch/x86/kernel/vmiclock_32.c317
-rw-r--r--arch/x86/kernel/vmlinux.lds.S30
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/emulate.c2262
-rw-r--r--arch/x86/kvm/i8254.c11
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/irq.c9
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h9
-rw-r--r--arch/x86/kvm/lapic.c15
-rw-r--r--arch/x86/kvm/mmu.c918
-rw-r--r--arch/x86/kvm/mmu.h9
-rw-r--r--arch/x86/kvm/mmu_audit.c299
-rw-r--r--arch/x86/kvm/mmutrace.h19
-rw-r--r--arch/x86/kvm/paging_tmpl.h202
-rw-r--r--arch/x86/kvm/svm.c298
-rw-r--r--arch/x86/kvm/timer.c2
-rw-r--r--arch/x86/kvm/vmx.c243
-rw-r--r--arch/x86/kvm/x86.c787
-rw-r--r--arch/x86/kvm/x86.h8
-rw-r--r--arch/x86/lib/memcpy_32.c199
-rw-r--r--arch/x86/lib/memcpy_64.S158
-rw-r--r--arch/x86/lib/memmove_64.c189
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c110
-rw-r--r--arch/x86/mm/highmem_32.c76
-rw-r--r--arch/x86/mm/init.c10
-rw-r--r--arch/x86/mm/init_32.c164
-rw-r--r--arch/x86/mm/init_64.c115
-rw-r--r--arch/x86/mm/iomap_32.c43
-rw-r--r--arch/x86/mm/ioremap.c5
-rw-r--r--arch/x86/mm/k8topology_64.c6
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c2
-rw-r--r--arch/x86/mm/memblock.c348
-rw-r--r--arch/x86/mm/memtest.c7
-rw-r--r--arch/x86/mm/numa_32.c30
-rw-r--r--arch/x86/mm/numa_64.c86
-rw-r--r--arch/x86/mm/pgtable.c24
-rw-r--r--arch/x86/mm/srat_32.c3
-rw-r--r--arch/x86/mm/srat_64.c11
-rw-r--r--arch/x86/mm/tlb.c48
-rw-r--r--arch/x86/oprofile/backtrace.c70
-rw-r--r--arch/x86/oprofile/nmi_int.c15
-rw-r--r--arch/x86/oprofile/op_model_amd.c261
-rw-r--r--arch/x86/pci/i386.c17
-rw-r--r--arch/x86/pci/irq.c11
-rw-r--r--arch/x86/pci/mmconfig-shared.c4
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/platform/Makefile8
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c (renamed from arch/x86/kernel/efi.c)5
-rw-r--r--arch/x86/platform/efi/efi_32.c (renamed from arch/x86/kernel/efi_32.c)0
-rw-r--r--arch/x86/platform/efi/efi_64.c (renamed from arch/x86/kernel/efi_64.c)0
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S (renamed from arch/x86/kernel/efi_stub_32.S)0
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S (renamed from arch/x86/kernel/efi_stub_64.S)0
-rw-r--r--arch/x86/platform/mrst/Makefile1
-rw-r--r--arch/x86/platform/mrst/mrst.c (renamed from arch/x86/kernel/mrst.c)0
-rw-r--r--arch/x86/platform/olpc/Makefile3
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c140
-rw-r--r--arch/x86/platform/olpc/olpc.c (renamed from arch/x86/kernel/olpc.c)89
-rw-r--r--arch/x86/platform/olpc/olpc_ofw.c (renamed from arch/x86/kernel/olpc_ofw.c)6
-rw-r--r--arch/x86/platform/scx200/Makefile2
-rw-r--r--arch/x86/platform/scx200/scx200_32.c (renamed from arch/x86/kernel/scx200_32.c)0
-rw-r--r--arch/x86/platform/sfi/Makefile1
-rw-r--r--arch/x86/platform/sfi/sfi.c (renamed from arch/x86/kernel/sfi.c)4
-rw-r--r--arch/x86/platform/uv/Makefile1
-rw-r--r--arch/x86/platform/uv/bios_uv.c (renamed from arch/x86/kernel/bios_uv.c)0
-rw-r--r--arch/x86/platform/uv/tlb_uv.c (renamed from arch/x86/kernel/tlb_uv.c)12
-rw-r--r--arch/x86/platform/uv/uv_irq.c (renamed from arch/x86/kernel/uv_irq.c)0
-rw-r--r--arch/x86/platform/uv/uv_sysfs.c (renamed from arch/x86/kernel/uv_sysfs.c)0
-rw-r--r--arch/x86/platform/uv/uv_time.c (renamed from arch/x86/kernel/uv_time.c)0
-rw-r--r--arch/x86/platform/visws/Makefile1
-rw-r--r--arch/x86/platform/visws/visws_quirks.c (renamed from arch/x86/kernel/visws_quirks.c)0
-rw-r--r--arch/x86/xen/Kconfig11
-rw-r--r--arch/x86/xen/debugfs.c1
-rw-r--r--arch/x86/xen/enlighten.c22
-rw-r--r--arch/x86/xen/mmu.c533
-rw-r--r--arch/x86/xen/mmu.h1
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c5
-rw-r--r--arch/x86/xen/setup.c115
-rw-r--r--arch/x86/xen/smp.c6
-rw-r--r--arch/x86/xen/spinlock.c2
-rw-r--r--arch/x86/xen/xen-ops.h3
236 files changed, 8729 insertions, 6155 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index ad8ec356fb36..0e103236b754 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -14,3 +14,4 @@ obj-y += crypto/
14obj-y += vdso/ 14obj-y += vdso/
15obj-$(CONFIG_IA32_EMULATION) += ia32/ 15obj-$(CONFIG_IA32_EMULATION) += ia32/
16 16
17obj-y += platform/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 74ea59d34076..e8327686d3c5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,6 +1,3 @@
1# x86 configuration
2mainmenu "Linux Kernel Configuration for x86"
3
4# Select 32 or 64 bit 1# Select 32 or 64 bit
5config 64BIT 2config 64BIT
6 bool "64-bit kernel" if ARCH = "x86" 3 bool "64-bit kernel" if ARCH = "x86"
@@ -25,14 +22,17 @@ config X86
25 select HAVE_IDE 22 select HAVE_IDE
26 select HAVE_OPROFILE 23 select HAVE_OPROFILE
27 select HAVE_PERF_EVENTS if (!M386 && !M486) 24 select HAVE_PERF_EVENTS if (!M386 && !M486)
25 select HAVE_IRQ_WORK
28 select HAVE_IOREMAP_PROT 26 select HAVE_IOREMAP_PROT
29 select HAVE_KPROBES 27 select HAVE_KPROBES
28 select HAVE_MEMBLOCK
30 select ARCH_WANT_OPTIONAL_GPIOLIB 29 select ARCH_WANT_OPTIONAL_GPIOLIB
31 select ARCH_WANT_FRAME_POINTERS 30 select ARCH_WANT_FRAME_POINTERS
32 select HAVE_DMA_ATTRS 31 select HAVE_DMA_ATTRS
33 select HAVE_KRETPROBES 32 select HAVE_KRETPROBES
34 select HAVE_OPTPROBES 33 select HAVE_OPTPROBES
35 select HAVE_FTRACE_MCOUNT_RECORD 34 select HAVE_FTRACE_MCOUNT_RECORD
35 select HAVE_C_RECORDMCOUNT
36 select HAVE_DYNAMIC_FTRACE 36 select HAVE_DYNAMIC_FTRACE
37 select HAVE_FUNCTION_TRACER 37 select HAVE_FUNCTION_TRACER
38 select HAVE_FUNCTION_GRAPH_TRACER 38 select HAVE_FUNCTION_GRAPH_TRACER
@@ -59,6 +59,8 @@ config X86
59 select ANON_INODES 59 select ANON_INODES
60 select HAVE_ARCH_KMEMCHECK 60 select HAVE_ARCH_KMEMCHECK
61 select HAVE_USER_RETURN_NOTIFIER 61 select HAVE_USER_RETURN_NOTIFIER
62 select HAVE_ARCH_JUMP_LABEL
63 select HAVE_TEXT_POKE_SMP
62 select HAVE_GENERIC_HARDIRQS 64 select HAVE_GENERIC_HARDIRQS
63 select HAVE_SPARSE_IRQ 65 select HAVE_SPARSE_IRQ
64 select GENERIC_IRQ_PROBE 66 select GENERIC_IRQ_PROBE
@@ -197,9 +199,6 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
197config ARCH_SUPPORTS_DEBUG_PAGEALLOC 199config ARCH_SUPPORTS_DEBUG_PAGEALLOC
198 def_bool y 200 def_bool y
199 201
200config HAVE_EARLY_RES
201 def_bool y
202
203config HAVE_INTEL_TXT 202config HAVE_INTEL_TXT
204 def_bool y 203 def_bool y
205 depends on EXPERIMENTAL && DMAR && ACPI 204 depends on EXPERIMENTAL && DMAR && ACPI
@@ -345,6 +344,7 @@ endif
345 344
346config X86_VSMP 345config X86_VSMP
347 bool "ScaleMP vSMP" 346 bool "ScaleMP vSMP"
347 select PARAVIRT_GUEST
348 select PARAVIRT 348 select PARAVIRT
349 depends on X86_64 && PCI 349 depends on X86_64 && PCI
350 depends on X86_EXTENDED_PLATFORM 350 depends on X86_EXTENDED_PLATFORM
@@ -490,25 +490,6 @@ if PARAVIRT_GUEST
490 490
491source "arch/x86/xen/Kconfig" 491source "arch/x86/xen/Kconfig"
492 492
493config VMI
494 bool "VMI Guest support (DEPRECATED)"
495 select PARAVIRT
496 depends on X86_32
497 ---help---
498 VMI provides a paravirtualized interface to the VMware ESX server
499 (it could be used by other hypervisors in theory too, but is not
500 at the moment), by linking the kernel to a GPL-ed ROM module
501 provided by the hypervisor.
502
503 As of September 2009, VMware has started a phased retirement
504 of this feature from VMware's products. Please see
505 feature-removal-schedule.txt for details. If you are
506 planning to enable this option, please note that you cannot
507 live migrate a VMI enabled VM to a future VMware product,
508 which doesn't support VMI. So if you expect your kernel to
509 seamlessly migrate to newer VMware products, keep this
510 disabled.
511
512config KVM_CLOCK 493config KVM_CLOCK
513 bool "KVM paravirtualized clock" 494 bool "KVM paravirtualized clock"
514 select PARAVIRT 495 select PARAVIRT
@@ -563,16 +544,7 @@ config PARAVIRT_DEBUG
563 a paravirt_op is missing when it is called. 544 a paravirt_op is missing when it is called.
564 545
565config NO_BOOTMEM 546config NO_BOOTMEM
566 default y 547 def_bool y
567 bool "Disable Bootmem code"
568 ---help---
569 Use early_res directly instead of bootmem before slab is ready.
570 - allocator (buddy) [generic]
571 - early allocator (bootmem) [generic]
572 - very early allocator (reserve_early*()) [x86]
573 - very very early allocator (early brk model) [x86]
574 So reduce one layer between early allocator to final allocator
575
576 548
577config MEMTEST 549config MEMTEST
578 bool "Memtest" 550 bool "Memtest"
@@ -643,7 +615,7 @@ config GART_IOMMU
643 bool "GART IOMMU support" if EMBEDDED 615 bool "GART IOMMU support" if EMBEDDED
644 default y 616 default y
645 select SWIOTLB 617 select SWIOTLB
646 depends on X86_64 && PCI && K8_NB 618 depends on X86_64 && PCI && AMD_NB
647 ---help--- 619 ---help---
648 Support for full DMA access of devices with 32bit memory access only 620 Support for full DMA access of devices with 32bit memory access only
649 on systems with more than 3GB. This is usually needed for USB, 621 on systems with more than 3GB. This is usually needed for USB,
@@ -768,6 +740,17 @@ config SCHED_MC
768 making when dealing with multi-core CPU chips at a cost of slightly 740 making when dealing with multi-core CPU chips at a cost of slightly
769 increased overhead in some places. If unsure say N here. 741 increased overhead in some places. If unsure say N here.
770 742
743config IRQ_TIME_ACCOUNTING
744 bool "Fine granularity task level IRQ time accounting"
745 default n
746 ---help---
747 Select this option to enable fine granularity task irq time
748 accounting. This is done by reading a timestamp on each
749 transitions between softirq and hardirq state, so there can be a
750 small performance impact.
751
752 If in doubt, say N here.
753
771source "kernel/Kconfig.preempt" 754source "kernel/Kconfig.preempt"
772 755
773config X86_UP_APIC 756config X86_UP_APIC
@@ -1121,6 +1104,9 @@ config X86_PAE
1121config ARCH_PHYS_ADDR_T_64BIT 1104config ARCH_PHYS_ADDR_T_64BIT
1122 def_bool X86_64 || X86_PAE 1105 def_bool X86_64 || X86_PAE
1123 1106
1107config ARCH_DMA_ADDR_T_64BIT
1108 def_bool X86_64 || HIGHMEM64G
1109
1124config DIRECT_GBPAGES 1110config DIRECT_GBPAGES
1125 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED 1111 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
1126 default y 1112 default y
@@ -1299,25 +1285,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1299 Set whether the default state of memory_corruption_check is 1285 Set whether the default state of memory_corruption_check is
1300 on or off. 1286 on or off.
1301 1287
1302config X86_RESERVE_LOW_64K 1288config X86_RESERVE_LOW
1303 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" 1289 int "Amount of low memory, in kilobytes, to reserve for the BIOS"
1304 default y 1290 default 64
1291 range 4 640
1305 ---help--- 1292 ---help---
1306 Reserve the first 64K of physical RAM on BIOSes that are known 1293 Specify the amount of low memory to reserve for the BIOS.
1307 to potentially corrupt that memory range. A numbers of BIOSes are 1294
1308 known to utilize this area during suspend/resume, so it must not 1295 The first page contains BIOS data structures that the kernel
1309 be used by the kernel. 1296 must not use, so that page must always be reserved.
1310 1297
1311 Set this to N if you are absolutely sure that you trust the BIOS 1298 By default we reserve the first 64K of physical RAM, as a
1312 to get all its memory reservations and usages right. 1299 number of BIOSes are known to corrupt that memory range
1300 during events such as suspend/resume or monitor cable
1301 insertion, so it must not be used by the kernel.
1313 1302
1314 If you have doubts about the BIOS (e.g. suspend/resume does not 1303 You can set this to 4 if you are absolutely sure that you
1315 work or there's kernel crashes after certain hardware hotplug 1304 trust the BIOS to get all its memory reservations and usages
1316 events) and it's not AMI or Phoenix, then you might want to enable 1305 right. If you know your BIOS have problems beyond the
1317 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical 1306 default 64K area, you can set this to 640 to avoid using the
1318 corruption patterns. 1307 entire low memory range.
1319 1308
1320 Say Y if unsure. 1309 If you have doubts about the BIOS (e.g. suspend/resume does
1310 not work or there's kernel crashes after certain hardware
1311 hotplug events) then you might want to enable
1312 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
1313 typical corruption patterns.
1314
1315 Leave this to the default value of 64 if you are unsure.
1321 1316
1322config MATH_EMULATION 1317config MATH_EMULATION
1323 bool 1318 bool
@@ -1873,7 +1868,7 @@ config PCI_GODIRECT
1873 bool "Direct" 1868 bool "Direct"
1874 1869
1875config PCI_GOOLPC 1870config PCI_GOOLPC
1876 bool "OLPC" 1871 bool "OLPC XO-1"
1877 depends on OLPC 1872 depends on OLPC
1878 1873
1879config PCI_GOANY 1874config PCI_GOANY
@@ -2039,14 +2034,21 @@ config SCx200HR_TIMER
2039config OLPC 2034config OLPC
2040 bool "One Laptop Per Child support" 2035 bool "One Laptop Per Child support"
2041 select GPIOLIB 2036 select GPIOLIB
2037 select OLPC_OPENFIRMWARE
2042 ---help--- 2038 ---help---
2043 Add support for detecting the unique features of the OLPC 2039 Add support for detecting the unique features of the OLPC
2044 XO hardware. 2040 XO hardware.
2045 2041
2042config OLPC_XO1
2043 tristate "OLPC XO-1 support"
2044 depends on OLPC && PCI
2045 ---help---
2046 Add support for non-essential features of the OLPC XO-1 laptop.
2047
2046config OLPC_OPENFIRMWARE 2048config OLPC_OPENFIRMWARE
2047 bool "Support for OLPC's Open Firmware" 2049 bool "Support for OLPC's Open Firmware"
2048 depends on !X86_64 && !X86_PAE 2050 depends on !X86_64 && !X86_PAE
2049 default y if OLPC 2051 default n
2050 help 2052 help
2051 This option adds support for the implementation of Open Firmware 2053 This option adds support for the implementation of Open Firmware
2052 that is used on the OLPC XO-1 Children's Machine. 2054 that is used on the OLPC XO-1 Children's Machine.
@@ -2054,7 +2056,7 @@ config OLPC_OPENFIRMWARE
2054 2056
2055endif # X86_32 2057endif # X86_32
2056 2058
2057config K8_NB 2059config AMD_NB
2058 def_bool y 2060 def_bool y
2059 depends on CPU_SUP_AMD && PCI 2061 depends on CPU_SUP_AMD && PCI
2060 2062
@@ -2103,6 +2105,10 @@ config HAVE_ATOMIC_IOMAP
2103 def_bool y 2105 def_bool y
2104 depends on X86_32 2106 depends on X86_32
2105 2107
2108config HAVE_TEXT_POKE_SMP
2109 bool
2110 select STOP_MACHINE if SMP
2111
2106source "net/Kconfig" 2112source "net/Kconfig"
2107 2113
2108source "drivers/Kconfig" 2114source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 75085080b63e..b59ee765414e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,10 @@ config EARLY_PRINTK
43 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_MRST
47 bool "Early printk for MRST platform support"
48 depends on EARLY_PRINTK && X86_MRST
49
46config EARLY_PRINTK_DBGP 50config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port" 51 bool "Early printk via EHCI debug port"
48 depends on EARLY_PRINTK && PCI 52 depends on EARLY_PRINTK && PCI
@@ -121,16 +125,6 @@ config DEBUG_NX_TEST
121 and the software setup of this feature. 125 and the software setup of this feature.
122 If in doubt, say "N" 126 If in doubt, say "N"
123 127
124config 4KSTACKS
125 bool "Use 4Kb for kernel stacks instead of 8Kb"
126 depends on X86_32
127 ---help---
128 If you say Y here the kernel will use a 4Kb stacksize for the
129 kernel stack attached to each process/thread. This facilitates
130 running more threads on a system and also reduces the pressure
131 on the VM subsystem for higher order allocations. This option
132 will also use IRQ stacks to compensate for the reduced stackspace.
133
134config DOUBLEFAULT 128config DOUBLEFAULT
135 default y 129 default y
136 bool "Enable doublefault exception handler" if EMBEDDED 130 bool "Enable doublefault exception handler" if EMBEDDED
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e8c8881351b3..b02e509072a7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
96# is .cfi_signal_frame supported too? 96# is .cfi_signal_frame supported too?
97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) 97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) 98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
99KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) 99
100KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) 100# does binutils support specific instructions?
101asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
102
103KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
104KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
101 105
102LDFLAGS := -m elf_$(UTS_MACHINE) 106LDFLAGS := -m elf_$(UTS_MACHINE)
103 107
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8f7bef8e9fff..23f315c9f215 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -229,18 +229,35 @@ void *memset(void *s, int c, size_t n)
229 ss[i] = c; 229 ss[i] = c;
230 return s; 230 return s;
231} 231}
232 232#ifdef CONFIG_X86_32
233void *memcpy(void *dest, const void *src, size_t n) 233void *memcpy(void *dest, const void *src, size_t n)
234{ 234{
235 int i; 235 int d0, d1, d2;
236 const char *s = src; 236 asm volatile(
237 char *d = dest; 237 "rep ; movsl\n\t"
238 "movl %4,%%ecx\n\t"
239 "rep ; movsb\n\t"
240 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
241 : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
242 : "memory");
238 243
239 for (i = 0; i < n; i++)
240 d[i] = s[i];
241 return dest; 244 return dest;
242} 245}
246#else
247void *memcpy(void *dest, const void *src, size_t n)
248{
249 long d0, d1, d2;
250 asm volatile(
251 "rep ; movsq\n\t"
252 "movq %4,%%rcx\n\t"
253 "rep ; movsb\n\t"
254 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
255 : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
256 : "memory");
243 257
258 return dest;
259}
260#endif
244 261
245static void error(char *x) 262static void error(char *x)
246{ 263{
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bc6abb7bc7ee..76561d20ea2f 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,6 +4,7 @@
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/jump_label.h>
7#include <asm/asm.h> 8#include <asm/asm.h>
8 9
9/* 10/*
@@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
160#define __parainstructions_end NULL 161#define __parainstructions_end NULL
161#endif 162#endif
162 163
164extern void *text_poke_early(void *addr, const void *opcode, size_t len);
165
163/* 166/*
164 * Clear and restore the kernel write-protection flag on the local CPU. 167 * Clear and restore the kernel write-protection flag on the local CPU.
165 * Allows the kernel to edit read-only pages. 168 * Allows the kernel to edit read-only pages.
@@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
180extern void *text_poke(void *addr, const void *opcode, size_t len); 183extern void *text_poke(void *addr, const void *opcode, size_t len);
181extern void *text_poke_smp(void *addr, const void *opcode, size_t len); 184extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
182 185
186#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
187#define IDEAL_NOP_SIZE_5 5
188extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
189extern void arch_init_ideal_nop5(void);
190#else
191static inline void arch_init_ideal_nop5(void) {}
192#endif
193
183#endif /* _ASM_X86_ALTERNATIVE_H */ 194#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 5af2982133b5..a6863a2dec1f 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -24,11 +24,11 @@
24 24
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26 26
27extern void amd_iommu_detect(void); 27extern int amd_iommu_detect(void);
28 28
29#else 29#else
30 30
31static inline void amd_iommu_detect(void) { } 31static inline int amd_iommu_detect(void) { return -ENODEV; }
32 32
33#endif 33#endif
34 34
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index cb030374b90a..916bc8111a01 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify it 5 * This program is free software; you can redistribute it and/or modify it
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 08616180deaf..e3509fc303bf 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -416,13 +416,22 @@ struct amd_iommu {
416 struct dma_ops_domain *default_dom; 416 struct dma_ops_domain *default_dom;
417 417
418 /* 418 /*
419 * This array is required to work around a potential BIOS bug. 419 * We can't rely on the BIOS to restore all values on reinit, so we
420 * The BIOS may miss to restore parts of the PCI configuration 420 * need to stash them
421 * space when the system resumes from S3. The result is that the
422 * IOMMU does not execute commands anymore which leads to system
423 * failure.
424 */ 421 */
425 u32 cache_cfg[4]; 422
423 /* The iommu BAR */
424 u32 stored_addr_lo;
425 u32 stored_addr_hi;
426
427 /*
428 * Each iommu has 6 l1s, each of which is documented as having 0x12
429 * registers
430 */
431 u32 stored_l1[6][0x12];
432
433 /* The l2 indirect registers */
434 u32 stored_l2[0x83];
426}; 435};
427 436
428/* 437/*
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h
index af00bd1d2089..c8517f81b21e 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,5 +1,5 @@
1#ifndef _ASM_X86_K8_H 1#ifndef _ASM_X86_AMD_NB_H
2#define _ASM_X86_K8_H 2#define _ASM_X86_AMD_NB_H
3 3
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
@@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[];
7struct bootnode; 7struct bootnode;
8 8
9extern int early_is_k8_nb(u32 value); 9extern int early_is_k8_nb(u32 value);
10extern struct pci_dev **k8_northbridges;
11extern int num_k8_northbridges;
12extern int cache_k8_northbridges(void); 10extern int cache_k8_northbridges(void);
13extern void k8_flush_garts(void); 11extern void k8_flush_garts(void);
14extern int k8_get_nodes(struct bootnode *nodes); 12extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); 13extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void); 14extern int k8_scan_nodes(void);
17 15
18#ifdef CONFIG_K8_NB 16struct k8_northbridge_info {
19extern int num_k8_northbridges; 17 u16 num;
18 u8 gart_supported;
19 struct pci_dev **nb_misc;
20};
21extern struct k8_northbridge_info k8_northbridges;
22
23#ifdef CONFIG_AMD_NB
20 24
21static inline struct pci_dev *node_to_k8_nb_misc(int node) 25static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{ 26{
23 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; 27 return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
24} 28}
25 29
26#else 30#else
27#define num_k8_northbridges 0
28 31
29static inline struct pci_dev *node_to_k8_nb_misc(int node) 32static inline struct pci_dev *node_to_k8_nb_misc(int node)
30{ 33{
@@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node)
33#endif 36#endif
34 37
35 38
36#endif /* _ASM_X86_K8_H */ 39#endif /* _ASM_X86_AMD_NB_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1fa03e04ae44..286de34b0ed6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void)
252} 252}
253#endif 253#endif
254 254
255extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); 255extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
256extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
257
258 256
259#else /* !CONFIG_X86_LOCAL_APIC */ 257#else /* !CONFIG_X86_LOCAL_APIC */
260static inline void lapic_shutdown(void) { } 258static inline void lapic_shutdown(void) { }
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7fe3b3060f08..a859ca461fb0 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -131,6 +131,7 @@
131#define APIC_EILVTn(n) (0x500 + 0x10 * n) 131#define APIC_EILVTn(n) (0x500 + 0x10 * n)
132#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ 132#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
133#define APIC_EILVT_NR_AMD_10H 4 133#define APIC_EILVT_NR_AMD_10H 4
134#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
134#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) 135#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
135#define APIC_EILVT_MSG_FIX 0x0 136#define APIC_EILVT_MSG_FIX 0x0
136#define APIC_EILVT_MSG_SMI 0x2 137#define APIC_EILVT_MSG_SMI 0x2
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index bafd80defa43..903683b07e42 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -440,6 +440,8 @@ static inline int fls(int x)
440 440
441#ifdef __KERNEL__ 441#ifdef __KERNEL__
442 442
443#include <asm-generic/bitops/find.h>
444
443#include <asm-generic/bitops/sched.h> 445#include <asm-generic/bitops/sched.h>
444 446
445#define ARCH_HAS_FAST_MULTIPLIER 1 447#define ARCH_HAS_FAST_MULTIPLIER 1
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index 0918654305af..0d467b338835 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,9 +62,9 @@ struct cal_chipset_ops {
62extern int use_calgary; 62extern int use_calgary;
63 63
64#ifdef CONFIG_CALGARY_IOMMU 64#ifdef CONFIG_CALGARY_IOMMU
65extern void detect_calgary(void); 65extern int detect_calgary(void);
66#else 66#else
67static inline void detect_calgary(void) { return; } 67static inline int detect_calgary(void) { return -ENODEV; }
68#endif 68#endif
69 69
70#endif /* _ASM_X86_CALGARY_H */ 70#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 0e63c9a2a8d0..30af5a832163 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,36 +48,38 @@ For 32-bit we have the following conventions - kernel is built with
48 48
49 49
50/* 50/*
51 * 64-bit system call stack frame layout defines and helpers, 51 * 64-bit system call stack frame layout defines and helpers, for
52 * for assembly code: 52 * assembly code (note that the seemingly unnecessary parentheses
53 * are to prevent cpp from inserting spaces in expressions that get
54 * passed to macros):
53 */ 55 */
54 56
55#define R15 0 57#define R15 (0)
56#define R14 8 58#define R14 (8)
57#define R13 16 59#define R13 (16)
58#define R12 24 60#define R12 (24)
59#define RBP 32 61#define RBP (32)
60#define RBX 40 62#define RBX (40)
61 63
62/* arguments: interrupts/non tracing syscalls only save up to here: */ 64/* arguments: interrupts/non tracing syscalls only save up to here: */
63#define R11 48 65#define R11 (48)
64#define R10 56 66#define R10 (56)
65#define R9 64 67#define R9 (64)
66#define R8 72 68#define R8 (72)
67#define RAX 80 69#define RAX (80)
68#define RCX 88 70#define RCX (88)
69#define RDX 96 71#define RDX (96)
70#define RSI 104 72#define RSI (104)
71#define RDI 112 73#define RDI (112)
72#define ORIG_RAX 120 /* + error_code */ 74#define ORIG_RAX (120) /* + error_code */
73/* end of arguments */ 75/* end of arguments */
74 76
75/* cpu exception frame or undefined in case of fast syscall: */ 77/* cpu exception frame or undefined in case of fast syscall: */
76#define RIP 128 78#define RIP (128)
77#define CS 136 79#define CS (136)
78#define EFLAGS 144 80#define EFLAGS (144)
79#define RSP 152 81#define RSP (152)
80#define SS 160 82#define SS (160)
81 83
82#define ARGOFFSET R11 84#define ARGOFFSET R11
83#define SWFRAME ORIG_RAX 85#define SWFRAME ORIG_RAX
@@ -111,7 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
111 .endif 113 .endif
112 .endm 114 .endm
113 115
114#define ARG_SKIP 9*8 116#define ARG_SKIP (9*8)
115 117
116 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ 118 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
117 skipr8910=0, skiprdx=0 119 skipr8910=0, skiprdx=0
@@ -169,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
169 .endif 171 .endif
170 .endm 172 .endm
171 173
172#define REST_SKIP 6*8 174#define REST_SKIP (6*8)
173 175
174 .macro SAVE_REST 176 .macro SAVE_REST
175 subq $REST_SKIP, %rsp 177 subq $REST_SKIP, %rsp
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3f76523589af..220e2ea08e80 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -152,10 +152,14 @@
152#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ 152#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
153#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ 153#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
154#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ 154#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
155#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ 155#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
156#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ 156#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
157#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ 157#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
158#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
159#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
158#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ 160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
159 163
160/* 164/*
161 * Auxiliary flags: Linux defined - For features scattered in various 165 * Auxiliary flags: Linux defined - For features scattered in various
@@ -180,6 +184,13 @@
180#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ 184#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
181#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ 185#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
182#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ 186#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
187#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
188#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
189#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
190#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
191#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
192#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
193
183 194
184/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 195/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
185#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 196#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 733f7e91e7a9..326099199318 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
89 CFI_ADJUST_CFA_OFFSET -8 89 CFI_ADJUST_CFA_OFFSET -8
90 .endm 90 .endm
91 91
92 .macro pushfq_cfi
93 pushfq
94 CFI_ADJUST_CFA_OFFSET 8
95 .endm
96
97 .macro popfq_cfi
98 popfq
99 CFI_ADJUST_CFA_OFFSET -8
100 .endm
101
92 .macro movq_cfi reg offset=0 102 .macro movq_cfi reg offset=0
93 movq %\reg, \offset(%rsp) 103 movq %\reg, \offset(%rsp)
94 CFI_REL_OFFSET \reg, \offset 104 CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
109 CFI_ADJUST_CFA_OFFSET -4 119 CFI_ADJUST_CFA_OFFSET -4
110 .endm 120 .endm
111 121
122 .macro pushfl_cfi
123 pushfl
124 CFI_ADJUST_CFA_OFFSET 4
125 .endm
126
127 .macro popfl_cfi
128 popfl
129 CFI_ADJUST_CFA_OFFSET -4
130 .endm
131
112 .macro movl_cfi reg offset=0 132 .macro movl_cfi reg offset=0
113 movl %\reg, \offset(%esp) 133 movl %\reg, \offset(%esp)
114 CFI_REL_OFFSET \reg, \offset 134 CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index ec8a52d14ab1..5be1542fbfaf 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -112,23 +112,13 @@ static inline void early_memtest(unsigned long start, unsigned long end)
112} 112}
113#endif 113#endif
114 114
115extern unsigned long end_user_pfn;
116
117extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
118extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
119extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
120#include <linux/early_res.h>
121
122extern unsigned long e820_end_of_ram_pfn(void); 115extern unsigned long e820_end_of_ram_pfn(void);
123extern unsigned long e820_end_of_low_ram_pfn(void); 116extern unsigned long e820_end_of_low_ram_pfn(void);
124extern int e820_find_active_region(const struct e820entry *ei, 117extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
125 unsigned long start_pfn, 118
126 unsigned long last_pfn, 119void memblock_x86_fill(void);
127 unsigned long *ei_startpfn, 120void memblock_find_dma_reserve(void);
128 unsigned long *ei_endpfn); 121
129extern void e820_register_active_regions(int nid, unsigned long start_pfn,
130 unsigned long end_pfn);
131extern u64 e820_hole_size(u64 start, u64 end);
132extern void finish_e820_parsing(void); 122extern void finish_e820_parsing(void);
133extern void e820_reserve_resources(void); 123extern void e820_reserve_resources(void);
134extern void e820_reserve_resources_late(void); 124extern void e820_reserve_resources_late(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8406ed7f9926..8e4a16508d4e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,7 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
90#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
91 91
92extern int add_efi_memmap; 92extern int add_efi_memmap;
93extern void efi_reserve_early(void); 93extern void efi_memblock_x86_reserve_range(void);
94extern void efi_call_phys_prelog(void); 94extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void); 95extern void efi_call_phys_epilog(void);
96 96
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 8e8ec663a98f..57650ab4a5f5 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,22 +16,11 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18 18
19BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, 19.irpc idx, "01234567"
20 smp_invalidate_interrupt) 20BUILD_INTERRUPT3(invalidate_interrupt\idx,
21BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1, 21 (INVALIDATE_TLB_VECTOR_START)+\idx,
22 smp_invalidate_interrupt)
23BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
24 smp_invalidate_interrupt)
25BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
26 smp_invalidate_interrupt)
27BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
28 smp_invalidate_interrupt)
29BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
30 smp_invalidate_interrupt)
31BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
32 smp_invalidate_interrupt)
33BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
34 smp_invalidate_interrupt) 22 smp_invalidate_interrupt)
23.endr
35#endif 24#endif
36 25
37BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) 26BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
@@ -49,8 +38,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
49BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 38BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 39BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
51 40
52#ifdef CONFIG_PERF_EVENTS 41#ifdef CONFIG_IRQ_WORK
53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) 42BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
54#endif 43#endif
55 44
56#ifdef CONFIG_X86_THERMAL_VECTOR 45#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d07b44f7d1dc..4d293dced62f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
214 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); 214 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
215 return __virt_to_fix(vaddr); 215 return __virt_to_fix(vaddr);
216} 216}
217
218/* Return an pointer with offset calculated */
219static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
220 phys_addr_t phys, pgprot_t flags)
221{
222 __set_fixmap(idx, phys, flags);
223 return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
224}
225
226#define set_fixmap_offset(idx, phys) \
227 __set_fixmap_offset(idx, phys, PAGE_KERNEL)
228
229#define set_fixmap_offset_nocache(idx, phys) \
230 __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
231
217#endif /* !__ASSEMBLY__ */ 232#endif /* !__ASSEMBLY__ */
218#endif /* _ASM_X86_FIXMAP_H */ 233#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f33fc1..43085bfc99c3 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
17#define GARTEN (1<<0) 17#define GARTEN (1<<0)
18#define DISGARTCPU (1<<4) 18#define DISGARTCPU (1<<4)
19#define DISGARTIO (1<<5) 19#define DISGARTIO (1<<5)
20#define DISTLBWALKPRB (1<<6)
20 21
21/* GART cache control register bits. */ 22/* GART cache control register bits. */
22#define INVGART (1<<0) 23#define INVGART (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
27#define AMD64_GARTAPERTUREBASE 0x94 28#define AMD64_GARTAPERTUREBASE 0x94
28#define AMD64_GARTTABLEBASE 0x98 29#define AMD64_GARTTABLEBASE 0x98
29#define AMD64_GARTCACHECTL 0x9c 30#define AMD64_GARTCACHECTL 0x9c
30#define AMD64_GARTEN (1<<0)
31 31
32#ifdef CONFIG_GART_IOMMU 32#ifdef CONFIG_GART_IOMMU
33extern int gart_iommu_aperture; 33extern int gart_iommu_aperture;
@@ -37,7 +37,7 @@ extern int gart_iommu_aperture_disabled;
37extern void early_gart_iommu_check(void); 37extern void early_gart_iommu_check(void);
38extern int gart_iommu_init(void); 38extern int gart_iommu_init(void);
39extern void __init gart_parse_options(char *); 39extern void __init gart_parse_options(char *);
40extern void gart_iommu_hole_init(void); 40extern int gart_iommu_hole_init(void);
41 41
42#else 42#else
43#define gart_iommu_aperture 0 43#define gart_iommu_aperture 0
@@ -50,13 +50,27 @@ static inline void early_gart_iommu_check(void)
50static inline void gart_parse_options(char *options) 50static inline void gart_parse_options(char *options)
51{ 51{
52} 52}
53static inline void gart_iommu_hole_init(void) 53static inline int gart_iommu_hole_init(void)
54{ 54{
55 return -ENODEV;
55} 56}
56#endif 57#endif
57 58
58extern int agp_amd64_init(void); 59extern int agp_amd64_init(void);
59 60
61static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
62{
63 u32 ctl;
64
65 /*
66 * Don't enable translation but enable GART IO and CPU accesses.
67 * Also, set DISTLBWALKPRB since GART tables memory is UC.
68 */
69 ctl = DISTLBWALKPRB | order << 1;
70
71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
72}
73
60static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) 74static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
61{ 75{
62 u32 tmp, ctl; 76 u32 tmp, ctl;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index aeab29aee617..55e4de613f0e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
14#endif 14#endif
15 unsigned int x86_platform_ipis; /* arch dependent */ 15 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs; 17 unsigned int apic_irq_work_irqs;
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
20 unsigned int irq_call_count; 20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 8caac76ac324..3bd04022fd0c 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page);
59 59
60void *kmap(struct page *page); 60void *kmap(struct page *page);
61void kunmap(struct page *page); 61void kunmap(struct page *page);
62void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); 62
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic_prot(struct page *page, pgprot_t prot);
64void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); 64void *__kmap_atomic(struct page *page);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void __kunmap_atomic(void *kvaddr);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_pfn(unsigned long pfn);
67void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 68struct page *kmap_atomic_to_page(void *ptr);
68 69
69#define flush_cache_kmaps() do { } while (0) 70#define flush_cache_kmaps() do { } while (0)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index d5905fd8ba41..0274ec5a7e62 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void x86_platform_ipi(void); 30extern void x86_platform_ipi(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void); 32extern void irq_work_interrupt(void);
33 33
34extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
35extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a73a8d5a5e69..4aa2bb3b242a 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
55extern int restore_i387_xstate_ia32(void __user *buf); 55extern int restore_i387_xstate_ia32(void __user *buf);
56#endif 56#endif
57 57
58#ifdef CONFIG_MATH_EMULATION
59extern void finit_soft_fpu(struct i387_soft_struct *soft);
60#else
61static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
62#endif
63
58#define X87_FSW_ES (1 << 7) /* Exception Summary */ 64#define X87_FSW_ES (1 << 7) /* Exception Summary */
59 65
60static __always_inline __pure bool use_xsaveopt(void) 66static __always_inline __pure bool use_xsaveopt(void)
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void)
67 return static_cpu_has(X86_FEATURE_XSAVE); 73 return static_cpu_has(X86_FEATURE_XSAVE);
68} 74}
69 75
76static __always_inline __pure bool use_fxsr(void)
77{
78 return static_cpu_has(X86_FEATURE_FXSR);
79}
80
70extern void __sanitize_i387_state(struct task_struct *); 81extern void __sanitize_i387_state(struct task_struct *);
71 82
72static inline void sanitize_i387_state(struct task_struct *tsk) 83static inline void sanitize_i387_state(struct task_struct *tsk)
@@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
77} 88}
78 89
79#ifdef CONFIG_X86_64 90#ifdef CONFIG_X86_64
80
81/* Ignore delayed exceptions from user space */
82static inline void tolerant_fwait(void)
83{
84 asm volatile("1: fwait\n"
85 "2:\n"
86 _ASM_EXTABLE(1b, 2b));
87}
88
89static inline int fxrstor_checking(struct i387_fxsave_struct *fx) 91static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
90{ 92{
91 int err; 93 int err;
92 94
95 /* See comment in fxsave() below. */
93 asm volatile("1: rex64/fxrstor (%[fx])\n\t" 96 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
94 "2:\n" 97 "2:\n"
95 ".section .fixup,\"ax\"\n" 98 ".section .fixup,\"ax\"\n"
@@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
98 ".previous\n" 101 ".previous\n"
99 _ASM_EXTABLE(1b, 3b) 102 _ASM_EXTABLE(1b, 3b)
100 : [err] "=r" (err) 103 : [err] "=r" (err)
101#if 0 /* See comment in fxsave() below. */ 104 : [fx] "R" (fx), "m" (*fx), "0" (0));
102 : [fx] "r" (fx), "m" (*fx), "0" (0));
103#else
104 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
105#endif
106 return err; 105 return err;
107} 106}
108 107
109/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
110 is pending. Clear the x87 state here by setting it to fixed
111 values. The kernel data segment can be sometimes 0 and sometimes
112 new user value. Both should be ok.
113 Use the PDA as safe address because it should be already in L1. */
114static inline void fpu_clear(struct fpu *fpu)
115{
116 struct xsave_struct *xstate = &fpu->state->xsave;
117 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
118
119 /*
120 * xsave header may indicate the init state of the FP.
121 */
122 if (use_xsave() &&
123 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
124 return;
125
126 if (unlikely(fx->swd & X87_FSW_ES))
127 asm volatile("fnclex");
128 alternative_input(ASM_NOP8 ASM_NOP2,
129 " emms\n" /* clear stack tags */
130 " fildl %%gs:0", /* load to clear state */
131 X86_FEATURE_FXSAVE_LEAK);
132}
133
134static inline void clear_fpu_state(struct task_struct *tsk)
135{
136 fpu_clear(&tsk->thread.fpu);
137}
138
139static inline int fxsave_user(struct i387_fxsave_struct __user *fx) 108static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
140{ 109{
141 int err; 110 int err;
@@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
149 if (unlikely(err)) 118 if (unlikely(err))
150 return -EFAULT; 119 return -EFAULT;
151 120
121 /* See comment in fxsave() below. */
152 asm volatile("1: rex64/fxsave (%[fx])\n\t" 122 asm volatile("1: rex64/fxsave (%[fx])\n\t"
153 "2:\n" 123 "2:\n"
154 ".section .fixup,\"ax\"\n" 124 ".section .fixup,\"ax\"\n"
@@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
157 ".previous\n" 127 ".previous\n"
158 _ASM_EXTABLE(1b, 3b) 128 _ASM_EXTABLE(1b, 3b)
159 : [err] "=r" (err), "=m" (*fx) 129 : [err] "=r" (err), "=m" (*fx)
160#if 0 /* See comment in fxsave() below. */ 130 : [fx] "R" (fx), "0" (0));
161 : [fx] "r" (fx), "0" (0));
162#else
163 : [fx] "cdaSDb" (fx), "0" (0));
164#endif
165 if (unlikely(err) && 131 if (unlikely(err) &&
166 __clear_user(fx, sizeof(struct i387_fxsave_struct))) 132 __clear_user(fx, sizeof(struct i387_fxsave_struct)))
167 err = -EFAULT; 133 err = -EFAULT;
@@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
175 uses any extended registers for addressing, a second REX prefix 141 uses any extended registers for addressing, a second REX prefix
176 will be generated (to the assembler, rex64 followed by semicolon 142 will be generated (to the assembler, rex64 followed by semicolon
177 is a separate instruction), and hence the 64-bitness is lost. */ 143 is a separate instruction), and hence the 64-bitness is lost. */
178#if 0 144
145#ifdef CONFIG_AS_FXSAVEQ
179 /* Using "fxsaveq %0" would be the ideal choice, but is only supported 146 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
180 starting with gas 2.16. */ 147 starting with gas 2.16. */
181 __asm__ __volatile__("fxsaveq %0" 148 __asm__ __volatile__("fxsaveq %0"
182 : "=m" (fpu->state->fxsave)); 149 : "=m" (fpu->state->fxsave));
183#elif 0 150#else
184 /* Using, as a workaround, the properly prefixed form below isn't 151 /* Using, as a workaround, the properly prefixed form below isn't
185 accepted by any binutils version so far released, complaining that 152 accepted by any binutils version so far released, complaining that
186 the same type of prefix is used twice if an extended register is 153 the same type of prefix is used twice if an extended register is
187 needed for addressing (fix submitted to mainline 2005-11-21). */ 154 needed for addressing (fix submitted to mainline 2005-11-21).
188 __asm__ __volatile__("rex64/fxsave %0" 155 asm volatile("rex64/fxsave %0"
189 : "=m" (fpu->state->fxsave)); 156 : "=m" (fpu->state->fxsave));
190#else 157 This, however, we can work around by forcing the compiler to select
191 /* This, however, we can work around by forcing the compiler to select
192 an addressing mode that doesn't require extended registers. */ 158 an addressing mode that doesn't require extended registers. */
193 __asm__ __volatile__("rex64/fxsave (%1)" 159 asm volatile("rex64/fxsave (%[fx])"
194 : "=m" (fpu->state->fxsave) 160 : "=m" (fpu->state->fxsave)
195 : "cdaSDb" (&fpu->state->fxsave)); 161 : [fx] "R" (&fpu->state->fxsave));
196#endif 162#endif
197} 163}
198 164
199static inline void fpu_save_init(struct fpu *fpu)
200{
201 if (use_xsave())
202 fpu_xsave(fpu);
203 else
204 fpu_fxsave(fpu);
205
206 fpu_clear(fpu);
207}
208
209static inline void __save_init_fpu(struct task_struct *tsk)
210{
211 fpu_save_init(&tsk->thread.fpu);
212 task_thread_info(tsk)->status &= ~TS_USEDFPU;
213}
214
215#else /* CONFIG_X86_32 */ 165#else /* CONFIG_X86_32 */
216 166
217#ifdef CONFIG_MATH_EMULATION
218extern void finit_soft_fpu(struct i387_soft_struct *soft);
219#else
220static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
221#endif
222
223static inline void tolerant_fwait(void)
224{
225 asm volatile("fnclex ; fwait");
226}
227
228/* perform fxrstor iff the processor has extended states, otherwise frstor */ 167/* perform fxrstor iff the processor has extended states, otherwise frstor */
229static inline int fxrstor_checking(struct i387_fxsave_struct *fx) 168static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
230{ 169{
@@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
241 return 0; 180 return 0;
242} 181}
243 182
183static inline void fpu_fxsave(struct fpu *fpu)
184{
185 asm volatile("fxsave %[fx]"
186 : [fx] "=m" (fpu->state->fxsave));
187}
188
189#endif /* CONFIG_X86_64 */
190
244/* We need a safe address that is cheap to find and that is already 191/* We need a safe address that is cheap to find and that is already
245 in L1 during context switch. The best choices are unfortunately 192 in L1 during context switch. The best choices are unfortunately
246 different for UP and SMP */ 193 different for UP and SMP */
@@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
256static inline void fpu_save_init(struct fpu *fpu) 203static inline void fpu_save_init(struct fpu *fpu)
257{ 204{
258 if (use_xsave()) { 205 if (use_xsave()) {
259 struct xsave_struct *xstate = &fpu->state->xsave;
260 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
261
262 fpu_xsave(fpu); 206 fpu_xsave(fpu);
263 207
264 /* 208 /*
265 * xsave header may indicate the init state of the FP. 209 * xsave header may indicate the init state of the FP.
266 */ 210 */
267 if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) 211 if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
268 goto end; 212 return;
269 213 } else if (use_fxsr()) {
270 if (unlikely(fx->swd & X87_FSW_ES)) 214 fpu_fxsave(fpu);
271 asm volatile("fnclex"); 215 } else {
272 216 asm volatile("fsave %[fx]; fwait"
273 /* 217 : [fx] "=m" (fpu->state->fsave));
274 * we can do a simple return here or be paranoid :) 218 return;
275 */
276 goto clear_state;
277 } 219 }
278 220
279 /* Use more nops than strictly needed in case the compiler 221 if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
280 varies code */ 222 asm volatile("fnclex");
281 alternative_input( 223
282 "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
283 "fxsave %[fx]\n"
284 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
285 X86_FEATURE_FXSR,
286 [fx] "m" (fpu->state->fxsave),
287 [fsw] "m" (fpu->state->fxsave.swd) : "memory");
288clear_state:
289 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception 224 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
290 is pending. Clear the x87 state here by setting it to fixed 225 is pending. Clear the x87 state here by setting it to fixed
291 values. safe_address is a random variable that should be in L1 */ 226 values. safe_address is a random variable that should be in L1 */
292 alternative_input( 227 alternative_input(
293 GENERIC_NOP8 GENERIC_NOP2, 228 ASM_NOP8 ASM_NOP2,
294 "emms\n\t" /* clear stack tags */ 229 "emms\n\t" /* clear stack tags */
295 "fildl %[addr]", /* set F?P to defined value */ 230 "fildl %P[addr]", /* set F?P to defined value */
296 X86_FEATURE_FXSAVE_LEAK, 231 X86_FEATURE_FXSAVE_LEAK,
297 [addr] "m" (safe_address)); 232 [addr] "m" (safe_address));
298end:
299 ;
300} 233}
301 234
302static inline void __save_init_fpu(struct task_struct *tsk) 235static inline void __save_init_fpu(struct task_struct *tsk)
@@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
305 task_thread_info(tsk)->status &= ~TS_USEDFPU; 238 task_thread_info(tsk)->status &= ~TS_USEDFPU;
306} 239}
307 240
308
309#endif /* CONFIG_X86_64 */
310
311static inline int fpu_fxrstor_checking(struct fpu *fpu) 241static inline int fpu_fxrstor_checking(struct fpu *fpu)
312{ 242{
313 return fxrstor_checking(&fpu->state->fxsave); 243 return fxrstor_checking(&fpu->state->fxsave);
@@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
344static inline void __clear_fpu(struct task_struct *tsk) 274static inline void __clear_fpu(struct task_struct *tsk)
345{ 275{
346 if (task_thread_info(tsk)->status & TS_USEDFPU) { 276 if (task_thread_info(tsk)->status & TS_USEDFPU) {
347 tolerant_fwait(); 277 /* Ignore delayed exceptions from user space */
278 asm volatile("1: fwait\n"
279 "2:\n"
280 _ASM_EXTABLE(1b, 2b));
348 task_thread_info(tsk)->status &= ~TS_USEDFPU; 281 task_thread_info(tsk)->status &= ~TS_USEDFPU;
349 stts(); 282 stts();
350 } 283 }
@@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state)
405 stts(); 338 stts();
406} 339}
407 340
408#ifdef CONFIG_X86_64
409
410static inline void save_init_fpu(struct task_struct *tsk)
411{
412 __save_init_fpu(tsk);
413 stts();
414}
415
416#define unlazy_fpu __unlazy_fpu
417#define clear_fpu __clear_fpu
418
419#else /* CONFIG_X86_32 */
420
421/* 341/*
422 * These disable preemption on their own and are safe 342 * These disable preemption on their own and are safe
423 */ 343 */
@@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk)
443 preempt_enable(); 363 preempt_enable();
444} 364}
445 365
446#endif /* CONFIG_X86_64 */
447
448/* 366/*
449 * i387 state interaction 367 * i387 state interaction
450 */ 368 */
@@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu);
508 426
509#endif /* __ASSEMBLY__ */ 427#endif /* __ASSEMBLY__ */
510 428
511#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
512#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
513
514#endif /* _ASM_X86_I387_H */ 429#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 0ad29d401565..072273082528 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -208,6 +208,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
208 208
209extern void iounmap(volatile void __iomem *addr); 209extern void iounmap(volatile void __iomem *addr);
210 210
211extern void set_iounmap_nonlazy(void);
211 212
212#ifdef __KERNEL__ 213#ifdef __KERNEL__
213 214
@@ -350,6 +351,7 @@ extern void __iomem *early_memremap(resource_size_t phys_addr,
350 unsigned long size); 351 unsigned long size);
351extern void early_iounmap(void __iomem *addr, unsigned long size); 352extern void early_iounmap(void __iomem *addr, unsigned long size);
352extern void fixup_early_ioremap(void); 353extern void fixup_early_ioremap(void);
354extern bool is_early_ioremap_ptep(pte_t *ptep);
353 355
354#ifdef CONFIG_XEN 356#ifdef CONFIG_XEN
355struct bio_vec; 357struct bio_vec;
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index c4191b3b7056..363e33eb6ec1 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -27,10 +27,10 @@
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
29void __iomem * 29void __iomem *
30iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 30iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
31 31
32void 32void
33iounmap_atomic(void __iomem *kvaddr, enum km_type type); 33iounmap_atomic(void __iomem *kvaddr);
34 34
35int 35int
36iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); 36iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
new file mode 100644
index 000000000000..f229b13a5f30
--- /dev/null
+++ b/arch/x86/include/asm/iommu_table.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_IOMMU_TABLE_H
2#define _ASM_X86_IOMMU_TABLE_H
3
4#include <asm/swiotlb.h>
5
6/*
7 * History lesson:
8 * The execution chain of IOMMUs in 2.6.36 looks as so:
9 *
10 * [xen-swiotlb]
11 * |
12 * +----[swiotlb *]--+
13 * / | \
14 * / | \
15 * [GART] [Calgary] [Intel VT-d]
16 * /
17 * /
18 * [AMD-Vi]
19 *
20 * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
21 * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
22 * Also it would surreptitiously initialize set the swiotlb=1 if there were
23 * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb
24 * flag would be turned off by all IOMMUs except the Calgary one.
25 *
26 * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
27 * to be built by defining who we depend on.
28 *
29 * And all that needs to be done is to use one of the macros in the IOMMU
30 * and the pci-dma.c will take care of the rest.
31 */
32
33struct iommu_table_entry {
34 initcall_t detect;
35 initcall_t depend;
36 void (*early_init)(void); /* No memory allocate available. */
37 void (*late_init)(void); /* Yes, can allocate memory. */
38#define IOMMU_FINISH_IF_DETECTED (1<<0)
39#define IOMMU_DETECTED (1<<1)
40 int flags;
41};
42/*
43 * Macro fills out an entry in the .iommu_table that is equivalent
44 * to the fields that 'struct iommu_table_entry' has. The entries
45 * that are put in the .iommu_table section are not put in any order
46 * hence during boot-time we will have to resort them based on
47 * dependency. */
48
49
50#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\
51 static const struct iommu_table_entry const \
52 __iommu_entry_##_detect __used \
53 __attribute__ ((unused, __section__(".iommu_table"), \
54 aligned((sizeof(void *))))) \
55 = {_detect, _depend, _early_init, _late_init, \
56 _finish ? IOMMU_FINISH_IF_DETECTED : 0}
57/*
58 * The simplest IOMMU definition. Provide the detection routine
59 * and it will be run after the SWIOTLB and the other IOMMUs
60 * that utilize this macro. If the IOMMU is detected (ie, the
61 * detect routine returns a positive value), the other IOMMUs
62 * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer
63 * to stop detecting the other IOMMUs after yours has been detected.
64 */
65#define IOMMU_INIT_POST(_detect) \
66 __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0)
67
68#define IOMMU_INIT_POST_FINISH(detect) \
69 __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1)
70
71/*
72 * A more sophisticated version of IOMMU_INIT. This variant requires:
73 * a). A detection routine function.
74 * b). The name of the detection routine we depend on to get called
75 * before us.
76 * c). The init routine which gets called if the detection routine
77 * returns a positive value from the pci_iommu_alloc. This means
78 * no presence of a memory allocator.
79 * d). Similar to the 'init', except that this gets called from pci_iommu_init
80 * where we do have a memory allocator.
81 *
82 * The standard vs the _FINISH differs in that the _FINISH variant will
83 * continue detecting other IOMMUs in the call list after the
84 * the detection routine returns a positive number. The _FINISH will
85 * stop the execution chain. Both will still call the 'init' and
86 * 'late_init' functions if they are set.
87 */
88#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
89 __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
90
91#define IOMMU_INIT(_detect, _depend, _init, _late_init) \
92 __IOMMU_INIT(_detect, _depend, _init, _late_init, 0)
93
94void sort_iommu_table(struct iommu_table_entry *start,
95 struct iommu_table_entry *finish);
96
97void check_iommu_entries(struct iommu_table_entry *start,
98 struct iommu_table_entry *finish);
99
100#endif /* _ASM_X86_IOMMU_TABLE_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 5458380b6ef8..13b0ebaa512f 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -19,18 +19,14 @@ static inline int irq_canonicalize(int irq)
19# define ARCH_HAS_NMI_WATCHDOG 19# define ARCH_HAS_NMI_WATCHDOG
20#endif 20#endif
21 21
22#ifdef CONFIG_4KSTACKS 22#ifdef CONFIG_X86_32
23 extern void irq_ctx_init(int cpu); 23extern void irq_ctx_init(int cpu);
24 extern void irq_ctx_exit(int cpu);
25# define __ARCH_HAS_DO_SOFTIRQ
26#else 24#else
27# define irq_ctx_init(cpu) do { } while (0) 25# define irq_ctx_init(cpu) do { } while (0)
28# define irq_ctx_exit(cpu) do { } while (0)
29# ifdef CONFIG_X86_64
30# define __ARCH_HAS_DO_SOFTIRQ
31# endif
32#endif 26#endif
33 27
28#define __ARCH_HAS_DO_SOFTIRQ
29
34#ifdef CONFIG_HOTPLUG_CPU 30#ifdef CONFIG_HOTPLUG_CPU
35#include <linux/cpumask.h> 31#include <linux/cpumask.h>
36extern void fixup_irqs(void); 32extern void fixup_irqs(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index e2ca30092557..6af0894dafb4 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -114,9 +114,9 @@
114#define X86_PLATFORM_IPI_VECTOR 0xed 114#define X86_PLATFORM_IPI_VECTOR 0xed
115 115
116/* 116/*
117 * Performance monitoring pending work vector: 117 * IRQ work vector:
118 */ 118 */
119#define LOCAL_PENDING_VECTOR 0xec 119#define IRQ_WORK_VECTOR 0xec
120 120
121#define UV_BAU_MESSAGE 0xea 121#define UV_BAU_MESSAGE 0xea
122 122
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 9e2b952f810a..5745ce8bf108 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -61,22 +61,22 @@ static inline void native_halt(void)
61#else 61#else
62#ifndef __ASSEMBLY__ 62#ifndef __ASSEMBLY__
63 63
64static inline unsigned long __raw_local_save_flags(void) 64static inline unsigned long arch_local_save_flags(void)
65{ 65{
66 return native_save_fl(); 66 return native_save_fl();
67} 67}
68 68
69static inline void raw_local_irq_restore(unsigned long flags) 69static inline void arch_local_irq_restore(unsigned long flags)
70{ 70{
71 native_restore_fl(flags); 71 native_restore_fl(flags);
72} 72}
73 73
74static inline void raw_local_irq_disable(void) 74static inline void arch_local_irq_disable(void)
75{ 75{
76 native_irq_disable(); 76 native_irq_disable();
77} 77}
78 78
79static inline void raw_local_irq_enable(void) 79static inline void arch_local_irq_enable(void)
80{ 80{
81 native_irq_enable(); 81 native_irq_enable();
82} 82}
@@ -85,7 +85,7 @@ static inline void raw_local_irq_enable(void)
85 * Used in the idle loop; sti takes one instruction cycle 85 * Used in the idle loop; sti takes one instruction cycle
86 * to complete: 86 * to complete:
87 */ 87 */
88static inline void raw_safe_halt(void) 88static inline void arch_safe_halt(void)
89{ 89{
90 native_safe_halt(); 90 native_safe_halt();
91} 91}
@@ -102,12 +102,10 @@ static inline void halt(void)
102/* 102/*
103 * For spinlocks, etc: 103 * For spinlocks, etc:
104 */ 104 */
105static inline unsigned long __raw_local_irq_save(void) 105static inline unsigned long arch_local_irq_save(void)
106{ 106{
107 unsigned long flags = __raw_local_save_flags(); 107 unsigned long flags = arch_local_save_flags();
108 108 arch_local_irq_disable();
109 raw_local_irq_disable();
110
111 return flags; 109 return flags;
112} 110}
113#else 111#else
@@ -153,22 +151,16 @@ static inline unsigned long __raw_local_irq_save(void)
153#endif /* CONFIG_PARAVIRT */ 151#endif /* CONFIG_PARAVIRT */
154 152
155#ifndef __ASSEMBLY__ 153#ifndef __ASSEMBLY__
156#define raw_local_save_flags(flags) \ 154static inline int arch_irqs_disabled_flags(unsigned long flags)
157 do { (flags) = __raw_local_save_flags(); } while (0)
158
159#define raw_local_irq_save(flags) \
160 do { (flags) = __raw_local_irq_save(); } while (0)
161
162static inline int raw_irqs_disabled_flags(unsigned long flags)
163{ 155{
164 return !(flags & X86_EFLAGS_IF); 156 return !(flags & X86_EFLAGS_IF);
165} 157}
166 158
167static inline int raw_irqs_disabled(void) 159static inline int arch_irqs_disabled(void)
168{ 160{
169 unsigned long flags = __raw_local_save_flags(); 161 unsigned long flags = arch_local_save_flags();
170 162
171 return raw_irqs_disabled_flags(flags); 163 return arch_irqs_disabled_flags(flags);
172} 164}
173 165
174#else 166#else
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
new file mode 100644
index 000000000000..f52d42e80585
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,37 @@
1#ifndef _ASM_X86_JUMP_LABEL_H
2#define _ASM_X86_JUMP_LABEL_H
3
4#ifdef __KERNEL__
5
6#include <linux/types.h>
7#include <asm/nops.h>
8
9#define JUMP_LABEL_NOP_SIZE 5
10
11# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
12
13# define JUMP_LABEL(key, label) \
14 do { \
15 asm goto("1:" \
16 JUMP_LABEL_INITIAL_NOP \
17 ".pushsection __jump_table, \"a\" \n\t"\
18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
19 ".popsection \n\t" \
20 : : "i" (key) : : label); \
21 } while (0)
22
23#endif /* __KERNEL__ */
24
25#ifdef CONFIG_X86_64
26typedef u64 jump_label_t;
27#else
28typedef u32 jump_label_t;
29#endif
30
31struct jump_entry {
32 jump_label_t code;
33 jump_label_t target;
34 jump_label_t key;
35};
36
37#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1f99ecfc48e1..b36c6b3fe144 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -139,6 +139,7 @@ struct x86_emulate_ops {
139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); 140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
142 void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 143 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 144 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
144 int (*cpl)(struct kvm_vcpu *vcpu); 145 int (*cpl)(struct kvm_vcpu *vcpu);
@@ -156,7 +157,10 @@ struct operand {
156 unsigned long orig_val; 157 unsigned long orig_val;
157 u64 orig_val64; 158 u64 orig_val64;
158 }; 159 };
159 unsigned long *ptr; 160 union {
161 unsigned long *reg;
162 unsigned long mem;
163 } addr;
160 union { 164 union {
161 unsigned long val; 165 unsigned long val;
162 u64 val64; 166 u64 val64;
@@ -190,6 +194,7 @@ struct decode_cache {
190 bool has_seg_override; 194 bool has_seg_override;
191 u8 seg_override; 195 u8 seg_override;
192 unsigned int d; 196 unsigned int d;
197 int (*execute)(struct x86_emulate_ctxt *ctxt);
193 unsigned long regs[NR_VCPU_REGS]; 198 unsigned long regs[NR_VCPU_REGS];
194 unsigned long eip; 199 unsigned long eip;
195 /* modrm */ 200 /* modrm */
@@ -197,17 +202,16 @@ struct decode_cache {
197 u8 modrm_mod; 202 u8 modrm_mod;
198 u8 modrm_reg; 203 u8 modrm_reg;
199 u8 modrm_rm; 204 u8 modrm_rm;
200 u8 use_modrm_ea; 205 u8 modrm_seg;
201 bool rip_relative; 206 bool rip_relative;
202 unsigned long modrm_ea;
203 void *modrm_ptr;
204 unsigned long modrm_val;
205 struct fetch_cache fetch; 207 struct fetch_cache fetch;
206 struct read_cache io_read; 208 struct read_cache io_read;
207 struct read_cache mem_read; 209 struct read_cache mem_read;
208}; 210};
209 211
210struct x86_emulate_ctxt { 212struct x86_emulate_ctxt {
213 struct x86_emulate_ops *ops;
214
211 /* Register state before/after emulation. */ 215 /* Register state before/after emulation. */
212 struct kvm_vcpu *vcpu; 216 struct kvm_vcpu *vcpu;
213 217
@@ -220,12 +224,11 @@ struct x86_emulate_ctxt {
220 /* interruptibility state, as a result of execution of STI or MOV SS */ 224 /* interruptibility state, as a result of execution of STI or MOV SS */
221 int interruptibility; 225 int interruptibility;
222 226
223 bool restart; /* restart string instruction after writeback */ 227 bool perm_ok; /* do not check permissions if true */
224 228
225 int exception; /* exception that happens during emulation or -1 */ 229 int exception; /* exception that happens during emulation or -1 */
226 u32 error_code; /* error code for exception */ 230 u32 error_code; /* error code for exception */
227 bool error_code_valid; 231 bool error_code_valid;
228 unsigned long cr2; /* faulted address in case of #PF */
229 232
230 /* decode cache */ 233 /* decode cache */
231 struct decode_cache decode; 234 struct decode_cache decode;
@@ -249,13 +252,14 @@ struct x86_emulate_ctxt {
249#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 252#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
250#endif 253#endif
251 254
252int x86_decode_insn(struct x86_emulate_ctxt *ctxt, 255int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
253 struct x86_emulate_ops *ops); 256#define EMULATION_FAILED -1
254int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, 257#define EMULATION_OK 0
255 struct x86_emulate_ops *ops); 258#define EMULATION_RESTART 1
259int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
256int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 260int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
257 struct x86_emulate_ops *ops,
258 u16 tss_selector, int reason, 261 u16 tss_selector, int reason,
259 bool has_error_code, u32 error_code); 262 bool has_error_code, u32 error_code);
260 263int emulate_int_real(struct x86_emulate_ctxt *ctxt,
264 struct x86_emulate_ops *ops, int irq);
261#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 265#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 502e53f999cf..9e6fe391094e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -236,10 +236,14 @@ struct kvm_pio_request {
236 */ 236 */
237struct kvm_mmu { 237struct kvm_mmu {
238 void (*new_cr3)(struct kvm_vcpu *vcpu); 238 void (*new_cr3)(struct kvm_vcpu *vcpu);
239 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
240 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
239 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 241 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
242 void (*inject_page_fault)(struct kvm_vcpu *vcpu);
240 void (*free)(struct kvm_vcpu *vcpu); 243 void (*free)(struct kvm_vcpu *vcpu);
241 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 244 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
242 u32 *error); 245 u32 *error);
246 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
243 void (*prefetch_page)(struct kvm_vcpu *vcpu, 247 void (*prefetch_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *page); 248 struct kvm_mmu_page *page);
245 int (*sync_page)(struct kvm_vcpu *vcpu, 249 int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -249,13 +253,18 @@ struct kvm_mmu {
249 int root_level; 253 int root_level;
250 int shadow_root_level; 254 int shadow_root_level;
251 union kvm_mmu_page_role base_role; 255 union kvm_mmu_page_role base_role;
256 bool direct_map;
252 257
253 u64 *pae_root; 258 u64 *pae_root;
259 u64 *lm_root;
254 u64 rsvd_bits_mask[2][4]; 260 u64 rsvd_bits_mask[2][4];
261
262 bool nx;
263
264 u64 pdptrs[4]; /* pae */
255}; 265};
256 266
257struct kvm_vcpu_arch { 267struct kvm_vcpu_arch {
258 u64 host_tsc;
259 /* 268 /*
260 * rip and regs accesses must go through 269 * rip and regs accesses must go through
261 * kvm_{register,rip}_{read,write} functions. 270 * kvm_{register,rip}_{read,write} functions.
@@ -272,7 +281,6 @@ struct kvm_vcpu_arch {
272 unsigned long cr4_guest_owned_bits; 281 unsigned long cr4_guest_owned_bits;
273 unsigned long cr8; 282 unsigned long cr8;
274 u32 hflags; 283 u32 hflags;
275 u64 pdptrs[4]; /* pae */
276 u64 efer; 284 u64 efer;
277 u64 apic_base; 285 u64 apic_base;
278 struct kvm_lapic *apic; /* kernel irqchip context */ 286 struct kvm_lapic *apic; /* kernel irqchip context */
@@ -282,7 +290,41 @@ struct kvm_vcpu_arch {
282 u64 ia32_misc_enable_msr; 290 u64 ia32_misc_enable_msr;
283 bool tpr_access_reporting; 291 bool tpr_access_reporting;
284 292
293 /*
294 * Paging state of the vcpu
295 *
296 * If the vcpu runs in guest mode with two level paging this still saves
297 * the paging mode of the l1 guest. This context is always used to
298 * handle faults.
299 */
285 struct kvm_mmu mmu; 300 struct kvm_mmu mmu;
301
302 /*
303 * Paging state of an L2 guest (used for nested npt)
304 *
305 * This context will save all necessary information to walk page tables
306 * of the an L2 guest. This context is only initialized for page table
307 * walking and not for faulting since we never handle l2 page faults on
308 * the host.
309 */
310 struct kvm_mmu nested_mmu;
311
312 /*
313 * Pointer to the mmu context currently used for
314 * gva_to_gpa translations.
315 */
316 struct kvm_mmu *walk_mmu;
317
318 /*
319 * This struct is filled with the necessary information to propagate a
320 * page fault into the guest
321 */
322 struct {
323 u64 address;
324 unsigned error_code;
325 bool nested;
326 } fault;
327
286 /* only needed in kvm_pv_mmu_op() path, but it's hot so 328 /* only needed in kvm_pv_mmu_op() path, but it's hot so
287 * put it here to avoid allocation */ 329 * put it here to avoid allocation */
288 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 330 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -336,9 +378,15 @@ struct kvm_vcpu_arch {
336 378
337 gpa_t time; 379 gpa_t time;
338 struct pvclock_vcpu_time_info hv_clock; 380 struct pvclock_vcpu_time_info hv_clock;
339 unsigned int hv_clock_tsc_khz; 381 unsigned int hw_tsc_khz;
340 unsigned int time_offset; 382 unsigned int time_offset;
341 struct page *time_page; 383 struct page *time_page;
384 u64 last_host_tsc;
385 u64 last_guest_tsc;
386 u64 last_kernel_ns;
387 u64 last_tsc_nsec;
388 u64 last_tsc_write;
389 bool tsc_catchup;
342 390
343 bool nmi_pending; 391 bool nmi_pending;
344 bool nmi_injected; 392 bool nmi_injected;
@@ -367,9 +415,9 @@ struct kvm_vcpu_arch {
367}; 415};
368 416
369struct kvm_arch { 417struct kvm_arch {
370 unsigned int n_free_mmu_pages; 418 unsigned int n_used_mmu_pages;
371 unsigned int n_requested_mmu_pages; 419 unsigned int n_requested_mmu_pages;
372 unsigned int n_alloc_mmu_pages; 420 unsigned int n_max_mmu_pages;
373 atomic_t invlpg_counter; 421 atomic_t invlpg_counter;
374 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 422 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
375 /* 423 /*
@@ -394,8 +442,14 @@ struct kvm_arch {
394 gpa_t ept_identity_map_addr; 442 gpa_t ept_identity_map_addr;
395 443
396 unsigned long irq_sources_bitmap; 444 unsigned long irq_sources_bitmap;
397 u64 vm_init_tsc;
398 s64 kvmclock_offset; 445 s64 kvmclock_offset;
446 spinlock_t tsc_write_lock;
447 u64 last_tsc_nsec;
448 u64 last_tsc_offset;
449 u64 last_tsc_write;
450 u32 virtual_tsc_khz;
451 u32 virtual_tsc_mult;
452 s8 virtual_tsc_shift;
399 453
400 struct kvm_xen_hvm_config xen_hvm_config; 454 struct kvm_xen_hvm_config xen_hvm_config;
401 455
@@ -505,6 +559,7 @@ struct kvm_x86_ops {
505 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 559 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
506 bool has_error_code, u32 error_code, 560 bool has_error_code, u32 error_code,
507 bool reinject); 561 bool reinject);
562 void (*cancel_injection)(struct kvm_vcpu *vcpu);
508 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 563 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
509 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 564 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
510 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 565 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -517,11 +572,16 @@ struct kvm_x86_ops {
517 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 572 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
518 int (*get_lpage_level)(void); 573 int (*get_lpage_level)(void);
519 bool (*rdtscp_supported)(void); 574 bool (*rdtscp_supported)(void);
575 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
576
577 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
520 578
521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 579 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
522 580
523 bool (*has_wbinvd_exit)(void); 581 bool (*has_wbinvd_exit)(void);
524 582
583 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
584
525 const struct trace_print_flags *exit_reasons_str; 585 const struct trace_print_flags *exit_reasons_str;
526}; 586};
527 587
@@ -544,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
544unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 604unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
545void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 605void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
546 606
547int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 607int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
548 608
549int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 609int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
550 const void *val, int bytes); 610 const void *val, int bytes);
@@ -608,8 +668,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
608void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 668void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
609void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); 669void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
610void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 670void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
611void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 671void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
612 u32 error_code); 672int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
673 gfn_t gfn, void *data, int offset, int len,
674 u32 access);
675void kvm_propagate_fault(struct kvm_vcpu *vcpu);
613bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 676bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
614 677
615int kvm_pic_set_irq(void *opaque, int irq, int level); 678int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -652,20 +715,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
652 return (struct kvm_mmu_page *)page_private(page); 715 return (struct kvm_mmu_page *)page_private(page);
653} 716}
654 717
655static inline u16 kvm_read_fs(void)
656{
657 u16 seg;
658 asm("mov %%fs, %0" : "=g"(seg));
659 return seg;
660}
661
662static inline u16 kvm_read_gs(void)
663{
664 u16 seg;
665 asm("mov %%gs, %0" : "=g"(seg));
666 return seg;
667}
668
669static inline u16 kvm_read_ldt(void) 718static inline u16 kvm_read_ldt(void)
670{ 719{
671 u16 ldt; 720 u16 ldt;
@@ -673,16 +722,6 @@ static inline u16 kvm_read_ldt(void)
673 return ldt; 722 return ldt;
674} 723}
675 724
676static inline void kvm_load_fs(u16 sel)
677{
678 asm("mov %0, %%fs" : : "rm"(sel));
679}
680
681static inline void kvm_load_gs(u16 sel)
682{
683 asm("mov %0, %%gs" : : "rm"(sel));
684}
685
686static inline void kvm_load_ldt(u16 sel) 725static inline void kvm_load_ldt(u16 sel)
687{ 726{
688 asm("lldt %0" : : "rm"(sel)); 727 asm("lldt %0" : : "rm"(sel));
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e9a8e8..7b562b6184bc 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void)
158 return cpuid_eax(KVM_CPUID_FEATURES); 158 return cpuid_eax(KVM_CPUID_FEATURES);
159} 159}
160 160
161#ifdef CONFIG_KVM_GUEST
162void __init kvm_guest_init(void);
163#else
164#define kvm_guest_init() do { } while (0)
161#endif 165#endif
162 166
167#endif /* __KERNEL__ */
168
163#endif /* _ASM_X86_KVM_PARA_H */ 169#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
new file mode 100644
index 000000000000..19ae14ba6978
--- /dev/null
+++ b/arch/x86/include/asm/memblock.h
@@ -0,0 +1,23 @@
1#ifndef _X86_MEMBLOCK_H
2#define _X86_MEMBLOCK_H
3
4#define ARCH_DISCARD_MEMBLOCK
5
6u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
7void memblock_x86_to_bootmem(u64 start, u64 end);
8
9void memblock_x86_reserve_range(u64 start, u64 end, char *name);
10void memblock_x86_free_range(u64 start, u64 end);
11struct range;
12int __get_free_all_memory_range(struct range **range, int nodeid,
13 unsigned long start_pfn, unsigned long end_pfn);
14int get_free_all_memory_range(struct range **rangep, int nodeid);
15
16void memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
17 unsigned long last_pfn);
18u64 memblock_x86_hole_size(u64 start, u64 end);
19u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
20u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
21u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
22
23#endif
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 3e2ce58a31a3..67763c5d8b4e 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -60,12 +60,7 @@
60#endif 60#endif
61 61
62#ifdef CONFIG_X86_32 62#ifdef CONFIG_X86_32
63# ifdef CONFIG_4KSTACKS 63# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
64# define MODULE_STACKSIZE "4KSTACKS "
65# else
66# define MODULE_STACKSIZE ""
67# endif
68# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
69#endif 64#endif
70 65
71#endif /* _ASM_X86_MODULE_H */ 66#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 16350740edf6..4a711a684b17 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,6 +10,9 @@
10 */ 10 */
11#ifndef _ASM_X86_MRST_H 11#ifndef _ASM_X86_MRST_H
12#define _ASM_X86_MRST_H 12#define _ASM_X86_MRST_H
13
14#include <linux/sfi.h>
15
13extern int pci_mrst_init(void); 16extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table); 17int __init sfi_parse_mrtc(struct sfi_table_header *table);
15 18
@@ -26,7 +29,7 @@ enum mrst_cpu_type {
26}; 29};
27 30
28extern enum mrst_cpu_type __mrst_cpu_chip; 31extern enum mrst_cpu_type __mrst_cpu_chip;
29static enum mrst_cpu_type mrst_identify_cpu(void) 32static inline enum mrst_cpu_type mrst_identify_cpu(void)
30{ 33{
31 return __mrst_cpu_chip; 34 return __mrst_cpu_chip;
32} 35}
@@ -42,4 +45,9 @@ extern enum mrst_timer_options mrst_timer_options;
42#define SFI_MTMR_MAX_NUM 8 45#define SFI_MTMR_MAX_NUM 8
43#define SFI_MRTC_MAX 8 46#define SFI_MRTC_MAX 8
44 47
48extern struct console early_mrst_console;
49extern void mrst_early_console_init(void);
50
51extern struct console early_hsu_console;
52extern void hsu_early_console_init(void);
45#endif /* _ASM_X86_MRST_H */ 53#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 986f7790fdb2..3ea3dc487047 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,6 +121,7 @@
121#define MSR_AMD64_IBSDCLINAD 0xc0011038 121#define MSR_AMD64_IBSDCLINAD 0xc0011038
122#define MSR_AMD64_IBSDCPHYSAD 0xc0011039 122#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
123#define MSR_AMD64_IBSCTL 0xc001103a 123#define MSR_AMD64_IBSCTL 0xc001103a
124#define MSR_AMD64_IBSBRTARGET 0xc001103b
124 125
125/* Fam 10h MSRs */ 126/* Fam 10h MSRs */
126#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 127#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
@@ -198,6 +199,7 @@
198#define MSR_IA32_TSC 0x00000010 199#define MSR_IA32_TSC 0x00000010
199#define MSR_IA32_PLATFORM_ID 0x00000017 200#define MSR_IA32_PLATFORM_ID 0x00000017
200#define MSR_IA32_EBL_CR_POWERON 0x0000002a 201#define MSR_IA32_EBL_CR_POWERON 0x0000002a
202#define MSR_EBC_FREQUENCY_ID 0x0000002c
201#define MSR_IA32_FEATURE_CONTROL 0x0000003a 203#define MSR_IA32_FEATURE_CONTROL 0x0000003a
202 204
203#define FEATURE_CONTROL_LOCKED (1<<0) 205#define FEATURE_CONTROL_LOCKED (1<<0)
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 000000000000..bcdff997668c
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
1#ifndef _ASM_X86_MWAIT_H
2#define _ASM_X86_MWAIT_H
3
4#define MWAIT_SUBSTATE_MASK 0xf
5#define MWAIT_CSTATE_MASK 0xf
6#define MWAIT_SUBSTATE_SIZE 4
7#define MWAIT_MAX_NUM_CSTATES 8
8
9#define CPUID_MWAIT_LEAF 5
10#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
11#define CPUID5_ECX_INTERRUPT_BREAK 0x2
12
13#define MWAIT_ECX_INTERRUPT_BREAK 0x1
14
15#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 101229b0d8ed..42a978c0c1b3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits);
89/* EC commands */ 89/* EC commands */
90 90
91#define EC_FIRMWARE_REV 0x08 91#define EC_FIRMWARE_REV 0x08
92#define EC_WLAN_ENTER_RESET 0x35
93#define EC_WLAN_LEAVE_RESET 0x25
92 94
93/* SCI source values */ 95/* SCI source values */
94 96
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 08fde475cb3b..2a8478140bb3 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void);
21/* install OFW's pde permanently into the kernel's pgtable */ 21/* install OFW's pde permanently into the kernel's pgtable */
22extern void setup_olpc_ofw_pgd(void); 22extern void setup_olpc_ofw_pgd(void);
23 23
24/* check if OFW was detected during boot */
25extern bool olpc_ofw_present(void);
26
24#else /* !CONFIG_OLPC_OPENFIRMWARE */ 27#else /* !CONFIG_OLPC_OPENFIRMWARE */
25 28
26static inline void olpc_ofw_detect(void) { } 29static inline void olpc_ofw_detect(void) { }
27static inline void setup_olpc_ofw_pgd(void) { } 30static inline void setup_olpc_ofw_pgd(void) { }
31static inline bool olpc_ofw_present(void) { return false; }
28 32
29#endif /* !CONFIG_OLPC_OPENFIRMWARE */ 33#endif /* !CONFIG_OLPC_OPENFIRMWARE */
30 34
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 6f1b7331313f..ade619ff9e2a 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -15,11 +15,7 @@
15 */ 15 */
16#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) 16#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
17 17
18#ifdef CONFIG_4KSTACKS
19#define THREAD_ORDER 0
20#else
21#define THREAD_ORDER 1 18#define THREAD_ORDER 1
22#endif
23#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) 19#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
24 20
25#define STACKFAULT_STACK 0 21#define STACKFAULT_STACK 0
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c7254..1df66211fd1b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) 8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1)) 9#define PAGE_MASK (~(PAGE_SIZE-1))
10 10
11#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) 11#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
12#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 12#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
13 13
14/* Cast PAGE_MASK to a signed type so that it is sign-extended if 14/* Cast PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e5..18e3b8a8709f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -105,7 +105,7 @@ static inline void write_cr8(unsigned long x)
105} 105}
106#endif 106#endif
107 107
108static inline void raw_safe_halt(void) 108static inline void arch_safe_halt(void)
109{ 109{
110 PVOP_VCALL0(pv_irq_ops.safe_halt); 110 PVOP_VCALL0(pv_irq_ops.safe_halt);
111} 111}
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
416 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); 416 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
417} 417}
418 418
419static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
420 unsigned long start, unsigned long count)
421{
422 PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
423}
424static inline void paravirt_release_pmd(unsigned long pfn) 419static inline void paravirt_release_pmd(unsigned long pfn)
425{ 420{
426 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); 421 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
@@ -829,32 +824,32 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
829#define __PV_IS_CALLEE_SAVE(func) \ 824#define __PV_IS_CALLEE_SAVE(func) \
830 ((struct paravirt_callee_save) { func }) 825 ((struct paravirt_callee_save) { func })
831 826
832static inline unsigned long __raw_local_save_flags(void) 827static inline unsigned long arch_local_save_flags(void)
833{ 828{
834 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); 829 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
835} 830}
836 831
837static inline void raw_local_irq_restore(unsigned long f) 832static inline void arch_local_irq_restore(unsigned long f)
838{ 833{
839 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); 834 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
840} 835}
841 836
842static inline void raw_local_irq_disable(void) 837static inline void arch_local_irq_disable(void)
843{ 838{
844 PVOP_VCALLEE0(pv_irq_ops.irq_disable); 839 PVOP_VCALLEE0(pv_irq_ops.irq_disable);
845} 840}
846 841
847static inline void raw_local_irq_enable(void) 842static inline void arch_local_irq_enable(void)
848{ 843{
849 PVOP_VCALLEE0(pv_irq_ops.irq_enable); 844 PVOP_VCALLEE0(pv_irq_ops.irq_enable);
850} 845}
851 846
852static inline unsigned long __raw_local_irq_save(void) 847static inline unsigned long arch_local_irq_save(void)
853{ 848{
854 unsigned long f; 849 unsigned long f;
855 850
856 f = __raw_local_save_flags(); 851 f = arch_local_save_flags();
857 raw_local_irq_disable(); 852 arch_local_irq_disable();
858 return f; 853 return f;
859} 854}
860 855
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef5532341..b82bac975250 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
255 */ 255 */
256 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); 256 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
257 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); 257 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
258 void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
259 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); 258 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
260 void (*release_pte)(unsigned long pfn); 259 void (*release_pte)(unsigned long pfn);
261 void (*release_pmd)(unsigned long pfn); 260 void (*release_pmd)(unsigned long pfn);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index cd28f9ad910d..f899e01a8ac9 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -47,6 +47,20 @@
47#ifdef CONFIG_SMP 47#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
49#define __my_cpu_offset percpu_read(this_cpu_off) 49#define __my_cpu_offset percpu_read(this_cpu_off)
50
51/*
52 * Compared to the generic __my_cpu_offset version, the following
53 * saves one instruction and avoids clobbering a temp register.
54 */
55#define __this_cpu_ptr(ptr) \
56({ \
57 unsigned long tcp_ptr__; \
58 __verify_pcpu_ptr(ptr); \
59 asm volatile("add " __percpu_arg(1) ", %0" \
60 : "=r" (tcp_ptr__) \
61 : "m" (this_cpu_off), "0" (ptr)); \
62 (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
63})
50#else 64#else
51#define __percpu_arg(x) "%P" #x 65#define __percpu_arg(x) "%P" #x
52#endif 66#endif
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 6e742cc4251b..550e26b1dbb3 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -111,17 +111,18 @@ union cpuid10_edx {
111#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 111#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
112 112
113/* IbsFetchCtl bits/masks */ 113/* IbsFetchCtl bits/masks */
114#define IBS_FETCH_RAND_EN (1ULL<<57) 114#define IBS_FETCH_RAND_EN (1ULL<<57)
115#define IBS_FETCH_VAL (1ULL<<49) 115#define IBS_FETCH_VAL (1ULL<<49)
116#define IBS_FETCH_ENABLE (1ULL<<48) 116#define IBS_FETCH_ENABLE (1ULL<<48)
117#define IBS_FETCH_CNT 0xFFFF0000ULL 117#define IBS_FETCH_CNT 0xFFFF0000ULL
118#define IBS_FETCH_MAX_CNT 0x0000FFFFULL 118#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
119 119
120/* IbsOpCtl bits */ 120/* IbsOpCtl bits */
121#define IBS_OP_CNT_CTL (1ULL<<19) 121#define IBS_OP_CNT_CTL (1ULL<<19)
122#define IBS_OP_VAL (1ULL<<18) 122#define IBS_OP_VAL (1ULL<<18)
123#define IBS_OP_ENABLE (1ULL<<17) 123#define IBS_OP_ENABLE (1ULL<<17)
124#define IBS_OP_MAX_CNT 0x0000FFFFULL 124#define IBS_OP_MAX_CNT 0x0000FFFFULL
125#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
125 126
126#ifdef CONFIG_PERF_EVENTS 127#ifdef CONFIG_PERF_EVENTS
127extern void init_hw_perf_events(void); 128extern void init_hw_perf_events(void);
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index def500776b16..a70cd216be5d 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -36,19 +36,6 @@
36#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) 36#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
37#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) 37#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
38 38
39/* Non HT mask */
40#define P4_ESCR_MASK \
41 (P4_ESCR_EVENT_MASK | \
42 P4_ESCR_EVENTMASK_MASK | \
43 P4_ESCR_TAG_MASK | \
44 P4_ESCR_TAG_ENABLE | \
45 P4_ESCR_T0_OS | \
46 P4_ESCR_T0_USR)
47
48/* HT mask */
49#define P4_ESCR_MASK_HT \
50 (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
51
52#define P4_CCCR_OVF 0x80000000U 39#define P4_CCCR_OVF 0x80000000U
53#define P4_CCCR_CASCADE 0x40000000U 40#define P4_CCCR_CASCADE 0x40000000U
54#define P4_CCCR_OVF_PMI_T0 0x04000000U 41#define P4_CCCR_OVF_PMI_T0 0x04000000U
@@ -70,23 +57,6 @@
70#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) 57#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
71#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) 58#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
72 59
73/* Non HT mask */
74#define P4_CCCR_MASK \
75 (P4_CCCR_OVF | \
76 P4_CCCR_CASCADE | \
77 P4_CCCR_OVF_PMI_T0 | \
78 P4_CCCR_FORCE_OVF | \
79 P4_CCCR_EDGE | \
80 P4_CCCR_THRESHOLD_MASK | \
81 P4_CCCR_COMPLEMENT | \
82 P4_CCCR_COMPARE | \
83 P4_CCCR_ESCR_SELECT_MASK | \
84 P4_CCCR_ENABLE)
85
86/* HT mask */
87#define P4_CCCR_MASK_HT \
88 (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
89
90#define P4_GEN_ESCR_EMASK(class, name, bit) \ 60#define P4_GEN_ESCR_EMASK(class, name, bit) \
91 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) 61 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
92#define P4_ESCR_EMASK_BIT(class, name) class##__##name 62#define P4_ESCR_EMASK_BIT(class, name) class##__##name
@@ -127,6 +97,28 @@
127#define P4_CONFIG_HT_SHIFT 63 97#define P4_CONFIG_HT_SHIFT 63
128#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) 98#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
129 99
100/*
101 * The bits we allow to pass for RAW events
102 */
103#define P4_CONFIG_MASK_ESCR \
104 P4_ESCR_EVENT_MASK | \
105 P4_ESCR_EVENTMASK_MASK | \
106 P4_ESCR_TAG_MASK | \
107 P4_ESCR_TAG_ENABLE
108
109#define P4_CONFIG_MASK_CCCR \
110 P4_CCCR_EDGE | \
111 P4_CCCR_THRESHOLD_MASK | \
112 P4_CCCR_COMPLEMENT | \
113 P4_CCCR_COMPARE | \
114 P4_CCCR_THREAD_ANY | \
115 P4_CCCR_RESERVED
116
117/* some dangerous bits are reserved for kernel internals */
118#define P4_CONFIG_MASK \
119 (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
120 (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
121
130static inline bool p4_is_event_cascaded(u64 config) 122static inline bool p4_is_event_cascaded(u64 config)
131{ 123{
132 u32 cccr = p4_config_unpack_cccr(config); 124 u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a63..ada823a13c7c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
28extern spinlock_t pgd_lock; 28extern spinlock_t pgd_lock;
29extern struct list_head pgd_list; 29extern struct list_head pgd_list;
30 30
31extern struct mm_struct *pgd_page_get_mm(struct page *page);
32
31#ifdef CONFIG_PARAVIRT 33#ifdef CONFIG_PARAVIRT
32#include <asm/paravirt.h> 34#include <asm/paravirt.h>
33#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
603 pte_update(mm, addr, ptep); 605 pte_update(mm, addr, ptep);
604} 606}
605 607
608#define flush_tlb_fix_spurious_fault(vma, address)
609
606/* 610/*
607 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
608 * 612 *
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index f686f49e8b7b..0c92113c4cb6 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -26,7 +26,7 @@ struct mm_struct;
26struct vm_area_struct; 26struct vm_area_struct;
27 27
28extern pgd_t swapper_pg_dir[1024]; 28extern pgd_t swapper_pg_dir[1024];
29extern pgd_t trampoline_pg_dir[1024]; 29extern pgd_t initial_page_table[1024];
30 30
31static inline void pgtable_cache_init(void) { } 31static inline void pgtable_cache_init(void) { }
32static inline void check_pgt_cache(void) { } 32static inline void check_pgt_cache(void) { }
@@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
49#endif 49#endif
50 50
51#if defined(CONFIG_HIGHPTE) 51#if defined(CONFIG_HIGHPTE)
52#define __KM_PTE \
53 (in_nmi() ? KM_NMI_PTE : \
54 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0)
56#define pte_offset_map(dir, address) \ 52#define pte_offset_map(dir, address) \
57 ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ 53 ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \
58 pte_index((address))) 54 pte_index((address)))
59#define pte_offset_map_nested(dir, address) \ 55#define pte_unmap(pte) kunmap_atomic((pte))
60 ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
61 pte_index((address)))
62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
64#else 56#else
65#define pte_offset_map(dir, address) \ 57#define pte_offset_map(dir, address) \
66 ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) 58 ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
67#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
68#define pte_unmap(pte) do { } while (0) 59#define pte_unmap(pte) do { } while (0)
69#define pte_unmap_nested(pte) do { } while (0)
70#endif 60#endif
71 61
72/* Clear a kernel PTE and flush it from the TLB */ 62/* Clear a kernel PTE and flush it from the TLB */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 076052cd62be..f86da20347f2 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
102 native_set_pgd(pgd, native_make_pgd(0)); 102 native_set_pgd(pgd, native_make_pgd(0));
103} 103}
104 104
105extern void sync_global_pgds(unsigned long start, unsigned long end);
106
105/* 107/*
106 * Conversion functions: convert a page and protection to a page entry, 108 * Conversion functions: convert a page and protection to a page entry,
107 * and a page entry and page directory to the page they refer to. 109 * and a page entry and page directory to the page they refer to.
@@ -125,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
125 127
126/* x86-64 always has all page tables mapped. */ 128/* x86-64 always has all page tables mapped. */
127#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) 129#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
128#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
129#define pte_unmap(pte) ((void)(pte))/* NOP */ 130#define pte_unmap(pte) ((void)(pte))/* NOP */
130#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
131 131
132#define update_mmu_cache(vma, address, ptep) do { } while (0) 132#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 133
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bdbebaa..cae9c3cb95cf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,8 @@ struct cpuinfo_x86 {
110 u16 phys_proc_id; 110 u16 phys_proc_id;
111 /* Core id: */ 111 /* Core id: */
112 u16 cpu_core_id; 112 u16 cpu_core_id;
113 /* Compute unit id */
114 u8 compute_unit_id;
113 /* Index into per_cpu list: */ 115 /* Index into per_cpu list: */
114 u16 cpu_index; 116 u16 cpu_index;
115#endif 117#endif
@@ -602,7 +604,7 @@ extern unsigned long mmu_cr4_features;
602 604
603static inline void set_in_cr4(unsigned long mask) 605static inline void set_in_cr4(unsigned long mask)
604{ 606{
605 unsigned cr4; 607 unsigned long cr4;
606 608
607 mmu_cr4_features |= mask; 609 mmu_cr4_features |= mask;
608 cr4 = read_cr4(); 610 cr4 = read_cr4();
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask)
612 614
613static inline void clear_in_cr4(unsigned long mask) 615static inline void clear_in_cr4(unsigned long mask)
614{ 616{
615 unsigned cr4; 617 unsigned long cr4;
616 618
617 mmu_cr4_features &= ~mask; 619 mmu_cr4_features &= ~mask;
618 cr4 = read_cr4(); 620 cr4 = read_cr4();
@@ -764,29 +766,6 @@ extern unsigned long idle_halt;
764extern unsigned long idle_nomwait; 766extern unsigned long idle_nomwait;
765extern bool c1e_detected; 767extern bool c1e_detected;
766 768
767/*
768 * on systems with caches, caches must be flashed as the absolute
769 * last instruction before going into a suspended halt. Otherwise,
770 * dirty data can linger in the cache and become stale on resume,
771 * leading to strange errors.
772 *
773 * perform a variety of operations to guarantee that the compiler
774 * will not reorder instructions. wbinvd itself is serializing
775 * so the processor will not reorder.
776 *
777 * Systems without cache can just go into halt.
778 */
779static inline void wbinvd_halt(void)
780{
781 mb();
782 /* check for clflush to determine if wbinvd is legal */
783 if (cpu_has_clflush)
784 asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
785 else
786 while (1)
787 halt();
788}
789
790extern void enable_sep_cpu(void); 769extern void enable_sep_cpu(void);
791extern int sysenter_setup(void); 770extern int sysenter_setup(void);
792 771
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index cd02f324aa6b..7f7e577a0e39 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
12 struct pvclock_vcpu_time_info *vcpu, 12 struct pvclock_vcpu_time_info *vcpu,
13 struct timespec *ts); 13 struct timespec *ts);
14 14
15/*
16 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
17 * yielding a 64-bit result.
18 */
19static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
20{
21 u64 product;
22#ifdef __i386__
23 u32 tmp1, tmp2;
24#endif
25
26 if (shift < 0)
27 delta >>= -shift;
28 else
29 delta <<= shift;
30
31#ifdef __i386__
32 __asm__ (
33 "mul %5 ; "
34 "mov %4,%%eax ; "
35 "mov %%edx,%4 ; "
36 "mul %5 ; "
37 "xor %5,%5 ; "
38 "add %4,%%eax ; "
39 "adc %5,%%edx ; "
40 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
41 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
42#elif defined(__x86_64__)
43 __asm__ (
44 "mul %%rdx ; shrd $32,%%rdx,%%rax"
45 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
46#else
47#error implement me!
48#endif
49
50 return product;
51}
52
15#endif /* _ASM_X86_PVCLOCK_H */ 53#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 14e0ed86a6f9..231f1c1d6607 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -73,31 +73,31 @@
73 73
74#define GDT_ENTRY_DEFAULT_USER_DS 15 74#define GDT_ENTRY_DEFAULT_USER_DS 15
75 75
76#define GDT_ENTRY_KERNEL_BASE 12 76#define GDT_ENTRY_KERNEL_BASE (12)
77 77
78#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) 78#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0)
79 79
80#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) 80#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1)
81 81
82#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) 82#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4)
83#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) 83#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5)
84 84
85#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) 85#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6)
86#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) 86#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11)
87 87
88#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) 88#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14)
89#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) 89#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
90 90
91#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) 91#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
92#ifdef CONFIG_SMP 92#ifdef CONFIG_SMP
93#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) 93#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
94#else 94#else
95#define __KERNEL_PERCPU 0 95#define __KERNEL_PERCPU 0
96#endif 96#endif
97 97
98#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16) 98#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
99#ifdef CONFIG_CC_STACKPROTECTOR 99#ifdef CONFIG_CC_STACKPROTECTOR
100#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8) 100#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
101#else 101#else
102#define __KERNEL_STACK_CANARY 0 102#define __KERNEL_STACK_CANARY 0
103#endif 103#endif
@@ -182,10 +182,10 @@
182 182
183#endif 183#endif
184 184
185#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) 185#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
186#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) 186#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
187#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) 187#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
188#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) 188#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
189#ifndef CONFIG_PARAVIRT 189#ifndef CONFIG_PARAVIRT
190#define get_kernel_rpl() 0 190#define get_kernel_rpl() 0
191#endif 191#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ef292c792d74..d6763b139a84 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
93 : : "i" (sz)); \ 93 : : "i" (sz)); \
94 } 94 }
95 95
96/* Helper for reserving space for arrays of things */
97#define RESERVE_BRK_ARRAY(type, name, entries) \
98 type *name; \
99 RESERVE_BRK(name, sizeof(type) * entries)
100
96#ifdef __i386__ 101#ifdef __i386__
97 102
98void __init i386_start_kernel(void); 103void __init i386_start_kernel(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4cfc90824068..4c2f63c7fc1b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -50,7 +50,7 @@ struct smp_ops {
50 void (*smp_prepare_cpus)(unsigned max_cpus); 50 void (*smp_prepare_cpus)(unsigned max_cpus);
51 void (*smp_cpus_done)(unsigned max_cpus); 51 void (*smp_cpus_done)(unsigned max_cpus);
52 52
53 void (*smp_send_stop)(void); 53 void (*stop_other_cpus)(int wait);
54 void (*smp_send_reschedule)(int cpu); 54 void (*smp_send_reschedule)(int cpu);
55 55
56 int (*cpu_up)(unsigned cpu); 56 int (*cpu_up)(unsigned cpu);
@@ -73,7 +73,12 @@ extern struct smp_ops smp_ops;
73 73
74static inline void smp_send_stop(void) 74static inline void smp_send_stop(void)
75{ 75{
76 smp_ops.smp_send_stop(); 76 smp_ops.stop_other_cpus(0);
77}
78
79static inline void stop_other_cpus(void)
80{
81 smp_ops.stop_other_cpus(1);
77} 82}
78 83
79static inline void smp_prepare_boot_cpu(void) 84static inline void smp_prepare_boot_cpu(void)
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 8085277e1b8b..977f1761a25d 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -5,17 +5,26 @@
5 5
6#ifdef CONFIG_SWIOTLB 6#ifdef CONFIG_SWIOTLB
7extern int swiotlb; 7extern int swiotlb;
8extern int __init pci_swiotlb_detect(void); 8extern int __init pci_swiotlb_detect_override(void);
9extern int __init pci_swiotlb_detect_4gb(void);
9extern void __init pci_swiotlb_init(void); 10extern void __init pci_swiotlb_init(void);
11extern void __init pci_swiotlb_late_init(void);
10#else 12#else
11#define swiotlb 0 13#define swiotlb 0
12static inline int pci_swiotlb_detect(void) 14static inline int pci_swiotlb_detect_override(void)
15{
16 return 0;
17}
18static inline int pci_swiotlb_detect_4gb(void)
13{ 19{
14 return 0; 20 return 0;
15} 21}
16static inline void pci_swiotlb_init(void) 22static inline void pci_swiotlb_init(void)
17{ 23{
18} 24}
25static inline void pci_swiotlb_late_init(void)
26{
27}
19#endif 28#endif
20 29
21static inline void dma_mark_clean(void *addr, size_t size) {} 30static inline void dma_mark_clean(void *addr, size_t size) {}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba08e7de..169be8938b96 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,
172 flush_tlb_all(); 172 flush_tlb_all();
173} 173}
174 174
175extern void zap_low_mappings(bool early);
176
177#endif /* _ASM_X86_TLBFLUSH_H */ 175#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 4dde797c0578..f4500fb3b485 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -13,16 +13,13 @@ extern unsigned char *trampoline_base;
13 13
14extern unsigned long init_rsp; 14extern unsigned long init_rsp;
15extern unsigned long initial_code; 15extern unsigned long initial_code;
16extern unsigned long initial_page_table;
17extern unsigned long initial_gs; 16extern unsigned long initial_gs;
18 17
19#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
20 19
21extern unsigned long setup_trampoline(void); 20extern unsigned long setup_trampoline(void);
22extern void __init setup_trampoline_page_table(void);
23extern void __init reserve_trampoline_memory(void); 21extern void __init reserve_trampoline_memory(void);
24#else 22#else
25static inline void setup_trampoline_page_table(void) {}
26static inline void reserve_trampoline_memory(void) {} 23static inline void reserve_trampoline_memory(void) {}
27#endif /* CONFIG_X86_TRAMPOLINE */ 24#endif /* CONFIG_X86_TRAMPOLINE */
28 25
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a2907..000000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * VMI interface definition
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Maintained by: Zachary Amsden zach@vmware.com
22 *
23 */
24#include <linux/types.h>
25
26/*
27 *---------------------------------------------------------------------
28 *
29 * VMI Option ROM API
30 *
31 *---------------------------------------------------------------------
32 */
33#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
34
35#define PCI_VENDOR_ID_VMWARE 0x15AD
36#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
37
38/*
39 * We use two version numbers for compatibility, with the major
40 * number signifying interface breakages, and the minor number
41 * interface extensions.
42 */
43#define VMI_API_REV_MAJOR 3
44#define VMI_API_REV_MINOR 0
45
46#define VMI_CALL_CPUID 0
47#define VMI_CALL_WRMSR 1
48#define VMI_CALL_RDMSR 2
49#define VMI_CALL_SetGDT 3
50#define VMI_CALL_SetLDT 4
51#define VMI_CALL_SetIDT 5
52#define VMI_CALL_SetTR 6
53#define VMI_CALL_GetGDT 7
54#define VMI_CALL_GetLDT 8
55#define VMI_CALL_GetIDT 9
56#define VMI_CALL_GetTR 10
57#define VMI_CALL_WriteGDTEntry 11
58#define VMI_CALL_WriteLDTEntry 12
59#define VMI_CALL_WriteIDTEntry 13
60#define VMI_CALL_UpdateKernelStack 14
61#define VMI_CALL_SetCR0 15
62#define VMI_CALL_SetCR2 16
63#define VMI_CALL_SetCR3 17
64#define VMI_CALL_SetCR4 18
65#define VMI_CALL_GetCR0 19
66#define VMI_CALL_GetCR2 20
67#define VMI_CALL_GetCR3 21
68#define VMI_CALL_GetCR4 22
69#define VMI_CALL_WBINVD 23
70#define VMI_CALL_SetDR 24
71#define VMI_CALL_GetDR 25
72#define VMI_CALL_RDPMC 26
73#define VMI_CALL_RDTSC 27
74#define VMI_CALL_CLTS 28
75#define VMI_CALL_EnableInterrupts 29
76#define VMI_CALL_DisableInterrupts 30
77#define VMI_CALL_GetInterruptMask 31
78#define VMI_CALL_SetInterruptMask 32
79#define VMI_CALL_IRET 33
80#define VMI_CALL_SYSEXIT 34
81#define VMI_CALL_Halt 35
82#define VMI_CALL_Reboot 36
83#define VMI_CALL_Shutdown 37
84#define VMI_CALL_SetPxE 38
85#define VMI_CALL_SetPxELong 39
86#define VMI_CALL_UpdatePxE 40
87#define VMI_CALL_UpdatePxELong 41
88#define VMI_CALL_MachineToPhysical 42
89#define VMI_CALL_PhysicalToMachine 43
90#define VMI_CALL_AllocatePage 44
91#define VMI_CALL_ReleasePage 45
92#define VMI_CALL_InvalPage 46
93#define VMI_CALL_FlushTLB 47
94#define VMI_CALL_SetLinearMapping 48
95
96#define VMI_CALL_SetIOPLMask 61
97#define VMI_CALL_SetInitialAPState 62
98#define VMI_CALL_APICWrite 63
99#define VMI_CALL_APICRead 64
100#define VMI_CALL_IODelay 65
101#define VMI_CALL_SetLazyMode 73
102
103/*
104 *---------------------------------------------------------------------
105 *
106 * MMU operation flags
107 *
108 *---------------------------------------------------------------------
109 */
110
111/* Flags used by VMI_{Allocate|Release}Page call */
112#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
113#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
114#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
115
116
117/* Flags shared by Allocate|Release Page and PTE updates */
118#define VMI_PAGE_PT 0x01
119#define VMI_PAGE_PD 0x02
120#define VMI_PAGE_PDP 0x04
121#define VMI_PAGE_PML4 0x08
122
123#define VMI_PAGE_NORMAL 0x00 /* for debugging */
124
125/* Flags used by PTE updates */
126#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
127#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
128#define VMI_PAGE_VA_MASK 0xfffff000
129
130#ifdef CONFIG_X86_PAE
131#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
132#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
133#else
134#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
135#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
136#endif
137
138/* Flags used by VMI_FlushTLB call */
139#define VMI_FLUSH_TLB 0x01
140#define VMI_FLUSH_GLOBAL 0x02
141
142/*
143 *---------------------------------------------------------------------
144 *
145 * VMI relocation definitions for ROM call get_reloc
146 *
147 *---------------------------------------------------------------------
148 */
149
150/* VMI Relocation types */
151#define VMI_RELOCATION_NONE 0
152#define VMI_RELOCATION_CALL_REL 1
153#define VMI_RELOCATION_JUMP_REL 2
154#define VMI_RELOCATION_NOP 3
155
156#ifndef __ASSEMBLY__
157struct vmi_relocation_info {
158 unsigned char *eip;
159 unsigned char type;
160 unsigned char reserved[3];
161};
162#endif
163
164
165/*
166 *---------------------------------------------------------------------
167 *
168 * Generic ROM structures and definitions
169 *
170 *---------------------------------------------------------------------
171 */
172
173#ifndef __ASSEMBLY__
174
175struct vrom_header {
176 u16 rom_signature; /* option ROM signature */
177 u8 rom_length; /* ROM length in 512 byte chunks */
178 u8 rom_entry[4]; /* 16-bit code entry point */
179 u8 rom_pad0; /* 4-byte align pad */
180 u32 vrom_signature; /* VROM identification signature */
181 u8 api_version_min;/* Minor version of API */
182 u8 api_version_maj;/* Major version of API */
183 u8 jump_slots; /* Number of jump slots */
184 u8 reserved1; /* Reserved for expansion */
185 u32 virtual_top; /* Hypervisor virtual address start */
186 u16 reserved2; /* Reserved for expansion */
187 u16 license_offs; /* Offset to License string */
188 u16 pci_header_offs;/* Offset to PCI OPROM header */
189 u16 pnp_header_offs;/* Offset to PnP OPROM header */
190 u32 rom_pad3; /* PnP reserverd / VMI reserved */
191 u8 reserved[96]; /* Reserved for headers */
192 char vmi_init[8]; /* VMI_Init jump point */
193 char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
194} __attribute__((packed));
195
196struct pnp_header {
197 char sig[4];
198 char rev;
199 char size;
200 short next;
201 short res;
202 long devID;
203 unsigned short manufacturer_offset;
204 unsigned short product_offset;
205} __attribute__((packed));
206
207struct pci_header {
208 char sig[4];
209 short vendorID;
210 short deviceID;
211 short vpdData;
212 short size;
213 char rev;
214 char class;
215 char subclass;
216 char interface;
217 short chunks;
218 char rom_version_min;
219 char rom_version_maj;
220 char codetype;
221 char lastRom;
222 short reserved;
223} __attribute__((packed));
224
225/* Function prototypes for bootstrapping */
226#ifdef CONFIG_VMI
227extern void vmi_init(void);
228extern void vmi_activate(void);
229extern void vmi_bringup(void);
230#else
231static inline void vmi_init(void) {}
232static inline void vmi_activate(void) {}
233static inline void vmi_bringup(void) {}
234#endif
235
236/* State needed to start an application processor in an SMP system. */
237struct vmi_ap_state {
238 u32 cr0;
239 u32 cr2;
240 u32 cr3;
241 u32 cr4;
242
243 u64 efer;
244
245 u32 eip;
246 u32 eflags;
247 u32 eax;
248 u32 ebx;
249 u32 ecx;
250 u32 edx;
251 u32 esp;
252 u32 ebp;
253 u32 esi;
254 u32 edi;
255 u16 cs;
256 u16 ss;
257 u16 ds;
258 u16 es;
259 u16 fs;
260 u16 gs;
261 u16 ldtr;
262
263 u16 gdtr_limit;
264 u32 gdtr_base;
265 u32 idtr_base;
266 u16 idtr_limit;
267};
268
269#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3c..000000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
1/*
2 * VMI Time wrappers
3 *
4 * Copyright (C) 2006, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25#ifndef _ASM_X86_VMI_TIME_H
26#define _ASM_X86_VMI_TIME_H
27
28/*
29 * Raw VMI call indices for timer functions
30 */
31#define VMI_CALL_GetCycleFrequency 66
32#define VMI_CALL_GetCycleCounter 67
33#define VMI_CALL_SetAlarm 68
34#define VMI_CALL_CancelAlarm 69
35#define VMI_CALL_GetWallclockTime 70
36#define VMI_CALL_WallclockUpdated 71
37
38/* Cached VMI timer operations */
39extern struct vmi_timer_ops {
40 u64 (*get_cycle_frequency)(void);
41 u64 (*get_cycle_counter)(int);
42 u64 (*get_wallclock)(void);
43 int (*wallclock_updated)(void);
44 void (*set_alarm)(u32 flags, u64 expiry, u64 period);
45 void (*cancel_alarm)(u32 flags);
46} vmi_timer_ops;
47
48/* Prototypes */
49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_sched_clock(void);
53extern unsigned long vmi_tsc_khz(void);
54
55#ifdef CONFIG_X86_LOCAL_APIC
56extern void __devinit vmi_time_bsp_init(void);
57extern void __devinit vmi_time_ap_init(void);
58#endif
59
60/*
61 * When run under a hypervisor, a vcpu is always in one of three states:
62 * running, halted, or ready. The vcpu is in the 'running' state if it
63 * is executing. When the vcpu executes the halt interface, the vcpu
64 * enters the 'halted' state and remains halted until there is some work
65 * pending for the vcpu (e.g. an alarm expires, host I/O completes on
66 * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
67 * state (waiting for the hypervisor to reschedule it). Finally, at any
68 * time when the vcpu is not in the 'running' state nor the 'halted'
69 * state, it is in the 'ready' state.
70 *
71 * Real time is advances while the vcpu is 'running', 'ready', or
72 * 'halted'. Stolen time is the time in which the vcpu is in the
73 * 'ready' state. Available time is the remaining time -- the vcpu is
74 * either 'running' or 'halted'.
75 *
76 * All three views of time are accessible through the VMI cycle
77 * counters.
78 */
79
80/* The cycle counters. */
81#define VMI_CYCLES_REAL 0
82#define VMI_CYCLES_AVAILABLE 1
83#define VMI_CYCLES_STOLEN 2
84
85/* The alarm interface 'flags' bits */
86#define VMI_ALARM_COUNTERS 2
87
88#define VMI_ALARM_COUNTER_MASK 0x000000ff
89
90#define VMI_ALARM_WIRED_IRQ0 0x00000000
91#define VMI_ALARM_WIRED_LVTT 0x00010000
92
93#define VMI_ALARM_IS_ONESHOT 0x00000000
94#define VMI_ALARM_IS_PERIODIC 0x00000100
95
96#define CONFIG_VMI_ALARM_HZ 100
97
98#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 7fda040a76cd..a3c28ae4025b 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -200,6 +200,23 @@ extern struct { char _entry[32]; } hypercall_page[];
200 (type)__res; \ 200 (type)__res; \
201}) 201})
202 202
203static inline long
204privcmd_call(unsigned call,
205 unsigned long a1, unsigned long a2,
206 unsigned long a3, unsigned long a4,
207 unsigned long a5)
208{
209 __HYPERCALL_DECLS;
210 __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
211
212 asm volatile("call *%[call]"
213 : __HYPERCALL_5PARAM
214 : [call] "a" (&hypercall_page[call])
215 : __HYPERCALL_CLOBBER5);
216
217 return (long)__res;
218}
219
203static inline int 220static inline int
204HYPERVISOR_set_trap_table(struct trap_info *table) 221HYPERVISOR_set_trap_table(struct trap_info *table)
205{ 222{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index bf5f7d32bd08..dd8c1414b3d5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -37,14 +37,21 @@ typedef struct xpaddr {
37 37
38 38
39extern unsigned long get_phys_to_machine(unsigned long pfn); 39extern unsigned long get_phys_to_machine(unsigned long pfn);
40extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); 40extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
41 41
42static inline unsigned long pfn_to_mfn(unsigned long pfn) 42static inline unsigned long pfn_to_mfn(unsigned long pfn)
43{ 43{
44 unsigned long mfn;
45
44 if (xen_feature(XENFEAT_auto_translated_physmap)) 46 if (xen_feature(XENFEAT_auto_translated_physmap))
45 return pfn; 47 return pfn;
46 48
47 return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; 49 mfn = get_phys_to_machine(pfn);
50
51 if (mfn != INVALID_P2M_ENTRY)
52 mfn &= ~FOREIGN_FRAME_BIT;
53
54 return mfn;
48} 55}
49 56
50static inline int phys_to_machine_mapping_valid(unsigned long pfn) 57static inline int phys_to_machine_mapping_valid(unsigned long pfn)
@@ -159,6 +166,7 @@ static inline pte_t __pte_ma(pteval_t x)
159 166
160#define pgd_val_ma(x) ((x).pgd) 167#define pgd_val_ma(x) ((x).pgd)
161 168
169void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
162 170
163xmaddr_t arbitrary_virt_to_machine(void *address); 171xmaddr_t arbitrary_virt_to_machine(void *address);
164unsigned long arbitrary_virt_to_mfn(void *vaddr); 172unsigned long arbitrary_virt_to_mfn(void *vaddr);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index fedf32a8c3ec..9e13763b6092 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -34,8 +34,8 @@ GCOV_PROFILE_paravirt.o := n
34obj-y := process_$(BITS).o signal.o entry_$(BITS).o 34obj-y := process_$(BITS).o signal.o entry_$(BITS).o
35obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 35obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
36obj-y += time.o ioport.o ldt.o dumpstack.o 36obj-y += time.o ioport.o ldt.o dumpstack.o
37obj-y += setup.o x86_init.o i8259.o irqinit.o 37obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
38obj-$(CONFIG_X86_VISWS) += visws_quirks.o 38obj-$(CONFIG_IRQ_WORK) += irq_work.o
39obj-$(CONFIG_X86_32) += probe_roms_32.o 39obj-$(CONFIG_X86_32) += probe_roms_32.o
40obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 40obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
@@ -44,6 +44,7 @@ obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 46obj-y += tsc.o io_delay.o rtc.o
47obj-y += pci-iommu_table.o
47 48
48obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 49obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
49obj-y += process.o 50obj-y += process.o
@@ -56,7 +57,6 @@ obj-$(CONFIG_INTEL_TXT) += tboot.o
56obj-$(CONFIG_STACKTRACE) += stacktrace.o 57obj-$(CONFIG_STACKTRACE) += stacktrace.o
57obj-y += cpu/ 58obj-y += cpu/
58obj-y += acpi/ 59obj-y += acpi/
59obj-$(CONFIG_SFI) += sfi.o
60obj-y += reboot.o 60obj-y += reboot.o
61obj-$(CONFIG_MCA) += mca_32.o 61obj-$(CONFIG_MCA) += mca_32.o
62obj-$(CONFIG_X86_MSR) += msr.o 62obj-$(CONFIG_X86_MSR) += msr.o
@@ -80,20 +80,19 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
80obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 80obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
81obj-$(CONFIG_KPROBES) += kprobes.o 81obj-$(CONFIG_KPROBES) += kprobes.o
82obj-$(CONFIG_MODULES) += module.o 82obj-$(CONFIG_MODULES) += module.o
83obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
84obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 83obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
85obj-$(CONFIG_KGDB) += kgdb.o 84obj-$(CONFIG_KGDB) += kgdb.o
86obj-$(CONFIG_VM86) += vm86_32.o 85obj-$(CONFIG_VM86) += vm86_32.o
87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 86obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
87obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
88 88
89obj-$(CONFIG_HPET_TIMER) += hpet.o 89obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o 90obj-$(CONFIG_APB_TIMER) += apb_timer.o
91 91
92obj-$(CONFIG_K8_NB) += k8.o 92obj-$(CONFIG_AMD_NB) += amd_nb.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 95
96obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
97obj-$(CONFIG_KVM_GUEST) += kvm.o 96obj-$(CONFIG_KVM_GUEST) += kvm.o
98obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 97obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
99obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 98obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -102,13 +101,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
102 101
103obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 102obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
104 103
105obj-$(CONFIG_SCx200) += scx200.o
106scx200-y += scx200_32.o
107
108obj-$(CONFIG_OLPC) += olpc.o
109obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
110obj-$(CONFIG_X86_MRST) += mrst.o
111
112microcode-y := microcode_core.o 104microcode-y := microcode_core.o
113microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o 105microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
114microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o 106microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
@@ -121,8 +113,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
121### 113###
122# 64 bit specific files 114# 64 bit specific files
123ifeq ($(CONFIG_X86_64),y) 115ifeq ($(CONFIG_X86_64),y)
124 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
125 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
126 obj-$(CONFIG_AUDIT) += audit_64.o 116 obj-$(CONFIG_AUDIT) += audit_64.o
127 117
128 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 118 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59be..5812404a0d4c 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
13 13
14#include <acpi/processor.h> 14#include <acpi/processor.h>
15#include <asm/acpi.h> 15#include <asm/acpi.h>
16#include <asm/mwait.h>
16 17
17/* 18/*
18 * Initialize bm_flags based on the CPU cache properties 19 * Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
65 66
66static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; 67static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
67 68
68#define MWAIT_SUBSTATE_MASK (0xf)
69#define MWAIT_CSTATE_MASK (0xf)
70#define MWAIT_SUBSTATE_SIZE (4)
71
72#define CPUID_MWAIT_LEAF (5)
73#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
74#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
75
76#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
77
78#define NATIVE_CSTATE_BEYOND_HALT (2) 69#define NATIVE_CSTATE_BEYOND_HALT (2)
79 70
80static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) 71static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 33cec152070d..69fd72aa5594 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -7,11 +7,16 @@
7 7
8#include <linux/acpi.h> 8#include <linux/acpi.h>
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/memblock.h>
10#include <linux/dmi.h> 11#include <linux/dmi.h>
11#include <linux/cpumask.h> 12#include <linux/cpumask.h>
12#include <asm/segment.h> 13#include <asm/segment.h>
13#include <asm/desc.h> 14#include <asm/desc.h>
14 15
16#ifdef CONFIG_X86_32
17#include <asm/pgtable.h>
18#endif
19
15#include "realmode/wakeup.h" 20#include "realmode/wakeup.h"
16#include "sleep.h" 21#include "sleep.h"
17 22
@@ -90,7 +95,7 @@ int acpi_save_state_mem(void)
90 95
91#ifndef CONFIG_64BIT 96#ifndef CONFIG_64BIT
92 header->pmode_entry = (u32)&wakeup_pmode_return; 97 header->pmode_entry = (u32)&wakeup_pmode_return;
93 header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); 98 header->pmode_cr3 = (u32)__pa(&initial_page_table);
94 saved_magic = 0x12345678; 99 saved_magic = 0x12345678;
95#else /* CONFIG_64BIT */ 100#else /* CONFIG_64BIT */
96 header->trampoline_segment = setup_trampoline() >> 4; 101 header->trampoline_segment = setup_trampoline() >> 4;
@@ -125,7 +130,7 @@ void acpi_restore_state_mem(void)
125 */ 130 */
126void __init acpi_reserve_wakeup_memory(void) 131void __init acpi_reserve_wakeup_memory(void)
127{ 132{
128 unsigned long mem; 133 phys_addr_t mem;
129 134
130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 135 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
131 printk(KERN_ERR 136 printk(KERN_ERR
@@ -133,15 +138,15 @@ void __init acpi_reserve_wakeup_memory(void)
133 return; 138 return;
134 } 139 }
135 140
136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); 141 mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
137 142
138 if (mem == -1L) { 143 if (mem == MEMBLOCK_ERROR) {
139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 144 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
140 return; 145 return;
141 } 146 }
142 acpi_realmode = (unsigned long) phys_to_virt(mem); 147 acpi_realmode = (unsigned long) phys_to_virt(mem);
143 acpi_wakeup_address = mem; 148 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); 149 memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 150}
146 151
147 152
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f65ab8b014c4..a36bb90aef53 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
195 195
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 196extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern s32 __smp_locks[], __smp_locks_end[]; 197extern s32 __smp_locks[], __smp_locks_end[];
198static void *text_poke_early(void *addr, const void *opcode, size_t len); 198void *text_poke_early(void *addr, const void *opcode, size_t len);
199 199
200/* Replace instructions with better alternatives for this CPU type. 200/* Replace instructions with better alternatives for this CPU type.
201 This runs before SMP is initialized to avoid SMP problems with 201 This runs before SMP is initialized to avoid SMP problems with
@@ -522,7 +522,7 @@ void __init alternative_instructions(void)
522 * instructions. And on the local CPU you need to be protected again NMI or MCE 522 * instructions. And on the local CPU you need to be protected again NMI or MCE
523 * handlers seeing an inconsistent instruction while you patch. 523 * handlers seeing an inconsistent instruction while you patch.
524 */ 524 */
525static void *__init_or_module text_poke_early(void *addr, const void *opcode, 525void *__init_or_module text_poke_early(void *addr, const void *opcode,
526 size_t len) 526 size_t len)
527{ 527{
528 unsigned long flags; 528 unsigned long flags;
@@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
637 tpp.len = len; 637 tpp.len = len;
638 atomic_set(&stop_machine_first, 1); 638 atomic_set(&stop_machine_first, 1);
639 wrote_text = 0; 639 wrote_text = 0;
640 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); 640 /* Use __stop_machine() because the caller already got online_cpus. */
641 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
641 return addr; 642 return addr;
642} 643}
643 644
645#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
646
647unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
648
649void __init arch_init_ideal_nop5(void)
650{
651 extern const unsigned char ftrace_test_p6nop[];
652 extern const unsigned char ftrace_test_nop5[];
653 extern const unsigned char ftrace_test_jmp[];
654 int faulted = 0;
655
656 /*
657 * There is no good nop for all x86 archs.
658 * We will default to using the P6_NOP5, but first we
659 * will test to make sure that the nop will actually
660 * work on this CPU. If it faults, we will then
661 * go to a lesser efficient 5 byte nop. If that fails
662 * we then just use a jmp as our nop. This isn't the most
663 * efficient nop, but we can not use a multi part nop
664 * since we would then risk being preempted in the middle
665 * of that nop, and if we enabled tracing then, it might
666 * cause a system crash.
667 *
668 * TODO: check the cpuid to determine the best nop.
669 */
670 asm volatile (
671 "ftrace_test_jmp:"
672 "jmp ftrace_test_p6nop\n"
673 "nop\n"
674 "nop\n"
675 "nop\n" /* 2 byte jmp + 3 bytes */
676 "ftrace_test_p6nop:"
677 P6_NOP5
678 "jmp 1f\n"
679 "ftrace_test_nop5:"
680 ".byte 0x66,0x66,0x66,0x66,0x90\n"
681 "1:"
682 ".section .fixup, \"ax\"\n"
683 "2: movl $1, %0\n"
684 " jmp ftrace_test_nop5\n"
685 "3: movl $2, %0\n"
686 " jmp 1b\n"
687 ".previous\n"
688 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
689 _ASM_EXTABLE(ftrace_test_nop5, 3b)
690 : "=r"(faulted) : "0" (faulted));
691
692 switch (faulted) {
693 case 0:
694 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
695 memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5);
696 break;
697 case 1:
698 pr_info("converting mcount calls to 66 66 66 66 90\n");
699 memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5);
700 break;
701 case 2:
702 pr_info("converting mcount calls to jmp . + 5\n");
703 memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5);
704 break;
705 }
706
707}
708#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382b..d2fdb0826df2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed8..6e11c8134158 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -31,7 +31,7 @@
31#include <asm/iommu.h> 31#include <asm/iommu.h>
32#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h> 33#include <asm/x86_init.h>
34 34#include <asm/iommu_table.h>
35/* 35/*
36 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
37 */ 37 */
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
194 return 1UL << shift; 194 return 1UL << shift;
195} 195}
196 196
197/* Access to l1 and l2 indexed register spaces */
198
199static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
200{
201 u32 val;
202
203 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
204 pci_read_config_dword(iommu->dev, 0xfc, &val);
205 return val;
206}
207
208static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
209{
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
211 pci_write_config_dword(iommu->dev, 0xfc, val);
212 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
213}
214
215static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
216{
217 u32 val;
218
219 pci_write_config_dword(iommu->dev, 0xf0, address);
220 pci_read_config_dword(iommu->dev, 0xf4, &val);
221 return val;
222}
223
224static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
225{
226 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
227 pci_write_config_dword(iommu->dev, 0xf4, val);
228}
229
197/**************************************************************************** 230/****************************************************************************
198 * 231 *
199 * AMD IOMMU MMIO register space handling functions 232 * AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
619{ 652{
620 int cap_ptr = iommu->cap_ptr; 653 int cap_ptr = iommu->cap_ptr;
621 u32 range, misc; 654 u32 range, misc;
655 int i, j;
622 656
623 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, 657 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
624 &iommu->cap); 658 &iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
633 MMIO_GET_LD(range)); 667 MMIO_GET_LD(range));
634 iommu->evt_msi_num = MMIO_MSI_NUM(misc); 668 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
635 669
636 if (is_rd890_iommu(iommu->dev)) { 670 if (!is_rd890_iommu(iommu->dev))
637 pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); 671 return;
638 pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); 672
639 pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); 673 /*
640 pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); 674 * Some rd890 systems may not be fully reconfigured by the BIOS, so
641 } 675 * it's necessary for us to store this information so it can be
676 * reprogrammed on resume
677 */
678
679 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
680 &iommu->stored_addr_lo);
681 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
682 &iommu->stored_addr_hi);
683
684 /* Low bit locks writes to configuration space */
685 iommu->stored_addr_lo &= ~1;
686
687 for (i = 0; i < 6; i++)
688 for (j = 0; j < 0x12; j++)
689 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
690
691 for (i = 0; i < 0x83; i++)
692 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
642} 693}
643 694
644/* 695/*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
1127 iommu_feature_enable(iommu, CONTROL_COHERENT_EN); 1178 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1128} 1179}
1129 1180
1130static void iommu_apply_quirks(struct amd_iommu *iommu) 1181static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
1131{ 1182{
1132 if (is_rd890_iommu(iommu->dev)) { 1183 int i, j;
1133 pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); 1184 u32 ioc_feature_control;
1134 pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); 1185 struct pci_dev *pdev = NULL;
1135 pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); 1186
1136 pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); 1187 /* RD890 BIOSes may not have completely reconfigured the iommu */
1137 } 1188 if (!is_rd890_iommu(iommu->dev))
1189 return;
1190
1191 /*
1192 * First, we need to ensure that the iommu is enabled. This is
1193 * controlled by a register in the northbridge
1194 */
1195 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1196
1197 if (!pdev)
1198 return;
1199
1200 /* Select Northbridge indirect register 0x75 and enable writing */
1201 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1202 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1203
1204 /* Enable the iommu */
1205 if (!(ioc_feature_control & 0x1))
1206 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1207
1208 pci_dev_put(pdev);
1209
1210 /* Restore the iommu BAR */
1211 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1212 iommu->stored_addr_lo);
1213 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1214 iommu->stored_addr_hi);
1215
1216 /* Restore the l1 indirect regs for each of the 6 l1s */
1217 for (i = 0; i < 6; i++)
1218 for (j = 0; j < 0x12; j++)
1219 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1220
1221 /* Restore the l2 indirect regs */
1222 for (i = 0; i < 0x83; i++)
1223 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1224
1225 /* Lock PCI setup registers */
1226 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1227 iommu->stored_addr_lo | 1);
1138} 1228}
1139 1229
1140/* 1230/*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
1147 1237
1148 for_each_iommu(iommu) { 1238 for_each_iommu(iommu) {
1149 iommu_disable(iommu); 1239 iommu_disable(iommu);
1150 iommu_apply_quirks(iommu);
1151 iommu_init_flags(iommu); 1240 iommu_init_flags(iommu);
1152 iommu_set_device_table(iommu); 1241 iommu_set_device_table(iommu);
1153 iommu_enable_command_buffer(iommu); 1242 iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
1173 1262
1174static int amd_iommu_resume(struct sys_device *dev) 1263static int amd_iommu_resume(struct sys_device *dev)
1175{ 1264{
1265 struct amd_iommu *iommu;
1266
1267 for_each_iommu(iommu)
1268 iommu_apply_resume_quirks(iommu);
1269
1176 /* re-load the hardware */ 1270 /* re-load the hardware */
1177 enable_iommus(); 1271 enable_iommus();
1178 1272
@@ -1405,13 +1499,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1405 return 0; 1499 return 0;
1406} 1500}
1407 1501
1408void __init amd_iommu_detect(void) 1502int __init amd_iommu_detect(void)
1409{ 1503{
1410 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 1504 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1411 return; 1505 return -ENODEV;
1412 1506
1413 if (amd_iommu_disabled) 1507 if (amd_iommu_disabled)
1414 return; 1508 return -ENODEV;
1415 1509
1416 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1510 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1417 iommu_detected = 1; 1511 iommu_detected = 1;
@@ -1420,7 +1514,9 @@ void __init amd_iommu_detect(void)
1420 1514
1421 /* Make sure ACS will be enabled */ 1515 /* Make sure ACS will be enabled */
1422 pci_request_acs(); 1516 pci_request_acs();
1517 return 1;
1423 } 1518 }
1519 return -ENODEV;
1424} 1520}
1425 1521
1426/**************************************************************************** 1522/****************************************************************************
@@ -1451,3 +1547,8 @@ static int __init parse_amd_iommu_options(char *str)
1451 1547
1452__setup("amd_iommu_dump", parse_amd_iommu_dump); 1548__setup("amd_iommu_dump", parse_amd_iommu_dump);
1453__setup("amd_iommu=", parse_amd_iommu_options); 1549__setup("amd_iommu=", parse_amd_iommu_options);
1550
1551IOMMU_INIT_FINISH(amd_iommu_detect,
1552 gart_iommu_hole_init,
1553 0,
1554 0);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c
index 0f7bc20cfcde..8f6463d8ed0d 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -8,21 +8,19 @@
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <asm/k8.h> 11#include <asm/amd_nb.h>
12
13int num_k8_northbridges;
14EXPORT_SYMBOL(num_k8_northbridges);
15 12
16static u32 *flush_words; 13static u32 *flush_words;
17 14
18struct pci_device_id k8_nb_ids[] = { 15struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
21 {} 19 {}
22}; 20};
23EXPORT_SYMBOL(k8_nb_ids); 21EXPORT_SYMBOL(k8_nb_ids);
24 22
25struct pci_dev **k8_northbridges; 23struct k8_northbridge_info k8_northbridges;
26EXPORT_SYMBOL(k8_northbridges); 24EXPORT_SYMBOL(k8_northbridges);
27 25
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) 26static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
@@ -40,36 +38,45 @@ int cache_k8_northbridges(void)
40 int i; 38 int i;
41 struct pci_dev *dev; 39 struct pci_dev *dev;
42 40
43 if (num_k8_northbridges) 41 if (k8_northbridges.num)
44 return 0; 42 return 0;
45 43
46 dev = NULL; 44 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL) 45 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++; 46 k8_northbridges.num++;
47
48 /* some CPU families (e.g. family 0x11) do not support GART */
49 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
50 boot_cpu_data.x86 == 0x15)
51 k8_northbridges.gart_supported = 1;
49 52
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), 53 k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
51 GFP_KERNEL); 54 sizeof(void *), GFP_KERNEL);
52 if (!k8_northbridges) 55 if (!k8_northbridges.nb_misc)
53 return -ENOMEM; 56 return -ENOMEM;
54 57
55 if (!num_k8_northbridges) { 58 if (!k8_northbridges.num) {
56 k8_northbridges[0] = NULL; 59 k8_northbridges.nb_misc[0] = NULL;
57 return 0; 60 return 0;
58 } 61 }
59 62
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); 63 if (k8_northbridges.gart_supported) {
61 if (!flush_words) { 64 flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
62 kfree(k8_northbridges); 65 GFP_KERNEL);
63 return -ENOMEM; 66 if (!flush_words) {
67 kfree(k8_northbridges.nb_misc);
68 return -ENOMEM;
69 }
64 } 70 }
65 71
66 dev = NULL; 72 dev = NULL;
67 i = 0; 73 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) { 74 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev; 75 k8_northbridges.nb_misc[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]); 76 if (k8_northbridges.gart_supported)
77 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 } 78 }
72 k8_northbridges[i] = NULL; 79 k8_northbridges.nb_misc[i] = NULL;
73 return 0; 80 return 0;
74} 81}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges); 82EXPORT_SYMBOL_GPL(cache_k8_northbridges);
@@ -93,22 +100,25 @@ void k8_flush_garts(void)
93 unsigned long flags; 100 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock); 101 static DEFINE_SPINLOCK(gart_lock);
95 102
103 if (!k8_northbridges.gart_supported)
104 return;
105
96 /* Avoid races between AGP and IOMMU. In theory it's not needed 106 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests 107 but I'm not sure if the hardware won't lose flush requests
98 when another is pending. This whole thing is so expensive anyways 108 when another is pending. This whole thing is so expensive anyways
99 that it doesn't matter to serialize more. -AK */ 109 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags); 110 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0; 111 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) { 112 for (i = 0; i < k8_northbridges.num; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c, 113 pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
104 flush_words[i]|1); 114 flush_words[i]|1);
105 flushed++; 115 flushed++;
106 } 116 }
107 for (i = 0; i < num_k8_northbridges; i++) { 117 for (i = 0; i < k8_northbridges.num; i++) {
108 u32 w; 118 u32 w;
109 /* Make sure the hardware actually executed the flush*/ 119 /* Make sure the hardware actually executed the flush*/
110 for (;;) { 120 for (;;) {
111 pci_read_config_dword(k8_northbridges[i], 121 pci_read_config_dword(k8_northbridges.nb_misc[i],
112 0x9c, &w); 122 0x9c, &w);
113 if (!(w & 1)) 123 if (!(w & 1))
114 break; 124 break;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 42a70a2accc0..92543c73cf8e 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -392,7 +392,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
392 } 392 }
393 break; 393 break;
394 default: 394 default:
395 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); 395 pr_debug("APBT notified %lu, no action\n", action);
396 } 396 }
397 return NOTIFY_OK; 397 return NOTIFY_OK;
398} 398}
@@ -546,7 +546,7 @@ bad_count:
546 pr_debug("APB CS going back %lx:%lx:%lx ", 546 pr_debug("APB CS going back %lx:%lx:%lx ",
547 t2, last_read, t2 - last_read); 547 t2, last_read, t2 - last_read);
548bad_count_x3: 548bad_count_x3:
549 pr_debug(KERN_INFO "tripple check enforced\n"); 549 pr_debug("triple check enforced\n");
550 t0 = apbt_readl(phy_cs_timer_id, 550 t0 = apbt_readl(phy_cs_timer_id,
551 APBTMR_N_CURRENT_VALUE); 551 APBTMR_N_CURRENT_VALUE);
552 udelay(1); 552 udelay(1);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e17..b3a16e8f0703 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,7 +27,7 @@
27#include <asm/gart.h> 27#include <asm/gart.h>
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/amd_nb.h>
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33int gart_iommu_aperture; 33int gart_iommu_aperture;
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
307 continue; 307 continue;
308 308
309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
310 aper_enabled = ctl & AMD64_GARTEN; 310 aper_enabled = ctl & GARTEN;
311 aper_order = (ctl >> 1) & 7; 311 aper_order = (ctl >> 1) & 7;
312 aper_size = (32 * 1024 * 1024) << aper_order; 312 aper_size = (32 * 1024 * 1024) << aper_order;
313 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 313 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
362 continue; 362 continue;
363 363
364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
365 ctl &= ~AMD64_GARTEN; 365 ctl &= ~GARTEN;
366 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); 366 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
367 } 367 }
368 } 368 }
@@ -371,7 +371,7 @@ void __init early_gart_iommu_check(void)
371 371
372static int __initdata printed_gart_size_msg; 372static int __initdata printed_gart_size_msg;
373 373
374void __init gart_iommu_hole_init(void) 374int __init gart_iommu_hole_init(void)
375{ 375{
376 u32 agp_aper_base = 0, agp_aper_order = 0; 376 u32 agp_aper_base = 0, agp_aper_order = 0;
377 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 377 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
@@ -381,7 +381,7 @@ void __init gart_iommu_hole_init(void)
381 381
382 if (gart_iommu_aperture_disabled || !fix_aperture || 382 if (gart_iommu_aperture_disabled || !fix_aperture ||
383 !early_pci_allowed()) 383 !early_pci_allowed())
384 return; 384 return -ENODEV;
385 385
386 printk(KERN_INFO "Checking aperture...\n"); 386 printk(KERN_INFO "Checking aperture...\n");
387 387
@@ -463,8 +463,9 @@ out:
463 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 463 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
464 464
465 insert_aperture_resource((u32)last_aper_base, n); 465 insert_aperture_resource((u32)last_aper_base, n);
466 return 1;
466 } 467 }
467 return; 468 return 0;
468 } 469 }
469 470
470 if (!fallback_aper_force) { 471 if (!fallback_aper_force) {
@@ -500,13 +501,18 @@ out:
500 panic("Not enough memory for aperture"); 501 panic("Not enough memory for aperture");
501 } 502 }
502 } else { 503 } else {
503 return; 504 return 0;
504 } 505 }
505 506
506 /* Fix up the north bridges */ 507 /* Fix up the north bridges */
507 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 508 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
508 int bus; 509 int bus, dev_base, dev_limit;
509 int dev_base, dev_limit; 510
511 /*
512 * Don't enable translation yet but enable GART IO and CPU
513 * accesses and set DISTLBWALKPRB since GART table memory is UC.
514 */
515 u32 ctl = DISTLBWALKPRB | aper_order << 1;
510 516
511 bus = bus_dev_ranges[i].bus; 517 bus = bus_dev_ranges[i].bus;
512 dev_base = bus_dev_ranges[i].dev_base; 518 dev_base = bus_dev_ranges[i].dev_base;
@@ -515,13 +521,12 @@ out:
515 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 521 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
516 continue; 522 continue;
517 523
518 /* Don't enable translation yet. That is done later. 524 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
519 Assume this BIOS didn't initialise the GART so
520 just overwrite all previous bits */
521 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
522 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); 525 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
523 } 526 }
524 } 527 }
525 528
526 set_up_gart_resume(aper_order, aper_alloc); 529 set_up_gart_resume(aper_order, aper_alloc);
530
531 return 1;
527} 532}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8cf86fb3b4e3..850657d1b0ed 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -52,6 +52,7 @@
52#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h> 53#include <asm/kvm_para.h>
54#include <asm/tsc.h> 54#include <asm/tsc.h>
55#include <asm/atomic.h>
55 56
56unsigned int num_processors; 57unsigned int num_processors;
57 58
@@ -370,38 +371,87 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
370} 371}
371 372
372/* 373/*
373 * Setup extended LVT, AMD specific (K8, family 10h) 374 * Setup extended LVT, AMD specific
374 * 375 *
375 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and 376 * Software should use the LVT offsets the BIOS provides. The offsets
376 * MCE interrupts are supported. Thus MCE offset must be set to 0. 377 * are determined by the subsystems using it like those for MCE
378 * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
379 * are supported. Beginning with family 10h at least 4 offsets are
380 * available.
377 * 381 *
378 * If mask=1, the LVT entry does not generate interrupts while mask=0 382 * Since the offsets must be consistent for all cores, we keep track
379 * enables the vector. See also the BKDGs. 383 * of the LVT offsets in software and reserve the offset for the same
384 * vector also to be used on other cores. An offset is freed by
385 * setting the entry to APIC_EILVT_MASKED.
386 *
387 * If the BIOS is right, there should be no conflicts. Otherwise a
388 * "[Firmware Bug]: ..." error message is generated. However, if
389 * software does not properly determines the offsets, it is not
390 * necessarily a BIOS bug.
380 */ 391 */
381 392
382#define APIC_EILVT_LVTOFF_MCE 0 393static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
383#define APIC_EILVT_LVTOFF_IBS 1
384 394
385static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) 395static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
386{ 396{
387 unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); 397 return (old & APIC_EILVT_MASKED)
388 unsigned int v = (mask << 16) | (msg_type << 8) | vector; 398 || (new == APIC_EILVT_MASKED)
389 399 || ((new & ~APIC_EILVT_MASKED) == old);
390 apic_write(reg, v);
391} 400}
392 401
393u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) 402static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
394{ 403{
395 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); 404 unsigned int rsvd; /* 0: uninitialized */
396 return APIC_EILVT_LVTOFF_MCE; 405
406 if (offset >= APIC_EILVT_NR_MAX)
407 return ~0;
408
409 rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
410 do {
411 if (rsvd &&
412 !eilvt_entry_is_changeable(rsvd, new))
413 /* may not change if vectors are different */
414 return rsvd;
415 rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
416 } while (rsvd != new);
417
418 return new;
397} 419}
398 420
399u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) 421/*
422 * If mask=1, the LVT entry does not generate interrupts while mask=0
423 * enables the vector. See also the BKDGs.
424 */
425
426int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
400{ 427{
401 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); 428 unsigned long reg = APIC_EILVTn(offset);
402 return APIC_EILVT_LVTOFF_IBS; 429 unsigned int new, old, reserved;
430
431 new = (mask << 16) | (msg_type << 8) | vector;
432 old = apic_read(reg);
433 reserved = reserve_eilvt_offset(offset, new);
434
435 if (reserved != new) {
436 pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
437 "vector 0x%x was already reserved by another core, "
438 "APIC%lX=0x%x\n",
439 smp_processor_id(), new, reserved, reg, old);
440 return -EINVAL;
441 }
442
443 if (!eilvt_entry_is_changeable(old, new)) {
444 pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
445 "register already in use, APIC%lX=0x%x\n",
446 smp_processor_id(), new, reg, old);
447 return -EBUSY;
448 }
449
450 apic_write(reg, new);
451
452 return 0;
403} 453}
404EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); 454EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
405 455
406/* 456/*
407 * Program the next event, relative to now 457 * Program the next event, relative to now
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0885a4120737..0929191d83cf 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3109,7 +3109,8 @@ void destroy_irq(unsigned int irq)
3109 3109
3110 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); 3110 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3111 3111
3112 free_irte(irq); 3112 if (intr_remapping_enabled)
3113 free_irte(irq);
3113 raw_spin_lock_irqsave(&vector_lock, flags); 3114 raw_spin_lock_irqsave(&vector_lock, flags);
3114 __clear_irq_vector(irq, cfg); 3115 __clear_irq_vector(irq, cfg);
3115 raw_spin_unlock_irqrestore(&vector_lock, flags); 3116 raw_spin_unlock_irqrestore(&vector_lock, flags);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 3e28401f161c..960f26ab5c9f 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -26,6 +26,7 @@
26#include <linux/nodemask.h> 26#include <linux/nodemask.h>
27#include <linux/topology.h> 27#include <linux/topology.h>
28#include <linux/bootmem.h> 28#include <linux/bootmem.h>
29#include <linux/memblock.h>
29#include <linux/threads.h> 30#include <linux/threads.h>
30#include <linux/cpumask.h> 31#include <linux/cpumask.h>
31#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -88,7 +89,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
88 node_end_pfn[node] = 89 node_end_pfn[node] =
89 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); 90 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
90 91
91 e820_register_active_regions(node, node_start_pfn[node], 92 memblock_x86_register_active_regions(node, node_start_pfn[node],
92 node_end_pfn[node]); 93 node_end_pfn[node]);
93 94
94 memory_present(node, node_start_pfn[node], node_end_pfn[node]); 95 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 4c9c67bf09b7..0e4f24c2a746 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -189,8 +189,8 @@
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. 189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 * 190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax 191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from 192 * 916.356.6100) or 800.548.4725; or from
193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also 193 * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
194 * available from Microsoft by calling 206.882.8080.] 194 * available from Microsoft by calling 206.882.8080.]
195 * 195 *
196 * APM 1.2 Reference: 196 * APM 1.2 Reference:
@@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = {
1926 .unlocked_ioctl = do_ioctl, 1926 .unlocked_ioctl = do_ioctl,
1927 .open = do_open, 1927 .open = do_open,
1928 .release = do_release, 1928 .release = do_release,
1929 .llseek = noop_llseek,
1929}; 1930};
1930 1931
1931static struct miscdevice apm_device = { 1932static struct miscdevice apm_device = {
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf6403895..1a4088dda37a 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -99,9 +99,7 @@ void foo(void)
99 99
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); 101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); 102 DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
103 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
104 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
105 103
106 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 104 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
107 105
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46a..13a389179514 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -2,7 +2,8 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/kthread.h> 3#include <linux/kthread.h>
4#include <linux/workqueue.h> 4#include <linux/workqueue.h>
5#include <asm/e820.h> 5#include <linux/memblock.h>
6
6#include <asm/proto.h> 7#include <asm/proto.h>
7 8
8/* 9/*
@@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1;
18static unsigned __read_mostly corruption_check_size = 64*1024; 19static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */ 20static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20 21
21static struct e820entry scan_areas[MAX_SCAN_AREAS]; 22static struct scan_area {
23 u64 addr;
24 u64 size;
25} scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas; 26static int num_scan_areas;
23 27
24
25static __init int set_corruption_check(char *arg) 28static __init int set_corruption_check(char *arg)
26{ 29{
27 char *end; 30 char *end;
@@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void)
81 84
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { 85 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size; 86 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 87 addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
85 88
86 if (!(addr + 1)) 89 if (addr == MEMBLOCK_ERROR)
87 break; 90 break;
88 91
89 if (addr >= corruption_check_size) 92 if (addr >= corruption_check_size)
@@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void)
92 if ((addr + size) > corruption_check_size) 95 if ((addr + size) > corruption_check_size)
93 size = corruption_check_size - addr; 96 size = corruption_check_size - addr;
94 97
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 98 memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
96 scan_areas[num_scan_areas].addr = addr; 99 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 100 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++; 101 num_scan_areas++;
@@ -105,7 +108,6 @@ void __init setup_bios_corruption_check(void)
105 108
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", 109 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
107 num_scan_areas); 110 num_scan_areas);
108 update_e820();
109} 111}
110 112
111 113
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a8b4d91b8394..9e093f8fe78c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid)
253#endif 253#endif
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for
257 * Assumption: Number of cores in each internal node is the same. 257 * (1) AMD multi-node processors
258 * Assumption: Number of cores in each internal node is the same.
259 * (2) AMD processors supporting compute units
258 */ 260 */
259#ifdef CONFIG_X86_HT 261#ifdef CONFIG_X86_HT
260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 262static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
261{ 263{
262 unsigned long long value; 264 u32 nodes;
263 u32 nodes, cores_per_node; 265 u8 node_id;
264 int cpu = smp_processor_id(); 266 int cpu = smp_processor_id();
265 267
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) 268 /* get information required for multi-node processors */
267 return; 269 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
270 u32 eax, ebx, ecx, edx;
268 271
269 /* fixup topology information only once for a core */ 272 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 273 nodes = ((ecx >> 8) & 7) + 1;
271 return; 274 node_id = ecx & 7;
272 275
273 rdmsrl(MSR_FAM10H_NODE_ID, value); 276 /* get compute unit information */
277 smp_num_siblings = ((ebx >> 8) & 3) + 1;
278 c->compute_unit_id = ebx & 0xff;
279 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
280 u64 value;
274 281
275 nodes = ((value >> 3) & 7) + 1; 282 rdmsrl(MSR_FAM10H_NODE_ID, value);
276 if (nodes == 1) 283 nodes = ((value >> 3) & 7) + 1;
284 node_id = value & 7;
285 } else
277 return; 286 return;
278 287
279 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 288 /* fixup multi-node processor information */
280 cores_per_node = c->x86_max_cores / nodes; 289 if (nodes > 1) {
290 u32 cores_per_node;
291
292 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
293 cores_per_node = c->x86_max_cores / nodes;
281 294
282 /* store NodeID, use llc_shared_map to store sibling info */ 295 /* store NodeID, use llc_shared_map to store sibling info */
283 per_cpu(cpu_llc_id, cpu) = value & 7; 296 per_cpu(cpu_llc_id, cpu) = node_id;
284 297
285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */ 298 /* core id to be in range from 0 to (cores_per_node - 1) */
286 c->cpu_core_id = c->cpu_core_id % cores_per_node; 299 c->cpu_core_id = c->cpu_core_id % cores_per_node;
300 }
287} 301}
288#endif 302#endif
289 303
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
304 c->phys_proc_id = c->initial_apicid >> bits; 318 c->phys_proc_id = c->initial_apicid >> bits;
305 /* use socket ID also for last level cache */ 319 /* use socket ID also for last level cache */
306 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 320 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
307 /* fixup topology information on multi-node processors */ 321 amd_get_topology(c);
308 if ((c->x86 == 0x10) && (c->x86_model == 9))
309 amd_fixup_dcm(c);
310#endif 322#endif
311} 323}
312 324
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
412 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 424 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
413 } 425 }
414#endif 426#endif
427
428 /* We need to do the following only once */
429 if (c != &boot_cpu_data)
430 return;
431
432 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
433
434 if (c->x86 > 0x10 ||
435 (c->x86 == 0x10 && c->x86_model >= 0x2)) {
436 u64 val;
437
438 rdmsrl(MSR_K7_HWCR, val);
439 if (!(val & BIT(24)))
440 printk(KERN_WARNING FW_BUG "TSC doesn't count "
441 "with P0 frequency!\n");
442 }
443 }
415} 444}
416 445
417static void __cpuinit init_amd(struct cpuinfo_x86 *c) 446static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
523#endif 552#endif
524 553
525 if (c->extended_cpuid_level >= 0x80000006) { 554 if (c->extended_cpuid_level >= 0x80000006) {
526 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) 555 if (cpuid_edx(0x80000006) & 0xf000)
527 num_cache_leaves = 4; 556 num_cache_leaves = 4;
528 else 557 else
529 num_cache_leaves = 3; 558 num_cache_leaves = 3;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 15c671385f59..4b68bda30938 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -704,16 +704,21 @@ void __init early_cpu_init(void)
704} 704}
705 705
706/* 706/*
707 * The NOPL instruction is supposed to exist on all CPUs with 707 * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
708 * family >= 6; unfortunately, that's not true in practice because 708 * unfortunately, that's not true in practice because of early VIA
709 * of early VIA chips and (more importantly) broken virtualizers that 709 * chips and (more importantly) broken virtualizers that are not easy
710 * are not easy to detect. In the latter case it doesn't even *fail* 710 * to detect. In the latter case it doesn't even *fail* reliably, so
711 * reliably, so probing for it doesn't even work. Disable it completely 711 * probing for it doesn't even work. Disable it completely on 32-bit
712 * unless we can find a reliable way to detect all the broken cases. 712 * unless we can find a reliable way to detect all the broken cases.
713 * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
713 */ 714 */
714static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) 715static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
715{ 716{
717#ifdef CONFIG_X86_32
716 clear_cpu_cap(c, X86_FEATURE_NOPL); 718 clear_cpu_cap(c, X86_FEATURE_NOPL);
719#else
720 set_cpu_cap(c, X86_FEATURE_NOPL);
721#endif
717} 722}
718 723
719static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 724static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void)
1264 clear_all_debug_regs(); 1269 clear_all_debug_regs();
1265 dbg_restore_debug_regs(); 1270 dbg_restore_debug_regs();
1266 1271
1267 /*
1268 * Force FPU initialization:
1269 */
1270 current_thread_info()->status = 0;
1271 clear_used_math();
1272 mxcsr_feature_mask_init();
1273
1274 fpu_init(); 1272 fpu_init();
1275 xsave_init(); 1273 xsave_init();
1276} 1274}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d43..e765633f210e 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void get_cpu_cap(struct cpuinfo_x86 *c);
35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 37extern void get_cpu_cap(struct cpuinfo_x86 *c);
37 38
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index cd8da247dda1..a2baafb2fe6d 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -701,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
701 per_cpu(acfreq_data, policy->cpu) = NULL; 701 per_cpu(acfreq_data, policy->cpu) = NULL;
702 acpi_processor_unregister_performance(data->acpi_data, 702 acpi_processor_unregister_performance(data->acpi_data,
703 policy->cpu); 703 policy->cpu);
704 kfree(data->freq_table);
704 kfree(data); 705 kfree(data);
705 } 706 }
706 707
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index 733093d60436..141abebc4516 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = {
393 * Detects nForce2 A2 and C1 stepping 393 * Detects nForce2 A2 and C1 stepping
394 * 394 *
395 */ 395 */
396static unsigned int nforce2_detect_chipset(void) 396static int nforce2_detect_chipset(void)
397{ 397{
398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
399 PCI_DEVICE_ID_NVIDIA_NFORCE2, 399 PCI_DEVICE_ID_NVIDIA_NFORCE2,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index fc09f142d94d..d9f51367666b 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -35,7 +35,7 @@ static unsigned int longrun_low_freq, longrun_high_freq;
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS 35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL 36 * and MSR_TMTA_LONGRUN_CTRL
37 */ 37 */
38static void __init longrun_get_policy(struct cpufreq_policy *policy) 38static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
39{ 39{
40 u32 msr_lo, msr_hi; 40 u32 msr_lo, msr_hi;
41 41
@@ -165,7 +165,7 @@ static unsigned int longrun_get(unsigned int cpu)
165 * TMTA rules: 165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) 166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */ 167 */
168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, 168static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq) 169 unsigned int *high_freq)
170{ 170{
171 u32 msr_lo, msr_hi; 171 u32 msr_lo, msr_hi;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 695f17731e23..d16c2c53d6bf 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -284,9 +284,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
284 /* Don't do the funky fallback heuristics the AMD version employs 284 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 285 for now. */
286 node = apicid_to_node[apicid]; 286 node = apicid_to_node[apicid];
287 if (node == NUMA_NO_NODE) 287 if (node == NUMA_NO_NODE || !node_online(node)) {
288 node = first_node(node_online_map);
289 else if (!node_online(node)) {
290 /* reuse the value from init_cpu_to_node() */ 288 /* reuse the value from init_cpu_to_node() */
291 node = cpu_to_node(cpu); 289 node = cpu_to_node(cpu);
292 } 290 }
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 898c2f4eab88..17ad03366211 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/amd_nb.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22 22
23#define LVL_1_INST 1 23#define LVL_1_INST 1
@@ -306,7 +306,7 @@ struct _cache_attr {
306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
307}; 307};
308 308
309#ifdef CONFIG_CPU_SUP_AMD 309#ifdef CONFIG_AMD_NB
310 310
311/* 311/*
312 * L3 cache descriptors 312 * L3 cache descriptors
@@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); 327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
328 328
329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
330} 331}
331 332
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) 333static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
@@ -369,7 +370,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
369 return; 370 return;
370 371
371 /* not in virtualized environments */ 372 /* not in virtualized environments */
372 if (num_k8_northbridges == 0) 373 if (k8_northbridges.num == 0)
373 return; 374 return;
374 375
375 /* 376 /*
@@ -377,7 +378,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
377 * never freed but this is done only on shutdown so it doesn't matter. 378 * never freed but this is done only on shutdown so it doesn't matter.
378 */ 379 */
379 if (!l3_caches) { 380 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); 381 int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
381 382
382 l3_caches = kzalloc(size, GFP_ATOMIC); 383 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches) 384 if (!l3_caches)
@@ -556,12 +557,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
556static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 557static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
557 show_cache_disable_1, store_cache_disable_1); 558 show_cache_disable_1, store_cache_disable_1);
558 559
559#else /* CONFIG_CPU_SUP_AMD */ 560#else /* CONFIG_AMD_NB */
560static void __cpuinit 561static void __cpuinit
561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) 562amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
562{ 563{
563}; 564};
564#endif /* CONFIG_CPU_SUP_AMD */ 565#endif /* CONFIG_AMD_NB */
565 566
566static int 567static int
567__cpuinit cpuid4_cache_lookup_regs(int index, 568__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -1000,7 +1001,7 @@ static struct attribute *default_attrs[] = {
1000 1001
1001static struct attribute *default_l3_attrs[] = { 1002static struct attribute *default_l3_attrs[] = {
1002 DEFAULT_SYSFS_CACHE_ATTRS, 1003 DEFAULT_SYSFS_CACHE_ATTRS,
1003#ifdef CONFIG_CPU_SUP_AMD 1004#ifdef CONFIG_AMD_NB
1004 &cache_disable_0.attr, 1005 &cache_disable_0.attr,
1005 &cache_disable_1.attr, 1006 &cache_disable_1.attr,
1006#endif 1007#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8a85dd1b1aa1..1e8d66c1336a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = {
192 .release = seq_release, 192 .release = seq_release,
193 .read = seq_read, 193 .read = seq_read,
194 .write = severities_coverage_write, 194 .write = severities_coverage_write,
195 .llseek = seq_lseek,
195}; 196};
196 197
197static int __init severities_debugfs_init(void) 198static int __init severities_debugfs_init(void)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ed41562909fe..7a35b72d7c03 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1665,6 +1665,7 @@ struct file_operations mce_chrdev_ops = {
1665 .read = mce_read, 1665 .read = mce_read,
1666 .poll = mce_poll, 1666 .poll = mce_poll,
1667 .unlocked_ioctl = mce_ioctl, 1667 .unlocked_ioctl = mce_ioctl,
1668 .llseek = no_llseek,
1668}; 1669};
1669EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1670EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1670 1671
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39aaee5c1ab2..80c482382d5c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
131 u32 low = 0, high = 0, address = 0; 131 u32 low = 0, high = 0, address = 0;
132 unsigned int bank, block; 132 unsigned int bank, block;
133 struct thresh_restart tr; 133 struct thresh_restart tr;
134 u8 lvt_off; 134 int lvt_off = -1;
135 u8 offset;
135 136
136 for (bank = 0; bank < NR_BANKS; ++bank) { 137 for (bank = 0; bank < NR_BANKS; ++bank) {
137 for (block = 0; block < NR_BLOCKS; ++block) { 138 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
162 if (shared_bank[bank] && c->cpu_core_id) 163 if (shared_bank[bank] && c->cpu_core_id)
163 break; 164 break;
164#endif 165#endif
165 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, 166 offset = (high & MASK_LVTOFF_HI) >> 20;
166 APIC_EILVT_MSG_FIX, 0); 167 if (lvt_off < 0) {
168 if (setup_APIC_eilvt(offset,
169 THRESHOLD_APIC_VECTOR,
170 APIC_EILVT_MSG_FIX, 0)) {
171 pr_err(FW_BUG "cpu %d, failed to "
172 "setup threshold interrupt "
173 "for bank %d, block %d "
174 "(MSR%08X=0x%x%08x)",
175 smp_processor_id(), bank, block,
176 address, high, low);
177 continue;
178 }
179 lvt_off = offset;
180 } else if (lvt_off != offset) {
181 pr_err(FW_BUG "cpu %d, invalid threshold "
182 "interrupt offset %d for bank %d,"
183 "block %d (MSR%08X=0x%x%08x)",
184 smp_processor_id(), lvt_off, bank,
185 block, address, high, low);
186 continue;
187 }
167 188
168 high &= ~MASK_LVTOFF_HI; 189 high &= ~MASK_LVTOFF_HI;
169 high |= lvt_off << 20; 190 high |= lvt_off << 20;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f8..4b683267eca5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -350,7 +350,7 @@ static void intel_thermal_interrupt(void)
350 350
351static void unexpected_thermal_interrupt(void) 351static void unexpected_thermal_interrupt(void)
352{ 352{
353 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 353 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
354 smp_processor_id()); 354 smp_processor_id());
355 add_taint(TAINT_MACHINE_CHECK); 355 add_taint(TAINT_MACHINE_CHECK);
356} 356}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d071425..ac140c7be396 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
827 827
828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
829 return 0; 829 return 0;
830 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 830 if (boot_cpu_data.x86 < 0xf)
831 return 0; 831 return 0;
832 /* In case some hypervisor doesn't pass SYSCFG through: */ 832 /* In case some hypervisor doesn't pass SYSCFG through: */
833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) 833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d03885..9f27228ceffd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
64 } 64 }
65} 65}
66 66
67/* Get the size of contiguous MTRR range */
68static u64 get_mtrr_size(u64 mask)
69{
70 u64 size;
71
72 mask >>= PAGE_SHIFT;
73 mask |= size_or_mask;
74 size = -mask;
75 size <<= PAGE_SHIFT;
76 return size;
77}
78
67/* 79/*
68 * Returns the effective MTRR type for the region 80 * Check and return the effective type for MTRR-MTRR type overlap.
69 * Error returns: 81 * Returns 1 if the effective type is UNCACHEABLE, else returns 0
70 * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
71 * - 0xFF - when MTRR is not enabled
72 */ 82 */
73u8 mtrr_type_lookup(u64 start, u64 end) 83static int check_type_overlap(u8 *prev, u8 *curr)
84{
85 if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
86 *prev = MTRR_TYPE_UNCACHABLE;
87 *curr = MTRR_TYPE_UNCACHABLE;
88 return 1;
89 }
90
91 if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
92 (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
93 *prev = MTRR_TYPE_WRTHROUGH;
94 *curr = MTRR_TYPE_WRTHROUGH;
95 }
96
97 if (*prev != *curr) {
98 *prev = MTRR_TYPE_UNCACHABLE;
99 *curr = MTRR_TYPE_UNCACHABLE;
100 return 1;
101 }
102
103 return 0;
104}
105
106/*
107 * Error/Semi-error returns:
108 * 0xFF - when MTRR is not enabled
109 * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
110 * corresponds only to [start:*partial_end].
111 * Caller has to lookup again for [*partial_end:end].
112 */
113static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
74{ 114{
75 int i; 115 int i;
76 u64 base, mask; 116 u64 base, mask;
77 u8 prev_match, curr_match; 117 u8 prev_match, curr_match;
78 118
119 *repeat = 0;
79 if (!mtrr_state_set) 120 if (!mtrr_state_set)
80 return 0xFF; 121 return 0xFF;
81 122
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
126 167
127 start_state = ((start & mask) == (base & mask)); 168 start_state = ((start & mask) == (base & mask));
128 end_state = ((end & mask) == (base & mask)); 169 end_state = ((end & mask) == (base & mask));
129 if (start_state != end_state) 170
130 return 0xFE; 171 if (start_state != end_state) {
172 /*
173 * We have start:end spanning across an MTRR.
174 * We split the region into
175 * either
176 * (start:mtrr_end) (mtrr_end:end)
177 * or
178 * (start:mtrr_start) (mtrr_start:end)
179 * depending on kind of overlap.
180 * Return the type for first region and a pointer to
181 * the start of second region so that caller will
182 * lookup again on the second region.
183 * Note: This way we handle multiple overlaps as well.
184 */
185 if (start_state)
186 *partial_end = base + get_mtrr_size(mask);
187 else
188 *partial_end = base;
189
190 if (unlikely(*partial_end <= start)) {
191 WARN_ON(1);
192 *partial_end = start + PAGE_SIZE;
193 }
194
195 end = *partial_end - 1; /* end is inclusive */
196 *repeat = 1;
197 }
131 198
132 if ((start & mask) != (base & mask)) 199 if ((start & mask) != (base & mask))
133 continue; 200 continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
138 continue; 205 continue;
139 } 206 }
140 207
141 if (prev_match == MTRR_TYPE_UNCACHABLE || 208 if (check_type_overlap(&prev_match, &curr_match))
142 curr_match == MTRR_TYPE_UNCACHABLE) { 209 return curr_match;
143 return MTRR_TYPE_UNCACHABLE;
144 }
145
146 if ((prev_match == MTRR_TYPE_WRBACK &&
147 curr_match == MTRR_TYPE_WRTHROUGH) ||
148 (prev_match == MTRR_TYPE_WRTHROUGH &&
149 curr_match == MTRR_TYPE_WRBACK)) {
150 prev_match = MTRR_TYPE_WRTHROUGH;
151 curr_match = MTRR_TYPE_WRTHROUGH;
152 }
153
154 if (prev_match != curr_match)
155 return MTRR_TYPE_UNCACHABLE;
156 } 210 }
157 211
158 if (mtrr_tom2) { 212 if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
166 return mtrr_state.def_type; 220 return mtrr_state.def_type;
167} 221}
168 222
223/*
224 * Returns the effective MTRR type for the region
225 * Error return:
226 * 0xFF - when MTRR is not enabled
227 */
228u8 mtrr_type_lookup(u64 start, u64 end)
229{
230 u8 type, prev_type;
231 int repeat;
232 u64 partial_end;
233
234 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
235
236 /*
237 * Common path is with repeat = 0.
238 * However, we can have cases where [start:end] spans across some
239 * MTRR range. Do repeated lookups for that case here.
240 */
241 while (repeat) {
242 prev_type = type;
243 start = partial_end;
244 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
245
246 if (check_type_overlap(&prev_type, &type))
247 return type;
248 }
249
250 return type;
251}
252
169/* Get the MSR pair relating to a var range */ 253/* Get the MSR pair relating to a var range */
170static void 254static void
171get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 255get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 03a5b0385ad6..ed6310183efb 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -49,7 +49,6 @@ static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{ 50{
51 unsigned long offset, addr = (unsigned long)from; 51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0; 52 unsigned long size, len = 0;
54 struct page *page; 53 struct page *page;
55 void *map; 54 void *map;
@@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
63 offset = addr & (PAGE_SIZE - 1); 62 offset = addr & (PAGE_SIZE - 1);
64 size = min(PAGE_SIZE - offset, n - len); 63 size = min(PAGE_SIZE - offset, n - len);
65 64
66 map = kmap_atomic(page, type); 65 map = kmap_atomic(page);
67 memcpy(to, map+offset, size); 66 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type); 67 kunmap_atomic(map);
69 put_page(page); 68 put_page(page);
70 69
71 len += size; 70 len += size;
@@ -238,6 +237,7 @@ struct x86_pmu {
238 * Intel DebugStore bits 237 * Intel DebugStore bits
239 */ 238 */
240 int bts, pebs; 239 int bts, pebs;
240 int bts_active, pebs_active;
241 int pebs_record_size; 241 int pebs_record_size;
242 void (*drain_pebs)(struct pt_regs *regs); 242 void (*drain_pebs)(struct pt_regs *regs);
243 struct event_constraint *pebs_constraints; 243 struct event_constraint *pebs_constraints;
@@ -381,7 +381,7 @@ static void release_pmc_hardware(void) {}
381 381
382#endif 382#endif
383 383
384static int reserve_ds_buffers(void); 384static void reserve_ds_buffers(void);
385static void release_ds_buffers(void); 385static void release_ds_buffers(void);
386 386
387static void hw_perf_event_destroy(struct perf_event *event) 387static void hw_perf_event_destroy(struct perf_event *event)
@@ -478,7 +478,7 @@ static int x86_setup_perfctr(struct perf_event *event)
478 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 478 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
479 (hwc->sample_period == 1)) { 479 (hwc->sample_period == 1)) {
480 /* BTS is not supported by this architecture. */ 480 /* BTS is not supported by this architecture. */
481 if (!x86_pmu.bts) 481 if (!x86_pmu.bts_active)
482 return -EOPNOTSUPP; 482 return -EOPNOTSUPP;
483 483
484 /* BTS is currently only allowed for user-mode. */ 484 /* BTS is currently only allowed for user-mode. */
@@ -497,12 +497,13 @@ static int x86_pmu_hw_config(struct perf_event *event)
497 int precise = 0; 497 int precise = 0;
498 498
499 /* Support for constant skid */ 499 /* Support for constant skid */
500 if (x86_pmu.pebs) 500 if (x86_pmu.pebs_active) {
501 precise++; 501 precise++;
502 502
503 /* Support for IP fixup */ 503 /* Support for IP fixup */
504 if (x86_pmu.lbr_nr) 504 if (x86_pmu.lbr_nr)
505 precise++; 505 precise++;
506 }
506 507
507 if (event->attr.precise_ip > precise) 508 if (event->attr.precise_ip > precise)
508 return -EOPNOTSUPP; 509 return -EOPNOTSUPP;
@@ -531,7 +532,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
531/* 532/*
532 * Setup the hardware configuration for a given attr_type 533 * Setup the hardware configuration for a given attr_type
533 */ 534 */
534static int __hw_perf_event_init(struct perf_event *event) 535static int __x86_pmu_event_init(struct perf_event *event)
535{ 536{
536 int err; 537 int err;
537 538
@@ -544,11 +545,8 @@ static int __hw_perf_event_init(struct perf_event *event)
544 if (atomic_read(&active_events) == 0) { 545 if (atomic_read(&active_events) == 0) {
545 if (!reserve_pmc_hardware()) 546 if (!reserve_pmc_hardware())
546 err = -EBUSY; 547 err = -EBUSY;
547 else { 548 else
548 err = reserve_ds_buffers(); 549 reserve_ds_buffers();
549 if (err)
550 release_pmc_hardware();
551 }
552 } 550 }
553 if (!err) 551 if (!err)
554 atomic_inc(&active_events); 552 atomic_inc(&active_events);
@@ -584,7 +582,7 @@ static void x86_pmu_disable_all(void)
584 } 582 }
585} 583}
586 584
587void hw_perf_disable(void) 585static void x86_pmu_disable(struct pmu *pmu)
588{ 586{
589 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 587 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
590 588
@@ -619,7 +617,7 @@ static void x86_pmu_enable_all(int added)
619 } 617 }
620} 618}
621 619
622static const struct pmu pmu; 620static struct pmu pmu;
623 621
624static inline int is_x86_event(struct perf_event *event) 622static inline int is_x86_event(struct perf_event *event)
625{ 623{
@@ -801,10 +799,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
801 hwc->last_tag == cpuc->tags[i]; 799 hwc->last_tag == cpuc->tags[i];
802} 800}
803 801
804static int x86_pmu_start(struct perf_event *event); 802static void x86_pmu_start(struct perf_event *event, int flags);
805static void x86_pmu_stop(struct perf_event *event); 803static void x86_pmu_stop(struct perf_event *event, int flags);
806 804
807void hw_perf_enable(void) 805static void x86_pmu_enable(struct pmu *pmu)
808{ 806{
809 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 807 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
810 struct perf_event *event; 808 struct perf_event *event;
@@ -840,7 +838,14 @@ void hw_perf_enable(void)
840 match_prev_assignment(hwc, cpuc, i)) 838 match_prev_assignment(hwc, cpuc, i))
841 continue; 839 continue;
842 840
843 x86_pmu_stop(event); 841 /*
842 * Ensure we don't accidentally enable a stopped
843 * counter simply because we rescheduled.
844 */
845 if (hwc->state & PERF_HES_STOPPED)
846 hwc->state |= PERF_HES_ARCH;
847
848 x86_pmu_stop(event, PERF_EF_UPDATE);
844 } 849 }
845 850
846 for (i = 0; i < cpuc->n_events; i++) { 851 for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +857,10 @@ void hw_perf_enable(void)
852 else if (i < n_running) 857 else if (i < n_running)
853 continue; 858 continue;
854 859
855 x86_pmu_start(event); 860 if (hwc->state & PERF_HES_ARCH)
861 continue;
862
863 x86_pmu_start(event, PERF_EF_RELOAD);
856 } 864 }
857 cpuc->n_added = 0; 865 cpuc->n_added = 0;
858 perf_events_lapic_init(); 866 perf_events_lapic_init();
@@ -953,15 +961,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
953} 961}
954 962
955/* 963/*
956 * activate a single event 964 * Add a single event to the PMU.
957 * 965 *
958 * The event is added to the group of enabled events 966 * The event is added to the group of enabled events
959 * but only if it can be scehduled with existing events. 967 * but only if it can be scehduled with existing events.
960 *
961 * Called with PMU disabled. If successful and return value 1,
962 * then guaranteed to call perf_enable() and hw_perf_enable()
963 */ 968 */
964static int x86_pmu_enable(struct perf_event *event) 969static int x86_pmu_add(struct perf_event *event, int flags)
965{ 970{
966 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 971 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
967 struct hw_perf_event *hwc; 972 struct hw_perf_event *hwc;
@@ -970,58 +975,67 @@ static int x86_pmu_enable(struct perf_event *event)
970 975
971 hwc = &event->hw; 976 hwc = &event->hw;
972 977
978 perf_pmu_disable(event->pmu);
973 n0 = cpuc->n_events; 979 n0 = cpuc->n_events;
974 n = collect_events(cpuc, event, false); 980 ret = n = collect_events(cpuc, event, false);
975 if (n < 0) 981 if (ret < 0)
976 return n; 982 goto out;
983
984 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
985 if (!(flags & PERF_EF_START))
986 hwc->state |= PERF_HES_ARCH;
977 987
978 /* 988 /*
979 * If group events scheduling transaction was started, 989 * If group events scheduling transaction was started,
980 * skip the schedulability test here, it will be peformed 990 * skip the schedulability test here, it will be peformed
981 * at commit time(->commit_txn) as a whole 991 * at commit time (->commit_txn) as a whole
982 */ 992 */
983 if (cpuc->group_flag & PERF_EVENT_TXN) 993 if (cpuc->group_flag & PERF_EVENT_TXN)
984 goto out; 994 goto done_collect;
985 995
986 ret = x86_pmu.schedule_events(cpuc, n, assign); 996 ret = x86_pmu.schedule_events(cpuc, n, assign);
987 if (ret) 997 if (ret)
988 return ret; 998 goto out;
989 /* 999 /*
990 * copy new assignment, now we know it is possible 1000 * copy new assignment, now we know it is possible
991 * will be used by hw_perf_enable() 1001 * will be used by hw_perf_enable()
992 */ 1002 */
993 memcpy(cpuc->assign, assign, n*sizeof(int)); 1003 memcpy(cpuc->assign, assign, n*sizeof(int));
994 1004
995out: 1005done_collect:
996 cpuc->n_events = n; 1006 cpuc->n_events = n;
997 cpuc->n_added += n - n0; 1007 cpuc->n_added += n - n0;
998 cpuc->n_txn += n - n0; 1008 cpuc->n_txn += n - n0;
999 1009
1000 return 0; 1010 ret = 0;
1011out:
1012 perf_pmu_enable(event->pmu);
1013 return ret;
1001} 1014}
1002 1015
1003static int x86_pmu_start(struct perf_event *event) 1016static void x86_pmu_start(struct perf_event *event, int flags)
1004{ 1017{
1005 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1018 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1006 int idx = event->hw.idx; 1019 int idx = event->hw.idx;
1007 1020
1008 if (idx == -1) 1021 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1009 return -EAGAIN; 1022 return;
1023
1024 if (WARN_ON_ONCE(idx == -1))
1025 return;
1026
1027 if (flags & PERF_EF_RELOAD) {
1028 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1029 x86_perf_event_set_period(event);
1030 }
1031
1032 event->hw.state = 0;
1010 1033
1011 x86_perf_event_set_period(event);
1012 cpuc->events[idx] = event; 1034 cpuc->events[idx] = event;
1013 __set_bit(idx, cpuc->active_mask); 1035 __set_bit(idx, cpuc->active_mask);
1014 __set_bit(idx, cpuc->running); 1036 __set_bit(idx, cpuc->running);
1015 x86_pmu.enable(event); 1037 x86_pmu.enable(event);
1016 perf_event_update_userpage(event); 1038 perf_event_update_userpage(event);
1017
1018 return 0;
1019}
1020
1021static void x86_pmu_unthrottle(struct perf_event *event)
1022{
1023 int ret = x86_pmu_start(event);
1024 WARN_ON_ONCE(ret);
1025} 1039}
1026 1040
1027void perf_event_print_debug(void) 1041void perf_event_print_debug(void)
@@ -1078,27 +1092,29 @@ void perf_event_print_debug(void)
1078 local_irq_restore(flags); 1092 local_irq_restore(flags);
1079} 1093}
1080 1094
1081static void x86_pmu_stop(struct perf_event *event) 1095static void x86_pmu_stop(struct perf_event *event, int flags)
1082{ 1096{
1083 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1097 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1084 struct hw_perf_event *hwc = &event->hw; 1098 struct hw_perf_event *hwc = &event->hw;
1085 int idx = hwc->idx;
1086
1087 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1088 return;
1089
1090 x86_pmu.disable(event);
1091 1099
1092 /* 1100 if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1093 * Drain the remaining delta count out of a event 1101 x86_pmu.disable(event);
1094 * that we are disabling: 1102 cpuc->events[hwc->idx] = NULL;
1095 */ 1103 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1096 x86_perf_event_update(event); 1104 hwc->state |= PERF_HES_STOPPED;
1105 }
1097 1106
1098 cpuc->events[idx] = NULL; 1107 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1108 /*
1109 * Drain the remaining delta count out of a event
1110 * that we are disabling:
1111 */
1112 x86_perf_event_update(event);
1113 hwc->state |= PERF_HES_UPTODATE;
1114 }
1099} 1115}
1100 1116
1101static void x86_pmu_disable(struct perf_event *event) 1117static void x86_pmu_del(struct perf_event *event, int flags)
1102{ 1118{
1103 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1119 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1104 int i; 1120 int i;
@@ -1111,7 +1127,7 @@ static void x86_pmu_disable(struct perf_event *event)
1111 if (cpuc->group_flag & PERF_EVENT_TXN) 1127 if (cpuc->group_flag & PERF_EVENT_TXN)
1112 return; 1128 return;
1113 1129
1114 x86_pmu_stop(event); 1130 x86_pmu_stop(event, PERF_EF_UPDATE);
1115 1131
1116 for (i = 0; i < cpuc->n_events; i++) { 1132 for (i = 0; i < cpuc->n_events; i++) {
1117 if (event == cpuc->event_list[i]) { 1133 if (event == cpuc->event_list[i]) {
@@ -1134,7 +1150,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1134 struct perf_sample_data data; 1150 struct perf_sample_data data;
1135 struct cpu_hw_events *cpuc; 1151 struct cpu_hw_events *cpuc;
1136 struct perf_event *event; 1152 struct perf_event *event;
1137 struct hw_perf_event *hwc;
1138 int idx, handled = 0; 1153 int idx, handled = 0;
1139 u64 val; 1154 u64 val;
1140 1155
@@ -1155,7 +1170,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1155 } 1170 }
1156 1171
1157 event = cpuc->events[idx]; 1172 event = cpuc->events[idx];
1158 hwc = &event->hw;
1159 1173
1160 val = x86_perf_event_update(event); 1174 val = x86_perf_event_update(event);
1161 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 1175 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1185,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1171 continue; 1185 continue;
1172 1186
1173 if (perf_event_overflow(event, 1, &data, regs)) 1187 if (perf_event_overflow(event, 1, &data, regs))
1174 x86_pmu_stop(event); 1188 x86_pmu_stop(event, 0);
1175 } 1189 }
1176 1190
1177 if (handled) 1191 if (handled)
@@ -1180,25 +1194,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1180 return handled; 1194 return handled;
1181} 1195}
1182 1196
1183void smp_perf_pending_interrupt(struct pt_regs *regs)
1184{
1185 irq_enter();
1186 ack_APIC_irq();
1187 inc_irq_stat(apic_pending_irqs);
1188 perf_event_do_pending();
1189 irq_exit();
1190}
1191
1192void set_perf_event_pending(void)
1193{
1194#ifdef CONFIG_X86_LOCAL_APIC
1195 if (!x86_pmu.apic || !x86_pmu_initialized())
1196 return;
1197
1198 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1199#endif
1200}
1201
1202void perf_events_lapic_init(void) 1197void perf_events_lapic_init(void)
1203{ 1198{
1204 if (!x86_pmu.apic || !x86_pmu_initialized()) 1199 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1388,7 +1383,6 @@ void __init init_hw_perf_events(void)
1388 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 1383 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1389 } 1384 }
1390 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 1385 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1391 perf_max_events = x86_pmu.num_counters;
1392 1386
1393 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 1387 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1394 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1388 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,6 +1418,7 @@ void __init init_hw_perf_events(void)
1424 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1418 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1425 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 1419 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1426 1420
1421 perf_pmu_register(&pmu);
1427 perf_cpu_notifier(x86_pmu_notifier); 1422 perf_cpu_notifier(x86_pmu_notifier);
1428} 1423}
1429 1424
@@ -1437,10 +1432,11 @@ static inline void x86_pmu_read(struct perf_event *event)
1437 * Set the flag to make pmu::enable() not perform the 1432 * Set the flag to make pmu::enable() not perform the
1438 * schedulability test, it will be performed at commit time 1433 * schedulability test, it will be performed at commit time
1439 */ 1434 */
1440static void x86_pmu_start_txn(const struct pmu *pmu) 1435static void x86_pmu_start_txn(struct pmu *pmu)
1441{ 1436{
1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1437 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1443 1438
1439 perf_pmu_disable(pmu);
1444 cpuc->group_flag |= PERF_EVENT_TXN; 1440 cpuc->group_flag |= PERF_EVENT_TXN;
1445 cpuc->n_txn = 0; 1441 cpuc->n_txn = 0;
1446} 1442}
@@ -1450,7 +1446,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1450 * Clear the flag and pmu::enable() will perform the 1446 * Clear the flag and pmu::enable() will perform the
1451 * schedulability test. 1447 * schedulability test.
1452 */ 1448 */
1453static void x86_pmu_cancel_txn(const struct pmu *pmu) 1449static void x86_pmu_cancel_txn(struct pmu *pmu)
1454{ 1450{
1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1451 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1456 1452
@@ -1460,6 +1456,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1460 */ 1456 */
1461 cpuc->n_added -= cpuc->n_txn; 1457 cpuc->n_added -= cpuc->n_txn;
1462 cpuc->n_events -= cpuc->n_txn; 1458 cpuc->n_events -= cpuc->n_txn;
1459 perf_pmu_enable(pmu);
1463} 1460}
1464 1461
1465/* 1462/*
@@ -1467,7 +1464,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1467 * Perform the group schedulability test as a whole 1464 * Perform the group schedulability test as a whole
1468 * Return 0 if success 1465 * Return 0 if success
1469 */ 1466 */
1470static int x86_pmu_commit_txn(const struct pmu *pmu) 1467static int x86_pmu_commit_txn(struct pmu *pmu)
1471{ 1468{
1472 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1469 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1473 int assign[X86_PMC_IDX_MAX]; 1470 int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1486,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1489 memcpy(cpuc->assign, assign, n*sizeof(int)); 1486 memcpy(cpuc->assign, assign, n*sizeof(int));
1490 1487
1491 cpuc->group_flag &= ~PERF_EVENT_TXN; 1488 cpuc->group_flag &= ~PERF_EVENT_TXN;
1492 1489 perf_pmu_enable(pmu);
1493 return 0; 1490 return 0;
1494} 1491}
1495 1492
1496static const struct pmu pmu = {
1497 .enable = x86_pmu_enable,
1498 .disable = x86_pmu_disable,
1499 .start = x86_pmu_start,
1500 .stop = x86_pmu_stop,
1501 .read = x86_pmu_read,
1502 .unthrottle = x86_pmu_unthrottle,
1503 .start_txn = x86_pmu_start_txn,
1504 .cancel_txn = x86_pmu_cancel_txn,
1505 .commit_txn = x86_pmu_commit_txn,
1506};
1507
1508/* 1493/*
1509 * validate that we can schedule this event 1494 * validate that we can schedule this event
1510 */ 1495 */
@@ -1579,12 +1564,22 @@ out:
1579 return ret; 1564 return ret;
1580} 1565}
1581 1566
1582const struct pmu *hw_perf_event_init(struct perf_event *event) 1567int x86_pmu_event_init(struct perf_event *event)
1583{ 1568{
1584 const struct pmu *tmp; 1569 struct pmu *tmp;
1585 int err; 1570 int err;
1586 1571
1587 err = __hw_perf_event_init(event); 1572 switch (event->attr.type) {
1573 case PERF_TYPE_RAW:
1574 case PERF_TYPE_HARDWARE:
1575 case PERF_TYPE_HW_CACHE:
1576 break;
1577
1578 default:
1579 return -ENOENT;
1580 }
1581
1582 err = __x86_pmu_event_init(event);
1588 if (!err) { 1583 if (!err) {
1589 /* 1584 /*
1590 * we temporarily connect event to its pmu 1585 * we temporarily connect event to its pmu
@@ -1604,26 +1599,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1604 if (err) { 1599 if (err) {
1605 if (event->destroy) 1600 if (event->destroy)
1606 event->destroy(event); 1601 event->destroy(event);
1607 return ERR_PTR(err);
1608 } 1602 }
1609 1603
1610 return &pmu; 1604 return err;
1611} 1605}
1612 1606
1613/* 1607static struct pmu pmu = {
1614 * callchain support 1608 .pmu_enable = x86_pmu_enable,
1615 */ 1609 .pmu_disable = x86_pmu_disable,
1616 1610
1617static inline 1611 .event_init = x86_pmu_event_init,
1618void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1619{
1620 if (entry->nr < PERF_MAX_STACK_DEPTH)
1621 entry->ip[entry->nr++] = ip;
1622}
1623 1612
1624static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1613 .add = x86_pmu_add,
1625static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1614 .del = x86_pmu_del,
1615 .start = x86_pmu_start,
1616 .stop = x86_pmu_stop,
1617 .read = x86_pmu_read,
1626 1618
1619 .start_txn = x86_pmu_start_txn,
1620 .cancel_txn = x86_pmu_cancel_txn,
1621 .commit_txn = x86_pmu_commit_txn,
1622};
1623
1624/*
1625 * callchain support
1626 */
1627 1627
1628static void 1628static void
1629backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) 1629backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
@@ -1645,7 +1645,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1645{ 1645{
1646 struct perf_callchain_entry *entry = data; 1646 struct perf_callchain_entry *entry = data;
1647 1647
1648 callchain_store(entry, addr); 1648 perf_callchain_store(entry, addr);
1649} 1649}
1650 1650
1651static const struct stacktrace_ops backtrace_ops = { 1651static const struct stacktrace_ops backtrace_ops = {
@@ -1656,11 +1656,15 @@ static const struct stacktrace_ops backtrace_ops = {
1656 .walk_stack = print_context_stack_bp, 1656 .walk_stack = print_context_stack_bp,
1657}; 1657};
1658 1658
1659static void 1659void
1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1660perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1661{ 1661{
1662 callchain_store(entry, PERF_CONTEXT_KERNEL); 1662 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1663 callchain_store(entry, regs->ip); 1663 /* TODO: We don't support guest os callchain now */
1664 return;
1665 }
1666
1667 perf_callchain_store(entry, regs->ip);
1664 1668
1665 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1669 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1666} 1670}
@@ -1689,7 +1693,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1689 if (fp < compat_ptr(regs->sp)) 1693 if (fp < compat_ptr(regs->sp))
1690 break; 1694 break;
1691 1695
1692 callchain_store(entry, frame.return_address); 1696 perf_callchain_store(entry, frame.return_address);
1693 fp = compat_ptr(frame.next_frame); 1697 fp = compat_ptr(frame.next_frame);
1694 } 1698 }
1695 return 1; 1699 return 1;
@@ -1702,19 +1706,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1702} 1706}
1703#endif 1707#endif
1704 1708
1705static void 1709void
1706perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1710perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1707{ 1711{
1708 struct stack_frame frame; 1712 struct stack_frame frame;
1709 const void __user *fp; 1713 const void __user *fp;
1710 1714
1711 if (!user_mode(regs)) 1715 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1712 regs = task_pt_regs(current); 1716 /* TODO: We don't support guest os callchain now */
1717 return;
1718 }
1713 1719
1714 fp = (void __user *)regs->bp; 1720 fp = (void __user *)regs->bp;
1715 1721
1716 callchain_store(entry, PERF_CONTEXT_USER); 1722 perf_callchain_store(entry, regs->ip);
1717 callchain_store(entry, regs->ip);
1718 1723
1719 if (perf_callchain_user32(regs, entry)) 1724 if (perf_callchain_user32(regs, entry))
1720 return; 1725 return;
@@ -1731,52 +1736,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1731 if ((unsigned long)fp < regs->sp) 1736 if ((unsigned long)fp < regs->sp)
1732 break; 1737 break;
1733 1738
1734 callchain_store(entry, frame.return_address); 1739 perf_callchain_store(entry, frame.return_address);
1735 fp = frame.next_frame; 1740 fp = frame.next_frame;
1736 } 1741 }
1737} 1742}
1738 1743
1739static void
1740perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1741{
1742 int is_user;
1743
1744 if (!regs)
1745 return;
1746
1747 is_user = user_mode(regs);
1748
1749 if (is_user && current->state != TASK_RUNNING)
1750 return;
1751
1752 if (!is_user)
1753 perf_callchain_kernel(regs, entry);
1754
1755 if (current->mm)
1756 perf_callchain_user(regs, entry);
1757}
1758
1759struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1760{
1761 struct perf_callchain_entry *entry;
1762
1763 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1764 /* TODO: We don't support guest os callchain now */
1765 return NULL;
1766 }
1767
1768 if (in_nmi())
1769 entry = &__get_cpu_var(pmc_nmi_entry);
1770 else
1771 entry = &__get_cpu_var(pmc_irq_entry);
1772
1773 entry->nr = 0;
1774
1775 perf_do_callchain(regs, entry);
1776
1777 return entry;
1778}
1779
1780unsigned long perf_instruction_pointer(struct pt_regs *regs) 1744unsigned long perf_instruction_pointer(struct pt_regs *regs)
1781{ 1745{
1782 unsigned long ip; 1746 unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3b..46d58448c3af 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids
52 [ C(DTLB) ] = { 52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = { 53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ 55 [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
56 }, 56 },
57 [ C(OP_WRITE) ] = { 57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0, 58 [ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids
66 [ C(ITLB) ] = { 66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = { 67 [ C(OP_READ) ] = {
68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ 68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ 69 [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
70 }, 70 },
71 [ C(OP_WRITE) ] = { 71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1, 72 [ C(RESULT_ACCESS) ] = -1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d2..c8f5c088cad1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
713 struct cpu_hw_events *cpuc; 713 struct cpu_hw_events *cpuc;
714 int bit, loops; 714 int bit, loops;
715 u64 status; 715 u64 status;
716 int handled = 0; 716 int handled;
717 717
718 perf_sample_data_init(&data, 0); 718 perf_sample_data_init(&data, 0);
719 719
720 cpuc = &__get_cpu_var(cpu_hw_events); 720 cpuc = &__get_cpu_var(cpu_hw_events);
721 721
722 intel_pmu_disable_all(); 722 intel_pmu_disable_all();
723 intel_pmu_drain_bts_buffer(); 723 handled = intel_pmu_drain_bts_buffer();
724 status = intel_pmu_get_status(); 724 status = intel_pmu_get_status();
725 if (!status) { 725 if (!status) {
726 intel_pmu_enable_all(0); 726 intel_pmu_enable_all(0);
727 return 0; 727 return handled;
728 } 728 }
729 729
730 loops = 0; 730 loops = 0;
@@ -763,7 +763,7 @@ again:
763 data.period = event->hw.last_period; 763 data.period = event->hw.last_period;
764 764
765 if (perf_event_overflow(event, 1, &data, regs)) 765 if (perf_event_overflow(event, 1, &data, regs))
766 x86_pmu_stop(event); 766 x86_pmu_stop(event, 0);
767 } 767 }
768 768
769 /* 769 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311cd..b7dcd9f2b8a0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu)
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); 74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75} 75}
76 76
77static int alloc_pebs_buffer(int cpu)
78{
79 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
80 int node = cpu_to_node(cpu);
81 int max, thresh = 1; /* always use a single PEBS record */
82 void *buffer;
83
84 if (!x86_pmu.pebs)
85 return 0;
86
87 buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
88 if (unlikely(!buffer))
89 return -ENOMEM;
90
91 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
92
93 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
94 ds->pebs_index = ds->pebs_buffer_base;
95 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
96 max * x86_pmu.pebs_record_size;
97
98 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
99 thresh * x86_pmu.pebs_record_size;
100
101 return 0;
102}
103
104static void release_pebs_buffer(int cpu)
105{
106 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
107
108 if (!ds || !x86_pmu.pebs)
109 return;
110
111 kfree((void *)(unsigned long)ds->pebs_buffer_base);
112 ds->pebs_buffer_base = 0;
113}
114
115static int alloc_bts_buffer(int cpu)
116{
117 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
118 int node = cpu_to_node(cpu);
119 int max, thresh;
120 void *buffer;
121
122 if (!x86_pmu.bts)
123 return 0;
124
125 buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
126 if (unlikely(!buffer))
127 return -ENOMEM;
128
129 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
130 thresh = max / 16;
131
132 ds->bts_buffer_base = (u64)(unsigned long)buffer;
133 ds->bts_index = ds->bts_buffer_base;
134 ds->bts_absolute_maximum = ds->bts_buffer_base +
135 max * BTS_RECORD_SIZE;
136 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
137 thresh * BTS_RECORD_SIZE;
138
139 return 0;
140}
141
142static void release_bts_buffer(int cpu)
143{
144 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
145
146 if (!ds || !x86_pmu.bts)
147 return;
148
149 kfree((void *)(unsigned long)ds->bts_buffer_base);
150 ds->bts_buffer_base = 0;
151}
152
153static int alloc_ds_buffer(int cpu)
154{
155 int node = cpu_to_node(cpu);
156 struct debug_store *ds;
157
158 ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
159 if (unlikely(!ds))
160 return -ENOMEM;
161
162 per_cpu(cpu_hw_events, cpu).ds = ds;
163
164 return 0;
165}
166
167static void release_ds_buffer(int cpu)
168{
169 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
170
171 if (!ds)
172 return;
173
174 per_cpu(cpu_hw_events, cpu).ds = NULL;
175 kfree(ds);
176}
177
77static void release_ds_buffers(void) 178static void release_ds_buffers(void)
78{ 179{
79 int cpu; 180 int cpu;
@@ -82,93 +183,77 @@ static void release_ds_buffers(void)
82 return; 183 return;
83 184
84 get_online_cpus(); 185 get_online_cpus();
85
86 for_each_online_cpu(cpu) 186 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu); 187 fini_debug_store_on_cpu(cpu);
88 188
89 for_each_possible_cpu(cpu) { 189 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 190 release_pebs_buffer(cpu);
91 191 release_bts_buffer(cpu);
92 if (!ds) 192 release_ds_buffer(cpu);
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 } 193 }
101
102 put_online_cpus(); 194 put_online_cpus();
103} 195}
104 196
105static int reserve_ds_buffers(void) 197static void reserve_ds_buffers(void)
106{ 198{
107 int cpu, err = 0; 199 int bts_err = 0, pebs_err = 0;
200 int cpu;
201
202 x86_pmu.bts_active = 0;
203 x86_pmu.pebs_active = 0;
108 204
109 if (!x86_pmu.bts && !x86_pmu.pebs) 205 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0; 206 return;
207
208 if (!x86_pmu.bts)
209 bts_err = 1;
210
211 if (!x86_pmu.pebs)
212 pebs_err = 1;
111 213
112 get_online_cpus(); 214 get_online_cpus();
113 215
114 for_each_possible_cpu(cpu) { 216 for_each_possible_cpu(cpu) {
115 struct debug_store *ds; 217 if (alloc_ds_buffer(cpu)) {
116 void *buffer; 218 bts_err = 1;
117 int max, thresh; 219 pebs_err = 1;
220 }
221
222 if (!bts_err && alloc_bts_buffer(cpu))
223 bts_err = 1;
224
225 if (!pebs_err && alloc_pebs_buffer(cpu))
226 pebs_err = 1;
118 227
119 err = -ENOMEM; 228 if (bts_err && pebs_err)
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break; 229 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds; 230 }
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140 231
141 if (x86_pmu.pebs) { 232 if (bts_err) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); 233 for_each_possible_cpu(cpu)
143 if (unlikely(!buffer)) 234 release_bts_buffer(cpu);
144 break; 235 }
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158 236
159 err = 0; 237 if (pebs_err) {
238 for_each_possible_cpu(cpu)
239 release_pebs_buffer(cpu);
160 } 240 }
161 241
162 if (err) 242 if (bts_err && pebs_err) {
163 release_ds_buffers(); 243 for_each_possible_cpu(cpu)
164 else { 244 release_ds_buffer(cpu);
245 } else {
246 if (x86_pmu.bts && !bts_err)
247 x86_pmu.bts_active = 1;
248
249 if (x86_pmu.pebs && !pebs_err)
250 x86_pmu.pebs_active = 1;
251
165 for_each_online_cpu(cpu) 252 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu); 253 init_debug_store_on_cpu(cpu);
167 } 254 }
168 255
169 put_online_cpus(); 256 put_online_cpus();
170
171 return err;
172} 257}
173 258
174/* 259/*
@@ -214,7 +299,7 @@ static void intel_pmu_disable_bts(void)
214 update_debugctlmsr(debugctlmsr); 299 update_debugctlmsr(debugctlmsr);
215} 300}
216 301
217static void intel_pmu_drain_bts_buffer(void) 302static int intel_pmu_drain_bts_buffer(void)
218{ 303{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 304 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds; 305 struct debug_store *ds = cpuc->ds;
@@ -231,16 +316,16 @@ static void intel_pmu_drain_bts_buffer(void)
231 struct pt_regs regs; 316 struct pt_regs regs;
232 317
233 if (!event) 318 if (!event)
234 return; 319 return 0;
235 320
236 if (!ds) 321 if (!x86_pmu.bts_active)
237 return; 322 return 0;
238 323
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; 324 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index; 325 top = (struct bts_record *)(unsigned long)ds->bts_index;
241 326
242 if (top <= at) 327 if (top <= at)
243 return; 328 return 0;
244 329
245 ds->bts_index = ds->bts_buffer_base; 330 ds->bts_index = ds->bts_buffer_base;
246 331
@@ -256,7 +341,7 @@ static void intel_pmu_drain_bts_buffer(void)
256 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
257 342
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return; 344 return 1;
260 345
261 for (; at < top; at++) { 346 for (; at < top; at++) {
262 data.ip = at->from; 347 data.ip = at->from;
@@ -270,6 +355,7 @@ static void intel_pmu_drain_bts_buffer(void)
270 /* There's new data available. */ 355 /* There's new data available. */
271 event->hw.interrupts++; 356 event->hw.interrupts++;
272 event->pending_kill = POLL_IN; 357 event->pending_kill = POLL_IN;
358 return 1;
273} 359}
274 360
275/* 361/*
@@ -491,7 +577,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
491 regs.flags &= ~PERF_EFLAGS_EXACT; 577 regs.flags &= ~PERF_EFLAGS_EXACT;
492 578
493 if (perf_event_overflow(event, 1, &data, &regs)) 579 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event); 580 x86_pmu_stop(event, 0);
495} 581}
496 582
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) 583static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -502,7 +588,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
502 struct pebs_record_core *at, *top; 588 struct pebs_record_core *at, *top;
503 int n; 589 int n;
504 590
505 if (!ds || !x86_pmu.pebs) 591 if (!x86_pmu.pebs_active)
506 return; 592 return;
507 593
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; 594 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
@@ -544,7 +630,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
544 u64 status = 0; 630 u64 status = 0;
545 int bit, n; 631 int bit, n;
546 632
547 if (!ds || !x86_pmu.pebs) 633 if (!x86_pmu.pebs_active)
548 return; 634 return;
549 635
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; 636 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
@@ -629,9 +715,8 @@ static void intel_ds_init(void)
629 715
630#else /* CONFIG_CPU_SUP_INTEL */ 716#else /* CONFIG_CPU_SUP_INTEL */
631 717
632static int reserve_ds_buffers(void) 718static void reserve_ds_buffers(void)
633{ 719{
634 return 0;
635} 720}
636 721
637static void release_ds_buffers(void) 722static void release_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 249015173992..81400b93e694 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,8 @@
18struct p4_event_bind { 18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */ 19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */ 20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21 unsigned int escr_emask; /* valid ESCR EventMask bits */
22 unsigned int shared; /* event is shared across threads */
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ 23 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
22}; 24};
23 25
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
66 [P4_EVENT_TC_DELIVER_MODE] = { 68 [P4_EVENT_TC_DELIVER_MODE] = {
67 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), 69 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
68 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 70 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
71 .escr_emask =
72 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
73 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
74 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
75 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
76 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
77 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
78 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
79 .shared = 1,
69 .cntr = { {4, 5, -1}, {6, 7, -1} }, 80 .cntr = { {4, 5, -1}, {6, 7, -1} },
70 }, 81 },
71 [P4_EVENT_BPU_FETCH_REQUEST] = { 82 [P4_EVENT_BPU_FETCH_REQUEST] = {
72 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), 83 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
73 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, 84 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
85 .escr_emask =
86 P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
74 .cntr = { {0, -1, -1}, {2, -1, -1} }, 87 .cntr = { {0, -1, -1}, {2, -1, -1} },
75 }, 88 },
76 [P4_EVENT_ITLB_REFERENCE] = { 89 [P4_EVENT_ITLB_REFERENCE] = {
77 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), 90 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
78 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, 91 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
92 .escr_emask =
93 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
94 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
95 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
79 .cntr = { {0, -1, -1}, {2, -1, -1} }, 96 .cntr = { {0, -1, -1}, {2, -1, -1} },
80 }, 97 },
81 [P4_EVENT_MEMORY_CANCEL] = { 98 [P4_EVENT_MEMORY_CANCEL] = {
82 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), 99 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
83 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 100 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
101 .escr_emask =
102 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
103 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
84 .cntr = { {8, 9, -1}, {10, 11, -1} }, 104 .cntr = { {8, 9, -1}, {10, 11, -1} },
85 }, 105 },
86 [P4_EVENT_MEMORY_COMPLETE] = { 106 [P4_EVENT_MEMORY_COMPLETE] = {
87 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), 107 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
88 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 108 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
109 .escr_emask =
110 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
111 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
89 .cntr = { {8, 9, -1}, {10, 11, -1} }, 112 .cntr = { {8, 9, -1}, {10, 11, -1} },
90 }, 113 },
91 [P4_EVENT_LOAD_PORT_REPLAY] = { 114 [P4_EVENT_LOAD_PORT_REPLAY] = {
92 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), 115 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
93 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, 116 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
117 .escr_emask =
118 P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
94 .cntr = { {8, 9, -1}, {10, 11, -1} }, 119 .cntr = { {8, 9, -1}, {10, 11, -1} },
95 }, 120 },
96 [P4_EVENT_STORE_PORT_REPLAY] = { 121 [P4_EVENT_STORE_PORT_REPLAY] = {
97 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), 122 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
98 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 123 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
124 .escr_emask =
125 P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
99 .cntr = { {8, 9, -1}, {10, 11, -1} }, 126 .cntr = { {8, 9, -1}, {10, 11, -1} },
100 }, 127 },
101 [P4_EVENT_MOB_LOAD_REPLAY] = { 128 [P4_EVENT_MOB_LOAD_REPLAY] = {
102 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), 129 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
103 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, 130 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
131 .escr_emask =
132 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
133 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
134 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
135 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
104 .cntr = { {0, -1, -1}, {2, -1, -1} }, 136 .cntr = { {0, -1, -1}, {2, -1, -1} },
105 }, 137 },
106 [P4_EVENT_PAGE_WALK_TYPE] = { 138 [P4_EVENT_PAGE_WALK_TYPE] = {
107 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), 139 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
108 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, 140 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
141 .escr_emask =
142 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
143 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
144 .shared = 1,
109 .cntr = { {0, -1, -1}, {2, -1, -1} }, 145 .cntr = { {0, -1, -1}, {2, -1, -1} },
110 }, 146 },
111 [P4_EVENT_BSQ_CACHE_REFERENCE] = { 147 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
112 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), 148 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
113 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, 149 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
150 .escr_emask =
151 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
152 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
153 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
154 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
155 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
156 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
157 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
158 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
159 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
114 .cntr = { {0, -1, -1}, {2, -1, -1} }, 160 .cntr = { {0, -1, -1}, {2, -1, -1} },
115 }, 161 },
116 [P4_EVENT_IOQ_ALLOCATION] = { 162 [P4_EVENT_IOQ_ALLOCATION] = {
117 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), 163 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
118 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 164 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
165 .escr_emask =
166 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
167 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
168 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
169 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
170 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
171 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
172 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
173 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
174 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
175 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
176 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
119 .cntr = { {0, -1, -1}, {2, -1, -1} }, 177 .cntr = { {0, -1, -1}, {2, -1, -1} },
120 }, 178 },
121 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 179 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
122 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), 180 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
123 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, 181 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
182 .escr_emask =
183 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
184 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
185 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
186 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
187 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
188 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
189 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
190 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
191 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
192 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
193 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
124 .cntr = { {2, -1, -1}, {3, -1, -1} }, 194 .cntr = { {2, -1, -1}, {3, -1, -1} },
125 }, 195 },
126 [P4_EVENT_FSB_DATA_ACTIVITY] = { 196 [P4_EVENT_FSB_DATA_ACTIVITY] = {
127 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), 197 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
128 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 198 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
199 .escr_emask =
200 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
201 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
202 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
203 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
204 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
205 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
206 .shared = 1,
129 .cntr = { {0, -1, -1}, {2, -1, -1} }, 207 .cntr = { {0, -1, -1}, {2, -1, -1} },
130 }, 208 },
131 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ 209 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
132 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), 210 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
133 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, 211 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
212 .escr_emask =
213 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
214 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
215 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
216 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
217 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
218 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
219 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
220 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
221 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
222 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
223 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
224 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
225 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
134 .cntr = { {0, -1, -1}, {1, -1, -1} }, 226 .cntr = { {0, -1, -1}, {1, -1, -1} },
135 }, 227 },
136 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 228 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
137 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), 229 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
138 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, 230 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
231 .escr_emask =
232 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
233 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
234 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
235 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
236 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
237 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
238 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
239 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
240 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
241 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
242 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
243 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
244 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
139 .cntr = { {2, -1, -1}, {3, -1, -1} }, 245 .cntr = { {2, -1, -1}, {3, -1, -1} },
140 }, 246 },
141 [P4_EVENT_SSE_INPUT_ASSIST] = { 247 [P4_EVENT_SSE_INPUT_ASSIST] = {
142 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), 248 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
143 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 249 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
250 .escr_emask =
251 P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
252 .shared = 1,
144 .cntr = { {8, 9, -1}, {10, 11, -1} }, 253 .cntr = { {8, 9, -1}, {10, 11, -1} },
145 }, 254 },
146 [P4_EVENT_PACKED_SP_UOP] = { 255 [P4_EVENT_PACKED_SP_UOP] = {
147 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), 256 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
148 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 257 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
258 .escr_emask =
259 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
260 .shared = 1,
149 .cntr = { {8, 9, -1}, {10, 11, -1} }, 261 .cntr = { {8, 9, -1}, {10, 11, -1} },
150 }, 262 },
151 [P4_EVENT_PACKED_DP_UOP] = { 263 [P4_EVENT_PACKED_DP_UOP] = {
152 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), 264 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
153 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 265 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
266 .escr_emask =
267 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
268 .shared = 1,
154 .cntr = { {8, 9, -1}, {10, 11, -1} }, 269 .cntr = { {8, 9, -1}, {10, 11, -1} },
155 }, 270 },
156 [P4_EVENT_SCALAR_SP_UOP] = { 271 [P4_EVENT_SCALAR_SP_UOP] = {
157 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), 272 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
158 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 273 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
274 .escr_emask =
275 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
276 .shared = 1,
159 .cntr = { {8, 9, -1}, {10, 11, -1} }, 277 .cntr = { {8, 9, -1}, {10, 11, -1} },
160 }, 278 },
161 [P4_EVENT_SCALAR_DP_UOP] = { 279 [P4_EVENT_SCALAR_DP_UOP] = {
162 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), 280 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
163 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 281 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
282 .escr_emask =
283 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
284 .shared = 1,
164 .cntr = { {8, 9, -1}, {10, 11, -1} }, 285 .cntr = { {8, 9, -1}, {10, 11, -1} },
165 }, 286 },
166 [P4_EVENT_64BIT_MMX_UOP] = { 287 [P4_EVENT_64BIT_MMX_UOP] = {
167 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), 288 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
168 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 289 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
290 .escr_emask =
291 P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
292 .shared = 1,
169 .cntr = { {8, 9, -1}, {10, 11, -1} }, 293 .cntr = { {8, 9, -1}, {10, 11, -1} },
170 }, 294 },
171 [P4_EVENT_128BIT_MMX_UOP] = { 295 [P4_EVENT_128BIT_MMX_UOP] = {
172 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), 296 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
173 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 297 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
298 .escr_emask =
299 P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
300 .shared = 1,
174 .cntr = { {8, 9, -1}, {10, 11, -1} }, 301 .cntr = { {8, 9, -1}, {10, 11, -1} },
175 }, 302 },
176 [P4_EVENT_X87_FP_UOP] = { 303 [P4_EVENT_X87_FP_UOP] = {
177 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), 304 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
178 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 305 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
306 .escr_emask =
307 P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
308 .shared = 1,
179 .cntr = { {8, 9, -1}, {10, 11, -1} }, 309 .cntr = { {8, 9, -1}, {10, 11, -1} },
180 }, 310 },
181 [P4_EVENT_TC_MISC] = { 311 [P4_EVENT_TC_MISC] = {
182 .opcode = P4_OPCODE(P4_EVENT_TC_MISC), 312 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
183 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 313 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
314 .escr_emask =
315 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
184 .cntr = { {4, 5, -1}, {6, 7, -1} }, 316 .cntr = { {4, 5, -1}, {6, 7, -1} },
185 }, 317 },
186 [P4_EVENT_GLOBAL_POWER_EVENTS] = { 318 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
187 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), 319 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
188 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 320 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
321 .escr_emask =
322 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
189 .cntr = { {0, -1, -1}, {2, -1, -1} }, 323 .cntr = { {0, -1, -1}, {2, -1, -1} },
190 }, 324 },
191 [P4_EVENT_TC_MS_XFER] = { 325 [P4_EVENT_TC_MS_XFER] = {
192 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), 326 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
193 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 327 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
328 .escr_emask =
329 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
194 .cntr = { {4, 5, -1}, {6, 7, -1} }, 330 .cntr = { {4, 5, -1}, {6, 7, -1} },
195 }, 331 },
196 [P4_EVENT_UOP_QUEUE_WRITES] = { 332 [P4_EVENT_UOP_QUEUE_WRITES] = {
197 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), 333 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
198 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 334 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
335 .escr_emask =
336 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
337 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
338 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
199 .cntr = { {4, 5, -1}, {6, 7, -1} }, 339 .cntr = { {4, 5, -1}, {6, 7, -1} },
200 }, 340 },
201 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { 341 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
202 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), 342 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
203 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, 343 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
344 .escr_emask =
345 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
346 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
347 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
348 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
204 .cntr = { {4, 5, -1}, {6, 7, -1} }, 349 .cntr = { {4, 5, -1}, {6, 7, -1} },
205 }, 350 },
206 [P4_EVENT_RETIRED_BRANCH_TYPE] = { 351 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
207 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), 352 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
208 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, 353 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
354 .escr_emask =
355 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
356 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
209 .cntr = { {4, 5, -1}, {6, 7, -1} }, 359 .cntr = { {4, 5, -1}, {6, 7, -1} },
210 }, 360 },
211 [P4_EVENT_RESOURCE_STALL] = { 361 [P4_EVENT_RESOURCE_STALL] = {
212 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), 362 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
213 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, 363 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
364 .escr_emask =
365 P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
214 .cntr = { {12, 13, 16}, {14, 15, 17} }, 366 .cntr = { {12, 13, 16}, {14, 15, 17} },
215 }, 367 },
216 [P4_EVENT_WC_BUFFER] = { 368 [P4_EVENT_WC_BUFFER] = {
217 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), 369 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
218 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 370 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
371 .escr_emask =
372 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
373 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
374 .shared = 1,
219 .cntr = { {8, 9, -1}, {10, 11, -1} }, 375 .cntr = { {8, 9, -1}, {10, 11, -1} },
220 }, 376 },
221 [P4_EVENT_B2B_CYCLES] = { 377 [P4_EVENT_B2B_CYCLES] = {
222 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), 378 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
223 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 379 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
380 .escr_emask = 0,
224 .cntr = { {0, -1, -1}, {2, -1, -1} }, 381 .cntr = { {0, -1, -1}, {2, -1, -1} },
225 }, 382 },
226 [P4_EVENT_BNR] = { 383 [P4_EVENT_BNR] = {
227 .opcode = P4_OPCODE(P4_EVENT_BNR), 384 .opcode = P4_OPCODE(P4_EVENT_BNR),
228 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 385 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
386 .escr_emask = 0,
229 .cntr = { {0, -1, -1}, {2, -1, -1} }, 387 .cntr = { {0, -1, -1}, {2, -1, -1} },
230 }, 388 },
231 [P4_EVENT_SNOOP] = { 389 [P4_EVENT_SNOOP] = {
232 .opcode = P4_OPCODE(P4_EVENT_SNOOP), 390 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
233 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 391 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
392 .escr_emask = 0,
234 .cntr = { {0, -1, -1}, {2, -1, -1} }, 393 .cntr = { {0, -1, -1}, {2, -1, -1} },
235 }, 394 },
236 [P4_EVENT_RESPONSE] = { 395 [P4_EVENT_RESPONSE] = {
237 .opcode = P4_OPCODE(P4_EVENT_RESPONSE), 396 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
238 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 397 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
398 .escr_emask = 0,
239 .cntr = { {0, -1, -1}, {2, -1, -1} }, 399 .cntr = { {0, -1, -1}, {2, -1, -1} },
240 }, 400 },
241 [P4_EVENT_FRONT_END_EVENT] = { 401 [P4_EVENT_FRONT_END_EVENT] = {
242 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), 402 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
243 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 403 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
404 .escr_emask =
405 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
406 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
244 .cntr = { {12, 13, 16}, {14, 15, 17} }, 407 .cntr = { {12, 13, 16}, {14, 15, 17} },
245 }, 408 },
246 [P4_EVENT_EXECUTION_EVENT] = { 409 [P4_EVENT_EXECUTION_EVENT] = {
247 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), 410 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
248 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 411 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
412 .escr_emask =
413 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
414 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
415 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
416 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
417 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
418 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
419 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
420 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
249 .cntr = { {12, 13, 16}, {14, 15, 17} }, 421 .cntr = { {12, 13, 16}, {14, 15, 17} },
250 }, 422 },
251 [P4_EVENT_REPLAY_EVENT] = { 423 [P4_EVENT_REPLAY_EVENT] = {
252 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), 424 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
253 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 425 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
426 .escr_emask =
427 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
428 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
254 .cntr = { {12, 13, 16}, {14, 15, 17} }, 429 .cntr = { {12, 13, 16}, {14, 15, 17} },
255 }, 430 },
256 [P4_EVENT_INSTR_RETIRED] = { 431 [P4_EVENT_INSTR_RETIRED] = {
257 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), 432 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
258 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 433 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
434 .escr_emask =
435 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
436 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
437 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
438 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
259 .cntr = { {12, 13, 16}, {14, 15, 17} }, 439 .cntr = { {12, 13, 16}, {14, 15, 17} },
260 }, 440 },
261 [P4_EVENT_UOPS_RETIRED] = { 441 [P4_EVENT_UOPS_RETIRED] = {
262 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), 442 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
263 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 443 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
444 .escr_emask =
445 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
446 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
264 .cntr = { {12, 13, 16}, {14, 15, 17} }, 447 .cntr = { {12, 13, 16}, {14, 15, 17} },
265 }, 448 },
266 [P4_EVENT_UOP_TYPE] = { 449 [P4_EVENT_UOP_TYPE] = {
267 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), 450 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
268 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, 451 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
452 .escr_emask =
453 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
454 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
269 .cntr = { {12, 13, 16}, {14, 15, 17} }, 455 .cntr = { {12, 13, 16}, {14, 15, 17} },
270 }, 456 },
271 [P4_EVENT_BRANCH_RETIRED] = { 457 [P4_EVENT_BRANCH_RETIRED] = {
272 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), 458 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
273 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 459 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
460 .escr_emask =
461 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
462 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
463 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
464 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
274 .cntr = { {12, 13, 16}, {14, 15, 17} }, 465 .cntr = { {12, 13, 16}, {14, 15, 17} },
275 }, 466 },
276 [P4_EVENT_MISPRED_BRANCH_RETIRED] = { 467 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
277 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), 468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
278 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
470 .escr_emask =
471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
279 .cntr = { {12, 13, 16}, {14, 15, 17} }, 472 .cntr = { {12, 13, 16}, {14, 15, 17} },
280 }, 473 },
281 [P4_EVENT_X87_ASSIST] = { 474 [P4_EVENT_X87_ASSIST] = {
282 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), 475 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
283 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 476 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
477 .escr_emask =
478 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
479 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
480 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
481 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
482 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
284 .cntr = { {12, 13, 16}, {14, 15, 17} }, 483 .cntr = { {12, 13, 16}, {14, 15, 17} },
285 }, 484 },
286 [P4_EVENT_MACHINE_CLEAR] = { 485 [P4_EVENT_MACHINE_CLEAR] = {
287 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), 486 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
288 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 487 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
488 .escr_emask =
489 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
490 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
491 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
289 .cntr = { {12, 13, 16}, {14, 15, 17} }, 492 .cntr = { {12, 13, 16}, {14, 15, 17} },
290 }, 493 },
291 [P4_EVENT_INSTR_COMPLETED] = { 494 [P4_EVENT_INSTR_COMPLETED] = {
292 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), 495 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
293 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 496 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
497 .escr_emask =
498 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
499 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
294 .cntr = { {12, 13, 16}, {14, 15, 17} }, 500 .cntr = { {12, 13, 16}, {14, 15, 17} },
295 }, 501 },
296}; 502};
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
428 return config; 634 return config;
429} 635}
430 636
637/* check cpu model specifics */
638static bool p4_event_match_cpu_model(unsigned int event_idx)
639{
640 /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
641 if (event_idx == P4_EVENT_INSTR_COMPLETED) {
642 if (boot_cpu_data.x86_model != 3 &&
643 boot_cpu_data.x86_model != 4 &&
644 boot_cpu_data.x86_model != 6)
645 return false;
646 }
647
648 /*
649 * For info
650 * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
651 */
652
653 return true;
654}
655
431static int p4_validate_raw_event(struct perf_event *event) 656static int p4_validate_raw_event(struct perf_event *event)
432{ 657{
433 unsigned int v; 658 unsigned int v, emask;
434 659
435 /* user data may have out-of-bound event index */ 660 /* User data may have out-of-bound event index */
436 v = p4_config_unpack_event(event->attr.config); 661 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) { 662 if (v >= ARRAY_SIZE(p4_event_bind_map))
438 pr_warning("P4 PMU: Unknown event code: %d\n", v); 663 return -EINVAL;
664
665 /* It may be unsupported: */
666 if (!p4_event_match_cpu_model(v))
439 return -EINVAL; 667 return -EINVAL;
668
669 /*
670 * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
671 * in Architectural Performance Monitoring, it means not
672 * on _which_ logical cpu to count but rather _when_, ie it
673 * depends on logical cpu state -- count event if one cpu active,
674 * none, both or any, so we just allow user to pass any value
675 * desired.
676 *
677 * In turn we always set Tx_OS/Tx_USR bits bound to logical
678 * cpu without their propagation to another cpu
679 */
680
681 /*
682 * if an event is shared accross the logical threads
683 * the user needs special permissions to be able to use it
684 */
685 if (p4_event_bind_map[v].shared) {
686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
687 return -EACCES;
440 } 688 }
441 689
690 /* ESCR EventMask bits may be invalid */
691 emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
692 if (emask & ~p4_event_bind_map[v].escr_emask)
693 return -EINVAL;
694
442 /* 695 /*
443 * it may have some screwed PEBS bits 696 * it may have some invalid PEBS bits
444 */ 697 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { 698 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL; 699 return -EINVAL;
448 } 700
449 v = p4_config_unpack_metric(event->attr.config); 701 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { 702 if (v >= ARRAY_SIZE(p4_pebs_bind_map))
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL; 703 return -EINVAL;
453 }
454 704
455 return 0; 705 return 0;
456} 706}
@@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event)
478 728
479 if (event->attr.type == PERF_TYPE_RAW) { 729 if (event->attr.type == PERF_TYPE_RAW) {
480 730
731 /*
732 * Clear bits we reserve to be managed by kernel itself
733 * and never allowed from a user space
734 */
735 event->attr.config &= P4_CONFIG_MASK;
736
481 rc = p4_validate_raw_event(event); 737 rc = p4_validate_raw_event(event);
482 if (rc) 738 if (rc)
483 goto out; 739 goto out;
484 740
485 /* 741 /*
486 * We don't control raw events so it's up to the caller
487 * to pass sane values (and we don't count the thread number
488 * on HT machine but allow HT-compatible specifics to be
489 * passed on)
490 *
491 * Note that for RAW events we allow user to use P4_CCCR_RESERVED 742 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events and etc) 743 * bits since we keep additional info here (for cache events and etc)
493 *
494 * XXX: HT wide things should check perf_paranoid_cpu() &&
495 * CAP_SYS_ADMIN
496 */ 744 */
497 event->hw.config |= event->attr.config & 745 event->hw.config |= event->attr.config;
498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
500
501 event->hw.config &= ~P4_CCCR_FORCE_OVF;
502 } 746 }
503 747
504 rc = x86_setup_perfctr(event); 748 rc = x86_setup_perfctr(event);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..d9f4ff8fcd69 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void)
700{ 700{
701 switch (boot_cpu_data.x86_vendor) { 701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD: 702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 703 if (boot_cpu_data.x86 == 6 ||
704 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) 704 (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
705 return; 705 wd_ops = &k7_wd_ops;
706 wd_ops = &k7_wd_ops; 706 return;
707 break;
708 case X86_VENDOR_INTEL: 707 case X86_VENDOR_INTEL:
709 /* Work around where perfctr1 doesn't have a working enable 708 /* Work around where perfctr1 doesn't have a working enable
710 * bit as described in the following errata: 709 * bit as described in the following errata:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d49079515122..c7f64e6f537a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, 46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
48 { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
49 { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
50 { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
51 { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
52 { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 } 53 { 0, 0, 0, 0, 0 }
48 }; 54 };
49 55
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 67414550c3cc..d5cd13945d5a 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
61 if (!is_crashed_pfn_valid(pfn)) 61 if (!is_crashed_pfn_valid(pfn))
62 return -EFAULT; 62 return -EFAULT;
63 63
64 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 64 vaddr = kmap_atomic_pfn(pfn);
65 65
66 if (!userbuf) { 66 if (!userbuf) {
67 memcpy(buf, (vaddr + offset), csize); 67 memcpy(buf, (vaddr + offset), csize);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..994828899e09 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 if (!csize) 34 if (!csize)
35 return 0; 35 return 0;
36 36
37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 37 vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr) 38 if (!vaddr)
39 return -ENOMEM; 39 return -ENOMEM;
40 40
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
46 } else 46 } else
47 memcpy(buf, vaddr + offset, csize); 47 memcpy(buf, vaddr + offset, csize);
48 48
49 set_iounmap_nonlazy();
49 iounmap(vaddr); 50 iounmap(vaddr);
50 return csize; 51 return csize;
51} 52}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 0f6376ffa2d9..1bc7f75a5bda 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -82,11 +82,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
82 if (kstack_end(stack)) 82 if (kstack_end(stack))
83 break; 83 break;
84 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 84 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
85 printk("\n%s", log_lvl); 85 printk(KERN_CONT "\n");
86 printk(" %08lx", *stack++); 86 printk(KERN_CONT " %08lx", *stack++);
87 touch_nmi_watchdog(); 87 touch_nmi_watchdog();
88 } 88 }
89 printk("\n"); 89 printk(KERN_CONT "\n");
90 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 90 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
91} 91}
92 92
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 57a21f11c791..6a340485249a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -265,20 +265,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
265 if (stack >= irq_stack && stack <= irq_stack_end) { 265 if (stack >= irq_stack && stack <= irq_stack_end) {
266 if (stack == irq_stack_end) { 266 if (stack == irq_stack_end) {
267 stack = (unsigned long *) (irq_stack_end[-1]); 267 stack = (unsigned long *) (irq_stack_end[-1]);
268 printk(" <EOI> "); 268 printk(KERN_CONT " <EOI> ");
269 } 269 }
270 } else { 270 } else {
271 if (((long) stack & (THREAD_SIZE-1)) == 0) 271 if (((long) stack & (THREAD_SIZE-1)) == 0)
272 break; 272 break;
273 } 273 }
274 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 274 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
275 printk("\n%s", log_lvl); 275 printk(KERN_CONT "\n");
276 printk(" %016lx", *stack++); 276 printk(KERN_CONT " %016lx", *stack++);
277 touch_nmi_watchdog(); 277 touch_nmi_watchdog();
278 } 278 }
279 preempt_enable(); 279 preempt_enable();
280 280
281 printk("\n"); 281 printk(KERN_CONT "\n");
282 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 282 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
283} 283}
284 284
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0d6fc71bedb1..0c2b7ef7a34d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -15,6 +15,7 @@
15#include <linux/pfn.h> 15#include <linux/pfn.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/firmware-map.h> 17#include <linux/firmware-map.h>
18#include <linux/memblock.h>
18 19
19#include <asm/e820.h> 20#include <asm/e820.h>
20#include <asm/proto.h> 21#include <asm/proto.h>
@@ -738,73 +739,7 @@ core_initcall(e820_mark_nvs_memory);
738#endif 739#endif
739 740
740/* 741/*
741 * Find a free area with specified alignment in a specific range. 742 * pre allocated 4k and reserved it in memblock and e820_saved
742 */
743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
744{
745 int i;
746
747 for (i = 0; i < e820.nr_map; i++) {
748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
751
752 if (ei->type != E820_RAM)
753 continue;
754
755 ei_last = ei->addr + ei->size;
756 ei_start = ei->addr;
757 addr = find_early_area(ei_start, ei_last, start, end,
758 size, align);
759
760 if (addr != -1ULL)
761 return addr;
762 }
763 return -1ULL;
764}
765
766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
767{
768 return find_e820_area(start, end, size, align);
769}
770
771u64 __init get_max_mapped(void)
772{
773 u64 end = max_pfn_mapped;
774
775 end <<= PAGE_SHIFT;
776
777 return end;
778}
779/*
780 * Find next free range after *start
781 */
782u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
783{
784 int i;
785
786 for (i = 0; i < e820.nr_map; i++) {
787 struct e820entry *ei = &e820.map[i];
788 u64 addr;
789 u64 ei_start, ei_last;
790
791 if (ei->type != E820_RAM)
792 continue;
793
794 ei_last = ei->addr + ei->size;
795 ei_start = ei->addr;
796 addr = find_early_area_size(ei_start, ei_last, start,
797 sizep, align);
798
799 if (addr != -1ULL)
800 return addr;
801 }
802
803 return -1ULL;
804}
805
806/*
807 * pre allocated 4k and reserved it in e820
808 */ 743 */
809u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) 744u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
810{ 745{
@@ -813,8 +748,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
813 u64 start; 748 u64 start;
814 749
815 for (start = startt; ; start += size) { 750 for (start = startt; ; start += size) {
816 start = find_e820_area_size(start, &size, align); 751 start = memblock_x86_find_in_range_size(start, &size, align);
817 if (!(start + 1)) 752 if (start == MEMBLOCK_ERROR)
818 return 0; 753 return 0;
819 if (size >= sizet) 754 if (size >= sizet)
820 break; 755 break;
@@ -830,10 +765,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
830 addr = round_down(start + size - sizet, align); 765 addr = round_down(start + size - sizet, align);
831 if (addr < start) 766 if (addr < start)
832 return 0; 767 return 0;
833 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 768 memblock_x86_reserve_range(addr, addr + sizet, "new next");
834 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 769 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
835 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 770 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
836 update_e820();
837 update_e820_saved(); 771 update_e820_saved();
838 772
839 return addr; 773 return addr;
@@ -895,74 +829,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
895{ 829{
896 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); 830 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
897} 831}
898/*
899 * Finds an active region in the address range from start_pfn to last_pfn and
900 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
901 */
902int __init e820_find_active_region(const struct e820entry *ei,
903 unsigned long start_pfn,
904 unsigned long last_pfn,
905 unsigned long *ei_startpfn,
906 unsigned long *ei_endpfn)
907{
908 u64 align = PAGE_SIZE;
909
910 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
911 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
912
913 /* Skip map entries smaller than a page */
914 if (*ei_startpfn >= *ei_endpfn)
915 return 0;
916
917 /* Skip if map is outside the node */
918 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
919 *ei_startpfn >= last_pfn)
920 return 0;
921
922 /* Check for overlaps */
923 if (*ei_startpfn < start_pfn)
924 *ei_startpfn = start_pfn;
925 if (*ei_endpfn > last_pfn)
926 *ei_endpfn = last_pfn;
927
928 return 1;
929}
930
931/* Walk the e820 map and register active regions within a node */
932void __init e820_register_active_regions(int nid, unsigned long start_pfn,
933 unsigned long last_pfn)
934{
935 unsigned long ei_startpfn;
936 unsigned long ei_endpfn;
937 int i;
938
939 for (i = 0; i < e820.nr_map; i++)
940 if (e820_find_active_region(&e820.map[i],
941 start_pfn, last_pfn,
942 &ei_startpfn, &ei_endpfn))
943 add_active_range(nid, ei_startpfn, ei_endpfn);
944}
945
946/*
947 * Find the hole size (in bytes) in the memory range.
948 * @start: starting address of the memory range to scan
949 * @end: ending address of the memory range to scan
950 */
951u64 __init e820_hole_size(u64 start, u64 end)
952{
953 unsigned long start_pfn = start >> PAGE_SHIFT;
954 unsigned long last_pfn = end >> PAGE_SHIFT;
955 unsigned long ei_startpfn, ei_endpfn, ram = 0;
956 int i;
957
958 for (i = 0; i < e820.nr_map; i++) {
959 if (e820_find_active_region(&e820.map[i],
960 start_pfn, last_pfn,
961 &ei_startpfn, &ei_endpfn))
962 ram += ei_endpfn - ei_startpfn;
963 }
964 return end - start - ((u64)ram << PAGE_SHIFT);
965}
966 832
967static void early_panic(char *msg) 833static void early_panic(char *msg)
968{ 834{
@@ -1210,3 +1076,48 @@ void __init setup_memory_map(void)
1210 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1076 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1211 e820_print_map(who); 1077 e820_print_map(who);
1212} 1078}
1079
1080void __init memblock_x86_fill(void)
1081{
1082 int i;
1083 u64 end;
1084
1085 /*
1086 * EFI may have more than 128 entries
1087 * We are safe to enable resizing, beause memblock_x86_fill()
1088 * is rather later for x86
1089 */
1090 memblock_can_resize = 1;
1091
1092 for (i = 0; i < e820.nr_map; i++) {
1093 struct e820entry *ei = &e820.map[i];
1094
1095 end = ei->addr + ei->size;
1096 if (end != (resource_size_t)end)
1097 continue;
1098
1099 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
1100 continue;
1101
1102 memblock_add(ei->addr, ei->size);
1103 }
1104
1105 memblock_analyze();
1106 memblock_dump_all();
1107}
1108
1109void __init memblock_find_dma_reserve(void)
1110{
1111#ifdef CONFIG_X86_64
1112 u64 free_size_pfn;
1113 u64 mem_size_pfn;
1114 /*
1115 * need to find out used area below MAX_DMA_PFN
1116 * need to use memblock to get free size in [0, MAX_DMA_PFN]
1117 * at first, and assume boot_mem will not take below MAX_DMA_PFN
1118 */
1119 mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1120 free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1121 set_dma_reserve(mem_size_pfn - free_size_pfn);
1122#endif
1123}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..4572f25f9325 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/mrst.h>
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 19#include <linux/usb/ehci_def.h>
19 20
@@ -239,6 +240,18 @@ static int __init setup_early_printk(char *buf)
239 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
240 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
241#endif 242#endif
243#ifdef CONFIG_X86_MRST_EARLY_PRINTK
244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep);
247 }
248
249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init();
251 early_console_register(&early_hsu_console, keep);
252 }
253
254#endif
242 buf++; 255 buf++;
243 } 256 }
244 return 0; 257 return 0;
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c
new file mode 100644
index 000000000000..65df603622b2
--- /dev/null
+++ b/arch/x86/kernel/early_printk_mrst.c
@@ -0,0 +1,319 @@
1/*
2 * early_printk_mrst.c - early consoles for Intel MID platforms
3 *
4 * Copyright (c) 2008-2010, Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11
12/*
13 * This file implements two early consoles named mrst and hsu.
14 * mrst is based on Maxim3110 spi-uart device, it exists in both
15 * Moorestown and Medfield platforms, while hsu is based on a High
16 * Speed UART device which only exists in the Medfield platform
17 */
18
19#include <linux/serial_reg.h>
20#include <linux/serial_mfd.h>
21#include <linux/kmsg_dump.h>
22#include <linux/console.h>
23#include <linux/kernel.h>
24#include <linux/delay.h>
25#include <linux/init.h>
26#include <linux/io.h>
27
28#include <asm/fixmap.h>
29#include <asm/pgtable.h>
30#include <asm/mrst.h>
31
32#define MRST_SPI_TIMEOUT 0x200000
33#define MRST_REGBASE_SPI0 0xff128000
34#define MRST_REGBASE_SPI1 0xff128400
35#define MRST_CLK_SPI0_REG 0xff11d86c
36
37/* Bit fields in CTRLR0 */
38#define SPI_DFS_OFFSET 0
39
40#define SPI_FRF_OFFSET 4
41#define SPI_FRF_SPI 0x0
42#define SPI_FRF_SSP 0x1
43#define SPI_FRF_MICROWIRE 0x2
44#define SPI_FRF_RESV 0x3
45
46#define SPI_MODE_OFFSET 6
47#define SPI_SCPH_OFFSET 6
48#define SPI_SCOL_OFFSET 7
49#define SPI_TMOD_OFFSET 8
50#define SPI_TMOD_TR 0x0 /* xmit & recv */
51#define SPI_TMOD_TO 0x1 /* xmit only */
52#define SPI_TMOD_RO 0x2 /* recv only */
53#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
54
55#define SPI_SLVOE_OFFSET 10
56#define SPI_SRL_OFFSET 11
57#define SPI_CFS_OFFSET 12
58
59/* Bit fields in SR, 7 bits */
60#define SR_MASK 0x7f /* cover 7 bits */
61#define SR_BUSY (1 << 0)
62#define SR_TF_NOT_FULL (1 << 1)
63#define SR_TF_EMPT (1 << 2)
64#define SR_RF_NOT_EMPT (1 << 3)
65#define SR_RF_FULL (1 << 4)
66#define SR_TX_ERR (1 << 5)
67#define SR_DCOL (1 << 6)
68
69struct dw_spi_reg {
70 u32 ctrl0;
71 u32 ctrl1;
72 u32 ssienr;
73 u32 mwcr;
74 u32 ser;
75 u32 baudr;
76 u32 txfltr;
77 u32 rxfltr;
78 u32 txflr;
79 u32 rxflr;
80 u32 sr;
81 u32 imr;
82 u32 isr;
83 u32 risr;
84 u32 txoicr;
85 u32 rxoicr;
86 u32 rxuicr;
87 u32 msticr;
88 u32 icr;
89 u32 dmacr;
90 u32 dmatdlr;
91 u32 dmardlr;
92 u32 idr;
93 u32 version;
94
95 /* Currently operates as 32 bits, though only the low 16 bits matter */
96 u32 dr;
97} __packed;
98
99#define dw_readl(dw, name) __raw_readl(&(dw)->name)
100#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
101
102/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
104
105static u32 *pclk_spi0;
106/* Always contains an accessable address, start with 0 */
107static struct dw_spi_reg *pspi;
108
109static struct kmsg_dumper dw_dumper;
110static int dumper_registered;
111
112static void dw_kmsg_dump(struct kmsg_dumper *dumper,
113 enum kmsg_dump_reason reason,
114 const char *s1, unsigned long l1,
115 const char *s2, unsigned long l2)
116{
117 int i;
118
119 /* When run to this, we'd better re-init the HW */
120 mrst_early_console_init();
121
122 for (i = 0; i < l1; i++)
123 early_mrst_console.write(&early_mrst_console, s1 + i, 1);
124 for (i = 0; i < l2; i++)
125 early_mrst_console.write(&early_mrst_console, s2 + i, 1);
126}
127
128/* Set the ratio rate to 115200, 8n1, IRQ disabled */
129static void max3110_write_config(void)
130{
131 u16 config;
132
133 config = 0xc001;
134 dw_writel(pspi, dr, config);
135}
136
137/* Translate char to a eligible word and send to max3110 */
138static void max3110_write_data(char c)
139{
140 u16 data;
141
142 data = 0x8000 | c;
143 dw_writel(pspi, dr, data);
144}
145
146void mrst_early_console_init(void)
147{
148 u32 ctrlr0 = 0;
149 u32 spi0_cdiv;
150 u32 freq; /* Freqency info only need be searched once */
151
152 /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
153 pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
154 MRST_CLK_SPI0_REG);
155 spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
156 freq = 100000000 / (spi0_cdiv + 1);
157
158 if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
159 mrst_spi_paddr = MRST_REGBASE_SPI1;
160
161 pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
162 mrst_spi_paddr);
163
164 /* Disable SPI controller */
165 dw_writel(pspi, ssienr, 0);
166
167 /* Set control param, 8 bits, transmit only mode */
168 ctrlr0 = dw_readl(pspi, ctrl0);
169
170 ctrlr0 &= 0xfcc0;
171 ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
172 | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
173 dw_writel(pspi, ctrl0, ctrlr0);
174
175 /*
176 * Change the spi0 clk to comply with 115200 bps, use 100000 to
177 * calculate the clk dividor to make the clock a little slower
178 * than real baud rate.
179 */
180 dw_writel(pspi, baudr, freq/100000);
181
182 /* Disable all INT for early phase */
183 dw_writel(pspi, imr, 0x0);
184
185 /* Set the cs to spi-uart */
186 dw_writel(pspi, ser, 0x2);
187
188 /* Enable the HW, the last step for HW init */
189 dw_writel(pspi, ssienr, 0x1);
190
191 /* Set the default configuration */
192 max3110_write_config();
193
194 /* Register the kmsg dumper */
195 if (!dumper_registered) {
196 dw_dumper.dump = dw_kmsg_dump;
197 kmsg_dump_register(&dw_dumper);
198 dumper_registered = 1;
199 }
200}
201
202/* Slave select should be called in the read/write function */
203static void early_mrst_spi_putc(char c)
204{
205 unsigned int timeout;
206 u32 sr;
207
208 timeout = MRST_SPI_TIMEOUT;
209 /* Early putc needs to make sure the TX FIFO is not full */
210 while (--timeout) {
211 sr = dw_readl(pspi, sr);
212 if (!(sr & SR_TF_NOT_FULL))
213 cpu_relax();
214 else
215 break;
216 }
217
218 if (!timeout)
219 pr_warning("MRST earlycon: timed out\n");
220 else
221 max3110_write_data(c);
222}
223
224/* Early SPI only uses polling mode */
225static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
226{
227 int i;
228
229 for (i = 0; i < n && *str; i++) {
230 if (*str == '\n')
231 early_mrst_spi_putc('\r');
232 early_mrst_spi_putc(*str);
233 str++;
234 }
235}
236
237struct console early_mrst_console = {
238 .name = "earlymrst",
239 .write = early_mrst_spi_write,
240 .flags = CON_PRINTBUFFER,
241 .index = -1,
242};
243
244/*
245 * Following is the early console based on Medfield HSU (High
246 * Speed UART) device.
247 */
248#define HSU_PORT2_PADDR 0xffa28180
249
250static void __iomem *phsu;
251
252void hsu_early_console_init(void)
253{
254 u8 lcr;
255
256 phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
257 HSU_PORT2_PADDR);
258
259 /* Disable FIFO */
260 writeb(0x0, phsu + UART_FCR);
261
262 /* Set to default 115200 bps, 8n1 */
263 lcr = readb(phsu + UART_LCR);
264 writeb((0x80 | lcr), phsu + UART_LCR);
265 writeb(0x18, phsu + UART_DLL);
266 writeb(lcr, phsu + UART_LCR);
267 writel(0x3600, phsu + UART_MUL*4);
268
269 writeb(0x8, phsu + UART_MCR);
270 writeb(0x7, phsu + UART_FCR);
271 writeb(0x3, phsu + UART_LCR);
272
273 /* Clear IRQ status */
274 readb(phsu + UART_LSR);
275 readb(phsu + UART_RX);
276 readb(phsu + UART_IIR);
277 readb(phsu + UART_MSR);
278
279 /* Enable FIFO */
280 writeb(0x7, phsu + UART_FCR);
281}
282
283#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
284
285static void early_hsu_putc(char ch)
286{
287 unsigned int timeout = 10000; /* 10ms */
288 u8 status;
289
290 while (--timeout) {
291 status = readb(phsu + UART_LSR);
292 if (status & BOTH_EMPTY)
293 break;
294 udelay(1);
295 }
296
297 /* Only write the char when there was no timeout */
298 if (timeout)
299 writeb(ch, phsu + UART_TX);
300}
301
302static void early_hsu_write(struct console *con, const char *str, unsigned n)
303{
304 int i;
305
306 for (i = 0; i < n && *str; i++) {
307 if (*str == '\n')
308 early_hsu_putc('\r');
309 early_hsu_putc(*str);
310 str++;
311 }
312}
313
314struct console early_hsu_console = {
315 .name = "earlyhsu",
316 .write = early_hsu_write,
317 .flags = CON_PRINTBUFFER,
318 .index = -1,
319};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2f..59e175e89599 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -115,8 +115,7 @@
115 115
116 /* unfortunately push/pop can't be no-op */ 116 /* unfortunately push/pop can't be no-op */
117.macro PUSH_GS 117.macro PUSH_GS
118 pushl $0 118 pushl_cfi $0
119 CFI_ADJUST_CFA_OFFSET 4
120.endm 119.endm
121.macro POP_GS pop=0 120.macro POP_GS pop=0
122 addl $(4 + \pop), %esp 121 addl $(4 + \pop), %esp
@@ -140,14 +139,12 @@
140#else /* CONFIG_X86_32_LAZY_GS */ 139#else /* CONFIG_X86_32_LAZY_GS */
141 140
142.macro PUSH_GS 141.macro PUSH_GS
143 pushl %gs 142 pushl_cfi %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/ 143 /*CFI_REL_OFFSET gs, 0*/
146.endm 144.endm
147 145
148.macro POP_GS pop=0 146.macro POP_GS pop=0
14998: popl %gs 14798: popl_cfi %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/ 148 /*CFI_RESTORE gs*/
152 .if \pop <> 0 149 .if \pop <> 0
153 add $\pop, %esp 150 add $\pop, %esp
@@ -195,35 +192,25 @@
195.macro SAVE_ALL 192.macro SAVE_ALL
196 cld 193 cld
197 PUSH_GS 194 PUSH_GS
198 pushl %fs 195 pushl_cfi %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/ 196 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es 197 pushl_cfi %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/ 198 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds 199 pushl_cfi %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/ 200 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax 201 pushl_cfi %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0 202 CFI_REL_OFFSET eax, 0
210 pushl %ebp 203 pushl_cfi %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0 204 CFI_REL_OFFSET ebp, 0
213 pushl %edi 205 pushl_cfi %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0 206 CFI_REL_OFFSET edi, 0
216 pushl %esi 207 pushl_cfi %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0 208 CFI_REL_OFFSET esi, 0
219 pushl %edx 209 pushl_cfi %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0 210 CFI_REL_OFFSET edx, 0
222 pushl %ecx 211 pushl_cfi %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0 212 CFI_REL_OFFSET ecx, 0
225 pushl %ebx 213 pushl_cfi %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0 214 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx 215 movl $(__USER_DS), %edx
229 movl %edx, %ds 216 movl %edx, %ds
@@ -234,39 +221,29 @@
234.endm 221.endm
235 222
236.macro RESTORE_INT_REGS 223.macro RESTORE_INT_REGS
237 popl %ebx 224 popl_cfi %ebx
238 CFI_ADJUST_CFA_OFFSET -4
239 CFI_RESTORE ebx 225 CFI_RESTORE ebx
240 popl %ecx 226 popl_cfi %ecx
241 CFI_ADJUST_CFA_OFFSET -4
242 CFI_RESTORE ecx 227 CFI_RESTORE ecx
243 popl %edx 228 popl_cfi %edx
244 CFI_ADJUST_CFA_OFFSET -4
245 CFI_RESTORE edx 229 CFI_RESTORE edx
246 popl %esi 230 popl_cfi %esi
247 CFI_ADJUST_CFA_OFFSET -4
248 CFI_RESTORE esi 231 CFI_RESTORE esi
249 popl %edi 232 popl_cfi %edi
250 CFI_ADJUST_CFA_OFFSET -4
251 CFI_RESTORE edi 233 CFI_RESTORE edi
252 popl %ebp 234 popl_cfi %ebp
253 CFI_ADJUST_CFA_OFFSET -4
254 CFI_RESTORE ebp 235 CFI_RESTORE ebp
255 popl %eax 236 popl_cfi %eax
256 CFI_ADJUST_CFA_OFFSET -4
257 CFI_RESTORE eax 237 CFI_RESTORE eax
258.endm 238.endm
259 239
260.macro RESTORE_REGS pop=0 240.macro RESTORE_REGS pop=0
261 RESTORE_INT_REGS 241 RESTORE_INT_REGS
2621: popl %ds 2421: popl_cfi %ds
263 CFI_ADJUST_CFA_OFFSET -4
264 /*CFI_RESTORE ds;*/ 243 /*CFI_RESTORE ds;*/
2652: popl %es 2442: popl_cfi %es
266 CFI_ADJUST_CFA_OFFSET -4
267 /*CFI_RESTORE es;*/ 245 /*CFI_RESTORE es;*/
2683: popl %fs 2463: popl_cfi %fs
269 CFI_ADJUST_CFA_OFFSET -4
270 /*CFI_RESTORE fs;*/ 247 /*CFI_RESTORE fs;*/
271 POP_GS \pop 248 POP_GS \pop
272.pushsection .fixup, "ax" 249.pushsection .fixup, "ax"
@@ -320,16 +297,12 @@
320 297
321ENTRY(ret_from_fork) 298ENTRY(ret_from_fork)
322 CFI_STARTPROC 299 CFI_STARTPROC
323 pushl %eax 300 pushl_cfi %eax
324 CFI_ADJUST_CFA_OFFSET 4
325 call schedule_tail 301 call schedule_tail
326 GET_THREAD_INFO(%ebp) 302 GET_THREAD_INFO(%ebp)
327 popl %eax 303 popl_cfi %eax
328 CFI_ADJUST_CFA_OFFSET -4 304 pushl_cfi $0x0202 # Reset kernel eflags
329 pushl $0x0202 # Reset kernel eflags 305 popfl_cfi
330 CFI_ADJUST_CFA_OFFSET 4
331 popfl
332 CFI_ADJUST_CFA_OFFSET -4
333 jmp syscall_exit 306 jmp syscall_exit
334 CFI_ENDPROC 307 CFI_ENDPROC
335END(ret_from_fork) 308END(ret_from_fork)
@@ -409,29 +382,23 @@ sysenter_past_esp:
409 * enough kernel state to call TRACE_IRQS_OFF can be called - but 382 * enough kernel state to call TRACE_IRQS_OFF can be called - but
410 * we immediately enable interrupts at that point anyway. 383 * we immediately enable interrupts at that point anyway.
411 */ 384 */
412 pushl $(__USER_DS) 385 pushl_cfi $__USER_DS
413 CFI_ADJUST_CFA_OFFSET 4
414 /*CFI_REL_OFFSET ss, 0*/ 386 /*CFI_REL_OFFSET ss, 0*/
415 pushl %ebp 387 pushl_cfi %ebp
416 CFI_ADJUST_CFA_OFFSET 4
417 CFI_REL_OFFSET esp, 0 388 CFI_REL_OFFSET esp, 0
418 pushfl 389 pushfl_cfi
419 orl $X86_EFLAGS_IF, (%esp) 390 orl $X86_EFLAGS_IF, (%esp)
420 CFI_ADJUST_CFA_OFFSET 4 391 pushl_cfi $__USER_CS
421 pushl $(__USER_CS)
422 CFI_ADJUST_CFA_OFFSET 4
423 /*CFI_REL_OFFSET cs, 0*/ 392 /*CFI_REL_OFFSET cs, 0*/
424 /* 393 /*
425 * Push current_thread_info()->sysenter_return to the stack. 394 * Push current_thread_info()->sysenter_return to the stack.
426 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
427 * pushed above; +8 corresponds to copy_thread's esp0 setting. 396 * pushed above; +8 corresponds to copy_thread's esp0 setting.
428 */ 397 */
429 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) 398 pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp)
430 CFI_ADJUST_CFA_OFFSET 4
431 CFI_REL_OFFSET eip, 0 399 CFI_REL_OFFSET eip, 0
432 400
433 pushl %eax 401 pushl_cfi %eax
434 CFI_ADJUST_CFA_OFFSET 4
435 SAVE_ALL 402 SAVE_ALL
436 ENABLE_INTERRUPTS(CLBR_NONE) 403 ENABLE_INTERRUPTS(CLBR_NONE)
437 404
@@ -486,8 +453,7 @@ sysenter_audit:
486 movl %eax,%edx /* 2nd arg: syscall number */ 453 movl %eax,%edx /* 2nd arg: syscall number */
487 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 454 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
488 call audit_syscall_entry 455 call audit_syscall_entry
489 pushl %ebx 456 pushl_cfi %ebx
490 CFI_ADJUST_CFA_OFFSET 4
491 movl PT_EAX(%esp),%eax /* reload syscall number */ 457 movl PT_EAX(%esp),%eax /* reload syscall number */
492 jmp sysenter_do_call 458 jmp sysenter_do_call
493 459
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target)
529 # system call handler stub 495 # system call handler stub
530ENTRY(system_call) 496ENTRY(system_call)
531 RING0_INT_FRAME # can't unwind into user space anyway 497 RING0_INT_FRAME # can't unwind into user space anyway
532 pushl %eax # save orig_eax 498 pushl_cfi %eax # save orig_eax
533 CFI_ADJUST_CFA_OFFSET 4
534 SAVE_ALL 499 SAVE_ALL
535 GET_THREAD_INFO(%ebp) 500 GET_THREAD_INFO(%ebp)
536 # system call tracing in operation / emulation 501 # system call tracing in operation / emulation
@@ -566,7 +531,6 @@ restore_all_notrace:
566 je ldt_ss # returning to user-space with LDT SS 531 je ldt_ss # returning to user-space with LDT SS
567restore_nocheck: 532restore_nocheck:
568 RESTORE_REGS 4 # skip orig_eax/error_code 533 RESTORE_REGS 4 # skip orig_eax/error_code
569 CFI_ADJUST_CFA_OFFSET -4
570irq_return: 534irq_return:
571 INTERRUPT_RETURN 535 INTERRUPT_RETURN
572.section .fixup,"ax" 536.section .fixup,"ax"
@@ -619,10 +583,8 @@ ldt_ss:
619 shr $16, %edx 583 shr $16, %edx
620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ 584 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ 585 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 586 pushl_cfi $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 587 pushl_cfi %eax /* new kernel esp */
624 push %eax /* new kernel esp */
625 CFI_ADJUST_CFA_OFFSET 4
626 /* Disable interrupts, but do not irqtrace this section: we 588 /* Disable interrupts, but do not irqtrace this section: we
627 * will soon execute iret and the tracer was already set to 589 * will soon execute iret and the tracer was already set to
628 * the irqstate after the iret */ 590 * the irqstate after the iret */
@@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and
666 628
667 ALIGN 629 ALIGN
668work_notifysig_v86: 630work_notifysig_v86:
669 pushl %ecx # save ti_flags for do_notify_resume 631 pushl_cfi %ecx # save ti_flags for do_notify_resume
670 CFI_ADJUST_CFA_OFFSET 4
671 call save_v86_state # %eax contains pt_regs pointer 632 call save_v86_state # %eax contains pt_regs pointer
672 popl %ecx 633 popl_cfi %ecx
673 CFI_ADJUST_CFA_OFFSET -4
674 movl %eax, %esp 634 movl %eax, %esp
675#else 635#else
676 movl %esp, %eax 636 movl %esp, %eax
@@ -750,14 +710,18 @@ ptregs_##name: \
750#define PTREGSCALL3(name) \ 710#define PTREGSCALL3(name) \
751 ALIGN; \ 711 ALIGN; \
752ptregs_##name: \ 712ptregs_##name: \
713 CFI_STARTPROC; \
753 leal 4(%esp),%eax; \ 714 leal 4(%esp),%eax; \
754 pushl %eax; \ 715 pushl_cfi %eax; \
755 movl PT_EDX(%eax),%ecx; \ 716 movl PT_EDX(%eax),%ecx; \
756 movl PT_ECX(%eax),%edx; \ 717 movl PT_ECX(%eax),%edx; \
757 movl PT_EBX(%eax),%eax; \ 718 movl PT_EBX(%eax),%eax; \
758 call sys_##name; \ 719 call sys_##name; \
759 addl $4,%esp; \ 720 addl $4,%esp; \
760 ret 721 CFI_ADJUST_CFA_OFFSET -4; \
722 ret; \
723 CFI_ENDPROC; \
724ENDPROC(ptregs_##name)
761 725
762PTREGSCALL1(iopl) 726PTREGSCALL1(iopl)
763PTREGSCALL0(fork) 727PTREGSCALL0(fork)
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old)
772/* Clone is an oddball. The 4th arg is in %edi */ 736/* Clone is an oddball. The 4th arg is in %edi */
773 ALIGN; 737 ALIGN;
774ptregs_clone: 738ptregs_clone:
739 CFI_STARTPROC
775 leal 4(%esp),%eax 740 leal 4(%esp),%eax
776 pushl %eax 741 pushl_cfi %eax
777 pushl PT_EDI(%eax) 742 pushl_cfi PT_EDI(%eax)
778 movl PT_EDX(%eax),%ecx 743 movl PT_EDX(%eax),%ecx
779 movl PT_ECX(%eax),%edx 744 movl PT_ECX(%eax),%edx
780 movl PT_EBX(%eax),%eax 745 movl PT_EBX(%eax),%eax
781 call sys_clone 746 call sys_clone
782 addl $8,%esp 747 addl $8,%esp
748 CFI_ADJUST_CFA_OFFSET -8
783 ret 749 ret
750 CFI_ENDPROC
751ENDPROC(ptregs_clone)
784 752
785.macro FIXUP_ESPFIX_STACK 753.macro FIXUP_ESPFIX_STACK
786/* 754/*
@@ -795,10 +763,8 @@ ptregs_clone:
795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ 763 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 shl $16, %eax 764 shl $16, %eax
797 addl %esp, %eax /* the adjusted stack pointer */ 765 addl %esp, %eax /* the adjusted stack pointer */
798 pushl $__KERNEL_DS 766 pushl_cfi $__KERNEL_DS
799 CFI_ADJUST_CFA_OFFSET 4 767 pushl_cfi %eax
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 lss (%esp), %esp /* switch to the normal stack segment */ 768 lss (%esp), %esp /* switch to the normal stack segment */
803 CFI_ADJUST_CFA_OFFSET -8 769 CFI_ADJUST_CFA_OFFSET -8
804.endm 770.endm
@@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR
835 .if vector <> FIRST_EXTERNAL_VECTOR 801 .if vector <> FIRST_EXTERNAL_VECTOR
836 CFI_ADJUST_CFA_OFFSET -4 802 CFI_ADJUST_CFA_OFFSET -4
837 .endif 803 .endif
8381: pushl $(~vector+0x80) /* Note: always in signed byte range */ 8041: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
839 CFI_ADJUST_CFA_OFFSET 4
840 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 805 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
841 jmp 2f 806 jmp 2f
842 .endif 807 .endif
@@ -876,8 +841,7 @@ ENDPROC(common_interrupt)
876#define BUILD_INTERRUPT3(name, nr, fn) \ 841#define BUILD_INTERRUPT3(name, nr, fn) \
877ENTRY(name) \ 842ENTRY(name) \
878 RING0_INT_FRAME; \ 843 RING0_INT_FRAME; \
879 pushl $~(nr); \ 844 pushl_cfi $~(nr); \
880 CFI_ADJUST_CFA_OFFSET 4; \
881 SAVE_ALL; \ 845 SAVE_ALL; \
882 TRACE_IRQS_OFF \ 846 TRACE_IRQS_OFF \
883 movl %esp,%eax; \ 847 movl %esp,%eax; \
@@ -893,21 +857,18 @@ ENDPROC(name)
893 857
894ENTRY(coprocessor_error) 858ENTRY(coprocessor_error)
895 RING0_INT_FRAME 859 RING0_INT_FRAME
896 pushl $0 860 pushl_cfi $0
897 CFI_ADJUST_CFA_OFFSET 4 861 pushl_cfi $do_coprocessor_error
898 pushl $do_coprocessor_error
899 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 862 jmp error_code
901 CFI_ENDPROC 863 CFI_ENDPROC
902END(coprocessor_error) 864END(coprocessor_error)
903 865
904ENTRY(simd_coprocessor_error) 866ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 867 RING0_INT_FRAME
906 pushl $0 868 pushl_cfi $0
907 CFI_ADJUST_CFA_OFFSET 4
908#ifdef CONFIG_X86_INVD_BUG 869#ifdef CONFIG_X86_INVD_BUG
909 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 870 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
910661: pushl $do_general_protection 871661: pushl_cfi $do_general_protection
911662: 872662:
912.section .altinstructions,"a" 873.section .altinstructions,"a"
913 .balign 4 874 .balign 4
@@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error)
922664: 883664:
923.previous 884.previous
924#else 885#else
925 pushl $do_simd_coprocessor_error 886 pushl_cfi $do_simd_coprocessor_error
926#endif 887#endif
927 CFI_ADJUST_CFA_OFFSET 4
928 jmp error_code 888 jmp error_code
929 CFI_ENDPROC 889 CFI_ENDPROC
930END(simd_coprocessor_error) 890END(simd_coprocessor_error)
931 891
932ENTRY(device_not_available) 892ENTRY(device_not_available)
933 RING0_INT_FRAME 893 RING0_INT_FRAME
934 pushl $-1 # mark this as an int 894 pushl_cfi $-1 # mark this as an int
935 CFI_ADJUST_CFA_OFFSET 4 895 pushl_cfi $do_device_not_available
936 pushl $do_device_not_available
937 CFI_ADJUST_CFA_OFFSET 4
938 jmp error_code 896 jmp error_code
939 CFI_ENDPROC 897 CFI_ENDPROC
940END(device_not_available) 898END(device_not_available)
@@ -956,82 +914,68 @@ END(native_irq_enable_sysexit)
956 914
957ENTRY(overflow) 915ENTRY(overflow)
958 RING0_INT_FRAME 916 RING0_INT_FRAME
959 pushl $0 917 pushl_cfi $0
960 CFI_ADJUST_CFA_OFFSET 4 918 pushl_cfi $do_overflow
961 pushl $do_overflow
962 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 919 jmp error_code
964 CFI_ENDPROC 920 CFI_ENDPROC
965END(overflow) 921END(overflow)
966 922
967ENTRY(bounds) 923ENTRY(bounds)
968 RING0_INT_FRAME 924 RING0_INT_FRAME
969 pushl $0 925 pushl_cfi $0
970 CFI_ADJUST_CFA_OFFSET 4 926 pushl_cfi $do_bounds
971 pushl $do_bounds
972 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 927 jmp error_code
974 CFI_ENDPROC 928 CFI_ENDPROC
975END(bounds) 929END(bounds)
976 930
977ENTRY(invalid_op) 931ENTRY(invalid_op)
978 RING0_INT_FRAME 932 RING0_INT_FRAME
979 pushl $0 933 pushl_cfi $0
980 CFI_ADJUST_CFA_OFFSET 4 934 pushl_cfi $do_invalid_op
981 pushl $do_invalid_op
982 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 935 jmp error_code
984 CFI_ENDPROC 936 CFI_ENDPROC
985END(invalid_op) 937END(invalid_op)
986 938
987ENTRY(coprocessor_segment_overrun) 939ENTRY(coprocessor_segment_overrun)
988 RING0_INT_FRAME 940 RING0_INT_FRAME
989 pushl $0 941 pushl_cfi $0
990 CFI_ADJUST_CFA_OFFSET 4 942 pushl_cfi $do_coprocessor_segment_overrun
991 pushl $do_coprocessor_segment_overrun
992 CFI_ADJUST_CFA_OFFSET 4
993 jmp error_code 943 jmp error_code
994 CFI_ENDPROC 944 CFI_ENDPROC
995END(coprocessor_segment_overrun) 945END(coprocessor_segment_overrun)
996 946
997ENTRY(invalid_TSS) 947ENTRY(invalid_TSS)
998 RING0_EC_FRAME 948 RING0_EC_FRAME
999 pushl $do_invalid_TSS 949 pushl_cfi $do_invalid_TSS
1000 CFI_ADJUST_CFA_OFFSET 4
1001 jmp error_code 950 jmp error_code
1002 CFI_ENDPROC 951 CFI_ENDPROC
1003END(invalid_TSS) 952END(invalid_TSS)
1004 953
1005ENTRY(segment_not_present) 954ENTRY(segment_not_present)
1006 RING0_EC_FRAME 955 RING0_EC_FRAME
1007 pushl $do_segment_not_present 956 pushl_cfi $do_segment_not_present
1008 CFI_ADJUST_CFA_OFFSET 4
1009 jmp error_code 957 jmp error_code
1010 CFI_ENDPROC 958 CFI_ENDPROC
1011END(segment_not_present) 959END(segment_not_present)
1012 960
1013ENTRY(stack_segment) 961ENTRY(stack_segment)
1014 RING0_EC_FRAME 962 RING0_EC_FRAME
1015 pushl $do_stack_segment 963 pushl_cfi $do_stack_segment
1016 CFI_ADJUST_CFA_OFFSET 4
1017 jmp error_code 964 jmp error_code
1018 CFI_ENDPROC 965 CFI_ENDPROC
1019END(stack_segment) 966END(stack_segment)
1020 967
1021ENTRY(alignment_check) 968ENTRY(alignment_check)
1022 RING0_EC_FRAME 969 RING0_EC_FRAME
1023 pushl $do_alignment_check 970 pushl_cfi $do_alignment_check
1024 CFI_ADJUST_CFA_OFFSET 4
1025 jmp error_code 971 jmp error_code
1026 CFI_ENDPROC 972 CFI_ENDPROC
1027END(alignment_check) 973END(alignment_check)
1028 974
1029ENTRY(divide_error) 975ENTRY(divide_error)
1030 RING0_INT_FRAME 976 RING0_INT_FRAME
1031 pushl $0 # no error code 977 pushl_cfi $0 # no error code
1032 CFI_ADJUST_CFA_OFFSET 4 978 pushl_cfi $do_divide_error
1033 pushl $do_divide_error
1034 CFI_ADJUST_CFA_OFFSET 4
1035 jmp error_code 979 jmp error_code
1036 CFI_ENDPROC 980 CFI_ENDPROC
1037END(divide_error) 981END(divide_error)
@@ -1039,10 +983,8 @@ END(divide_error)
1039#ifdef CONFIG_X86_MCE 983#ifdef CONFIG_X86_MCE
1040ENTRY(machine_check) 984ENTRY(machine_check)
1041 RING0_INT_FRAME 985 RING0_INT_FRAME
1042 pushl $0 986 pushl_cfi $0
1043 CFI_ADJUST_CFA_OFFSET 4 987 pushl_cfi machine_check_vector
1044 pushl machine_check_vector
1045 CFI_ADJUST_CFA_OFFSET 4
1046 jmp error_code 988 jmp error_code
1047 CFI_ENDPROC 989 CFI_ENDPROC
1048END(machine_check) 990END(machine_check)
@@ -1050,10 +992,8 @@ END(machine_check)
1050 992
1051ENTRY(spurious_interrupt_bug) 993ENTRY(spurious_interrupt_bug)
1052 RING0_INT_FRAME 994 RING0_INT_FRAME
1053 pushl $0 995 pushl_cfi $0
1054 CFI_ADJUST_CFA_OFFSET 4 996 pushl_cfi $do_spurious_interrupt_bug
1055 pushl $do_spurious_interrupt_bug
1056 CFI_ADJUST_CFA_OFFSET 4
1057 jmp error_code 997 jmp error_code
1058 CFI_ENDPROC 998 CFI_ENDPROC
1059END(spurious_interrupt_bug) 999END(spurious_interrupt_bug)
@@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target)
1084 1024
1085ENTRY(xen_hypervisor_callback) 1025ENTRY(xen_hypervisor_callback)
1086 CFI_STARTPROC 1026 CFI_STARTPROC
1087 pushl $0 1027 pushl_cfi $0
1088 CFI_ADJUST_CFA_OFFSET 4
1089 SAVE_ALL 1028 SAVE_ALL
1090 TRACE_IRQS_OFF 1029 TRACE_IRQS_OFF
1091 1030
@@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback)
1121# We distinguish between categories by maintaining a status value in EAX. 1060# We distinguish between categories by maintaining a status value in EAX.
1122ENTRY(xen_failsafe_callback) 1061ENTRY(xen_failsafe_callback)
1123 CFI_STARTPROC 1062 CFI_STARTPROC
1124 pushl %eax 1063 pushl_cfi %eax
1125 CFI_ADJUST_CFA_OFFSET 4
1126 movl $1,%eax 1064 movl $1,%eax
11271: mov 4(%esp),%ds 10651: mov 4(%esp),%ds
11282: mov 8(%esp),%es 10662: mov 8(%esp),%es
11293: mov 12(%esp),%fs 10673: mov 12(%esp),%fs
11304: mov 16(%esp),%gs 10684: mov 16(%esp),%gs
1131 testl %eax,%eax 1069 testl %eax,%eax
1132 popl %eax 1070 popl_cfi %eax
1133 CFI_ADJUST_CFA_OFFSET -4
1134 lea 16(%esp),%esp 1071 lea 16(%esp),%esp
1135 CFI_ADJUST_CFA_OFFSET -16 1072 CFI_ADJUST_CFA_OFFSET -16
1136 jz 5f 1073 jz 5f
1137 addl $16,%esp 1074 addl $16,%esp
1138 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) 1075 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
11395: pushl $0 # EAX == 0 => Category 1 (Bad segment) 10765: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
1140 CFI_ADJUST_CFA_OFFSET 4
1141 SAVE_ALL 1077 SAVE_ALL
1142 jmp ret_from_exception 1078 jmp ret_from_exception
1143 CFI_ENDPROC 1079 CFI_ENDPROC
@@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table)
1287 1223
1288ENTRY(page_fault) 1224ENTRY(page_fault)
1289 RING0_EC_FRAME 1225 RING0_EC_FRAME
1290 pushl $do_page_fault 1226 pushl_cfi $do_page_fault
1291 CFI_ADJUST_CFA_OFFSET 4
1292 ALIGN 1227 ALIGN
1293error_code: 1228error_code:
1294 /* the function address is in %gs's slot on the stack */ 1229 /* the function address is in %gs's slot on the stack */
1295 pushl %fs 1230 pushl_cfi %fs
1296 CFI_ADJUST_CFA_OFFSET 4
1297 /*CFI_REL_OFFSET fs, 0*/ 1231 /*CFI_REL_OFFSET fs, 0*/
1298 pushl %es 1232 pushl_cfi %es
1299 CFI_ADJUST_CFA_OFFSET 4
1300 /*CFI_REL_OFFSET es, 0*/ 1233 /*CFI_REL_OFFSET es, 0*/
1301 pushl %ds 1234 pushl_cfi %ds
1302 CFI_ADJUST_CFA_OFFSET 4
1303 /*CFI_REL_OFFSET ds, 0*/ 1235 /*CFI_REL_OFFSET ds, 0*/
1304 pushl %eax 1236 pushl_cfi %eax
1305 CFI_ADJUST_CFA_OFFSET 4
1306 CFI_REL_OFFSET eax, 0 1237 CFI_REL_OFFSET eax, 0
1307 pushl %ebp 1238 pushl_cfi %ebp
1308 CFI_ADJUST_CFA_OFFSET 4
1309 CFI_REL_OFFSET ebp, 0 1239 CFI_REL_OFFSET ebp, 0
1310 pushl %edi 1240 pushl_cfi %edi
1311 CFI_ADJUST_CFA_OFFSET 4
1312 CFI_REL_OFFSET edi, 0 1241 CFI_REL_OFFSET edi, 0
1313 pushl %esi 1242 pushl_cfi %esi
1314 CFI_ADJUST_CFA_OFFSET 4
1315 CFI_REL_OFFSET esi, 0 1243 CFI_REL_OFFSET esi, 0
1316 pushl %edx 1244 pushl_cfi %edx
1317 CFI_ADJUST_CFA_OFFSET 4
1318 CFI_REL_OFFSET edx, 0 1245 CFI_REL_OFFSET edx, 0
1319 pushl %ecx 1246 pushl_cfi %ecx
1320 CFI_ADJUST_CFA_OFFSET 4
1321 CFI_REL_OFFSET ecx, 0 1247 CFI_REL_OFFSET ecx, 0
1322 pushl %ebx 1248 pushl_cfi %ebx
1323 CFI_ADJUST_CFA_OFFSET 4
1324 CFI_REL_OFFSET ebx, 0 1249 CFI_REL_OFFSET ebx, 0
1325 cld 1250 cld
1326 movl $(__KERNEL_PERCPU), %ecx 1251 movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1287,9 @@ END(page_fault)
1362 movl TSS_sysenter_sp0 + \offset(%esp), %esp 1287 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1363 CFI_DEF_CFA esp, 0 1288 CFI_DEF_CFA esp, 0
1364 CFI_UNDEFINED eip 1289 CFI_UNDEFINED eip
1365 pushfl 1290 pushfl_cfi
1366 CFI_ADJUST_CFA_OFFSET 4 1291 pushl_cfi $__KERNEL_CS
1367 pushl $__KERNEL_CS 1292 pushl_cfi $sysenter_past_esp
1368 CFI_ADJUST_CFA_OFFSET 4
1369 pushl $sysenter_past_esp
1370 CFI_ADJUST_CFA_OFFSET 4
1371 CFI_REL_OFFSET eip, 0 1293 CFI_REL_OFFSET eip, 0
1372.endm 1294.endm
1373 1295
@@ -1377,8 +1299,7 @@ ENTRY(debug)
1377 jne debug_stack_correct 1299 jne debug_stack_correct
1378 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1300 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1379debug_stack_correct: 1301debug_stack_correct:
1380 pushl $-1 # mark this as an int 1302 pushl_cfi $-1 # mark this as an int
1381 CFI_ADJUST_CFA_OFFSET 4
1382 SAVE_ALL 1303 SAVE_ALL
1383 TRACE_IRQS_OFF 1304 TRACE_IRQS_OFF
1384 xorl %edx,%edx # error code 0 1305 xorl %edx,%edx # error code 0
@@ -1398,32 +1319,27 @@ END(debug)
1398 */ 1319 */
1399ENTRY(nmi) 1320ENTRY(nmi)
1400 RING0_INT_FRAME 1321 RING0_INT_FRAME
1401 pushl %eax 1322 pushl_cfi %eax
1402 CFI_ADJUST_CFA_OFFSET 4
1403 movl %ss, %eax 1323 movl %ss, %eax
1404 cmpw $__ESPFIX_SS, %ax 1324 cmpw $__ESPFIX_SS, %ax
1405 popl %eax 1325 popl_cfi %eax
1406 CFI_ADJUST_CFA_OFFSET -4
1407 je nmi_espfix_stack 1326 je nmi_espfix_stack
1408 cmpl $ia32_sysenter_target,(%esp) 1327 cmpl $ia32_sysenter_target,(%esp)
1409 je nmi_stack_fixup 1328 je nmi_stack_fixup
1410 pushl %eax 1329 pushl_cfi %eax
1411 CFI_ADJUST_CFA_OFFSET 4
1412 movl %esp,%eax 1330 movl %esp,%eax
1413 /* Do not access memory above the end of our stack page, 1331 /* Do not access memory above the end of our stack page,
1414 * it might not exist. 1332 * it might not exist.
1415 */ 1333 */
1416 andl $(THREAD_SIZE-1),%eax 1334 andl $(THREAD_SIZE-1),%eax
1417 cmpl $(THREAD_SIZE-20),%eax 1335 cmpl $(THREAD_SIZE-20),%eax
1418 popl %eax 1336 popl_cfi %eax
1419 CFI_ADJUST_CFA_OFFSET -4
1420 jae nmi_stack_correct 1337 jae nmi_stack_correct
1421 cmpl $ia32_sysenter_target,12(%esp) 1338 cmpl $ia32_sysenter_target,12(%esp)
1422 je nmi_debug_stack_check 1339 je nmi_debug_stack_check
1423nmi_stack_correct: 1340nmi_stack_correct:
1424 /* We have a RING0_INT_FRAME here */ 1341 /* We have a RING0_INT_FRAME here */
1425 pushl %eax 1342 pushl_cfi %eax
1426 CFI_ADJUST_CFA_OFFSET 4
1427 SAVE_ALL 1343 SAVE_ALL
1428 xorl %edx,%edx # zero error code 1344 xorl %edx,%edx # zero error code
1429 movl %esp,%eax # pt_regs pointer 1345 movl %esp,%eax # pt_regs pointer
@@ -1452,18 +1368,14 @@ nmi_espfix_stack:
1452 * 1368 *
1453 * create the pointer to lss back 1369 * create the pointer to lss back
1454 */ 1370 */
1455 pushl %ss 1371 pushl_cfi %ss
1456 CFI_ADJUST_CFA_OFFSET 4 1372 pushl_cfi %esp
1457 pushl %esp
1458 CFI_ADJUST_CFA_OFFSET 4
1459 addl $4, (%esp) 1373 addl $4, (%esp)
1460 /* copy the iret frame of 12 bytes */ 1374 /* copy the iret frame of 12 bytes */
1461 .rept 3 1375 .rept 3
1462 pushl 16(%esp) 1376 pushl_cfi 16(%esp)
1463 CFI_ADJUST_CFA_OFFSET 4
1464 .endr 1377 .endr
1465 pushl %eax 1378 pushl_cfi %eax
1466 CFI_ADJUST_CFA_OFFSET 4
1467 SAVE_ALL 1379 SAVE_ALL
1468 FIXUP_ESPFIX_STACK # %eax == %esp 1380 FIXUP_ESPFIX_STACK # %eax == %esp
1469 xorl %edx,%edx # zero error code 1381 xorl %edx,%edx # zero error code
@@ -1477,8 +1389,7 @@ END(nmi)
1477 1389
1478ENTRY(int3) 1390ENTRY(int3)
1479 RING0_INT_FRAME 1391 RING0_INT_FRAME
1480 pushl $-1 # mark this as an int 1392 pushl_cfi $-1 # mark this as an int
1481 CFI_ADJUST_CFA_OFFSET 4
1482 SAVE_ALL 1393 SAVE_ALL
1483 TRACE_IRQS_OFF 1394 TRACE_IRQS_OFF
1484 xorl %edx,%edx # zero error code 1395 xorl %edx,%edx # zero error code
@@ -1490,8 +1401,7 @@ END(int3)
1490 1401
1491ENTRY(general_protection) 1402ENTRY(general_protection)
1492 RING0_EC_FRAME 1403 RING0_EC_FRAME
1493 pushl $do_general_protection 1404 pushl_cfi $do_general_protection
1494 CFI_ADJUST_CFA_OFFSET 4
1495 jmp error_code 1405 jmp error_code
1496 CFI_ENDPROC 1406 CFI_ENDPROC
1497END(general_protection) 1407END(general_protection)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 17be5ec7cbba..fe2690d71c0c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64)
213 .macro FAKE_STACK_FRAME child_rip 213 .macro FAKE_STACK_FRAME child_rip
214 /* push in order ss, rsp, eflags, cs, rip */ 214 /* push in order ss, rsp, eflags, cs, rip */
215 xorl %eax, %eax 215 xorl %eax, %eax
216 pushq $__KERNEL_DS /* ss */ 216 pushq_cfi $__KERNEL_DS /* ss */
217 CFI_ADJUST_CFA_OFFSET 8
218 /*CFI_REL_OFFSET ss,0*/ 217 /*CFI_REL_OFFSET ss,0*/
219 pushq %rax /* rsp */ 218 pushq_cfi %rax /* rsp */
220 CFI_ADJUST_CFA_OFFSET 8
221 CFI_REL_OFFSET rsp,0 219 CFI_REL_OFFSET rsp,0
222 pushq $X86_EFLAGS_IF /* eflags - interrupts on */ 220 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
223 CFI_ADJUST_CFA_OFFSET 8
224 /*CFI_REL_OFFSET rflags,0*/ 221 /*CFI_REL_OFFSET rflags,0*/
225 pushq $__KERNEL_CS /* cs */ 222 pushq_cfi $__KERNEL_CS /* cs */
226 CFI_ADJUST_CFA_OFFSET 8
227 /*CFI_REL_OFFSET cs,0*/ 223 /*CFI_REL_OFFSET cs,0*/
228 pushq \child_rip /* rip */ 224 pushq_cfi \child_rip /* rip */
229 CFI_ADJUST_CFA_OFFSET 8
230 CFI_REL_OFFSET rip,0 225 CFI_REL_OFFSET rip,0
231 pushq %rax /* orig rax */ 226 pushq_cfi %rax /* orig rax */
232 CFI_ADJUST_CFA_OFFSET 8
233 .endm 227 .endm
234 228
235 .macro UNFAKE_STACK_FRAME 229 .macro UNFAKE_STACK_FRAME
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork)
398 392
399 LOCK ; btr $TIF_FORK,TI_flags(%r8) 393 LOCK ; btr $TIF_FORK,TI_flags(%r8)
400 394
401 push kernel_eflags(%rip) 395 pushq_cfi kernel_eflags(%rip)
402 CFI_ADJUST_CFA_OFFSET 8 396 popfq_cfi # reset kernel eflags
403 popf # reset kernel eflags
404 CFI_ADJUST_CFA_OFFSET -8
405 397
406 call schedule_tail # rdi: 'prev' task parameter 398 call schedule_tail # rdi: 'prev' task parameter
407 399
@@ -521,11 +513,9 @@ sysret_careful:
521 jnc sysret_signal 513 jnc sysret_signal
522 TRACE_IRQS_ON 514 TRACE_IRQS_ON
523 ENABLE_INTERRUPTS(CLBR_NONE) 515 ENABLE_INTERRUPTS(CLBR_NONE)
524 pushq %rdi 516 pushq_cfi %rdi
525 CFI_ADJUST_CFA_OFFSET 8
526 call schedule 517 call schedule
527 popq %rdi 518 popq_cfi %rdi
528 CFI_ADJUST_CFA_OFFSET -8
529 jmp sysret_check 519 jmp sysret_check
530 520
531 /* Handle a signal */ 521 /* Handle a signal */
@@ -634,11 +624,9 @@ int_careful:
634 jnc int_very_careful 624 jnc int_very_careful
635 TRACE_IRQS_ON 625 TRACE_IRQS_ON
636 ENABLE_INTERRUPTS(CLBR_NONE) 626 ENABLE_INTERRUPTS(CLBR_NONE)
637 pushq %rdi 627 pushq_cfi %rdi
638 CFI_ADJUST_CFA_OFFSET 8
639 call schedule 628 call schedule
640 popq %rdi 629 popq_cfi %rdi
641 CFI_ADJUST_CFA_OFFSET -8
642 DISABLE_INTERRUPTS(CLBR_NONE) 630 DISABLE_INTERRUPTS(CLBR_NONE)
643 TRACE_IRQS_OFF 631 TRACE_IRQS_OFF
644 jmp int_with_check 632 jmp int_with_check
@@ -652,12 +640,10 @@ int_check_syscall_exit_work:
652 /* Check for syscall exit trace */ 640 /* Check for syscall exit trace */
653 testl $_TIF_WORK_SYSCALL_EXIT,%edx 641 testl $_TIF_WORK_SYSCALL_EXIT,%edx
654 jz int_signal 642 jz int_signal
655 pushq %rdi 643 pushq_cfi %rdi
656 CFI_ADJUST_CFA_OFFSET 8
657 leaq 8(%rsp),%rdi # &ptregs -> arg1 644 leaq 8(%rsp),%rdi # &ptregs -> arg1
658 call syscall_trace_leave 645 call syscall_trace_leave
659 popq %rdi 646 popq_cfi %rdi
660 CFI_ADJUST_CFA_OFFSET -8
661 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 647 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
662 jmp int_restore_rest 648 jmp int_restore_rest
663 649
@@ -714,9 +700,8 @@ END(ptregscall_common)
714 700
715ENTRY(stub_execve) 701ENTRY(stub_execve)
716 CFI_STARTPROC 702 CFI_STARTPROC
717 popq %r11 703 addq $8, %rsp
718 CFI_ADJUST_CFA_OFFSET -8 704 PARTIAL_FRAME 0
719 CFI_REGISTER rip, r11
720 SAVE_REST 705 SAVE_REST
721 FIXUP_TOP_OF_STACK %r11 706 FIXUP_TOP_OF_STACK %r11
722 movq %rsp, %rcx 707 movq %rsp, %rcx
@@ -735,7 +720,7 @@ END(stub_execve)
735ENTRY(stub_rt_sigreturn) 720ENTRY(stub_rt_sigreturn)
736 CFI_STARTPROC 721 CFI_STARTPROC
737 addq $8, %rsp 722 addq $8, %rsp
738 CFI_ADJUST_CFA_OFFSET -8 723 PARTIAL_FRAME 0
739 SAVE_REST 724 SAVE_REST
740 movq %rsp,%rdi 725 movq %rsp,%rdi
741 FIXUP_TOP_OF_STACK %r11 726 FIXUP_TOP_OF_STACK %r11
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR
766 .if vector <> FIRST_EXTERNAL_VECTOR 751 .if vector <> FIRST_EXTERNAL_VECTOR
767 CFI_ADJUST_CFA_OFFSET -8 752 CFI_ADJUST_CFA_OFFSET -8
768 .endif 753 .endif
7691: pushq $(~vector+0x80) /* Note: always in signed byte range */ 7541: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
770 CFI_ADJUST_CFA_OFFSET 8
771 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 755 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
772 jmp 2f 756 jmp 2f
773 .endif 757 .endif
@@ -796,8 +780,8 @@ END(interrupt)
796 780
797/* 0(%rsp): ~(interrupt number) */ 781/* 0(%rsp): ~(interrupt number) */
798 .macro interrupt func 782 .macro interrupt func
799 subq $10*8, %rsp 783 subq $ORIG_RAX-ARGOFFSET+8, %rsp
800 CFI_ADJUST_CFA_OFFSET 10*8 784 CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
801 call save_args 785 call save_args
802 PARTIAL_FRAME 0 786 PARTIAL_FRAME 0
803 call \func 787 call \func
@@ -822,6 +806,7 @@ ret_from_intr:
822 TRACE_IRQS_OFF 806 TRACE_IRQS_OFF
823 decl PER_CPU_VAR(irq_count) 807 decl PER_CPU_VAR(irq_count)
824 leaveq 808 leaveq
809 CFI_RESTORE rbp
825 CFI_DEF_CFA_REGISTER rsp 810 CFI_DEF_CFA_REGISTER rsp
826 CFI_ADJUST_CFA_OFFSET -8 811 CFI_ADJUST_CFA_OFFSET -8
827exit_intr: 812exit_intr:
@@ -903,11 +888,9 @@ retint_careful:
903 jnc retint_signal 888 jnc retint_signal
904 TRACE_IRQS_ON 889 TRACE_IRQS_ON
905 ENABLE_INTERRUPTS(CLBR_NONE) 890 ENABLE_INTERRUPTS(CLBR_NONE)
906 pushq %rdi 891 pushq_cfi %rdi
907 CFI_ADJUST_CFA_OFFSET 8
908 call schedule 892 call schedule
909 popq %rdi 893 popq_cfi %rdi
910 CFI_ADJUST_CFA_OFFSET -8
911 GET_THREAD_INFO(%rcx) 894 GET_THREAD_INFO(%rcx)
912 DISABLE_INTERRUPTS(CLBR_NONE) 895 DISABLE_INTERRUPTS(CLBR_NONE)
913 TRACE_IRQS_OFF 896 TRACE_IRQS_OFF
@@ -956,8 +939,7 @@ END(common_interrupt)
956.macro apicinterrupt num sym do_sym 939.macro apicinterrupt num sym do_sym
957ENTRY(\sym) 940ENTRY(\sym)
958 INTR_FRAME 941 INTR_FRAME
959 pushq $~(\num) 942 pushq_cfi $~(\num)
960 CFI_ADJUST_CFA_OFFSET 8
961 interrupt \do_sym 943 interrupt \do_sym
962 jmp ret_from_intr 944 jmp ret_from_intr
963 CFI_ENDPROC 945 CFI_ENDPROC
@@ -981,22 +963,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
981 x86_platform_ipi smp_x86_platform_ipi 963 x86_platform_ipi smp_x86_platform_ipi
982 964
983#ifdef CONFIG_SMP 965#ifdef CONFIG_SMP
984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 966.irpc idx, "01234567"
985 invalidate_interrupt0 smp_invalidate_interrupt 967apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
986apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ 968 invalidate_interrupt\idx smp_invalidate_interrupt
987 invalidate_interrupt1 smp_invalidate_interrupt 969.endr
988apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
989 invalidate_interrupt2 smp_invalidate_interrupt
990apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
991 invalidate_interrupt3 smp_invalidate_interrupt
992apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
993 invalidate_interrupt4 smp_invalidate_interrupt
994apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
995 invalidate_interrupt5 smp_invalidate_interrupt
996apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
997 invalidate_interrupt6 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
999 invalidate_interrupt7 smp_invalidate_interrupt
1000#endif 970#endif
1001 971
1002apicinterrupt THRESHOLD_APIC_VECTOR \ 972apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1023,9 +993,9 @@ apicinterrupt ERROR_APIC_VECTOR \
1023apicinterrupt SPURIOUS_APIC_VECTOR \ 993apicinterrupt SPURIOUS_APIC_VECTOR \
1024 spurious_interrupt smp_spurious_interrupt 994 spurious_interrupt smp_spurious_interrupt
1025 995
1026#ifdef CONFIG_PERF_EVENTS 996#ifdef CONFIG_IRQ_WORK
1027apicinterrupt LOCAL_PENDING_VECTOR \ 997apicinterrupt IRQ_WORK_VECTOR \
1028 perf_pending_interrupt smp_perf_pending_interrupt 998 irq_work_interrupt smp_irq_work_interrupt
1029#endif 999#endif
1030 1000
1031/* 1001/*
@@ -1036,8 +1006,8 @@ ENTRY(\sym)
1036 INTR_FRAME 1006 INTR_FRAME
1037 PARAVIRT_ADJUST_EXCEPTION_FRAME 1007 PARAVIRT_ADJUST_EXCEPTION_FRAME
1038 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1008 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1039 subq $15*8,%rsp 1009 subq $ORIG_RAX-R15, %rsp
1040 CFI_ADJUST_CFA_OFFSET 15*8 1010 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1041 call error_entry 1011 call error_entry
1042 DEFAULT_FRAME 0 1012 DEFAULT_FRAME 0
1043 movq %rsp,%rdi /* pt_regs pointer */ 1013 movq %rsp,%rdi /* pt_regs pointer */
@@ -1052,9 +1022,9 @@ END(\sym)
1052ENTRY(\sym) 1022ENTRY(\sym)
1053 INTR_FRAME 1023 INTR_FRAME
1054 PARAVIRT_ADJUST_EXCEPTION_FRAME 1024 PARAVIRT_ADJUST_EXCEPTION_FRAME
1055 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1025 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1056 CFI_ADJUST_CFA_OFFSET 8 1026 subq $ORIG_RAX-R15, %rsp
1057 subq $15*8, %rsp 1027 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1058 call save_paranoid 1028 call save_paranoid
1059 TRACE_IRQS_OFF 1029 TRACE_IRQS_OFF
1060 movq %rsp,%rdi /* pt_regs pointer */ 1030 movq %rsp,%rdi /* pt_regs pointer */
@@ -1070,9 +1040,9 @@ END(\sym)
1070ENTRY(\sym) 1040ENTRY(\sym)
1071 INTR_FRAME 1041 INTR_FRAME
1072 PARAVIRT_ADJUST_EXCEPTION_FRAME 1042 PARAVIRT_ADJUST_EXCEPTION_FRAME
1073 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1043 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1074 CFI_ADJUST_CFA_OFFSET 8 1044 subq $ORIG_RAX-R15, %rsp
1075 subq $15*8, %rsp 1045 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1076 call save_paranoid 1046 call save_paranoid
1077 TRACE_IRQS_OFF 1047 TRACE_IRQS_OFF
1078 movq %rsp,%rdi /* pt_regs pointer */ 1048 movq %rsp,%rdi /* pt_regs pointer */
@@ -1089,8 +1059,8 @@ END(\sym)
1089ENTRY(\sym) 1059ENTRY(\sym)
1090 XCPT_FRAME 1060 XCPT_FRAME
1091 PARAVIRT_ADJUST_EXCEPTION_FRAME 1061 PARAVIRT_ADJUST_EXCEPTION_FRAME
1092 subq $15*8,%rsp 1062 subq $ORIG_RAX-R15, %rsp
1093 CFI_ADJUST_CFA_OFFSET 15*8 1063 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1094 call error_entry 1064 call error_entry
1095 DEFAULT_FRAME 0 1065 DEFAULT_FRAME 0
1096 movq %rsp,%rdi /* pt_regs pointer */ 1066 movq %rsp,%rdi /* pt_regs pointer */
@@ -1107,8 +1077,8 @@ END(\sym)
1107ENTRY(\sym) 1077ENTRY(\sym)
1108 XCPT_FRAME 1078 XCPT_FRAME
1109 PARAVIRT_ADJUST_EXCEPTION_FRAME 1079 PARAVIRT_ADJUST_EXCEPTION_FRAME
1110 subq $15*8,%rsp 1080 subq $ORIG_RAX-R15, %rsp
1111 CFI_ADJUST_CFA_OFFSET 15*8 1081 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1112 call save_paranoid 1082 call save_paranoid
1113 DEFAULT_FRAME 0 1083 DEFAULT_FRAME 0
1114 TRACE_IRQS_OFF 1084 TRACE_IRQS_OFF
@@ -1139,16 +1109,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
1139 /* edi: new selector */ 1109 /* edi: new selector */
1140ENTRY(native_load_gs_index) 1110ENTRY(native_load_gs_index)
1141 CFI_STARTPROC 1111 CFI_STARTPROC
1142 pushf 1112 pushfq_cfi
1143 CFI_ADJUST_CFA_OFFSET 8
1144 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) 1113 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1145 SWAPGS 1114 SWAPGS
1146gs_change: 1115gs_change:
1147 movl %edi,%gs 1116 movl %edi,%gs
11482: mfence /* workaround */ 11172: mfence /* workaround */
1149 SWAPGS 1118 SWAPGS
1150 popf 1119 popfq_cfi
1151 CFI_ADJUST_CFA_OFFSET -8
1152 ret 1120 ret
1153 CFI_ENDPROC 1121 CFI_ENDPROC
1154END(native_load_gs_index) 1122END(native_load_gs_index)
@@ -1215,8 +1183,7 @@ END(kernel_execve)
1215/* Call softirq on interrupt stack. Interrupts are off. */ 1183/* Call softirq on interrupt stack. Interrupts are off. */
1216ENTRY(call_softirq) 1184ENTRY(call_softirq)
1217 CFI_STARTPROC 1185 CFI_STARTPROC
1218 push %rbp 1186 pushq_cfi %rbp
1219 CFI_ADJUST_CFA_OFFSET 8
1220 CFI_REL_OFFSET rbp,0 1187 CFI_REL_OFFSET rbp,0
1221 mov %rsp,%rbp 1188 mov %rsp,%rbp
1222 CFI_DEF_CFA_REGISTER rbp 1189 CFI_DEF_CFA_REGISTER rbp
@@ -1225,6 +1192,7 @@ ENTRY(call_softirq)
1225 push %rbp # backlink for old unwinder 1192 push %rbp # backlink for old unwinder
1226 call __do_softirq 1193 call __do_softirq
1227 leaveq 1194 leaveq
1195 CFI_RESTORE rbp
1228 CFI_DEF_CFA_REGISTER rsp 1196 CFI_DEF_CFA_REGISTER rsp
1229 CFI_ADJUST_CFA_OFFSET -8 1197 CFI_ADJUST_CFA_OFFSET -8
1230 decl PER_CPU_VAR(irq_count) 1198 decl PER_CPU_VAR(irq_count)
@@ -1368,7 +1336,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
1368 1336
1369 /* ebx: no swapgs flag */ 1337 /* ebx: no swapgs flag */
1370ENTRY(paranoid_exit) 1338ENTRY(paranoid_exit)
1371 INTR_FRAME 1339 DEFAULT_FRAME
1372 DISABLE_INTERRUPTS(CLBR_NONE) 1340 DISABLE_INTERRUPTS(CLBR_NONE)
1373 TRACE_IRQS_OFF 1341 TRACE_IRQS_OFF
1374 testl %ebx,%ebx /* swapgs needed? */ 1342 testl %ebx,%ebx /* swapgs needed? */
@@ -1445,7 +1413,6 @@ error_swapgs:
1445error_sti: 1413error_sti:
1446 TRACE_IRQS_OFF 1414 TRACE_IRQS_OFF
1447 ret 1415 ret
1448 CFI_ENDPROC
1449 1416
1450/* 1417/*
1451 * There are two places in the kernel that can potentially fault with 1418 * There are two places in the kernel that can potentially fault with
@@ -1470,6 +1437,7 @@ bstep_iret:
1470 /* Fix truncated RIP */ 1437 /* Fix truncated RIP */
1471 movq %rcx,RIP+8(%rsp) 1438 movq %rcx,RIP+8(%rsp)
1472 jmp error_swapgs 1439 jmp error_swapgs
1440 CFI_ENDPROC
1473END(error_entry) 1441END(error_entry)
1474 1442
1475 1443
@@ -1498,8 +1466,8 @@ ENTRY(nmi)
1498 INTR_FRAME 1466 INTR_FRAME
1499 PARAVIRT_ADJUST_EXCEPTION_FRAME 1467 PARAVIRT_ADJUST_EXCEPTION_FRAME
1500 pushq_cfi $-1 1468 pushq_cfi $-1
1501 subq $15*8, %rsp 1469 subq $ORIG_RAX-R15, %rsp
1502 CFI_ADJUST_CFA_OFFSET 15*8 1470 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1503 call save_paranoid 1471 call save_paranoid
1504 DEFAULT_FRAME 0 1472 DEFAULT_FRAME 0
1505 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1473 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54ee..3afb33f14d2d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
257 return mod_code_status; 257 return mod_code_status;
258} 258}
259 259
260
261
262
263static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
264
265static unsigned char *ftrace_nop_replace(void) 260static unsigned char *ftrace_nop_replace(void)
266{ 261{
267 return ftrace_nop; 262 return ideal_nop5;
268} 263}
269 264
270static int 265static int
@@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
338 333
339int __init ftrace_dyn_arch_init(void *data) 334int __init ftrace_dyn_arch_init(void *data)
340{ 335{
341 extern const unsigned char ftrace_test_p6nop[];
342 extern const unsigned char ftrace_test_nop5[];
343 extern const unsigned char ftrace_test_jmp[];
344 int faulted = 0;
345
346 /*
347 * There is no good nop for all x86 archs.
348 * We will default to using the P6_NOP5, but first we
349 * will test to make sure that the nop will actually
350 * work on this CPU. If it faults, we will then
351 * go to a lesser efficient 5 byte nop. If that fails
352 * we then just use a jmp as our nop. This isn't the most
353 * efficient nop, but we can not use a multi part nop
354 * since we would then risk being preempted in the middle
355 * of that nop, and if we enabled tracing then, it might
356 * cause a system crash.
357 *
358 * TODO: check the cpuid to determine the best nop.
359 */
360 asm volatile (
361 "ftrace_test_jmp:"
362 "jmp ftrace_test_p6nop\n"
363 "nop\n"
364 "nop\n"
365 "nop\n" /* 2 byte jmp + 3 bytes */
366 "ftrace_test_p6nop:"
367 P6_NOP5
368 "jmp 1f\n"
369 "ftrace_test_nop5:"
370 ".byte 0x66,0x66,0x66,0x66,0x90\n"
371 "1:"
372 ".section .fixup, \"ax\"\n"
373 "2: movl $1, %0\n"
374 " jmp ftrace_test_nop5\n"
375 "3: movl $2, %0\n"
376 " jmp 1b\n"
377 ".previous\n"
378 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
379 _ASM_EXTABLE(ftrace_test_nop5, 3b)
380 : "=r"(faulted) : "0" (faulted));
381
382 switch (faulted) {
383 case 0:
384 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
385 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
386 break;
387 case 1:
388 pr_info("converting mcount calls to 66 66 66 66 90\n");
389 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
390 break;
391 case 2:
392 pr_info("converting mcount calls to jmp . + 5\n");
393 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
394 break;
395 }
396
397 /* The return code is retured via data */ 336 /* The return code is retured via data */
398 *(unsigned long *)data = 0; 337 *(unsigned long *)data = 0;
399 338
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..af0699ba48cf 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -1,5 +1,6 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include <linux/memblock.h>
3 4
4#include <asm/setup.h> 5#include <asm/setup.h>
5#include <asm/bios_ebda.h> 6#include <asm/bios_ebda.h>
@@ -51,5 +52,5 @@ void __init reserve_ebda_region(void)
51 lowmem = 0x9f000; 52 lowmem = 0x9f000;
52 53
53 /* reserve all memory between lowmem and the 1MB mark */ 54 /* reserve all memory between lowmem and the 1MB mark */
54 reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); 55 memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
55} 56}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 784360c0625c..763310165fa0 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/setup.h> 13#include <asm/setup.h>
13#include <asm/sections.h> 14#include <asm/sections.h>
@@ -17,6 +18,7 @@
17#include <asm/apic.h> 18#include <asm/apic.h>
18#include <asm/io_apic.h> 19#include <asm/io_apic.h>
19#include <asm/bios_ebda.h> 20#include <asm/bios_ebda.h>
21#include <asm/tlbflush.h>
20 22
21static void __init i386_default_early_setup(void) 23static void __init i386_default_early_setup(void)
22{ 24{
@@ -30,17 +32,18 @@ static void __init i386_default_early_setup(void)
30 32
31void __init i386_start_kernel(void) 33void __init i386_start_kernel(void)
32{ 34{
35 memblock_init();
36
33#ifdef CONFIG_X86_TRAMPOLINE 37#ifdef CONFIG_X86_TRAMPOLINE
34 /* 38 /*
35 * But first pinch a few for the stack/trampoline stuff 39 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix 40 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff) 41 * trampoline before removing it. (see the GDT stuff)
38 */ 42 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, 43 memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
40 "EX TRAMPOLINE");
41#endif 44#endif
42 45
43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 46 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
44 47
45#ifdef CONFIG_BLK_DEV_INITRD 48#ifdef CONFIG_BLK_DEV_INITRD
46 /* Reserve INITRD */ 49 /* Reserve INITRD */
@@ -49,7 +52,7 @@ void __init i386_start_kernel(void)
49 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 52 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
50 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 53 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 54 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 55 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
53 } 56 }
54#endif 57#endif
55 58
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7147143fd614..2d2673c28aff 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -12,6 +12,7 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/start_kernel.h> 13#include <linux/start_kernel.h>
14#include <linux/io.h> 14#include <linux/io.h>
15#include <linux/memblock.h>
15 16
16#include <asm/processor.h> 17#include <asm/processor.h>
17#include <asm/proto.h> 18#include <asm/proto.h>
@@ -79,6 +80,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
79 /* Cleanup the over mapped high alias */ 80 /* Cleanup the over mapped high alias */
80 cleanup_highmap(); 81 cleanup_highmap();
81 82
83 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
84
82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 85 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
83#ifdef CONFIG_EARLY_PRINTK 86#ifdef CONFIG_EARLY_PRINTK
84 set_intr_gate(i, &early_idt_handlers[i]); 87 set_intr_gate(i, &early_idt_handlers[i]);
@@ -98,7 +101,9 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 101{
99 copy_bootdata(__va(real_mode_data)); 102 copy_bootdata(__va(real_mode_data));
100 103
101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 104 memblock_init();
105
106 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
102 107
103#ifdef CONFIG_BLK_DEV_INITRD 108#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */ 109 /* Reserve INITRD */
@@ -107,7 +112,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 112 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 113 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 114 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 115 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
111 } 116 }
112#endif 117#endif
113 118
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fa8c1b8e09fb..bcece91dd311 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -183,13 +183,12 @@ default_entry:
183#ifdef CONFIG_X86_PAE 183#ifdef CONFIG_X86_PAE
184 184
185 /* 185 /*
186 * In PAE mode swapper_pg_dir is statically defined to contain enough 186 * In PAE mode initial_page_table is statically defined to contain
187 * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 187 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
188 * entries). The identity mapping is handled by pointing two PGD 188 * entries). The identity mapping is handled by pointing two PGD entries
189 * entries to the first kernel PMD. 189 * to the first kernel PMD.
190 * 190 *
191 * Note the upper half of each PMD or PTE are always zero at 191 * Note the upper half of each PMD or PTE are always zero at this stage.
192 * this stage.
193 */ 192 */
194 193
195#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ 194#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
@@ -197,7 +196,7 @@ default_entry:
197 xorl %ebx,%ebx /* %ebx is kept at zero */ 196 xorl %ebx,%ebx /* %ebx is kept at zero */
198 197
199 movl $pa(__brk_base), %edi 198 movl $pa(__brk_base), %edi
200 movl $pa(swapper_pg_pmd), %edx 199 movl $pa(initial_pg_pmd), %edx
201 movl $PTE_IDENT_ATTR, %eax 200 movl $PTE_IDENT_ATTR, %eax
20210: 20110:
203 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ 202 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
@@ -226,14 +225,14 @@ default_entry:
226 movl %eax, pa(max_pfn_mapped) 225 movl %eax, pa(max_pfn_mapped)
227 226
228 /* Do early initialization of the fixmap area */ 227 /* Do early initialization of the fixmap area */
229 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 228 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
230 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 229 movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
231#else /* Not PAE */ 230#else /* Not PAE */
232 231
233page_pde_offset = (__PAGE_OFFSET >> 20); 232page_pde_offset = (__PAGE_OFFSET >> 20);
234 233
235 movl $pa(__brk_base), %edi 234 movl $pa(__brk_base), %edi
236 movl $pa(swapper_pg_dir), %edx 235 movl $pa(initial_page_table), %edx
237 movl $PTE_IDENT_ATTR, %eax 236 movl $PTE_IDENT_ATTR, %eax
23810: 23710:
239 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ 238 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
@@ -257,8 +256,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 256 movl %eax, pa(max_pfn_mapped)
258 257
259 /* Do early initialization of the fixmap area */ 258 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 259 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 260 movl %eax,pa(initial_page_table+0xffc)
262#endif 261#endif
263 jmp 3f 262 jmp 3f
264/* 263/*
@@ -334,7 +333,7 @@ ENTRY(startup_32_smp)
334/* 333/*
335 * Enable paging 334 * Enable paging
336 */ 335 */
337 movl pa(initial_page_table), %eax 336 movl $pa(initial_page_table), %eax
338 movl %eax,%cr3 /* set the page table pointer.. */ 337 movl %eax,%cr3 /* set the page table pointer.. */
339 movl %cr0,%eax 338 movl %cr0,%eax
340 orl $X86_CR0_PG,%eax 339 orl $X86_CR0_PG,%eax
@@ -614,8 +613,6 @@ ignore_int:
614.align 4 613.align 4
615ENTRY(initial_code) 614ENTRY(initial_code)
616 .long i386_start_kernel 615 .long i386_start_kernel
617ENTRY(initial_page_table)
618 .long pa(swapper_pg_dir)
619 616
620/* 617/*
621 * BSS section 618 * BSS section
@@ -623,20 +620,18 @@ ENTRY(initial_page_table)
623__PAGE_ALIGNED_BSS 620__PAGE_ALIGNED_BSS
624 .align PAGE_SIZE_asm 621 .align PAGE_SIZE_asm
625#ifdef CONFIG_X86_PAE 622#ifdef CONFIG_X86_PAE
626swapper_pg_pmd: 623initial_pg_pmd:
627 .fill 1024*KPMDS,4,0 624 .fill 1024*KPMDS,4,0
628#else 625#else
629ENTRY(swapper_pg_dir) 626ENTRY(initial_page_table)
630 .fill 1024,4,0 627 .fill 1024,4,0
631#endif 628#endif
632swapper_pg_fixmap: 629initial_pg_fixmap:
633 .fill 1024,4,0 630 .fill 1024,4,0
634#ifdef CONFIG_X86_TRAMPOLINE
635ENTRY(trampoline_pg_dir)
636 .fill 1024,4,0
637#endif
638ENTRY(empty_zero_page) 631ENTRY(empty_zero_page)
639 .fill 4096,1,0 632 .fill 4096,1,0
633ENTRY(swapper_pg_dir)
634 .fill 1024,4,0
640 635
641/* 636/*
642 * This starts the data section. 637 * This starts the data section.
@@ -645,20 +640,20 @@ ENTRY(empty_zero_page)
645__PAGE_ALIGNED_DATA 640__PAGE_ALIGNED_DATA
646 /* Page-aligned for the benefit of paravirt? */ 641 /* Page-aligned for the benefit of paravirt? */
647 .align PAGE_SIZE_asm 642 .align PAGE_SIZE_asm
648ENTRY(swapper_pg_dir) 643ENTRY(initial_page_table)
649 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 644 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
650# if KPMDS == 3 645# if KPMDS == 3
651 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 646 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
652 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 647 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
653 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 648 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
654# elif KPMDS == 2 649# elif KPMDS == 2
655 .long 0,0 650 .long 0,0
656 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 651 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
657 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 652 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
658# elif KPMDS == 1 653# elif KPMDS == 1
659 .long 0,0 654 .long 0,0
660 .long 0,0 655 .long 0,0
661 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 656 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
662# else 657# else
663# error "Kernel PMDs should be 1, 2 or 3" 658# error "Kernel PMDs should be 1, 2 or 3"
664# endif 659# endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index efaf906daf93..ae03cab4352e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -380,44 +380,35 @@ static int hpet_next_event(unsigned long delta,
380 struct clock_event_device *evt, int timer) 380 struct clock_event_device *evt, int timer)
381{ 381{
382 u32 cnt; 382 u32 cnt;
383 s32 res;
383 384
384 cnt = hpet_readl(HPET_COUNTER); 385 cnt = hpet_readl(HPET_COUNTER);
385 cnt += (u32) delta; 386 cnt += (u32) delta;
386 hpet_writel(cnt, HPET_Tn_CMP(timer)); 387 hpet_writel(cnt, HPET_Tn_CMP(timer));
387 388
388 /* 389 /*
389 * We need to read back the CMP register on certain HPET 390 * HPETs are a complete disaster. The compare register is
390 * implementations (ATI chipsets) which seem to delay the 391 * based on a equal comparison and neither provides a less
391 * transfer of the compare register into the internal compare 392 * than or equal functionality (which would require to take
392 * logic. With small deltas this might actually be too late as 393 * the wraparound into account) nor a simple count down event
393 * the counter could already be higher than the compare value 394 * mode. Further the write to the comparator register is
394 * at that point and we would wait for the next hpet interrupt 395 * delayed internally up to two HPET clock cycles in certain
395 * forever. We found out that reading the CMP register back 396 * chipsets (ATI, ICH9,10). We worked around that by reading
396 * forces the transfer so we can rely on the comparison with 397 * back the compare register, but that required another
397 * the counter register below. If the read back from the 398 * workaround for ICH9,10 chips where the first readout after
398 * compare register does not match the value we programmed 399 * write can return the old stale value. We already have a
399 * then we might have a real hardware problem. We can not do 400 * minimum delta of 5us enforced, but a NMI or SMI hitting
400 * much about it here, but at least alert the user/admin with 401 * between the counter readout and the comparator write can
401 * a prominent warning. 402 * move us behind that point easily. Now instead of reading
402 * 403 * the compare register back several times, we make the ETIME
403 * An erratum on some chipsets (ICH9,..), results in 404 * decision based on the following: Return ETIME if the
404 * comparator read immediately following a write returning old 405 * counter value after the write is less than 8 HPET cycles
405 * value. Workaround for this is to read this value second 406 * away from the event or if the counter is already ahead of
406 * time, when first read returns old value. 407 * the event.
407 *
408 * In fact the write to the comparator register is delayed up
409 * to two HPET cycles so the workaround we tried to restrict
410 * the readback to those known to be borked ATI chipsets
411 * failed miserably. So we give up on optimizations forever
412 * and penalize all HPET incarnations unconditionally.
413 */ 408 */
414 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { 409 res = (s32)(cnt - hpet_readl(HPET_COUNTER));
415 if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
416 printk_once(KERN_WARNING
417 "hpet: compare register read back failed.\n");
418 }
419 410
420 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 411 return res < 8 ? -ETIME : 0;
421} 412}
422 413
423static void hpet_legacy_set_mode(enum clock_event_mode mode, 414static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -722,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
722 713
723 switch (action & 0xf) { 714 switch (action & 0xf) {
724 case CPU_ONLINE: 715 case CPU_ONLINE:
725 INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); 716 INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
726 init_completion(&work.complete); 717 init_completion(&work.complete);
727 /* FIXME: add schedule_work_on() */ 718 /* FIXME: add schedule_work_on() */
728 schedule_delayed_work_on(cpu, &work.work, 0); 719 schedule_delayed_work_on(cpu, &work.work, 0);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0c..58bb239a2fd7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
68 */ 68 */
69 69
70 if (!HAVE_HWFP) { 70 if (!HAVE_HWFP) {
71 /*
72 * Disable xsave as we do not support it if i387
73 * emulation is enabled.
74 */
75 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
76 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
71 xstate_size = sizeof(struct i387_soft_struct); 77 xstate_size = sizeof(struct i387_soft_struct);
72 return; 78 return;
73 } 79 }
74 80
75 if (cpu_has_fxsr) 81 if (cpu_has_fxsr)
76 xstate_size = sizeof(struct i387_fxsave_struct); 82 xstate_size = sizeof(struct i387_fxsave_struct);
77#ifdef CONFIG_X86_32
78 else 83 else
79 xstate_size = sizeof(struct i387_fsave_struct); 84 xstate_size = sizeof(struct i387_fsave_struct);
80#endif
81} 85}
82 86
83#ifdef CONFIG_X86_64
84/* 87/*
85 * Called at bootup to set up the initial FPU state that is later cloned 88 * Called at bootup to set up the initial FPU state that is later cloned
86 * into all processes. 89 * into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
88 91
89void __cpuinit fpu_init(void) 92void __cpuinit fpu_init(void)
90{ 93{
91 unsigned long oldcr0 = read_cr0(); 94 unsigned long cr0;
92 95 unsigned long cr4_mask = 0;
93 set_in_cr4(X86_CR4_OSFXSR);
94 set_in_cr4(X86_CR4_OSXMMEXCPT);
95 96
96 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 97 if (cpu_has_fxsr)
98 cr4_mask |= X86_CR4_OSFXSR;
99 if (cpu_has_xmm)
100 cr4_mask |= X86_CR4_OSXMMEXCPT;
101 if (cr4_mask)
102 set_in_cr4(cr4_mask);
103
104 cr0 = read_cr0();
105 cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
106 if (!HAVE_HWFP)
107 cr0 |= X86_CR0_EM;
108 write_cr0(cr0);
97 109
98 if (!smp_processor_id()) 110 if (!smp_processor_id())
99 init_thread_xstate(); 111 init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
104 clear_used_math(); 116 clear_used_math();
105} 117}
106 118
107#else /* CONFIG_X86_64 */
108
109void __cpuinit fpu_init(void)
110{
111 if (!smp_processor_id())
112 init_thread_xstate();
113}
114
115#endif /* CONFIG_X86_32 */
116
117void fpu_finit(struct fpu *fpu) 119void fpu_finit(struct fpu *fpu)
118{ 120{
119#ifdef CONFIG_X86_32
120 if (!HAVE_HWFP) { 121 if (!HAVE_HWFP) {
121 finit_soft_fpu(&fpu->state->soft); 122 finit_soft_fpu(&fpu->state->soft);
122 return; 123 return;
123 } 124 }
124#endif
125 125
126 if (cpu_has_fxsr) { 126 if (cpu_has_fxsr) {
127 struct i387_fxsave_struct *fx = &fpu->state->fxsave; 127 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
386#ifdef CONFIG_X86_64 386#ifdef CONFIG_X86_64
387 env->fip = fxsave->rip; 387 env->fip = fxsave->rip;
388 env->foo = fxsave->rdp; 388 env->foo = fxsave->rdp;
389 /*
390 * should be actually ds/cs at fpu exception time, but
391 * that information is not available in 64bit mode.
392 */
393 env->fcs = task_pt_regs(tsk)->cs;
389 if (tsk == current) { 394 if (tsk == current) {
390 /* 395 savesegment(ds, env->fos);
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
395 asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
396 } else { 396 } else {
397 struct pt_regs *regs = task_pt_regs(tsk); 397 env->fos = tsk->thread.ds;
398
399 env->fos = 0xffff0000 | tsk->thread.ds;
400 env->fcs = regs->cs;
401 } 398 }
399 env->fos |= 0xffff0000;
402#else 400#else
403 env->fip = fxsave->fip; 401 env->fip = fxsave->fip;
404 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); 402 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d765bdc48074..83ec0175f986 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
67 for_each_online_cpu(j) 67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance monitoring interrupts\n"); 69 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 70 seq_printf(p, "%*s: ", prec, "IWI");
71 for_each_online_cpu(j) 71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " IRQ work interrupts\n");
74#endif 74#endif
75 if (x86_platform_ipi_callback) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
185 sum += irq_stats(cpu)->apic_timer_irqs; 185 sum += irq_stats(cpu)->apic_timer_irqs;
186 sum += irq_stats(cpu)->irq_spurious_count; 186 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_irq_work_irqs;
189#endif 189#endif
190 if (x86_platform_ipi_callback) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->x86_platform_ipis; 191 sum += irq_stats(cpu)->x86_platform_ipis;
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 10709f29d166..64668dbf00a4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -49,21 +49,17 @@ static inline int check_stack_overflow(void) { return 0; }
49static inline void print_stack_overflow(void) { } 49static inline void print_stack_overflow(void) { }
50#endif 50#endif
51 51
52#ifdef CONFIG_4KSTACKS
53/* 52/*
54 * per-CPU IRQ handling contexts (thread information and stack) 53 * per-CPU IRQ handling contexts (thread information and stack)
55 */ 54 */
56union irq_ctx { 55union irq_ctx {
57 struct thread_info tinfo; 56 struct thread_info tinfo;
58 u32 stack[THREAD_SIZE/sizeof(u32)]; 57 u32 stack[THREAD_SIZE/sizeof(u32)];
59} __attribute__((aligned(PAGE_SIZE))); 58} __attribute__((aligned(THREAD_SIZE)));
60 59
61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); 60static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); 61static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
63 62
64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
66
67static void call_on_stack(void *func, void *stack) 63static void call_on_stack(void *func, void *stack)
68{ 64{
69 asm volatile("xchgl %%ebx,%%esp \n" 65 asm volatile("xchgl %%ebx,%%esp \n"
@@ -129,7 +125,7 @@ void __cpuinit irq_ctx_init(int cpu)
129 if (per_cpu(hardirq_ctx, cpu)) 125 if (per_cpu(hardirq_ctx, cpu))
130 return; 126 return;
131 127
132 irqctx = &per_cpu(hardirq_stack, cpu); 128 irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER);
133 irqctx->tinfo.task = NULL; 129 irqctx->tinfo.task = NULL;
134 irqctx->tinfo.exec_domain = NULL; 130 irqctx->tinfo.exec_domain = NULL;
135 irqctx->tinfo.cpu = cpu; 131 irqctx->tinfo.cpu = cpu;
@@ -138,7 +134,7 @@ void __cpuinit irq_ctx_init(int cpu)
138 134
139 per_cpu(hardirq_ctx, cpu) = irqctx; 135 per_cpu(hardirq_ctx, cpu) = irqctx;
140 136
141 irqctx = &per_cpu(softirq_stack, cpu); 137 irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER);
142 irqctx->tinfo.task = NULL; 138 irqctx->tinfo.task = NULL;
143 irqctx->tinfo.exec_domain = NULL; 139 irqctx->tinfo.exec_domain = NULL;
144 irqctx->tinfo.cpu = cpu; 140 irqctx->tinfo.cpu = cpu;
@@ -151,11 +147,6 @@ void __cpuinit irq_ctx_init(int cpu)
151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); 147 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
152} 148}
153 149
154void irq_ctx_exit(int cpu)
155{
156 per_cpu(hardirq_ctx, cpu) = NULL;
157}
158
159asmlinkage void do_softirq(void) 150asmlinkage void do_softirq(void)
160{ 151{
161 unsigned long flags; 152 unsigned long flags;
@@ -187,11 +178,6 @@ asmlinkage void do_softirq(void)
187 local_irq_restore(flags); 178 local_irq_restore(flags);
188} 179}
189 180
190#else
191static inline int
192execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
193#endif
194
195bool handle_irq(unsigned irq, struct pt_regs *regs) 181bool handle_irq(unsigned irq, struct pt_regs *regs)
196{ 182{
197 struct irq_desc *desc; 183 struct irq_desc *desc;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..ca8f703a1e70
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
1/*
2 * x86 specific code for irq_work
3 *
4 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 */
6
7#include <linux/kernel.h>
8#include <linux/irq_work.h>
9#include <linux/hardirq.h>
10#include <asm/apic.h>
11
12void smp_irq_work_interrupt(struct pt_regs *regs)
13{
14 irq_enter();
15 ack_APIC_irq();
16 inc_irq_stat(apic_irq_work_irqs);
17 irq_work_run();
18 irq_exit();
19}
20
21void arch_irq_work_raise(void)
22{
23#ifdef CONFIG_X86_LOCAL_APIC
24 if (!cpu_has_apic)
25 return;
26
27 apic->send_IPI_self(IRQ_WORK_VECTOR);
28 apic_wait_icr_idle();
29#endif
30}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a91ab503e24f..c752e973958d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -215,9 +215,9 @@ static void __init apic_intr_init(void)
215 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 215 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
216 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 216 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
217 217
218 /* Performance monitoring interrupts: */ 218 /* IRQ work interrupts: */
219# ifdef CONFIG_PERF_EVENTS 219# ifdef CONFIG_IRQ_WORK
220 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 220 alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
221# endif 221# endif
222 222
223#endif 223#endif
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..961b6b30ba90
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,50 @@
1/*
2 * jump label x86 support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/cpu.h>
14#include <asm/kprobes.h>
15#include <asm/alternative.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19union jump_code_union {
20 char code[JUMP_LABEL_NOP_SIZE];
21 struct {
22 char jump;
23 int offset;
24 } __attribute__((packed));
25};
26
27void arch_jump_label_transform(struct jump_entry *entry,
28 enum jump_label_type type)
29{
30 union jump_code_union code;
31
32 if (type == JUMP_LABEL_ENABLE) {
33 code.jump = 0xe9;
34 code.offset = entry->target -
35 (entry->code + JUMP_LABEL_NOP_SIZE);
36 } else
37 memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
38 get_online_cpus();
39 mutex_lock(&text_mutex);
40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
41 mutex_unlock(&text_mutex);
42 put_online_cpus();
43}
44
45void arch_jump_label_text_poke_early(jump_label_t addr)
46{
47 text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
48}
49
50#endif
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f10..90fcf62854bb 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file)
78static const struct file_operations fops_setup_data = { 78static const struct file_operations fops_setup_data = {
79 .read = setup_data_read, 79 .read = setup_data_read,
80 .open = setup_data_open, 80 .open = setup_data_open,
81 .llseek = default_llseek,
81}; 82};
82 83
83static int __init 84static int __init
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 852b81967a37..d81cfebb848f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -477,8 +477,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
477 raw_smp_processor_id()); 477 raw_smp_processor_id());
478 } 478 }
479 479
480 kgdb_correct_hw_break();
481
482 return 0; 480 return 0;
483 } 481 }
484 482
@@ -621,7 +619,12 @@ int kgdb_arch_init(void)
621static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, 619static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
622 struct perf_sample_data *data, struct pt_regs *regs) 620 struct perf_sample_data *data, struct pt_regs *regs)
623{ 621{
624 kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); 622 struct task_struct *tsk = current;
623 int i;
624
625 for (i = 0; i < 4; i++)
626 if (breakinfo[i].enabled)
627 tsk->thread.debugreg6 |= (DR_TRAP0 << i);
625} 628}
626 629
627void kgdb_arch_late(void) 630void kgdb_arch_late(void)
@@ -644,7 +647,7 @@ void kgdb_arch_late(void)
644 if (breakinfo[i].pev) 647 if (breakinfo[i].pev)
645 continue; 648 continue;
646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 649 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
647 if (IS_ERR(breakinfo[i].pev)) { 650 if (IS_ERR((void * __force)breakinfo[i].pev)) {
648 printk(KERN_ERR "kgdb: Could not allocate hw" 651 printk(KERN_ERR "kgdb: Could not allocate hw"
649 "breakpoints\nDisabling the kernel debugger\n"); 652 "breakpoints\nDisabling the kernel debugger\n");
650 breakinfo[i].pev = NULL; 653 breakinfo[i].pev = NULL;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 770ebfb349e9..1cbd54c0df99 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 return 0; 230 return 0;
231} 231}
232 232
233/* Dummy buffers for kallsyms_lookup */
234static char __dummy_buf[KSYM_NAME_LEN];
235
236/* Check if paddr is at an instruction boundary */ 233/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 234static int __kprobes can_probe(unsigned long paddr)
238{ 235{
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
241 struct insn insn; 238 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 239 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 240
244 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) 241 if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
245 return 0; 242 return 0;
246 243
247 /* Decode instructions */ 244 /* Decode instructions */
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1129 *(unsigned long *)addr = val; 1126 *(unsigned long *)addr = val;
1130} 1127}
1131 1128
1132void __kprobes kprobes_optinsn_template_holder(void) 1129static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{ 1130{
1134 asm volatile ( 1131 asm volatile (
1135 ".global optprobe_template_entry\n" 1132 ".global optprobe_template_entry\n"
@@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1221 } 1218 }
1222 /* Check whether the address range is reserved */ 1219 /* Check whether the address range is reserved */
1223 if (ftrace_text_reserved(src, src + len - 1) || 1220 if (ftrace_text_reserved(src, src + len - 1) ||
1224 alternatives_text_reserved(src, src + len - 1)) 1221 alternatives_text_reserved(src, src + len - 1) ||
1222 jump_label_text_reserved(src, src + len - 1))
1225 return -EBUSY; 1223 return -EBUSY;
1226 1224
1227 return len; 1225 return len;
@@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr)
1269 unsigned long addr, size = 0, offset = 0; 1267 unsigned long addr, size = 0, offset = 0;
1270 struct insn insn; 1268 struct insn insn;
1271 kprobe_opcode_t buf[MAX_INSN_SIZE]; 1269 kprobe_opcode_t buf[MAX_INSN_SIZE];
1272 /* Dummy buffers for lookup_symbol_attrs */
1273 static char __dummy_buf[KSYM_NAME_LEN];
1274 1270
1275 /* Lookup symbol including addr */ 1271 /* Lookup symbol including addr */
1276 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) 1272 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1277 return 0; 1273 return 0;
1278 1274
1279 /* Check there is enough space for a relative jump. */ 1275 /* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..ca43ce31a19c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -128,13 +128,15 @@ static struct clocksource kvm_clock = {
128static int kvm_register_clock(char *txt) 128static int kvm_register_clock(char *txt)
129{ 129{
130 int cpu = smp_processor_id(); 130 int cpu = smp_processor_id();
131 int low, high; 131 int low, high, ret;
132
132 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 133 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 134 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
135 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 136 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
135 cpu, high, low, txt); 137 cpu, high, low, txt);
136 138
137 return native_write_msr_safe(msr_kvm_system_time, low, high); 139 return ret;
138} 140}
139 141
140#ifdef CONFIG_X86_LOCAL_APIC 142#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fa6551d36c10..1cca374a2bac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
@@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = {
232 .owner = THIS_MODULE, 232 .owner = THIS_MODULE,
233 .write = microcode_write, 233 .write = microcode_write,
234 .open = microcode_open, 234 .open = microcode_open,
235 .llseek = no_llseek,
235}; 236};
236 237
237static struct miscdevice microcode_dev = { 238static struct miscdevice microcode_dev = {
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 356170262a93..dcb65cc0a053 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 1c355c550960..8f2956091735 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr,
239 apply_paravirt(pseg, pseg + para->sh_size); 239 apply_paravirt(pseg, pseg + para->sh_size);
240 } 240 }
241 241
242 /* make jump label nops */
243 jump_label_apply_nops(me);
244
242 return 0; 245 return 0;
243} 246}
244 247
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d7b6f7fb4fec..9af64d9c4b67 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -11,6 +11,7 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/memblock.h>
14#include <linux/kernel_stat.h> 15#include <linux/kernel_stat.h>
15#include <linux/mc146818rtc.h> 16#include <linux/mc146818rtc.h>
16#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -657,7 +658,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
657{ 658{
658 unsigned long size = get_mpc_size(mpf->physptr); 659 unsigned long size = get_mpc_size(mpf->physptr);
659 660
660 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); 661 memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
661} 662}
662 663
663static int __init smp_scan_config(unsigned long base, unsigned long length) 664static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -686,7 +687,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
686 mpf, (u64)virt_to_phys(mpf)); 687 mpf, (u64)virt_to_phys(mpf));
687 688
688 mem = virt_to_phys(mpf); 689 mem = virt_to_phys(mpf);
689 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); 690 memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
690 if (mpf->physptr) 691 if (mpf->physptr)
691 smp_reserve_memory(mpf); 692 smp_reserve_memory(mpf);
692 693
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..c5b250011fd4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
413 413
414 .alloc_pte = paravirt_nop, 414 .alloc_pte = paravirt_nop,
415 .alloc_pmd = paravirt_nop, 415 .alloc_pmd = paravirt_nop,
416 .alloc_pmd_clone = paravirt_nop,
417 .alloc_pud = paravirt_nop, 416 .alloc_pud = paravirt_nop,
418 .release_pte = paravirt_nop, 417 .release_pte = paravirt_nop,
419 .release_pmd = paravirt_nop, 418 .release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 078d4ec1a9d9..f56a117cef68 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -47,6 +47,7 @@
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h> 49#include <asm/x86_init.h>
50#include <asm/iommu_table.h>
50 51
51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 52#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
52int use_calgary __read_mostly = 1; 53int use_calgary __read_mostly = 1;
@@ -1364,7 +1365,7 @@ static int __init calgary_iommu_init(void)
1364 return 0; 1365 return 0;
1365} 1366}
1366 1367
1367void __init detect_calgary(void) 1368int __init detect_calgary(void)
1368{ 1369{
1369 int bus; 1370 int bus;
1370 void *tbl; 1371 void *tbl;
@@ -1378,13 +1379,13 @@ void __init detect_calgary(void)
1378 * another HW IOMMU already, bail out. 1379 * another HW IOMMU already, bail out.
1379 */ 1380 */
1380 if (no_iommu || iommu_detected) 1381 if (no_iommu || iommu_detected)
1381 return; 1382 return -ENODEV;
1382 1383
1383 if (!use_calgary) 1384 if (!use_calgary)
1384 return; 1385 return -ENODEV;
1385 1386
1386 if (!early_pci_allowed()) 1387 if (!early_pci_allowed())
1387 return; 1388 return -ENODEV;
1388 1389
1389 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); 1390 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
1390 1391
@@ -1410,13 +1411,13 @@ void __init detect_calgary(void)
1410 if (!rio_table_hdr) { 1411 if (!rio_table_hdr) {
1411 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " 1412 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
1412 "in EBDA - bailing!\n"); 1413 "in EBDA - bailing!\n");
1413 return; 1414 return -ENODEV;
1414 } 1415 }
1415 1416
1416 ret = build_detail_arrays(); 1417 ret = build_detail_arrays();
1417 if (ret) { 1418 if (ret) {
1418 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); 1419 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
1419 return; 1420 return -ENOMEM;
1420 } 1421 }
1421 1422
1422 specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 1423 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
@@ -1464,7 +1465,7 @@ void __init detect_calgary(void)
1464 1465
1465 x86_init.iommu.iommu_init = calgary_iommu_init; 1466 x86_init.iommu.iommu_init = calgary_iommu_init;
1466 } 1467 }
1467 return; 1468 return calgary_found;
1468 1469
1469cleanup: 1470cleanup:
1470 for (--bus; bus >= 0; --bus) { 1471 for (--bus; bus >= 0; --bus) {
@@ -1473,6 +1474,7 @@ cleanup:
1473 if (info->tce_space) 1474 if (info->tce_space)
1474 free_tce_table(info->tce_space); 1475 free_tce_table(info->tce_space);
1475 } 1476 }
1477 return -ENOMEM;
1476} 1478}
1477 1479
1478static int __init calgary_parse_options(char *p) 1480static int __init calgary_parse_options(char *p)
@@ -1594,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void)
1594 * and before device_initcall. 1596 * and before device_initcall.
1595 */ 1597 */
1596rootfs_initcall(calgary_fixup_tce_spaces); 1598rootfs_initcall(calgary_fixup_tce_spaces);
1599
1600IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9f07cfcbd3a5..9ea999a4dcc1 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,9 +11,8 @@
11#include <asm/iommu.h> 11#include <asm/iommu.h>
12#include <asm/gart.h> 12#include <asm/gart.h>
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 14#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h> 15#include <asm/iommu_table.h>
17 16
18static int forbid_dac __read_mostly; 17static int forbid_dac __read_mostly;
19 18
@@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0;
45 */ 44 */
46int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
47 46
47extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
48
48/* Dummy device used for NULL arguments (normally ISA). */ 49/* Dummy device used for NULL arguments (normally ISA). */
49struct device x86_dma_fallback_dev = { 50struct device x86_dma_fallback_dev = {
50 .init_name = "fallback device", 51 .init_name = "fallback device",
@@ -130,26 +131,24 @@ static void __init dma32_free_bootmem(void)
130 131
131void __init pci_iommu_alloc(void) 132void __init pci_iommu_alloc(void)
132{ 133{
134 struct iommu_table_entry *p;
135
133 /* free the range so iommu could get some range less than 4G */ 136 /* free the range so iommu could get some range less than 4G */
134 dma32_free_bootmem(); 137 dma32_free_bootmem();
135 138
136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) 139 sort_iommu_table(__iommu_table, __iommu_table_end);
137 goto out; 140 check_iommu_entries(__iommu_table, __iommu_table_end);
138
139 gart_iommu_hole_init();
140
141 detect_calgary();
142
143 detect_intel_iommu();
144 141
145 /* needs to be called after gart_iommu_hole_init */ 142 for (p = __iommu_table; p < __iommu_table_end; p++) {
146 amd_iommu_detect(); 143 if (p && p->detect && p->detect() > 0) {
147out: 144 p->flags |= IOMMU_DETECTED;
148 pci_xen_swiotlb_init(); 145 if (p->early_init)
149 146 p->early_init();
150 pci_swiotlb_init(); 147 if (p->flags & IOMMU_FINISH_IF_DETECTED)
148 break;
149 }
150 }
151} 151}
152
153void *dma_generic_alloc_coherent(struct device *dev, size_t size, 152void *dma_generic_alloc_coherent(struct device *dev, size_t size,
154 dma_addr_t *dma_addr, gfp_t flag) 153 dma_addr_t *dma_addr, gfp_t flag)
155{ 154{
@@ -292,6 +291,7 @@ EXPORT_SYMBOL(dma_supported);
292 291
293static int __init pci_iommu_init(void) 292static int __init pci_iommu_init(void)
294{ 293{
294 struct iommu_table_entry *p;
295 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); 295 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
296 296
297#ifdef CONFIG_PCI 297#ifdef CONFIG_PCI
@@ -299,12 +299,10 @@ static int __init pci_iommu_init(void)
299#endif 299#endif
300 x86_init.iommu.iommu_init(); 300 x86_init.iommu.iommu_init();
301 301
302 if (swiotlb || xen_swiotlb) { 302 for (p = __iommu_table; p < __iommu_table_end; p++) {
303 printk(KERN_INFO "PCI-DMA: " 303 if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
304 "Using software bounce buffering for IO (SWIOTLB)\n"); 304 p->late_init();
305 swiotlb_print_info(); 305 }
306 } else
307 swiotlb_free();
308 306
309 return 0; 307 return 0;
310} 308}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 0f7f130caa67..ba0f0ca9f280 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,8 +39,9 @@
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/swiotlb.h> 40#include <asm/swiotlb.h>
41#include <asm/dma.h> 41#include <asm/dma.h>
42#include <asm/k8.h> 42#include <asm/amd_nb.h>
43#include <asm/x86_init.h> 43#include <asm/x86_init.h>
44#include <asm/iommu_table.h>
44 45
45static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 46static unsigned long iommu_bus_base; /* GART remapping area (physical) */
46static unsigned long iommu_size; /* size of remapping area bytes */ 47static unsigned long iommu_size; /* size of remapping area bytes */
@@ -560,8 +561,11 @@ static void enable_gart_translations(void)
560{ 561{
561 int i; 562 int i;
562 563
563 for (i = 0; i < num_k8_northbridges; i++) { 564 if (!k8_northbridges.gart_supported)
564 struct pci_dev *dev = k8_northbridges[i]; 565 return;
566
567 for (i = 0; i < k8_northbridges.num; i++) {
568 struct pci_dev *dev = k8_northbridges.nb_misc[i];
565 569
566 enable_gart_translation(dev, __pa(agp_gatt_table)); 570 enable_gart_translation(dev, __pa(agp_gatt_table));
567 } 571 }
@@ -592,16 +596,19 @@ static void gart_fixup_northbridges(struct sys_device *dev)
592 if (!fix_up_north_bridges) 596 if (!fix_up_north_bridges)
593 return; 597 return;
594 598
599 if (!k8_northbridges.gart_supported)
600 return;
601
595 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 602 pr_info("PCI-DMA: Restoring GART aperture settings\n");
596 603
597 for (i = 0; i < num_k8_northbridges; i++) { 604 for (i = 0; i < k8_northbridges.num; i++) {
598 struct pci_dev *dev = k8_northbridges[i]; 605 struct pci_dev *dev = k8_northbridges.nb_misc[i];
599 606
600 /* 607 /*
601 * Don't enable translations just yet. That is the next 608 * Don't enable translations just yet. That is the next
602 * step. Restore the pre-suspend aperture settings. 609 * step. Restore the pre-suspend aperture settings.
603 */ 610 */
604 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); 611 gart_set_size_and_enable(dev, aperture_order);
605 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); 612 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
606 } 613 }
607} 614}
@@ -649,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
649 656
650 aper_size = aper_base = info->aper_size = 0; 657 aper_size = aper_base = info->aper_size = 0;
651 dev = NULL; 658 dev = NULL;
652 for (i = 0; i < num_k8_northbridges; i++) { 659 for (i = 0; i < k8_northbridges.num; i++) {
653 dev = k8_northbridges[i]; 660 dev = k8_northbridges.nb_misc[i];
654 new_aper_base = read_aperture(dev, &new_aper_size); 661 new_aper_base = read_aperture(dev, &new_aper_size);
655 if (!new_aper_base) 662 if (!new_aper_base)
656 goto nommu; 663 goto nommu;
@@ -718,10 +725,13 @@ static void gart_iommu_shutdown(void)
718 if (!no_agp) 725 if (!no_agp)
719 return; 726 return;
720 727
721 for (i = 0; i < num_k8_northbridges; i++) { 728 if (!k8_northbridges.gart_supported)
729 return;
730
731 for (i = 0; i < k8_northbridges.num; i++) {
722 u32 ctl; 732 u32 ctl;
723 733
724 dev = k8_northbridges[i]; 734 dev = k8_northbridges.nb_misc[i];
725 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 735 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
726 736
727 ctl &= ~GARTEN; 737 ctl &= ~GARTEN;
@@ -739,7 +749,7 @@ int __init gart_iommu_init(void)
739 unsigned long scratch; 749 unsigned long scratch;
740 long i; 750 long i;
741 751
742 if (num_k8_northbridges == 0) 752 if (!k8_northbridges.gart_supported)
743 return 0; 753 return 0;
744 754
745#ifndef CONFIG_AGP_AMD64 755#ifndef CONFIG_AGP_AMD64
@@ -896,3 +906,4 @@ void __init gart_parse_options(char *p)
896 } 906 }
897 } 907 }
898} 908}
909IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 000000000000..55d745ec1181
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,89 @@
1#include <linux/dma-mapping.h>
2#include <asm/iommu_table.h>
3#include <linux/string.h>
4#include <linux/kallsyms.h>
5
6
7#define DEBUG 1
8
9static struct iommu_table_entry * __init
10find_dependents_of(struct iommu_table_entry *start,
11 struct iommu_table_entry *finish,
12 struct iommu_table_entry *q)
13{
14 struct iommu_table_entry *p;
15
16 if (!q)
17 return NULL;
18
19 for (p = start; p < finish; p++)
20 if (p->detect == q->depend)
21 return p;
22
23 return NULL;
24}
25
26
27void __init sort_iommu_table(struct iommu_table_entry *start,
28 struct iommu_table_entry *finish) {
29
30 struct iommu_table_entry *p, *q, tmp;
31
32 for (p = start; p < finish; p++) {
33again:
34 q = find_dependents_of(start, finish, p);
35 /* We are bit sneaky here. We use the memory address to figure
36 * out if the node we depend on is past our point, if so, swap.
37 */
38 if (q > p) {
39 tmp = *p;
40 memmove(p, q, sizeof(*p));
41 *q = tmp;
42 goto again;
43 }
44 }
45
46}
47
48#ifdef DEBUG
49void __init check_iommu_entries(struct iommu_table_entry *start,
50 struct iommu_table_entry *finish)
51{
52 struct iommu_table_entry *p, *q, *x;
53 char sym_p[KSYM_SYMBOL_LEN];
54 char sym_q[KSYM_SYMBOL_LEN];
55
56 /* Simple cyclic dependency checker. */
57 for (p = start; p < finish; p++) {
58 q = find_dependents_of(start, finish, p);
59 x = find_dependents_of(start, finish, q);
60 if (p == x) {
61 sprint_symbol(sym_p, (unsigned long)p->detect);
62 sprint_symbol(sym_q, (unsigned long)q->detect);
63
64 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
65 " on %s and vice-versa. BREAKING IT.\n",
66 sym_p, sym_q);
67 /* Heavy handed way..*/
68 x->depend = 0;
69 }
70 }
71
72 for (p = start; p < finish; p++) {
73 q = find_dependents_of(p, finish, p);
74 if (q && q > p) {
75 sprint_symbol(sym_p, (unsigned long)p->detect);
76 sprint_symbol(sym_q, (unsigned long)q->detect);
77
78 printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
79 "should be called before %s!\n",
80 sym_p, sym_q);
81 }
82 }
83}
84#else
85inline void check_iommu_entries(struct iommu_table_entry *start,
86 struct iommu_table_entry *finish)
87{
88}
89#endif
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a5bc528d4328..8f972cbddef0 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -10,7 +10,8 @@
10#include <asm/iommu.h> 10#include <asm/iommu.h>
11#include <asm/swiotlb.h> 11#include <asm/swiotlb.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13#include <asm/xen/swiotlb-xen.h>
14#include <asm/iommu_table.h>
14int swiotlb __read_mostly; 15int swiotlb __read_mostly;
15 16
16static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -41,25 +42,42 @@ static struct dma_map_ops swiotlb_dma_ops = {
41}; 42};
42 43
43/* 44/*
44 * pci_swiotlb_detect - set swiotlb to 1 if necessary 45 * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
45 * 46 *
46 * This returns non-zero if we are forced to use swiotlb (by the boot 47 * This returns non-zero if we are forced to use swiotlb (by the boot
47 * option). 48 * option).
48 */ 49 */
49int __init pci_swiotlb_detect(void) 50int __init pci_swiotlb_detect_override(void)
50{ 51{
51 int use_swiotlb = swiotlb | swiotlb_force; 52 int use_swiotlb = swiotlb | swiotlb_force;
52 53
54 if (swiotlb_force)
55 swiotlb = 1;
56
57 return use_swiotlb;
58}
59IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
60 pci_xen_swiotlb_detect,
61 pci_swiotlb_init,
62 pci_swiotlb_late_init);
63
64/*
65 * if 4GB or more detected (and iommu=off not set) return 1
66 * and set swiotlb to 1.
67 */
68int __init pci_swiotlb_detect_4gb(void)
69{
53 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 70 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
54#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
55 if (!no_iommu && max_pfn > MAX_DMA32_PFN) 72 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
56 swiotlb = 1; 73 swiotlb = 1;
57#endif 74#endif
58 if (swiotlb_force) 75 return swiotlb;
59 swiotlb = 1;
60
61 return use_swiotlb;
62} 76}
77IOMMU_INIT(pci_swiotlb_detect_4gb,
78 pci_swiotlb_detect_override,
79 pci_swiotlb_init,
80 pci_swiotlb_late_init);
63 81
64void __init pci_swiotlb_init(void) 82void __init pci_swiotlb_init(void)
65{ 83{
@@ -68,3 +86,15 @@ void __init pci_swiotlb_init(void)
68 dma_ops = &swiotlb_dma_ops; 86 dma_ops = &swiotlb_dma_ops;
69 } 87 }
70} 88}
89
90void __init pci_swiotlb_late_init(void)
91{
92 /* An IOMMU turned us off. */
93 if (!swiotlb)
94 swiotlb_free();
95 else {
96 printk(KERN_INFO "PCI-DMA: "
97 "Using software bounce buffering for IO (SWIOTLB)\n");
98 swiotlb_print_info();
99 }
100}
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
24#include <asm/io.h>
25#include <asm/proto.h>
26#include <asm/msr.h>
27#include <asm/vsyscall.h>
28
29static inline u32 cyc2us(u32 cycles)
30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
32 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
33 *
34 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
35 * easily be multiplied with 286 (=0x11E) without having to fear
36 * u32 overflows.
37 */
38 cycles *= 286;
39 return (cycles >> 10);
40}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd1..b3d7a3a04f38 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
424 load_TLS(next, cpu); 424 load_TLS(next, cpu);
425 425
426 /* Must be after DS reload */ 426 /* Must be after DS reload */
427 unlazy_fpu(prev_p); 427 __unlazy_fpu(prev_p);
428 428
429 /* Make sure cpu is ready for new context */ 429 /* Make sure cpu is ready for new context */
430 if (preload_fpu) 430 if (preload_fpu)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 70c4872cd8aa..45892dc4b72a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child)
801static const struct user_regset_view user_x86_32_view; /* Initialized below. */ 801static const struct user_regset_view user_x86_32_view; /* Initialized below. */
802#endif 802#endif
803 803
804long arch_ptrace(struct task_struct *child, long request, long addr, long data) 804long arch_ptrace(struct task_struct *child, long request,
805 unsigned long addr, unsigned long data)
805{ 806{
806 int ret; 807 int ret;
807 unsigned long __user *datap = (unsigned long __user *)data; 808 unsigned long __user *datap = (unsigned long __user *)data;
@@ -812,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
812 unsigned long tmp; 813 unsigned long tmp;
813 814
814 ret = -EIO; 815 ret = -EIO;
815 if ((addr & (sizeof(data) - 1)) || addr < 0 || 816 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
816 addr >= sizeof(struct user))
817 break; 817 break;
818 818
819 tmp = 0; /* Default return condition */ 819 tmp = 0; /* Default return condition */
@@ -830,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
830 830
831 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ 831 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
832 ret = -EIO; 832 ret = -EIO;
833 if ((addr & (sizeof(data) - 1)) || addr < 0 || 833 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
834 addr >= sizeof(struct user))
835 break; 834 break;
836 835
837 if (addr < sizeof(struct user_regs_struct)) 836 if (addr < sizeof(struct user_regs_struct))
@@ -888,17 +887,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
888 887
889#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 888#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
890 case PTRACE_GET_THREAD_AREA: 889 case PTRACE_GET_THREAD_AREA:
891 if (addr < 0) 890 if ((int) addr < 0)
892 return -EIO; 891 return -EIO;
893 ret = do_get_thread_area(child, addr, 892 ret = do_get_thread_area(child, addr,
894 (struct user_desc __user *) data); 893 (struct user_desc __user *)data);
895 break; 894 break;
896 895
897 case PTRACE_SET_THREAD_AREA: 896 case PTRACE_SET_THREAD_AREA:
898 if (addr < 0) 897 if ((int) addr < 0)
899 return -EIO; 898 return -EIO;
900 ret = do_set_thread_area(child, addr, 899 ret = do_set_thread_area(child, addr,
901 (struct user_desc __user *) data, 0); 900 (struct user_desc __user *)data, 0);
902 break; 901 break;
903#endif 902#endif
904 903
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..bab3b9e6f66d 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) 82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
83{ 83{
84 u64 delta = native_read_tsc() - shadow->tsc_timestamp; 84 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
85 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); 85 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
86 shadow->tsc_shift);
86} 87}
87 88
88/* 89/*
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 939b9e98245f..8bbe8c56916d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
344 vt8237_force_enable_hpet); 344 vt8237_force_enable_hpet);
345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, 345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
346 vt8237_force_enable_hpet); 346 vt8237_force_enable_hpet);
347DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
348 vt8237_force_enable_hpet);
347 349
348static void ati_force_hpet_resume(void) 350static void ati_force_hpet_resume(void)
349{ 351{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 7a4cf14223ba..c495aa8d4815 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length)
371 CMOS_WRITE(0x00, 0x8f); 371 CMOS_WRITE(0x00, 0x8f);
372 spin_unlock(&rtc_lock); 372 spin_unlock(&rtc_lock);
373 373
374 /* Remap the kernel at virtual address zero, as well as offset zero
375 from the kernel segment. This assumes the kernel segment starts at
376 virtual address PAGE_OFFSET. */
377 memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
378 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
379
380 /* 374 /*
381 * Use `swapper_pg_dir' as our page directory. 375 * Switch back to the initial page table.
382 */ 376 */
383 load_cr3(swapper_pg_dir); 377 load_cr3(initial_page_table);
384 378
385 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads 379 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
386 this on booting to tell it to "Bypass memory test (also warm 380 this on booting to tell it to "Bypass memory test (also warm
@@ -641,7 +635,7 @@ void native_machine_shutdown(void)
641 /* O.K Now that I'm on the appropriate processor, 635 /* O.K Now that I'm on the appropriate processor,
642 * stop all of the others. 636 * stop all of the others.
643 */ 637 */
644 smp_send_stop(); 638 stop_other_cpus();
645#endif 639#endif
646 640
647 lapic_shutdown(); 641 lapic_shutdown();
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7d5ee08c982d..21c6746338af 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,6 +31,7 @@
31#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
32#include <linux/initrd.h> 32#include <linux/initrd.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/console.h> 36#include <linux/console.h>
36#include <linux/mca.h> 37#include <linux/mca.h>
@@ -83,7 +84,6 @@
83#include <asm/dmi.h> 84#include <asm/dmi.h>
84#include <asm/io_apic.h> 85#include <asm/io_apic.h>
85#include <asm/ist.h> 86#include <asm/ist.h>
86#include <asm/vmi.h>
87#include <asm/setup_arch.h> 87#include <asm/setup_arch.h>
88#include <asm/bios_ebda.h> 88#include <asm/bios_ebda.h>
89#include <asm/cacheflush.h> 89#include <asm/cacheflush.h>
@@ -107,11 +107,12 @@
107#include <asm/percpu.h> 107#include <asm/percpu.h>
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/k8.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h> 112#include <asm/numa_64.h>
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h>
115 116
116/* 117/*
117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 118 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -301,7 +302,7 @@ static inline void init_gbpages(void)
301static void __init reserve_brk(void) 302static void __init reserve_brk(void)
302{ 303{
303 if (_brk_end > _brk_start) 304 if (_brk_end > _brk_start)
304 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); 305 memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
305 306
306 /* Mark brk area as locked down and no longer taking any 307 /* Mark brk area as locked down and no longer taking any
307 new allocations */ 308 new allocations */
@@ -323,17 +324,16 @@ static void __init relocate_initrd(void)
323 char *p, *q; 324 char *p, *q;
324 325
325 /* We need to move the initrd down into lowmem */ 326 /* We need to move the initrd down into lowmem */
326 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, 327 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
327 PAGE_SIZE); 328 PAGE_SIZE);
328 329
329 if (ramdisk_here == -1ULL) 330 if (ramdisk_here == MEMBLOCK_ERROR)
330 panic("Cannot find place for new RAMDISK of size %lld\n", 331 panic("Cannot find place for new RAMDISK of size %lld\n",
331 ramdisk_size); 332 ramdisk_size);
332 333
333 /* Note: this includes all the lowmem currently occupied by 334 /* Note: this includes all the lowmem currently occupied by
334 the initrd, we rely on that fact to keep the data intact. */ 335 the initrd, we rely on that fact to keep the data intact. */
335 reserve_early(ramdisk_here, ramdisk_here + area_size, 336 memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
336 "NEW RAMDISK");
337 initrd_start = ramdisk_here + PAGE_OFFSET; 337 initrd_start = ramdisk_here + PAGE_OFFSET;
338 initrd_end = initrd_start + ramdisk_size; 338 initrd_end = initrd_start + ramdisk_size;
339 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 339 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -389,7 +389,7 @@ static void __init reserve_initrd(void)
389 initrd_start = 0; 389 initrd_start = 0;
390 390
391 if (ramdisk_size >= (end_of_lowmem>>1)) { 391 if (ramdisk_size >= (end_of_lowmem>>1)) {
392 free_early(ramdisk_image, ramdisk_end); 392 memblock_x86_free_range(ramdisk_image, ramdisk_end);
393 printk(KERN_ERR "initrd too large to handle, " 393 printk(KERN_ERR "initrd too large to handle, "
394 "disabling initrd\n"); 394 "disabling initrd\n");
395 return; 395 return;
@@ -412,7 +412,7 @@ static void __init reserve_initrd(void)
412 412
413 relocate_initrd(); 413 relocate_initrd();
414 414
415 free_early(ramdisk_image, ramdisk_end); 415 memblock_x86_free_range(ramdisk_image, ramdisk_end);
416} 416}
417#else 417#else
418static void __init reserve_initrd(void) 418static void __init reserve_initrd(void)
@@ -468,7 +468,7 @@ static void __init e820_reserve_setup_data(void)
468 e820_print_map("reserve setup_data"); 468 e820_print_map("reserve setup_data");
469} 469}
470 470
471static void __init reserve_early_setup_data(void) 471static void __init memblock_x86_reserve_range_setup_data(void)
472{ 472{
473 struct setup_data *data; 473 struct setup_data *data;
474 u64 pa_data; 474 u64 pa_data;
@@ -480,7 +480,7 @@ static void __init reserve_early_setup_data(void)
480 while (pa_data) { 480 while (pa_data) {
481 data = early_memremap(pa_data, sizeof(*data)); 481 data = early_memremap(pa_data, sizeof(*data));
482 sprintf(buf, "setup data %x", data->type); 482 sprintf(buf, "setup data %x", data->type);
483 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 483 memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
484 pa_data = data->next; 484 pa_data = data->next;
485 early_iounmap(data, sizeof(*data)); 485 early_iounmap(data, sizeof(*data));
486 } 486 }
@@ -501,6 +501,7 @@ static inline unsigned long long get_total_mem(void)
501 return total << PAGE_SHIFT; 501 return total << PAGE_SHIFT;
502} 502}
503 503
504#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF
504static void __init reserve_crashkernel(void) 505static void __init reserve_crashkernel(void)
505{ 506{
506 unsigned long long total_mem; 507 unsigned long long total_mem;
@@ -518,23 +519,27 @@ static void __init reserve_crashkernel(void)
518 if (crash_base <= 0) { 519 if (crash_base <= 0) {
519 const unsigned long long alignment = 16<<20; /* 16M */ 520 const unsigned long long alignment = 16<<20; /* 16M */
520 521
521 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, 522 /*
522 alignment); 523 * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX
523 if (crash_base == -1ULL) { 524 */
525 crash_base = memblock_find_in_range(alignment,
526 DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment);
527
528 if (crash_base == MEMBLOCK_ERROR) {
524 pr_info("crashkernel reservation failed - No suitable area found.\n"); 529 pr_info("crashkernel reservation failed - No suitable area found.\n");
525 return; 530 return;
526 } 531 }
527 } else { 532 } else {
528 unsigned long long start; 533 unsigned long long start;
529 534
530 start = find_e820_area(crash_base, ULONG_MAX, crash_size, 535 start = memblock_find_in_range(crash_base,
531 1<<20); 536 crash_base + crash_size, crash_size, 1<<20);
532 if (start != crash_base) { 537 if (start != crash_base) {
533 pr_info("crashkernel reservation failed - memory is in use.\n"); 538 pr_info("crashkernel reservation failed - memory is in use.\n");
534 return; 539 return;
535 } 540 }
536 } 541 }
537 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); 542 memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
538 543
539 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 544 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
540 "for crashkernel (System RAM: %ldMB)\n", 545 "for crashkernel (System RAM: %ldMB)\n",
@@ -614,82 +619,10 @@ static __init void reserve_ibft_region(void)
614 addr = find_ibft_region(&size); 619 addr = find_ibft_region(&size);
615 620
616 if (size) 621 if (size)
617 reserve_early_overlap_ok(addr, addr + size, "ibft"); 622 memblock_x86_reserve_range(addr, addr + size, "* ibft");
618} 623}
619 624
620#ifdef CONFIG_X86_RESERVE_LOW_64K 625static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
621static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
622{
623 printk(KERN_NOTICE
624 "%s detected: BIOS may corrupt low RAM, working around it.\n",
625 d->ident);
626
627 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
628 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
629
630 return 0;
631}
632#endif
633
634/* List of systems that have known low memory corruption BIOS problems */
635static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
636#ifdef CONFIG_X86_RESERVE_LOW_64K
637 {
638 .callback = dmi_low_memory_corruption,
639 .ident = "AMI BIOS",
640 .matches = {
641 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
642 },
643 },
644 {
645 .callback = dmi_low_memory_corruption,
646 .ident = "Phoenix BIOS",
647 .matches = {
648 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
649 },
650 },
651 {
652 .callback = dmi_low_memory_corruption,
653 .ident = "Phoenix/MSC BIOS",
654 .matches = {
655 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
656 },
657 },
658 /*
659 * AMI BIOS with low memory corruption was found on Intel DG45ID and
660 * DG45FC boards.
661 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
662 * match only DMI_BOARD_NAME and see if there is more bad products
663 * with this vendor.
664 */
665 {
666 .callback = dmi_low_memory_corruption,
667 .ident = "AMI BIOS",
668 .matches = {
669 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
670 },
671 },
672 {
673 .callback = dmi_low_memory_corruption,
674 .ident = "AMI BIOS",
675 .matches = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 },
678 },
679 /*
680 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
681 * match on the product name.
682 */
683 {
684 .callback = dmi_low_memory_corruption,
685 .ident = "Phoenix BIOS",
686 .matches = {
687 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
688 },
689 },
690#endif
691 {}
692};
693 626
694static void __init trim_bios_range(void) 627static void __init trim_bios_range(void)
695{ 628{
@@ -697,8 +630,14 @@ static void __init trim_bios_range(void)
697 * A special case is the first 4Kb of memory; 630 * A special case is the first 4Kb of memory;
698 * This is a BIOS owned area, not kernel ram, but generally 631 * This is a BIOS owned area, not kernel ram, but generally
699 * not listed as such in the E820 table. 632 * not listed as such in the E820 table.
633 *
634 * This typically reserves additional memory (64KiB by default)
635 * since some BIOSes are known to corrupt low memory. See the
636 * Kconfig help text for X86_RESERVE_LOW.
700 */ 637 */
701 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); 638 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
639 E820_RAM, E820_RESERVED);
640
702 /* 641 /*
703 * special case: Some BIOSen report the PC BIOS 642 * special case: Some BIOSen report the PC BIOS
704 * area (640->1Mb) as ram even though it is not. 643 * area (640->1Mb) as ram even though it is not.
@@ -708,6 +647,37 @@ static void __init trim_bios_range(void)
708 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 647 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
709} 648}
710 649
650static int __init parse_reservelow(char *p)
651{
652 unsigned long long size;
653
654 if (!p)
655 return -EINVAL;
656
657 size = memparse(p, &p);
658
659 if (size < 4096)
660 size = 4096;
661
662 if (size > 640*1024)
663 size = 640*1024;
664
665 reserve_low = size;
666
667 return 0;
668}
669
670early_param("reservelow", parse_reservelow);
671
672static u64 __init get_max_mapped(void)
673{
674 u64 end = max_pfn_mapped;
675
676 end <<= PAGE_SHIFT;
677
678 return end;
679}
680
711/* 681/*
712 * Determine if we were loaded by an EFI loader. If so, then we have also been 682 * Determine if we were loaded by an EFI loader. If so, then we have also been
713 * passed the efi memmap, systab, etc., so we should use these data structures 683 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -725,18 +695,30 @@ void __init setup_arch(char **cmdline_p)
725{ 695{
726 int acpi = 0; 696 int acpi = 0;
727 int k8 = 0; 697 int k8 = 0;
698 unsigned long flags;
728 699
729#ifdef CONFIG_X86_32 700#ifdef CONFIG_X86_32
730 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 701 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
731 visws_early_detect(); 702 visws_early_detect();
703
704 /*
705 * copy kernel address range established so far and switch
706 * to the proper swapper page table
707 */
708 clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
709 initial_page_table + KERNEL_PGD_BOUNDARY,
710 KERNEL_PGD_PTRS);
711
712 load_cr3(swapper_pg_dir);
713 __flush_tlb_all();
732#else 714#else
733 printk(KERN_INFO "Command line: %s\n", boot_command_line); 715 printk(KERN_INFO "Command line: %s\n", boot_command_line);
734#endif 716#endif
735 717
736 /* VMI may relocate the fixmap; do this before touching ioremap area */ 718 /*
737 vmi_init(); 719 * If we have OLPC OFW, we might end up relocating the fixmap due to
738 720 * reserve_top(), so do this before touching the ioremap area.
739 /* OFW also may relocate the fixmap */ 721 */
740 olpc_ofw_detect(); 722 olpc_ofw_detect();
741 723
742 early_trap_init(); 724 early_trap_init();
@@ -781,12 +763,14 @@ void __init setup_arch(char **cmdline_p)
781#endif 763#endif
782 4)) { 764 4)) {
783 efi_enabled = 1; 765 efi_enabled = 1;
784 efi_reserve_early(); 766 efi_memblock_x86_reserve_range();
785 } 767 }
786#endif 768#endif
787 769
788 x86_init.oem.arch_setup(); 770 x86_init.oem.arch_setup();
789 771
772 resource_alloc_from_bottom = 0;
773 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
790 setup_memory_map(); 774 setup_memory_map();
791 parse_setup_data(); 775 parse_setup_data();
792 /* update the e820_saved too */ 776 /* update the e820_saved too */
@@ -837,11 +821,8 @@ void __init setup_arch(char **cmdline_p)
837 821
838 x86_report_nx(); 822 x86_report_nx();
839 823
840 /* Must be before kernel pagetables are setup */
841 vmi_activate();
842
843 /* after early param, so could get panic from serial */ 824 /* after early param, so could get panic from serial */
844 reserve_early_setup_data(); 825 memblock_x86_reserve_range_setup_data();
845 826
846 if (acpi_mps_check()) { 827 if (acpi_mps_check()) {
847#ifdef CONFIG_X86_LOCAL_APIC 828#ifdef CONFIG_X86_LOCAL_APIC
@@ -862,8 +843,6 @@ void __init setup_arch(char **cmdline_p)
862 843
863 dmi_scan_machine(); 844 dmi_scan_machine();
864 845
865 dmi_check_system(bad_bios_dmi_table);
866
867 /* 846 /*
868 * VMware detection requires dmi to be available, so this 847 * VMware detection requires dmi to be available, so this
869 * needs to be done after dmi_scan_machine, for the BP. 848 * needs to be done after dmi_scan_machine, for the BP.
@@ -896,8 +875,6 @@ void __init setup_arch(char **cmdline_p)
896 */ 875 */
897 max_pfn = e820_end_of_ram_pfn(); 876 max_pfn = e820_end_of_ram_pfn();
898 877
899 /* preallocate 4k for mptable mpc */
900 early_reserve_e820_mpc_new();
901 /* update e820 for memory not covered by WB MTRRs */ 878 /* update e820 for memory not covered by WB MTRRs */
902 mtrr_bp_init(); 879 mtrr_bp_init();
903 if (mtrr_trim_uncached_memory(max_pfn)) 880 if (mtrr_trim_uncached_memory(max_pfn))
@@ -919,18 +896,8 @@ void __init setup_arch(char **cmdline_p)
919 max_low_pfn = max_pfn; 896 max_low_pfn = max_pfn;
920 897
921 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 898 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
922 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
923#endif 899#endif
924 900
925#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
926 setup_bios_corruption_check();
927#endif
928
929 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
930 max_pfn_mapped<<PAGE_SHIFT);
931
932 reserve_brk();
933
934 /* 901 /*
935 * Find and reserve possible boot-time SMP configuration: 902 * Find and reserve possible boot-time SMP configuration:
936 */ 903 */
@@ -938,6 +905,26 @@ void __init setup_arch(char **cmdline_p)
938 905
939 reserve_ibft_region(); 906 reserve_ibft_region();
940 907
908 /*
909 * Need to conclude brk, before memblock_x86_fill()
910 * it could use memblock_find_in_range, could overlap with
911 * brk area.
912 */
913 reserve_brk();
914
915 memblock.current_limit = get_max_mapped();
916 memblock_x86_fill();
917
918 /* preallocate 4k for mptable mpc */
919 early_reserve_e820_mpc_new();
920
921#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
922 setup_bios_corruption_check();
923#endif
924
925 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
926 max_pfn_mapped<<PAGE_SHIFT);
927
941 reserve_trampoline_memory(); 928 reserve_trampoline_memory();
942 929
943#ifdef CONFIG_ACPI_SLEEP 930#ifdef CONFIG_ACPI_SLEEP
@@ -961,6 +948,7 @@ void __init setup_arch(char **cmdline_p)
961 max_low_pfn = max_pfn; 948 max_low_pfn = max_pfn;
962 } 949 }
963#endif 950#endif
951 memblock.current_limit = get_max_mapped();
964 952
965 /* 953 /*
966 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 954 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -999,10 +987,7 @@ void __init setup_arch(char **cmdline_p)
999#endif 987#endif
1000 988
1001 initmem_init(0, max_pfn, acpi, k8); 989 initmem_init(0, max_pfn, acpi, k8);
1002#ifndef CONFIG_NO_BOOTMEM 990 memblock_find_dma_reserve();
1003 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
1004#endif
1005
1006 dma32_reserve_bootmem(); 991 dma32_reserve_bootmem();
1007 992
1008#ifdef CONFIG_KVM_CLOCK 993#ifdef CONFIG_KVM_CLOCK
@@ -1013,7 +998,12 @@ void __init setup_arch(char **cmdline_p)
1013 paging_init(); 998 paging_init();
1014 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 999 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1015 1000
1016 setup_trampoline_page_table(); 1001#ifdef CONFIG_X86_32
1002 /* sync back kernel address range */
1003 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
1004 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
1005 KERNEL_PGD_PTRS);
1006#endif
1017 1007
1018 tboot_probe(); 1008 tboot_probe();
1019 1009
@@ -1070,6 +1060,10 @@ void __init setup_arch(char **cmdline_p)
1070 x86_init.oem.banner(); 1060 x86_init.oem.banner();
1071 1061
1072 mcheck_init(); 1062 mcheck_init();
1063
1064 local_irq_save(flags);
1065 arch_init_ideal_nop5();
1066 local_irq_restore(flags);
1073} 1067}
1074 1068
1075#ifdef CONFIG_X86_32 1069#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2335c15c93a4..002b79685f73 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -131,13 +131,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
131 131
132static void __init pcpu_fc_free(void *ptr, size_t size) 132static void __init pcpu_fc_free(void *ptr, size_t size)
133{ 133{
134#ifdef CONFIG_NO_BOOTMEM
135 u64 start = __pa(ptr);
136 u64 end = start + size;
137 free_early_partial(start, end);
138#else
139 free_bootmem(__pa(ptr), size); 134 free_bootmem(__pa(ptr), size);
140#endif
141} 135}
142 136
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 137static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d801210945d6..513deac7228d 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void)
159 irq_exit(); 159 irq_exit();
160} 160}
161 161
162static void native_smp_send_stop(void) 162static void native_stop_other_cpus(int wait)
163{ 163{
164 unsigned long flags; 164 unsigned long flags;
165 unsigned long wait; 165 unsigned long timeout;
166 166
167 if (reboot_force) 167 if (reboot_force)
168 return; 168 return;
@@ -179,9 +179,12 @@ static void native_smp_send_stop(void)
179 if (num_online_cpus() > 1) { 179 if (num_online_cpus() > 1) {
180 apic->send_IPI_allbutself(REBOOT_VECTOR); 180 apic->send_IPI_allbutself(REBOOT_VECTOR);
181 181
182 /* Don't wait longer than a second */ 182 /*
183 wait = USEC_PER_SEC; 183 * Don't wait longer than a second if the caller
184 while (num_online_cpus() > 1 && wait--) 184 * didn't ask us to wait.
185 */
186 timeout = USEC_PER_SEC;
187 while (num_online_cpus() > 1 && (wait || timeout--))
185 udelay(1); 188 udelay(1);
186 } 189 }
187 190
@@ -227,7 +230,7 @@ struct smp_ops smp_ops = {
227 .smp_prepare_cpus = native_smp_prepare_cpus, 230 .smp_prepare_cpus = native_smp_prepare_cpus,
228 .smp_cpus_done = native_smp_cpus_done, 231 .smp_cpus_done = native_smp_cpus_done,
229 232
230 .smp_send_stop = native_smp_send_stop, 233 .stop_other_cpus = native_stop_other_cpus,
231 .smp_send_reschedule = native_smp_send_reschedule, 234 .smp_send_reschedule = native_smp_send_reschedule,
232 235
233 .cpu_up = native_cpu_up, 236 .cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 864b386f6c0e..083e99d1b7df 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
62#include <asm/pgtable.h> 62#include <asm/pgtable.h>
63#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/vmi.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/setup.h> 67#include <asm/setup.h>
68#include <asm/uv/uv.h> 68#include <asm/uv/uv.h>
@@ -299,23 +299,16 @@ notrace static void __cpuinit start_secondary(void *unused)
299 * fragile that we want to limit the things done here to the 299 * fragile that we want to limit the things done here to the
300 * most necessary things. 300 * most necessary things.
301 */ 301 */
302 cpu_init();
303 preempt_disable();
304 smp_callin();
302 305
303#ifdef CONFIG_X86_32 306#ifdef CONFIG_X86_32
304 /* 307 /* switch away from the initial page table */
305 * Switch away from the trampoline page-table
306 *
307 * Do this before cpu_init() because it needs to access per-cpu
308 * data which may not be mapped in the trampoline page-table.
309 */
310 load_cr3(swapper_pg_dir); 308 load_cr3(swapper_pg_dir);
311 __flush_tlb_all(); 309 __flush_tlb_all();
312#endif 310#endif
313 311
314 vmi_bringup();
315 cpu_init();
316 preempt_disable();
317 smp_callin();
318
319 /* otherwise gcc will move up smp_processor_id before the cpu_init */ 312 /* otherwise gcc will move up smp_processor_id before the cpu_init */
320 barrier(); 313 barrier();
321 /* 314 /*
@@ -397,6 +390,19 @@ void __cpuinit smp_store_cpu_info(int id)
397 identify_secondary_cpu(c); 390 identify_secondary_cpu(c);
398} 391}
399 392
393static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
394{
395 struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
396 struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
397
398 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
399 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
400 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
401 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
402 cpumask_set_cpu(cpu1, c2->llc_shared_map);
403 cpumask_set_cpu(cpu2, c1->llc_shared_map);
404}
405
400 406
401void __cpuinit set_cpu_sibling_map(int cpu) 407void __cpuinit set_cpu_sibling_map(int cpu)
402{ 408{
@@ -409,14 +415,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
409 for_each_cpu(i, cpu_sibling_setup_mask) { 415 for_each_cpu(i, cpu_sibling_setup_mask) {
410 struct cpuinfo_x86 *o = &cpu_data(i); 416 struct cpuinfo_x86 *o = &cpu_data(i);
411 417
412 if (c->phys_proc_id == o->phys_proc_id && 418 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
413 c->cpu_core_id == o->cpu_core_id) { 419 if (c->phys_proc_id == o->phys_proc_id &&
414 cpumask_set_cpu(i, cpu_sibling_mask(cpu)); 420 c->compute_unit_id == o->compute_unit_id)
415 cpumask_set_cpu(cpu, cpu_sibling_mask(i)); 421 link_thread_siblings(cpu, i);
416 cpumask_set_cpu(i, cpu_core_mask(cpu)); 422 } else if (c->phys_proc_id == o->phys_proc_id &&
417 cpumask_set_cpu(cpu, cpu_core_mask(i)); 423 c->cpu_core_id == o->cpu_core_id) {
418 cpumask_set_cpu(i, c->llc_shared_map); 424 link_thread_siblings(cpu, i);
419 cpumask_set_cpu(cpu, o->llc_shared_map);
420 } 425 }
421 } 426 }
422 } else { 427 } else {
@@ -742,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
742 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 747 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
743 }; 748 };
744 749
745 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); 750 INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
746 751
747 alternatives_smp_switch(1); 752 alternatives_smp_switch(1);
748 753
@@ -774,7 +779,6 @@ do_rest:
774#ifdef CONFIG_X86_32 779#ifdef CONFIG_X86_32
775 /* Stack for startup_32 can be just as for start_secondary onwards */ 780 /* Stack for startup_32 can be just as for start_secondary onwards */
776 irq_ctx_init(cpu); 781 irq_ctx_init(cpu);
777 initial_page_table = __pa(&trampoline_pg_dir);
778#else 782#else
779 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 783 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
780 initial_gs = per_cpu_offset(cpu); 784 initial_gs = per_cpu_offset(cpu);
@@ -923,7 +927,6 @@ int __cpuinit native_cpu_up(unsigned int cpu)
923 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 927 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
924 928
925 err = do_boot_cpu(apicid, cpu); 929 err = do_boot_cpu(apicid, cpu);
926
927 if (err) { 930 if (err) {
928 pr_debug("do_boot_cpu failed %d\n", err); 931 pr_debug("do_boot_cpu failed %d\n", err);
929 return -EIO; 932 return -EIO;
@@ -1370,7 +1373,6 @@ void play_dead_common(void)
1370{ 1373{
1371 idle_task_exit(); 1374 idle_task_exit();
1372 reset_lazy_tlbstate(); 1375 reset_lazy_tlbstate();
1373 irq_ctx_exit(raw_smp_processor_id());
1374 c1e_remove_cpu(raw_smp_processor_id()); 1376 c1e_remove_cpu(raw_smp_processor_id());
1375 1377
1376 mb(); 1378 mb();
@@ -1383,11 +1385,88 @@ void play_dead_common(void)
1383 local_irq_disable(); 1385 local_irq_disable();
1384} 1386}
1385 1387
1388/*
1389 * We need to flush the caches before going to sleep, lest we have
1390 * dirty data in our caches when we come back up.
1391 */
1392static inline void mwait_play_dead(void)
1393{
1394 unsigned int eax, ebx, ecx, edx;
1395 unsigned int highest_cstate = 0;
1396 unsigned int highest_subcstate = 0;
1397 int i;
1398 void *mwait_ptr;
1399
1400 if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
1401 return;
1402 if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
1403 return;
1404 if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
1405 return;
1406
1407 eax = CPUID_MWAIT_LEAF;
1408 ecx = 0;
1409 native_cpuid(&eax, &ebx, &ecx, &edx);
1410
1411 /*
1412 * eax will be 0 if EDX enumeration is not valid.
1413 * Initialized below to cstate, sub_cstate value when EDX is valid.
1414 */
1415 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1416 eax = 0;
1417 } else {
1418 edx >>= MWAIT_SUBSTATE_SIZE;
1419 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1420 if (edx & MWAIT_SUBSTATE_MASK) {
1421 highest_cstate = i;
1422 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1423 }
1424 }
1425 eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1426 (highest_subcstate - 1);
1427 }
1428
1429 /*
1430 * This should be a memory location in a cache line which is
1431 * unlikely to be touched by other processors. The actual
1432 * content is immaterial as it is not actually modified in any way.
1433 */
1434 mwait_ptr = &current_thread_info()->flags;
1435
1436 wbinvd();
1437
1438 while (1) {
1439 /*
1440 * The CLFLUSH is a workaround for erratum AAI65 for
1441 * the Xeon 7400 series. It's not clear it is actually
1442 * needed, but it should be harmless in either case.
1443 * The WBINVD is insufficient due to the spurious-wakeup
1444 * case where we return around the loop.
1445 */
1446 clflush(mwait_ptr);
1447 __monitor(mwait_ptr, 0, 0);
1448 mb();
1449 __mwait(eax, 0);
1450 }
1451}
1452
1453static inline void hlt_play_dead(void)
1454{
1455 if (current_cpu_data.x86 >= 4)
1456 wbinvd();
1457
1458 while (1) {
1459 native_halt();
1460 }
1461}
1462
1386void native_play_dead(void) 1463void native_play_dead(void)
1387{ 1464{
1388 play_dead_common(); 1465 play_dead_common();
1389 tboot_shutdown(TB_SHUTDOWN_WFS); 1466 tboot_shutdown(TB_SHUTDOWN_WFS);
1390 wbinvd_halt(); 1467
1468 mwait_play_dead(); /* Only returns on failure */
1469 hlt_play_dead();
1391} 1470}
1392 1471
1393#else /* ... !CONFIG_HOTPLUG_CPU */ 1472#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34a..0b0cb5fede19 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
33 const char *const envp[]) 33 const char *const envp[])
34{ 34{
35 long __res; 35 long __res;
36 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 36 asm volatile ("int $0x80"
37 : "=a" (__res) 37 : "=a" (__res)
38 : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); 38 : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
39 return __res; 39 return __res;
40} 40}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index e2a595257390..a375616d77f7 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,8 +1,8 @@
1#include <linux/io.h> 1#include <linux/io.h>
2#include <linux/memblock.h>
2 3
3#include <asm/trampoline.h> 4#include <asm/trampoline.h>
4#include <asm/pgtable.h> 5#include <asm/pgtable.h>
5#include <asm/e820.h>
6 6
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
8#define __trampinit 8#define __trampinit
@@ -17,15 +17,15 @@ unsigned char *__trampinitdata trampoline_base;
17 17
18void __init reserve_trampoline_memory(void) 18void __init reserve_trampoline_memory(void)
19{ 19{
20 unsigned long mem; 20 phys_addr_t mem;
21 21
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 22 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 23 mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
24 if (mem == -1L) 24 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 25 panic("Cannot allocate trampoline\n");
26 26
27 trampoline_base = __va(mem); 27 trampoline_base = __va(mem);
28 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 28 memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
29} 29}
30 30
31/* 31/*
@@ -38,19 +38,3 @@ unsigned long __trampinit setup_trampoline(void)
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
39 return virt_to_phys(trampoline_base); 39 return virt_to_phys(trampoline_base);
40} 40}
41
42void __init setup_trampoline_page_table(void)
43{
44#ifdef CONFIG_X86_32
45 /* Copy kernel address range */
46 clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
47 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
48 KERNEL_PGD_PTRS);
49
50 /* Initialize low mappings */
51 clone_pgd_range(trampoline_pg_dir,
52 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
53 min_t(unsigned long, KERNEL_PGD_PTRS,
54 KERNEL_PGD_BOUNDARY));
55#endif
56}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8a..cb838ca42c96 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -575,6 +575,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
575 if (regs->flags & X86_VM_MASK) { 575 if (regs->flags & X86_VM_MASK) {
576 handle_vm86_trap((struct kernel_vm86_regs *) regs, 576 handle_vm86_trap((struct kernel_vm86_regs *) regs,
577 error_code, 1); 577 error_code, 1);
578 preempt_conditional_cli(regs);
578 return; 579 return;
579 } 580 }
580 581
@@ -776,21 +777,10 @@ asmlinkage void math_state_restore(void)
776} 777}
777EXPORT_SYMBOL_GPL(math_state_restore); 778EXPORT_SYMBOL_GPL(math_state_restore);
778 779
779#ifndef CONFIG_MATH_EMULATION
780void math_emulate(struct math_emu_info *info)
781{
782 printk(KERN_EMERG
783 "math-emulation not enabled and no coprocessor found.\n");
784 printk(KERN_EMERG "killing %s.\n", current->comm);
785 force_sig(SIGFPE, current);
786 schedule();
787}
788#endif /* CONFIG_MATH_EMULATION */
789
790dotraplinkage void __kprobes 780dotraplinkage void __kprobes
791do_device_not_available(struct pt_regs *regs, long error_code) 781do_device_not_available(struct pt_regs *regs, long error_code)
792{ 782{
793#ifdef CONFIG_X86_32 783#ifdef CONFIG_MATH_EMULATION
794 if (read_cr0() & X86_CR0_EM) { 784 if (read_cr0() & X86_CR0_EM) {
795 struct math_emu_info info = { }; 785 struct math_emu_info info = { };
796 786
@@ -798,12 +788,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
798 788
799 info.regs = regs; 789 info.regs = regs;
800 math_emulate(&info); 790 math_emulate(&info);
801 } else { 791 return;
802 math_state_restore(); /* interrupts still off */
803 conditional_sti(regs);
804 } 792 }
805#else 793#endif
806 math_state_restore(); 794 math_state_restore(); /* interrupts still off */
795#ifdef CONFIG_X86_32
796 conditional_sti(regs);
807#endif 797#endif
808} 798}
809 799
@@ -881,18 +871,6 @@ void __init trap_init(void)
881#endif 871#endif
882 872
883#ifdef CONFIG_X86_32 873#ifdef CONFIG_X86_32
884 if (cpu_has_fxsr) {
885 printk(KERN_INFO "Enabling fast FPU save and restore... ");
886 set_in_cr4(X86_CR4_OSFXSR);
887 printk("done.\n");
888 }
889 if (cpu_has_xmm) {
890 printk(KERN_INFO
891 "Enabling unmasked SIMD FPU exception support... ");
892 set_in_cr4(X86_CR4_OSXMMEXCPT);
893 printk("done.\n");
894 }
895
896 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 874 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
897 set_bit(SYSCALL_VECTOR, used_vectors); 875 set_bit(SYSCALL_VECTOR, used_vectors);
898#endif 876#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a8..0c40d8b72416 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
104 104
105__setup("notsc", notsc_setup); 105__setup("notsc", notsc_setup);
106 106
107static int no_sched_irq_time;
108
107static int __init tsc_setup(char *str) 109static int __init tsc_setup(char *str)
108{ 110{
109 if (!strcmp(str, "reliable")) 111 if (!strcmp(str, "reliable"))
110 tsc_clocksource_reliable = 1; 112 tsc_clocksource_reliable = 1;
113 if (!strncmp(str, "noirqtime", 9))
114 no_sched_irq_time = 1;
111 return 1; 115 return 1;
112} 116}
113 117
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
801 if (!tsc_unstable) { 805 if (!tsc_unstable) {
802 tsc_unstable = 1; 806 tsc_unstable = 1;
803 sched_clock_stable = 0; 807 sched_clock_stable = 0;
808 disable_sched_clock_irqtime();
804 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 809 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
805 /* Change only the rating, when not registered */ 810 /* Change only the rating, when not registered */
806 if (clocksource_tsc.mult) 811 if (clocksource_tsc.mult)
@@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void)
892 clocksource_register_khz(&clocksource_tsc, tsc_khz); 897 clocksource_register_khz(&clocksource_tsc, tsc_khz);
893} 898}
894 899
895#ifdef CONFIG_X86_64
896/*
897 * calibrate_cpu is used on systems with fixed rate TSCs to determine
898 * processor frequency
899 */
900#define TICK_COUNT 100000000
901static unsigned long __init calibrate_cpu(void)
902{
903 int tsc_start, tsc_now;
904 int i, no_ctr_free;
905 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
906 unsigned long flags;
907
908 for (i = 0; i < 4; i++)
909 if (avail_to_resrv_perfctr_nmi_bit(i))
910 break;
911 no_ctr_free = (i == 4);
912 if (no_ctr_free) {
913 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
914 "cpu_khz value may be incorrect.\n");
915 i = 3;
916 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
917 wrmsrl(MSR_K7_EVNTSEL3, 0);
918 rdmsrl(MSR_K7_PERFCTR3, pmc3);
919 } else {
920 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
921 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
922 }
923 local_irq_save(flags);
924 /* start measuring cycles, incrementing from 0 */
925 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
926 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
927 rdtscl(tsc_start);
928 do {
929 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
930 tsc_now = get_cycles();
931 } while ((tsc_now - tsc_start) < TICK_COUNT);
932
933 local_irq_restore(flags);
934 if (no_ctr_free) {
935 wrmsrl(MSR_K7_EVNTSEL3, 0);
936 wrmsrl(MSR_K7_PERFCTR3, pmc3);
937 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
938 } else {
939 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
940 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
941 }
942
943 return pmc_now * tsc_khz / (tsc_now - tsc_start);
944}
945#else
946static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
947#endif
948
949void __init tsc_init(void) 900void __init tsc_init(void)
950{ 901{
951 u64 lpj; 902 u64 lpj;
@@ -964,10 +915,6 @@ void __init tsc_init(void)
964 return; 915 return;
965 } 916 }
966 917
967 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
968 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
969 cpu_khz = calibrate_cpu();
970
971 printk("Detected %lu.%03lu MHz processor.\n", 918 printk("Detected %lu.%03lu MHz processor.\n",
972 (unsigned long)cpu_khz / 1000, 919 (unsigned long)cpu_khz / 1000,
973 (unsigned long)cpu_khz % 1000); 920 (unsigned long)cpu_khz % 1000);
@@ -987,6 +934,9 @@ void __init tsc_init(void)
987 /* now allow native_sched_clock() to use rdtsc */ 934 /* now allow native_sched_clock() to use rdtsc */
988 tsc_disabled = 0; 935 tsc_disabled = 0;
989 936
937 if (!no_sched_irq_time)
938 enable_sched_clock_irqtime();
939
990 lpj = ((u64)tsc_khz * 1000); 940 lpj = ((u64)tsc_khz * 1000);
991 do_div(lpj, HZ); 941 do_div(lpj, HZ);
992 lpj_fine = lpj; 942 lpj_fine = lpj;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5ffb5622f793..61fb98519622 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -551,8 +551,14 @@ cannot_handle:
551int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) 551int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
552{ 552{
553 if (VMPI.is_vm86pus) { 553 if (VMPI.is_vm86pus) {
554 if ((trapno == 3) || (trapno == 1)) 554 if ((trapno == 3) || (trapno == 1)) {
555 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 555 KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
556 /* setting this flag forces the code in entry_32.S to
557 call save_v86_state() and change the stack pointer
558 to KVM86->regs32 */
559 set_thread_flag(TIF_IRET);
560 return 0;
561 }
556 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); 562 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
557 return 0; 563 return 0;
558 } 564 }
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <linux/gfp.h>
32#include <asm/vmi.h>
33#include <asm/io.h>
34#include <asm/fixmap.h>
35#include <asm/apicdef.h>
36#include <asm/apic.h>
37#include <asm/pgalloc.h>
38#include <asm/processor.h>
39#include <asm/timer.h>
40#include <asm/vmi_time.h>
41#include <asm/kmap_types.h>
42#include <asm/setup.h>
43
44/* Convenient for calling VMI functions indirectly in the ROM */
45typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
46typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
47
48#define call_vrom_func(rom,func) \
49 (((VROMFUNC *)(rom->func))())
50
51#define call_vrom_long_func(rom,func,arg) \
52 (((VROMLONGFUNC *)(rom->func)) (arg))
53
54static struct vrom_header *vmi_rom;
55static int disable_pge;
56static int disable_pse;
57static int disable_sep;
58static int disable_tsc;
59static int disable_mtrr;
60static int disable_noidle;
61static int disable_vmi_timer;
62
63/* Cached VMI operations */
64static struct {
65 void (*cpuid)(void /* non-c */);
66 void (*_set_ldt)(u32 selector);
67 void (*set_tr)(u32 selector);
68 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
69 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
70 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
71 void (*set_kernel_stack)(u32 selector, u32 sp0);
72 void (*allocate_page)(u32, u32, u32, u32, u32);
73 void (*release_page)(u32, u32);
74 void (*set_pte)(pte_t, pte_t *, unsigned);
75 void (*update_pte)(pte_t *, unsigned);
76 void (*set_linear_mapping)(int, void *, u32, u32);
77 void (*_flush_tlb)(int);
78 void (*set_initial_ap_state)(int, int);
79 void (*halt)(void);
80 void (*set_lazy_mode)(int mode);
81} vmi_ops;
82
83/* Cached VMI operations */
84struct vmi_timer_ops vmi_timer_ops;
85
86/*
87 * VMI patching routines.
88 */
89#define MNEM_CALL 0xe8
90#define MNEM_JMP 0xe9
91#define MNEM_RET 0xc3
92
93#define IRQ_PATCH_INT_MASK 0
94#define IRQ_PATCH_DISABLE 5
95
96static inline void patch_offset(void *insnbuf,
97 unsigned long ip, unsigned long dest)
98{
99 *(unsigned long *)(insnbuf+1) = dest-ip-5;
100}
101
102static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 unsigned long ip)
104{
105 u64 reloc;
106 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
107 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
108 switch(rel->type) {
109 case VMI_RELOCATION_CALL_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_CALL;
112 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_JUMP_REL:
116 BUG_ON(len < 5);
117 *(char *)insnbuf = MNEM_JMP;
118 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
119 return 5;
120
121 case VMI_RELOCATION_NOP:
122 /* obliterate the whole thing */
123 return 0;
124
125 case VMI_RELOCATION_NONE:
126 /* leave native code in place */
127 break;
128
129 default:
130 BUG();
131 }
132 return len;
133}
134
135/*
136 * Apply patch if appropriate, return length of new instruction
137 * sequence. The callee does nop padding for us.
138 */
139static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
140 unsigned long ip, unsigned len)
141{
142 switch (type) {
143 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
144 return patch_internal(VMI_CALL_DisableInterrupts, len,
145 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
147 return patch_internal(VMI_CALL_EnableInterrupts, len,
148 insns, ip);
149 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
150 return patch_internal(VMI_CALL_SetInterruptMask, len,
151 insns, ip);
152 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
153 return patch_internal(VMI_CALL_GetInterruptMask, len,
154 insns, ip);
155 case PARAVIRT_PATCH(pv_cpu_ops.iret):
156 return patch_internal(VMI_CALL_IRET, len, insns, ip);
157 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
158 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
159 default:
160 break;
161 }
162 return len;
163}
164
165/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
166static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
167 unsigned int *cx, unsigned int *dx)
168{
169 int override = 0;
170 if (*ax == 1)
171 override = 1;
172 asm volatile ("call *%6"
173 : "=a" (*ax),
174 "=b" (*bx),
175 "=c" (*cx),
176 "=d" (*dx)
177 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
178 if (override) {
179 if (disable_pse)
180 *dx &= ~X86_FEATURE_PSE;
181 if (disable_pge)
182 *dx &= ~X86_FEATURE_PGE;
183 if (disable_sep)
184 *dx &= ~X86_FEATURE_SEP;
185 if (disable_tsc)
186 *dx &= ~X86_FEATURE_TSC;
187 if (disable_mtrr)
188 *dx &= ~X86_FEATURE_MTRR;
189 }
190}
191
192static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
193{
194 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
195 write_gdt_entry(gdt, nr, new, 0);
196}
197
198static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
199{
200 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
201 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
202 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
203 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
204}
205
206static void vmi_set_ldt(const void *addr, unsigned entries)
207{
208 unsigned cpu = smp_processor_id();
209 struct desc_struct desc;
210
211 pack_descriptor(&desc, (unsigned long)addr,
212 entries * sizeof(struct desc_struct) - 1,
213 DESC_LDT, 0);
214 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
215 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
216}
217
218static void vmi_set_tr(void)
219{
220 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
221}
222
223static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
224{
225 u32 *idt_entry = (u32 *)g;
226 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
227}
228
229static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
230 const void *desc, int type)
231{
232 u32 *gdt_entry = (u32 *)desc;
233 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
234}
235
236static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
237 const void *desc)
238{
239 u32 *ldt_entry = (u32 *)desc;
240 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
241}
242
243static void vmi_load_sp0(struct tss_struct *tss,
244 struct thread_struct *thread)
245{
246 tss->x86_tss.sp0 = thread->sp0;
247
248 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
249 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
250 tss->x86_tss.ss1 = thread->sysenter_cs;
251 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
252 }
253 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
254}
255
256static void vmi_flush_tlb_user(void)
257{
258 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
259}
260
261static void vmi_flush_tlb_kernel(void)
262{
263 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
264}
265
266/* Stub to do nothing at all; used for delays and unimplemented calls */
267static void vmi_nop(void)
268{
269}
270
271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
272{
273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
274}
275
276static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
277{
278 /*
279 * This call comes in very early, before mem_map is setup.
280 * It is called only for swapper_pg_dir, which already has
281 * data on it.
282 */
283 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
284}
285
286static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
287{
288 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
289}
290
291static void vmi_release_pte(unsigned long pfn)
292{
293 vmi_ops.release_page(pfn, VMI_PAGE_L1);
294}
295
296static void vmi_release_pmd(unsigned long pfn)
297{
298 vmi_ops.release_page(pfn, VMI_PAGE_L2);
299}
300
301/*
302 * We use the pgd_free hook for releasing the pgd page:
303 */
304static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
305{
306 unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
307
308 vmi_ops.release_page(pfn, VMI_PAGE_L2);
309}
310
311/*
312 * Helper macros for MMU update flags. We can defer updates until a flush
313 * or page invalidation only if the update is to the current address space
314 * (otherwise, there is no flush). We must check against init_mm, since
315 * this could be a kernel update, which usually passes init_mm, although
316 * sometimes this check can be skipped if we know the particular function
317 * is only called on user mode PTEs. We could change the kernel to pass
318 * current->active_mm here, but in particular, I was unsure if changing
319 * mm/highmem.c to do this would still be correct on other architectures.
320 */
321#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
322 (!mustbeuser && (mm) == &init_mm))
323#define vmi_flags_addr(mm, addr, level, user) \
324 ((level) | (is_current_as(mm, user) ? \
325 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
326#define vmi_flags_addr_defer(mm, addr, level, user) \
327 ((level) | (is_current_as(mm, user) ? \
328 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
329
330static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
331{
332 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
333}
334
335static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
336{
337 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
338}
339
340static void vmi_set_pte(pte_t *ptep, pte_t pte)
341{
342 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
343 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
344}
345
346static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
347{
348 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
349}
350
351static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
352{
353#ifdef CONFIG_X86_PAE
354 const pte_t pte = { .pte = pmdval.pmd };
355#else
356 const pte_t pte = { pmdval.pud.pgd.pgd };
357#endif
358 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
359}
360
361#ifdef CONFIG_X86_PAE
362
363static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
364{
365 /*
366 * XXX This is called from set_pmd_pte, but at both PT
367 * and PD layers so the VMI_PAGE_PT flag is wrong. But
368 * it is only called for large page mapping changes,
369 * the Xen backend, doesn't support large pages, and the
370 * ESX backend doesn't depend on the flag.
371 */
372 set_64bit((unsigned long long *)ptep,pte_val(pteval));
373 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
374}
375
376static void vmi_set_pud(pud_t *pudp, pud_t pudval)
377{
378 /* Um, eww */
379 const pte_t pte = { .pte = pudval.pgd.pgd };
380 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
381}
382
383static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
384{
385 const pte_t pte = { .pte = 0 };
386 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
387}
388
389static void vmi_pmd_clear(pmd_t *pmd)
390{
391 const pte_t pte = { .pte = 0 };
392 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
393}
394#endif
395
396#ifdef CONFIG_SMP
397static void __devinit
398vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
399 unsigned long start_esp)
400{
401 struct vmi_ap_state ap;
402
403 /* Default everything to zero. This is fine for most GPRs. */
404 memset(&ap, 0, sizeof(struct vmi_ap_state));
405
406 ap.gdtr_limit = GDT_SIZE - 1;
407 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
408
409 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
410 ap.idtr_base = (unsigned long) idt_table;
411
412 ap.ldtr = 0;
413
414 ap.cs = __KERNEL_CS;
415 ap.eip = (unsigned long) start_eip;
416 ap.ss = __KERNEL_DS;
417 ap.esp = (unsigned long) start_esp;
418
419 ap.ds = __USER_DS;
420 ap.es = __USER_DS;
421 ap.fs = __KERNEL_PERCPU;
422 ap.gs = __KERNEL_STACK_CANARY;
423
424 ap.eflags = 0;
425
426#ifdef CONFIG_X86_PAE
427 /* efer should match BSP efer. */
428 if (cpu_has_nx) {
429 unsigned l, h;
430 rdmsr(MSR_EFER, l, h);
431 ap.efer = (unsigned long long) h << 32 | l;
432 }
433#endif
434
435 ap.cr3 = __pa(swapper_pg_dir);
436 /* Protected mode, paging, AM, WP, NE, MP. */
437 ap.cr0 = 0x80050023;
438 ap.cr4 = mmu_cr4_features;
439 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
440}
441#endif
442
443static void vmi_start_context_switch(struct task_struct *prev)
444{
445 paravirt_start_context_switch(prev);
446 vmi_ops.set_lazy_mode(2);
447}
448
449static void vmi_end_context_switch(struct task_struct *next)
450{
451 vmi_ops.set_lazy_mode(0);
452 paravirt_end_context_switch(next);
453}
454
455static void vmi_enter_lazy_mmu(void)
456{
457 paravirt_enter_lazy_mmu();
458 vmi_ops.set_lazy_mode(1);
459}
460
461static void vmi_leave_lazy_mmu(void)
462{
463 vmi_ops.set_lazy_mode(0);
464 paravirt_leave_lazy_mmu();
465}
466
467static inline int __init check_vmi_rom(struct vrom_header *rom)
468{
469 struct pci_header *pci;
470 struct pnp_header *pnp;
471 const char *manufacturer = "UNKNOWN";
472 const char *product = "UNKNOWN";
473 const char *license = "unspecified";
474
475 if (rom->rom_signature != 0xaa55)
476 return 0;
477 if (rom->vrom_signature != VMI_SIGNATURE)
478 return 0;
479 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
480 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
481 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
482 rom->api_version_maj,
483 rom->api_version_min);
484 return 0;
485 }
486
487 /*
488 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
489 * the PCI header and device type to make sure this is really a
490 * VMI device.
491 */
492 if (!rom->pci_header_offs) {
493 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
494 return 0;
495 }
496
497 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
498 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
499 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
500 /* Allow it to run... anyways, but warn */
501 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
502 }
503
504 if (rom->pnp_header_offs) {
505 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
506 if (pnp->manufacturer_offset)
507 manufacturer = (const char *)rom+pnp->manufacturer_offset;
508 if (pnp->product_offset)
509 product = (const char *)rom+pnp->product_offset;
510 }
511
512 if (rom->license_offs)
513 license = (char *)rom+rom->license_offs;
514
515 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
516 manufacturer, product,
517 rom->api_version_maj, rom->api_version_min,
518 pci->rom_version_maj, pci->rom_version_min);
519
520 /* Don't allow BSD/MIT here for now because we don't want to end up
521 with any binary only shim layers */
522 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
523 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
524 license);
525 return 0;
526 }
527
528 return 1;
529}
530
531/*
532 * Probe for the VMI option ROM
533 */
534static inline int __init probe_vmi_rom(void)
535{
536 unsigned long base;
537
538 /* VMI ROM is in option ROM area, check signature */
539 for (base = 0xC0000; base < 0xE0000; base += 2048) {
540 struct vrom_header *romstart;
541 romstart = (struct vrom_header *)isa_bus_to_virt(base);
542 if (check_vmi_rom(romstart)) {
543 vmi_rom = romstart;
544 return 1;
545 }
546 }
547 return 0;
548}
549
550/*
551 * VMI setup common to all processors
552 */
553void vmi_bringup(void)
554{
555 /* We must establish the lowmem mapping for MMU ops to work */
556 if (vmi_ops.set_linear_mapping)
557 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
558}
559
560/*
561 * Return a pointer to a VMI function or NULL if unimplemented
562 */
563static void *vmi_get_function(int vmicall)
564{
565 u64 reloc;
566 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
567 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
568 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
569 if (rel->type == VMI_RELOCATION_CALL_REL)
570 return (void *)rel->eip;
571 else
572 return NULL;
573}
574
575/*
576 * Helper macro for making the VMI paravirt-ops fill code readable.
577 * For unimplemented operations, fall back to default, unless nop
578 * is returned by the ROM.
579 */
580#define para_fill(opname, vmicall) \
581do { \
582 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
583 VMI_CALL_##vmicall); \
584 if (rel->type == VMI_RELOCATION_CALL_REL) \
585 opname = (void *)rel->eip; \
586 else if (rel->type == VMI_RELOCATION_NOP) \
587 opname = (void *)vmi_nop; \
588 else if (rel->type != VMI_RELOCATION_NONE) \
589 printk(KERN_WARNING "VMI: Unknown relocation " \
590 "type %d for " #vmicall"\n",\
591 rel->type); \
592} while (0)
593
594/*
595 * Helper macro for making the VMI paravirt-ops fill code readable.
596 * For cached operations which do not match the VMI ROM ABI and must
597 * go through a tranlation stub. Ignore NOPs, since it is not clear
598 * a NOP * VMI function corresponds to a NOP paravirt-op when the
599 * functions are not in 1-1 correspondence.
600 */
601#define para_wrap(opname, wrapper, cache, vmicall) \
602do { \
603 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
604 VMI_CALL_##vmicall); \
605 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
606 if (rel->type == VMI_RELOCATION_CALL_REL) { \
607 opname = wrapper; \
608 vmi_ops.cache = (void *)rel->eip; \
609 } \
610} while (0)
611
612/*
613 * Activate the VMI interface and switch into paravirtualized mode
614 */
615static inline int __init activate_vmi(void)
616{
617 short kernel_cs;
618 u64 reloc;
619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
628 printk(KERN_ERR "VMI ROM failed to initialize!");
629 return 0;
630 }
631 savesegment(cs, kernel_cs);
632
633 pv_info.paravirt_enabled = 1;
634 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
635 pv_info.name = "vmi [deprecated]";
636
637 pv_init_ops.patch = vmi_patch;
638
639 /*
640 * Many of these operations are ABI compatible with VMI.
641 * This means we can fill in the paravirt-ops with direct
642 * pointers into the VMI ROM. If the calling convention for
643 * these operations changes, this code needs to be updated.
644 *
645 * Exceptions
646 * CPUID paravirt-op uses pointers, not the native ISA
647 * halt has no VMI equivalent; all VMI halts are "safe"
648 * no MSR support yet - just trap and emulate. VMI uses the
649 * same ABI as the native ISA, but Linux wants exceptions
650 * from bogus MSR read / write handled
651 * rdpmc is not yet used in Linux
652 */
653
654 /* CPUID is special, so very special it gets wrapped like a present */
655 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
656
657 para_fill(pv_cpu_ops.clts, CLTS);
658 para_fill(pv_cpu_ops.get_debugreg, GetDR);
659 para_fill(pv_cpu_ops.set_debugreg, SetDR);
660 para_fill(pv_cpu_ops.read_cr0, GetCR0);
661 para_fill(pv_mmu_ops.read_cr2, GetCR2);
662 para_fill(pv_mmu_ops.read_cr3, GetCR3);
663 para_fill(pv_cpu_ops.read_cr4, GetCR4);
664 para_fill(pv_cpu_ops.write_cr0, SetCR0);
665 para_fill(pv_mmu_ops.write_cr2, SetCR2);
666 para_fill(pv_mmu_ops.write_cr3, SetCR3);
667 para_fill(pv_cpu_ops.write_cr4, SetCR4);
668
669 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
670 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
671 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
672 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
673
674 para_fill(pv_cpu_ops.wbinvd, WBINVD);
675 para_fill(pv_cpu_ops.read_tsc, RDTSC);
676
677 /* The following we emulate with trap and emulate for now */
678 /* paravirt_ops.read_msr = vmi_rdmsr */
679 /* paravirt_ops.write_msr = vmi_wrmsr */
680 /* paravirt_ops.rdpmc = vmi_rdpmc */
681
682 /* TR interface doesn't pass TR value, wrap */
683 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
684
685 /* LDT is special, too */
686 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
687
688 para_fill(pv_cpu_ops.load_gdt, SetGDT);
689 para_fill(pv_cpu_ops.load_idt, SetIDT);
690 para_fill(pv_cpu_ops.store_gdt, GetGDT);
691 para_fill(pv_cpu_ops.store_idt, GetIDT);
692 para_fill(pv_cpu_ops.store_tr, GetTR);
693 pv_cpu_ops.load_tls = vmi_load_tls;
694 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
695 write_ldt_entry, WriteLDTEntry);
696 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
697 write_gdt_entry, WriteGDTEntry);
698 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
699 write_idt_entry, WriteIDTEntry);
700 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
701 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
702 para_fill(pv_cpu_ops.io_delay, IODelay);
703
704 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
705 set_lazy_mode, SetLazyMode);
706 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
707 set_lazy_mode, SetLazyMode);
708
709 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
710 set_lazy_mode, SetLazyMode);
711 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
712 set_lazy_mode, SetLazyMode);
713
714 /* user and kernel flush are just handled with different flags to FlushTLB */
715 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
716 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
717 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
718
719 /*
720 * Until a standard flag format can be agreed on, we need to
721 * implement these as wrappers in Linux. Get the VMI ROM
722 * function pointers for the two backend calls.
723 */
724#ifdef CONFIG_X86_PAE
725 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
726 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
727#else
728 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
729 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
730#endif
731
732 if (vmi_ops.set_pte) {
733 pv_mmu_ops.set_pte = vmi_set_pte;
734 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
735 pv_mmu_ops.set_pmd = vmi_set_pmd;
736#ifdef CONFIG_X86_PAE
737 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
738 pv_mmu_ops.set_pud = vmi_set_pud;
739 pv_mmu_ops.pte_clear = vmi_pte_clear;
740 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
741#endif
742 }
743
744 if (vmi_ops.update_pte) {
745 pv_mmu_ops.pte_update = vmi_update_pte;
746 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
747 }
748
749 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
750 if (vmi_ops.allocate_page) {
751 pv_mmu_ops.alloc_pte = vmi_allocate_pte;
752 pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
753 pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
754 }
755
756 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
757 if (vmi_ops.release_page) {
758 pv_mmu_ops.release_pte = vmi_release_pte;
759 pv_mmu_ops.release_pmd = vmi_release_pmd;
760 pv_mmu_ops.pgd_free = vmi_pgd_free;
761 }
762
763 /* Set linear is needed in all cases */
764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
765
766 /*
767 * These MUST always be patched. Don't support indirect jumps
768 * through these operations, as the VMI interface may use either
769 * a jump or a call to get to these operations, depending on
770 * the backend. They are performance critical anyway, so requiring
771 * a patch is not a big problem.
772 */
773 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
774 pv_cpu_ops.iret = (void *)0xbadbab0;
775
776#ifdef CONFIG_SMP
777 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
778#endif
779
780#ifdef CONFIG_X86_LOCAL_APIC
781 para_fill(apic->read, APICRead);
782 para_fill(apic->write, APICWrite);
783#endif
784
785 /*
786 * Check for VMI timer functionality by probing for a cycle frequency method
787 */
788 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
789 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
790 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
791 vmi_timer_ops.get_cycle_counter =
792 vmi_get_function(VMI_CALL_GetCycleCounter);
793 vmi_timer_ops.get_wallclock =
794 vmi_get_function(VMI_CALL_GetWallclockTime);
795 vmi_timer_ops.wallclock_updated =
796 vmi_get_function(VMI_CALL_WallclockUpdated);
797 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
798 vmi_timer_ops.cancel_alarm =
799 vmi_get_function(VMI_CALL_CancelAlarm);
800 x86_init.timers.timer_init = vmi_time_init;
801#ifdef CONFIG_X86_LOCAL_APIC
802 x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
803 x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
804#endif
805 pv_time_ops.sched_clock = vmi_sched_clock;
806 x86_platform.calibrate_tsc = vmi_tsc_khz;
807 x86_platform.get_wallclock = vmi_get_wallclock;
808 x86_platform.set_wallclock = vmi_set_wallclock;
809
810 /* We have true wallclock functions; disable CMOS clock sync */
811 no_sync_cmos_clock = 1;
812 } else {
813 disable_noidle = 1;
814 disable_vmi_timer = 1;
815 }
816
817 para_fill(pv_irq_ops.safe_halt, Halt);
818
819 /*
820 * Alternative instruction rewriting doesn't happen soon enough
821 * to convert VMI_IRET to a call instead of a jump; so we have
822 * to do this before IRQs get reenabled. Fortunately, it is
823 * idempotent.
824 */
825 apply_paravirt(__parainstructions, __parainstructions_end);
826
827 vmi_bringup();
828
829 return 1;
830}
831
832#undef para_fill
833
834void __init vmi_init(void)
835{
836 if (!vmi_rom)
837 probe_vmi_rom();
838 else
839 check_vmi_rom(vmi_rom);
840
841 /* In case probing for or validating the ROM failed, basil */
842 if (!vmi_rom)
843 return;
844
845 reserve_top_address(-vmi_rom->virtual_top);
846
847#ifdef CONFIG_X86_IO_APIC
848 /* This is virtual hardware; timer routing is wired correctly */
849 no_timer_check = 1;
850#endif
851}
852
853void __init vmi_activate(void)
854{
855 unsigned long flags;
856
857 if (!vmi_rom)
858 return;
859
860 local_irq_save(flags);
861 activate_vmi();
862 local_irq_restore(flags & X86_EFLAGS_IF);
863}
864
865static int __init parse_vmi(char *arg)
866{
867 if (!arg)
868 return -EINVAL;
869
870 if (!strcmp(arg, "disable_pge")) {
871 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
872 disable_pge = 1;
873 } else if (!strcmp(arg, "disable_pse")) {
874 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
875 disable_pse = 1;
876 } else if (!strcmp(arg, "disable_sep")) {
877 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
878 disable_sep = 1;
879 } else if (!strcmp(arg, "disable_tsc")) {
880 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
881 disable_tsc = 1;
882 } else if (!strcmp(arg, "disable_mtrr")) {
883 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
884 disable_mtrr = 1;
885 } else if (!strcmp(arg, "disable_timer")) {
886 disable_vmi_timer = 1;
887 disable_noidle = 1;
888 } else if (!strcmp(arg, "disable_noidle"))
889 disable_noidle = 1;
890 return 0;
891}
892
893early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/apicdef.h>
32#include <asm/apic.h>
33#include <asm/timer.h>
34#include <asm/i8253.h>
35#include <asm/irq_vectors.h>
36
37#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
38#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
39
40static DEFINE_PER_CPU(struct clock_event_device, local_events);
41
42static inline u32 vmi_counter(u32 flags)
43{
44 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
45 * cycle counter. */
46 return flags & VMI_ALARM_COUNTER_MASK;
47}
48
49/* paravirt_ops.get_wallclock = vmi_get_wallclock */
50unsigned long vmi_get_wallclock(void)
51{
52 unsigned long long wallclock;
53 wallclock = vmi_timer_ops.get_wallclock(); // nsec
54 (void)do_div(wallclock, 1000000000); // sec
55
56 return wallclock;
57}
58
59/* paravirt_ops.set_wallclock = vmi_set_wallclock */
60int vmi_set_wallclock(unsigned long now)
61{
62 return 0;
63}
64
65/* paravirt_ops.sched_clock = vmi_sched_clock */
66unsigned long long vmi_sched_clock(void)
67{
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69}
70
71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void)
73{
74 unsigned long long khz;
75 khz = vmi_timer_ops.get_cycle_frequency();
76 (void)do_div(khz, 1000);
77 return khz;
78}
79
80static inline unsigned int vmi_get_timer_vector(void)
81{
82 return IRQ0_VECTOR;
83}
84
85/** vmi clockchip */
86#ifdef CONFIG_X86_LOCAL_APIC
87static unsigned int startup_timer_irq(unsigned int irq)
88{
89 unsigned long val = apic_read(APIC_LVTT);
90 apic_write(APIC_LVTT, vmi_get_timer_vector());
91
92 return (val & APIC_SEND_PENDING);
93}
94
95static void mask_timer_irq(unsigned int irq)
96{
97 unsigned long val = apic_read(APIC_LVTT);
98 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
99}
100
101static void unmask_timer_irq(unsigned int irq)
102{
103 unsigned long val = apic_read(APIC_LVTT);
104 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
105}
106
107static void ack_timer_irq(unsigned int irq)
108{
109 ack_APIC_irq();
110}
111
112static struct irq_chip vmi_chip __read_mostly = {
113 .name = "VMI-LOCAL",
114 .startup = startup_timer_irq,
115 .mask = mask_timer_irq,
116 .unmask = unmask_timer_irq,
117 .ack = ack_timer_irq
118};
119#endif
120
121/** vmi clockevent */
122#define VMI_ALARM_WIRED_IRQ0 0x00000000
123#define VMI_ALARM_WIRED_LVTT 0x00010000
124static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
125
126static inline int vmi_get_alarm_wiring(void)
127{
128 return vmi_wiring;
129}
130
131static void vmi_timer_set_mode(enum clock_event_mode mode,
132 struct clock_event_device *evt)
133{
134 cycle_t now, cycles_per_hz;
135 BUG_ON(!irqs_disabled());
136
137 switch (mode) {
138 case CLOCK_EVT_MODE_ONESHOT:
139 case CLOCK_EVT_MODE_RESUME:
140 break;
141 case CLOCK_EVT_MODE_PERIODIC:
142 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
143 (void)do_div(cycles_per_hz, HZ);
144 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
145 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
146 break;
147 case CLOCK_EVT_MODE_UNUSED:
148 case CLOCK_EVT_MODE_SHUTDOWN:
149 switch (evt->mode) {
150 case CLOCK_EVT_MODE_ONESHOT:
151 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
152 break;
153 case CLOCK_EVT_MODE_PERIODIC:
154 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
155 break;
156 default:
157 break;
158 }
159 break;
160 default:
161 break;
162 }
163}
164
165static int vmi_timer_next_event(unsigned long delta,
166 struct clock_event_device *evt)
167{
168 /* Unfortunately, set_next_event interface only passes relative
169 * expiry, but we want absolute expiry. It'd be better if were
170 * were passed an absolute expiry, since a bunch of time may
171 * have been stolen between the time the delta is computed and
172 * when we set the alarm below. */
173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
174
175 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
176 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
177 return 0;
178}
179
180static struct clock_event_device vmi_clockevent = {
181 .name = "vmi-timer",
182 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
183 .shift = 22,
184 .set_mode = vmi_timer_set_mode,
185 .set_next_event = vmi_timer_next_event,
186 .rating = 1000,
187 .irq = 0,
188};
189
190static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
191{
192 struct clock_event_device *evt = &__get_cpu_var(local_events);
193 evt->event_handler(evt);
194 return IRQ_HANDLED;
195}
196
197static struct irqaction vmi_clock_action = {
198 .name = "vmi-timer",
199 .handler = vmi_timer_interrupt,
200 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
201};
202
203static void __devinit vmi_time_init_clockevent(void)
204{
205 cycle_t cycles_per_msec;
206 struct clock_event_device *evt;
207
208 int cpu = smp_processor_id();
209 evt = &__get_cpu_var(local_events);
210
211 /* Use cycles_per_msec since div_sc params are 32-bits. */
212 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
213 (void)do_div(cycles_per_msec, 1000);
214
215 memcpy(evt, &vmi_clockevent, sizeof(*evt));
216 /* Must pick .shift such that .mult fits in 32-bits. Choosing
217 * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
218 * before overflow. */
219 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
220 /* Upper bound is clockevent's use of ulong for cycle deltas. */
221 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
222 evt->min_delta_ns = clockevent_delta2ns(1, evt);
223 evt->cpumask = cpumask_of(cpu);
224
225 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
226 evt->name, evt->mult, evt->shift);
227 clockevents_register_device(evt);
228}
229
230void __init vmi_time_init(void)
231{
232 unsigned int cpu;
233 /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
234 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
235
236 vmi_time_init_clockevent();
237 setup_irq(0, &vmi_clock_action);
238 for_each_possible_cpu(cpu)
239 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
240}
241
242#ifdef CONFIG_X86_LOCAL_APIC
243void __devinit vmi_time_bsp_init(void)
244{
245 /*
246 * On APIC systems, we want local timers to fire on each cpu. We do
247 * this by programming LVTT to deliver timer events to the IRQ handler
248 * for IRQ-0, since we can't re-use the APIC local timer handler
249 * without interfering with that code.
250 */
251 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
252 local_irq_disable();
253#ifdef CONFIG_SMP
254 /*
255 * XXX handle_percpu_irq only defined for SMP; we need to switch over
256 * to using it, since this is a local interrupt, which each CPU must
257 * handle individually without locking out or dropping simultaneous
258 * local timers on other CPUs. We also don't want to trigger the
259 * quirk workaround code for interrupts which gets invoked from
260 * handle_percpu_irq via eoi, so we use our own IRQ chip.
261 */
262 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
263#else
264 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
265#endif
266 vmi_wiring = VMI_ALARM_WIRED_LVTT;
267 apic_write(APIC_LVTT, vmi_get_timer_vector());
268 local_irq_enable();
269 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
270}
271
272void __devinit vmi_time_ap_init(void)
273{
274 vmi_time_init_clockevent();
275 apic_write(APIC_LVTT, vmi_get_timer_vector());
276}
277#endif
278
279/** vmi clocksource */
280static struct clocksource clocksource_vmi;
281
282static cycle_t read_real_cycles(struct clocksource *cs)
283{
284 cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
285 return max(ret, clocksource_vmi.cycle_last);
286}
287
288static struct clocksource clocksource_vmi = {
289 .name = "vmi-timer",
290 .rating = 450,
291 .read = read_real_cycles,
292 .mask = CLOCKSOURCE_MASK(64),
293 .mult = 0, /* to be set */
294 .shift = 22,
295 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
296};
297
298static int __init init_vmi_clocksource(void)
299{
300 cycle_t cycles_per_msec;
301
302 if (!vmi_timer_ops.get_cycle_frequency)
303 return 0;
304 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
305 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
306 (void)do_div(cycles_per_msec, 1000);
307
308 /* Note that clocksource.{mult, shift} converts in the opposite direction
309 * as clockevents. */
310 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
311 clocksource_vmi.shift);
312
313 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
314 return clocksource_register(&clocksource_vmi);
315
316}
317module_init(init_vmi_clocksource);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb52296fa3..e03530aebfd0 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -242,6 +242,12 @@ SECTIONS
242 __x86_cpu_dev_end = .; 242 __x86_cpu_dev_end = .;
243 } 243 }
244 244
245 /*
246 * start address and size of operations which during runtime
247 * can be patched with virtualization friendly instructions or
248 * baremetal native ones. Think page table operations.
249 * Details in paravirt_types.h
250 */
245 . = ALIGN(8); 251 . = ALIGN(8);
246 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 252 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
247 __parainstructions = .; 253 __parainstructions = .;
@@ -249,6 +255,11 @@ SECTIONS
249 __parainstructions_end = .; 255 __parainstructions_end = .;
250 } 256 }
251 257
258 /*
259 * struct alt_inst entries. From the header (alternative.h):
260 * "Alternative instructions for different CPU types or capabilities"
261 * Think locking instructions on spinlocks.
262 */
252 . = ALIGN(8); 263 . = ALIGN(8);
253 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 264 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
254 __alt_instructions = .; 265 __alt_instructions = .;
@@ -256,11 +267,28 @@ SECTIONS
256 __alt_instructions_end = .; 267 __alt_instructions_end = .;
257 } 268 }
258 269
270 /*
271 * And here are the replacement instructions. The linker sticks
272 * them as binary blobs. The .altinstructions has enough data to
273 * get the address and the length of them to patch the kernel safely.
274 */
259 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 275 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
260 *(.altinstr_replacement) 276 *(.altinstr_replacement)
261 } 277 }
262 278
263 /* 279 /*
280 * struct iommu_table_entry entries are injected in this section.
281 * It is an array of IOMMUs which during run time gets sorted depending
282 * on its dependency order. After rootfs_initcall is complete
283 * this section can be safely removed.
284 */
285 .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
286 __iommu_table = .;
287 *(.iommu_table)
288 __iommu_table_end = .;
289 }
290 . = ALIGN(8);
291 /*
264 * .exit.text is discard at runtime, not link time, to deal with 292 * .exit.text is discard at runtime, not link time, to deal with
265 * references from .altinstructions and .eh_frame 293 * references from .altinstructions and .eh_frame
266 */ 294 */
@@ -273,7 +301,7 @@ SECTIONS
273 } 301 }
274 302
275#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 303#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
276 PERCPU(PAGE_SIZE) 304 PERCPU(THREAD_SIZE)
277#endif 305#endif
278 306
279 . = ALIGN(PAGE_SIZE); 307 . = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..ddc131ff438f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
64 To compile this as a module, choose M here: the module 64 To compile this as a module, choose M here: the module
65 will be called kvm-amd. 65 will be called kvm-amd.
66 66
67config KVM_MMU_AUDIT
68 bool "Audit KVM MMU"
69 depends on KVM && TRACEPOINTS
70 ---help---
71 This option adds a R/W kVM module parameter 'mmu_audit', which allows
72 audit KVM MMU at runtime.
73
67# OK, it's a little counter-intuitive to do this, but it puts it neatly under 74# OK, it's a little counter-intuitive to do this, but it puts it neatly under
68# the virtualization menu. 75# the virtualization menu.
69source drivers/vhost/Kconfig 76source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd6..38b6e8dafaff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
12 * Copyright 2010 Red Hat, Inc. and/or its affilates. 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
13 * 13 *
14 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
15 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -51,13 +51,13 @@
51#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 51#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
52#define DstReg (2<<1) /* Register operand. */ 52#define DstReg (2<<1) /* Register operand. */
53#define DstMem (3<<1) /* Memory operand. */ 53#define DstMem (3<<1) /* Memory operand. */
54#define DstAcc (4<<1) /* Destination Accumulator */ 54#define DstAcc (4<<1) /* Destination Accumulator */
55#define DstDI (5<<1) /* Destination is in ES:(E)DI */ 55#define DstDI (5<<1) /* Destination is in ES:(E)DI */
56#define DstMem64 (6<<1) /* 64bit memory operand */ 56#define DstMem64 (6<<1) /* 64bit memory operand */
57#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
57#define DstMask (7<<1) 58#define DstMask (7<<1)
58/* Source operand type. */ 59/* Source operand type. */
59#define SrcNone (0<<4) /* No source operand. */ 60#define SrcNone (0<<4) /* No source operand. */
60#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
61#define SrcReg (1<<4) /* Register operand. */ 61#define SrcReg (1<<4) /* Register operand. */
62#define SrcMem (2<<4) /* Memory operand. */ 62#define SrcMem (2<<4) /* Memory operand. */
63#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ 63#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
@@ -71,6 +71,7 @@
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ 71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ 72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */ 73#define SrcAcc (0xd<<4) /* Source Accumulator */
74#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
74#define SrcMask (0xf<<4) 75#define SrcMask (0xf<<4)
75/* Generic ModRM decode. */ 76/* Generic ModRM decode. */
76#define ModRM (1<<8) 77#define ModRM (1<<8)
@@ -82,8 +83,10 @@
82#define Stack (1<<13) /* Stack instruction (push/pop) */ 83#define Stack (1<<13) /* Stack instruction (push/pop) */
83#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 84#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
84#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 85#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
85#define GroupMask 0xff /* Group number stored in bits 0:7 */
86/* Misc flags */ 86/* Misc flags */
87#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
88#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
89#define Undefined (1<<25) /* No Such Instruction */
87#define Lock (1<<26) /* lock prefix is allowed for the instruction */ 90#define Lock (1<<26) /* lock prefix is allowed for the instruction */
88#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 91#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
89#define No64 (1<<28) 92#define No64 (1<<28)
@@ -92,285 +95,30 @@
92#define Src2CL (1<<29) 95#define Src2CL (1<<29)
93#define Src2ImmByte (2<<29) 96#define Src2ImmByte (2<<29)
94#define Src2One (3<<29) 97#define Src2One (3<<29)
98#define Src2Imm (4<<29)
95#define Src2Mask (7<<29) 99#define Src2Mask (7<<29)
96 100
97enum { 101#define X2(x...) x, x
98 Group1_80, Group1_81, Group1_82, Group1_83, 102#define X3(x...) X2(x), x
99 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 103#define X4(x...) X2(x), X2(x)
100 Group8, Group9, 104#define X5(x...) X4(x), x
105#define X6(x...) X4(x), X2(x)
106#define X7(x...) X4(x), X3(x)
107#define X8(x...) X4(x), X4(x)
108#define X16(x...) X8(x), X8(x)
109
110struct opcode {
111 u32 flags;
112 union {
113 int (*execute)(struct x86_emulate_ctxt *ctxt);
114 struct opcode *group;
115 struct group_dual *gdual;
116 } u;
101}; 117};
102 118
103static u32 opcode_table[256] = { 119struct group_dual {
104 /* 0x00 - 0x07 */ 120 struct opcode mod012[8];
105 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 121 struct opcode mod3[8];
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
109 /* 0x08 - 0x0F */
110 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, 0,
114 /* 0x10 - 0x17 */
115 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
118 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
119 /* 0x18 - 0x1F */
120 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
121 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
122 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
123 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
139 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
140 0, 0,
141 /* 0x40 - 0x47 */
142 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
143 /* 0x48 - 0x4F */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x50 - 0x57 */
146 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
147 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
148 /* 0x58 - 0x5F */
149 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
150 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
151 /* 0x60 - 0x67 */
152 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
153 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
154 0, 0, 0, 0,
155 /* 0x68 - 0x6F */
156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
159 /* 0x70 - 0x77 */
160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
162 /* 0x78 - 0x7F */
163 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
164 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
165 /* 0x80 - 0x87 */
166 Group | Group1_80, Group | Group1_81,
167 Group | Group1_82, Group | Group1_83,
168 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
169 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */
178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */
181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */
186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */
190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
192 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
193 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
194 /* 0xB8 - 0xBF */
195 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
196 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
197 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
198 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
199 /* 0xC0 - 0xC7 */
200 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
201 0, ImplicitOps | Stack, 0, 0,
202 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
203 /* 0xC8 - 0xCF */
204 0, 0, 0, ImplicitOps | Stack,
205 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
206 /* 0xD0 - 0xD7 */
207 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
208 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
209 0, 0, 0, 0,
210 /* 0xD8 - 0xDF */
211 0, 0, 0, 0, 0, 0, 0, 0,
212 /* 0xE0 - 0xE7 */
213 0, 0, 0, 0,
214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */
222 0, 0, 0, 0,
223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
224 /* 0xF8 - 0xFF */
225 ImplicitOps, 0, ImplicitOps, ImplicitOps,
226 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
227};
228
229static u32 twobyte_table[256] = {
230 /* 0x00 - 0x0F */
231 0, Group | GroupDual | Group7, 0, 0,
232 0, ImplicitOps, ImplicitOps | Priv, 0,
233 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
234 0, ImplicitOps | ModRM, 0, 0,
235 /* 0x10 - 0x1F */
236 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
237 /* 0x20 - 0x2F */
238 ModRM | ImplicitOps | Priv, ModRM | Priv,
239 ModRM | ImplicitOps | Priv, ModRM | Priv,
240 0, 0, 0, 0,
241 0, 0, 0, 0, 0, 0, 0, 0,
242 /* 0x30 - 0x3F */
243 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
244 ImplicitOps, ImplicitOps | Priv, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0x40 - 0x47 */
247 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
248 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
249 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
250 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
251 /* 0x48 - 0x4F */
252 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
253 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
254 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
255 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
256 /* 0x50 - 0x5F */
257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
258 /* 0x60 - 0x6F */
259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
260 /* 0x70 - 0x7F */
261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
262 /* 0x80 - 0x8F */
263 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
264 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
265 /* 0x90 - 0x9F */
266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
267 /* 0xA0 - 0xA7 */
268 ImplicitOps | Stack, ImplicitOps | Stack,
269 0, DstMem | SrcReg | ModRM | BitOp,
270 DstMem | SrcReg | Src2ImmByte | ModRM,
271 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
272 /* 0xA8 - 0xAF */
273 ImplicitOps | Stack, ImplicitOps | Stack,
274 0, DstMem | SrcReg | ModRM | BitOp | Lock,
275 DstMem | SrcReg | Src2ImmByte | ModRM,
276 DstMem | SrcReg | Src2CL | ModRM,
277 ModRM, 0,
278 /* 0xB0 - 0xB7 */
279 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
280 0, DstMem | SrcReg | ModRM | BitOp | Lock,
281 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
282 DstReg | SrcMem16 | ModRM | Mov,
283 /* 0xB8 - 0xBF */
284 0, 0,
285 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
286 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
287 DstReg | SrcMem16 | ModRM | Mov,
288 /* 0xC0 - 0xCF */
289 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
290 0, 0, 0, Group | GroupDual | Group9,
291 0, 0, 0, 0, 0, 0, 0, 0,
292 /* 0xD0 - 0xDF */
293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
294 /* 0xE0 - 0xEF */
295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
296 /* 0xF0 - 0xFF */
297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
298};
299
300static u32 group_table[] = {
301 [Group1_80*8] =
302 ByteOp | DstMem | SrcImm | ModRM | Lock,
303 ByteOp | DstMem | SrcImm | ModRM | Lock,
304 ByteOp | DstMem | SrcImm | ModRM | Lock,
305 ByteOp | DstMem | SrcImm | ModRM | Lock,
306 ByteOp | DstMem | SrcImm | ModRM | Lock,
307 ByteOp | DstMem | SrcImm | ModRM | Lock,
308 ByteOp | DstMem | SrcImm | ModRM | Lock,
309 ByteOp | DstMem | SrcImm | ModRM,
310 [Group1_81*8] =
311 DstMem | SrcImm | ModRM | Lock,
312 DstMem | SrcImm | ModRM | Lock,
313 DstMem | SrcImm | ModRM | Lock,
314 DstMem | SrcImm | ModRM | Lock,
315 DstMem | SrcImm | ModRM | Lock,
316 DstMem | SrcImm | ModRM | Lock,
317 DstMem | SrcImm | ModRM | Lock,
318 DstMem | SrcImm | ModRM,
319 [Group1_82*8] =
320 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
321 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
322 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
323 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
324 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
325 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
326 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
327 ByteOp | DstMem | SrcImm | ModRM | No64,
328 [Group1_83*8] =
329 DstMem | SrcImmByte | ModRM | Lock,
330 DstMem | SrcImmByte | ModRM | Lock,
331 DstMem | SrcImmByte | ModRM | Lock,
332 DstMem | SrcImmByte | ModRM | Lock,
333 DstMem | SrcImmByte | ModRM | Lock,
334 DstMem | SrcImmByte | ModRM | Lock,
335 DstMem | SrcImmByte | ModRM | Lock,
336 DstMem | SrcImmByte | ModRM,
337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0,
343 [Group3*8] =
344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0,
347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0,
350 [Group5*8] =
351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
357 SrcNone | ModRM | DstMem | Mov, 0,
358 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
359 [Group8*8] =
360 0, 0, 0, 0,
361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
363 [Group9*8] =
364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
365};
366
367static u32 group2_table[] = {
368 [Group7*8] =
369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
370 SrcNone | ModRM | DstMem | Mov, 0,
371 SrcMem16 | ModRM | Mov | Priv, 0,
372 [Group9*8] =
373 0, 0, 0, 0, 0, 0, 0, 0,
374}; 122};
375 123
376/* EFLAGS bit definitions. */ 124/* EFLAGS bit definitions. */
@@ -392,6 +140,9 @@ static u32 group2_table[] = {
392#define EFLG_PF (1<<2) 140#define EFLG_PF (1<<2)
393#define EFLG_CF (1<<0) 141#define EFLG_CF (1<<0)
394 142
143#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
144#define EFLG_RESERVED_ONE_MASK 2
145
395/* 146/*
396 * Instruction emulation: 147 * Instruction emulation:
397 * Most instructions are emulated directly via a fragment of inline assembly 148 * Most instructions are emulated directly via a fragment of inline assembly
@@ -444,13 +195,13 @@ static u32 group2_table[] = {
444#define ON64(x) 195#define ON64(x)
445#endif 196#endif
446 197
447#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ 198#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
448 do { \ 199 do { \
449 __asm__ __volatile__ ( \ 200 __asm__ __volatile__ ( \
450 _PRE_EFLAGS("0", "4", "2") \ 201 _PRE_EFLAGS("0", "4", "2") \
451 _op _suffix " %"_x"3,%1; " \ 202 _op _suffix " %"_x"3,%1; " \
452 _POST_EFLAGS("0", "4", "2") \ 203 _POST_EFLAGS("0", "4", "2") \
453 : "=m" (_eflags), "=m" ((_dst).val), \ 204 : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
454 "=&r" (_tmp) \ 205 "=&r" (_tmp) \
455 : _y ((_src).val), "i" (EFLAGS_MASK)); \ 206 : _y ((_src).val), "i" (EFLAGS_MASK)); \
456 } while (0) 207 } while (0)
@@ -463,13 +214,13 @@ static u32 group2_table[] = {
463 \ 214 \
464 switch ((_dst).bytes) { \ 215 switch ((_dst).bytes) { \
465 case 2: \ 216 case 2: \
466 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ 217 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
467 break; \ 218 break; \
468 case 4: \ 219 case 4: \
469 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ 220 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
470 break; \ 221 break; \
471 case 8: \ 222 case 8: \
472 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ 223 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
473 break; \ 224 break; \
474 } \ 225 } \
475 } while (0) 226 } while (0)
@@ -479,7 +230,7 @@ static u32 group2_table[] = {
479 unsigned long _tmp; \ 230 unsigned long _tmp; \
480 switch ((_dst).bytes) { \ 231 switch ((_dst).bytes) { \
481 case 1: \ 232 case 1: \
482 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ 233 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
483 break; \ 234 break; \
484 default: \ 235 default: \
485 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 236 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -566,6 +317,74 @@ static u32 group2_table[] = {
566 } \ 317 } \
567 } while (0) 318 } while (0)
568 319
320#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
321 do { \
322 unsigned long _tmp; \
323 \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0", "4", "1") \
326 _op _suffix " %5; " \
327 _POST_EFLAGS("0", "4", "1") \
328 : "=m" (_eflags), "=&r" (_tmp), \
329 "+a" (_rax), "+d" (_rdx) \
330 : "i" (EFLAGS_MASK), "m" ((_src).val), \
331 "a" (_rax), "d" (_rdx)); \
332 } while (0)
333
334#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
335 do { \
336 unsigned long _tmp; \
337 \
338 __asm__ __volatile__ ( \
339 _PRE_EFLAGS("0", "5", "1") \
340 "1: \n\t" \
341 _op _suffix " %6; " \
342 "2: \n\t" \
343 _POST_EFLAGS("0", "5", "1") \
344 ".pushsection .fixup,\"ax\" \n\t" \
345 "3: movb $1, %4 \n\t" \
346 "jmp 2b \n\t" \
347 ".popsection \n\t" \
348 _ASM_EXTABLE(1b, 3b) \
349 : "=m" (_eflags), "=&r" (_tmp), \
350 "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
351 : "i" (EFLAGS_MASK), "m" ((_src).val), \
352 "a" (_rax), "d" (_rdx)); \
353 } while (0)
354
355/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
356#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
357 do { \
358 switch((_src).bytes) { \
359 case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
360 case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
361 case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
362 case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
363 } \
364 } while (0)
365
366#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
367 do { \
368 switch((_src).bytes) { \
369 case 1: \
370 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
371 _eflags, "b", _ex); \
372 break; \
373 case 2: \
374 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
375 _eflags, "w", _ex); \
376 break; \
377 case 4: \
378 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
379 _eflags, "l", _ex); \
380 break; \
381 case 8: ON64( \
382 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
383 _eflags, "q", _ex)); \
384 break; \
385 } \
386 } while (0)
387
569/* Fetch next part of the instruction being emulated. */ 388/* Fetch next part of the instruction being emulated. */
570#define insn_fetch(_type, _size, _eip) \ 389#define insn_fetch(_type, _size, _eip) \
571({ unsigned long _x; \ 390({ unsigned long _x; \
@@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
661 ctxt->exception = vec; 480 ctxt->exception = vec;
662 ctxt->error_code = error; 481 ctxt->error_code = error;
663 ctxt->error_code_valid = valid; 482 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665} 483}
666 484
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 485static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
@@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
669 emulate_exception(ctxt, GP_VECTOR, err, true); 487 emulate_exception(ctxt, GP_VECTOR, err, true);
670} 488}
671 489
672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, 490static void emulate_pf(struct x86_emulate_ctxt *ctxt)
673 int err)
674{ 491{
675 ctxt->cr2 = addr; 492 emulate_exception(ctxt, PF_VECTOR, 0, true);
676 emulate_exception(ctxt, PF_VECTOR, err, true);
677} 493}
678 494
679static void emulate_ud(struct x86_emulate_ctxt *ctxt) 495static void emulate_ud(struct x86_emulate_ctxt *ctxt)
@@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
686 emulate_exception(ctxt, TS_VECTOR, err, true); 502 emulate_exception(ctxt, TS_VECTOR, err, true);
687} 503}
688 504
505static int emulate_de(struct x86_emulate_ctxt *ctxt)
506{
507 emulate_exception(ctxt, DE_VECTOR, 0, false);
508 return X86EMUL_PROPAGATE_FAULT;
509}
510
689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
690 struct x86_emulate_ops *ops, 512 struct x86_emulate_ops *ops,
691 unsigned long eip, u8 *dest) 513 unsigned long eip, u8 *dest)
@@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
742 564
743static int read_descriptor(struct x86_emulate_ctxt *ctxt, 565static int read_descriptor(struct x86_emulate_ctxt *ctxt,
744 struct x86_emulate_ops *ops, 566 struct x86_emulate_ops *ops,
745 void *ptr, 567 ulong addr,
746 u16 *size, unsigned long *address, int op_bytes) 568 u16 *size, unsigned long *address, int op_bytes)
747{ 569{
748 int rc; 570 int rc;
@@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
750 if (op_bytes == 2) 572 if (op_bytes == 2)
751 op_bytes = 3; 573 op_bytes = 3;
752 *address = 0; 574 *address = 0;
753 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 575 rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
754 ctxt->vcpu, NULL);
755 if (rc != X86EMUL_CONTINUE) 576 if (rc != X86EMUL_CONTINUE)
756 return rc; 577 return rc;
757 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 578 rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
758 ctxt->vcpu, NULL);
759 return rc; 579 return rc;
760} 580}
761 581
@@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags)
794 return (!!rc ^ (condition & 1)); 614 return (!!rc ^ (condition & 1));
795} 615}
796 616
617static void fetch_register_operand(struct operand *op)
618{
619 switch (op->bytes) {
620 case 1:
621 op->val = *(u8 *)op->addr.reg;
622 break;
623 case 2:
624 op->val = *(u16 *)op->addr.reg;
625 break;
626 case 4:
627 op->val = *(u32 *)op->addr.reg;
628 break;
629 case 8:
630 op->val = *(u64 *)op->addr.reg;
631 break;
632 }
633}
634
797static void decode_register_operand(struct operand *op, 635static void decode_register_operand(struct operand *op,
798 struct decode_cache *c, 636 struct decode_cache *c,
799 int inhibit_bytereg) 637 int inhibit_bytereg)
@@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op,
805 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 643 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
806 op->type = OP_REG; 644 op->type = OP_REG;
807 if ((c->d & ByteOp) && !inhibit_bytereg) { 645 if ((c->d & ByteOp) && !inhibit_bytereg) {
808 op->ptr = decode_register(reg, c->regs, highbyte_regs); 646 op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
809 op->val = *(u8 *)op->ptr;
810 op->bytes = 1; 647 op->bytes = 1;
811 } else { 648 } else {
812 op->ptr = decode_register(reg, c->regs, 0); 649 op->addr.reg = decode_register(reg, c->regs, 0);
813 op->bytes = c->op_bytes; 650 op->bytes = c->op_bytes;
814 switch (op->bytes) {
815 case 2:
816 op->val = *(u16 *)op->ptr;
817 break;
818 case 4:
819 op->val = *(u32 *)op->ptr;
820 break;
821 case 8:
822 op->val = *(u64 *) op->ptr;
823 break;
824 }
825 } 651 }
652 fetch_register_operand(op);
826 op->orig_val = op->val; 653 op->orig_val = op->val;
827} 654}
828 655
829static int decode_modrm(struct x86_emulate_ctxt *ctxt, 656static int decode_modrm(struct x86_emulate_ctxt *ctxt,
830 struct x86_emulate_ops *ops) 657 struct x86_emulate_ops *ops,
658 struct operand *op)
831{ 659{
832 struct decode_cache *c = &ctxt->decode; 660 struct decode_cache *c = &ctxt->decode;
833 u8 sib; 661 u8 sib;
834 int index_reg = 0, base_reg = 0, scale; 662 int index_reg = 0, base_reg = 0, scale;
835 int rc = X86EMUL_CONTINUE; 663 int rc = X86EMUL_CONTINUE;
664 ulong modrm_ea = 0;
836 665
837 if (c->rex_prefix) { 666 if (c->rex_prefix) {
838 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 667 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
844 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 673 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
845 c->modrm_reg |= (c->modrm & 0x38) >> 3; 674 c->modrm_reg |= (c->modrm & 0x38) >> 3;
846 c->modrm_rm |= (c->modrm & 0x07); 675 c->modrm_rm |= (c->modrm & 0x07);
847 c->modrm_ea = 0; 676 c->modrm_seg = VCPU_SREG_DS;
848 c->use_modrm_ea = 1;
849 677
850 if (c->modrm_mod == 3) { 678 if (c->modrm_mod == 3) {
851 c->modrm_ptr = decode_register(c->modrm_rm, 679 op->type = OP_REG;
680 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
681 op->addr.reg = decode_register(c->modrm_rm,
852 c->regs, c->d & ByteOp); 682 c->regs, c->d & ByteOp);
853 c->modrm_val = *(unsigned long *)c->modrm_ptr; 683 fetch_register_operand(op);
854 return rc; 684 return rc;
855 } 685 }
856 686
687 op->type = OP_MEM;
688
857 if (c->ad_bytes == 2) { 689 if (c->ad_bytes == 2) {
858 unsigned bx = c->regs[VCPU_REGS_RBX]; 690 unsigned bx = c->regs[VCPU_REGS_RBX];
859 unsigned bp = c->regs[VCPU_REGS_RBP]; 691 unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
864 switch (c->modrm_mod) { 696 switch (c->modrm_mod) {
865 case 0: 697 case 0:
866 if (c->modrm_rm == 6) 698 if (c->modrm_rm == 6)
867 c->modrm_ea += insn_fetch(u16, 2, c->eip); 699 modrm_ea += insn_fetch(u16, 2, c->eip);
868 break; 700 break;
869 case 1: 701 case 1:
870 c->modrm_ea += insn_fetch(s8, 1, c->eip); 702 modrm_ea += insn_fetch(s8, 1, c->eip);
871 break; 703 break;
872 case 2: 704 case 2:
873 c->modrm_ea += insn_fetch(u16, 2, c->eip); 705 modrm_ea += insn_fetch(u16, 2, c->eip);
874 break; 706 break;
875 } 707 }
876 switch (c->modrm_rm) { 708 switch (c->modrm_rm) {
877 case 0: 709 case 0:
878 c->modrm_ea += bx + si; 710 modrm_ea += bx + si;
879 break; 711 break;
880 case 1: 712 case 1:
881 c->modrm_ea += bx + di; 713 modrm_ea += bx + di;
882 break; 714 break;
883 case 2: 715 case 2:
884 c->modrm_ea += bp + si; 716 modrm_ea += bp + si;
885 break; 717 break;
886 case 3: 718 case 3:
887 c->modrm_ea += bp + di; 719 modrm_ea += bp + di;
888 break; 720 break;
889 case 4: 721 case 4:
890 c->modrm_ea += si; 722 modrm_ea += si;
891 break; 723 break;
892 case 5: 724 case 5:
893 c->modrm_ea += di; 725 modrm_ea += di;
894 break; 726 break;
895 case 6: 727 case 6:
896 if (c->modrm_mod != 0) 728 if (c->modrm_mod != 0)
897 c->modrm_ea += bp; 729 modrm_ea += bp;
898 break; 730 break;
899 case 7: 731 case 7:
900 c->modrm_ea += bx; 732 modrm_ea += bx;
901 break; 733 break;
902 } 734 }
903 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 735 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
904 (c->modrm_rm == 6 && c->modrm_mod != 0)) 736 (c->modrm_rm == 6 && c->modrm_mod != 0))
905 if (!c->has_seg_override) 737 c->modrm_seg = VCPU_SREG_SS;
906 set_seg_override(c, VCPU_SREG_SS); 738 modrm_ea = (u16)modrm_ea;
907 c->modrm_ea = (u16)c->modrm_ea;
908 } else { 739 } else {
909 /* 32/64-bit ModR/M decode. */ 740 /* 32/64-bit ModR/M decode. */
910 if ((c->modrm_rm & 7) == 4) { 741 if ((c->modrm_rm & 7) == 4) {
@@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
914 scale = sib >> 6; 745 scale = sib >> 6;
915 746
916 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 747 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
917 c->modrm_ea += insn_fetch(s32, 4, c->eip); 748 modrm_ea += insn_fetch(s32, 4, c->eip);
918 else 749 else
919 c->modrm_ea += c->regs[base_reg]; 750 modrm_ea += c->regs[base_reg];
920 if (index_reg != 4) 751 if (index_reg != 4)
921 c->modrm_ea += c->regs[index_reg] << scale; 752 modrm_ea += c->regs[index_reg] << scale;
922 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 753 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
923 if (ctxt->mode == X86EMUL_MODE_PROT64) 754 if (ctxt->mode == X86EMUL_MODE_PROT64)
924 c->rip_relative = 1; 755 c->rip_relative = 1;
925 } else 756 } else
926 c->modrm_ea += c->regs[c->modrm_rm]; 757 modrm_ea += c->regs[c->modrm_rm];
927 switch (c->modrm_mod) { 758 switch (c->modrm_mod) {
928 case 0: 759 case 0:
929 if (c->modrm_rm == 5) 760 if (c->modrm_rm == 5)
930 c->modrm_ea += insn_fetch(s32, 4, c->eip); 761 modrm_ea += insn_fetch(s32, 4, c->eip);
931 break; 762 break;
932 case 1: 763 case 1:
933 c->modrm_ea += insn_fetch(s8, 1, c->eip); 764 modrm_ea += insn_fetch(s8, 1, c->eip);
934 break; 765 break;
935 case 2: 766 case 2:
936 c->modrm_ea += insn_fetch(s32, 4, c->eip); 767 modrm_ea += insn_fetch(s32, 4, c->eip);
937 break; 768 break;
938 } 769 }
939 } 770 }
771 op->addr.mem = modrm_ea;
940done: 772done:
941 return rc; 773 return rc;
942} 774}
943 775
944static int decode_abs(struct x86_emulate_ctxt *ctxt, 776static int decode_abs(struct x86_emulate_ctxt *ctxt,
945 struct x86_emulate_ops *ops) 777 struct x86_emulate_ops *ops,
778 struct operand *op)
946{ 779{
947 struct decode_cache *c = &ctxt->decode; 780 struct decode_cache *c = &ctxt->decode;
948 int rc = X86EMUL_CONTINUE; 781 int rc = X86EMUL_CONTINUE;
949 782
783 op->type = OP_MEM;
950 switch (c->ad_bytes) { 784 switch (c->ad_bytes) {
951 case 2: 785 case 2:
952 c->modrm_ea = insn_fetch(u16, 2, c->eip); 786 op->addr.mem = insn_fetch(u16, 2, c->eip);
953 break; 787 break;
954 case 4: 788 case 4:
955 c->modrm_ea = insn_fetch(u32, 4, c->eip); 789 op->addr.mem = insn_fetch(u32, 4, c->eip);
956 break; 790 break;
957 case 8: 791 case 8:
958 c->modrm_ea = insn_fetch(u64, 8, c->eip); 792 op->addr.mem = insn_fetch(u64, 8, c->eip);
959 break; 793 break;
960 } 794 }
961done: 795done:
962 return rc; 796 return rc;
963} 797}
964 798
965int 799static void fetch_bit_operand(struct decode_cache *c)
966x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
967{ 800{
968 struct decode_cache *c = &ctxt->decode; 801 long sv = 0, mask;
969 int rc = X86EMUL_CONTINUE;
970 int mode = ctxt->mode;
971 int def_op_bytes, def_ad_bytes, group;
972
973
974 /* we cannot decode insn before we complete previous rep insn */
975 WARN_ON(ctxt->restart);
976
977 c->eip = ctxt->eip;
978 c->fetch.start = c->fetch.end = c->eip;
979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
980
981 switch (mode) {
982 case X86EMUL_MODE_REAL:
983 case X86EMUL_MODE_VM86:
984 case X86EMUL_MODE_PROT16:
985 def_op_bytes = def_ad_bytes = 2;
986 break;
987 case X86EMUL_MODE_PROT32:
988 def_op_bytes = def_ad_bytes = 4;
989 break;
990#ifdef CONFIG_X86_64
991 case X86EMUL_MODE_PROT64:
992 def_op_bytes = 4;
993 def_ad_bytes = 8;
994 break;
995#endif
996 default:
997 return -1;
998 }
999
1000 c->op_bytes = def_op_bytes;
1001 c->ad_bytes = def_ad_bytes;
1002
1003 /* Legacy prefixes. */
1004 for (;;) {
1005 switch (c->b = insn_fetch(u8, 1, c->eip)) {
1006 case 0x66: /* operand-size override */
1007 /* switch between 2/4 bytes */
1008 c->op_bytes = def_op_bytes ^ 6;
1009 break;
1010 case 0x67: /* address-size override */
1011 if (mode == X86EMUL_MODE_PROT64)
1012 /* switch between 4/8 bytes */
1013 c->ad_bytes = def_ad_bytes ^ 12;
1014 else
1015 /* switch between 2/4 bytes */
1016 c->ad_bytes = def_ad_bytes ^ 6;
1017 break;
1018 case 0x26: /* ES override */
1019 case 0x2e: /* CS override */
1020 case 0x36: /* SS override */
1021 case 0x3e: /* DS override */
1022 set_seg_override(c, (c->b >> 3) & 3);
1023 break;
1024 case 0x64: /* FS override */
1025 case 0x65: /* GS override */
1026 set_seg_override(c, c->b & 7);
1027 break;
1028 case 0x40 ... 0x4f: /* REX */
1029 if (mode != X86EMUL_MODE_PROT64)
1030 goto done_prefixes;
1031 c->rex_prefix = c->b;
1032 continue;
1033 case 0xf0: /* LOCK */
1034 c->lock_prefix = 1;
1035 break;
1036 case 0xf2: /* REPNE/REPNZ */
1037 c->rep_prefix = REPNE_PREFIX;
1038 break;
1039 case 0xf3: /* REP/REPE/REPZ */
1040 c->rep_prefix = REPE_PREFIX;
1041 break;
1042 default:
1043 goto done_prefixes;
1044 }
1045
1046 /* Any legacy prefix after a REX prefix nullifies its effect. */
1047
1048 c->rex_prefix = 0;
1049 }
1050
1051done_prefixes:
1052
1053 /* REX prefix. */
1054 if (c->rex_prefix)
1055 if (c->rex_prefix & 8)
1056 c->op_bytes = 8; /* REX.W */
1057
1058 /* Opcode byte(s). */
1059 c->d = opcode_table[c->b];
1060 if (c->d == 0) {
1061 /* Two-byte opcode? */
1062 if (c->b == 0x0f) {
1063 c->twobyte = 1;
1064 c->b = insn_fetch(u8, 1, c->eip);
1065 c->d = twobyte_table[c->b];
1066 }
1067 }
1068
1069 if (c->d & Group) {
1070 group = c->d & GroupMask;
1071 c->modrm = insn_fetch(u8, 1, c->eip);
1072 --c->eip;
1073
1074 group = (group << 3) + ((c->modrm >> 3) & 7);
1075 if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
1076 c->d = group2_table[group];
1077 else
1078 c->d = group_table[group];
1079 }
1080
1081 /* Unrecognised? */
1082 if (c->d == 0) {
1083 DPRINTF("Cannot emulate %02x\n", c->b);
1084 return -1;
1085 }
1086
1087 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
1088 c->op_bytes = 8;
1089
1090 /* ModRM and SIB bytes. */
1091 if (c->d & ModRM)
1092 rc = decode_modrm(ctxt, ops);
1093 else if (c->d & MemAbs)
1094 rc = decode_abs(ctxt, ops);
1095 if (rc != X86EMUL_CONTINUE)
1096 goto done;
1097
1098 if (!c->has_seg_override)
1099 set_seg_override(c, VCPU_SREG_DS);
1100
1101 if (!(!c->twobyte && c->b == 0x8d))
1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1103
1104 if (c->ad_bytes != 8)
1105 c->modrm_ea = (u32)c->modrm_ea;
1106
1107 if (c->rip_relative)
1108 c->modrm_ea += c->eip;
1109
1110 /*
1111 * Decode and fetch the source operand: register, memory
1112 * or immediate.
1113 */
1114 switch (c->d & SrcMask) {
1115 case SrcNone:
1116 break;
1117 case SrcReg:
1118 decode_register_operand(&c->src, c, 0);
1119 break;
1120 case SrcMem16:
1121 c->src.bytes = 2;
1122 goto srcmem_common;
1123 case SrcMem32:
1124 c->src.bytes = 4;
1125 goto srcmem_common;
1126 case SrcMem:
1127 c->src.bytes = (c->d & ByteOp) ? 1 :
1128 c->op_bytes;
1129 /* Don't fetch the address for invlpg: it could be unmapped. */
1130 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
1131 break;
1132 srcmem_common:
1133 /*
1134 * For instructions with a ModR/M byte, switch to register
1135 * access if Mod = 3.
1136 */
1137 if ((c->d & ModRM) && c->modrm_mod == 3) {
1138 c->src.type = OP_REG;
1139 c->src.val = c->modrm_val;
1140 c->src.ptr = c->modrm_ptr;
1141 break;
1142 }
1143 c->src.type = OP_MEM;
1144 c->src.ptr = (unsigned long *)c->modrm_ea;
1145 c->src.val = 0;
1146 break;
1147 case SrcImm:
1148 case SrcImmU:
1149 c->src.type = OP_IMM;
1150 c->src.ptr = (unsigned long *)c->eip;
1151 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1152 if (c->src.bytes == 8)
1153 c->src.bytes = 4;
1154 /* NB. Immediates are sign-extended as necessary. */
1155 switch (c->src.bytes) {
1156 case 1:
1157 c->src.val = insn_fetch(s8, 1, c->eip);
1158 break;
1159 case 2:
1160 c->src.val = insn_fetch(s16, 2, c->eip);
1161 break;
1162 case 4:
1163 c->src.val = insn_fetch(s32, 4, c->eip);
1164 break;
1165 }
1166 if ((c->d & SrcMask) == SrcImmU) {
1167 switch (c->src.bytes) {
1168 case 1:
1169 c->src.val &= 0xff;
1170 break;
1171 case 2:
1172 c->src.val &= 0xffff;
1173 break;
1174 case 4:
1175 c->src.val &= 0xffffffff;
1176 break;
1177 }
1178 }
1179 break;
1180 case SrcImmByte:
1181 case SrcImmUByte:
1182 c->src.type = OP_IMM;
1183 c->src.ptr = (unsigned long *)c->eip;
1184 c->src.bytes = 1;
1185 if ((c->d & SrcMask) == SrcImmByte)
1186 c->src.val = insn_fetch(s8, 1, c->eip);
1187 else
1188 c->src.val = insn_fetch(u8, 1, c->eip);
1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1209 case SrcOne:
1210 c->src.bytes = 1;
1211 c->src.val = 1;
1212 break;
1213 case SrcSI:
1214 c->src.type = OP_MEM;
1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1216 c->src.ptr = (unsigned long *)
1217 register_address(c, seg_override_base(ctxt, ops, c),
1218 c->regs[VCPU_REGS_RSI]);
1219 c->src.val = 0;
1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1232 }
1233 802
1234 /* 803 if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
1235 * Decode and fetch the second source operand: register, memory 804 mask = ~(c->dst.bytes * 8 - 1);
1236 * or immediate.
1237 */
1238 switch (c->d & Src2Mask) {
1239 case Src2None:
1240 break;
1241 case Src2CL:
1242 c->src2.bytes = 1;
1243 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
1244 break;
1245 case Src2ImmByte:
1246 c->src2.type = OP_IMM;
1247 c->src2.ptr = (unsigned long *)c->eip;
1248 c->src2.bytes = 1;
1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1250 break;
1251 case Src2One:
1252 c->src2.bytes = 1;
1253 c->src2.val = 1;
1254 break;
1255 }
1256 805
1257 /* Decode and fetch the destination operand: register or memory. */ 806 if (c->src.bytes == 2)
1258 switch (c->d & DstMask) { 807 sv = (s16)c->src.val & (s16)mask;
1259 case ImplicitOps: 808 else if (c->src.bytes == 4)
1260 /* Special instructions do their own operand decoding. */ 809 sv = (s32)c->src.val & (s32)mask;
1261 return 0;
1262 case DstReg:
1263 decode_register_operand(&c->dst, c,
1264 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1265 break;
1266 case DstMem:
1267 case DstMem64:
1268 if ((c->d & ModRM) && c->modrm_mod == 3) {
1269 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1270 c->dst.type = OP_REG;
1271 c->dst.val = c->dst.orig_val = c->modrm_val;
1272 c->dst.ptr = c->modrm_ptr;
1273 break;
1274 }
1275 c->dst.type = OP_MEM;
1276 c->dst.ptr = (unsigned long *)c->modrm_ea;
1277 if ((c->d & DstMask) == DstMem64)
1278 c->dst.bytes = 8;
1279 else
1280 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1281 c->dst.val = 0;
1282 if (c->d & BitOp) {
1283 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1284 810
1285 c->dst.ptr = (void *)c->dst.ptr + 811 c->dst.addr.mem += (sv >> 3);
1286 (c->src.val & mask) / 8;
1287 }
1288 break;
1289 case DstAcc:
1290 c->dst.type = OP_REG;
1291 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1292 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1293 switch (c->dst.bytes) {
1294 case 1:
1295 c->dst.val = *(u8 *)c->dst.ptr;
1296 break;
1297 case 2:
1298 c->dst.val = *(u16 *)c->dst.ptr;
1299 break;
1300 case 4:
1301 c->dst.val = *(u32 *)c->dst.ptr;
1302 break;
1303 case 8:
1304 c->dst.val = *(u64 *)c->dst.ptr;
1305 break;
1306 }
1307 c->dst.orig_val = c->dst.val;
1308 break;
1309 case DstDI:
1310 c->dst.type = OP_MEM;
1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1312 c->dst.ptr = (unsigned long *)
1313 register_address(c, es_base(ctxt, ops),
1314 c->regs[VCPU_REGS_RDI]);
1315 c->dst.val = 0;
1316 break;
1317 } 812 }
1318 813
1319done: 814 /* only subword offset */
1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 815 c->src.val &= (c->dst.bytes << 3) - 1;
1321} 816}
1322 817
1323static int read_emulated(struct x86_emulate_ctxt *ctxt, 818static int read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 832 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu); 833 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT) 834 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err); 835 emulate_pf(ctxt);
1341 if (rc != X86EMUL_CONTINUE) 836 if (rc != X86EMUL_CONTINUE)
1342 return rc; 837 return rc;
1343 mc->end += n; 838 mc->end += n;
@@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1424 addr = dt.address + index * 8; 919 addr = dt.address + index * 8;
1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 920 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1426 if (ret == X86EMUL_PROPAGATE_FAULT) 921 if (ret == X86EMUL_PROPAGATE_FAULT)
1427 emulate_pf(ctxt, addr, err); 922 emulate_pf(ctxt);
1428 923
1429 return ret; 924 return ret;
1430} 925}
@@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1450 addr = dt.address + index * 8; 945 addr = dt.address + index * 8;
1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 946 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1452 if (ret == X86EMUL_PROPAGATE_FAULT) 947 if (ret == X86EMUL_PROPAGATE_FAULT)
1453 emulate_pf(ctxt, addr, err); 948 emulate_pf(ctxt);
1454 949
1455 return ret; 950 return ret;
1456} 951}
@@ -1573,6 +1068,25 @@ exception:
1573 return X86EMUL_PROPAGATE_FAULT; 1068 return X86EMUL_PROPAGATE_FAULT;
1574} 1069}
1575 1070
1071static void write_register_operand(struct operand *op)
1072{
1073 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1074 switch (op->bytes) {
1075 case 1:
1076 *(u8 *)op->addr.reg = (u8)op->val;
1077 break;
1078 case 2:
1079 *(u16 *)op->addr.reg = (u16)op->val;
1080 break;
1081 case 4:
1082 *op->addr.reg = (u32)op->val;
1083 break; /* 64b: zero-extend */
1084 case 8:
1085 *op->addr.reg = op->val;
1086 break;
1087 }
1088}
1089
1576static inline int writeback(struct x86_emulate_ctxt *ctxt, 1090static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops) 1091 struct x86_emulate_ops *ops)
1578{ 1092{
@@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1582 1096
1583 switch (c->dst.type) { 1097 switch (c->dst.type) {
1584 case OP_REG: 1098 case OP_REG:
1585 /* The 4-byte case *is* correct: 1099 write_register_operand(&c->dst);
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break; 1100 break;
1603 case OP_MEM: 1101 case OP_MEM:
1604 if (c->lock_prefix) 1102 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated( 1103 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr, 1104 c->dst.addr.mem,
1607 &c->dst.orig_val, 1105 &c->dst.orig_val,
1608 &c->dst.val, 1106 &c->dst.val,
1609 c->dst.bytes, 1107 c->dst.bytes,
@@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1611 ctxt->vcpu); 1109 ctxt->vcpu);
1612 else 1110 else
1613 rc = ops->write_emulated( 1111 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr, 1112 c->dst.addr.mem,
1615 &c->dst.val, 1113 &c->dst.val,
1616 c->dst.bytes, 1114 c->dst.bytes,
1617 &err, 1115 &err,
1618 ctxt->vcpu); 1116 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT) 1117 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt, 1118 emulate_pf(ctxt);
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE) 1119 if (rc != X86EMUL_CONTINUE)
1623 return rc; 1120 return rc;
1624 break; 1121 break;
@@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1640 c->dst.bytes = c->op_bytes; 1137 c->dst.bytes = c->op_bytes;
1641 c->dst.val = c->src.val; 1138 c->dst.val = c->src.val;
1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1139 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), 1140 c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
1644 c->regs[VCPU_REGS_RSP]); 1141 c->regs[VCPU_REGS_RSP]);
1645} 1142}
1646 1143
1647static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1144static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1701 *(unsigned long *)dest = 1198 *(unsigned long *)dest =
1702 (ctxt->eflags & ~change_mask) | (val & change_mask); 1199 (ctxt->eflags & ~change_mask) | (val & change_mask);
1703 1200
1201 if (rc == X86EMUL_PROPAGATE_FAULT)
1202 emulate_pf(ctxt);
1203
1704 return rc; 1204 return rc;
1705} 1205}
1706 1206
@@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1778 return rc; 1278 return rc;
1779} 1279}
1780 1280
1281int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1282 struct x86_emulate_ops *ops, int irq)
1283{
1284 struct decode_cache *c = &ctxt->decode;
1285 int rc;
1286 struct desc_ptr dt;
1287 gva_t cs_addr;
1288 gva_t eip_addr;
1289 u16 cs, eip;
1290 u32 err;
1291
1292 /* TODO: Add limit checks */
1293 c->src.val = ctxt->eflags;
1294 emulate_push(ctxt, ops);
1295 rc = writeback(ctxt, ops);
1296 if (rc != X86EMUL_CONTINUE)
1297 return rc;
1298
1299 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1300
1301 c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
1302 emulate_push(ctxt, ops);
1303 rc = writeback(ctxt, ops);
1304 if (rc != X86EMUL_CONTINUE)
1305 return rc;
1306
1307 c->src.val = c->eip;
1308 emulate_push(ctxt, ops);
1309 rc = writeback(ctxt, ops);
1310 if (rc != X86EMUL_CONTINUE)
1311 return rc;
1312
1313 c->dst.type = OP_NONE;
1314
1315 ops->get_idt(&dt, ctxt->vcpu);
1316
1317 eip_addr = dt.address + (irq << 2);
1318 cs_addr = dt.address + (irq << 2) + 2;
1319
1320 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
1321 if (rc != X86EMUL_CONTINUE)
1322 return rc;
1323
1324 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
1325 if (rc != X86EMUL_CONTINUE)
1326 return rc;
1327
1328 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
1329 if (rc != X86EMUL_CONTINUE)
1330 return rc;
1331
1332 c->eip = eip;
1333
1334 return rc;
1335}
1336
1337static int emulate_int(struct x86_emulate_ctxt *ctxt,
1338 struct x86_emulate_ops *ops, int irq)
1339{
1340 switch(ctxt->mode) {
1341 case X86EMUL_MODE_REAL:
1342 return emulate_int_real(ctxt, ops, irq);
1343 case X86EMUL_MODE_VM86:
1344 case X86EMUL_MODE_PROT16:
1345 case X86EMUL_MODE_PROT32:
1346 case X86EMUL_MODE_PROT64:
1347 default:
1348 /* Protected mode interrupts unimplemented yet */
1349 return X86EMUL_UNHANDLEABLE;
1350 }
1351}
1352
1353static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops)
1355{
1356 struct decode_cache *c = &ctxt->decode;
1357 int rc = X86EMUL_CONTINUE;
1358 unsigned long temp_eip = 0;
1359 unsigned long temp_eflags = 0;
1360 unsigned long cs = 0;
1361 unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
1362 EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
1363 EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
1364 unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
1365
1366 /* TODO: Add stack limit check */
1367
1368 rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
1369
1370 if (rc != X86EMUL_CONTINUE)
1371 return rc;
1372
1373 if (temp_eip & ~0xffff) {
1374 emulate_gp(ctxt, 0);
1375 return X86EMUL_PROPAGATE_FAULT;
1376 }
1377
1378 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1379
1380 if (rc != X86EMUL_CONTINUE)
1381 return rc;
1382
1383 rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
1384
1385 if (rc != X86EMUL_CONTINUE)
1386 return rc;
1387
1388 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1389
1390 if (rc != X86EMUL_CONTINUE)
1391 return rc;
1392
1393 c->eip = temp_eip;
1394
1395
1396 if (c->op_bytes == 4)
1397 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1398 else if (c->op_bytes == 2) {
1399 ctxt->eflags &= ~0xffff;
1400 ctxt->eflags |= temp_eflags;
1401 }
1402
1403 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
1404 ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
1405
1406 return rc;
1407}
1408
1409static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1410 struct x86_emulate_ops* ops)
1411{
1412 switch(ctxt->mode) {
1413 case X86EMUL_MODE_REAL:
1414 return emulate_iret_real(ctxt, ops);
1415 case X86EMUL_MODE_VM86:
1416 case X86EMUL_MODE_PROT16:
1417 case X86EMUL_MODE_PROT32:
1418 case X86EMUL_MODE_PROT64:
1419 default:
1420 /* iret from protected mode unimplemented yet */
1421 return X86EMUL_UNHANDLEABLE;
1422 }
1423}
1424
1781static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1425static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1782 struct x86_emulate_ops *ops) 1426 struct x86_emulate_ops *ops)
1783{ 1427{
@@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1819 struct x86_emulate_ops *ops) 1463 struct x86_emulate_ops *ops)
1820{ 1464{
1821 struct decode_cache *c = &ctxt->decode; 1465 struct decode_cache *c = &ctxt->decode;
1466 unsigned long *rax = &c->regs[VCPU_REGS_RAX];
1467 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1468 u8 de = 0;
1822 1469
1823 switch (c->modrm_reg) { 1470 switch (c->modrm_reg) {
1824 case 0 ... 1: /* test */ 1471 case 0 ... 1: /* test */
@@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1830 case 3: /* neg */ 1477 case 3: /* neg */
1831 emulate_1op("neg", c->dst, ctxt->eflags); 1478 emulate_1op("neg", c->dst, ctxt->eflags);
1832 break; 1479 break;
1480 case 4: /* mul */
1481 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
1482 break;
1483 case 5: /* imul */
1484 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
1485 break;
1486 case 6: /* div */
1487 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
1488 ctxt->eflags, de);
1489 break;
1490 case 7: /* idiv */
1491 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
1492 ctxt->eflags, de);
1493 break;
1833 default: 1494 default:
1834 return 0; 1495 return X86EMUL_UNHANDLEABLE;
1835 } 1496 }
1836 return 1; 1497 if (de)
1498 return emulate_de(ctxt);
1499 return X86EMUL_CONTINUE;
1837} 1500}
1838 1501
1839static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1502static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1905 return rc; 1568 return rc;
1906} 1569}
1907 1570
1571static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
1572 struct x86_emulate_ops *ops, int seg)
1573{
1574 struct decode_cache *c = &ctxt->decode;
1575 unsigned short sel;
1576 int rc;
1577
1578 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1579
1580 rc = load_segment_descriptor(ctxt, ops, sel, seg);
1581 if (rc != X86EMUL_CONTINUE)
1582 return rc;
1583
1584 c->dst.val = c->src.val;
1585 return rc;
1586}
1587
1908static inline void 1588static inline void
1909setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1589setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1910 struct x86_emulate_ops *ops, struct desc_struct *cs, 1590 struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2160 struct x86_emulate_ops *ops, 1840 struct x86_emulate_ops *ops,
2161 u16 port, u16 len) 1841 u16 port, u16 len)
2162{ 1842{
1843 if (ctxt->perm_ok)
1844 return true;
1845
2163 if (emulator_bad_iopl(ctxt, ops)) 1846 if (emulator_bad_iopl(ctxt, ops))
2164 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 1847 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
2165 return false; 1848 return false;
1849
1850 ctxt->perm_ok = true;
1851
2166 return true; 1852 return true;
2167} 1853}
2168 1854
@@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2254 &err); 1940 &err);
2255 if (ret == X86EMUL_PROPAGATE_FAULT) { 1941 if (ret == X86EMUL_PROPAGATE_FAULT) {
2256 /* FIXME: need to provide precise fault address */ 1942 /* FIXME: need to provide precise fault address */
2257 emulate_pf(ctxt, old_tss_base, err); 1943 emulate_pf(ctxt);
2258 return ret; 1944 return ret;
2259 } 1945 }
2260 1946
@@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2264 &err); 1950 &err);
2265 if (ret == X86EMUL_PROPAGATE_FAULT) { 1951 if (ret == X86EMUL_PROPAGATE_FAULT) {
2266 /* FIXME: need to provide precise fault address */ 1952 /* FIXME: need to provide precise fault address */
2267 emulate_pf(ctxt, old_tss_base, err); 1953 emulate_pf(ctxt);
2268 return ret; 1954 return ret;
2269 } 1955 }
2270 1956
@@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2272 &err); 1958 &err);
2273 if (ret == X86EMUL_PROPAGATE_FAULT) { 1959 if (ret == X86EMUL_PROPAGATE_FAULT) {
2274 /* FIXME: need to provide precise fault address */ 1960 /* FIXME: need to provide precise fault address */
2275 emulate_pf(ctxt, new_tss_base, err); 1961 emulate_pf(ctxt);
2276 return ret; 1962 return ret;
2277 } 1963 }
2278 1964
@@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2285 ctxt->vcpu, &err); 1971 ctxt->vcpu, &err);
2286 if (ret == X86EMUL_PROPAGATE_FAULT) { 1972 if (ret == X86EMUL_PROPAGATE_FAULT) {
2287 /* FIXME: need to provide precise fault address */ 1973 /* FIXME: need to provide precise fault address */
2288 emulate_pf(ctxt, new_tss_base, err); 1974 emulate_pf(ctxt);
2289 return ret; 1975 return ret;
2290 } 1976 }
2291 } 1977 }
@@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2396 &err); 2082 &err);
2397 if (ret == X86EMUL_PROPAGATE_FAULT) { 2083 if (ret == X86EMUL_PROPAGATE_FAULT) {
2398 /* FIXME: need to provide precise fault address */ 2084 /* FIXME: need to provide precise fault address */
2399 emulate_pf(ctxt, old_tss_base, err); 2085 emulate_pf(ctxt);
2400 return ret; 2086 return ret;
2401 } 2087 }
2402 2088
@@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2406 &err); 2092 &err);
2407 if (ret == X86EMUL_PROPAGATE_FAULT) { 2093 if (ret == X86EMUL_PROPAGATE_FAULT) {
2408 /* FIXME: need to provide precise fault address */ 2094 /* FIXME: need to provide precise fault address */
2409 emulate_pf(ctxt, old_tss_base, err); 2095 emulate_pf(ctxt);
2410 return ret; 2096 return ret;
2411 } 2097 }
2412 2098
@@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2414 &err); 2100 &err);
2415 if (ret == X86EMUL_PROPAGATE_FAULT) { 2101 if (ret == X86EMUL_PROPAGATE_FAULT) {
2416 /* FIXME: need to provide precise fault address */ 2102 /* FIXME: need to provide precise fault address */
2417 emulate_pf(ctxt, new_tss_base, err); 2103 emulate_pf(ctxt);
2418 return ret; 2104 return ret;
2419 } 2105 }
2420 2106
@@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2427 ctxt->vcpu, &err); 2113 ctxt->vcpu, &err);
2428 if (ret == X86EMUL_PROPAGATE_FAULT) { 2114 if (ret == X86EMUL_PROPAGATE_FAULT) {
2429 /* FIXME: need to provide precise fault address */ 2115 /* FIXME: need to provide precise fault address */
2430 emulate_pf(ctxt, new_tss_base, err); 2116 emulate_pf(ctxt);
2431 return ret; 2117 return ret;
2432 } 2118 }
2433 } 2119 }
@@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2523} 2209}
2524 2210
2525int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 2211int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2526 struct x86_emulate_ops *ops,
2527 u16 tss_selector, int reason, 2212 u16 tss_selector, int reason,
2528 bool has_error_code, u32 error_code) 2213 bool has_error_code, u32 error_code)
2529{ 2214{
2215 struct x86_emulate_ops *ops = ctxt->ops;
2530 struct decode_cache *c = &ctxt->decode; 2216 struct decode_cache *c = &ctxt->decode;
2531 int rc; 2217 int rc;
2532 2218
@@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2552 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2553 2239
2554 register_address_increment(c, &c->regs[reg], df * op->bytes); 2240 register_address_increment(c, &c->regs[reg], df * op->bytes);
2555 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); 2241 op->addr.mem = register_address(c, base, c->regs[reg]);
2242}
2243
2244static int em_push(struct x86_emulate_ctxt *ctxt)
2245{
2246 emulate_push(ctxt, ctxt->ops);
2247 return X86EMUL_CONTINUE;
2248}
2249
2250static int em_das(struct x86_emulate_ctxt *ctxt)
2251{
2252 struct decode_cache *c = &ctxt->decode;
2253 u8 al, old_al;
2254 bool af, cf, old_cf;
2255
2256 cf = ctxt->eflags & X86_EFLAGS_CF;
2257 al = c->dst.val;
2258
2259 old_al = al;
2260 old_cf = cf;
2261 cf = false;
2262 af = ctxt->eflags & X86_EFLAGS_AF;
2263 if ((al & 0x0f) > 9 || af) {
2264 al -= 6;
2265 cf = old_cf | (al >= 250);
2266 af = true;
2267 } else {
2268 af = false;
2269 }
2270 if (old_al > 0x99 || old_cf) {
2271 al -= 0x60;
2272 cf = true;
2273 }
2274
2275 c->dst.val = al;
2276 /* Set PF, ZF, SF */
2277 c->src.type = OP_IMM;
2278 c->src.val = 0;
2279 c->src.bytes = 1;
2280 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2281 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2282 if (cf)
2283 ctxt->eflags |= X86_EFLAGS_CF;
2284 if (af)
2285 ctxt->eflags |= X86_EFLAGS_AF;
2286 return X86EMUL_CONTINUE;
2287}
2288
2289static int em_call_far(struct x86_emulate_ctxt *ctxt)
2290{
2291 struct decode_cache *c = &ctxt->decode;
2292 u16 sel, old_cs;
2293 ulong old_eip;
2294 int rc;
2295
2296 old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2297 old_eip = c->eip;
2298
2299 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2300 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
2301 return X86EMUL_CONTINUE;
2302
2303 c->eip = 0;
2304 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2305
2306 c->src.val = old_cs;
2307 emulate_push(ctxt, ctxt->ops);
2308 rc = writeback(ctxt, ctxt->ops);
2309 if (rc != X86EMUL_CONTINUE)
2310 return rc;
2311
2312 c->src.val = old_eip;
2313 emulate_push(ctxt, ctxt->ops);
2314 rc = writeback(ctxt, ctxt->ops);
2315 if (rc != X86EMUL_CONTINUE)
2316 return rc;
2317
2318 c->dst.type = OP_NONE;
2319
2320 return X86EMUL_CONTINUE;
2321}
2322
2323static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2324{
2325 struct decode_cache *c = &ctxt->decode;
2326 int rc;
2327
2328 c->dst.type = OP_REG;
2329 c->dst.addr.reg = &c->eip;
2330 c->dst.bytes = c->op_bytes;
2331 rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
2332 if (rc != X86EMUL_CONTINUE)
2333 return rc;
2334 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
2335 return X86EMUL_CONTINUE;
2336}
2337
2338static int em_imul(struct x86_emulate_ctxt *ctxt)
2339{
2340 struct decode_cache *c = &ctxt->decode;
2341
2342 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
2343 return X86EMUL_CONTINUE;
2344}
2345
2346static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2347{
2348 struct decode_cache *c = &ctxt->decode;
2349
2350 c->dst.val = c->src2.val;
2351 return em_imul(ctxt);
2352}
2353
2354static int em_cwd(struct x86_emulate_ctxt *ctxt)
2355{
2356 struct decode_cache *c = &ctxt->decode;
2357
2358 c->dst.type = OP_REG;
2359 c->dst.bytes = c->src.bytes;
2360 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2361 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2362
2363 return X86EMUL_CONTINUE;
2364}
2365
2366static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2367{
2368 unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
2369 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0;
2371
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
2373 emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32;
2379 return X86EMUL_CONTINUE;
2380}
2381
2382static int em_mov(struct x86_emulate_ctxt *ctxt)
2383{
2384 struct decode_cache *c = &ctxt->decode;
2385 c->dst.val = c->src.val;
2386 return X86EMUL_CONTINUE;
2387}
2388
2389#define D(_y) { .flags = (_y) }
2390#define N D(0)
2391#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
2392#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
2393#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
2394
2395#define D2bv(_f) D((_f) | ByteOp), D(_f)
2396#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
2397
2398#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
2399 D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
2400 D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
2401
2402
2403static struct opcode group1[] = {
2404 X7(D(Lock)), N
2405};
2406
2407static struct opcode group1A[] = {
2408 D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
2409};
2410
2411static struct opcode group3[] = {
2412 D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
2413 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
2414 X4(D(SrcMem | ModRM)),
2415};
2416
2417static struct opcode group4[] = {
2418 D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
2419 N, N, N, N, N, N,
2420};
2421
2422static struct opcode group5[] = {
2423 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
2424 D(SrcMem | ModRM | Stack),
2425 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
2426 D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
2427 D(SrcMem | ModRM | Stack), N,
2428};
2429
2430static struct group_dual group7 = { {
2431 N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
2432 D(SrcNone | ModRM | DstMem | Mov), N,
2433 D(SrcMem16 | ModRM | Mov | Priv),
2434 D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
2435}, {
2436 D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
2437 D(SrcNone | ModRM | DstMem | Mov), N,
2438 D(SrcMem16 | ModRM | Mov | Priv), N,
2439} };
2440
2441static struct opcode group8[] = {
2442 N, N, N, N,
2443 D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
2444 D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
2445};
2446
2447static struct group_dual group9 = { {
2448 N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
2449}, {
2450 N, N, N, N, N, N, N, N,
2451} };
2452
2453static struct opcode group11[] = {
2454 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
2455};
2456
2457static struct opcode opcode_table[256] = {
2458 /* 0x00 - 0x07 */
2459 D6ALU(Lock),
2460 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2461 /* 0x08 - 0x0F */
2462 D6ALU(Lock),
2463 D(ImplicitOps | Stack | No64), N,
2464 /* 0x10 - 0x17 */
2465 D6ALU(Lock),
2466 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2467 /* 0x18 - 0x1F */
2468 D6ALU(Lock),
2469 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2470 /* 0x20 - 0x27 */
2471 D6ALU(Lock), N, N,
2472 /* 0x28 - 0x2F */
2473 D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
2474 /* 0x30 - 0x37 */
2475 D6ALU(Lock), N, N,
2476 /* 0x38 - 0x3F */
2477 D6ALU(0), N, N,
2478 /* 0x40 - 0x4F */
2479 X16(D(DstReg)),
2480 /* 0x50 - 0x57 */
2481 X8(I(SrcReg | Stack, em_push)),
2482 /* 0x58 - 0x5F */
2483 X8(D(DstReg | Stack)),
2484 /* 0x60 - 0x67 */
2485 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2486 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
2487 N, N, N, N,
2488 /* 0x68 - 0x6F */
2489 I(SrcImm | Mov | Stack, em_push),
2490 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
2491 I(SrcImmByte | Mov | Stack, em_push),
2492 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
2493 D2bv(DstDI | Mov | String), /* insb, insw/insd */
2494 D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
2495 /* 0x70 - 0x7F */
2496 X16(D(SrcImmByte)),
2497 /* 0x80 - 0x87 */
2498 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
2499 G(DstMem | SrcImm | ModRM | Group, group1),
2500 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
2501 G(DstMem | SrcImmByte | ModRM | Group, group1),
2502 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
2503 /* 0x88 - 0x8F */
2504 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
2505 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
2506 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
2507 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
2508 /* 0x90 - 0x97 */
2509 X8(D(SrcAcc | DstReg)),
2510 /* 0x98 - 0x9F */
2511 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
2512 I(SrcImmFAddr | No64, em_call_far), N,
2513 D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
2514 /* 0xA0 - 0xA7 */
2515 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
2516 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
2517 I2bv(SrcSI | DstDI | Mov | String, em_mov),
2518 D2bv(SrcSI | DstDI | String),
2519 /* 0xA8 - 0xAF */
2520 D2bv(DstAcc | SrcImm),
2521 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
2522 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
2523 D2bv(SrcAcc | DstDI | String),
2524 /* 0xB0 - 0xB7 */
2525 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
2526 /* 0xB8 - 0xBF */
2527 X8(I(DstReg | SrcImm | Mov, em_mov)),
2528 /* 0xC0 - 0xC7 */
2529 D2bv(DstMem | SrcImmByte | ModRM),
2530 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
2531 D(ImplicitOps | Stack),
2532 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
2533 G(ByteOp, group11), G(0, group11),
2534 /* 0xC8 - 0xCF */
2535 N, N, N, D(ImplicitOps | Stack),
2536 D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
2537 /* 0xD0 - 0xD7 */
2538 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
2539 N, N, N, N,
2540 /* 0xD8 - 0xDF */
2541 N, N, N, N, N, N, N, N,
2542 /* 0xE0 - 0xE7 */
2543 X4(D(SrcImmByte)),
2544 D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
2545 /* 0xE8 - 0xEF */
2546 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
2547 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
2548 D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
2549 /* 0xF0 - 0xF7 */
2550 N, N, N, N,
2551 D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
2552 /* 0xF8 - 0xFF */
2553 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
2554 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
2555};
2556
2557static struct opcode twobyte_table[256] = {
2558 /* 0x00 - 0x0F */
2559 N, GD(0, &group7), N, N,
2560 N, D(ImplicitOps), D(ImplicitOps | Priv), N,
2561 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
2562 N, D(ImplicitOps | ModRM), N, N,
2563 /* 0x10 - 0x1F */
2564 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
2565 /* 0x20 - 0x2F */
2566 D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
2567 D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
2568 N, N, N, N,
2569 N, N, N, N, N, N, N, N,
2570 /* 0x30 - 0x3F */
2571 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
2572 D(ImplicitOps | Priv), N,
2573 D(ImplicitOps), D(ImplicitOps | Priv), N, N,
2574 N, N, N, N, N, N, N, N,
2575 /* 0x40 - 0x4F */
2576 X16(D(DstReg | SrcMem | ModRM | Mov)),
2577 /* 0x50 - 0x5F */
2578 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2579 /* 0x60 - 0x6F */
2580 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2581 /* 0x70 - 0x7F */
2582 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2583 /* 0x80 - 0x8F */
2584 X16(D(SrcImm)),
2585 /* 0x90 - 0x9F */
2586 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
2587 /* 0xA0 - 0xA7 */
2588 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2589 N, D(DstMem | SrcReg | ModRM | BitOp),
2590 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2591 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
2592 /* 0xA8 - 0xAF */
2593 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2594 N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
2595 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2596 D(DstMem | SrcReg | Src2CL | ModRM),
2597 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
2598 /* 0xB0 - 0xB7 */
2599 D2bv(DstMem | SrcReg | ModRM | Lock),
2600 D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2601 D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
2602 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
2603 /* 0xB8 - 0xBF */
2604 N, N,
2605 G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2606 D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
2607 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
2608 /* 0xC0 - 0xCF */
2609 D2bv(DstMem | SrcReg | ModRM | Lock),
2610 N, D(DstMem | SrcReg | ModRM | Mov),
2611 N, N, N, GD(0, &group9),
2612 N, N, N, N, N, N, N, N,
2613 /* 0xD0 - 0xDF */
2614 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2615 /* 0xE0 - 0xEF */
2616 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2617 /* 0xF0 - 0xFF */
2618 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
2619};
2620
2621#undef D
2622#undef N
2623#undef G
2624#undef GD
2625#undef I
2626
2627#undef D2bv
2628#undef I2bv
2629#undef D6ALU
2630
2631static unsigned imm_size(struct decode_cache *c)
2632{
2633 unsigned size;
2634
2635 size = (c->d & ByteOp) ? 1 : c->op_bytes;
2636 if (size == 8)
2637 size = 4;
2638 return size;
2639}
2640
2641static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2642 unsigned size, bool sign_extension)
2643{
2644 struct decode_cache *c = &ctxt->decode;
2645 struct x86_emulate_ops *ops = ctxt->ops;
2646 int rc = X86EMUL_CONTINUE;
2647
2648 op->type = OP_IMM;
2649 op->bytes = size;
2650 op->addr.mem = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) {
2653 case 1:
2654 op->val = insn_fetch(s8, 1, c->eip);
2655 break;
2656 case 2:
2657 op->val = insn_fetch(s16, 2, c->eip);
2658 break;
2659 case 4:
2660 op->val = insn_fetch(s32, 4, c->eip);
2661 break;
2662 }
2663 if (!sign_extension) {
2664 switch (op->bytes) {
2665 case 1:
2666 op->val &= 0xff;
2667 break;
2668 case 2:
2669 op->val &= 0xffff;
2670 break;
2671 case 4:
2672 op->val &= 0xffffffff;
2673 break;
2674 }
2675 }
2676done:
2677 return rc;
2556} 2678}
2557 2679
2558int 2680int
2559x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2681x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2560{ 2682{
2683 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode;
2685 int rc = X86EMUL_CONTINUE;
2686 int mode = ctxt->mode;
2687 int def_op_bytes, def_ad_bytes, dual, goffset;
2688 struct opcode opcode, *g_mod012, *g_mod3;
2689 struct operand memop = { .type = OP_NONE };
2690
2691 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip;
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694
2695 switch (mode) {
2696 case X86EMUL_MODE_REAL:
2697 case X86EMUL_MODE_VM86:
2698 case X86EMUL_MODE_PROT16:
2699 def_op_bytes = def_ad_bytes = 2;
2700 break;
2701 case X86EMUL_MODE_PROT32:
2702 def_op_bytes = def_ad_bytes = 4;
2703 break;
2704#ifdef CONFIG_X86_64
2705 case X86EMUL_MODE_PROT64:
2706 def_op_bytes = 4;
2707 def_ad_bytes = 8;
2708 break;
2709#endif
2710 default:
2711 return -1;
2712 }
2713
2714 c->op_bytes = def_op_bytes;
2715 c->ad_bytes = def_ad_bytes;
2716
2717 /* Legacy prefixes. */
2718 for (;;) {
2719 switch (c->b = insn_fetch(u8, 1, c->eip)) {
2720 case 0x66: /* operand-size override */
2721 /* switch between 2/4 bytes */
2722 c->op_bytes = def_op_bytes ^ 6;
2723 break;
2724 case 0x67: /* address-size override */
2725 if (mode == X86EMUL_MODE_PROT64)
2726 /* switch between 4/8 bytes */
2727 c->ad_bytes = def_ad_bytes ^ 12;
2728 else
2729 /* switch between 2/4 bytes */
2730 c->ad_bytes = def_ad_bytes ^ 6;
2731 break;
2732 case 0x26: /* ES override */
2733 case 0x2e: /* CS override */
2734 case 0x36: /* SS override */
2735 case 0x3e: /* DS override */
2736 set_seg_override(c, (c->b >> 3) & 3);
2737 break;
2738 case 0x64: /* FS override */
2739 case 0x65: /* GS override */
2740 set_seg_override(c, c->b & 7);
2741 break;
2742 case 0x40 ... 0x4f: /* REX */
2743 if (mode != X86EMUL_MODE_PROT64)
2744 goto done_prefixes;
2745 c->rex_prefix = c->b;
2746 continue;
2747 case 0xf0: /* LOCK */
2748 c->lock_prefix = 1;
2749 break;
2750 case 0xf2: /* REPNE/REPNZ */
2751 c->rep_prefix = REPNE_PREFIX;
2752 break;
2753 case 0xf3: /* REP/REPE/REPZ */
2754 c->rep_prefix = REPE_PREFIX;
2755 break;
2756 default:
2757 goto done_prefixes;
2758 }
2759
2760 /* Any legacy prefix after a REX prefix nullifies its effect. */
2761
2762 c->rex_prefix = 0;
2763 }
2764
2765done_prefixes:
2766
2767 /* REX prefix. */
2768 if (c->rex_prefix & 8)
2769 c->op_bytes = 8; /* REX.W */
2770
2771 /* Opcode byte(s). */
2772 opcode = opcode_table[c->b];
2773 /* Two-byte opcode? */
2774 if (c->b == 0x0f) {
2775 c->twobyte = 1;
2776 c->b = insn_fetch(u8, 1, c->eip);
2777 opcode = twobyte_table[c->b];
2778 }
2779 c->d = opcode.flags;
2780
2781 if (c->d & Group) {
2782 dual = c->d & GroupDual;
2783 c->modrm = insn_fetch(u8, 1, c->eip);
2784 --c->eip;
2785
2786 if (c->d & GroupDual) {
2787 g_mod012 = opcode.u.gdual->mod012;
2788 g_mod3 = opcode.u.gdual->mod3;
2789 } else
2790 g_mod012 = g_mod3 = opcode.u.group;
2791
2792 c->d &= ~(Group | GroupDual);
2793
2794 goffset = (c->modrm >> 3) & 7;
2795
2796 if ((c->modrm >> 6) == 3)
2797 opcode = g_mod3[goffset];
2798 else
2799 opcode = g_mod012[goffset];
2800 c->d |= opcode.flags;
2801 }
2802
2803 c->execute = opcode.u.execute;
2804
2805 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) {
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1;
2809 }
2810
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8;
2813
2814 if (c->d & Op3264) {
2815 if (mode == X86EMUL_MODE_PROT64)
2816 c->op_bytes = 8;
2817 else
2818 c->op_bytes = 4;
2819 }
2820
2821 /* ModRM and SIB bytes. */
2822 if (c->d & ModRM) {
2823 rc = decode_modrm(ctxt, ops, &memop);
2824 if (!c->has_seg_override)
2825 set_seg_override(c, c->modrm_seg);
2826 } else if (c->d & MemAbs)
2827 rc = decode_abs(ctxt, ops, &memop);
2828 if (rc != X86EMUL_CONTINUE)
2829 goto done;
2830
2831 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS);
2833
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836
2837 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem;
2839
2840 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip;
2842
2843 /*
2844 * Decode and fetch the source operand: register, memory
2845 * or immediate.
2846 */
2847 switch (c->d & SrcMask) {
2848 case SrcNone:
2849 break;
2850 case SrcReg:
2851 decode_register_operand(&c->src, c, 0);
2852 break;
2853 case SrcMem16:
2854 memop.bytes = 2;
2855 goto srcmem_common;
2856 case SrcMem32:
2857 memop.bytes = 4;
2858 goto srcmem_common;
2859 case SrcMem:
2860 memop.bytes = (c->d & ByteOp) ? 1 :
2861 c->op_bytes;
2862 srcmem_common:
2863 c->src = memop;
2864 break;
2865 case SrcImmU16:
2866 rc = decode_imm(ctxt, &c->src, 2, false);
2867 break;
2868 case SrcImm:
2869 rc = decode_imm(ctxt, &c->src, imm_size(c), true);
2870 break;
2871 case SrcImmU:
2872 rc = decode_imm(ctxt, &c->src, imm_size(c), false);
2873 break;
2874 case SrcImmByte:
2875 rc = decode_imm(ctxt, &c->src, 1, true);
2876 break;
2877 case SrcImmUByte:
2878 rc = decode_imm(ctxt, &c->src, 1, false);
2879 break;
2880 case SrcAcc:
2881 c->src.type = OP_REG;
2882 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2883 c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
2884 fetch_register_operand(&c->src);
2885 break;
2886 case SrcOne:
2887 c->src.bytes = 1;
2888 c->src.val = 1;
2889 break;
2890 case SrcSI:
2891 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem =
2894 register_address(c, seg_override_base(ctxt, ops, c),
2895 c->regs[VCPU_REGS_RSI]);
2896 c->src.val = 0;
2897 break;
2898 case SrcImmFAddr:
2899 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip;
2901 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break;
2904 case SrcMemFAddr:
2905 memop.bytes = c->op_bytes + 2;
2906 goto srcmem_common;
2907 break;
2908 }
2909
2910 if (rc != X86EMUL_CONTINUE)
2911 goto done;
2912
2913 /*
2914 * Decode and fetch the second source operand: register, memory
2915 * or immediate.
2916 */
2917 switch (c->d & Src2Mask) {
2918 case Src2None:
2919 break;
2920 case Src2CL:
2921 c->src2.bytes = 1;
2922 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
2923 break;
2924 case Src2ImmByte:
2925 rc = decode_imm(ctxt, &c->src2, 1, true);
2926 break;
2927 case Src2One:
2928 c->src2.bytes = 1;
2929 c->src2.val = 1;
2930 break;
2931 case Src2Imm:
2932 rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
2933 break;
2934 }
2935
2936 if (rc != X86EMUL_CONTINUE)
2937 goto done;
2938
2939 /* Decode and fetch the destination operand: register or memory. */
2940 switch (c->d & DstMask) {
2941 case DstReg:
2942 decode_register_operand(&c->dst, c,
2943 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
2944 break;
2945 case DstImmUByte:
2946 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip;
2948 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break;
2951 case DstMem:
2952 case DstMem64:
2953 c->dst = memop;
2954 if ((c->d & DstMask) == DstMem64)
2955 c->dst.bytes = 8;
2956 else
2957 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2958 if (c->d & BitOp)
2959 fetch_bit_operand(c);
2960 c->dst.orig_val = c->dst.val;
2961 break;
2962 case DstAcc:
2963 c->dst.type = OP_REG;
2964 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2965 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
2966 fetch_register_operand(&c->dst);
2967 c->dst.orig_val = c->dst.val;
2968 break;
2969 case DstDI:
2970 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem =
2973 register_address(c, es_base(ctxt, ops),
2974 c->regs[VCPU_REGS_RDI]);
2975 c->dst.val = 0;
2976 break;
2977 case ImplicitOps:
2978 /* Special instructions do their own operand decoding. */
2979 default:
2980 c->dst.type = OP_NONE; /* Disable writeback. */
2981 return 0;
2982 }
2983
2984done:
2985 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2986}
2987
2988static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
2989{
2990 struct decode_cache *c = &ctxt->decode;
2991
2992 /* The second termination condition only applies for REPE
2993 * and REPNE. Test if the repeat string operation prefix is
2994 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2995 * corresponding termination condition according to:
2996 * - if REPE/REPZ and ZF = 0 then done
2997 * - if REPNE/REPNZ and ZF = 1 then done
2998 */
2999 if (((c->b == 0xa6) || (c->b == 0xa7) ||
3000 (c->b == 0xae) || (c->b == 0xaf))
3001 && (((c->rep_prefix == REPE_PREFIX) &&
3002 ((ctxt->eflags & EFLG_ZF) == 0))
3003 || ((c->rep_prefix == REPNE_PREFIX) &&
3004 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3005 return true;
3006
3007 return false;
3008}
3009
3010int
3011x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3012{
3013 struct x86_emulate_ops *ops = ctxt->ops;
2561 u64 msr_data; 3014 u64 msr_data;
2562 struct decode_cache *c = &ctxt->decode; 3015 struct decode_cache *c = &ctxt->decode;
2563 int rc = X86EMUL_CONTINUE; 3016 int rc = X86EMUL_CONTINUE;
2564 int saved_dst_type = c->dst.type; 3017 int saved_dst_type = c->dst.type;
3018 int irq; /* Used for int 3, int, and into */
2565 3019
2566 ctxt->decode.mem_read.pos = 0; 3020 ctxt->decode.mem_read.pos = 0;
2567 3021
@@ -2576,6 +3030,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2576 goto done; 3030 goto done;
2577 } 3031 }
2578 3032
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt);
3035 goto done;
3036 }
3037
2579 /* Privileged instruction can be executed only in CPL=0 */ 3038 /* Privileged instruction can be executed only in CPL=0 */
2580 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2581 emulate_gp(ctxt, 0); 3040 emulate_gp(ctxt, 0);
@@ -2583,35 +3042,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2583 } 3042 }
2584 3043
2585 if (c->rep_prefix && (c->d & String)) { 3044 if (c->rep_prefix && (c->d & String)) {
2586 ctxt->restart = true;
2587 /* All REP prefixes have the same first termination condition */ 3045 /* All REP prefixes have the same first termination condition */
2588 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3046 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2589 string_done:
2590 ctxt->restart = false;
2591 ctxt->eip = c->eip; 3047 ctxt->eip = c->eip;
2592 goto done; 3048 goto done;
2593 } 3049 }
2594 /* The second termination condition only applies for REPE
2595 * and REPNE. Test if the repeat string operation prefix is
2596 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2597 * corresponding termination condition according to:
2598 * - if REPE/REPZ and ZF = 0 then done
2599 * - if REPNE/REPNZ and ZF = 1 then done
2600 */
2601 if ((c->b == 0xa6) || (c->b == 0xa7) ||
2602 (c->b == 0xae) || (c->b == 0xaf)) {
2603 if ((c->rep_prefix == REPE_PREFIX) &&
2604 ((ctxt->eflags & EFLG_ZF) == 0))
2605 goto string_done;
2606 if ((c->rep_prefix == REPNE_PREFIX) &&
2607 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
2608 goto string_done;
2609 }
2610 c->eip = ctxt->eip;
2611 } 3050 }
2612 3051
2613 if (c->src.type == OP_MEM) { 3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
2614 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, 3053 rc = read_emulated(ctxt, ops, c->src.addr.mem,
2615 c->src.valptr, c->src.bytes); 3054 c->src.valptr, c->src.bytes);
2616 if (rc != X86EMUL_CONTINUE) 3055 if (rc != X86EMUL_CONTINUE)
2617 goto done; 3056 goto done;
@@ -2619,7 +3058,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2619 } 3058 }
2620 3059
2621 if (c->src2.type == OP_MEM) { 3060 if (c->src2.type == OP_MEM) {
2622 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, 3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem,
2623 &c->src2.val, c->src2.bytes); 3062 &c->src2.val, c->src2.bytes);
2624 if (rc != X86EMUL_CONTINUE) 3063 if (rc != X86EMUL_CONTINUE)
2625 goto done; 3064 goto done;
@@ -2631,7 +3070,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2631 3070
2632 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2633 /* optimisation - avoid slow emulated read if Mov */ 3072 /* optimisation - avoid slow emulated read if Mov */
2634 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, 3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem,
2635 &c->dst.val, c->dst.bytes); 3074 &c->dst.val, c->dst.bytes);
2636 if (rc != X86EMUL_CONTINUE) 3075 if (rc != X86EMUL_CONTINUE)
2637 goto done; 3076 goto done;
@@ -2640,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2640 3079
2641special_insn: 3080special_insn:
2642 3081
3082 if (c->execute) {
3083 rc = c->execute(ctxt);
3084 if (rc != X86EMUL_CONTINUE)
3085 goto done;
3086 goto writeback;
3087 }
3088
2643 if (c->twobyte) 3089 if (c->twobyte)
2644 goto twobyte_insn; 3090 goto twobyte_insn;
2645 3091
@@ -2653,8 +3099,6 @@ special_insn:
2653 break; 3099 break;
2654 case 0x07: /* pop es */ 3100 case 0x07: /* pop es */
2655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3101 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
2656 if (rc != X86EMUL_CONTINUE)
2657 goto done;
2658 break; 3102 break;
2659 case 0x08 ... 0x0d: 3103 case 0x08 ... 0x0d:
2660 or: /* or */ 3104 or: /* or */
@@ -2672,8 +3116,6 @@ special_insn:
2672 break; 3116 break;
2673 case 0x17: /* pop ss */ 3117 case 0x17: /* pop ss */
2674 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3118 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
2675 if (rc != X86EMUL_CONTINUE)
2676 goto done;
2677 break; 3119 break;
2678 case 0x18 ... 0x1d: 3120 case 0x18 ... 0x1d:
2679 sbb: /* sbb */ 3121 sbb: /* sbb */
@@ -2684,8 +3126,6 @@ special_insn:
2684 break; 3126 break;
2685 case 0x1f: /* pop ds */ 3127 case 0x1f: /* pop ds */
2686 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3128 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
2687 if (rc != X86EMUL_CONTINUE)
2688 goto done;
2689 break; 3129 break;
2690 case 0x20 ... 0x25: 3130 case 0x20 ... 0x25:
2691 and: /* and */ 3131 and: /* and */
@@ -2709,58 +3149,29 @@ special_insn:
2709 case 0x48 ... 0x4f: /* dec r16/r32 */ 3149 case 0x48 ... 0x4f: /* dec r16/r32 */
2710 emulate_1op("dec", c->dst, ctxt->eflags); 3150 emulate_1op("dec", c->dst, ctxt->eflags);
2711 break; 3151 break;
2712 case 0x50 ... 0x57: /* push reg */
2713 emulate_push(ctxt, ops);
2714 break;
2715 case 0x58 ... 0x5f: /* pop reg */ 3152 case 0x58 ... 0x5f: /* pop reg */
2716 pop_instruction: 3153 pop_instruction:
2717 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 3154 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
2718 if (rc != X86EMUL_CONTINUE)
2719 goto done;
2720 break; 3155 break;
2721 case 0x60: /* pusha */ 3156 case 0x60: /* pusha */
2722 rc = emulate_pusha(ctxt, ops); 3157 rc = emulate_pusha(ctxt, ops);
2723 if (rc != X86EMUL_CONTINUE)
2724 goto done;
2725 break; 3158 break;
2726 case 0x61: /* popa */ 3159 case 0x61: /* popa */
2727 rc = emulate_popa(ctxt, ops); 3160 rc = emulate_popa(ctxt, ops);
2728 if (rc != X86EMUL_CONTINUE)
2729 goto done;
2730 break; 3161 break;
2731 case 0x63: /* movsxd */ 3162 case 0x63: /* movsxd */
2732 if (ctxt->mode != X86EMUL_MODE_PROT64) 3163 if (ctxt->mode != X86EMUL_MODE_PROT64)
2733 goto cannot_emulate; 3164 goto cannot_emulate;
2734 c->dst.val = (s32) c->src.val; 3165 c->dst.val = (s32) c->src.val;
2735 break; 3166 break;
2736 case 0x68: /* push imm */
2737 case 0x6a: /* push imm8 */
2738 emulate_push(ctxt, ops);
2739 break;
2740 case 0x6c: /* insb */ 3167 case 0x6c: /* insb */
2741 case 0x6d: /* insw/insd */ 3168 case 0x6d: /* insw/insd */
2742 c->dst.bytes = min(c->dst.bytes, 4u); 3169 c->src.val = c->regs[VCPU_REGS_RDX];
2743 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3170 goto do_io_in;
2744 c->dst.bytes)) {
2745 emulate_gp(ctxt, 0);
2746 goto done;
2747 }
2748 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2749 c->regs[VCPU_REGS_RDX], &c->dst.val))
2750 goto done; /* IO is needed, skip writeback */
2751 break;
2752 case 0x6e: /* outsb */ 3171 case 0x6e: /* outsb */
2753 case 0x6f: /* outsw/outsd */ 3172 case 0x6f: /* outsw/outsd */
2754 c->src.bytes = min(c->src.bytes, 4u); 3173 c->dst.val = c->regs[VCPU_REGS_RDX];
2755 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3174 goto do_io_out;
2756 c->src.bytes)) {
2757 emulate_gp(ctxt, 0);
2758 goto done;
2759 }
2760 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2761 &c->src.val, 1, ctxt->vcpu);
2762
2763 c->dst.type = OP_NONE; /* nothing to writeback */
2764 break; 3175 break;
2765 case 0x70 ... 0x7f: /* jcc (short) */ 3176 case 0x70 ... 0x7f: /* jcc (short) */
2766 if (test_cc(c->b, ctxt->eflags)) 3177 if (test_cc(c->b, ctxt->eflags))
@@ -2793,29 +3204,15 @@ special_insn:
2793 case 0x86 ... 0x87: /* xchg */ 3204 case 0x86 ... 0x87: /* xchg */
2794 xchg: 3205 xchg:
2795 /* Write back the register source. */ 3206 /* Write back the register source. */
2796 switch (c->dst.bytes) { 3207 c->src.val = c->dst.val;
2797 case 1: 3208 write_register_operand(&c->src);
2798 *(u8 *) c->src.ptr = (u8) c->dst.val;
2799 break;
2800 case 2:
2801 *(u16 *) c->src.ptr = (u16) c->dst.val;
2802 break;
2803 case 4:
2804 *c->src.ptr = (u32) c->dst.val;
2805 break; /* 64b reg: zero-extend */
2806 case 8:
2807 *c->src.ptr = c->dst.val;
2808 break;
2809 }
2810 /* 3209 /*
2811 * Write back the memory destination with implicit LOCK 3210 * Write back the memory destination with implicit LOCK
2812 * prefix. 3211 * prefix.
2813 */ 3212 */
2814 c->dst.val = c->src.val; 3213 c->dst.val = c->src.orig_val;
2815 c->lock_prefix = 1; 3214 c->lock_prefix = 1;
2816 break; 3215 break;
2817 case 0x88 ... 0x8b: /* mov */
2818 goto mov;
2819 case 0x8c: /* mov r/m, sreg */ 3216 case 0x8c: /* mov r/m, sreg */
2820 if (c->modrm_reg > VCPU_SREG_GS) { 3217 if (c->modrm_reg > VCPU_SREG_GS) {
2821 emulate_ud(ctxt); 3218 emulate_ud(ctxt);
@@ -2824,7 +3221,7 @@ special_insn:
2824 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2825 break; 3222 break;
2826 case 0x8d: /* lea r16/r32, m */ 3223 case 0x8d: /* lea r16/r32, m */
2827 c->dst.val = c->modrm_ea; 3224 c->dst.val = c->src.addr.mem;
2828 break; 3225 break;
2829 case 0x8e: { /* mov seg, r/m16 */ 3226 case 0x8e: { /* mov seg, r/m16 */
2830 uint16_t sel; 3227 uint16_t sel;
@@ -2847,76 +3244,87 @@ special_insn:
2847 } 3244 }
2848 case 0x8f: /* pop (sole member of Grp1a) */ 3245 case 0x8f: /* pop (sole member of Grp1a) */
2849 rc = emulate_grp1a(ctxt, ops); 3246 rc = emulate_grp1a(ctxt, ops);
2850 if (rc != X86EMUL_CONTINUE)
2851 goto done;
2852 break; 3247 break;
2853 case 0x90: /* nop / xchg r8,rax */ 3248 case 0x90 ... 0x97: /* nop / xchg reg, rax */
2854 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { 3249 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
2855 c->dst.type = OP_NONE; /* nop */
2856 break; 3250 break;
2857 }
2858 case 0x91 ... 0x97: /* xchg reg,rax */
2859 c->src.type = OP_REG;
2860 c->src.bytes = c->op_bytes;
2861 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2862 c->src.val = *(c->src.ptr);
2863 goto xchg; 3251 goto xchg;
3252 case 0x98: /* cbw/cwde/cdqe */
3253 switch (c->op_bytes) {
3254 case 2: c->dst.val = (s8)c->dst.val; break;
3255 case 4: c->dst.val = (s16)c->dst.val; break;
3256 case 8: c->dst.val = (s32)c->dst.val; break;
3257 }
3258 break;
2864 case 0x9c: /* pushf */ 3259 case 0x9c: /* pushf */
2865 c->src.val = (unsigned long) ctxt->eflags; 3260 c->src.val = (unsigned long) ctxt->eflags;
2866 emulate_push(ctxt, ops); 3261 emulate_push(ctxt, ops);
2867 break; 3262 break;
2868 case 0x9d: /* popf */ 3263 case 0x9d: /* popf */
2869 c->dst.type = OP_REG; 3264 c->dst.type = OP_REG;
2870 c->dst.ptr = (unsigned long *) &ctxt->eflags; 3265 c->dst.addr.reg = &ctxt->eflags;
2871 c->dst.bytes = c->op_bytes; 3266 c->dst.bytes = c->op_bytes;
2872 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); 3267 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2873 if (rc != X86EMUL_CONTINUE)
2874 goto done;
2875 break; 3268 break;
2876 case 0xa0 ... 0xa3: /* mov */
2877 case 0xa4 ... 0xa5: /* movs */
2878 goto mov;
2879 case 0xa6 ... 0xa7: /* cmps */ 3269 case 0xa6 ... 0xa7: /* cmps */
2880 c->dst.type = OP_NONE; /* Disable writeback. */ 3270 c->dst.type = OP_NONE; /* Disable writeback. */
2881 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
2882 goto cmp; 3272 goto cmp;
2883 case 0xa8 ... 0xa9: /* test ax, imm */ 3273 case 0xa8 ... 0xa9: /* test ax, imm */
2884 goto test; 3274 goto test;
2885 case 0xaa ... 0xab: /* stos */
2886 c->dst.val = c->regs[VCPU_REGS_RAX];
2887 break;
2888 case 0xac ... 0xad: /* lods */
2889 goto mov;
2890 case 0xae ... 0xaf: /* scas */ 3275 case 0xae ... 0xaf: /* scas */
2891 DPRINTF("Urk! I don't handle SCAS.\n"); 3276 goto cmp;
2892 goto cannot_emulate;
2893 case 0xb0 ... 0xbf: /* mov r, imm */
2894 goto mov;
2895 case 0xc0 ... 0xc1: 3277 case 0xc0 ... 0xc1:
2896 emulate_grp2(ctxt); 3278 emulate_grp2(ctxt);
2897 break; 3279 break;
2898 case 0xc3: /* ret */ 3280 case 0xc3: /* ret */
2899 c->dst.type = OP_REG; 3281 c->dst.type = OP_REG;
2900 c->dst.ptr = &c->eip; 3282 c->dst.addr.reg = &c->eip;
2901 c->dst.bytes = c->op_bytes; 3283 c->dst.bytes = c->op_bytes;
2902 goto pop_instruction; 3284 goto pop_instruction;
2903 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 3285 case 0xc4: /* les */
2904 mov: 3286 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
2905 c->dst.val = c->src.val; 3287 break;
3288 case 0xc5: /* lds */
3289 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
2906 break; 3290 break;
2907 case 0xcb: /* ret far */ 3291 case 0xcb: /* ret far */
2908 rc = emulate_ret_far(ctxt, ops); 3292 rc = emulate_ret_far(ctxt, ops);
2909 if (rc != X86EMUL_CONTINUE) 3293 break;
2910 goto done; 3294 case 0xcc: /* int3 */
3295 irq = 3;
3296 goto do_interrupt;
3297 case 0xcd: /* int n */
3298 irq = c->src.val;
3299 do_interrupt:
3300 rc = emulate_int(ctxt, ops, irq);
3301 break;
3302 case 0xce: /* into */
3303 if (ctxt->eflags & EFLG_OF) {
3304 irq = 4;
3305 goto do_interrupt;
3306 }
3307 break;
3308 case 0xcf: /* iret */
3309 rc = emulate_iret(ctxt, ops);
2911 break; 3310 break;
2912 case 0xd0 ... 0xd1: /* Grp2 */ 3311 case 0xd0 ... 0xd1: /* Grp2 */
2913 c->src.val = 1;
2914 emulate_grp2(ctxt); 3312 emulate_grp2(ctxt);
2915 break; 3313 break;
2916 case 0xd2 ... 0xd3: /* Grp2 */ 3314 case 0xd2 ... 0xd3: /* Grp2 */
2917 c->src.val = c->regs[VCPU_REGS_RCX]; 3315 c->src.val = c->regs[VCPU_REGS_RCX];
2918 emulate_grp2(ctxt); 3316 emulate_grp2(ctxt);
2919 break; 3317 break;
3318 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
3319 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3320 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
3321 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
3322 jmp_rel(c, c->src.val);
3323 break;
3324 case 0xe3: /* jcxz/jecxz/jrcxz */
3325 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
3326 jmp_rel(c, c->src.val);
3327 break;
2920 case 0xe4: /* inb */ 3328 case 0xe4: /* inb */
2921 case 0xe5: /* in */ 3329 case 0xe5: /* in */
2922 goto do_io_in; 3330 goto do_io_in;
@@ -2964,15 +3372,16 @@ special_insn:
2964 break; 3372 break;
2965 case 0xee: /* out dx,al */ 3373 case 0xee: /* out dx,al */
2966 case 0xef: /* out dx,(e/r)ax */ 3374 case 0xef: /* out dx,(e/r)ax */
2967 c->src.val = c->regs[VCPU_REGS_RDX]; 3375 c->dst.val = c->regs[VCPU_REGS_RDX];
2968 do_io_out: 3376 do_io_out:
2969 c->dst.bytes = min(c->dst.bytes, 4u); 3377 c->src.bytes = min(c->src.bytes, 4u);
2970 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3378 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) {
2971 emulate_gp(ctxt, 0); 3380 emulate_gp(ctxt, 0);
2972 goto done; 3381 goto done;
2973 } 3382 }
2974 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 3383 ops->pio_out_emulated(c->src.bytes, c->dst.val,
2975 ctxt->vcpu); 3384 &c->src.val, 1, ctxt->vcpu);
2976 c->dst.type = OP_NONE; /* Disable writeback. */ 3385 c->dst.type = OP_NONE; /* Disable writeback. */
2977 break; 3386 break;
2978 case 0xf4: /* hlt */ 3387 case 0xf4: /* hlt */
@@ -2981,24 +3390,22 @@ special_insn:
2981 case 0xf5: /* cmc */ 3390 case 0xf5: /* cmc */
2982 /* complement carry flag from eflags reg */ 3391 /* complement carry flag from eflags reg */
2983 ctxt->eflags ^= EFLG_CF; 3392 ctxt->eflags ^= EFLG_CF;
2984 c->dst.type = OP_NONE; /* Disable writeback. */
2985 break; 3393 break;
2986 case 0xf6 ... 0xf7: /* Grp3 */ 3394 case 0xf6 ... 0xf7: /* Grp3 */
2987 if (!emulate_grp3(ctxt, ops)) 3395 rc = emulate_grp3(ctxt, ops);
2988 goto cannot_emulate;
2989 break; 3396 break;
2990 case 0xf8: /* clc */ 3397 case 0xf8: /* clc */
2991 ctxt->eflags &= ~EFLG_CF; 3398 ctxt->eflags &= ~EFLG_CF;
2992 c->dst.type = OP_NONE; /* Disable writeback. */ 3399 break;
3400 case 0xf9: /* stc */
3401 ctxt->eflags |= EFLG_CF;
2993 break; 3402 break;
2994 case 0xfa: /* cli */ 3403 case 0xfa: /* cli */
2995 if (emulator_bad_iopl(ctxt, ops)) { 3404 if (emulator_bad_iopl(ctxt, ops)) {
2996 emulate_gp(ctxt, 0); 3405 emulate_gp(ctxt, 0);
2997 goto done; 3406 goto done;
2998 } else { 3407 } else
2999 ctxt->eflags &= ~X86_EFLAGS_IF; 3408 ctxt->eflags &= ~X86_EFLAGS_IF;
3000 c->dst.type = OP_NONE; /* Disable writeback. */
3001 }
3002 break; 3409 break;
3003 case 0xfb: /* sti */ 3410 case 0xfb: /* sti */
3004 if (emulator_bad_iopl(ctxt, ops)) { 3411 if (emulator_bad_iopl(ctxt, ops)) {
@@ -3007,29 +3414,29 @@ special_insn:
3007 } else { 3414 } else {
3008 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
3009 ctxt->eflags |= X86_EFLAGS_IF; 3416 ctxt->eflags |= X86_EFLAGS_IF;
3010 c->dst.type = OP_NONE; /* Disable writeback. */
3011 } 3417 }
3012 break; 3418 break;
3013 case 0xfc: /* cld */ 3419 case 0xfc: /* cld */
3014 ctxt->eflags &= ~EFLG_DF; 3420 ctxt->eflags &= ~EFLG_DF;
3015 c->dst.type = OP_NONE; /* Disable writeback. */
3016 break; 3421 break;
3017 case 0xfd: /* std */ 3422 case 0xfd: /* std */
3018 ctxt->eflags |= EFLG_DF; 3423 ctxt->eflags |= EFLG_DF;
3019 c->dst.type = OP_NONE; /* Disable writeback. */
3020 break; 3424 break;
3021 case 0xfe: /* Grp4 */ 3425 case 0xfe: /* Grp4 */
3022 grp45: 3426 grp45:
3023 rc = emulate_grp45(ctxt, ops); 3427 rc = emulate_grp45(ctxt, ops);
3024 if (rc != X86EMUL_CONTINUE)
3025 goto done;
3026 break; 3428 break;
3027 case 0xff: /* Grp5 */ 3429 case 0xff: /* Grp5 */
3028 if (c->modrm_reg == 5) 3430 if (c->modrm_reg == 5)
3029 goto jump_far; 3431 goto jump_far;
3030 goto grp45; 3432 goto grp45;
3433 default:
3434 goto cannot_emulate;
3031 } 3435 }
3032 3436
3437 if (rc != X86EMUL_CONTINUE)
3438 goto done;
3439
3033writeback: 3440writeback:
3034 rc = writeback(ctxt, ops); 3441 rc = writeback(ctxt, ops);
3035 if (rc != X86EMUL_CONTINUE) 3442 if (rc != X86EMUL_CONTINUE)
@@ -3050,25 +3457,32 @@ writeback:
3050 &c->dst); 3457 &c->dst);
3051 3458
3052 if (c->rep_prefix && (c->d & String)) { 3459 if (c->rep_prefix && (c->d & String)) {
3053 struct read_cache *rc = &ctxt->decode.io_read; 3460 struct read_cache *r = &ctxt->decode.io_read;
3054 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 3461 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3055 /* 3462
3056 * Re-enter guest when pio read ahead buffer is empty or, 3463 if (!string_insn_completed(ctxt)) {
3057 * if it is not used, after each 1024 iteration. 3464 /*
3058 */ 3465 * Re-enter guest when pio read ahead buffer is empty
3059 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || 3466 * or, if it is not used, after each 1024 iteration.
3060 (rc->end != 0 && rc->end == rc->pos)) 3467 */
3061 ctxt->restart = false; 3468 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
3469 (r->end == 0 || r->end != r->pos)) {
3470 /*
3471 * Reset read cache. Usually happens before
3472 * decode, but since instruction is restarted
3473 * we have to do it here.
3474 */
3475 ctxt->decode.mem_read.end = 0;
3476 return EMULATION_RESTART;
3477 }
3478 goto done; /* skip rip writeback */
3479 }
3062 } 3480 }
3063 /* 3481
3064 * reset read cache here in case string instruction is restared
3065 * without decoding
3066 */
3067 ctxt->decode.mem_read.end = 0;
3068 ctxt->eip = c->eip; 3482 ctxt->eip = c->eip;
3069 3483
3070done: 3484done:
3071 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3072 3486
3073twobyte_insn: 3487twobyte_insn:
3074 switch (c->b) { 3488 switch (c->b) {
@@ -3091,7 +3505,7 @@ twobyte_insn:
3091 c->dst.type = OP_NONE; 3505 c->dst.type = OP_NONE;
3092 break; 3506 break;
3093 case 2: /* lgdt */ 3507 case 2: /* lgdt */
3094 rc = read_descriptor(ctxt, ops, c->src.ptr, 3508 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3095 &size, &address, c->op_bytes); 3509 &size, &address, c->op_bytes);
3096 if (rc != X86EMUL_CONTINUE) 3510 if (rc != X86EMUL_CONTINUE)
3097 goto done; 3511 goto done;
@@ -3104,14 +3518,12 @@ twobyte_insn:
3104 switch (c->modrm_rm) { 3518 switch (c->modrm_rm) {
3105 case 1: 3519 case 1:
3106 rc = kvm_fix_hypercall(ctxt->vcpu); 3520 rc = kvm_fix_hypercall(ctxt->vcpu);
3107 if (rc != X86EMUL_CONTINUE)
3108 goto done;
3109 break; 3521 break;
3110 default: 3522 default:
3111 goto cannot_emulate; 3523 goto cannot_emulate;
3112 } 3524 }
3113 } else { 3525 } else {
3114 rc = read_descriptor(ctxt, ops, c->src.ptr, 3526 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3115 &size, &address, 3527 &size, &address,
3116 c->op_bytes); 3528 c->op_bytes);
3117 if (rc != X86EMUL_CONTINUE) 3529 if (rc != X86EMUL_CONTINUE)
@@ -3126,7 +3538,7 @@ twobyte_insn:
3126 c->dst.val = ops->get_cr(0, ctxt->vcpu); 3538 c->dst.val = ops->get_cr(0, ctxt->vcpu);
3127 break; 3539 break;
3128 case 6: /* lmsw */ 3540 case 6: /* lmsw */
3129 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | 3541 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
3130 (c->src.val & 0x0f), ctxt->vcpu); 3542 (c->src.val & 0x0f), ctxt->vcpu);
3131 c->dst.type = OP_NONE; 3543 c->dst.type = OP_NONE;
3132 break; 3544 break;
@@ -3134,7 +3546,7 @@ twobyte_insn:
3134 emulate_ud(ctxt); 3546 emulate_ud(ctxt);
3135 goto done; 3547 goto done;
3136 case 7: /* invlpg*/ 3548 case 7: /* invlpg*/
3137 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
3138 /* Disable writeback. */ 3550 /* Disable writeback. */
3139 c->dst.type = OP_NONE; 3551 c->dst.type = OP_NONE;
3140 break; 3552 break;
@@ -3144,23 +3556,16 @@ twobyte_insn:
3144 break; 3556 break;
3145 case 0x05: /* syscall */ 3557 case 0x05: /* syscall */
3146 rc = emulate_syscall(ctxt, ops); 3558 rc = emulate_syscall(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE)
3148 goto done;
3149 else
3150 goto writeback;
3151 break; 3559 break;
3152 case 0x06: 3560 case 0x06:
3153 emulate_clts(ctxt->vcpu); 3561 emulate_clts(ctxt->vcpu);
3154 c->dst.type = OP_NONE;
3155 break; 3562 break;
3156 case 0x09: /* wbinvd */ 3563 case 0x09: /* wbinvd */
3157 kvm_emulate_wbinvd(ctxt->vcpu); 3564 kvm_emulate_wbinvd(ctxt->vcpu);
3158 c->dst.type = OP_NONE;
3159 break; 3565 break;
3160 case 0x08: /* invd */ 3566 case 0x08: /* invd */
3161 case 0x0d: /* GrpP (prefetch) */ 3567 case 0x0d: /* GrpP (prefetch) */
3162 case 0x18: /* Grp16 (prefetch/nop) */ 3568 case 0x18: /* Grp16 (prefetch/nop) */
3163 c->dst.type = OP_NONE;
3164 break; 3569 break;
3165 case 0x20: /* mov cr, reg */ 3570 case 0x20: /* mov cr, reg */
3166 switch (c->modrm_reg) { 3571 switch (c->modrm_reg) {
@@ -3170,8 +3575,7 @@ twobyte_insn:
3170 emulate_ud(ctxt); 3575 emulate_ud(ctxt);
3171 goto done; 3576 goto done;
3172 } 3577 }
3173 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
3174 c->dst.type = OP_NONE; /* no writeback */
3175 break; 3579 break;
3176 case 0x21: /* mov from dr to reg */ 3580 case 0x21: /* mov from dr to reg */
3177 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
@@ -3179,11 +3583,10 @@ twobyte_insn:
3179 emulate_ud(ctxt); 3583 emulate_ud(ctxt);
3180 goto done; 3584 goto done;
3181 } 3585 }
3182 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); 3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
3183 c->dst.type = OP_NONE; /* no writeback */
3184 break; 3587 break;
3185 case 0x22: /* mov reg, cr */ 3588 case 0x22: /* mov reg, cr */
3186 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { 3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3187 emulate_gp(ctxt, 0); 3590 emulate_gp(ctxt, 0);
3188 goto done; 3591 goto done;
3189 } 3592 }
@@ -3196,7 +3599,7 @@ twobyte_insn:
3196 goto done; 3599 goto done;
3197 } 3600 }
3198 3601
3199 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & 3602 if (ops->set_dr(c->modrm_reg, c->src.val &
3200 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 3603 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3201 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3604 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3202 /* #UD condition is already handled by the code above */ 3605 /* #UD condition is already handled by the code above */
@@ -3215,7 +3618,6 @@ twobyte_insn:
3215 goto done; 3618 goto done;
3216 } 3619 }
3217 rc = X86EMUL_CONTINUE; 3620 rc = X86EMUL_CONTINUE;
3218 c->dst.type = OP_NONE;
3219 break; 3621 break;
3220 case 0x32: 3622 case 0x32:
3221 /* rdmsr */ 3623 /* rdmsr */
@@ -3227,21 +3629,12 @@ twobyte_insn:
3227 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3629 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
3228 } 3630 }
3229 rc = X86EMUL_CONTINUE; 3631 rc = X86EMUL_CONTINUE;
3230 c->dst.type = OP_NONE;
3231 break; 3632 break;
3232 case 0x34: /* sysenter */ 3633 case 0x34: /* sysenter */
3233 rc = emulate_sysenter(ctxt, ops); 3634 rc = emulate_sysenter(ctxt, ops);
3234 if (rc != X86EMUL_CONTINUE)
3235 goto done;
3236 else
3237 goto writeback;
3238 break; 3635 break;
3239 case 0x35: /* sysexit */ 3636 case 0x35: /* sysexit */
3240 rc = emulate_sysexit(ctxt, ops); 3637 rc = emulate_sysexit(ctxt, ops);
3241 if (rc != X86EMUL_CONTINUE)
3242 goto done;
3243 else
3244 goto writeback;
3245 break; 3638 break;
3246 case 0x40 ... 0x4f: /* cmov */ 3639 case 0x40 ... 0x4f: /* cmov */
3247 c->dst.val = c->dst.orig_val = c->src.val; 3640 c->dst.val = c->dst.orig_val = c->src.val;
@@ -3251,15 +3644,15 @@ twobyte_insn:
3251 case 0x80 ... 0x8f: /* jnz rel, etc*/ 3644 case 0x80 ... 0x8f: /* jnz rel, etc*/
3252 if (test_cc(c->b, ctxt->eflags)) 3645 if (test_cc(c->b, ctxt->eflags))
3253 jmp_rel(c, c->src.val); 3646 jmp_rel(c, c->src.val);
3254 c->dst.type = OP_NONE; 3647 break;
3648 case 0x90 ... 0x9f: /* setcc r/m8 */
3649 c->dst.val = test_cc(c->b, ctxt->eflags);
3255 break; 3650 break;
3256 case 0xa0: /* push fs */ 3651 case 0xa0: /* push fs */
3257 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 3652 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3258 break; 3653 break;
3259 case 0xa1: /* pop fs */ 3654 case 0xa1: /* pop fs */
3260 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
3261 if (rc != X86EMUL_CONTINUE)
3262 goto done;
3263 break; 3656 break;
3264 case 0xa3: 3657 case 0xa3:
3265 bt: /* bt */ 3658 bt: /* bt */
@@ -3277,13 +3670,9 @@ twobyte_insn:
3277 break; 3670 break;
3278 case 0xa9: /* pop gs */ 3671 case 0xa9: /* pop gs */
3279 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3672 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
3280 if (rc != X86EMUL_CONTINUE)
3281 goto done;
3282 break; 3673 break;
3283 case 0xab: 3674 case 0xab:
3284 bts: /* bts */ 3675 bts: /* bts */
3285 /* only subword offset */
3286 c->src.val &= (c->dst.bytes << 3) - 1;
3287 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 3676 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
3288 break; 3677 break;
3289 case 0xac: /* shrd imm8, r, r/m */ 3678 case 0xac: /* shrd imm8, r, r/m */
@@ -3306,15 +3695,22 @@ twobyte_insn:
3306 } else { 3695 } else {
3307 /* Failure: write the value we saw to EAX. */ 3696 /* Failure: write the value we saw to EAX. */
3308 c->dst.type = OP_REG; 3697 c->dst.type = OP_REG;
3309 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 3698 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
3310 } 3699 }
3311 break; 3700 break;
3701 case 0xb2: /* lss */
3702 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
3703 break;
3312 case 0xb3: 3704 case 0xb3:
3313 btr: /* btr */ 3705 btr: /* btr */
3314 /* only subword offset */
3315 c->src.val &= (c->dst.bytes << 3) - 1;
3316 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 3706 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
3317 break; 3707 break;
3708 case 0xb4: /* lfs */
3709 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
3710 break;
3711 case 0xb5: /* lgs */
3712 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
3713 break;
3318 case 0xb6 ... 0xb7: /* movzx */ 3714 case 0xb6 ... 0xb7: /* movzx */
3319 c->dst.bytes = c->op_bytes; 3715 c->dst.bytes = c->op_bytes;
3320 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 3716 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3334,15 +3730,43 @@ twobyte_insn:
3334 break; 3730 break;
3335 case 0xbb: 3731 case 0xbb:
3336 btc: /* btc */ 3732 btc: /* btc */
3337 /* only subword offset */
3338 c->src.val &= (c->dst.bytes << 3) - 1;
3339 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 3733 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
3340 break; 3734 break;
3735 case 0xbc: { /* bsf */
3736 u8 zf;
3737 __asm__ ("bsf %2, %0; setz %1"
3738 : "=r"(c->dst.val), "=q"(zf)
3739 : "r"(c->src.val));
3740 ctxt->eflags &= ~X86_EFLAGS_ZF;
3741 if (zf) {
3742 ctxt->eflags |= X86_EFLAGS_ZF;
3743 c->dst.type = OP_NONE; /* Disable writeback. */
3744 }
3745 break;
3746 }
3747 case 0xbd: { /* bsr */
3748 u8 zf;
3749 __asm__ ("bsr %2, %0; setz %1"
3750 : "=r"(c->dst.val), "=q"(zf)
3751 : "r"(c->src.val));
3752 ctxt->eflags &= ~X86_EFLAGS_ZF;
3753 if (zf) {
3754 ctxt->eflags |= X86_EFLAGS_ZF;
3755 c->dst.type = OP_NONE; /* Disable writeback. */
3756 }
3757 break;
3758 }
3341 case 0xbe ... 0xbf: /* movsx */ 3759 case 0xbe ... 0xbf: /* movsx */
3342 c->dst.bytes = c->op_bytes; 3760 c->dst.bytes = c->op_bytes;
3343 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 3761 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
3344 (s16) c->src.val; 3762 (s16) c->src.val;
3345 break; 3763 break;
3764 case 0xc0 ... 0xc1: /* xadd */
3765 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
3766 /* Write back the register source. */
3767 c->src.val = c->dst.orig_val;
3768 write_register_operand(&c->src);
3769 break;
3346 case 0xc3: /* movnti */ 3770 case 0xc3: /* movnti */
3347 c->dst.bytes = c->op_bytes; 3771 c->dst.bytes = c->op_bytes;
3348 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 3772 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
@@ -3350,10 +3774,14 @@ twobyte_insn:
3350 break; 3774 break;
3351 case 0xc7: /* Grp9 (cmpxchg8b) */ 3775 case 0xc7: /* Grp9 (cmpxchg8b) */
3352 rc = emulate_grp9(ctxt, ops); 3776 rc = emulate_grp9(ctxt, ops);
3353 if (rc != X86EMUL_CONTINUE)
3354 goto done;
3355 break; 3777 break;
3778 default:
3779 goto cannot_emulate;
3356 } 3780 }
3781
3782 if (rc != X86EMUL_CONTINUE)
3783 goto done;
3784
3357 goto writeback; 3785 goto writeback;
3358 3786
3359cannot_emulate: 3787cannot_emulate:
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ddeb2314b522..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
232 } 232 }
233} 233}
234 234
235int pit_has_pending_timer(struct kvm_vcpu *vcpu)
236{
237 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
238
239 if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
240 return atomic_read(&pit->pit_state.pit_timer.pending);
241 return 0;
242}
243
244static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) 235static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
245{ 236{
246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 237 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4b7b73ce2098..f628234fbeca 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affilates. 6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
7 * 7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level);
39static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
40 __acquires(&s->lock) 40 __acquires(&s->lock)
41{ 41{
42 raw_spin_lock(&s->lock); 42 spin_lock(&s->lock);
43} 43}
44 44
45static void pic_unlock(struct kvm_pic *s) 45static void pic_unlock(struct kvm_pic *s)
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s)
51 51
52 s->wakeup_needed = false; 52 s->wakeup_needed = false;
53 53
54 raw_spin_unlock(&s->lock); 54 spin_unlock(&s->lock);
55 55
56 if (wakeup) { 56 if (wakeup) {
57 kvm_for_each_vcpu(i, vcpu, s->kvm) { 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
@@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s)
67 if (!found) 67 if (!found)
68 return; 68 return;
69 69
70 kvm_make_request(KVM_REQ_EVENT, found);
70 kvm_vcpu_kick(found); 71 kvm_vcpu_kick(found);
71 } 72 }
72} 73}
@@ -308,13 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
308 addr &= 1; 309 addr &= 1;
309 if (addr == 0) { 310 if (addr == 0) {
310 if (val & 0x10) { 311 if (val & 0x10) {
311 kvm_pic_reset(s); /* init */
312 /*
313 * deassert a pending interrupt
314 */
315 pic_irq_request(s->pics_state->kvm, 0);
316 s->init_state = 1;
317 s->init4 = val & 1; 312 s->init4 = val & 1;
313 s->last_irr = 0;
314 s->imr = 0;
315 s->priority_add = 0;
316 s->special_mask = 0;
317 s->read_reg_select = 0;
318 if (!s->init4) {
319 s->special_fully_nested_mode = 0;
320 s->auto_eoi = 0;
321 }
322 s->init_state = 1;
318 if (val & 0x02) 323 if (val & 0x02)
319 printk(KERN_ERR "single mode not supported"); 324 printk(KERN_ERR "single mode not supported");
320 if (val & 0x08) 325 if (val & 0x08)
@@ -564,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
564 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 569 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
565 if (!s) 570 if (!s)
566 return NULL; 571 return NULL;
567 raw_spin_lock_init(&s->lock); 572 spin_lock_init(&s->lock);
568 s->kvm = kvm; 573 s->kvm = kvm;
569 s->pics[0].elcr_mask = 0xf8; 574 s->pics[0].elcr_mask = 0xf8;
570 s->pics[1].elcr_mask = 0xde; 575 s->pics[1].elcr_mask = 0xde;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 2095a049835e..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affilates. 4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -33,12 +33,7 @@
33 */ 33 */
34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
35{ 35{
36 int ret; 36 return apic_has_pending_timer(vcpu);
37
38 ret = pit_has_pending_timer(vcpu);
39 ret |= apic_has_pending_timer(vcpu);
40
41 return ret;
42} 37}
43EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
44 39
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 63c314502993..ba910d149410 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -60,7 +60,7 @@ struct kvm_kpic_state {
60}; 60};
61 61
62struct kvm_pic { 62struct kvm_pic {
63 raw_spinlock_t lock; 63 spinlock_t lock;
64 bool wakeup_needed; 64 bool wakeup_needed;
65 unsigned pending_acks; 65 unsigned pending_acks;
66 struct kvm *kvm; 66 struct kvm *kvm;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 6491ac8e755b..975bb45329a1 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
42 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
44 44
45 return vcpu->arch.pdptrs[index]; 45 return vcpu->arch.walk_mmu->pdptrs[index];
46}
47
48static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
49{
50 load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
51
52 return mmu->pdptrs[index];
46} 53}
47 54
48static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 55static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 22b06f7660f4..413f8973a855 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
259 259
260static void apic_update_ppr(struct kvm_lapic *apic) 260static void apic_update_ppr(struct kvm_lapic *apic)
261{ 261{
262 u32 tpr, isrv, ppr; 262 u32 tpr, isrv, ppr, old_ppr;
263 int isr; 263 int isr;
264 264
265 old_ppr = apic_get_reg(apic, APIC_PROCPRI);
265 tpr = apic_get_reg(apic, APIC_TASKPRI); 266 tpr = apic_get_reg(apic, APIC_TASKPRI);
266 isr = apic_find_highest_isr(apic); 267 isr = apic_find_highest_isr(apic);
267 isrv = (isr != -1) ? isr : 0; 268 isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic)
274 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", 275 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
275 apic, ppr, isr, isrv); 276 apic, ppr, isr, isrv);
276 277
277 apic_set_reg(apic, APIC_PROCPRI, ppr); 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 }
278} 282}
279 283
280static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) 284static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
391 break; 395 break;
392 } 396 }
393 397
398 kvm_make_request(KVM_REQ_EVENT, vcpu);
394 kvm_vcpu_kick(vcpu); 399 kvm_vcpu_kick(vcpu);
395 break; 400 break;
396 401
@@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
416 "INIT on a runnable vcpu %d\n", 421 "INIT on a runnable vcpu %d\n",
417 vcpu->vcpu_id); 422 vcpu->vcpu_id);
418 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 423 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
424 kvm_make_request(KVM_REQ_EVENT, vcpu);
419 kvm_vcpu_kick(vcpu); 425 kvm_vcpu_kick(vcpu);
420 } else { 426 } else {
421 apic_debug("Ignoring de-assert INIT to vcpu %d\n", 427 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
430 result = 1; 436 result = 1;
431 vcpu->arch.sipi_vector = vector; 437 vcpu->arch.sipi_vector = vector;
432 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 438 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
439 kvm_make_request(KVM_REQ_EVENT, vcpu);
433 kvm_vcpu_kick(vcpu); 440 kvm_vcpu_kick(vcpu);
434 } 441 }
435 break; 442 break;
@@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
475 trigger_mode = IOAPIC_EDGE_TRIG; 482 trigger_mode = IOAPIC_EDGE_TRIG;
476 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 483 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
477 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 484 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
485 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
478} 486}
479 487
480static void apic_send_ipi(struct kvm_lapic *apic) 488static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1151,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1151 update_divide_count(apic); 1159 update_divide_count(apic);
1152 start_apic_timer(apic); 1160 start_apic_timer(apic);
1153 apic->irr_pending = true; 1161 apic->irr_pending = true;
1162 kvm_make_request(KVM_REQ_EVENT, vcpu);
1154} 1163}
1155 1164
1156void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1165void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..908ea5464a51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -49,15 +49,25 @@
49 */ 49 */
50bool tdp_enabled = false; 50bool tdp_enabled = false;
51 51
52#undef MMU_DEBUG 52enum {
53 AUDIT_PRE_PAGE_FAULT,
54 AUDIT_POST_PAGE_FAULT,
55 AUDIT_PRE_PTE_WRITE,
56 AUDIT_POST_PTE_WRITE,
57 AUDIT_PRE_SYNC,
58 AUDIT_POST_SYNC
59};
53 60
54#undef AUDIT 61char *audit_point_name[] = {
62 "pre page fault",
63 "post page fault",
64 "pre pte write",
65 "post pte write",
66 "pre sync",
67 "post sync"
68};
55 69
56#ifdef AUDIT 70#undef MMU_DEBUG
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61 71
62#ifdef MMU_DEBUG 72#ifdef MMU_DEBUG
63 73
@@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
71 81
72#endif 82#endif
73 83
74#if defined(MMU_DEBUG) || defined(AUDIT) 84#ifdef MMU_DEBUG
75static int dbg = 0; 85static int dbg = 0;
76module_param(dbg, bool, 0644); 86module_param(dbg, bool, 0644);
77#endif 87#endif
@@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644);
89 } 99 }
90#endif 100#endif
91 101
102#define PTE_PREFETCH_NUM 8
103
92#define PT_FIRST_AVAIL_BITS_SHIFT 9 104#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 105#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94 106
@@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
178static struct kmem_cache *pte_chain_cache; 190static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 191static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache; 192static struct kmem_cache *mmu_page_header_cache;
193static struct percpu_counter kvm_total_used_mmu_pages;
181 194
182static u64 __read_mostly shadow_trap_nonpresent_pte; 195static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte; 196static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
299#endif 312#endif
300} 313}
301 314
315static bool spte_has_volatile_bits(u64 spte)
316{
317 if (!shadow_accessed_mask)
318 return false;
319
320 if (!is_shadow_present_pte(spte))
321 return false;
322
323 if ((spte & shadow_accessed_mask) &&
324 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
325 return false;
326
327 return true;
328}
329
330static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
331{
332 return (old_spte & bit_mask) && !(new_spte & bit_mask);
333}
334
302static void update_spte(u64 *sptep, u64 new_spte) 335static void update_spte(u64 *sptep, u64 new_spte)
303{ 336{
304 u64 old_spte; 337 u64 mask, old_spte = *sptep;
338
339 WARN_ON(!is_rmap_spte(new_spte));
340
341 new_spte |= old_spte & shadow_dirty_mask;
305 342
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 343 mask = shadow_accessed_mask;
307 !is_rmap_spte(*sptep)) 344 if (is_writable_pte(old_spte))
345 mask |= shadow_dirty_mask;
346
347 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
308 __set_spte(sptep, new_spte); 348 __set_spte(sptep, new_spte);
309 else { 349 else
310 old_spte = __xchg_spte(sptep, new_spte); 350 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask) 351
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 352 if (!shadow_accessed_mask)
313 } 353 return;
354
355 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
356 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
357 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
358 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
314} 359}
315 360
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 361static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
367 if (r) 412 if (r)
368 goto out; 413 goto out;
369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 414 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370 rmap_desc_cache, 4); 415 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
371 if (r) 416 if (r)
372 goto out; 417 goto out;
373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 418 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
591 desc->sptes[0] = (u64 *)*rmapp; 636 desc->sptes[0] = (u64 *)*rmapp;
592 desc->sptes[1] = spte; 637 desc->sptes[1] = spte;
593 *rmapp = (unsigned long)desc | 1; 638 *rmapp = (unsigned long)desc | 1;
639 ++count;
594 } else { 640 } else {
595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 641 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 642 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
603 desc = desc->more; 649 desc = desc->more;
604 } 650 }
605 for (i = 0; desc->sptes[i]; ++i) 651 for (i = 0; desc->sptes[i]; ++i)
606 ; 652 ++count;
607 desc->sptes[i] = spte; 653 desc->sptes[i] = spte;
608 } 654 }
609 return count; 655 return count;
@@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 691 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 692 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647 if (!*rmapp) { 693 if (!*rmapp) {
648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 694 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
649 BUG(); 695 BUG();
650 } else if (!(*rmapp & 1)) { 696 } else if (!(*rmapp & 1)) {
651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 697 rmap_printk("rmap_remove: %p 1->0\n", spte);
652 if ((u64 *)*rmapp != spte) { 698 if ((u64 *)*rmapp != spte) {
653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 699 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
654 spte, *spte);
655 BUG(); 700 BUG();
656 } 701 }
657 *rmapp = 0; 702 *rmapp = 0;
658 } else { 703 } else {
659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 704 rmap_printk("rmap_remove: %p many->many\n", spte);
660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 705 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 prev_desc = NULL; 706 prev_desc = NULL;
662 while (desc) { 707 while (desc) {
@@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
670 prev_desc = desc; 715 prev_desc = desc;
671 desc = desc->more; 716 desc = desc->more;
672 } 717 }
673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 718 pr_err("rmap_remove: %p many->many\n", spte);
674 BUG(); 719 BUG();
675 } 720 }
676} 721}
@@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
680 pfn_t pfn; 725 pfn_t pfn;
681 u64 old_spte = *sptep; 726 u64 old_spte = *sptep;
682 727
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 728 if (!spte_has_volatile_bits(old_spte))
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte); 729 __set_spte(sptep, new_spte);
686 } else 730 else
687 old_spte = __xchg_spte(sptep, new_spte); 731 old_spte = __xchg_spte(sptep, new_spte);
688 732
689 if (!is_rmap_spte(old_spte)) 733 if (!is_rmap_spte(old_spte))
690 return; 734 return;
735
691 pfn = spte_to_pfn(old_spte); 736 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 737 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn); 738 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte)) 739 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
695 kvm_set_pfn_dirty(pfn); 740 kvm_set_pfn_dirty(pfn);
696} 741}
697 742
@@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
746 } 791 }
747 spte = rmap_next(kvm, rmapp, spte); 792 spte = rmap_next(kvm, rmapp, spte);
748 } 793 }
749 if (write_protected) {
750 pfn_t pfn;
751
752 spte = rmap_next(kvm, rmapp, NULL);
753 pfn = spte_to_pfn(*spte);
754 kvm_set_pfn_dirty(pfn);
755 }
756 794
757 /* check for huge page mappings */ 795 /* check for huge page mappings */
758 for (i = PT_DIRECTORY_LEVEL; 796 for (i = PT_DIRECTORY_LEVEL;
@@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt)
947} 985}
948#endif 986#endif
949 987
988/*
989 * This value is the sum of all of the kvm instances's
990 * kvm->arch.n_used_mmu_pages values. We need a global,
991 * aggregate version in order to make the slab shrinker
992 * faster
993 */
994static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
995{
996 kvm->arch.n_used_mmu_pages += nr;
997 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
998}
999
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1000static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{ 1001{
952 ASSERT(is_empty_shadow_page(sp->spt)); 1002 ASSERT(is_empty_shadow_page(sp->spt));
@@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
956 if (!sp->role.direct) 1006 if (!sp->role.direct)
957 __free_page(virt_to_page(sp->gfns)); 1007 __free_page(virt_to_page(sp->gfns));
958 kmem_cache_free(mmu_page_header_cache, sp); 1008 kmem_cache_free(mmu_page_header_cache, sp);
959 ++kvm->arch.n_free_mmu_pages; 1009 kvm_mod_used_mmu_pages(kvm, -1);
960} 1010}
961 1011
962static unsigned kvm_page_table_hashfn(gfn_t gfn) 1012static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1029 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 sp->multimapped = 0; 1030 sp->multimapped = 0;
981 sp->parent_pte = parent_pte; 1031 sp->parent_pte = parent_pte;
982 --vcpu->kvm->arch.n_free_mmu_pages; 1032 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
983 return sp; 1033 return sp;
984} 1034}
985 1035
@@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1403 if (role.direct) 1453 if (role.direct)
1404 role.cr4_pae = 0; 1454 role.cr4_pae = 0;
1405 role.access = access; 1455 role.access = access;
1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1456 if (!vcpu->arch.mmu.direct_map
1457 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1458 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1459 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409 role.quadrant = quadrant; 1460 role.quadrant = quadrant;
@@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1458 iterator->addr = addr; 1509 iterator->addr = addr;
1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1510 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1511 iterator->level = vcpu->arch.mmu.shadow_root_level;
1512
1513 if (iterator->level == PT64_ROOT_LEVEL &&
1514 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1515 !vcpu->arch.mmu.direct_map)
1516 --iterator->level;
1517
1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1518 if (iterator->level == PT32E_ROOT_LEVEL) {
1462 iterator->shadow_addr 1519 iterator->shadow_addr
1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1520 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1665 1722
1666/* 1723/*
1667 * Changing the number of mmu pages allocated to the vm 1724 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1725 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1669 */ 1726 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1727void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1671{ 1728{
1672 int used_pages;
1673 LIST_HEAD(invalid_list); 1729 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /* 1730 /*
1679 * If we set the number of mmu pages to be smaller be than the 1731 * If we set the number of mmu pages to be smaller be than the
1680 * number of actived pages , we must to free some mmu pages before we 1732 * number of actived pages , we must to free some mmu pages before we
1681 * change the value 1733 * change the value
1682 */ 1734 */
1683 1735
1684 if (used_pages > kvm_nr_mmu_pages) { 1736 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages && 1737 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) { 1738 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page; 1739 struct kvm_mmu_page *page;
1688 1740
1689 page = container_of(kvm->arch.active_mmu_pages.prev, 1741 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link); 1742 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1743 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1692 &invalid_list); 1744 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1693 } 1745 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1746 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 } 1747 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701 1748
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1749 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1703} 1750}
1704 1751
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1752static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1709 LIST_HEAD(invalid_list); 1756 LIST_HEAD(invalid_list);
1710 int r; 1757 int r;
1711 1758
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1759 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1713 r = 0; 1760 r = 0;
1714 1761
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1762 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1763 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1717 sp->role.word); 1764 sp->role.word);
1718 r = 1; 1765 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1766 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1729 LIST_HEAD(invalid_list); 1776 LIST_HEAD(invalid_list);
1730 1777
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1778 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n", 1779 pgprintk("%s: zap %llx %x\n",
1733 __func__, gfn, sp->role.word); 1780 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1781 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 } 1782 }
@@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1925 * whether the guest actually used the pte (in order to detect 1972 * whether the guest actually used the pte (in order to detect
1926 * demand paging). 1973 * demand paging).
1927 */ 1974 */
1928 spte = shadow_base_present_pte | shadow_dirty_mask; 1975 spte = shadow_base_present_pte;
1929 if (!speculative) 1976 if (!speculative)
1930 spte |= shadow_accessed_mask; 1977 spte |= shadow_accessed_mask;
1931 if (!dirty) 1978 if (!dirty)
@@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1948 spte |= (u64)pfn << PAGE_SHIFT; 1995 spte |= (u64)pfn << PAGE_SHIFT;
1949 1996
1950 if ((pte_access & ACC_WRITE_MASK) 1997 if ((pte_access & ACC_WRITE_MASK)
1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 1998 || (!vcpu->arch.mmu.direct_map && write_fault
1952 && !user_fault)) { 1999 && !is_write_protection(vcpu) && !user_fault)) {
1953 2000
1954 if (level > PT_PAGE_TABLE_LEVEL && 2001 if (level > PT_PAGE_TABLE_LEVEL &&
1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2002 has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1960 2007
1961 spte |= PT_WRITABLE_MASK; 2008 spte |= PT_WRITABLE_MASK;
1962 2009
1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 2010 if (!vcpu->arch.mmu.direct_map
2011 && !(pte_access & ACC_WRITE_MASK))
1964 spte &= ~PT_USER_MASK; 2012 spte &= ~PT_USER_MASK;
1965 2013
1966 /* 2014 /*
@@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 goto set_pte; 2021 goto set_pte;
1974 2022
1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2023 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 2024 pgprintk("%s: found shadow page for %llx, marking ro\n",
1977 __func__, gfn); 2025 __func__, gfn);
1978 ret = 1; 2026 ret = 1;
1979 pte_access &= ~ACC_WRITE_MASK; 2027 pte_access &= ~ACC_WRITE_MASK;
@@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1986 mark_page_dirty(vcpu->kvm, gfn); 2034 mark_page_dirty(vcpu->kvm, gfn);
1987 2035
1988set_pte: 2036set_pte:
1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte); 2037 update_spte(sptep, spte);
1992done: 2038done:
1993 return ret; 2039 return ret;
@@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 int rmap_count; 2050 int rmap_count;
2005 2051
2006 pgprintk("%s: spte %llx access %x write_fault %d" 2052 pgprintk("%s: spte %llx access %x write_fault %d"
2007 " user_fault %d gfn %lx\n", 2053 " user_fault %d gfn %llx\n",
2008 __func__, *sptep, pt_access, 2054 __func__, *sptep, pt_access,
2009 write_fault, user_fault, gfn); 2055 write_fault, user_fault, gfn);
2010 2056
@@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2069 __set_spte(sptep, shadow_trap_nonpresent_pte);
2024 kvm_flush_remote_tlbs(vcpu->kvm); 2070 kvm_flush_remote_tlbs(vcpu->kvm);
2025 } else if (pfn != spte_to_pfn(*sptep)) { 2071 } else if (pfn != spte_to_pfn(*sptep)) {
2026 pgprintk("hfn old %lx new %lx\n", 2072 pgprintk("hfn old %llx new %llx\n",
2027 spte_to_pfn(*sptep), pfn); 2073 spte_to_pfn(*sptep), pfn);
2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2074 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm); 2075 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2040 } 2086 }
2041 2087
2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2088 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2089 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2044 is_large_pte(*sptep)? "2MB" : "4kB", 2090 is_large_pte(*sptep)? "2MB" : "4kB",
2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2091 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2046 *sptep, sptep); 2092 *sptep, sptep);
@@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{ 2110{
2065} 2111}
2066 2112
2113static struct kvm_memory_slot *
2114pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2115{
2116 struct kvm_memory_slot *slot;
2117
2118 slot = gfn_to_memslot(vcpu->kvm, gfn);
2119 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2120 (no_dirty_log && slot->dirty_bitmap))
2121 slot = NULL;
2122
2123 return slot;
2124}
2125
2126static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2127 bool no_dirty_log)
2128{
2129 struct kvm_memory_slot *slot;
2130 unsigned long hva;
2131
2132 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
2133 if (!slot) {
2134 get_page(bad_page);
2135 return page_to_pfn(bad_page);
2136 }
2137
2138 hva = gfn_to_hva_memslot(slot, gfn);
2139
2140 return hva_to_pfn_atomic(vcpu->kvm, hva);
2141}
2142
2143static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2144 struct kvm_mmu_page *sp,
2145 u64 *start, u64 *end)
2146{
2147 struct page *pages[PTE_PREFETCH_NUM];
2148 unsigned access = sp->role.access;
2149 int i, ret;
2150 gfn_t gfn;
2151
2152 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2153 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
2154 return -1;
2155
2156 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2157 if (ret <= 0)
2158 return -1;
2159
2160 for (i = 0; i < ret; i++, gfn++, start++)
2161 mmu_set_spte(vcpu, start, ACC_ALL,
2162 access, 0, 0, 1, NULL,
2163 sp->role.level, gfn,
2164 page_to_pfn(pages[i]), true, true);
2165
2166 return 0;
2167}
2168
2169static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2170 struct kvm_mmu_page *sp, u64 *sptep)
2171{
2172 u64 *spte, *start = NULL;
2173 int i;
2174
2175 WARN_ON(!sp->role.direct);
2176
2177 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2178 spte = sp->spt + i;
2179
2180 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2181 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2182 if (!start)
2183 continue;
2184 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2185 break;
2186 start = NULL;
2187 } else if (!start)
2188 start = spte;
2189 }
2190}
2191
2192static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2193{
2194 struct kvm_mmu_page *sp;
2195
2196 /*
2197 * Since it's no accessed bit on EPT, it's no way to
2198 * distinguish between actually accessed translations
2199 * and prefetched, so disable pte prefetch if EPT is
2200 * enabled.
2201 */
2202 if (!shadow_accessed_mask)
2203 return;
2204
2205 sp = page_header(__pa(sptep));
2206 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2207 return;
2208
2209 __direct_pte_prefetch(vcpu, sp, sptep);
2210}
2211
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2212static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068 int level, gfn_t gfn, pfn_t pfn) 2213 int level, gfn_t gfn, pfn_t pfn)
2069{ 2214{
@@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2222 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2078 0, write, 1, &pt_write, 2223 0, write, 1, &pt_write,
2079 level, gfn, pfn, false, true); 2224 level, gfn, pfn, false, true);
2225 direct_pte_prefetch(vcpu, iterator.sptep);
2080 ++vcpu->stat.pf_fixed; 2226 ++vcpu->stat.pf_fixed;
2081 break; 2227 break;
2082 } 2228 }
@@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2098 __set_spte(iterator.sptep, 2244 __set_spte(iterator.sptep,
2099 __pa(sp->spt) 2245 __pa(sp->spt)
2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2246 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101 | shadow_user_mask | shadow_x_mask); 2247 | shadow_user_mask | shadow_x_mask
2248 | shadow_accessed_mask);
2102 } 2249 }
2103 } 2250 }
2104 return pt_write; 2251 return pt_write;
2105} 2252}
2106 2253
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2254static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2108{ 2255{
2109 char buf[1]; 2256 siginfo_t info;
2110 void __user *hva; 2257
2111 int r; 2258 info.si_signo = SIGBUS;
2259 info.si_errno = 0;
2260 info.si_code = BUS_MCEERR_AR;
2261 info.si_addr = (void __user *)address;
2262 info.si_addr_lsb = PAGE_SHIFT;
2112 2263
2113 /* Touch the page, so send SIGBUS */ 2264 send_sig_info(SIGBUS, &info, tsk);
2114 hva = (void __user *)gfn_to_hva(kvm, gfn);
2115 r = copy_from_user(buf, hva, 1);
2116} 2265}
2117 2266
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2267static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{ 2268{
2120 kvm_release_pfn_clean(pfn); 2269 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) { 2270 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn); 2271 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2123 return 0; 2272 return 0;
2124 } else if (is_fault_pfn(pfn)) 2273 } else if (is_fault_pfn(pfn))
2125 return -EFAULT; 2274 return -EFAULT;
@@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2328 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180 return; 2329 return;
2181 spin_lock(&vcpu->kvm->mmu_lock); 2330 spin_lock(&vcpu->kvm->mmu_lock);
2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2331 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2332 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2333 vcpu->arch.mmu.direct_map)) {
2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2334 hpa_t root = vcpu->arch.mmu.root_hpa;
2184 2335
2185 sp = page_header(root); 2336 sp = page_header(root);
@@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2222 return ret; 2373 return ret;
2223} 2374}
2224 2375
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2376static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2226{ 2377{
2227 int i;
2228 gfn_t root_gfn;
2229 struct kvm_mmu_page *sp; 2378 struct kvm_mmu_page *sp;
2230 int direct = 0; 2379 unsigned i;
2231 u64 pdptr;
2232
2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234 2380
2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2381 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2382 spin_lock(&vcpu->kvm->mmu_lock);
2383 kvm_mmu_free_some_pages(vcpu);
2384 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2385 1, ACC_ALL, NULL);
2386 ++sp->root_count;
2387 spin_unlock(&vcpu->kvm->mmu_lock);
2388 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2389 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2390 for (i = 0; i < 4; ++i) {
2391 hpa_t root = vcpu->arch.mmu.pae_root[i];
2392
2393 ASSERT(!VALID_PAGE(root));
2394 spin_lock(&vcpu->kvm->mmu_lock);
2395 kvm_mmu_free_some_pages(vcpu);
2396 sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
2397 PT32_ROOT_LEVEL, 1, ACC_ALL,
2398 NULL);
2399 root = __pa(sp->spt);
2400 ++sp->root_count;
2401 spin_unlock(&vcpu->kvm->mmu_lock);
2402 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2403 }
2404 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2405 } else
2406 BUG();
2407
2408 return 0;
2409}
2410
2411static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2412{
2413 struct kvm_mmu_page *sp;
2414 u64 pdptr, pm_mask;
2415 gfn_t root_gfn;
2416 int i;
2417
2418 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2419
2420 if (mmu_check_root(vcpu, root_gfn))
2421 return 1;
2422
2423 /*
2424 * Do we shadow a long mode page table? If so we need to
2425 * write-protect the guests page table root.
2426 */
2427 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2428 hpa_t root = vcpu->arch.mmu.root_hpa;
2237 2429
2238 ASSERT(!VALID_PAGE(root)); 2430 ASSERT(!VALID_PAGE(root));
2239 if (mmu_check_root(vcpu, root_gfn)) 2431
2240 return 1;
2241 if (tdp_enabled) {
2242 direct = 1;
2243 root_gfn = 0;
2244 }
2245 spin_lock(&vcpu->kvm->mmu_lock); 2432 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu); 2433 kvm_mmu_free_some_pages(vcpu);
2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2434 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2248 PT64_ROOT_LEVEL, direct, 2435 0, ACC_ALL, NULL);
2249 ACC_ALL, NULL);
2250 root = __pa(sp->spt); 2436 root = __pa(sp->spt);
2251 ++sp->root_count; 2437 ++sp->root_count;
2252 spin_unlock(&vcpu->kvm->mmu_lock); 2438 spin_unlock(&vcpu->kvm->mmu_lock);
2253 vcpu->arch.mmu.root_hpa = root; 2439 vcpu->arch.mmu.root_hpa = root;
2254 return 0; 2440 return 0;
2255 } 2441 }
2256 direct = !is_paging(vcpu); 2442
2443 /*
2444 * We shadow a 32 bit page table. This may be a legacy 2-level
2445 * or a PAE 3-level page table. In either case we need to be aware that
2446 * the shadow page table may be a PAE or a long mode page table.
2447 */
2448 pm_mask = PT_PRESENT_MASK;
2449 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2450 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2451
2257 for (i = 0; i < 4; ++i) { 2452 for (i = 0; i < 4; ++i) {
2258 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2453 hpa_t root = vcpu->arch.mmu.pae_root[i];
2259 2454
2260 ASSERT(!VALID_PAGE(root)); 2455 ASSERT(!VALID_PAGE(root));
2261 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2456 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2262 pdptr = kvm_pdptr_read(vcpu, i); 2457 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2263 if (!is_present_gpte(pdptr)) { 2458 if (!is_present_gpte(pdptr)) {
2264 vcpu->arch.mmu.pae_root[i] = 0; 2459 vcpu->arch.mmu.pae_root[i] = 0;
2265 continue; 2460 continue;
2266 } 2461 }
2267 root_gfn = pdptr >> PAGE_SHIFT; 2462 root_gfn = pdptr >> PAGE_SHIFT;
2268 } else if (vcpu->arch.mmu.root_level == 0) 2463 if (mmu_check_root(vcpu, root_gfn))
2269 root_gfn = 0; 2464 return 1;
2270 if (mmu_check_root(vcpu, root_gfn))
2271 return 1;
2272 if (tdp_enabled) {
2273 direct = 1;
2274 root_gfn = i << 30;
2275 } 2465 }
2276 spin_lock(&vcpu->kvm->mmu_lock); 2466 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu); 2467 kvm_mmu_free_some_pages(vcpu);
2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2468 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2279 PT32_ROOT_LEVEL, direct, 2469 PT32_ROOT_LEVEL, 0,
2280 ACC_ALL, NULL); 2470 ACC_ALL, NULL);
2281 root = __pa(sp->spt); 2471 root = __pa(sp->spt);
2282 ++sp->root_count; 2472 ++sp->root_count;
2283 spin_unlock(&vcpu->kvm->mmu_lock); 2473 spin_unlock(&vcpu->kvm->mmu_lock);
2284 2474
2285 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2475 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2286 } 2476 }
2287 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2477 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2478
2479 /*
2480 * If we shadow a 32 bit page table with a long mode page
2481 * table we enter this path.
2482 */
2483 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2484 if (vcpu->arch.mmu.lm_root == NULL) {
2485 /*
2486 * The additional page necessary for this is only
2487 * allocated on demand.
2488 */
2489
2490 u64 *lm_root;
2491
2492 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2493 if (lm_root == NULL)
2494 return 1;
2495
2496 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2497
2498 vcpu->arch.mmu.lm_root = lm_root;
2499 }
2500
2501 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2502 }
2503
2288 return 0; 2504 return 0;
2289} 2505}
2290 2506
2507static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2508{
2509 if (vcpu->arch.mmu.direct_map)
2510 return mmu_alloc_direct_roots(vcpu);
2511 else
2512 return mmu_alloc_shadow_roots(vcpu);
2513}
2514
2291static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2515static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2292{ 2516{
2293 int i; 2517 int i;
2294 struct kvm_mmu_page *sp; 2518 struct kvm_mmu_page *sp;
2295 2519
2520 if (vcpu->arch.mmu.direct_map)
2521 return;
2522
2296 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2523 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2297 return; 2524 return;
2298 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2525
2526 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2527 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2299 hpa_t root = vcpu->arch.mmu.root_hpa; 2528 hpa_t root = vcpu->arch.mmu.root_hpa;
2300 sp = page_header(root); 2529 sp = page_header(root);
2301 mmu_sync_children(vcpu, sp); 2530 mmu_sync_children(vcpu, sp);
@@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2310 mmu_sync_children(vcpu, sp); 2539 mmu_sync_children(vcpu, sp);
2311 } 2540 }
2312 } 2541 }
2542 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2313} 2543}
2314 2544
2315void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2545void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2327 return vaddr; 2557 return vaddr;
2328} 2558}
2329 2559
2560static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2561 u32 access, u32 *error)
2562{
2563 if (error)
2564 *error = 0;
2565 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2566}
2567
2330static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2568static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2331 u32 error_code) 2569 u32 error_code)
2332{ 2570{
@@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
2393 mmu_free_roots(vcpu); 2631 mmu_free_roots(vcpu);
2394} 2632}
2395 2633
2396static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2634static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2635 struct kvm_mmu *context)
2397{ 2636{
2398 struct kvm_mmu *context = &vcpu->arch.mmu;
2399
2400 context->new_cr3 = nonpaging_new_cr3; 2637 context->new_cr3 = nonpaging_new_cr3;
2401 context->page_fault = nonpaging_page_fault; 2638 context->page_fault = nonpaging_page_fault;
2402 context->gva_to_gpa = nonpaging_gva_to_gpa; 2639 context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2407 context->root_level = 0; 2644 context->root_level = 0;
2408 context->shadow_root_level = PT32E_ROOT_LEVEL; 2645 context->shadow_root_level = PT32E_ROOT_LEVEL;
2409 context->root_hpa = INVALID_PAGE; 2646 context->root_hpa = INVALID_PAGE;
2647 context->direct_map = true;
2648 context->nx = false;
2410 return 0; 2649 return 0;
2411} 2650}
2412 2651
@@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
2422 mmu_free_roots(vcpu); 2661 mmu_free_roots(vcpu);
2423} 2662}
2424 2663
2425static void inject_page_fault(struct kvm_vcpu *vcpu, 2664static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2426 u64 addr, 2665{
2427 u32 err_code) 2666 return vcpu->arch.cr3;
2667}
2668
2669static void inject_page_fault(struct kvm_vcpu *vcpu)
2428{ 2670{
2429 kvm_inject_page_fault(vcpu, addr, err_code); 2671 vcpu->arch.mmu.inject_page_fault(vcpu);
2430} 2672}
2431 2673
2432static void paging_free(struct kvm_vcpu *vcpu) 2674static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
2434 nonpaging_free(vcpu); 2676 nonpaging_free(vcpu);
2435} 2677}
2436 2678
2437static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2679static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2438{ 2680{
2439 int bit7; 2681 int bit7;
2440 2682
2441 bit7 = (gpte >> 7) & 1; 2683 bit7 = (gpte >> 7) & 1;
2442 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2684 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2443} 2685}
2444 2686
2445#define PTTYPE 64 2687#define PTTYPE 64
@@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2450#include "paging_tmpl.h" 2692#include "paging_tmpl.h"
2451#undef PTTYPE 2693#undef PTTYPE
2452 2694
2453static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2695static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2696 struct kvm_mmu *context,
2697 int level)
2454{ 2698{
2455 struct kvm_mmu *context = &vcpu->arch.mmu;
2456 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2699 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2457 u64 exb_bit_rsvd = 0; 2700 u64 exb_bit_rsvd = 0;
2458 2701
2459 if (!is_nx(vcpu)) 2702 if (!context->nx)
2460 exb_bit_rsvd = rsvd_bits(63, 63); 2703 exb_bit_rsvd = rsvd_bits(63, 63);
2461 switch (level) { 2704 switch (level) {
2462 case PT32_ROOT_LEVEL: 2705 case PT32_ROOT_LEVEL:
@@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2511 } 2754 }
2512} 2755}
2513 2756
2514static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2757static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2758 struct kvm_mmu *context,
2759 int level)
2515{ 2760{
2516 struct kvm_mmu *context = &vcpu->arch.mmu; 2761 context->nx = is_nx(vcpu);
2762
2763 reset_rsvds_bits_mask(vcpu, context, level);
2517 2764
2518 ASSERT(is_pae(vcpu)); 2765 ASSERT(is_pae(vcpu));
2519 context->new_cr3 = paging_new_cr3; 2766 context->new_cr3 = paging_new_cr3;
@@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2526 context->root_level = level; 2773 context->root_level = level;
2527 context->shadow_root_level = level; 2774 context->shadow_root_level = level;
2528 context->root_hpa = INVALID_PAGE; 2775 context->root_hpa = INVALID_PAGE;
2776 context->direct_map = false;
2529 return 0; 2777 return 0;
2530} 2778}
2531 2779
2532static int paging64_init_context(struct kvm_vcpu *vcpu) 2780static int paging64_init_context(struct kvm_vcpu *vcpu,
2781 struct kvm_mmu *context)
2533{ 2782{
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2783 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2535 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2536} 2784}
2537 2785
2538static int paging32_init_context(struct kvm_vcpu *vcpu) 2786static int paging32_init_context(struct kvm_vcpu *vcpu,
2787 struct kvm_mmu *context)
2539{ 2788{
2540 struct kvm_mmu *context = &vcpu->arch.mmu; 2789 context->nx = false;
2790
2791 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2541 2792
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->new_cr3 = paging_new_cr3; 2793 context->new_cr3 = paging_new_cr3;
2544 context->page_fault = paging32_page_fault; 2794 context->page_fault = paging32_page_fault;
2545 context->gva_to_gpa = paging32_gva_to_gpa; 2795 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2550 context->root_level = PT32_ROOT_LEVEL; 2800 context->root_level = PT32_ROOT_LEVEL;
2551 context->shadow_root_level = PT32E_ROOT_LEVEL; 2801 context->shadow_root_level = PT32E_ROOT_LEVEL;
2552 context->root_hpa = INVALID_PAGE; 2802 context->root_hpa = INVALID_PAGE;
2803 context->direct_map = false;
2553 return 0; 2804 return 0;
2554} 2805}
2555 2806
2556static int paging32E_init_context(struct kvm_vcpu *vcpu) 2807static int paging32E_init_context(struct kvm_vcpu *vcpu,
2808 struct kvm_mmu *context)
2557{ 2809{
2558 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2810 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2559 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2560} 2811}
2561 2812
2562static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2813static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2563{ 2814{
2564 struct kvm_mmu *context = &vcpu->arch.mmu; 2815 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2565 2816
2566 context->new_cr3 = nonpaging_new_cr3; 2817 context->new_cr3 = nonpaging_new_cr3;
2567 context->page_fault = tdp_page_fault; 2818 context->page_fault = tdp_page_fault;
@@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2571 context->invlpg = nonpaging_invlpg; 2822 context->invlpg = nonpaging_invlpg;
2572 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2823 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2573 context->root_hpa = INVALID_PAGE; 2824 context->root_hpa = INVALID_PAGE;
2825 context->direct_map = true;
2826 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2827 context->get_cr3 = get_cr3;
2828 context->inject_page_fault = kvm_inject_page_fault;
2829 context->nx = is_nx(vcpu);
2574 2830
2575 if (!is_paging(vcpu)) { 2831 if (!is_paging(vcpu)) {
2832 context->nx = false;
2576 context->gva_to_gpa = nonpaging_gva_to_gpa; 2833 context->gva_to_gpa = nonpaging_gva_to_gpa;
2577 context->root_level = 0; 2834 context->root_level = 0;
2578 } else if (is_long_mode(vcpu)) { 2835 } else if (is_long_mode(vcpu)) {
2579 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2836 context->nx = is_nx(vcpu);
2837 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2580 context->gva_to_gpa = paging64_gva_to_gpa; 2838 context->gva_to_gpa = paging64_gva_to_gpa;
2581 context->root_level = PT64_ROOT_LEVEL; 2839 context->root_level = PT64_ROOT_LEVEL;
2582 } else if (is_pae(vcpu)) { 2840 } else if (is_pae(vcpu)) {
2583 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2841 context->nx = is_nx(vcpu);
2842 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2584 context->gva_to_gpa = paging64_gva_to_gpa; 2843 context->gva_to_gpa = paging64_gva_to_gpa;
2585 context->root_level = PT32E_ROOT_LEVEL; 2844 context->root_level = PT32E_ROOT_LEVEL;
2586 } else { 2845 } else {
2587 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2846 context->nx = false;
2847 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2588 context->gva_to_gpa = paging32_gva_to_gpa; 2848 context->gva_to_gpa = paging32_gva_to_gpa;
2589 context->root_level = PT32_ROOT_LEVEL; 2849 context->root_level = PT32_ROOT_LEVEL;
2590 } 2850 }
@@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2592 return 0; 2852 return 0;
2593} 2853}
2594 2854
2595static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 2855int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2596{ 2856{
2597 int r; 2857 int r;
2598
2599 ASSERT(vcpu); 2858 ASSERT(vcpu);
2600 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2859 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2601 2860
2602 if (!is_paging(vcpu)) 2861 if (!is_paging(vcpu))
2603 r = nonpaging_init_context(vcpu); 2862 r = nonpaging_init_context(vcpu, context);
2604 else if (is_long_mode(vcpu)) 2863 else if (is_long_mode(vcpu))
2605 r = paging64_init_context(vcpu); 2864 r = paging64_init_context(vcpu, context);
2606 else if (is_pae(vcpu)) 2865 else if (is_pae(vcpu))
2607 r = paging32E_init_context(vcpu); 2866 r = paging32E_init_context(vcpu, context);
2608 else 2867 else
2609 r = paging32_init_context(vcpu); 2868 r = paging32_init_context(vcpu, context);
2610 2869
2611 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 2870 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2612 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 2871 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2613 2872
2614 return r; 2873 return r;
2615} 2874}
2875EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2876
2877static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2878{
2879 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2880
2881 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
2882 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
2883 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
2884
2885 return r;
2886}
2887
2888static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
2889{
2890 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
2891
2892 g_context->get_cr3 = get_cr3;
2893 g_context->inject_page_fault = kvm_inject_page_fault;
2894
2895 /*
2896 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
2897 * translation of l2_gpa to l1_gpa addresses is done using the
2898 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
2899 * functions between mmu and nested_mmu are swapped.
2900 */
2901 if (!is_paging(vcpu)) {
2902 g_context->nx = false;
2903 g_context->root_level = 0;
2904 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
2905 } else if (is_long_mode(vcpu)) {
2906 g_context->nx = is_nx(vcpu);
2907 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
2908 g_context->root_level = PT64_ROOT_LEVEL;
2909 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2910 } else if (is_pae(vcpu)) {
2911 g_context->nx = is_nx(vcpu);
2912 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
2913 g_context->root_level = PT32E_ROOT_LEVEL;
2914 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2915 } else {
2916 g_context->nx = false;
2917 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
2918 g_context->root_level = PT32_ROOT_LEVEL;
2919 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
2920 }
2921
2922 return 0;
2923}
2616 2924
2617static int init_kvm_mmu(struct kvm_vcpu *vcpu) 2925static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2618{ 2926{
2619 vcpu->arch.update_pte.pfn = bad_pfn; 2927 vcpu->arch.update_pte.pfn = bad_pfn;
2620 2928
2621 if (tdp_enabled) 2929 if (mmu_is_nested(vcpu))
2930 return init_kvm_nested_mmu(vcpu);
2931 else if (tdp_enabled)
2622 return init_kvm_tdp_mmu(vcpu); 2932 return init_kvm_tdp_mmu(vcpu);
2623 else 2933 else
2624 return init_kvm_softmmu(vcpu); 2934 return init_kvm_softmmu(vcpu);
@@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2653 if (r) 2963 if (r)
2654 goto out; 2964 goto out;
2655 /* set_cr3() should ensure TLB has been flushed */ 2965 /* set_cr3() should ensure TLB has been flushed */
2656 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2966 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2657out: 2967out:
2658 return r; 2968 return r;
2659} 2969}
@@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2663{ 2973{
2664 mmu_free_roots(vcpu); 2974 mmu_free_roots(vcpu);
2665} 2975}
2976EXPORT_SYMBOL_GPL(kvm_mmu_unload);
2666 2977
2667static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 2978static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2668 struct kvm_mmu_page *sp, 2979 struct kvm_mmu_page *sp,
@@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2695 return; 3006 return;
2696 } 3007 }
2697 3008
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) 3009 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return; 3010 return;
2700 3011
2701 ++vcpu->kvm->stat.mmu_pte_updated; 3012 ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2837 kvm_mmu_access_page(vcpu, gfn); 3148 kvm_mmu_access_page(vcpu, gfn);
2838 kvm_mmu_free_some_pages(vcpu); 3149 kvm_mmu_free_some_pages(vcpu);
2839 ++vcpu->kvm->stat.mmu_pte_write; 3150 ++vcpu->kvm->stat.mmu_pte_write;
2840 kvm_mmu_audit(vcpu, "pre pte write"); 3151 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
2841 if (guest_initiated) { 3152 if (guest_initiated) {
2842 if (gfn == vcpu->arch.last_pt_write_gfn 3153 if (gfn == vcpu->arch.last_pt_write_gfn
2843 && !last_updated_pte_accessed(vcpu)) { 3154 && !last_updated_pte_accessed(vcpu)) {
@@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2910 } 3221 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3222 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3223 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2913 kvm_mmu_audit(vcpu, "post pte write"); 3224 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
2914 spin_unlock(&vcpu->kvm->mmu_lock); 3225 spin_unlock(&vcpu->kvm->mmu_lock);
2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 3226 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2916 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); 3227 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2923 gpa_t gpa; 3234 gpa_t gpa;
2924 int r; 3235 int r;
2925 3236
2926 if (tdp_enabled) 3237 if (vcpu->arch.mmu.direct_map)
2927 return 0; 3238 return 0;
2928 3239
2929 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3240 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2937 3248
2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 3249void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2939{ 3250{
2940 int free_pages;
2941 LIST_HEAD(invalid_list); 3251 LIST_HEAD(invalid_list);
2942 3252
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 3253 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
2944 while (free_pages < KVM_REFILL_PAGES &&
2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 3254 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2946 struct kvm_mmu_page *sp; 3255 struct kvm_mmu_page *sp;
2947 3256
2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3257 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2949 struct kvm_mmu_page, link); 3258 struct kvm_mmu_page, link);
2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3259 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2951 &invalid_list); 3260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2952 ++vcpu->kvm->stat.mmu_recycled; 3261 ++vcpu->kvm->stat.mmu_recycled;
2953 } 3262 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2955} 3263}
2956 3264
2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3265int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3013static void free_mmu_pages(struct kvm_vcpu *vcpu) 3321static void free_mmu_pages(struct kvm_vcpu *vcpu)
3014{ 3322{
3015 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3323 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3324 if (vcpu->arch.mmu.lm_root != NULL)
3325 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3016} 3326}
3017 3327
3018static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3328static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054 return init_kvm_mmu(vcpu); 3364 return init_kvm_mmu(vcpu);
3055} 3365}
3056 3366
3057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3058{
3059 ASSERT(vcpu);
3060
3061 destroy_kvm_mmu(vcpu);
3062 free_mmu_pages(vcpu);
3063 mmu_free_memory_caches(vcpu);
3064}
3065
3066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3367void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3067{ 3368{
3068 struct kvm_mmu_page *sp; 3369 struct kvm_mmu_page *sp;
@@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3112{ 3413{
3113 struct kvm *kvm; 3414 struct kvm *kvm;
3114 struct kvm *kvm_freed = NULL; 3415 struct kvm *kvm_freed = NULL;
3115 int cache_count = 0; 3416
3417 if (nr_to_scan == 0)
3418 goto out;
3116 3419
3117 spin_lock(&kvm_lock); 3420 spin_lock(&kvm_lock);
3118 3421
3119 list_for_each_entry(kvm, &vm_list, vm_list) { 3422 list_for_each_entry(kvm, &vm_list, vm_list) {
3120 int npages, idx, freed_pages; 3423 int idx, freed_pages;
3121 LIST_HEAD(invalid_list); 3424 LIST_HEAD(invalid_list);
3122 3425
3123 idx = srcu_read_lock(&kvm->srcu); 3426 idx = srcu_read_lock(&kvm->srcu);
3124 spin_lock(&kvm->mmu_lock); 3427 spin_lock(&kvm->mmu_lock);
3125 npages = kvm->arch.n_alloc_mmu_pages - 3428 if (!kvm_freed && nr_to_scan > 0 &&
3126 kvm->arch.n_free_mmu_pages; 3429 kvm->arch.n_used_mmu_pages > 0) {
3127 cache_count += npages;
3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3430 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list); 3431 &invalid_list);
3131 cache_count -= freed_pages;
3132 kvm_freed = kvm; 3432 kvm_freed = kvm;
3133 } 3433 }
3134 nr_to_scan--; 3434 nr_to_scan--;
@@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3142 3442
3143 spin_unlock(&kvm_lock); 3443 spin_unlock(&kvm_lock);
3144 3444
3145 return cache_count; 3445out:
3446 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3146} 3447}
3147 3448
3148static struct shrinker mmu_shrinker = { 3449static struct shrinker mmu_shrinker = {
@@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void)
3163void kvm_mmu_module_exit(void) 3464void kvm_mmu_module_exit(void)
3164{ 3465{
3165 mmu_destroy_caches(); 3466 mmu_destroy_caches();
3467 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3166 unregister_shrinker(&mmu_shrinker); 3468 unregister_shrinker(&mmu_shrinker);
3167} 3469}
3168 3470
@@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void)
3185 if (!mmu_page_header_cache) 3487 if (!mmu_page_header_cache)
3186 goto nomem; 3488 goto nomem;
3187 3489
3490 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3491 goto nomem;
3492
3188 register_shrinker(&mmu_shrinker); 3493 register_shrinker(&mmu_shrinker);
3189 3494
3190 return 0; 3495 return 0;
@@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3355} 3660}
3356EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3661EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3357 3662
3358#ifdef AUDIT 3663#ifdef CONFIG_KVM_MMU_AUDIT
3359 3664#include "mmu_audit.c"
3360static const char *audit_msg; 3665#else
3361 3666static void mmu_audit_disable(void) { }
3362static gva_t canonicalize(gva_t gva)
3363{
3364#ifdef CONFIG_X86_64
3365 gva = (long long)(gva << 16) >> 16;
3366#endif 3667#endif
3367 return gva;
3368}
3369
3370
3371typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3372
3373static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3374 inspect_spte_fn fn)
3375{
3376 int i;
3377
3378 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3379 u64 ent = sp->spt[i];
3380
3381 if (is_shadow_present_pte(ent)) {
3382 if (!is_last_spte(ent, sp->role.level)) {
3383 struct kvm_mmu_page *child;
3384 child = page_header(ent & PT64_BASE_ADDR_MASK);
3385 __mmu_spte_walk(kvm, child, fn);
3386 } else
3387 fn(kvm, &sp->spt[i]);
3388 }
3389 }
3390}
3391
3392static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3393{
3394 int i;
3395 struct kvm_mmu_page *sp;
3396
3397 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3398 return;
3399 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3400 hpa_t root = vcpu->arch.mmu.root_hpa;
3401 sp = page_header(root);
3402 __mmu_spte_walk(vcpu->kvm, sp, fn);
3403 return;
3404 }
3405 for (i = 0; i < 4; ++i) {
3406 hpa_t root = vcpu->arch.mmu.pae_root[i];
3407
3408 if (root && VALID_PAGE(root)) {
3409 root &= PT64_BASE_ADDR_MASK;
3410 sp = page_header(root);
3411 __mmu_spte_walk(vcpu->kvm, sp, fn);
3412 }
3413 }
3414 return;
3415}
3416
3417static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3418 gva_t va, int level)
3419{
3420 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3421 int i;
3422 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3423
3424 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3425 u64 ent = pt[i];
3426
3427 if (ent == shadow_trap_nonpresent_pte)
3428 continue;
3429
3430 va = canonicalize(va);
3431 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3432 audit_mappings_page(vcpu, ent, va, level - 1);
3433 else {
3434 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3435 gfn_t gfn = gpa >> PAGE_SHIFT;
3436 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3437 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3438 3668
3439 if (is_error_pfn(pfn)) { 3669void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3440 kvm_release_pfn_clean(pfn);
3441 continue;
3442 }
3443
3444 if (is_shadow_present_pte(ent)
3445 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3446 printk(KERN_ERR "xx audit error: (%s) levels %d"
3447 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3448 audit_msg, vcpu->arch.mmu.root_level,
3449 va, gpa, hpa, ent,
3450 is_shadow_present_pte(ent));
3451 else if (ent == shadow_notrap_nonpresent_pte
3452 && !is_error_hpa(hpa))
3453 printk(KERN_ERR "audit: (%s) notrap shadow,"
3454 " valid guest gva %lx\n", audit_msg, va);
3455 kvm_release_pfn_clean(pfn);
3456
3457 }
3458 }
3459}
3460
3461static void audit_mappings(struct kvm_vcpu *vcpu)
3462{
3463 unsigned i;
3464
3465 if (vcpu->arch.mmu.root_level == 4)
3466 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3467 else
3468 for (i = 0; i < 4; ++i)
3469 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3470 audit_mappings_page(vcpu,
3471 vcpu->arch.mmu.pae_root[i],
3472 i << 30,
3473 2);
3474}
3475
3476static int count_rmaps(struct kvm_vcpu *vcpu)
3477{
3478 struct kvm *kvm = vcpu->kvm;
3479 struct kvm_memslots *slots;
3480 int nmaps = 0;
3481 int i, j, k, idx;
3482
3483 idx = srcu_read_lock(&kvm->srcu);
3484 slots = kvm_memslots(kvm);
3485 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3486 struct kvm_memory_slot *m = &slots->memslots[i];
3487 struct kvm_rmap_desc *d;
3488
3489 for (j = 0; j < m->npages; ++j) {
3490 unsigned long *rmapp = &m->rmap[j];
3491
3492 if (!*rmapp)
3493 continue;
3494 if (!(*rmapp & 1)) {
3495 ++nmaps;
3496 continue;
3497 }
3498 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3499 while (d) {
3500 for (k = 0; k < RMAP_EXT; ++k)
3501 if (d->sptes[k])
3502 ++nmaps;
3503 else
3504 break;
3505 d = d->more;
3506 }
3507 }
3508 }
3509 srcu_read_unlock(&kvm->srcu, idx);
3510 return nmaps;
3511}
3512
3513void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3514{
3515 unsigned long *rmapp;
3516 struct kvm_mmu_page *rev_sp;
3517 gfn_t gfn;
3518
3519 if (is_writable_pte(*sptep)) {
3520 rev_sp = page_header(__pa(sptep));
3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3522
3523 if (!gfn_to_memslot(kvm, gfn)) {
3524 if (!printk_ratelimit())
3525 return;
3526 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3527 audit_msg, gfn);
3528 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3529 audit_msg, (long int)(sptep - rev_sp->spt),
3530 rev_sp->gfn);
3531 dump_stack();
3532 return;
3533 }
3534
3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3536 if (!*rmapp) {
3537 if (!printk_ratelimit())
3538 return;
3539 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3540 audit_msg, *sptep);
3541 dump_stack();
3542 }
3543 }
3544
3545}
3546
3547void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3548{
3549 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3550}
3551
3552static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3553{
3554 struct kvm_mmu_page *sp;
3555 int i;
3556
3557 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3558 u64 *pt = sp->spt;
3559
3560 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3561 continue;
3562
3563 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3564 u64 ent = pt[i];
3565
3566 if (!(ent & PT_PRESENT_MASK))
3567 continue;
3568 if (!is_writable_pte(ent))
3569 continue;
3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3571 }
3572 }
3573 return;
3574}
3575
3576static void audit_rmap(struct kvm_vcpu *vcpu)
3577{
3578 check_writable_mappings_rmap(vcpu);
3579 count_rmaps(vcpu);
3580}
3581
3582static void audit_write_protection(struct kvm_vcpu *vcpu)
3583{
3584 struct kvm_mmu_page *sp;
3585 struct kvm_memory_slot *slot;
3586 unsigned long *rmapp;
3587 u64 *spte;
3588 gfn_t gfn;
3589
3590 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3591 if (sp->role.direct)
3592 continue;
3593 if (sp->unsync)
3594 continue;
3595
3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3598
3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3600 while (spte) {
3601 if (is_writable_pte(*spte))
3602 printk(KERN_ERR "%s: (%s) shadow page has "
3603 "writable mappings: gfn %lx role %x\n",
3604 __func__, audit_msg, sp->gfn,
3605 sp->role.word);
3606 spte = rmap_next(vcpu->kvm, rmapp, spte);
3607 }
3608 }
3609}
3610
3611static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3612{ 3670{
3613 int olddbg = dbg; 3671 ASSERT(vcpu);
3614 3672
3615 dbg = 0; 3673 destroy_kvm_mmu(vcpu);
3616 audit_msg = msg; 3674 free_mmu_pages(vcpu);
3617 audit_rmap(vcpu); 3675 mmu_free_memory_caches(vcpu);
3618 audit_write_protection(vcpu); 3676 mmu_audit_disable();
3619 if (strcmp("pre pte write", audit_msg) != 0)
3620 audit_mappings(vcpu);
3621 audit_writable_sptes_have_rmaps(vcpu);
3622 dbg = olddbg;
3623} 3677}
3624
3625#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,10 +49,17 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
55{
56 return kvm->arch.n_max_mmu_pages -
57 kvm->arch.n_used_mmu_pages;
58}
52 59
53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 60static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
54{ 61{
55 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 62 if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
56 __kvm_mmu_free_some_pages(vcpu); 63 __kvm_mmu_free_some_pages(vcpu);
57} 64}
58 65
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..ba2bcdde6221
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,299 @@
1/*
2 * mmu_audit.c:
3 *
4 * Audit code for KVM MMU
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 *
9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com>
11 * Avi Kivity <avi@qumranet.com>
12 * Marcelo Tosatti <mtosatti@redhat.com>
13 * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include <linux/ratelimit.h>
21
22static int audit_point;
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args)
27
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29
30static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
31 inspect_spte_fn fn, int level)
32{
33 int i;
34
35 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
36 u64 *ent = sp->spt;
37
38 fn(vcpu, ent + i, level);
39
40 if (is_shadow_present_pte(ent[i]) &&
41 !is_last_spte(ent[i], level)) {
42 struct kvm_mmu_page *child;
43
44 child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
45 __mmu_spte_walk(vcpu, child, fn, level - 1);
46 }
47 }
48}
49
50static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
51{
52 int i;
53 struct kvm_mmu_page *sp;
54
55 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
56 return;
57
58 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
59 hpa_t root = vcpu->arch.mmu.root_hpa;
60
61 sp = page_header(root);
62 __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
63 return;
64 }
65
66 for (i = 0; i < 4; ++i) {
67 hpa_t root = vcpu->arch.mmu.pae_root[i];
68
69 if (root && VALID_PAGE(root)) {
70 root &= PT64_BASE_ADDR_MASK;
71 sp = page_header(root);
72 __mmu_spte_walk(vcpu, sp, fn, 2);
73 }
74 }
75
76 return;
77}
78
79typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
80
81static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
82{
83 struct kvm_mmu_page *sp;
84
85 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
86 fn(kvm, sp);
87}
88
89static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
90{
91 struct kvm_mmu_page *sp;
92 gfn_t gfn;
93 pfn_t pfn;
94 hpa_t hpa;
95
96 sp = page_header(__pa(sptep));
97
98 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level);
101 return;
102 }
103
104 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp);
112 return;
113 }
114
115 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
116 return;
117
118 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
119 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
120
121 if (is_error_pfn(pfn)) {
122 kvm_release_pfn_clean(pfn);
123 return;
124 }
125
126 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
130}
131
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
133{
134 unsigned long *rmapp;
135 struct kvm_mmu_page *rev_sp;
136 gfn_t gfn;
137
138
139 rev_sp = page_header(__pa(sptep));
140 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
141
142 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit())
144 return;
145 audit_printk("no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack();
149 return;
150 }
151
152 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
153 if (!*rmapp) {
154 if (!printk_ratelimit())
155 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep);
157 dump_stack();
158 }
159}
160
161static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
162{
163 if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
164 inspect_spte_has_rmap(vcpu->kvm, sptep);
165}
166
167static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp);
173}
174
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
176{
177 int i;
178
179 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
180 return;
181
182 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
183 if (!is_rmap_spte(sp->spt[i]))
184 continue;
185
186 inspect_spte_has_rmap(kvm, sp->spt + i);
187 }
188}
189
190static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
191{
192 struct kvm_memory_slot *slot;
193 unsigned long *rmapp;
194 u64 *spte;
195
196 if (sp->role.direct || sp->unsync || sp->role.invalid)
197 return;
198
199 slot = gfn_to_memslot(kvm, sp->gfn);
200 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
201
202 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) {
204 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn "
206 "%llx role %x\n", sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte);
208 }
209}
210
211static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
212{
213 check_mappings_rmap(kvm, sp);
214 audit_write_protection(kvm, sp);
215}
216
217static void audit_all_active_sps(struct kvm *kvm)
218{
219 walk_all_active_sps(kvm, audit_sp);
220}
221
222static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
223{
224 audit_sptes_have_rmaps(vcpu, sptep, level);
225 audit_mappings(vcpu, sptep, level);
226 audit_spte_after_sync(vcpu, sptep, level);
227}
228
229static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
230{
231 mmu_spte_walk(vcpu, audit_spte);
232}
233
234static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
235{
236 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
237
238 if (!__ratelimit(&ratelimit_state))
239 return;
240
241 audit_point = point;
242 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu);
244}
245
246static bool mmu_audit;
247
248static void mmu_audit_enable(void)
249{
250 int ret;
251
252 if (mmu_audit)
253 return;
254
255 ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
256 WARN_ON(ret);
257
258 mmu_audit = true;
259}
260
261static void mmu_audit_disable(void)
262{
263 if (!mmu_audit)
264 return;
265
266 unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
267 tracepoint_synchronize_unregister();
268 mmu_audit = false;
269}
270
271static int mmu_audit_set(const char *val, const struct kernel_param *kp)
272{
273 int ret;
274 unsigned long enable;
275
276 ret = strict_strtoul(val, 10, &enable);
277 if (ret < 0)
278 return -EINVAL;
279
280 switch (enable) {
281 case 0:
282 mmu_audit_disable();
283 break;
284 case 1:
285 mmu_audit_enable();
286 break;
287 default:
288 return -EINVAL;
289 }
290
291 return 0;
292}
293
294static struct kernel_param_ops audit_param_ops = {
295 .set = mmu_audit_set,
296 .get = param_get_bool,
297};
298
299module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0930ef..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198
199TRACE_EVENT(
200 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
202 TP_ARGS(vcpu, audit_point),
203
204 TP_STRUCT__entry(
205 __field(struct kvm_vcpu *, vcpu)
206 __field(int, audit_point)
207 ),
208
209 TP_fast_assign(
210 __entry->vcpu = vcpu;
211 __entry->audit_point = audit_point;
212 ),
213
214 TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
215 audit_point_name[__entry->audit_point])
216);
198#endif /* _TRACE_KVMMMU_H */ 217#endif /* _TRACE_KVMMMU_H */
199 218
200#undef TRACE_INCLUDE_PATH 219#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef9097960d..cd7a833a3b52 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +67,7 @@ struct guest_walker {
67 int level; 67 int level;
68 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 68 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
69 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 69 pt_element_t ptes[PT_MAX_FULL_LEVELS];
70 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 71 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 unsigned pt_access; 72 unsigned pt_access;
72 unsigned pte_access; 73 unsigned pte_access;
@@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
104 105
105 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 106 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
106#if PTTYPE == 64 107#if PTTYPE == 64
107 if (is_nx(vcpu)) 108 if (vcpu->arch.mmu.nx)
108 access &= ~(gpte >> PT64_NX_SHIFT); 109 access &= ~(gpte >> PT64_NX_SHIFT);
109#endif 110#endif
110 return access; 111 return access;
@@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113/* 114/*
114 * Fetch a guest pte for a guest virtual address 115 * Fetch a guest pte for a guest virtual address
115 */ 116 */
116static int FNAME(walk_addr)(struct guest_walker *walker, 117static int FNAME(walk_addr_generic)(struct guest_walker *walker,
117 struct kvm_vcpu *vcpu, gva_t addr, 118 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
118 int write_fault, int user_fault, int fetch_fault) 119 gva_t addr, u32 access)
119{ 120{
120 pt_element_t pte; 121 pt_element_t pte;
121 gfn_t table_gfn; 122 gfn_t table_gfn;
122 unsigned index, pt_access, uninitialized_var(pte_access); 123 unsigned index, pt_access, uninitialized_var(pte_access);
123 gpa_t pte_gpa; 124 gpa_t pte_gpa;
124 bool eperm, present, rsvd_fault; 125 bool eperm, present, rsvd_fault;
126 int offset, write_fault, user_fault, fetch_fault;
127
128 write_fault = access & PFERR_WRITE_MASK;
129 user_fault = access & PFERR_USER_MASK;
130 fetch_fault = access & PFERR_FETCH_MASK;
125 131
126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 132 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
127 fetch_fault); 133 fetch_fault);
128walk: 134walk:
129 present = true; 135 present = true;
130 eperm = rsvd_fault = false; 136 eperm = rsvd_fault = false;
131 walker->level = vcpu->arch.mmu.root_level; 137 walker->level = mmu->root_level;
132 pte = vcpu->arch.cr3; 138 pte = mmu->get_cr3(vcpu);
139
133#if PTTYPE == 64 140#if PTTYPE == 64
134 if (!is_long_mode(vcpu)) { 141 if (walker->level == PT32E_ROOT_LEVEL) {
135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 142 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
136 trace_kvm_mmu_paging_element(pte, walker->level); 143 trace_kvm_mmu_paging_element(pte, walker->level);
137 if (!is_present_gpte(pte)) { 144 if (!is_present_gpte(pte)) {
138 present = false; 145 present = false;
@@ -142,7 +149,7 @@ walk:
142 } 149 }
143#endif 150#endif
144 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 151 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
145 (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); 152 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
146 153
147 pt_access = ACC_ALL; 154 pt_access = ACC_ALL;
148 155
@@ -150,12 +157,14 @@ walk:
150 index = PT_INDEX(addr, walker->level); 157 index = PT_INDEX(addr, walker->level);
151 158
152 table_gfn = gpte_to_gfn(pte); 159 table_gfn = gpte_to_gfn(pte);
153 pte_gpa = gfn_to_gpa(table_gfn); 160 offset = index * sizeof(pt_element_t);
154 pte_gpa += index * sizeof(pt_element_t); 161 pte_gpa = gfn_to_gpa(table_gfn) + offset;
155 walker->table_gfn[walker->level - 1] = table_gfn; 162 walker->table_gfn[walker->level - 1] = table_gfn;
156 walker->pte_gpa[walker->level - 1] = pte_gpa; 163 walker->pte_gpa[walker->level - 1] = pte_gpa;
157 164
158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { 165 if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
166 offset, sizeof(pte),
167 PFERR_USER_MASK|PFERR_WRITE_MASK)) {
159 present = false; 168 present = false;
160 break; 169 break;
161 } 170 }
@@ -167,7 +176,7 @@ walk:
167 break; 176 break;
168 } 177 }
169 178
170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) { 179 if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
171 rsvd_fault = true; 180 rsvd_fault = true;
172 break; 181 break;
173 } 182 }
@@ -204,17 +213,28 @@ walk:
204 (PTTYPE == 64 || is_pse(vcpu))) || 213 (PTTYPE == 64 || is_pse(vcpu))) ||
205 ((walker->level == PT_PDPE_LEVEL) && 214 ((walker->level == PT_PDPE_LEVEL) &&
206 is_large_pte(pte) && 215 is_large_pte(pte) &&
207 is_long_mode(vcpu))) { 216 mmu->root_level == PT64_ROOT_LEVEL)) {
208 int lvl = walker->level; 217 int lvl = walker->level;
218 gpa_t real_gpa;
219 gfn_t gfn;
220 u32 ac;
209 221
210 walker->gfn = gpte_to_gfn_lvl(pte, lvl); 222 gfn = gpte_to_gfn_lvl(pte, lvl);
211 walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) 223 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
212 >> PAGE_SHIFT;
213 224
214 if (PTTYPE == 32 && 225 if (PTTYPE == 32 &&
215 walker->level == PT_DIRECTORY_LEVEL && 226 walker->level == PT_DIRECTORY_LEVEL &&
216 is_cpuid_PSE36()) 227 is_cpuid_PSE36())
217 walker->gfn += pse36_gfn_delta(pte); 228 gfn += pse36_gfn_delta(pte);
229
230 ac = write_fault | fetch_fault | user_fault;
231
232 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
233 ac);
234 if (real_gpa == UNMAPPED_GVA)
235 return 0;
236
237 walker->gfn = real_gpa >> PAGE_SHIFT;
218 238
219 break; 239 break;
220 } 240 }
@@ -249,18 +269,36 @@ error:
249 walker->error_code = 0; 269 walker->error_code = 0;
250 if (present) 270 if (present)
251 walker->error_code |= PFERR_PRESENT_MASK; 271 walker->error_code |= PFERR_PRESENT_MASK;
252 if (write_fault) 272
253 walker->error_code |= PFERR_WRITE_MASK; 273 walker->error_code |= write_fault | user_fault;
254 if (user_fault) 274
255 walker->error_code |= PFERR_USER_MASK; 275 if (fetch_fault && mmu->nx)
256 if (fetch_fault && is_nx(vcpu))
257 walker->error_code |= PFERR_FETCH_MASK; 276 walker->error_code |= PFERR_FETCH_MASK;
258 if (rsvd_fault) 277 if (rsvd_fault)
259 walker->error_code |= PFERR_RSVD_MASK; 278 walker->error_code |= PFERR_RSVD_MASK;
279
280 vcpu->arch.fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code;
282
260 trace_kvm_mmu_walker_error(walker->error_code); 283 trace_kvm_mmu_walker_error(walker->error_code);
261 return 0; 284 return 0;
262} 285}
263 286
287static int FNAME(walk_addr)(struct guest_walker *walker,
288 struct kvm_vcpu *vcpu, gva_t addr, u32 access)
289{
290 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
291 access);
292}
293
294static int FNAME(walk_addr_nested)(struct guest_walker *walker,
295 struct kvm_vcpu *vcpu, gva_t addr,
296 u32 access)
297{
298 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
299 addr, access);
300}
301
264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
265 u64 *spte, const void *pte) 303 u64 *spte, const void *pte)
266{ 304{
@@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 340static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level) 341 struct guest_walker *gw, int level)
304{ 342{
305 int r;
306 pt_element_t curr_pte; 343 pt_element_t curr_pte;
307 344 gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], 345 u64 mask;
346 int r, index;
347
348 if (level == PT_PAGE_TABLE_LEVEL) {
349 mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
350 base_gpa = pte_gpa & ~mask;
351 index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
352
353 r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
354 gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
355 curr_pte = gw->prefetch_ptes[index];
356 } else
357 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
309 &curr_pte, sizeof(curr_pte)); 358 &curr_pte, sizeof(curr_pte));
359
310 return r || curr_pte != gw->ptes[level - 1]; 360 return r || curr_pte != gw->ptes[level - 1];
311} 361}
312 362
363static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep)
365{
366 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte;
370 int i;
371
372 sp = page_header(__pa(sptep));
373
374 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
375 return;
376
377 if (sp->role.direct)
378 return __direct_pte_prefetch(vcpu, sp, sptep);
379
380 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
381 spte = sp->spt + i;
382
383 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
384 pt_element_t gpte;
385 unsigned pte_access;
386 gfn_t gfn;
387 pfn_t pfn;
388 bool dirty;
389
390 if (spte == sptep)
391 continue;
392
393 if (*spte != shadow_trap_nonpresent_pte)
394 continue;
395
396 gpte = gptep[i];
397
398 if (!is_present_gpte(gpte) ||
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue;
407
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
409 gfn = gpte_to_gfn(gpte);
410 dirty = is_dirty_gpte(gpte);
411 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
412 (pte_access & ACC_WRITE_MASK) && dirty);
413 if (is_error_pfn(pfn)) {
414 kvm_release_pfn_clean(pfn);
415 break;
416 }
417
418 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
419 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
420 pfn, true, true);
421 }
422}
423
313/* 424/*
314 * Fetch a shadow pte for a specific level in the paging hierarchy. 425 * Fetch a shadow pte for a specific level in the paging hierarchy.
315 */ 426 */
@@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level, 503 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true); 504 gw->gfn, pfn, false, true);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
394 506
395 return it.sptep; 507 return it.sptep;
396 508
@@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
420{ 532{
421 int write_fault = error_code & PFERR_WRITE_MASK; 533 int write_fault = error_code & PFERR_WRITE_MASK;
422 int user_fault = error_code & PFERR_USER_MASK; 534 int user_fault = error_code & PFERR_USER_MASK;
423 int fetch_fault = error_code & PFERR_FETCH_MASK;
424 struct guest_walker walker; 535 struct guest_walker walker;
425 u64 *sptep; 536 u64 *sptep;
426 int write_pt = 0; 537 int write_pt = 0;
@@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
430 unsigned long mmu_seq; 541 unsigned long mmu_seq;
431 542
432 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
433 kvm_mmu_audit(vcpu, "pre page fault");
434 544
435 r = mmu_topup_memory_caches(vcpu); 545 r = mmu_topup_memory_caches(vcpu);
436 if (r) 546 if (r)
@@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 /* 549 /*
440 * Look up the guest pte for the faulting address. 550 * Look up the guest pte for the faulting address.
441 */ 551 */
442 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 552 r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
443 fetch_fault);
444 553
445 /* 554 /*
446 * The page is not mapped by the guest. Let the guest handle it. 555 * The page is not mapped by the guest. Let the guest handle it.
447 */ 556 */
448 if (!r) { 557 if (!r) {
449 pgprintk("%s: guest page fault\n", __func__); 558 pgprintk("%s: guest page fault\n", __func__);
450 inject_page_fault(vcpu, addr, walker.error_code); 559 inject_page_fault(vcpu);
451 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
452 return 0; 561 return 0;
453 } 562 }
@@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
468 spin_lock(&vcpu->kvm->mmu_lock); 577 spin_lock(&vcpu->kvm->mmu_lock);
469 if (mmu_notifier_retry(vcpu, mmu_seq)) 578 if (mmu_notifier_retry(vcpu, mmu_seq))
470 goto out_unlock; 579 goto out_unlock;
580
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
471 kvm_mmu_free_some_pages(vcpu); 582 kvm_mmu_free_some_pages(vcpu);
472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
473 level, &write_pt, pfn); 584 level, &write_pt, pfn);
@@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
479 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 590 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
480 591
481 ++vcpu->stat.pf_fixed; 592 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 593 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
483 spin_unlock(&vcpu->kvm->mmu_lock); 594 spin_unlock(&vcpu->kvm->mmu_lock);
484 595
485 return write_pt; 596 return write_pt;
@@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
556 gpa_t gpa = UNMAPPED_GVA; 667 gpa_t gpa = UNMAPPED_GVA;
557 int r; 668 int r;
558 669
559 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 670 r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
560 !!(access & PFERR_WRITE_MASK), 671
561 !!(access & PFERR_USER_MASK), 672 if (r) {
562 !!(access & PFERR_FETCH_MASK)); 673 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error)
676 *error = walker.error_code;
677
678 return gpa;
679}
680
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error)
683{
684 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA;
686 int r;
687
688 r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
563 689
564 if (r) { 690 if (r) {
565 gpa = gfn_to_gpa(walker.gfn); 691 gpa = gfn_to_gpa(walker.gfn);
@@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
638 return -EINVAL; 764 return -EINVAL;
639 765
640 gfn = gpte_to_gfn(gpte); 766 gfn = gpte_to_gfn(gpte);
641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) 767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte) 768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) { 769 || !(gpte & PT_ACCESSED_MASK)) {
644 u64 nonpresent; 770 u64 nonpresent;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 81ed28cb36e6..82e144a4e514 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affilates. 7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 8 *
9 * Authors: 9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -88,6 +88,14 @@ struct nested_state {
88 /* A VMEXIT is required but not yet emulated */ 88 /* A VMEXIT is required but not yet emulated */
89 bool exit_required; 89 bool exit_required;
90 90
91 /*
92 * If we vmexit during an instruction emulation we need this to restore
93 * the l1 guest rip after the emulation
94 */
95 unsigned long vmexit_rip;
96 unsigned long vmexit_rsp;
97 unsigned long vmexit_rax;
98
91 /* cache for intercepts of the guest */ 99 /* cache for intercepts of the guest */
92 u16 intercept_cr_read; 100 u16 intercept_cr_read;
93 u16 intercept_cr_write; 101 u16 intercept_cr_write;
@@ -96,6 +104,8 @@ struct nested_state {
96 u32 intercept_exceptions; 104 u32 intercept_exceptions;
97 u64 intercept; 105 u64 intercept;
98 106
107 /* Nested Paging related state */
108 u64 nested_cr3;
99}; 109};
100 110
101#define MSRPM_OFFSETS 16 111#define MSRPM_OFFSETS 16
@@ -284,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
284 force_new_asid(vcpu); 294 force_new_asid(vcpu);
285} 295}
286 296
297static int get_npt_level(void)
298{
299#ifdef CONFIG_X86_64
300 return PT64_ROOT_LEVEL;
301#else
302 return PT32E_ROOT_LEVEL;
303#endif
304}
305
287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 306static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
288{ 307{
289 vcpu->arch.efer = efer; 308 vcpu->arch.efer = efer;
@@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
701 seg->base = 0; 720 seg->base = 0;
702} 721}
703 722
723static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
724{
725 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0;
727
728 if (is_nested(svm)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset;
732 }
733
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
735}
736
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
738{
739 struct vcpu_svm *svm = to_svm(vcpu);
740
741 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm))
743 svm->nested.hsave->control.tsc_offset += adjustment;
744}
745
704static void init_vmcb(struct vcpu_svm *svm) 746static void init_vmcb(struct vcpu_svm *svm)
705{ 747{
706 struct vmcb_control_area *control = &svm->vmcb->control; 748 struct vmcb_control_area *control = &svm->vmcb->control;
@@ -793,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm)
793 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 835 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
794 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 836 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
795 837
796 save->efer = EFER_SVME; 838 svm_set_efer(&svm->vcpu, 0);
797 save->dr6 = 0xffff0ff0; 839 save->dr6 = 0xffff0ff0;
798 save->dr7 = 0x400; 840 save->dr7 = 0x400;
799 save->rflags = 2; 841 save->rflags = 2;
@@ -804,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm)
804 * This is the guest-visible cr0 value. 846 * This is the guest-visible cr0 value.
805 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 847 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
806 */ 848 */
807 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 849 svm->vcpu.arch.cr0 = 0;
808 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 850 (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
809 851
810 save->cr4 = X86_CR4_PAE; 852 save->cr4 = X86_CR4_PAE;
811 /* rdx = ?? */ 853 /* rdx = ?? */
@@ -901,7 +943,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
901 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 943 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
902 svm->asid_generation = 0; 944 svm->asid_generation = 0;
903 init_vmcb(svm); 945 init_vmcb(svm);
904 svm->vmcb->control.tsc_offset = 0-native_read_tsc(); 946 kvm_write_tsc(&svm->vcpu, 0);
905 947
906 err = fx_init(&svm->vcpu); 948 err = fx_init(&svm->vcpu);
907 if (err) 949 if (err)
@@ -947,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
947 int i; 989 int i;
948 990
949 if (unlikely(cpu != vcpu->cpu)) { 991 if (unlikely(cpu != vcpu->cpu)) {
950 u64 delta;
951
952 if (check_tsc_unstable()) {
953 /*
954 * Make sure that the guest sees a monotonically
955 * increasing TSC.
956 */
957 delta = vcpu->arch.host_tsc - native_read_tsc();
958 svm->vmcb->control.tsc_offset += delta;
959 if (is_nested(svm))
960 svm->nested.hsave->control.tsc_offset += delta;
961 }
962 vcpu->cpu = cpu;
963 kvm_migrate_timers(vcpu);
964 svm->asid_generation = 0; 992 svm->asid_generation = 0;
965 } 993 }
966 994
@@ -976,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
976 ++vcpu->stat.host_state_reload; 1004 ++vcpu->stat.host_state_reload;
977 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
978 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
979
980 vcpu->arch.host_tsc = native_read_tsc();
981} 1007}
982 1008
983static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1009static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -995,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
995 switch (reg) { 1021 switch (reg) {
996 case VCPU_EXREG_PDPTR: 1022 case VCPU_EXREG_PDPTR:
997 BUG_ON(!npt_enabled); 1023 BUG_ON(!npt_enabled);
998 load_pdptrs(vcpu, vcpu->arch.cr3); 1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
999 break; 1025 break;
1000 default: 1026 default:
1001 BUG(); 1027 BUG();
@@ -1206,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1206 if (old == new) { 1232 if (old == new) {
1207 /* cr0 write with ts and mp unchanged */ 1233 /* cr0 write with ts and mp unchanged */
1208 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 1234 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1209 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) 1235 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
1236 svm->nested.vmexit_rip = kvm_rip_read(vcpu);
1237 svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
1238 svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
1210 return; 1239 return;
1240 }
1211 } 1241 }
1212 } 1242 }
1213 1243
@@ -1581,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1581 return 1; 1611 return 1;
1582} 1612}
1583 1613
1614static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1615{
1616 struct vcpu_svm *svm = to_svm(vcpu);
1617
1618 return svm->nested.nested_cr3;
1619}
1620
1621static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1622 unsigned long root)
1623{
1624 struct vcpu_svm *svm = to_svm(vcpu);
1625
1626 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu);
1628}
1629
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
1631{
1632 struct vcpu_svm *svm = to_svm(vcpu);
1633
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
1638
1639 nested_svm_vmexit(svm);
1640}
1641
1642static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1643{
1644 int r;
1645
1646 r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1647
1648 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1649 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
1650 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1651 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1652 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
1653
1654 return r;
1655}
1656
1657static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1658{
1659 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
1660}
1661
1584static int nested_svm_check_permissions(struct vcpu_svm *svm) 1662static int nested_svm_check_permissions(struct vcpu_svm *svm)
1585{ 1663{
1586 if (!(svm->vcpu.arch.efer & EFER_SVME) 1664 if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1629,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1629 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1707 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1630 return false; 1708 return false;
1631 1709
1710 /*
1711 * if vmexit was already requested (by intercepted exception
1712 * for instance) do not overwrite it with "external interrupt"
1713 * vmexit.
1714 */
1715 if (svm->nested.exit_required)
1716 return false;
1717
1632 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1718 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1633 svm->vmcb->control.exit_info_1 = 0; 1719 svm->vmcb->control.exit_info_1 = 0;
1634 svm->vmcb->control.exit_info_2 = 0; 1720 svm->vmcb->control.exit_info_2 = 0;
@@ -1896,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1896 nested_vmcb->save.ds = vmcb->save.ds; 1982 nested_vmcb->save.ds = vmcb->save.ds;
1897 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1983 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1898 nested_vmcb->save.idtr = vmcb->save.idtr; 1984 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1899 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1900 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1901 nested_vmcb->save.cr2 = vmcb->save.cr2; 1988 nested_vmcb->save.cr2 = vmcb->save.cr2;
@@ -1917,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1917 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 2004 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1918 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 2005 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1919 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 2006 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2007 nested_vmcb->control.next_rip = vmcb->control.next_rip;
1920 2008
1921 /* 2009 /*
1922 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 2010 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -1947,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1947 kvm_clear_exception_queue(&svm->vcpu); 2035 kvm_clear_exception_queue(&svm->vcpu);
1948 kvm_clear_interrupt_queue(&svm->vcpu); 2036 kvm_clear_interrupt_queue(&svm->vcpu);
1949 2037
2038 svm->nested.nested_cr3 = 0;
2039
1950 /* Restore selected save entries */ 2040 /* Restore selected save entries */
1951 svm->vmcb->save.es = hsave->save.es; 2041 svm->vmcb->save.es = hsave->save.es;
1952 svm->vmcb->save.cs = hsave->save.cs; 2042 svm->vmcb->save.cs = hsave->save.cs;
@@ -1973,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1973 2063
1974 nested_svm_unmap(page); 2064 nested_svm_unmap(page);
1975 2065
2066 nested_svm_uninit_mmu_context(&svm->vcpu);
1976 kvm_mmu_reset_context(&svm->vcpu); 2067 kvm_mmu_reset_context(&svm->vcpu);
1977 kvm_mmu_load(&svm->vcpu); 2068 kvm_mmu_load(&svm->vcpu);
1978 2069
@@ -2012,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2012 return true; 2103 return true;
2013} 2104}
2014 2105
2106static bool nested_vmcb_checks(struct vmcb *vmcb)
2107{
2108 if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2109 return false;
2110
2111 if (vmcb->control.asid == 0)
2112 return false;
2113
2114 if (vmcb->control.nested_ctl && !npt_enabled)
2115 return false;
2116
2117 return true;
2118}
2119
2015static bool nested_svm_vmrun(struct vcpu_svm *svm) 2120static bool nested_svm_vmrun(struct vcpu_svm *svm)
2016{ 2121{
2017 struct vmcb *nested_vmcb; 2122 struct vmcb *nested_vmcb;
@@ -2026,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2026 if (!nested_vmcb) 2131 if (!nested_vmcb)
2027 return false; 2132 return false;
2028 2133
2029 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, 2134 if (!nested_vmcb_checks(nested_vmcb)) {
2135 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
2136 nested_vmcb->control.exit_code_hi = 0;
2137 nested_vmcb->control.exit_info_1 = 0;
2138 nested_vmcb->control.exit_info_2 = 0;
2139
2140 nested_svm_unmap(page);
2141
2142 return false;
2143 }
2144
2145 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2030 nested_vmcb->save.rip, 2146 nested_vmcb->save.rip,
2031 nested_vmcb->control.int_ctl, 2147 nested_vmcb->control.int_ctl,
2032 nested_vmcb->control.event_inj, 2148 nested_vmcb->control.event_inj,
@@ -2055,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2055 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 2171 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2056 hsave->save.cr4 = svm->vcpu.arch.cr4; 2172 hsave->save.cr4 = svm->vcpu.arch.cr4;
2057 hsave->save.rflags = vmcb->save.rflags; 2173 hsave->save.rflags = vmcb->save.rflags;
2058 hsave->save.rip = svm->next_rip; 2174 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2059 hsave->save.rsp = vmcb->save.rsp; 2175 hsave->save.rsp = vmcb->save.rsp;
2060 hsave->save.rax = vmcb->save.rax; 2176 hsave->save.rax = vmcb->save.rax;
2061 if (npt_enabled) 2177 if (npt_enabled)
@@ -2070,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2070 else 2186 else
2071 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 2187 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2072 2188
2189 if (nested_vmcb->control.nested_ctl) {
2190 kvm_mmu_unload(&svm->vcpu);
2191 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2192 nested_svm_init_mmu_context(&svm->vcpu);
2193 }
2194
2073 /* Load the nested guest state */ 2195 /* Load the nested guest state */
2074 svm->vmcb->save.es = nested_vmcb->save.es; 2196 svm->vmcb->save.es = nested_vmcb->save.es;
2075 svm->vmcb->save.cs = nested_vmcb->save.cs; 2197 svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2227,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
2227 if (nested_svm_check_permissions(svm)) 2349 if (nested_svm_check_permissions(svm))
2228 return 1; 2350 return 1;
2229 2351
2230 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2352 /* Save rip after vmrun instruction */
2231 skip_emulated_instruction(&svm->vcpu); 2353 kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2232 2354
2233 if (!nested_svm_vmrun(svm)) 2355 if (!nested_svm_vmrun(svm))
2234 return 1; 2356 return 1;
@@ -2257,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm)
2257 2379
2258 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2380 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2259 skip_emulated_instruction(&svm->vcpu); 2381 skip_emulated_instruction(&svm->vcpu);
2382 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2260 2383
2261 enable_gif(svm); 2384 enable_gif(svm);
2262 2385
@@ -2399,6 +2522,23 @@ static int emulate_on_interception(struct vcpu_svm *svm)
2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2522 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2400} 2523}
2401 2524
2525static int cr0_write_interception(struct vcpu_svm *svm)
2526{
2527 struct kvm_vcpu *vcpu = &svm->vcpu;
2528 int r;
2529
2530 r = emulate_instruction(&svm->vcpu, 0, 0, 0);
2531
2532 if (svm->nested.vmexit_rip) {
2533 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
2534 kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
2535 kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
2536 svm->nested.vmexit_rip = 0;
2537 }
2538
2539 return r == EMULATE_DONE;
2540}
2541
2402static int cr8_write_interception(struct vcpu_svm *svm) 2542static int cr8_write_interception(struct vcpu_svm *svm)
2403{ 2543{
2404 struct kvm_run *kvm_run = svm->vcpu.run; 2544 struct kvm_run *kvm_run = svm->vcpu.run;
@@ -2542,20 +2682,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2542 struct vcpu_svm *svm = to_svm(vcpu); 2682 struct vcpu_svm *svm = to_svm(vcpu);
2543 2683
2544 switch (ecx) { 2684 switch (ecx) {
2545 case MSR_IA32_TSC: { 2685 case MSR_IA32_TSC:
2546 u64 tsc_offset = data - native_read_tsc(); 2686 kvm_write_tsc(vcpu, data);
2547 u64 g_tsc_offset = 0;
2548
2549 if (is_nested(svm)) {
2550 g_tsc_offset = svm->vmcb->control.tsc_offset -
2551 svm->nested.hsave->control.tsc_offset;
2552 svm->nested.hsave->control.tsc_offset = tsc_offset;
2553 }
2554
2555 svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
2556
2557 break; 2687 break;
2558 }
2559 case MSR_STAR: 2688 case MSR_STAR:
2560 svm->vmcb->save.star = data; 2689 svm->vmcb->save.star = data;
2561 break; 2690 break;
@@ -2643,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2643{ 2772{
2644 struct kvm_run *kvm_run = svm->vcpu.run; 2773 struct kvm_run *kvm_run = svm->vcpu.run;
2645 2774
2775 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2646 svm_clear_vintr(svm); 2776 svm_clear_vintr(svm);
2647 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2777 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2648 /* 2778 /*
@@ -2672,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2672 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2802 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2673 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2803 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2674 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2804 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2675 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2805 [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
2676 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2806 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2677 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2807 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2678 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2808 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -2871,7 +3001,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2871 3001
2872 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3002 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2873 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3003 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2874 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) 3004 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3005 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
2875 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 3006 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
2876 "exit_code 0x%x\n", 3007 "exit_code 0x%x\n",
2877 __func__, svm->vmcb->control.exit_int_info, 3008 __func__, svm->vmcb->control.exit_int_info,
@@ -3088,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3088 3219
3089 svm->int3_injected = 0; 3220 svm->int3_injected = 0;
3090 3221
3091 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 3222 if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
3092 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3223 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3224 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3225 }
3093 3226
3094 svm->vcpu.arch.nmi_injected = false; 3227 svm->vcpu.arch.nmi_injected = false;
3095 kvm_clear_exception_queue(&svm->vcpu); 3228 kvm_clear_exception_queue(&svm->vcpu);
@@ -3098,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3098 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3231 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3099 return; 3232 return;
3100 3233
3234 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3235
3101 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3236 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3102 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3237 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3103 3238
@@ -3134,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3134 } 3269 }
3135} 3270}
3136 3271
3272static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3273{
3274 struct vcpu_svm *svm = to_svm(vcpu);
3275 struct vmcb_control_area *control = &svm->vmcb->control;
3276
3277 control->exit_int_info = control->event_inj;
3278 control->exit_int_info_err = control->event_inj_err;
3279 control->event_inj = 0;
3280 svm_complete_interrupts(svm);
3281}
3282
3137#ifdef CONFIG_X86_64 3283#ifdef CONFIG_X86_64
3138#define R "r" 3284#define R "r"
3139#else 3285#else
@@ -3163,13 +3309,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3163 sync_lapic_to_cr8(vcpu); 3309 sync_lapic_to_cr8(vcpu);
3164 3310
3165 save_host_msrs(vcpu); 3311 save_host_msrs(vcpu);
3166 fs_selector = kvm_read_fs(); 3312 savesegment(fs, fs_selector);
3167 gs_selector = kvm_read_gs(); 3313 savesegment(gs, gs_selector);
3168 ldt_selector = kvm_read_ldt(); 3314 ldt_selector = kvm_read_ldt();
3169 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3315 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3170 /* required for live migration with NPT */
3171 if (npt_enabled)
3172 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3173 3316
3174 clgi(); 3317 clgi();
3175 3318
@@ -3251,10 +3394,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3251 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3394 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3252 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3395 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3253 3396
3254 kvm_load_fs(fs_selector);
3255 kvm_load_gs(gs_selector);
3256 kvm_load_ldt(ldt_selector);
3257 load_host_msrs(vcpu); 3397 load_host_msrs(vcpu);
3398 loadsegment(fs, fs_selector);
3399#ifdef CONFIG_X86_64
3400 load_gs_index(gs_selector);
3401 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
3402#else
3403 loadsegment(gs, gs_selector);
3404#endif
3405 kvm_load_ldt(ldt_selector);
3258 3406
3259 reload_tss(vcpu); 3407 reload_tss(vcpu);
3260 3408
@@ -3286,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3286{ 3434{
3287 struct vcpu_svm *svm = to_svm(vcpu); 3435 struct vcpu_svm *svm = to_svm(vcpu);
3288 3436
3289 if (npt_enabled) {
3290 svm->vmcb->control.nested_cr3 = root;
3291 force_new_asid(vcpu);
3292 return;
3293 }
3294
3295 svm->vmcb->save.cr3 = root; 3437 svm->vmcb->save.cr3 = root;
3296 force_new_asid(vcpu); 3438 force_new_asid(vcpu);
3297} 3439}
3298 3440
3441static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3442{
3443 struct vcpu_svm *svm = to_svm(vcpu);
3444
3445 svm->vmcb->control.nested_cr3 = root;
3446
3447 /* Also sync guest cr3 here in case we live migrate */
3448 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3449
3450 force_new_asid(vcpu);
3451}
3452
3299static int is_disabled(void) 3453static int is_disabled(void)
3300{ 3454{
3301 u64 vm_cr; 3455 u64 vm_cr;
@@ -3328,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
3328 return false; 3482 return false;
3329} 3483}
3330 3484
3331static int get_npt_level(void)
3332{
3333#ifdef CONFIG_X86_64
3334 return PT64_ROOT_LEVEL;
3335#else
3336 return PT32E_ROOT_LEVEL;
3337#endif
3338}
3339
3340static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 3485static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3341{ 3486{
3342 return 0; 3487 return 0;
@@ -3349,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3349static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3350{ 3495{
3351 switch (func) { 3496 switch (func) {
3497 case 0x80000001:
3498 if (nested)
3499 entry->ecx |= (1 << 2); /* Set SVM bit */
3500 break;
3352 case 0x8000000A: 3501 case 0x8000000A:
3353 entry->eax = 1; /* SVM revision 1 */ 3502 entry->eax = 1; /* SVM revision 1 */
3354 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper 3503 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
3355 ASID emulation to nested SVM */ 3504 ASID emulation to nested SVM */
3356 entry->ecx = 0; /* Reserved */ 3505 entry->ecx = 0; /* Reserved */
3357 entry->edx = 0; /* Do not support any additional features */ 3506 entry->edx = 0; /* Per default do not support any
3507 additional features */
3508
3509 /* Support next_rip if host supports it */
3510 if (svm_has(SVM_FEATURE_NRIP))
3511 entry->edx |= SVM_FEATURE_NRIP;
3512
3513 /* Support NPT for the guest if enabled */
3514 if (npt_enabled)
3515 entry->edx |= SVM_FEATURE_NPT;
3358 3516
3359 break; 3517 break;
3360 } 3518 }
@@ -3492,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3492 .set_irq = svm_set_irq, 3650 .set_irq = svm_set_irq,
3493 .set_nmi = svm_inject_nmi, 3651 .set_nmi = svm_inject_nmi,
3494 .queue_exception = svm_queue_exception, 3652 .queue_exception = svm_queue_exception,
3653 .cancel_injection = svm_cancel_injection,
3495 .interrupt_allowed = svm_interrupt_allowed, 3654 .interrupt_allowed = svm_interrupt_allowed,
3496 .nmi_allowed = svm_nmi_allowed, 3655 .nmi_allowed = svm_nmi_allowed,
3497 .get_nmi_mask = svm_get_nmi_mask, 3656 .get_nmi_mask = svm_get_nmi_mask,
@@ -3514,6 +3673,11 @@ static struct kvm_x86_ops svm_x86_ops = {
3514 .set_supported_cpuid = svm_set_supported_cpuid, 3673 .set_supported_cpuid = svm_set_supported_cpuid,
3515 3674
3516 .has_wbinvd_exit = svm_has_wbinvd_exit, 3675 .has_wbinvd_exit = svm_has_wbinvd_exit,
3676
3677 .write_tsc_offset = svm_write_tsc_offset,
3678 .adjust_tsc_offset = svm_adjust_tsc_offset,
3679
3680 .set_tdp_cr3 = set_tdp_cr3,
3517}; 3681};
3518 3682
3519static int __init svm_init(void) 3683static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0dbe74d8..fc7a101c4a35 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * timer support 7 * timer support
8 * 8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory. 12 * the COPYING file in the top-level directory.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 49b25eee25ac..8da0e45ff7c9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -125,6 +125,7 @@ struct vcpu_vmx {
125 unsigned long host_rsp; 125 unsigned long host_rsp;
126 int launched; 126 int launched;
127 u8 fail; 127 u8 fail;
128 u32 exit_intr_info;
128 u32 idt_vectoring_info; 129 u32 idt_vectoring_info;
129 struct shared_msr_entry *guest_msrs; 130 struct shared_msr_entry *guest_msrs;
130 int nmsrs; 131 int nmsrs;
@@ -154,11 +155,6 @@ struct vcpu_vmx {
154 u32 limit; 155 u32 limit;
155 u32 ar; 156 u32 ar;
156 } tr, es, ds, fs, gs; 157 } tr, es, ds, fs, gs;
157 struct {
158 bool pending;
159 u8 vector;
160 unsigned rip;
161 } irq;
162 } rmode; 158 } rmode;
163 int vpid; 159 int vpid;
164 bool emulation_required; 160 bool emulation_required;
@@ -505,7 +501,6 @@ static void __vcpu_clear(void *arg)
505 vmcs_clear(vmx->vmcs); 501 vmcs_clear(vmx->vmcs);
506 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 502 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
507 per_cpu(current_vmcs, cpu) = NULL; 503 per_cpu(current_vmcs, cpu) = NULL;
508 rdtscll(vmx->vcpu.arch.host_tsc);
509 list_del(&vmx->local_vcpus_link); 504 list_del(&vmx->local_vcpus_link);
510 vmx->vcpu.cpu = -1; 505 vmx->vcpu.cpu = -1;
511 vmx->launched = 0; 506 vmx->launched = 0;
@@ -706,11 +701,10 @@ static void reload_tss(void)
706 /* 701 /*
707 * VT restores TR but not its size. Useless. 702 * VT restores TR but not its size. Useless.
708 */ 703 */
709 struct desc_ptr gdt; 704 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
710 struct desc_struct *descs; 705 struct desc_struct *descs;
711 706
712 native_store_gdt(&gdt); 707 descs = (void *)gdt->address;
713 descs = (void *)gdt.address;
714 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 708 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
715 load_TR_desc(); 709 load_TR_desc();
716} 710}
@@ -753,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
753 747
754static unsigned long segment_base(u16 selector) 748static unsigned long segment_base(u16 selector)
755{ 749{
756 struct desc_ptr gdt; 750 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
757 struct desc_struct *d; 751 struct desc_struct *d;
758 unsigned long table_base; 752 unsigned long table_base;
759 unsigned long v; 753 unsigned long v;
@@ -761,8 +755,7 @@ static unsigned long segment_base(u16 selector)
761 if (!(selector & ~3)) 755 if (!(selector & ~3))
762 return 0; 756 return 0;
763 757
764 native_store_gdt(&gdt); 758 table_base = gdt->address;
765 table_base = gdt.address;
766 759
767 if (selector & 4) { /* from ldt */ 760 if (selector & 4) { /* from ldt */
768 u16 ldt_selector = kvm_read_ldt(); 761 u16 ldt_selector = kvm_read_ldt();
@@ -803,7 +796,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
803 */ 796 */
804 vmx->host_state.ldt_sel = kvm_read_ldt(); 797 vmx->host_state.ldt_sel = kvm_read_ldt();
805 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 798 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
806 vmx->host_state.fs_sel = kvm_read_fs(); 799 savesegment(fs, vmx->host_state.fs_sel);
807 if (!(vmx->host_state.fs_sel & 7)) { 800 if (!(vmx->host_state.fs_sel & 7)) {
808 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 801 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
809 vmx->host_state.fs_reload_needed = 0; 802 vmx->host_state.fs_reload_needed = 0;
@@ -811,7 +804,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
811 vmcs_write16(HOST_FS_SELECTOR, 0); 804 vmcs_write16(HOST_FS_SELECTOR, 0);
812 vmx->host_state.fs_reload_needed = 1; 805 vmx->host_state.fs_reload_needed = 1;
813 } 806 }
814 vmx->host_state.gs_sel = kvm_read_gs(); 807 savesegment(gs, vmx->host_state.gs_sel);
815 if (!(vmx->host_state.gs_sel & 7)) 808 if (!(vmx->host_state.gs_sel & 7))
816 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 809 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
817 else { 810 else {
@@ -841,27 +834,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
841 834
842static void __vmx_load_host_state(struct vcpu_vmx *vmx) 835static void __vmx_load_host_state(struct vcpu_vmx *vmx)
843{ 836{
844 unsigned long flags;
845
846 if (!vmx->host_state.loaded) 837 if (!vmx->host_state.loaded)
847 return; 838 return;
848 839
849 ++vmx->vcpu.stat.host_state_reload; 840 ++vmx->vcpu.stat.host_state_reload;
850 vmx->host_state.loaded = 0; 841 vmx->host_state.loaded = 0;
851 if (vmx->host_state.fs_reload_needed) 842 if (vmx->host_state.fs_reload_needed)
852 kvm_load_fs(vmx->host_state.fs_sel); 843 loadsegment(fs, vmx->host_state.fs_sel);
853 if (vmx->host_state.gs_ldt_reload_needed) { 844 if (vmx->host_state.gs_ldt_reload_needed) {
854 kvm_load_ldt(vmx->host_state.ldt_sel); 845 kvm_load_ldt(vmx->host_state.ldt_sel);
855 /*
856 * If we have to reload gs, we must take care to
857 * preserve our gs base.
858 */
859 local_irq_save(flags);
860 kvm_load_gs(vmx->host_state.gs_sel);
861#ifdef CONFIG_X86_64 846#ifdef CONFIG_X86_64
862 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 847 load_gs_index(vmx->host_state.gs_sel);
848 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
849#else
850 loadsegment(gs, vmx->host_state.gs_sel);
863#endif 851#endif
864 local_irq_restore(flags);
865 } 852 }
866 reload_tss(); 853 reload_tss();
867#ifdef CONFIG_X86_64 854#ifdef CONFIG_X86_64
@@ -889,7 +876,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
889static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 876static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
890{ 877{
891 struct vcpu_vmx *vmx = to_vmx(vcpu); 878 struct vcpu_vmx *vmx = to_vmx(vcpu);
892 u64 tsc_this, delta, new_offset;
893 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 879 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
894 880
895 if (!vmm_exclusive) 881 if (!vmm_exclusive)
@@ -903,37 +889,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
903 } 889 }
904 890
905 if (vcpu->cpu != cpu) { 891 if (vcpu->cpu != cpu) {
906 struct desc_ptr dt; 892 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
907 unsigned long sysenter_esp; 893 unsigned long sysenter_esp;
908 894
909 kvm_migrate_timers(vcpu);
910 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 895 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
911 local_irq_disable(); 896 local_irq_disable();
912 list_add(&vmx->local_vcpus_link, 897 list_add(&vmx->local_vcpus_link,
913 &per_cpu(vcpus_on_cpu, cpu)); 898 &per_cpu(vcpus_on_cpu, cpu));
914 local_irq_enable(); 899 local_irq_enable();
915 900
916 vcpu->cpu = cpu;
917 /* 901 /*
918 * Linux uses per-cpu TSS and GDT, so set these when switching 902 * Linux uses per-cpu TSS and GDT, so set these when switching
919 * processors. 903 * processors.
920 */ 904 */
921 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 905 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
922 native_store_gdt(&dt); 906 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
923 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
924 907
925 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 908 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
926 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 909 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
927
928 /*
929 * Make sure the time stamp counter is monotonous.
930 */
931 rdtscll(tsc_this);
932 if (tsc_this < vcpu->arch.host_tsc) {
933 delta = vcpu->arch.host_tsc - tsc_this;
934 new_offset = vmcs_read64(TSC_OFFSET) + delta;
935 vmcs_write64(TSC_OFFSET, new_offset);
936 }
937 } 910 }
938} 911}
939 912
@@ -1050,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1050 } 1023 }
1051 1024
1052 if (vmx->rmode.vm86_active) { 1025 if (vmx->rmode.vm86_active) {
1053 vmx->rmode.irq.pending = true; 1026 if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
1054 vmx->rmode.irq.vector = nr; 1027 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1055 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
1056 if (kvm_exception_is_soft(nr))
1057 vmx->rmode.irq.rip +=
1058 vmx->vcpu.arch.event_exit_inst_len;
1059 intr_info |= INTR_TYPE_SOFT_INTR;
1060 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1061 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1062 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
1063 return; 1028 return;
1064 } 1029 }
1065 1030
@@ -1155,12 +1120,17 @@ static u64 guest_read_tsc(void)
1155} 1120}
1156 1121
1157/* 1122/*
1158 * writes 'guest_tsc' into guest's timestamp counter "register" 1123 * writes 'offset' into guest's timestamp counter offset register
1159 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
1160 */ 1124 */
1161static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) 1125static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1162{ 1126{
1163 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 1127 vmcs_write64(TSC_OFFSET, offset);
1128}
1129
1130static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1131{
1132 u64 offset = vmcs_read64(TSC_OFFSET);
1133 vmcs_write64(TSC_OFFSET, offset + adjustment);
1164} 1134}
1165 1135
1166/* 1136/*
@@ -1233,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1233{ 1203{
1234 struct vcpu_vmx *vmx = to_vmx(vcpu); 1204 struct vcpu_vmx *vmx = to_vmx(vcpu);
1235 struct shared_msr_entry *msr; 1205 struct shared_msr_entry *msr;
1236 u64 host_tsc;
1237 int ret = 0; 1206 int ret = 0;
1238 1207
1239 switch (msr_index) { 1208 switch (msr_index) {
@@ -1263,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1263 vmcs_writel(GUEST_SYSENTER_ESP, data); 1232 vmcs_writel(GUEST_SYSENTER_ESP, data);
1264 break; 1233 break;
1265 case MSR_IA32_TSC: 1234 case MSR_IA32_TSC:
1266 rdtscll(host_tsc); 1235 kvm_write_tsc(vcpu, data);
1267 guest_write_tsc(data, host_tsc);
1268 break; 1236 break;
1269 case MSR_IA32_CR_PAT: 1237 case MSR_IA32_CR_PAT:
1270 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1238 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1862,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1862 return; 1830 return;
1863 1831
1864 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1832 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1865 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); 1833 vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
1866 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); 1834 vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
1867 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); 1835 vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
1868 vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); 1836 vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
1869 } 1837 }
1870} 1838}
1871 1839
1872static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 1840static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1873{ 1841{
1874 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1842 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1875 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 1843 vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1876 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 1844 vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1877 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 1845 vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1878 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 1846 vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1879 } 1847 }
1880 1848
1881 __set_bit(VCPU_EXREG_PDPTR, 1849 __set_bit(VCPU_EXREG_PDPTR,
@@ -2521,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2521{ 2489{
2522 u32 host_sysenter_cs, msr_low, msr_high; 2490 u32 host_sysenter_cs, msr_low, msr_high;
2523 u32 junk; 2491 u32 junk;
2524 u64 host_pat, tsc_this, tsc_base; 2492 u64 host_pat;
2525 unsigned long a; 2493 unsigned long a;
2526 struct desc_ptr dt; 2494 struct desc_ptr dt;
2527 int i; 2495 int i;
@@ -2589,8 +2557,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2589 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 2557 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
2590 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 2558 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2591 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 2559 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2592 vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ 2560 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
2593 vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ 2561 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
2594 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 2562 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2595#ifdef CONFIG_X86_64 2563#ifdef CONFIG_X86_64
2596 rdmsrl(MSR_FS_BASE, a); 2564 rdmsrl(MSR_FS_BASE, a);
@@ -2662,12 +2630,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2662 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 2630 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2663 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 2631 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2664 2632
2665 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2633 kvm_write_tsc(&vmx->vcpu, 0);
2666 rdtscll(tsc_this);
2667 if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2668 tsc_base = tsc_this;
2669
2670 guest_write_tsc(0, tsc_base);
2671 2634
2672 return 0; 2635 return 0;
2673} 2636}
@@ -2840,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2840 2803
2841 ++vcpu->stat.irq_injections; 2804 ++vcpu->stat.irq_injections;
2842 if (vmx->rmode.vm86_active) { 2805 if (vmx->rmode.vm86_active) {
2843 vmx->rmode.irq.pending = true; 2806 if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
2844 vmx->rmode.irq.vector = irq; 2807 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2845 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2846 if (vcpu->arch.interrupt.soft)
2847 vmx->rmode.irq.rip +=
2848 vmx->vcpu.arch.event_exit_inst_len;
2849 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2850 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2851 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2852 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2853 return; 2808 return;
2854 } 2809 }
2855 intr = irq | INTR_INFO_VALID_MASK; 2810 intr = irq | INTR_INFO_VALID_MASK;
@@ -2881,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2881 2836
2882 ++vcpu->stat.nmi_injections; 2837 ++vcpu->stat.nmi_injections;
2883 if (vmx->rmode.vm86_active) { 2838 if (vmx->rmode.vm86_active) {
2884 vmx->rmode.irq.pending = true; 2839 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
2885 vmx->rmode.irq.vector = NMI_VECTOR; 2840 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2886 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2887 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2888 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2889 INTR_INFO_VALID_MASK);
2890 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2891 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2892 return; 2841 return;
2893 } 2842 }
2894 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -3352,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3352 3301
3353static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 3302static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3354{ 3303{
3304 kvm_make_request(KVM_REQ_EVENT, vcpu);
3355 return 1; 3305 return 1;
3356} 3306}
3357 3307
@@ -3364,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3364 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 3314 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3365 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3315 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3366 3316
3317 kvm_make_request(KVM_REQ_EVENT, vcpu);
3318
3367 ++vcpu->stat.irq_window_exits; 3319 ++vcpu->stat.irq_window_exits;
3368 3320
3369 /* 3321 /*
@@ -3620,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
3620 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 3572 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3621 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3573 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3622 ++vcpu->stat.nmi_window_exits; 3574 ++vcpu->stat.nmi_window_exits;
3575 kvm_make_request(KVM_REQ_EVENT, vcpu);
3623 3576
3624 return 1; 3577 return 1;
3625} 3578}
@@ -3629,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3629 struct vcpu_vmx *vmx = to_vmx(vcpu); 3582 struct vcpu_vmx *vmx = to_vmx(vcpu);
3630 enum emulation_result err = EMULATE_DONE; 3583 enum emulation_result err = EMULATE_DONE;
3631 int ret = 1; 3584 int ret = 1;
3585 u32 cpu_exec_ctrl;
3586 bool intr_window_requested;
3587
3588 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3589 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
3632 3590
3633 while (!guest_state_valid(vcpu)) { 3591 while (!guest_state_valid(vcpu)) {
3592 if (intr_window_requested
3593 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3594 return handle_interrupt_window(&vmx->vcpu);
3595
3634 err = emulate_instruction(vcpu, 0, 0, 0); 3596 err = emulate_instruction(vcpu, 0, 0, 0);
3635 3597
3636 if (err == EMULATE_DO_MMIO) { 3598 if (err == EMULATE_DO_MMIO) {
@@ -3796,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3796 vmcs_write32(TPR_THRESHOLD, irr); 3758 vmcs_write32(TPR_THRESHOLD, irr);
3797} 3759}
3798 3760
3799static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3761static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3800{ 3762{
3801 u32 exit_intr_info; 3763 u32 exit_intr_info = vmx->exit_intr_info;
3802 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3803 bool unblock_nmi;
3804 u8 vector;
3805 int type;
3806 bool idtv_info_valid;
3807
3808 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3809
3810 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3811 3764
3812 /* Handle machine checks before interrupts are enabled */ 3765 /* Handle machine checks before interrupts are enabled */
3813 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 3766 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3822,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3822 asm("int $2"); 3775 asm("int $2");
3823 kvm_after_handle_nmi(&vmx->vcpu); 3776 kvm_after_handle_nmi(&vmx->vcpu);
3824 } 3777 }
3778}
3825 3779
3826 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3780static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3781{
3782 u32 exit_intr_info = vmx->exit_intr_info;
3783 bool unblock_nmi;
3784 u8 vector;
3785 bool idtv_info_valid;
3786
3787 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3827 3788
3828 if (cpu_has_virtual_nmis()) { 3789 if (cpu_has_virtual_nmis()) {
3829 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 3790 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3845,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3845 } else if (unlikely(vmx->soft_vnmi_blocked)) 3806 } else if (unlikely(vmx->soft_vnmi_blocked))
3846 vmx->vnmi_blocked_time += 3807 vmx->vnmi_blocked_time +=
3847 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 3808 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3809}
3810
3811static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
3812 u32 idt_vectoring_info,
3813 int instr_len_field,
3814 int error_code_field)
3815{
3816 u8 vector;
3817 int type;
3818 bool idtv_info_valid;
3819
3820 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3848 3821
3849 vmx->vcpu.arch.nmi_injected = false; 3822 vmx->vcpu.arch.nmi_injected = false;
3850 kvm_clear_exception_queue(&vmx->vcpu); 3823 kvm_clear_exception_queue(&vmx->vcpu);
@@ -3853,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3853 if (!idtv_info_valid) 3826 if (!idtv_info_valid)
3854 return; 3827 return;
3855 3828
3829 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
3830
3856 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 3831 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3857 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 3832 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3858 3833
@@ -3869,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3869 break; 3844 break;
3870 case INTR_TYPE_SOFT_EXCEPTION: 3845 case INTR_TYPE_SOFT_EXCEPTION:
3871 vmx->vcpu.arch.event_exit_inst_len = 3846 vmx->vcpu.arch.event_exit_inst_len =
3872 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3847 vmcs_read32(instr_len_field);
3873 /* fall through */ 3848 /* fall through */
3874 case INTR_TYPE_HARD_EXCEPTION: 3849 case INTR_TYPE_HARD_EXCEPTION:
3875 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3850 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3876 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3851 u32 err = vmcs_read32(error_code_field);
3877 kvm_queue_exception_e(&vmx->vcpu, vector, err); 3852 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3878 } else 3853 } else
3879 kvm_queue_exception(&vmx->vcpu, vector); 3854 kvm_queue_exception(&vmx->vcpu, vector);
3880 break; 3855 break;
3881 case INTR_TYPE_SOFT_INTR: 3856 case INTR_TYPE_SOFT_INTR:
3882 vmx->vcpu.arch.event_exit_inst_len = 3857 vmx->vcpu.arch.event_exit_inst_len =
3883 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3858 vmcs_read32(instr_len_field);
3884 /* fall through */ 3859 /* fall through */
3885 case INTR_TYPE_EXT_INTR: 3860 case INTR_TYPE_EXT_INTR:
3886 kvm_queue_interrupt(&vmx->vcpu, vector, 3861 kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3891,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3891 } 3866 }
3892} 3867}
3893 3868
3894/* 3869static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3895 * Failure to inject an interrupt should give us the information
3896 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
3897 * when fetching the interrupt redirection bitmap in the real-mode
3898 * tss, this doesn't happen. So we do it ourselves.
3899 */
3900static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3901{ 3870{
3902 vmx->rmode.irq.pending = 0; 3871 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
3903 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) 3872 VM_EXIT_INSTRUCTION_LEN,
3904 return; 3873 IDT_VECTORING_ERROR_CODE);
3905 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); 3874}
3906 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3875
3907 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3876static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3908 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3877{
3909 return; 3878 __vmx_complete_interrupts(to_vmx(vcpu),
3910 } 3879 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
3911 vmx->idt_vectoring_info = 3880 VM_ENTRY_INSTRUCTION_LEN,
3912 VECTORING_INFO_VALID_MASK 3881 VM_ENTRY_EXCEPTION_ERROR_CODE);
3913 | INTR_TYPE_EXT_INTR 3882
3914 | vmx->rmode.irq.vector; 3883 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
3915} 3884}
3916 3885
3917#ifdef CONFIG_X86_64 3886#ifdef CONFIG_X86_64
@@ -4038,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4038#endif 4007#endif
4039 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4008 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
4040 : "cc", "memory" 4009 : "cc", "memory"
4041 , R"bx", R"di", R"si" 4010 , R"ax", R"bx", R"di", R"si"
4042#ifdef CONFIG_X86_64 4011#ifdef CONFIG_X86_64
4043 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 4012 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
4044#endif 4013#endif
@@ -4049,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4049 vcpu->arch.regs_dirty = 0; 4018 vcpu->arch.regs_dirty = 0;
4050 4019
4051 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4020 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4052 if (vmx->rmode.irq.pending)
4053 fixup_rmode_irq(vmx);
4054 4021
4055 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 4022 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4056 vmx->launched = 1; 4023 vmx->launched = 1;
4057 4024
4025 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4026 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4027
4028 vmx_complete_atomic_exit(vmx);
4029 vmx_recover_nmi_blocking(vmx);
4058 vmx_complete_interrupts(vmx); 4030 vmx_complete_interrupts(vmx);
4059} 4031}
4060 4032
@@ -4125,6 +4097,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4125 4097
4126 cpu = get_cpu(); 4098 cpu = get_cpu();
4127 vmx_vcpu_load(&vmx->vcpu, cpu); 4099 vmx_vcpu_load(&vmx->vcpu, cpu);
4100 vmx->vcpu.cpu = cpu;
4128 err = vmx_vcpu_setup(vmx); 4101 err = vmx_vcpu_setup(vmx);
4129 vmx_vcpu_put(&vmx->vcpu); 4102 vmx_vcpu_put(&vmx->vcpu);
4130 put_cpu(); 4103 put_cpu();
@@ -4340,6 +4313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4340 .set_irq = vmx_inject_irq, 4313 .set_irq = vmx_inject_irq,
4341 .set_nmi = vmx_inject_nmi, 4314 .set_nmi = vmx_inject_nmi,
4342 .queue_exception = vmx_queue_exception, 4315 .queue_exception = vmx_queue_exception,
4316 .cancel_injection = vmx_cancel_injection,
4343 .interrupt_allowed = vmx_interrupt_allowed, 4317 .interrupt_allowed = vmx_interrupt_allowed,
4344 .nmi_allowed = vmx_nmi_allowed, 4318 .nmi_allowed = vmx_nmi_allowed,
4345 .get_nmi_mask = vmx_get_nmi_mask, 4319 .get_nmi_mask = vmx_get_nmi_mask,
@@ -4362,6 +4336,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
4362 .set_supported_cpuid = vmx_set_supported_cpuid, 4336 .set_supported_cpuid = vmx_set_supported_cpuid,
4363 4337
4364 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4338 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4339
4340 .write_tsc_offset = vmx_write_tsc_offset,
4341 .adjust_tsc_offset = vmx_adjust_tsc_offset,
4342
4343 .set_tdp_cr3 = vmx_set_cr3,
4365}; 4344};
4366 4345
4367static int __init vmx_init(void) 4346static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a09c625d526..2288ad829b32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * Authors: 11 * Authors:
12 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -55,6 +55,8 @@
55#include <asm/mce.h> 55#include <asm/mce.h>
56#include <asm/i387.h> 56#include <asm/i387.h>
57#include <asm/xcr.h> 57#include <asm/xcr.h>
58#include <asm/pvclock.h>
59#include <asm/div64.h>
58 60
59#define MAX_IO_MSRS 256 61#define MAX_IO_MSRS 256
60#define CR0_RESERVED_BITS \ 62#define CR0_RESERVED_BITS \
@@ -71,7 +73,7 @@
71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 73#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
72 74
73#define KVM_MAX_MCE_BANKS 32 75#define KVM_MAX_MCE_BANKS 32
74#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 76#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
75 77
76/* EFER defaults: 78/* EFER defaults:
77 * - enable syscall per default because its emulated by KVM 79 * - enable syscall per default because its emulated by KVM
@@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
282 u32 prev_nr; 284 u32 prev_nr;
283 int class1, class2; 285 int class1, class2;
284 286
287 kvm_make_request(KVM_REQ_EVENT, vcpu);
288
285 if (!vcpu->arch.exception.pending) { 289 if (!vcpu->arch.exception.pending) {
286 queue: 290 queue:
287 vcpu->arch.exception.pending = true; 291 vcpu->arch.exception.pending = true;
@@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
327} 331}
328EXPORT_SYMBOL_GPL(kvm_requeue_exception); 332EXPORT_SYMBOL_GPL(kvm_requeue_exception);
329 333
330void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 334void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
331 u32 error_code)
332{ 335{
336 unsigned error_code = vcpu->arch.fault.error_code;
337
333 ++vcpu->stat.pf_guest; 338 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = addr; 339 vcpu->arch.cr2 = vcpu->arch.fault.address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 340 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
336} 341}
337 342
343void kvm_propagate_fault(struct kvm_vcpu *vcpu)
344{
345 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
346 vcpu->arch.nested_mmu.inject_page_fault(vcpu);
347 else
348 vcpu->arch.mmu.inject_page_fault(vcpu);
349
350 vcpu->arch.fault.nested = false;
351}
352
338void kvm_inject_nmi(struct kvm_vcpu *vcpu) 353void kvm_inject_nmi(struct kvm_vcpu *vcpu)
339{ 354{
355 kvm_make_request(KVM_REQ_EVENT, vcpu);
340 vcpu->arch.nmi_pending = 1; 356 vcpu->arch.nmi_pending = 1;
341} 357}
342EXPORT_SYMBOL_GPL(kvm_inject_nmi); 358EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
367EXPORT_SYMBOL_GPL(kvm_require_cpl); 383EXPORT_SYMBOL_GPL(kvm_require_cpl);
368 384
369/* 385/*
386 * This function will be used to read from the physical memory of the currently
387 * running guest. The difference to kvm_read_guest_page is that this function
388 * can read from guest physical or from the guest's guest physical memory.
389 */
390int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
391 gfn_t ngfn, void *data, int offset, int len,
392 u32 access)
393{
394 gfn_t real_gfn;
395 gpa_t ngpa;
396
397 ngpa = gfn_to_gpa(ngfn);
398 real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
399 if (real_gfn == UNMAPPED_GVA)
400 return -EFAULT;
401
402 real_gfn = gpa_to_gfn(real_gfn);
403
404 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
405}
406EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
407
408int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
409 void *data, int offset, int len, u32 access)
410{
411 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
412 data, offset, len, access);
413}
414
415/*
370 * Load the pae pdptrs. Return true is they are all valid. 416 * Load the pae pdptrs. Return true is they are all valid.
371 */ 417 */
372int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 418int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
373{ 419{
374 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 420 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
375 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 421 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
376 int i; 422 int i;
377 int ret; 423 int ret;
378 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 424 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
379 425
380 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 426 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
381 offset * sizeof(u64), sizeof(pdpte)); 427 offset * sizeof(u64), sizeof(pdpte),
428 PFERR_USER_MASK|PFERR_WRITE_MASK);
382 if (ret < 0) { 429 if (ret < 0) {
383 ret = 0; 430 ret = 0;
384 goto out; 431 goto out;
@@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
392 } 439 }
393 ret = 1; 440 ret = 1;
394 441
395 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 442 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
396 __set_bit(VCPU_EXREG_PDPTR, 443 __set_bit(VCPU_EXREG_PDPTR,
397 (unsigned long *)&vcpu->arch.regs_avail); 444 (unsigned long *)&vcpu->arch.regs_avail);
398 __set_bit(VCPU_EXREG_PDPTR, 445 __set_bit(VCPU_EXREG_PDPTR,
@@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
405 452
406static bool pdptrs_changed(struct kvm_vcpu *vcpu) 453static bool pdptrs_changed(struct kvm_vcpu *vcpu)
407{ 454{
408 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 455 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
409 bool changed = true; 456 bool changed = true;
457 int offset;
458 gfn_t gfn;
410 int r; 459 int r;
411 460
412 if (is_long_mode(vcpu) || !is_pae(vcpu)) 461 if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
416 (unsigned long *)&vcpu->arch.regs_avail)) 465 (unsigned long *)&vcpu->arch.regs_avail))
417 return true; 466 return true;
418 467
419 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 468 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
469 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
470 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
471 PFERR_USER_MASK | PFERR_WRITE_MASK);
420 if (r < 0) 472 if (r < 0)
421 goto out; 473 goto out;
422 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 474 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
423out: 475out:
424 476
425 return changed; 477 return changed;
@@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
458 return 1; 510 return 1;
459 } else 511 } else
460#endif 512#endif
461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) 513 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
514 vcpu->arch.cr3))
462 return 1; 515 return 1;
463 } 516 }
464 517
@@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
547 return 1; 600 return 1;
548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 601 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
549 && ((cr4 ^ old_cr4) & pdptr_bits) 602 && ((cr4 ^ old_cr4) & pdptr_bits)
550 && !load_pdptrs(vcpu, vcpu->arch.cr3)) 603 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
551 return 1; 604 return 1;
552 605
553 if (cr4 & X86_CR4_VMXE) 606 if (cr4 & X86_CR4_VMXE)
@@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
580 if (is_pae(vcpu)) { 633 if (is_pae(vcpu)) {
581 if (cr3 & CR3_PAE_RESERVED_BITS) 634 if (cr3 & CR3_PAE_RESERVED_BITS)
582 return 1; 635 return 1;
583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) 636 if (is_paging(vcpu) &&
637 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
584 return 1; 638 return 1;
585 } 639 }
586 /* 640 /*
@@ -737,7 +791,7 @@ static u32 msrs_to_save[] = {
737#ifdef CONFIG_X86_64 791#ifdef CONFIG_X86_64
738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 792 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
739#endif 793#endif
740 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 794 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
741}; 795};
742 796
743static unsigned num_msrs_to_save; 797static unsigned num_msrs_to_save;
@@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
838 892
839 /* 893 /*
840 * The guest calculates current wall clock time by adding 894 * The guest calculates current wall clock time by adding
841 * system time (updated by kvm_write_guest_time below) to the 895 * system time (updated by kvm_guest_time_update below) to the
842 * wall clock specified here. guest system time equals host 896 * wall clock specified here. guest system time equals host
843 * system time for us, thus we must fill in host boot time here. 897 * system time for us, thus we must fill in host boot time here.
844 */ 898 */
@@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
866 return quotient; 920 return quotient;
867} 921}
868 922
869static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 923static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
924 s8 *pshift, u32 *pmultiplier)
870{ 925{
871 uint64_t nsecs = 1000000000LL; 926 uint64_t scaled64;
872 int32_t shift = 0; 927 int32_t shift = 0;
873 uint64_t tps64; 928 uint64_t tps64;
874 uint32_t tps32; 929 uint32_t tps32;
875 930
876 tps64 = tsc_khz * 1000LL; 931 tps64 = base_khz * 1000LL;
877 while (tps64 > nsecs*2) { 932 scaled64 = scaled_khz * 1000LL;
933 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
878 tps64 >>= 1; 934 tps64 >>= 1;
879 shift--; 935 shift--;
880 } 936 }
881 937
882 tps32 = (uint32_t)tps64; 938 tps32 = (uint32_t)tps64;
883 while (tps32 <= (uint32_t)nsecs) { 939 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
884 tps32 <<= 1; 940 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
941 scaled64 >>= 1;
942 else
943 tps32 <<= 1;
885 shift++; 944 shift++;
886 } 945 }
887 946
888 hv_clock->tsc_shift = shift; 947 *pshift = shift;
889 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 948 *pmultiplier = div_frac(scaled64, tps32);
890 949
891 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 950 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
892 __func__, tsc_khz, hv_clock->tsc_shift, 951 __func__, base_khz, scaled_khz, shift, *pmultiplier);
893 hv_clock->tsc_to_system_mul); 952}
953
954static inline u64 get_kernel_ns(void)
955{
956 struct timespec ts;
957
958 WARN_ON(preemptible());
959 ktime_get_ts(&ts);
960 monotonic_to_bootbased(&ts);
961 return timespec_to_ns(&ts);
894} 962}
895 963
896static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 964static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
965unsigned long max_tsc_khz;
897 966
898static void kvm_write_guest_time(struct kvm_vcpu *v) 967static inline int kvm_tsc_changes_freq(void)
968{
969 int cpu = get_cpu();
970 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
971 cpufreq_quick_get(cpu) != 0;
972 put_cpu();
973 return ret;
974}
975
976static inline u64 nsec_to_cycles(u64 nsec)
977{
978 u64 ret;
979
980 WARN_ON(preemptible());
981 if (kvm_tsc_changes_freq())
982 printk_once(KERN_WARNING
983 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
984 ret = nsec * __get_cpu_var(cpu_tsc_khz);
985 do_div(ret, USEC_PER_SEC);
986 return ret;
987}
988
989static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
990{
991 /* Compute a scale to convert nanoseconds in TSC cycles */
992 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
993 &kvm->arch.virtual_tsc_shift,
994 &kvm->arch.virtual_tsc_mult);
995 kvm->arch.virtual_tsc_khz = this_tsc_khz;
996}
997
998static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
999{
1000 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1001 vcpu->kvm->arch.virtual_tsc_mult,
1002 vcpu->kvm->arch.virtual_tsc_shift);
1003 tsc += vcpu->arch.last_tsc_write;
1004 return tsc;
1005}
1006
1007void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1008{
1009 struct kvm *kvm = vcpu->kvm;
1010 u64 offset, ns, elapsed;
1011 unsigned long flags;
1012 s64 sdiff;
1013
1014 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1015 offset = data - native_read_tsc();
1016 ns = get_kernel_ns();
1017 elapsed = ns - kvm->arch.last_tsc_nsec;
1018 sdiff = data - kvm->arch.last_tsc_write;
1019 if (sdiff < 0)
1020 sdiff = -sdiff;
1021
1022 /*
1023 * Special case: close write to TSC within 5 seconds of
1024 * another CPU is interpreted as an attempt to synchronize
1025 * The 5 seconds is to accomodate host load / swapping as
1026 * well as any reset of TSC during the boot process.
1027 *
1028 * In that case, for a reliable TSC, we can match TSC offsets,
1029 * or make a best guest using elapsed value.
1030 */
1031 if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
1032 elapsed < 5ULL * NSEC_PER_SEC) {
1033 if (!check_tsc_unstable()) {
1034 offset = kvm->arch.last_tsc_offset;
1035 pr_debug("kvm: matched tsc offset for %llu\n", data);
1036 } else {
1037 u64 delta = nsec_to_cycles(elapsed);
1038 offset += delta;
1039 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1040 }
1041 ns = kvm->arch.last_tsc_nsec;
1042 }
1043 kvm->arch.last_tsc_nsec = ns;
1044 kvm->arch.last_tsc_write = data;
1045 kvm->arch.last_tsc_offset = offset;
1046 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1047 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1048
1049 /* Reset of TSC must disable overshoot protection below */
1050 vcpu->arch.hv_clock.tsc_timestamp = 0;
1051 vcpu->arch.last_tsc_write = data;
1052 vcpu->arch.last_tsc_nsec = ns;
1053}
1054EXPORT_SYMBOL_GPL(kvm_write_tsc);
1055
1056static int kvm_guest_time_update(struct kvm_vcpu *v)
899{ 1057{
900 struct timespec ts;
901 unsigned long flags; 1058 unsigned long flags;
902 struct kvm_vcpu_arch *vcpu = &v->arch; 1059 struct kvm_vcpu_arch *vcpu = &v->arch;
903 void *shared_kaddr; 1060 void *shared_kaddr;
904 unsigned long this_tsc_khz; 1061 unsigned long this_tsc_khz;
1062 s64 kernel_ns, max_kernel_ns;
1063 u64 tsc_timestamp;
905 1064
906 if ((!vcpu->time_page)) 1065 /* Keep irq disabled to prevent changes to the clock */
907 return; 1066 local_irq_save(flags);
1067 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1068 kernel_ns = get_kernel_ns();
1069 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
908 1070
909 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 1071 if (unlikely(this_tsc_khz == 0)) {
910 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 1072 local_irq_restore(flags);
911 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 1073 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
912 vcpu->hv_clock_tsc_khz = this_tsc_khz; 1074 return 1;
1075 }
1076
1077 /*
1078 * We may have to catch up the TSC to match elapsed wall clock
1079 * time for two reasons, even if kvmclock is used.
1080 * 1) CPU could have been running below the maximum TSC rate
1081 * 2) Broken TSC compensation resets the base at each VCPU
1082 * entry to avoid unknown leaps of TSC even when running
1083 * again on the same CPU. This may cause apparent elapsed
1084 * time to disappear, and the guest to stand still or run
1085 * very slowly.
1086 */
1087 if (vcpu->tsc_catchup) {
1088 u64 tsc = compute_guest_tsc(v, kernel_ns);
1089 if (tsc > tsc_timestamp) {
1090 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
1091 tsc_timestamp = tsc;
1092 }
913 } 1093 }
914 put_cpu_var(cpu_tsc_khz);
915 1094
916 /* Keep irq disabled to prevent changes to the clock */
917 local_irq_save(flags);
918 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
919 ktime_get_ts(&ts);
920 monotonic_to_bootbased(&ts);
921 local_irq_restore(flags); 1095 local_irq_restore(flags);
922 1096
923 /* With all the info we got, fill in the values */ 1097 if (!vcpu->time_page)
1098 return 0;
924 1099
925 vcpu->hv_clock.system_time = ts.tv_nsec + 1100 /*
926 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 1101 * Time as measured by the TSC may go backwards when resetting the base
1102 * tsc_timestamp. The reason for this is that the TSC resolution is
1103 * higher than the resolution of the other clock scales. Thus, many
1104 * possible measurments of the TSC correspond to one measurement of any
1105 * other clock, and so a spread of values is possible. This is not a
1106 * problem for the computation of the nanosecond clock; with TSC rates
1107 * around 1GHZ, there can only be a few cycles which correspond to one
1108 * nanosecond value, and any path through this code will inevitably
1109 * take longer than that. However, with the kernel_ns value itself,
1110 * the precision may be much lower, down to HZ granularity. If the
1111 * first sampling of TSC against kernel_ns ends in the low part of the
1112 * range, and the second in the high end of the range, we can get:
1113 *
1114 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1115 *
1116 * As the sampling errors potentially range in the thousands of cycles,
1117 * it is possible such a time value has already been observed by the
1118 * guest. To protect against this, we must compute the system time as
1119 * observed by the guest and ensure the new system time is greater.
1120 */
1121 max_kernel_ns = 0;
1122 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
1123 max_kernel_ns = vcpu->last_guest_tsc -
1124 vcpu->hv_clock.tsc_timestamp;
1125 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1126 vcpu->hv_clock.tsc_to_system_mul,
1127 vcpu->hv_clock.tsc_shift);
1128 max_kernel_ns += vcpu->last_kernel_ns;
1129 }
927 1130
1131 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1132 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1133 &vcpu->hv_clock.tsc_shift,
1134 &vcpu->hv_clock.tsc_to_system_mul);
1135 vcpu->hw_tsc_khz = this_tsc_khz;
1136 }
1137
1138 if (max_kernel_ns > kernel_ns)
1139 kernel_ns = max_kernel_ns;
1140
1141 /* With all the info we got, fill in the values */
1142 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1143 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1144 vcpu->last_kernel_ns = kernel_ns;
1145 vcpu->last_guest_tsc = tsc_timestamp;
928 vcpu->hv_clock.flags = 0; 1146 vcpu->hv_clock.flags = 0;
929 1147
930 /* 1148 /*
@@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
942 kunmap_atomic(shared_kaddr, KM_USER0); 1160 kunmap_atomic(shared_kaddr, KM_USER0);
943 1161
944 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1162 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
945} 1163 return 0;
946
947static int kvm_request_guest_time_update(struct kvm_vcpu *v)
948{
949 struct kvm_vcpu_arch *vcpu = &v->arch;
950
951 if (!vcpu->time_page)
952 return 0;
953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
954 return 1;
955} 1164}
956 1165
957static bool msr_mtrr_valid(unsigned msr) 1166static bool msr_mtrr_valid(unsigned msr)
@@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1277 } 1486 }
1278 1487
1279 vcpu->arch.time = data; 1488 vcpu->arch.time = data;
1489 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1280 1490
1281 /* we verify if the enable bit is set... */ 1491 /* we verify if the enable bit is set... */
1282 if (!(data & 1)) 1492 if (!(data & 1))
@@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1292 kvm_release_page_clean(vcpu->arch.time_page); 1502 kvm_release_page_clean(vcpu->arch.time_page);
1293 vcpu->arch.time_page = NULL; 1503 vcpu->arch.time_page = NULL;
1294 } 1504 }
1295
1296 kvm_request_guest_time_update(vcpu);
1297 break; 1505 break;
1298 } 1506 }
1299 case MSR_IA32_MCG_CTL: 1507 case MSR_IA32_MCG_CTL:
@@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1330 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1538 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1331 "0x%x data 0x%llx\n", msr, data); 1539 "0x%x data 0x%llx\n", msr, data);
1332 break; 1540 break;
1541 case MSR_K7_CLK_CTL:
1542 /*
1543 * Ignore all writes to this no longer documented MSR.
1544 * Writes are only relevant for old K7 processors,
1545 * all pre-dating SVM, but a recommended workaround from
1546 * AMD for these chips. It is possible to speicify the
1547 * affected processor models on the command line, hence
1548 * the need to ignore the workaround.
1549 */
1550 break;
1333 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1551 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1334 if (kvm_hv_msr_partition_wide(msr)) { 1552 if (kvm_hv_msr_partition_wide(msr)) {
1335 int r; 1553 int r;
@@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1522 case 0xcd: /* fsb frequency */ 1740 case 0xcd: /* fsb frequency */
1523 data = 3; 1741 data = 3;
1524 break; 1742 break;
1743 /*
1744 * MSR_EBC_FREQUENCY_ID
1745 * Conservative value valid for even the basic CPU models.
1746 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
1747 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
1748 * and 266MHz for model 3, or 4. Set Core Clock
1749 * Frequency to System Bus Frequency Ratio to 1 (bits
1750 * 31:24) even though these are only valid for CPU
1751 * models > 2, however guests may end up dividing or
1752 * multiplying by zero otherwise.
1753 */
1754 case MSR_EBC_FREQUENCY_ID:
1755 data = 1 << 24;
1756 break;
1525 case MSR_IA32_APICBASE: 1757 case MSR_IA32_APICBASE:
1526 data = kvm_get_apic_base(vcpu); 1758 data = kvm_get_apic_base(vcpu);
1527 break; 1759 break;
@@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1555 case MSR_IA32_MCG_STATUS: 1787 case MSR_IA32_MCG_STATUS:
1556 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1788 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1557 return get_msr_mce(vcpu, msr, pdata); 1789 return get_msr_mce(vcpu, msr, pdata);
1790 case MSR_K7_CLK_CTL:
1791 /*
1792 * Provide expected ramp-up count for K7. All other
1793 * are set to zero, indicating minimum divisors for
1794 * every field.
1795 *
1796 * This prevents guest kernels on AMD host with CPU
1797 * type 6, model 8 and higher from exploding due to
1798 * the rdmsr failing.
1799 */
1800 data = 0x20000000;
1801 break;
1558 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1802 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1559 if (kvm_hv_msr_partition_wide(msr)) { 1803 if (kvm_hv_msr_partition_wide(msr)) {
1560 int r; 1804 int r;
@@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1808 } 2052 }
1809 2053
1810 kvm_x86_ops->vcpu_load(vcpu, cpu); 2054 kvm_x86_ops->vcpu_load(vcpu, cpu);
1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 2055 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
1812 unsigned long khz = cpufreq_quick_get(cpu); 2056 /* Make sure TSC doesn't go backwards */
1813 if (!khz) 2057 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
1814 khz = tsc_khz; 2058 native_read_tsc() - vcpu->arch.last_host_tsc;
1815 per_cpu(cpu_tsc_khz, cpu) = khz; 2059 if (tsc_delta < 0)
2060 mark_tsc_unstable("KVM discovered backwards TSC");
2061 if (check_tsc_unstable()) {
2062 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2063 vcpu->arch.tsc_catchup = 1;
2064 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2065 }
2066 if (vcpu->cpu != cpu)
2067 kvm_migrate_timers(vcpu);
2068 vcpu->cpu = cpu;
1816 } 2069 }
1817 kvm_request_guest_time_update(vcpu);
1818} 2070}
1819 2071
1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2072void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1821{ 2073{
1822 kvm_x86_ops->vcpu_put(vcpu); 2074 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu); 2075 kvm_put_guest_fpu(vcpu);
2076 vcpu->arch.last_host_tsc = native_read_tsc();
1824} 2077}
1825 2078
1826static int is_efer_nx(void) 2079static int is_efer_nx(void)
@@ -1991,13 +2244,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 2244 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1992 0 /* Reserved, DCA */ | F(XMM4_1) | 2245 0 /* Reserved, DCA */ | F(XMM4_1) |
1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 2246 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); 2247 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2248 F(F16C);
1995 /* cpuid 0x80000001.ecx */ 2249 /* cpuid 0x80000001.ecx */
1996 const u32 kvm_supported_word6_x86_features = 2250 const u32 kvm_supported_word6_x86_features =
1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 2251 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
1998 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 2252 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1999 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 2253 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2000 0 /* SKINIT */ | 0 /* WDT */; 2254 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2001 2255
2002 /* all calls to cpuid_count() should be made on the same cpu */ 2256 /* all calls to cpuid_count() should be made on the same cpu */
2003 get_cpu(); 2257 get_cpu();
@@ -2203,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2203 return -ENXIO; 2457 return -ENXIO;
2204 2458
2205 kvm_queue_interrupt(vcpu, irq->irq, false); 2459 kvm_queue_interrupt(vcpu, irq->irq, false);
2460 kvm_make_request(KVM_REQ_EVENT, vcpu);
2206 2461
2207 return 0; 2462 return 0;
2208} 2463}
@@ -2356,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2611 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2357 vcpu->arch.sipi_vector = events->sipi_vector; 2612 vcpu->arch.sipi_vector = events->sipi_vector;
2358 2613
2614 kvm_make_request(KVM_REQ_EVENT, vcpu);
2615
2359 return 0; 2616 return 0;
2360} 2617}
2361 2618
@@ -2759,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2759 3016
2760static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3017static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2761{ 3018{
2762 return kvm->arch.n_alloc_mmu_pages; 3019 return kvm->arch.n_max_mmu_pages;
2763} 3020}
2764 3021
2765static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3022static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2795,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2795 r = 0; 3052 r = 0;
2796 switch (chip->chip_id) { 3053 switch (chip->chip_id) {
2797 case KVM_IRQCHIP_PIC_MASTER: 3054 case KVM_IRQCHIP_PIC_MASTER:
2798 raw_spin_lock(&pic_irqchip(kvm)->lock); 3055 spin_lock(&pic_irqchip(kvm)->lock);
2799 memcpy(&pic_irqchip(kvm)->pics[0], 3056 memcpy(&pic_irqchip(kvm)->pics[0],
2800 &chip->chip.pic, 3057 &chip->chip.pic,
2801 sizeof(struct kvm_pic_state)); 3058 sizeof(struct kvm_pic_state));
2802 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3059 spin_unlock(&pic_irqchip(kvm)->lock);
2803 break; 3060 break;
2804 case KVM_IRQCHIP_PIC_SLAVE: 3061 case KVM_IRQCHIP_PIC_SLAVE:
2805 raw_spin_lock(&pic_irqchip(kvm)->lock); 3062 spin_lock(&pic_irqchip(kvm)->lock);
2806 memcpy(&pic_irqchip(kvm)->pics[1], 3063 memcpy(&pic_irqchip(kvm)->pics[1],
2807 &chip->chip.pic, 3064 &chip->chip.pic,
2808 sizeof(struct kvm_pic_state)); 3065 sizeof(struct kvm_pic_state));
2809 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3066 spin_unlock(&pic_irqchip(kvm)->lock);
2810 break; 3067 break;
2811 case KVM_IRQCHIP_IOAPIC: 3068 case KVM_IRQCHIP_IOAPIC:
2812 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3069 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -3200,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3200 break; 3457 break;
3201 } 3458 }
3202 case KVM_SET_CLOCK: { 3459 case KVM_SET_CLOCK: {
3203 struct timespec now;
3204 struct kvm_clock_data user_ns; 3460 struct kvm_clock_data user_ns;
3205 u64 now_ns; 3461 u64 now_ns;
3206 s64 delta; 3462 s64 delta;
@@ -3214,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
3214 goto out; 3470 goto out;
3215 3471
3216 r = 0; 3472 r = 0;
3217 ktime_get_ts(&now); 3473 local_irq_disable();
3218 now_ns = timespec_to_ns(&now); 3474 now_ns = get_kernel_ns();
3219 delta = user_ns.clock - now_ns; 3475 delta = user_ns.clock - now_ns;
3476 local_irq_enable();
3220 kvm->arch.kvmclock_offset = delta; 3477 kvm->arch.kvmclock_offset = delta;
3221 break; 3478 break;
3222 } 3479 }
3223 case KVM_GET_CLOCK: { 3480 case KVM_GET_CLOCK: {
3224 struct timespec now;
3225 struct kvm_clock_data user_ns; 3481 struct kvm_clock_data user_ns;
3226 u64 now_ns; 3482 u64 now_ns;
3227 3483
3228 ktime_get_ts(&now); 3484 local_irq_disable();
3229 now_ns = timespec_to_ns(&now); 3485 now_ns = get_kernel_ns();
3230 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3486 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3487 local_irq_enable();
3231 user_ns.flags = 0; 3488 user_ns.flags = 0;
3232 3489
3233 r = -EFAULT; 3490 r = -EFAULT;
@@ -3291,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
3291 kvm_x86_ops->get_segment(vcpu, var, seg); 3548 kvm_x86_ops->get_segment(vcpu, var, seg);
3292} 3549}
3293 3550
3551static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3552{
3553 return gpa;
3554}
3555
3556static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3557{
3558 gpa_t t_gpa;
3559 u32 error;
3560
3561 BUG_ON(!mmu_is_nested(vcpu));
3562
3563 /* NPT walks are always user-walks */
3564 access |= PFERR_USER_MASK;
3565 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
3566 if (t_gpa == UNMAPPED_GVA)
3567 vcpu->arch.fault.nested = true;
3568
3569 return t_gpa;
3570}
3571
3294gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3572gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3295{ 3573{
3296 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3574 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3297 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3575 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3298} 3576}
3299 3577
3300 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3578 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3301{ 3579{
3302 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3580 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3303 access |= PFERR_FETCH_MASK; 3581 access |= PFERR_FETCH_MASK;
3304 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3582 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3305} 3583}
3306 3584
3307gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3585gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3308{ 3586{
3309 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3587 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3310 access |= PFERR_WRITE_MASK; 3588 access |= PFERR_WRITE_MASK;
3311 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3589 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3312} 3590}
3313 3591
3314/* uses this to access any guest's mapped memory without checking CPL */ 3592/* uses this to access any guest's mapped memory without checking CPL */
3315gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3593gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3316{ 3594{
3317 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); 3595 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
3318} 3596}
3319 3597
3320static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3598static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3325,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3325 int r = X86EMUL_CONTINUE; 3603 int r = X86EMUL_CONTINUE;
3326 3604
3327 while (bytes) { 3605 while (bytes) {
3328 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); 3606 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3607 error);
3329 unsigned offset = addr & (PAGE_SIZE-1); 3608 unsigned offset = addr & (PAGE_SIZE-1);
3330 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3609 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3331 int ret; 3610 int ret;
@@ -3380,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3380 int r = X86EMUL_CONTINUE; 3659 int r = X86EMUL_CONTINUE;
3381 3660
3382 while (bytes) { 3661 while (bytes) {
3383 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, 3662 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3384 PFERR_WRITE_MASK, error); 3663 PFERR_WRITE_MASK,
3664 error);
3385 unsigned offset = addr & (PAGE_SIZE-1); 3665 unsigned offset = addr & (PAGE_SIZE-1);
3386 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3666 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3387 int ret; 3667 int ret;
@@ -3623,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3623 if (vcpu->arch.pio.count) 3903 if (vcpu->arch.pio.count)
3624 goto data_avail; 3904 goto data_avail;
3625 3905
3626 trace_kvm_pio(1, port, size, 1); 3906 trace_kvm_pio(0, port, size, 1);
3627 3907
3628 vcpu->arch.pio.port = port; 3908 vcpu->arch.pio.port = port;
3629 vcpu->arch.pio.in = 1; 3909 vcpu->arch.pio.in = 1;
@@ -3651,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3651 const void *val, unsigned int count, 3931 const void *val, unsigned int count,
3652 struct kvm_vcpu *vcpu) 3932 struct kvm_vcpu *vcpu)
3653{ 3933{
3654 trace_kvm_pio(0, port, size, 1); 3934 trace_kvm_pio(1, port, size, 1);
3655 3935
3656 vcpu->arch.pio.port = port; 3936 vcpu->arch.pio.port = port;
3657 vcpu->arch.pio.in = 0; 3937 vcpu->arch.pio.in = 0;
@@ -3790,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3790 kvm_x86_ops->get_gdt(vcpu, dt); 4070 kvm_x86_ops->get_gdt(vcpu, dt);
3791} 4071}
3792 4072
4073static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
4074{
4075 kvm_x86_ops->get_idt(vcpu, dt);
4076}
4077
3793static unsigned long emulator_get_cached_segment_base(int seg, 4078static unsigned long emulator_get_cached_segment_base(int seg,
3794 struct kvm_vcpu *vcpu) 4079 struct kvm_vcpu *vcpu)
3795{ 4080{
@@ -3883,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = {
3883 .set_segment_selector = emulator_set_segment_selector, 4168 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base, 4169 .get_cached_segment_base = emulator_get_cached_segment_base,
3885 .get_gdt = emulator_get_gdt, 4170 .get_gdt = emulator_get_gdt,
4171 .get_idt = emulator_get_idt,
3886 .get_cr = emulator_get_cr, 4172 .get_cr = emulator_get_cr,
3887 .set_cr = emulator_set_cr, 4173 .set_cr = emulator_set_cr,
3888 .cpl = emulator_get_cpl, 4174 .cpl = emulator_get_cpl,
@@ -3918,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{ 4204{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4205 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR) 4206 if (ctxt->exception == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); 4207 kvm_propagate_fault(vcpu);
3922 else if (ctxt->error_code_valid) 4208 else if (ctxt->error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4209 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3924 else 4210 else
3925 kvm_queue_exception(vcpu, ctxt->exception); 4211 kvm_queue_exception(vcpu, ctxt->exception);
3926} 4212}
3927 4213
4214static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4215{
4216 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4217 int cs_db, cs_l;
4218
4219 cache_all_regs(vcpu);
4220
4221 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4222
4223 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4224 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4225 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4226 vcpu->arch.emulate_ctxt.mode =
4227 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4228 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4229 ? X86EMUL_MODE_VM86 : cs_l
4230 ? X86EMUL_MODE_PROT64 : cs_db
4231 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4232 memset(c, 0, sizeof(struct decode_cache));
4233 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4234}
4235
4236int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
4237{
4238 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4239 int ret;
4240
4241 init_emulate_ctxt(vcpu);
4242
4243 vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
4244 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
4245 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
4246 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4247
4248 if (ret != X86EMUL_CONTINUE)
4249 return EMULATE_FAIL;
4250
4251 vcpu->arch.emulate_ctxt.eip = c->eip;
4252 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4253 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4254 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4255
4256 if (irq == NMI_VECTOR)
4257 vcpu->arch.nmi_pending = false;
4258 else
4259 vcpu->arch.interrupt.pending = false;
4260
4261 return EMULATE_DONE;
4262}
4263EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4264
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4265static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{ 4266{
3930 ++vcpu->stat.insn_emulation_fail; 4267 ++vcpu->stat.insn_emulation_fail;
@@ -3981,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3981 cache_all_regs(vcpu); 4318 cache_all_regs(vcpu);
3982 4319
3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4320 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3984 int cs_db, cs_l; 4321 init_emulate_ctxt(vcpu);
3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3986
3987 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3988 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3989 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3990 vcpu->arch.emulate_ctxt.mode =
3991 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3992 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3993 ? X86EMUL_MODE_VM86 : cs_l
3994 ? X86EMUL_MODE_PROT64 : cs_db
3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0; 4322 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1; 4323 vcpu->arch.emulate_ctxt.exception = -1;
4324 vcpu->arch.emulate_ctxt.perm_ok = false;
4325
4326 r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
4327 if (r == X86EMUL_PROPAGATE_FAULT)
4328 goto done;
4000 4329
4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
4002 trace_kvm_emulate_insn_start(vcpu); 4330 trace_kvm_emulate_insn_start(vcpu);
4003 4331
4004 /* Only allow emulation of specific instructions on #UD 4332 /* Only allow emulation of specific instructions on #UD
@@ -4048,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4376 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4049 4377
4050restart: 4378restart:
4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4379 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
4052 4380
4053 if (r) { /* emulation failed */ 4381 if (r == EMULATION_FAILED) {
4054 if (reexecute_instruction(vcpu, cr2)) 4382 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE; 4383 return EMULATE_DONE;
4056 4384
4057 return handle_emulation_failure(vcpu); 4385 return handle_emulation_failure(vcpu);
4058 } 4386 }
4059 4387
4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4388done:
4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4064
4065 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4389 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu); 4390 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE; 4391 r = EMULATE_DONE;
4068 } 4392 } else if (vcpu->arch.pio.count) {
4069
4070 if (vcpu->arch.pio.count) {
4071 if (!vcpu->arch.pio.in) 4393 if (!vcpu->arch.pio.in)
4072 vcpu->arch.pio.count = 0; 4394 vcpu->arch.pio.count = 0;
4073 return EMULATE_DO_MMIO; 4395 r = EMULATE_DO_MMIO;
4074 } 4396 } else if (vcpu->mmio_needed) {
4075
4076 if (vcpu->mmio_needed) {
4077 if (vcpu->mmio_is_write) 4397 if (vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0; 4398 vcpu->mmio_needed = 0;
4079 return EMULATE_DO_MMIO; 4399 r = EMULATE_DO_MMIO;
4080 } 4400 } else if (r == EMULATION_RESTART)
4081
4082 if (vcpu->arch.emulate_ctxt.restart)
4083 goto restart; 4401 goto restart;
4402 else
4403 r = EMULATE_DONE;
4084 4404
4085 return EMULATE_DONE; 4405 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
4406 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4407 kvm_make_request(KVM_REQ_EVENT, vcpu);
4408 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4409 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4410
4411 return r;
4086} 4412}
4087EXPORT_SYMBOL_GPL(emulate_instruction); 4413EXPORT_SYMBOL_GPL(emulate_instruction);
4088 4414
@@ -4096,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4096} 4422}
4097EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4423EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4098 4424
4099static void bounce_off(void *info) 4425static void tsc_bad(void *info)
4426{
4427 __get_cpu_var(cpu_tsc_khz) = 0;
4428}
4429
4430static void tsc_khz_changed(void *data)
4100{ 4431{
4101 /* nothing */ 4432 struct cpufreq_freqs *freq = data;
4433 unsigned long khz = 0;
4434
4435 if (data)
4436 khz = freq->new;
4437 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4438 khz = cpufreq_quick_get(raw_smp_processor_id());
4439 if (!khz)
4440 khz = tsc_khz;
4441 __get_cpu_var(cpu_tsc_khz) = khz;
4102} 4442}
4103 4443
4104static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4444static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4109,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4109 struct kvm_vcpu *vcpu; 4449 struct kvm_vcpu *vcpu;
4110 int i, send_ipi = 0; 4450 int i, send_ipi = 0;
4111 4451
4452 /*
4453 * We allow guests to temporarily run on slowing clocks,
4454 * provided we notify them after, or to run on accelerating
4455 * clocks, provided we notify them before. Thus time never
4456 * goes backwards.
4457 *
4458 * However, we have a problem. We can't atomically update
4459 * the frequency of a given CPU from this function; it is
4460 * merely a notifier, which can be called from any CPU.
4461 * Changing the TSC frequency at arbitrary points in time
4462 * requires a recomputation of local variables related to
4463 * the TSC for each VCPU. We must flag these local variables
4464 * to be updated and be sure the update takes place with the
4465 * new frequency before any guests proceed.
4466 *
4467 * Unfortunately, the combination of hotplug CPU and frequency
4468 * change creates an intractable locking scenario; the order
4469 * of when these callouts happen is undefined with respect to
4470 * CPU hotplug, and they can race with each other. As such,
4471 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4472 * undefined; you can actually have a CPU frequency change take
4473 * place in between the computation of X and the setting of the
4474 * variable. To protect against this problem, all updates of
4475 * the per_cpu tsc_khz variable are done in an interrupt
4476 * protected IPI, and all callers wishing to update the value
4477 * must wait for a synchronous IPI to complete (which is trivial
4478 * if the caller is on the CPU already). This establishes the
4479 * necessary total order on variable updates.
4480 *
4481 * Note that because a guest time update may take place
4482 * anytime after the setting of the VCPU's request bit, the
4483 * correct TSC value must be set before the request. However,
4484 * to ensure the update actually makes it to any guest which
4485 * starts running in hardware virtualization between the set
4486 * and the acquisition of the spinlock, we must also ping the
4487 * CPU after setting the request bit.
4488 *
4489 */
4490
4112 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 4491 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
4113 return 0; 4492 return 0;
4114 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 4493 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
4115 return 0; 4494 return 0;
4116 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; 4495
4496 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4117 4497
4118 spin_lock(&kvm_lock); 4498 spin_lock(&kvm_lock);
4119 list_for_each_entry(kvm, &vm_list, vm_list) { 4499 list_for_each_entry(kvm, &vm_list, vm_list) {
4120 kvm_for_each_vcpu(i, vcpu, kvm) { 4500 kvm_for_each_vcpu(i, vcpu, kvm) {
4121 if (vcpu->cpu != freq->cpu) 4501 if (vcpu->cpu != freq->cpu)
4122 continue; 4502 continue;
4123 if (!kvm_request_guest_time_update(vcpu)) 4503 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4124 continue;
4125 if (vcpu->cpu != smp_processor_id()) 4504 if (vcpu->cpu != smp_processor_id())
4126 send_ipi++; 4505 send_ipi = 1;
4127 } 4506 }
4128 } 4507 }
4129 spin_unlock(&kvm_lock); 4508 spin_unlock(&kvm_lock);
@@ -4141,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4141 * guest context is entered kvmclock will be updated, 4520 * guest context is entered kvmclock will be updated,
4142 * so the guest will not see stale values. 4521 * so the guest will not see stale values.
4143 */ 4522 */
4144 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 4523 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4145 } 4524 }
4146 return 0; 4525 return 0;
4147} 4526}
4148 4527
4149static struct notifier_block kvmclock_cpufreq_notifier_block = { 4528static struct notifier_block kvmclock_cpufreq_notifier_block = {
4150 .notifier_call = kvmclock_cpufreq_notifier 4529 .notifier_call = kvmclock_cpufreq_notifier
4530};
4531
4532static int kvmclock_cpu_notifier(struct notifier_block *nfb,
4533 unsigned long action, void *hcpu)
4534{
4535 unsigned int cpu = (unsigned long)hcpu;
4536
4537 switch (action) {
4538 case CPU_ONLINE:
4539 case CPU_DOWN_FAILED:
4540 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4541 break;
4542 case CPU_DOWN_PREPARE:
4543 smp_call_function_single(cpu, tsc_bad, NULL, 1);
4544 break;
4545 }
4546 return NOTIFY_OK;
4547}
4548
4549static struct notifier_block kvmclock_cpu_notifier_block = {
4550 .notifier_call = kvmclock_cpu_notifier,
4551 .priority = -INT_MAX
4151}; 4552};
4152 4553
4153static void kvm_timer_init(void) 4554static void kvm_timer_init(void)
4154{ 4555{
4155 int cpu; 4556 int cpu;
4156 4557
4558 max_tsc_khz = tsc_khz;
4559 register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4157 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 4560 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4561#ifdef CONFIG_CPU_FREQ
4562 struct cpufreq_policy policy;
4563 memset(&policy, 0, sizeof(policy));
4564 cpufreq_get_policy(&policy, get_cpu());
4565 if (policy.cpuinfo.max_freq)
4566 max_tsc_khz = policy.cpuinfo.max_freq;
4567#endif
4158 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4568 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4159 CPUFREQ_TRANSITION_NOTIFIER); 4569 CPUFREQ_TRANSITION_NOTIFIER);
4160 for_each_online_cpu(cpu) {
4161 unsigned long khz = cpufreq_get(cpu);
4162 if (!khz)
4163 khz = tsc_khz;
4164 per_cpu(cpu_tsc_khz, cpu) = khz;
4165 }
4166 } else {
4167 for_each_possible_cpu(cpu)
4168 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
4169 } 4570 }
4571 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
4572 for_each_online_cpu(cpu)
4573 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4170} 4574}
4171 4575
4172static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4576static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4268,6 +4672,7 @@ void kvm_arch_exit(void)
4268 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4672 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4269 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4673 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4270 CPUFREQ_TRANSITION_NOTIFIER); 4674 CPUFREQ_TRANSITION_NOTIFIER);
4675 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4271 kvm_x86_ops = NULL; 4676 kvm_x86_ops = NULL;
4272 kvm_mmu_module_exit(); 4677 kvm_mmu_module_exit();
4273} 4678}
@@ -4683,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4683 kvm_mmu_unload(vcpu); 5088 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5089 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4685 __kvm_migrate_timers(vcpu); 5090 __kvm_migrate_timers(vcpu);
4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) 5091 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
4687 kvm_write_guest_time(vcpu); 5092 r = kvm_guest_time_update(vcpu);
5093 if (unlikely(r))
5094 goto out;
5095 }
4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5096 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4689 kvm_mmu_sync_roots(vcpu); 5097 kvm_mmu_sync_roots(vcpu);
4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5098 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4709,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4709 if (unlikely(r)) 5117 if (unlikely(r))
4710 goto out; 5118 goto out;
4711 5119
5120 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5121 inject_pending_event(vcpu);
5122
5123 /* enable NMI/IRQ window open exits if needed */
5124 if (vcpu->arch.nmi_pending)
5125 kvm_x86_ops->enable_nmi_window(vcpu);
5126 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5127 kvm_x86_ops->enable_irq_window(vcpu);
5128
5129 if (kvm_lapic_enabled(vcpu)) {
5130 update_cr8_intercept(vcpu);
5131 kvm_lapic_sync_to_vapic(vcpu);
5132 }
5133 }
5134
4712 preempt_disable(); 5135 preempt_disable();
4713 5136
4714 kvm_x86_ops->prepare_guest_switch(vcpu); 5137 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -4727,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4727 smp_wmb(); 5150 smp_wmb();
4728 local_irq_enable(); 5151 local_irq_enable();
4729 preempt_enable(); 5152 preempt_enable();
5153 kvm_x86_ops->cancel_injection(vcpu);
4730 r = 1; 5154 r = 1;
4731 goto out; 5155 goto out;
4732 } 5156 }
4733 5157
4734 inject_pending_event(vcpu);
4735
4736 /* enable NMI/IRQ window open exits if needed */
4737 if (vcpu->arch.nmi_pending)
4738 kvm_x86_ops->enable_nmi_window(vcpu);
4739 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
4740 kvm_x86_ops->enable_irq_window(vcpu);
4741
4742 if (kvm_lapic_enabled(vcpu)) {
4743 update_cr8_intercept(vcpu);
4744 kvm_lapic_sync_to_vapic(vcpu);
4745 }
4746
4747 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5158 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4748 5159
4749 kvm_guest_enter(); 5160 kvm_guest_enter();
@@ -4769,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4769 if (hw_breakpoint_active()) 5180 if (hw_breakpoint_active())
4770 hw_breakpoint_restore(); 5181 hw_breakpoint_restore();
4771 5182
5183 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5184
4772 atomic_set(&vcpu->guest_mode, 0); 5185 atomic_set(&vcpu->guest_mode, 0);
4773 smp_wmb(); 5186 smp_wmb();
4774 local_irq_enable(); 5187 local_irq_enable();
@@ -4898,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4898 if (!irqchip_in_kernel(vcpu->kvm)) 5311 if (!irqchip_in_kernel(vcpu->kvm))
4899 kvm_set_cr8(vcpu, kvm_run->cr8); 5312 kvm_set_cr8(vcpu, kvm_run->cr8);
4900 5313
4901 if (vcpu->arch.pio.count || vcpu->mmio_needed || 5314 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
4902 vcpu->arch.emulate_ctxt.restart) {
4903 if (vcpu->mmio_needed) { 5315 if (vcpu->mmio_needed) {
4904 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 5316 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4905 vcpu->mmio_read_completed = 1; 5317 vcpu->mmio_read_completed = 1;
@@ -4980,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4980 5392
4981 vcpu->arch.exception.pending = false; 5393 vcpu->arch.exception.pending = false;
4982 5394
5395 kvm_make_request(KVM_REQ_EVENT, vcpu);
5396
4983 return 0; 5397 return 0;
4984} 5398}
4985 5399
@@ -5043,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5043 struct kvm_mp_state *mp_state) 5457 struct kvm_mp_state *mp_state)
5044{ 5458{
5045 vcpu->arch.mp_state = mp_state->mp_state; 5459 vcpu->arch.mp_state = mp_state->mp_state;
5460 kvm_make_request(KVM_REQ_EVENT, vcpu);
5046 return 0; 5461 return 0;
5047} 5462}
5048 5463
@@ -5050,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5050 bool has_error_code, u32 error_code) 5465 bool has_error_code, u32 error_code)
5051{ 5466{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5467 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5053 int cs_db, cs_l, ret; 5468 int ret;
5054 cache_all_regs(vcpu);
5055
5056 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5057 5469
5058 vcpu->arch.emulate_ctxt.vcpu = vcpu; 5470 init_emulate_ctxt(vcpu);
5059 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
5060 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
5061 vcpu->arch.emulate_ctxt.mode =
5062 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
5063 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
5064 ? X86EMUL_MODE_VM86 : cs_l
5065 ? X86EMUL_MODE_PROT64 : cs_db
5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
5069 5471
5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5472 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
5071 tss_selector, reason, has_error_code, 5473 tss_selector, reason, has_error_code,
5072 error_code); 5474 error_code);
5073 5475
@@ -5077,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5479 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5480 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5481 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5482 kvm_make_request(KVM_REQ_EVENT, vcpu);
5080 return EMULATE_DONE; 5483 return EMULATE_DONE;
5081} 5484}
5082EXPORT_SYMBOL_GPL(kvm_task_switch); 5485EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5112,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5112 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5515 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5113 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5516 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5114 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5517 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5115 load_pdptrs(vcpu, vcpu->arch.cr3); 5518 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
5116 mmu_reset_needed = 1; 5519 mmu_reset_needed = 1;
5117 } 5520 }
5118 5521
@@ -5147,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5147 !is_protmode(vcpu)) 5550 !is_protmode(vcpu))
5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5551 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5149 5552
5553 kvm_make_request(KVM_REQ_EVENT, vcpu);
5554
5150 return 0; 5555 return 0;
5151} 5556}
5152 5557
@@ -5333,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5333struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 5738struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
5334 unsigned int id) 5739 unsigned int id)
5335{ 5740{
5741 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
5742 printk_once(KERN_WARNING
5743 "kvm: SMP vm created on host with unstable TSC; "
5744 "guest TSC will not be reliable\n");
5336 return kvm_x86_ops->vcpu_create(kvm, id); 5745 return kvm_x86_ops->vcpu_create(kvm, id);
5337} 5746}
5338 5747
@@ -5375,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5375 vcpu->arch.dr6 = DR6_FIXED_1; 5784 vcpu->arch.dr6 = DR6_FIXED_1;
5376 vcpu->arch.dr7 = DR7_FIXED_1; 5785 vcpu->arch.dr7 = DR7_FIXED_1;
5377 5786
5787 kvm_make_request(KVM_REQ_EVENT, vcpu);
5788
5378 return kvm_x86_ops->vcpu_reset(vcpu); 5789 return kvm_x86_ops->vcpu_reset(vcpu);
5379} 5790}
5380 5791
5381int kvm_arch_hardware_enable(void *garbage) 5792int kvm_arch_hardware_enable(void *garbage)
5382{ 5793{
5383 /* 5794 struct kvm *kvm;
5384 * Since this may be called from a hotplug notifcation, 5795 struct kvm_vcpu *vcpu;
5385 * we can't get the CPU frequency directly. 5796 int i;
5386 */
5387 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5388 int cpu = raw_smp_processor_id();
5389 per_cpu(cpu_tsc_khz, cpu) = 0;
5390 }
5391 5797
5392 kvm_shared_msr_cpu_online(); 5798 kvm_shared_msr_cpu_online();
5393 5799 list_for_each_entry(kvm, &vm_list, vm_list)
5800 kvm_for_each_vcpu(i, vcpu, kvm)
5801 if (vcpu->cpu == smp_processor_id())
5802 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5394 return kvm_x86_ops->hardware_enable(garbage); 5803 return kvm_x86_ops->hardware_enable(garbage);
5395} 5804}
5396 5805
@@ -5424,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5424 BUG_ON(vcpu->kvm == NULL); 5833 BUG_ON(vcpu->kvm == NULL);
5425 kvm = vcpu->kvm; 5834 kvm = vcpu->kvm;
5426 5835
5836 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
5837 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
5427 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 5838 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
5839 vcpu->arch.mmu.translate_gpa = translate_gpa;
5840 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5428 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5841 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
5429 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5842 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5430 else 5843 else
@@ -5437,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5437 } 5850 }
5438 vcpu->arch.pio_data = page_address(page); 5851 vcpu->arch.pio_data = page_address(page);
5439 5852
5853 if (!kvm->arch.virtual_tsc_khz)
5854 kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
5855
5440 r = kvm_mmu_create(vcpu); 5856 r = kvm_mmu_create(vcpu);
5441 if (r < 0) 5857 if (r < 0)
5442 goto fail_free_pio_data; 5858 goto fail_free_pio_data;
@@ -5496,7 +5912,7 @@ struct kvm *kvm_arch_create_vm(void)
5496 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 5912 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
5497 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 5913 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
5498 5914
5499 rdtscll(kvm->arch.vm_init_tsc); 5915 spin_lock_init(&kvm->arch.tsc_write_lock);
5500 5916
5501 return kvm; 5917 return kvm;
5502} 5918}
@@ -5683,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5683 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6099 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5684 rflags |= X86_EFLAGS_TF; 6100 rflags |= X86_EFLAGS_TF;
5685 kvm_x86_ops->set_rflags(vcpu, rflags); 6101 kvm_x86_ops->set_rflags(vcpu, rflags);
6102 kvm_make_request(KVM_REQ_EVENT, vcpu);
5686} 6103}
5687EXPORT_SYMBOL_GPL(kvm_set_rflags); 6104EXPORT_SYMBOL_GPL(kvm_set_rflags);
5688 6105
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..2cea414489f3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
50#endif 50#endif
51} 51}
52 52
53static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
54{
55 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
56}
57
53static inline int is_pae(struct kvm_vcpu *vcpu) 58static inline int is_pae(struct kvm_vcpu *vcpu)
54{ 59{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); 60 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -67,5 +72,8 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
67 72
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 73void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 74void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
75int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
76
77void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
70 78
71#endif 79#endif
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
22 22
23void *memmove(void *dest, const void *src, size_t n) 23void *memmove(void *dest, const void *src, size_t n)
24{ 24{
25 int d0, d1, d2; 25 int d0,d1,d2,d3,d4,d5;
26 26 char *ret = dest;
27 if (dest < src) { 27
28 memcpy(dest, src, n); 28 __asm__ __volatile__(
29 } else { 29 /* Handle more 16bytes in loop */
30 __asm__ __volatile__( 30 "cmp $0x10, %0\n\t"
31 "std\n\t" 31 "jb 1f\n\t"
32 "rep\n\t" 32
33 "movsb\n\t" 33 /* Decide forward/backward copy mode */
34 "cld" 34 "cmp %2, %1\n\t"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2) 35 "jb 2f\n\t"
36 :"0" (n), 36
37 "1" (n-1+src), 37 /*
38 "2" (n-1+dest) 38 * movs instruction have many startup latency
39 :"memory"); 39 * so we handle small size by general register.
40 } 40 */
41 return dest; 41 "cmp $680, %0\n\t"
42 "jb 3f\n\t"
43 /*
44 * movs instruction is only good for aligned case.
45 */
46 "mov %1, %3\n\t"
47 "xor %2, %3\n\t"
48 "and $0xff, %3\n\t"
49 "jz 4f\n\t"
50 "3:\n\t"
51 "sub $0x10, %0\n\t"
52
53 /*
54 * We gobble 16byts forward in each loop.
55 */
56 "3:\n\t"
57 "sub $0x10, %0\n\t"
58 "mov 0*4(%1), %3\n\t"
59 "mov 1*4(%1), %4\n\t"
60 "mov %3, 0*4(%2)\n\t"
61 "mov %4, 1*4(%2)\n\t"
62 "mov 2*4(%1), %3\n\t"
63 "mov 3*4(%1), %4\n\t"
64 "mov %3, 2*4(%2)\n\t"
65 "mov %4, 3*4(%2)\n\t"
66 "lea 0x10(%1), %1\n\t"
67 "lea 0x10(%2), %2\n\t"
68 "jae 3b\n\t"
69 "add $0x10, %0\n\t"
70 "jmp 1f\n\t"
71
72 /*
73 * Handle data forward by movs.
74 */
75 ".p2align 4\n\t"
76 "4:\n\t"
77 "mov -4(%1, %0), %3\n\t"
78 "lea -4(%2, %0), %4\n\t"
79 "shr $2, %0\n\t"
80 "rep movsl\n\t"
81 "mov %3, (%4)\n\t"
82 "jmp 11f\n\t"
83 /*
84 * Handle data backward by movs.
85 */
86 ".p2align 4\n\t"
87 "6:\n\t"
88 "mov (%1), %3\n\t"
89 "mov %2, %4\n\t"
90 "lea -4(%1, %0), %1\n\t"
91 "lea -4(%2, %0), %2\n\t"
92 "shr $2, %0\n\t"
93 "std\n\t"
94 "rep movsl\n\t"
95 "mov %3,(%4)\n\t"
96 "cld\n\t"
97 "jmp 11f\n\t"
98
99 /*
100 * Start to prepare for backward copy.
101 */
102 ".p2align 4\n\t"
103 "2:\n\t"
104 "cmp $680, %0\n\t"
105 "jb 5f\n\t"
106 "mov %1, %3\n\t"
107 "xor %2, %3\n\t"
108 "and $0xff, %3\n\t"
109 "jz 6b\n\t"
110
111 /*
112 * Calculate copy position to tail.
113 */
114 "5:\n\t"
115 "add %0, %1\n\t"
116 "add %0, %2\n\t"
117 "sub $0x10, %0\n\t"
118
119 /*
120 * We gobble 16byts backward in each loop.
121 */
122 "7:\n\t"
123 "sub $0x10, %0\n\t"
124
125 "mov -1*4(%1), %3\n\t"
126 "mov -2*4(%1), %4\n\t"
127 "mov %3, -1*4(%2)\n\t"
128 "mov %4, -2*4(%2)\n\t"
129 "mov -3*4(%1), %3\n\t"
130 "mov -4*4(%1), %4\n\t"
131 "mov %3, -3*4(%2)\n\t"
132 "mov %4, -4*4(%2)\n\t"
133 "lea -0x10(%1), %1\n\t"
134 "lea -0x10(%2), %2\n\t"
135 "jae 7b\n\t"
136 /*
137 * Calculate copy position to head.
138 */
139 "add $0x10, %0\n\t"
140 "sub %0, %1\n\t"
141 "sub %0, %2\n\t"
142
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 ".p2align 4\n\t"
147 "1:\n\t"
148 "cmp $8, %0\n\t"
149 "jb 8f\n\t"
150 "mov 0*4(%1), %3\n\t"
151 "mov 1*4(%1), %4\n\t"
152 "mov -2*4(%1, %0), %5\n\t"
153 "mov -1*4(%1, %0), %1\n\t"
154
155 "mov %3, 0*4(%2)\n\t"
156 "mov %4, 1*4(%2)\n\t"
157 "mov %5, -2*4(%2, %0)\n\t"
158 "mov %1, -1*4(%2, %0)\n\t"
159 "jmp 11f\n\t"
160
161 /*
162 * Move data from 4 bytes to 7 bytes.
163 */
164 ".p2align 4\n\t"
165 "8:\n\t"
166 "cmp $4, %0\n\t"
167 "jb 9f\n\t"
168 "mov 0*4(%1), %3\n\t"
169 "mov -1*4(%1, %0), %4\n\t"
170 "mov %3, 0*4(%2)\n\t"
171 "mov %4, -1*4(%2, %0)\n\t"
172 "jmp 11f\n\t"
173
174 /*
175 * Move data from 2 bytes to 3 bytes.
176 */
177 ".p2align 4\n\t"
178 "9:\n\t"
179 "cmp $2, %0\n\t"
180 "jb 10f\n\t"
181 "movw 0*2(%1), %%dx\n\t"
182 "movw -1*2(%1, %0), %%bx\n\t"
183 "movw %%dx, 0*2(%2)\n\t"
184 "movw %%bx, -1*2(%2, %0)\n\t"
185 "jmp 11f\n\t"
186
187 /*
188 * Move data for 1 byte.
189 */
190 ".p2align 4\n\t"
191 "10:\n\t"
192 "cmp $1, %0\n\t"
193 "jb 11f\n\t"
194 "movb (%1), %%cl\n\t"
195 "movb %%cl, (%2)\n\t"
196 ".p2align 4\n\t"
197 "11:"
198 : "=&c" (d0), "=&S" (d1), "=&D" (d2),
199 "=r" (d3),"=r" (d4), "=r"(d5)
200 :"0" (n),
201 "1" (src),
202 "2" (dest)
203 :"memory");
204
205 return ret;
206
42} 207}
43EXPORT_SYMBOL(memmove); 208EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..75ef61e35e38 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
40ENTRY(__memcpy) 40ENTRY(__memcpy)
41ENTRY(memcpy) 41ENTRY(memcpy)
42 CFI_STARTPROC 42 CFI_STARTPROC
43 movq %rdi, %rax
43 44
44 /* 45 /*
45 * Put the number of full 64-byte blocks into %ecx. 46 * Use 32bit CMP here to avoid long NOP padding.
46 * Tail portion is handled at the end:
47 */ 47 */
48 movq %rdi, %rax 48 cmp $0x20, %edx
49 movl %edx, %ecx 49 jb .Lhandle_tail
50 shrl $6, %ecx
51 jz .Lhandle_tail
52 50
53 .p2align 4
54.Lloop_64:
55 /* 51 /*
56 * We decrement the loop index here - and the zero-flag is 52 * We check whether memory false dependece could occur,
57 * checked at the end of the loop (instructions inbetween do 53 * then jump to corresponding copy mode.
58 * not change the zero flag):
59 */ 54 */
60 decl %ecx 55 cmp %dil, %sil
56 jl .Lcopy_backward
57 subl $0x20, %edx
58.Lcopy_forward_loop:
59 subq $0x20, %rdx
61 60
62 /* 61 /*
63 * Move in blocks of 4x16 bytes: 62 * Move in blocks of 4x8 bytes:
64 */ 63 */
65 movq 0*8(%rsi), %r11 64 movq 0*8(%rsi), %r8
66 movq 1*8(%rsi), %r8 65 movq 1*8(%rsi), %r9
67 movq %r11, 0*8(%rdi) 66 movq 2*8(%rsi), %r10
68 movq %r8, 1*8(%rdi) 67 movq 3*8(%rsi), %r11
69 68 leaq 4*8(%rsi), %rsi
70 movq 2*8(%rsi), %r9 69
71 movq 3*8(%rsi), %r10 70 movq %r8, 0*8(%rdi)
72 movq %r9, 2*8(%rdi) 71 movq %r9, 1*8(%rdi)
73 movq %r10, 3*8(%rdi) 72 movq %r10, 2*8(%rdi)
74 73 movq %r11, 3*8(%rdi)
75 movq 4*8(%rsi), %r11 74 leaq 4*8(%rdi), %rdi
76 movq 5*8(%rsi), %r8 75 jae .Lcopy_forward_loop
77 movq %r11, 4*8(%rdi) 76 addq $0x20, %rdx
78 movq %r8, 5*8(%rdi) 77 jmp .Lhandle_tail
79 78
80 movq 6*8(%rsi), %r9 79.Lcopy_backward:
81 movq 7*8(%rsi), %r10 80 /*
82 movq %r9, 6*8(%rdi) 81 * Calculate copy position to tail.
83 movq %r10, 7*8(%rdi) 82 */
84 83 addq %rdx, %rsi
85 leaq 64(%rsi), %rsi 84 addq %rdx, %rdi
86 leaq 64(%rdi), %rdi 85 subq $0x20, %rdx
87 86 /*
88 jnz .Lloop_64 87 * At most 3 ALU operations in one cycle,
88 * so append NOPS in the same 16bytes trunk.
89 */
90 .p2align 4
91.Lcopy_backward_loop:
92 subq $0x20, %rdx
93 movq -1*8(%rsi), %r8
94 movq -2*8(%rsi), %r9
95 movq -3*8(%rsi), %r10
96 movq -4*8(%rsi), %r11
97 leaq -4*8(%rsi), %rsi
98 movq %r8, -1*8(%rdi)
99 movq %r9, -2*8(%rdi)
100 movq %r10, -3*8(%rdi)
101 movq %r11, -4*8(%rdi)
102 leaq -4*8(%rdi), %rdi
103 jae .Lcopy_backward_loop
89 104
105 /*
106 * Calculate copy position to head.
107 */
108 addq $0x20, %rdx
109 subq %rdx, %rsi
110 subq %rdx, %rdi
90.Lhandle_tail: 111.Lhandle_tail:
91 movl %edx, %ecx 112 cmpq $16, %rdx
92 andl $63, %ecx 113 jb .Lless_16bytes
93 shrl $3, %ecx
94 jz .Lhandle_7
95 114
115 /*
116 * Move data from 16 bytes to 31 bytes.
117 */
118 movq 0*8(%rsi), %r8
119 movq 1*8(%rsi), %r9
120 movq -2*8(%rsi, %rdx), %r10
121 movq -1*8(%rsi, %rdx), %r11
122 movq %r8, 0*8(%rdi)
123 movq %r9, 1*8(%rdi)
124 movq %r10, -2*8(%rdi, %rdx)
125 movq %r11, -1*8(%rdi, %rdx)
126 retq
96 .p2align 4 127 .p2align 4
97.Lloop_8: 128.Lless_16bytes:
98 decl %ecx 129 cmpq $8, %rdx
99 movq (%rsi), %r8 130 jb .Lless_8bytes
100 movq %r8, (%rdi) 131 /*
101 leaq 8(%rdi), %rdi 132 * Move data from 8 bytes to 15 bytes.
102 leaq 8(%rsi), %rsi 133 */
103 jnz .Lloop_8 134 movq 0*8(%rsi), %r8
104 135 movq -1*8(%rsi, %rdx), %r9
105.Lhandle_7: 136 movq %r8, 0*8(%rdi)
106 movl %edx, %ecx 137 movq %r9, -1*8(%rdi, %rdx)
107 andl $7, %ecx 138 retq
108 jz .Lend 139 .p2align 4
140.Lless_8bytes:
141 cmpq $4, %rdx
142 jb .Lless_3bytes
109 143
144 /*
145 * Move data from 4 bytes to 7 bytes.
146 */
147 movl (%rsi), %ecx
148 movl -4(%rsi, %rdx), %r8d
149 movl %ecx, (%rdi)
150 movl %r8d, -4(%rdi, %rdx)
151 retq
110 .p2align 4 152 .p2align 4
153.Lless_3bytes:
154 cmpl $0, %edx
155 je .Lend
156 /*
157 * Move data from 1 bytes to 3 bytes.
158 */
111.Lloop_1: 159.Lloop_1:
112 movb (%rsi), %r8b 160 movb (%rsi), %r8b
113 movb %r8b, (%rdi) 161 movb %r8b, (%rdi)
114 incq %rdi 162 incq %rdi
115 incq %rsi 163 incq %rsi
116 decl %ecx 164 decl %edx
117 jnz .Lloop_1 165 jnz .Lloop_1
118 166
119.Lend: 167.Lend:
120 ret 168 retq
121 CFI_ENDPROC 169 CFI_ENDPROC
122ENDPROC(memcpy) 170ENDPROC(memcpy)
123ENDPROC(__memcpy) 171ENDPROC(__memcpy)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909bf122..6d0f0ec41b34 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
8#undef memmove 8#undef memmove
9void *memmove(void *dest, const void *src, size_t count) 9void *memmove(void *dest, const void *src, size_t count)
10{ 10{
11 if (dest < src) { 11 unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12 return memcpy(dest, src, count); 12 char *ret;
13 } else { 13
14 char *p = dest + count; 14 __asm__ __volatile__(
15 const char *s = src + count; 15 /* Handle more 32bytes in loop */
16 while (count--) 16 "mov %2, %3\n\t"
17 *--p = *--s; 17 "cmp $0x20, %0\n\t"
18 } 18 "jb 1f\n\t"
19 return dest; 19
20 /* Decide forward/backward copy mode */
21 "cmp %2, %1\n\t"
22 "jb 2f\n\t"
23
24 /*
25 * movsq instruction have many startup latency
26 * so we handle small size by general register.
27 */
28 "cmp $680, %0\n\t"
29 "jb 3f\n\t"
30 /*
31 * movsq instruction is only good for aligned case.
32 */
33 "cmpb %%dil, %%sil\n\t"
34 "je 4f\n\t"
35 "3:\n\t"
36 "sub $0x20, %0\n\t"
37 /*
38 * We gobble 32byts forward in each loop.
39 */
40 "5:\n\t"
41 "sub $0x20, %0\n\t"
42 "movq 0*8(%1), %4\n\t"
43 "movq 1*8(%1), %5\n\t"
44 "movq 2*8(%1), %6\n\t"
45 "movq 3*8(%1), %7\n\t"
46 "leaq 4*8(%1), %1\n\t"
47
48 "movq %4, 0*8(%2)\n\t"
49 "movq %5, 1*8(%2)\n\t"
50 "movq %6, 2*8(%2)\n\t"
51 "movq %7, 3*8(%2)\n\t"
52 "leaq 4*8(%2), %2\n\t"
53 "jae 5b\n\t"
54 "addq $0x20, %0\n\t"
55 "jmp 1f\n\t"
56 /*
57 * Handle data forward by movsq.
58 */
59 ".p2align 4\n\t"
60 "4:\n\t"
61 "movq %0, %8\n\t"
62 "movq -8(%1, %0), %4\n\t"
63 "lea -8(%2, %0), %5\n\t"
64 "shrq $3, %8\n\t"
65 "rep movsq\n\t"
66 "movq %4, (%5)\n\t"
67 "jmp 13f\n\t"
68 /*
69 * Handle data backward by movsq.
70 */
71 ".p2align 4\n\t"
72 "7:\n\t"
73 "movq %0, %8\n\t"
74 "movq (%1), %4\n\t"
75 "movq %2, %5\n\t"
76 "leaq -8(%1, %0), %1\n\t"
77 "leaq -8(%2, %0), %2\n\t"
78 "shrq $3, %8\n\t"
79 "std\n\t"
80 "rep movsq\n\t"
81 "cld\n\t"
82 "movq %4, (%5)\n\t"
83 "jmp 13f\n\t"
84
85 /*
86 * Start to prepare for backward copy.
87 */
88 ".p2align 4\n\t"
89 "2:\n\t"
90 "cmp $680, %0\n\t"
91 "jb 6f \n\t"
92 "cmp %%dil, %%sil\n\t"
93 "je 7b \n\t"
94 "6:\n\t"
95 /*
96 * Calculate copy position to tail.
97 */
98 "addq %0, %1\n\t"
99 "addq %0, %2\n\t"
100 "subq $0x20, %0\n\t"
101 /*
102 * We gobble 32byts backward in each loop.
103 */
104 "8:\n\t"
105 "subq $0x20, %0\n\t"
106 "movq -1*8(%1), %4\n\t"
107 "movq -2*8(%1), %5\n\t"
108 "movq -3*8(%1), %6\n\t"
109 "movq -4*8(%1), %7\n\t"
110 "leaq -4*8(%1), %1\n\t"
111
112 "movq %4, -1*8(%2)\n\t"
113 "movq %5, -2*8(%2)\n\t"
114 "movq %6, -3*8(%2)\n\t"
115 "movq %7, -4*8(%2)\n\t"
116 "leaq -4*8(%2), %2\n\t"
117 "jae 8b\n\t"
118 /*
119 * Calculate copy position to head.
120 */
121 "addq $0x20, %0\n\t"
122 "subq %0, %1\n\t"
123 "subq %0, %2\n\t"
124 "1:\n\t"
125 "cmpq $16, %0\n\t"
126 "jb 9f\n\t"
127 /*
128 * Move data from 16 bytes to 31 bytes.
129 */
130 "movq 0*8(%1), %4\n\t"
131 "movq 1*8(%1), %5\n\t"
132 "movq -2*8(%1, %0), %6\n\t"
133 "movq -1*8(%1, %0), %7\n\t"
134 "movq %4, 0*8(%2)\n\t"
135 "movq %5, 1*8(%2)\n\t"
136 "movq %6, -2*8(%2, %0)\n\t"
137 "movq %7, -1*8(%2, %0)\n\t"
138 "jmp 13f\n\t"
139 ".p2align 4\n\t"
140 "9:\n\t"
141 "cmpq $8, %0\n\t"
142 "jb 10f\n\t"
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 "movq 0*8(%1), %4\n\t"
147 "movq -1*8(%1, %0), %5\n\t"
148 "movq %4, 0*8(%2)\n\t"
149 "movq %5, -1*8(%2, %0)\n\t"
150 "jmp 13f\n\t"
151 "10:\n\t"
152 "cmpq $4, %0\n\t"
153 "jb 11f\n\t"
154 /*
155 * Move data from 4 bytes to 7 bytes.
156 */
157 "movl (%1), %4d\n\t"
158 "movl -4(%1, %0), %5d\n\t"
159 "movl %4d, (%2)\n\t"
160 "movl %5d, -4(%2, %0)\n\t"
161 "jmp 13f\n\t"
162 "11:\n\t"
163 "cmp $2, %0\n\t"
164 "jb 12f\n\t"
165 /*
166 * Move data from 2 bytes to 3 bytes.
167 */
168 "movw (%1), %4w\n\t"
169 "movw -2(%1, %0), %5w\n\t"
170 "movw %4w, (%2)\n\t"
171 "movw %5w, -2(%2, %0)\n\t"
172 "jmp 13f\n\t"
173 "12:\n\t"
174 "cmp $1, %0\n\t"
175 "jb 13f\n\t"
176 /*
177 * Move data for 1 byte.
178 */
179 "movb (%1), %4b\n\t"
180 "movb %4b, (%2)\n\t"
181 "13:\n\t"
182 : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184 :"0" (count),
185 "1" (src),
186 "2" (dest)
187 :"memory");
188
189 return ret;
190
20} 191}
21EXPORT_SYMBOL(memmove); 192EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a4c768397baa..55543397a8a7 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -26,4 +26,6 @@ obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_K8_NUMA) += k8topology_64.o 26obj-$(CONFIG_K8_NUMA) += k8topology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28 28
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
30
29obj-$(CONFIG_MEMTEST) += memtest.o 31obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..7d90ceb882a4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */
14 15
15#include <asm/traps.h> /* dotraplinkage, ... */ 16#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 17#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
160 161
161static void 162static void
162force_sig_info_fault(int si_signo, int si_code, unsigned long address, 163force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163 struct task_struct *tsk) 164 struct task_struct *tsk, int fault)
164{ 165{
166 unsigned lsb = 0;
165 siginfo_t info; 167 siginfo_t info;
166 168
167 info.si_signo = si_signo; 169 info.si_signo = si_signo;
168 info.si_errno = 0; 170 info.si_errno = 0;
169 info.si_code = si_code; 171 info.si_code = si_code;
170 info.si_addr = (void __user *)address; 172 info.si_addr = (void __user *)address;
171 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; 173 if (fault & VM_FAULT_HWPOISON_LARGE)
174 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
175 if (fault & VM_FAULT_HWPOISON)
176 lsb = PAGE_SHIFT;
177 info.si_addr_lsb = lsb;
172 178
173 force_sig_info(si_signo, &info, tsk); 179 force_sig_info(si_signo, &info, tsk);
174} 180}
@@ -229,7 +235,16 @@ void vmalloc_sync_all(void)
229 235
230 spin_lock_irqsave(&pgd_lock, flags); 236 spin_lock_irqsave(&pgd_lock, flags);
231 list_for_each_entry(page, &pgd_list, lru) { 237 list_for_each_entry(page, &pgd_list, lru) {
232 if (!vmalloc_sync_one(page_address(page), address)) 238 spinlock_t *pgt_lock;
239 pmd_t *ret;
240
241 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
242
243 spin_lock(pgt_lock);
244 ret = vmalloc_sync_one(page_address(page), address);
245 spin_unlock(pgt_lock);
246
247 if (!ret)
233 break; 248 break;
234 } 249 }
235 spin_unlock_irqrestore(&pgd_lock, flags); 250 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
251 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 266 if (!(address >= VMALLOC_START && address < VMALLOC_END))
252 return -1; 267 return -1;
253 268
269 WARN_ON_ONCE(in_nmi());
270
254 /* 271 /*
255 * Synchronize this task's top level page-table 272 * Synchronize this task's top level page-table
256 * with the 'reference' page table. 273 * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
326 343
327void vmalloc_sync_all(void) 344void vmalloc_sync_all(void)
328{ 345{
329 unsigned long address; 346 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
330
331 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
332 address += PGDIR_SIZE) {
333
334 const pgd_t *pgd_ref = pgd_offset_k(address);
335 unsigned long flags;
336 struct page *page;
337
338 if (pgd_none(*pgd_ref))
339 continue;
340
341 spin_lock_irqsave(&pgd_lock, flags);
342 list_for_each_entry(page, &pgd_list, lru) {
343 pgd_t *pgd;
344 pgd = (pgd_t *)page_address(page) + pgd_index(address);
345 if (pgd_none(*pgd))
346 set_pgd(pgd, *pgd_ref);
347 else
348 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
349 }
350 spin_unlock_irqrestore(&pgd_lock, flags);
351 }
352} 347}
353 348
354/* 349/*
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
369 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 364 if (!(address >= VMALLOC_START && address < VMALLOC_END))
370 return -1; 365 return -1;
371 366
367 WARN_ON_ONCE(in_nmi());
368
372 /* 369 /*
373 * Copy kernel mappings over when needed. This can also 370 * Copy kernel mappings over when needed. This can also
374 * happen within a race in page table update. In the later 371 * happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
731 tsk->thread.error_code = error_code | (address >= TASK_SIZE); 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
732 tsk->thread.trap_no = 14; 729 tsk->thread.trap_no = 14;
733 730
734 force_sig_info_fault(SIGSEGV, si_code, address, tsk); 731 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
735 732
736 return; 733 return;
737 } 734 }
@@ -816,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
816 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
817 814
818#ifdef CONFIG_MEMORY_FAILURE 815#ifdef CONFIG_MEMORY_FAILURE
819 if (fault & VM_FAULT_HWPOISON) { 816 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
820 printk(KERN_ERR 817 printk(KERN_ERR
821 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
822 tsk->comm, tsk->pid, address); 819 tsk->comm, tsk->pid, address);
823 code = BUS_MCEERR_AR; 820 code = BUS_MCEERR_AR;
824 } 821 }
825#endif 822#endif
826 force_sig_info_fault(SIGBUS, code, address, tsk); 823 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
827} 824}
828 825
829static noinline void 826static noinline void
@@ -833,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
833 if (fault & VM_FAULT_OOM) { 830 if (fault & VM_FAULT_OOM) {
834 out_of_memory(regs, error_code, address); 831 out_of_memory(regs, error_code, address);
835 } else { 832 } else {
836 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) 833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
834 VM_FAULT_HWPOISON_LARGE))
837 do_sigbus(regs, error_code, address, fault); 835 do_sigbus(regs, error_code, address, fault);
838 else 836 else
839 BUG(); 837 BUG();
@@ -894,8 +892,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
894 if (pmd_large(*pmd)) 892 if (pmd_large(*pmd))
895 return spurious_fault_check(error_code, (pte_t *) pmd); 893 return spurious_fault_check(error_code, (pte_t *) pmd);
896 894
895 /*
896 * Note: don't use pte_present() here, since it returns true
897 * if the _PAGE_PROTNONE bit is set. However, this aliases the
898 * _PAGE_GLOBAL bit, which for kernel pages give false positives
899 * when CONFIG_DEBUG_PAGEALLOC is used.
900 */
897 pte = pte_offset_kernel(pmd, address); 901 pte = pte_offset_kernel(pmd, address);
898 if (!pte_present(*pte)) 902 if (!(pte_flags(*pte) & _PAGE_PRESENT))
899 return 0; 903 return 0;
900 904
901 ret = spurious_fault_check(error_code, pte); 905 ret = spurious_fault_check(error_code, pte);
@@ -915,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
915int show_unhandled_signals = 1; 919int show_unhandled_signals = 1;
916 920
917static inline int 921static inline int
918access_error(unsigned long error_code, int write, struct vm_area_struct *vma) 922access_error(unsigned long error_code, struct vm_area_struct *vma)
919{ 923{
920 if (write) { 924 if (error_code & PF_WRITE) {
921 /* write, present and write, not present: */ 925 /* write, present and write, not present: */
922 if (unlikely(!(vma->vm_flags & VM_WRITE))) 926 if (unlikely(!(vma->vm_flags & VM_WRITE)))
923 return 1; 927 return 1;
@@ -952,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
952 struct task_struct *tsk; 956 struct task_struct *tsk;
953 unsigned long address; 957 unsigned long address;
954 struct mm_struct *mm; 958 struct mm_struct *mm;
955 int write;
956 int fault; 959 int fault;
960 int write = error_code & PF_WRITE;
961 unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
962 (write ? FAULT_FLAG_WRITE : 0);
957 963
958 tsk = current; 964 tsk = current;
959 mm = tsk->mm; 965 mm = tsk->mm;
@@ -1064,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1064 bad_area_nosemaphore(regs, error_code, address); 1070 bad_area_nosemaphore(regs, error_code, address);
1065 return; 1071 return;
1066 } 1072 }
1073retry:
1067 down_read(&mm->mmap_sem); 1074 down_read(&mm->mmap_sem);
1068 } else { 1075 } else {
1069 /* 1076 /*
@@ -1107,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1107 * we can handle it.. 1114 * we can handle it..
1108 */ 1115 */
1109good_area: 1116good_area:
1110 write = error_code & PF_WRITE; 1117 if (unlikely(access_error(error_code, vma))) {
1111
1112 if (unlikely(access_error(error_code, write, vma))) {
1113 bad_area_access_error(regs, error_code, address); 1118 bad_area_access_error(regs, error_code, address);
1114 return; 1119 return;
1115 } 1120 }
@@ -1119,21 +1124,34 @@ good_area:
1119 * make sure we exit gracefully rather than endlessly redo 1124 * make sure we exit gracefully rather than endlessly redo
1120 * the fault: 1125 * the fault:
1121 */ 1126 */
1122 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); 1127 fault = handle_mm_fault(mm, vma, address, flags);
1123 1128
1124 if (unlikely(fault & VM_FAULT_ERROR)) { 1129 if (unlikely(fault & VM_FAULT_ERROR)) {
1125 mm_fault_error(regs, error_code, address, fault); 1130 mm_fault_error(regs, error_code, address, fault);
1126 return; 1131 return;
1127 } 1132 }
1128 1133
1129 if (fault & VM_FAULT_MAJOR) { 1134 /*
1130 tsk->maj_flt++; 1135 * Major/minor page fault accounting is only done on the
1131 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1136 * initial attempt. If we go through a retry, it is extremely
1132 regs, address); 1137 * likely that the page will be found in page cache at that point.
1133 } else { 1138 */
1134 tsk->min_flt++; 1139 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1135 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1140 if (fault & VM_FAULT_MAJOR) {
1136 regs, address); 1141 tsk->maj_flt++;
1142 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1143 regs, address);
1144 } else {
1145 tsk->min_flt++;
1146 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1147 regs, address);
1148 }
1149 if (fault & VM_FAULT_RETRY) {
1150 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1151 * of starvation. */
1152 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1153 goto retry;
1154 }
1137 } 1155 }
1138 1156
1139 check_v8086_mode(regs, address, tsk); 1157 check_v8086_mode(regs, address, tsk);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 5e8fa12ef861..b49962662101 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -9,6 +9,7 @@ void *kmap(struct page *page)
9 return page_address(page); 9 return page_address(page);
10 return kmap_high(page); 10 return kmap_high(page);
11} 11}
12EXPORT_SYMBOL(kmap);
12 13
13void kunmap(struct page *page) 14void kunmap(struct page *page)
14{ 15{
@@ -18,6 +19,7 @@ void kunmap(struct page *page)
18 return; 19 return;
19 kunmap_high(page); 20 kunmap_high(page);
20} 21}
22EXPORT_SYMBOL(kunmap);
21 23
22/* 24/*
23 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because 25 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
@@ -27,10 +29,10 @@ void kunmap(struct page *page)
27 * However when holding an atomic kmap it is not legal to sleep, so atomic 29 * However when holding an atomic kmap it is not legal to sleep, so atomic
28 * kmaps are appropriate for short, tight code paths only. 30 * kmaps are appropriate for short, tight code paths only.
29 */ 31 */
30void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) 32void *kmap_atomic_prot(struct page *page, pgprot_t prot)
31{ 33{
32 enum fixed_addresses idx;
33 unsigned long vaddr; 34 unsigned long vaddr;
35 int idx, type;
34 36
35 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 37 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
36 pagefault_disable(); 38 pagefault_disable();
@@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
38 if (!PageHighMem(page)) 40 if (!PageHighMem(page))
39 return page_address(page); 41 return page_address(page);
40 42
41 debug_kmap_atomic(type); 43 type = kmap_atomic_idx_push();
42
43 idx = type + KM_TYPE_NR*smp_processor_id(); 44 idx = type + KM_TYPE_NR*smp_processor_id();
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 45 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 46 BUG_ON(!pte_none(*(kmap_pte-idx)));
@@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
47 48
48 return (void *)vaddr; 49 return (void *)vaddr;
49} 50}
51EXPORT_SYMBOL(kmap_atomic_prot);
52
53void *__kmap_atomic(struct page *page)
54{
55 return kmap_atomic_prot(page, kmap_prot);
56}
57EXPORT_SYMBOL(__kmap_atomic);
50 58
51void *kmap_atomic(struct page *page, enum km_type type) 59/*
60 * This is the same as kmap_atomic() but can map memory that doesn't
61 * have a struct page associated with it.
62 */
63void *kmap_atomic_pfn(unsigned long pfn)
52{ 64{
53 return kmap_atomic_prot(page, type, kmap_prot); 65 return kmap_atomic_prot_pfn(pfn, kmap_prot);
54} 66}
67EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
55 68
56void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) 69void __kunmap_atomic(void *kvaddr)
57{ 70{
58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 71 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 72
60 73 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
61 /* 74 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
62 * Force other mappings to Oops if they'll try to access this pte 75 int idx, type;
63 * without first remap it. Keeping stale mappings around is a bad idea 76
64 * also, in case the page changes cacheability attributes or becomes 77 type = kmap_atomic_idx();
65 * a protected page in a hypervisor. 78 idx = type + KM_TYPE_NR * smp_processor_id();
66 */ 79
67 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 80#ifdef CONFIG_DEBUG_HIGHMEM
81 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
82#endif
83 /*
84 * Force other mappings to Oops if they'll try to access this
85 * pte without first remap it. Keeping stale mappings around
86 * is a bad idea also, in case the page changes cacheability
87 * attributes or becomes a protected page in a hypervisor.
88 */
68 kpte_clear_flush(kmap_pte-idx, vaddr); 89 kpte_clear_flush(kmap_pte-idx, vaddr);
69 else { 90 kmap_atomic_idx_pop();
91 }
70#ifdef CONFIG_DEBUG_HIGHMEM 92#ifdef CONFIG_DEBUG_HIGHMEM
93 else {
71 BUG_ON(vaddr < PAGE_OFFSET); 94 BUG_ON(vaddr < PAGE_OFFSET);
72 BUG_ON(vaddr >= (unsigned long)high_memory); 95 BUG_ON(vaddr >= (unsigned long)high_memory);
73#endif
74 } 96 }
97#endif
75 98
76 pagefault_enable(); 99 pagefault_enable();
77} 100}
78 101EXPORT_SYMBOL(__kunmap_atomic);
79/*
80 * This is the same as kmap_atomic() but can map memory that doesn't
81 * have a struct page associated with it.
82 */
83void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
84{
85 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
86}
87EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
88 102
89struct page *kmap_atomic_to_page(void *ptr) 103struct page *kmap_atomic_to_page(void *ptr)
90{ 104{
@@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr)
98 pte = kmap_pte - (idx - FIX_KMAP_BEGIN); 112 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
99 return pte_page(*pte); 113 return pte_page(*pte);
100} 114}
101
102EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic_notypecheck);
106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page); 115EXPORT_SYMBOL(kmap_atomic_to_page);
108 116
109void __init set_highmem_pages_init(void) 117void __init set_highmem_pages_init(void)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b278535b14aa..c0e28a13de7d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -2,6 +2,7 @@
2#include <linux/initrd.h> 2#include <linux/initrd.h>
3#include <linux/ioport.h> 3#include <linux/ioport.h>
4#include <linux/swap.h> 4#include <linux/swap.h>
5#include <linux/memblock.h>
5 6
6#include <asm/cacheflush.h> 7#include <asm/cacheflush.h>
7#include <asm/e820.h> 8#include <asm/e820.h>
@@ -33,6 +34,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
33 int use_gbpages) 34 int use_gbpages)
34{ 35{
35 unsigned long puds, pmds, ptes, tables, start; 36 unsigned long puds, pmds, ptes, tables, start;
37 phys_addr_t base;
36 38
37 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
38 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 40 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
@@ -75,12 +77,12 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
75#else 77#else
76 start = 0x8000; 78 start = 0x8000;
77#endif 79#endif
78 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 80 base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT,
79 tables, PAGE_SIZE); 81 tables, PAGE_SIZE);
80 if (e820_table_start == -1UL) 82 if (base == MEMBLOCK_ERROR)
81 panic("Cannot find space for the kernel page tables"); 83 panic("Cannot find space for the kernel page tables");
82 84
83 e820_table_start >>= PAGE_SHIFT; 85 e820_table_start = base >> PAGE_SHIFT;
84 e820_table_end = e820_table_start; 86 e820_table_end = e820_table_start;
85 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 87 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
86 88
@@ -299,7 +301,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
299 __flush_tlb_all(); 301 __flush_tlb_all();
300 302
301 if (!after_bootmem && e820_table_end > e820_table_start) 303 if (!after_bootmem && e820_table_end > e820_table_start)
302 reserve_early(e820_table_start << PAGE_SHIFT, 304 memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
303 e820_table_end << PAGE_SHIFT, "PGTABLE"); 305 e820_table_end << PAGE_SHIFT, "PGTABLE");
304 306
305 if (!after_bootmem) 307 if (!after_bootmem)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 558f2d332076..0e969f9f401b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,6 +25,7 @@
25#include <linux/pfn.h> 25#include <linux/pfn.h>
26#include <linux/poison.h> 26#include <linux/poison.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/memblock.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
29#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
30#include <linux/initrd.h> 31#include <linux/initrd.h>
@@ -422,49 +423,28 @@ static void __init add_one_highpage_init(struct page *page)
422 totalhigh_pages++; 423 totalhigh_pages++;
423} 424}
424 425
425struct add_highpages_data { 426void __init add_highpages_with_active_regions(int nid,
426 unsigned long start_pfn; 427 unsigned long start_pfn, unsigned long end_pfn)
427 unsigned long end_pfn;
428};
429
430static int __init add_highpages_work_fn(unsigned long start_pfn,
431 unsigned long end_pfn, void *datax)
432{ 428{
433 int node_pfn; 429 struct range *range;
434 struct page *page; 430 int nr_range;
435 unsigned long final_start_pfn, final_end_pfn; 431 int i;
436 struct add_highpages_data *data;
437 432
438 data = (struct add_highpages_data *)datax; 433 nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
439 434
440 final_start_pfn = max(start_pfn, data->start_pfn); 435 for (i = 0; i < nr_range; i++) {
441 final_end_pfn = min(end_pfn, data->end_pfn); 436 struct page *page;
442 if (final_start_pfn >= final_end_pfn) 437 int node_pfn;
443 return 0;
444 438
445 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; 439 for (node_pfn = range[i].start; node_pfn < range[i].end;
446 node_pfn++) { 440 node_pfn++) {
447 if (!pfn_valid(node_pfn)) 441 if (!pfn_valid(node_pfn))
448 continue; 442 continue;
449 page = pfn_to_page(node_pfn); 443 page = pfn_to_page(node_pfn);
450 add_one_highpage_init(page); 444 add_one_highpage_init(page);
445 }
451 } 446 }
452
453 return 0;
454
455} 447}
456
457void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
458 unsigned long end_pfn)
459{
460 struct add_highpages_data data;
461
462 data.start_pfn = start_pfn;
463 data.end_pfn = end_pfn;
464
465 work_with_active_regions(nid, add_highpages_work_fn, &data);
466}
467
468#else 448#else
469static inline void permanent_kmaps_init(pgd_t *pgd_base) 449static inline void permanent_kmaps_init(pgd_t *pgd_base)
470{ 450{
@@ -548,48 +528,6 @@ static void __init pagetable_init(void)
548 permanent_kmaps_init(pgd_base); 528 permanent_kmaps_init(pgd_base);
549} 529}
550 530
551#ifdef CONFIG_ACPI_SLEEP
552/*
553 * ACPI suspend needs this for resume, because things like the intel-agp
554 * driver might have split up a kernel 4MB mapping.
555 */
556char swsusp_pg_dir[PAGE_SIZE]
557 __attribute__ ((aligned(PAGE_SIZE)));
558
559static inline void save_pg_dir(void)
560{
561 copy_page(swsusp_pg_dir, swapper_pg_dir);
562}
563#else /* !CONFIG_ACPI_SLEEP */
564static inline void save_pg_dir(void)
565{
566}
567#endif /* !CONFIG_ACPI_SLEEP */
568
569void zap_low_mappings(bool early)
570{
571 int i;
572
573 /*
574 * Zap initial low-memory mappings.
575 *
576 * Note that "pgd_clear()" doesn't do it for
577 * us, because pgd_clear() is a no-op on i386.
578 */
579 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
580#ifdef CONFIG_X86_PAE
581 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
582#else
583 set_pgd(swapper_pg_dir+i, __pgd(0));
584#endif
585 }
586
587 if (early)
588 __flush_tlb();
589 else
590 flush_tlb_all();
591}
592
593pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 531pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
594EXPORT_SYMBOL_GPL(__supported_pte_mask); 532EXPORT_SYMBOL_GPL(__supported_pte_mask);
595 533
@@ -712,14 +650,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
712 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
713 if (max_pfn > max_low_pfn) 651 if (max_pfn > max_low_pfn)
714 highstart_pfn = max_low_pfn; 652 highstart_pfn = max_low_pfn;
715 e820_register_active_regions(0, 0, highend_pfn); 653 memblock_x86_register_active_regions(0, 0, highend_pfn);
716 sparse_memory_present_with_active_regions(0); 654 sparse_memory_present_with_active_regions(0);
717 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 655 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
718 pages_to_mb(highend_pfn - highstart_pfn)); 656 pages_to_mb(highend_pfn - highstart_pfn));
719 num_physpages = highend_pfn; 657 num_physpages = highend_pfn;
720 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 658 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
721#else 659#else
722 e820_register_active_regions(0, 0, max_low_pfn); 660 memblock_x86_register_active_regions(0, 0, max_low_pfn);
723 sparse_memory_present_with_active_regions(0); 661 sparse_memory_present_with_active_regions(0);
724 num_physpages = max_low_pfn; 662 num_physpages = max_low_pfn;
725 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 663 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
@@ -750,68 +688,12 @@ static void __init zone_sizes_init(void)
750 free_area_init_nodes(max_zone_pfns); 688 free_area_init_nodes(max_zone_pfns);
751} 689}
752 690
753#ifndef CONFIG_NO_BOOTMEM
754static unsigned long __init setup_node_bootmem(int nodeid,
755 unsigned long start_pfn,
756 unsigned long end_pfn,
757 unsigned long bootmap)
758{
759 unsigned long bootmap_size;
760
761 /* don't touch min_low_pfn */
762 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
763 bootmap >> PAGE_SHIFT,
764 start_pfn, end_pfn);
765 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
766 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
768 nodeid, bootmap, bootmap + bootmap_size);
769 free_bootmem_with_active_regions(nodeid, end_pfn);
770
771 return bootmap + bootmap_size;
772}
773#endif
774
775void __init setup_bootmem_allocator(void) 691void __init setup_bootmem_allocator(void)
776{ 692{
777#ifndef CONFIG_NO_BOOTMEM
778 int nodeid;
779 unsigned long bootmap_size, bootmap;
780 /*
781 * Initialize the boot-time allocator (with low memory only):
782 */
783 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
784 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
785 PAGE_SIZE);
786 if (bootmap == -1L)
787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
790
791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 693 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
792 max_pfn_mapped<<PAGE_SHIFT); 694 max_pfn_mapped<<PAGE_SHIFT);
793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 695 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
794 696
795#ifndef CONFIG_NO_BOOTMEM
796 for_each_online_node(nodeid) {
797 unsigned long start_pfn, end_pfn;
798
799#ifdef CONFIG_NEED_MULTIPLE_NODES
800 start_pfn = node_start_pfn[nodeid];
801 end_pfn = node_end_pfn[nodeid];
802 if (start_pfn > max_low_pfn)
803 continue;
804 if (end_pfn > max_low_pfn)
805 end_pfn = max_low_pfn;
806#else
807 start_pfn = 0;
808 end_pfn = max_low_pfn;
809#endif
810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
811 bootmap);
812 }
813#endif
814
815 after_bootmem = 1; 697 after_bootmem = 1;
816} 698}
817 699
@@ -958,9 +840,6 @@ void __init mem_init(void)
958 840
959 if (boot_cpu_data.wp_works_ok < 0) 841 if (boot_cpu_data.wp_works_ok < 0)
960 test_wp_bit(); 842 test_wp_bit();
961
962 save_pg_dir();
963 zap_low_mappings(true);
964} 843}
965 844
966#ifdef CONFIG_MEMORY_HOTPLUG 845#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1070,8 +949,3 @@ void mark_rodata_ro(void)
1070} 949}
1071#endif 950#endif
1072 951
1073int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1074 int flags)
1075{
1076 return reserve_bootmem(phys, len, flags);
1077}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7c48ad4faca3..71a59296af80 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -21,6 +21,7 @@
21#include <linux/initrd.h> 21#include <linux/initrd.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/proc_fs.h> 25#include <linux/proc_fs.h>
25#include <linux/pci.h> 26#include <linux/pci.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
@@ -50,9 +51,6 @@
50#include <asm/numa.h> 51#include <asm/numa.h>
51#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
52#include <asm/init.h> 53#include <asm/init.h>
53#include <linux/bootmem.h>
54
55static unsigned long dma_reserve __initdata;
56 54
57static int __init parse_direct_gbpages_off(char *arg) 55static int __init parse_direct_gbpages_off(char *arg)
58{ 56{
@@ -98,6 +96,43 @@ static int __init nonx32_setup(char *str)
98__setup("noexec32=", nonx32_setup); 96__setup("noexec32=", nonx32_setup);
99 97
100/* 98/*
99 * When memory was added/removed make sure all the processes MM have
100 * suitable PGD entries in the local PGD level page.
101 */
102void sync_global_pgds(unsigned long start, unsigned long end)
103{
104 unsigned long address;
105
106 for (address = start; address <= end; address += PGDIR_SIZE) {
107 const pgd_t *pgd_ref = pgd_offset_k(address);
108 unsigned long flags;
109 struct page *page;
110
111 if (pgd_none(*pgd_ref))
112 continue;
113
114 spin_lock_irqsave(&pgd_lock, flags);
115 list_for_each_entry(page, &pgd_list, lru) {
116 pgd_t *pgd;
117 spinlock_t *pgt_lock;
118
119 pgd = (pgd_t *)page_address(page) + pgd_index(address);
120 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
121 spin_lock(pgt_lock);
122
123 if (pgd_none(*pgd))
124 set_pgd(pgd, *pgd_ref);
125 else
126 BUG_ON(pgd_page_vaddr(*pgd)
127 != pgd_page_vaddr(*pgd_ref));
128
129 spin_unlock(pgt_lock);
130 }
131 spin_unlock_irqrestore(&pgd_lock, flags);
132 }
133}
134
135/*
101 * NOTE: This function is marked __ref because it calls __init function 136 * NOTE: This function is marked __ref because it calls __init function
102 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 137 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103 */ 138 */
@@ -534,11 +569,13 @@ kernel_physical_mapping_init(unsigned long start,
534 unsigned long end, 569 unsigned long end,
535 unsigned long page_size_mask) 570 unsigned long page_size_mask)
536{ 571{
537 572 bool pgd_changed = false;
538 unsigned long next, last_map_addr = end; 573 unsigned long next, last_map_addr = end;
574 unsigned long addr;
539 575
540 start = (unsigned long)__va(start); 576 start = (unsigned long)__va(start);
541 end = (unsigned long)__va(end); 577 end = (unsigned long)__va(end);
578 addr = start;
542 579
543 for (; start < end; start = next) { 580 for (; start < end; start = next) {
544 pgd_t *pgd = pgd_offset_k(start); 581 pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +600,12 @@ kernel_physical_mapping_init(unsigned long start,
563 spin_lock(&init_mm.page_table_lock); 600 spin_lock(&init_mm.page_table_lock);
564 pgd_populate(&init_mm, pgd, __va(pud_phys)); 601 pgd_populate(&init_mm, pgd, __va(pud_phys));
565 spin_unlock(&init_mm.page_table_lock); 602 spin_unlock(&init_mm.page_table_lock);
603 pgd_changed = true;
566 } 604 }
605
606 if (pgd_changed)
607 sync_global_pgds(addr, end);
608
567 __flush_tlb_all(); 609 __flush_tlb_all();
568 610
569 return last_map_addr; 611 return last_map_addr;
@@ -573,23 +615,7 @@ kernel_physical_mapping_init(unsigned long start,
573void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 615void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
574 int acpi, int k8) 616 int acpi, int k8)
575{ 617{
576#ifndef CONFIG_NO_BOOTMEM 618 memblock_x86_register_active_regions(0, start_pfn, end_pfn);
577 unsigned long bootmap_size, bootmap;
578
579 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
580 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
581 PAGE_SIZE);
582 if (bootmap == -1L)
583 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
584 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
585 /* don't touch min_low_pfn */
586 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
587 0, end_pfn);
588 e820_register_active_regions(0, start_pfn, end_pfn);
589 free_bootmem_with_active_regions(0, end_pfn);
590#else
591 e820_register_active_regions(0, start_pfn, end_pfn);
592#endif
593} 619}
594#endif 620#endif
595 621
@@ -799,52 +825,6 @@ void mark_rodata_ro(void)
799 825
800#endif 826#endif
801 827
802int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
803 int flags)
804{
805#ifdef CONFIG_NUMA
806 int nid, next_nid;
807 int ret;
808#endif
809 unsigned long pfn = phys >> PAGE_SHIFT;
810
811 if (pfn >= max_pfn) {
812 /*
813 * This can happen with kdump kernels when accessing
814 * firmware tables:
815 */
816 if (pfn < max_pfn_mapped)
817 return -EFAULT;
818
819 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
820 phys, len);
821 return -EFAULT;
822 }
823
824 /* Should check here against the e820 map to avoid double free */
825#ifdef CONFIG_NUMA
826 nid = phys_to_nid(phys);
827 next_nid = phys_to_nid(phys + len - 1);
828 if (nid == next_nid)
829 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
830 else
831 ret = reserve_bootmem(phys, len, flags);
832
833 if (ret != 0)
834 return ret;
835
836#else
837 reserve_bootmem(phys, len, flags);
838#endif
839
840 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
841 dma_reserve += len / PAGE_SIZE;
842 set_dma_reserve(dma_reserve);
843 }
844
845 return 0;
846}
847
848int kern_addr_valid(unsigned long addr) 828int kern_addr_valid(unsigned long addr)
849{ 829{
850 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 830 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
@@ -1003,6 +983,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1003 } 983 }
1004 984
1005 } 985 }
986 sync_global_pgds((unsigned long)start_page, end);
1006 return 0; 987 return 0;
1007} 988}
1008 989
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 72fc70cf6184..7b179b499fa3 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
48} 48}
49EXPORT_SYMBOL_GPL(iomap_create_wc); 49EXPORT_SYMBOL_GPL(iomap_create_wc);
50 50
51void 51void iomap_free(resource_size_t base, unsigned long size)
52iomap_free(resource_size_t base, unsigned long size)
53{ 52{
54 io_free_memtype(base, base + size); 53 io_free_memtype(base, base + size);
55} 54}
56EXPORT_SYMBOL_GPL(iomap_free); 55EXPORT_SYMBOL_GPL(iomap_free);
57 56
58void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 57void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
59{ 58{
60 enum fixed_addresses idx;
61 unsigned long vaddr; 59 unsigned long vaddr;
60 int idx, type;
62 61
63 pagefault_disable(); 62 pagefault_disable();
64 63
65 debug_kmap_atomic(type); 64 type = kmap_atomic_idx_push();
66 idx = type + KM_TYPE_NR * smp_processor_id(); 65 idx = type + KM_TYPE_NR * smp_processor_id();
67 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 66 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
68 set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); 67 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
@@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
72} 71}
73 72
74/* 73/*
75 * Map 'pfn' using fixed map 'type' and protections 'prot' 74 * Map 'pfn' using protections 'prot'
76 */ 75 */
77void __iomem * 76void __iomem *
78iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 77iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
79{ 78{
80 /* 79 /*
81 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 80 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
@@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
86 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 85 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
87 prot = PAGE_KERNEL_UC_MINUS; 86 prot = PAGE_KERNEL_UC_MINUS;
88 87
89 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); 88 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
90} 89}
91EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 90EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
92 91
93void 92void
94iounmap_atomic(void __iomem *kvaddr, enum km_type type) 93iounmap_atomic(void __iomem *kvaddr)
95{ 94{
96 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 95 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
97 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
98 96
99 /* 97 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
100 * Force other mappings to Oops if they'll try to access this pte 98 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
101 * without first remap it. Keeping stale mappings around is a bad idea 99 int idx, type;
102 * also, in case the page changes cacheability attributes or becomes 100
103 * a protected page in a hypervisor. 101 type = kmap_atomic_idx();
104 */ 102 idx = type + KM_TYPE_NR * smp_processor_id();
105 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 103
104#ifdef CONFIG_DEBUG_HIGHMEM
105 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
106#endif
107 /*
108 * Force other mappings to Oops if they'll try to access this
109 * pte without first remap it. Keeping stale mappings around
110 * is a bad idea also, in case the page changes cacheability
111 * attributes or becomes a protected page in a hypervisor.
112 */
106 kpte_clear_flush(kmap_pte-idx, vaddr); 113 kpte_clear_flush(kmap_pte-idx, vaddr);
114 kmap_atomic_idx_pop();
115 }
107 116
108 pagefault_enable(); 117 pagefault_enable();
109} 118}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3ba6e0608c55..0369843511dc 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -362,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
362 return &bm_pte[pte_index(addr)]; 362 return &bm_pte[pte_index(addr)];
363} 363}
364 364
365bool __init is_early_ioremap_ptep(pte_t *ptep)
366{
367 return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
368}
369
365static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; 370static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
366 371
367void __init early_ioremap_init(void) 372void __init early_ioremap_init(void)
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 240f86462a83..804a3b6c6e14 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -11,6 +11,8 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/memblock.h>
15
14#include <asm/io.h> 16#include <asm/io.h>
15#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
16#include <linux/acpi.h> 18#include <linux/acpi.h>
@@ -22,7 +24,7 @@
22#include <asm/numa.h> 24#include <asm/numa.h>
23#include <asm/mpspec.h> 25#include <asm/mpspec.h>
24#include <asm/apic.h> 26#include <asm/apic.h>
25#include <asm/k8.h> 27#include <asm/amd_nb.h>
26 28
27static struct bootnode __initdata nodes[8]; 29static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; 30static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -222,7 +224,7 @@ int __init k8_scan_nodes(void)
222 for_each_node_mask(i, node_possible_map) { 224 for_each_node_mask(i, node_possible_map) {
223 int j; 225 int j;
224 226
225 e820_register_active_regions(i, 227 memblock_x86_register_active_regions(i,
226 nodes[i].start >> PAGE_SHIFT, 228 nodes[i].start >> PAGE_SHIFT,
227 nodes[i].end >> PAGE_SHIFT); 229 nodes[i].end >> PAGE_SHIFT);
228 for (j = apicid_base; j < cores + apicid_base; j++) 230 for (j = apicid_base; j < cores + apicid_base; j++)
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
631 if (!pte) 631 if (!pte)
632 return false; 632 return false;
633 633
634 WARN_ON_ONCE(in_nmi());
635
634 if (error_code & 2) 636 if (error_code & 2)
635 kmemcheck_access(regs, address, KMEMCHECK_WRITE); 637 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
636 else 638 else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
9 b == 0xf0 || b == 0xf2 || b == 0xf3 9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */ 10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e 12 || b == 0x64 || b == 0x65
13 /* Group 3 */ 13 /* Group 3 */
14 || b == 0x66 14 || b == 0x66
15 /* Group 4 */ 15 /* Group 4 */
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
new file mode 100644
index 000000000000..aa1169392b83
--- /dev/null
+++ b/arch/x86/mm/memblock.c
@@ -0,0 +1,348 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bitops.h>
5#include <linux/memblock.h>
6#include <linux/bootmem.h>
7#include <linux/mm.h>
8#include <linux/range.h>
9
10/* Check for already reserved areas */
11static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align)
12{
13 struct memblock_region *r;
14 u64 addr = *addrp, last;
15 u64 size = *sizep;
16 bool changed = false;
17
18again:
19 last = addr + size;
20 for_each_memblock(reserved, r) {
21 if (last > r->base && addr < r->base) {
22 size = r->base - addr;
23 changed = true;
24 goto again;
25 }
26 if (last > (r->base + r->size) && addr < (r->base + r->size)) {
27 addr = round_up(r->base + r->size, align);
28 size = last - addr;
29 changed = true;
30 goto again;
31 }
32 if (last <= (r->base + r->size) && addr >= r->base) {
33 *sizep = 0;
34 return false;
35 }
36 }
37 if (changed) {
38 *addrp = addr;
39 *sizep = size;
40 }
41 return changed;
42}
43
44/*
45 * Find next free range after start, and size is returned in *sizep
46 */
47u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
48{
49 struct memblock_region *r;
50
51 for_each_memblock(memory, r) {
52 u64 ei_start = r->base;
53 u64 ei_last = ei_start + r->size;
54 u64 addr;
55
56 addr = round_up(ei_start, align);
57 if (addr < start)
58 addr = round_up(start, align);
59 if (addr >= ei_last)
60 continue;
61 *sizep = ei_last - addr;
62 while (check_with_memblock_reserved_size(&addr, sizep, align))
63 ;
64
65 if (*sizep)
66 return addr;
67 }
68
69 return MEMBLOCK_ERROR;
70}
71
72static __init struct range *find_range_array(int count)
73{
74 u64 end, size, mem;
75 struct range *range;
76
77 size = sizeof(struct range) * count;
78 end = memblock.current_limit;
79
80 mem = memblock_find_in_range(0, end, size, sizeof(struct range));
81 if (mem == MEMBLOCK_ERROR)
82 panic("can not find more space for range array");
83
84 /*
85 * This range is tempoaray, so don't reserve it, it will not be
86 * overlapped because We will not alloccate new buffer before
87 * We discard this one
88 */
89 range = __va(mem);
90 memset(range, 0, size);
91
92 return range;
93}
94
95static void __init memblock_x86_subtract_reserved(struct range *range, int az)
96{
97 u64 final_start, final_end;
98 struct memblock_region *r;
99
100 /* Take out region array itself at first*/
101 memblock_free_reserved_regions();
102
103 memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
104
105 for_each_memblock(reserved, r) {
106 memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
107 final_start = PFN_DOWN(r->base);
108 final_end = PFN_UP(r->base + r->size);
109 if (final_start >= final_end)
110 continue;
111 subtract_range(range, az, final_start, final_end);
112 }
113
114 /* Put region array back ? */
115 memblock_reserve_reserved_regions();
116}
117
118struct count_data {
119 int nr;
120};
121
122static int __init count_work_fn(unsigned long start_pfn,
123 unsigned long end_pfn, void *datax)
124{
125 struct count_data *data = datax;
126
127 data->nr++;
128
129 return 0;
130}
131
132static int __init count_early_node_map(int nodeid)
133{
134 struct count_data data;
135
136 data.nr = 0;
137 work_with_active_regions(nodeid, count_work_fn, &data);
138
139 return data.nr;
140}
141
142int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
143 unsigned long start_pfn, unsigned long end_pfn)
144{
145 int count;
146 struct range *range;
147 int nr_range;
148
149 count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
150
151 range = find_range_array(count);
152 nr_range = 0;
153
154 /*
155 * Use early_node_map[] and memblock.reserved.region to get range array
156 * at first
157 */
158 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
159 subtract_range(range, count, 0, start_pfn);
160 subtract_range(range, count, end_pfn, -1ULL);
161
162 memblock_x86_subtract_reserved(range, count);
163 nr_range = clean_sort_range(range, count);
164
165 *rangep = range;
166 return nr_range;
167}
168
169int __init get_free_all_memory_range(struct range **rangep, int nodeid)
170{
171 unsigned long end_pfn = -1UL;
172
173#ifdef CONFIG_X86_32
174 end_pfn = max_low_pfn;
175#endif
176 return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
177}
178
179static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
180{
181 int i, count;
182 struct range *range;
183 int nr_range;
184 u64 final_start, final_end;
185 u64 free_size;
186 struct memblock_region *r;
187
188 count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
189
190 range = find_range_array(count);
191 nr_range = 0;
192
193 addr = PFN_UP(addr);
194 limit = PFN_DOWN(limit);
195
196 for_each_memblock(memory, r) {
197 final_start = PFN_UP(r->base);
198 final_end = PFN_DOWN(r->base + r->size);
199 if (final_start >= final_end)
200 continue;
201 if (final_start >= limit || final_end <= addr)
202 continue;
203
204 nr_range = add_range(range, count, nr_range, final_start, final_end);
205 }
206 subtract_range(range, count, 0, addr);
207 subtract_range(range, count, limit, -1ULL);
208
209 /* Subtract memblock.reserved.region in range ? */
210 if (!get_free)
211 goto sort_and_count_them;
212 for_each_memblock(reserved, r) {
213 final_start = PFN_DOWN(r->base);
214 final_end = PFN_UP(r->base + r->size);
215 if (final_start >= final_end)
216 continue;
217 if (final_start >= limit || final_end <= addr)
218 continue;
219
220 subtract_range(range, count, final_start, final_end);
221 }
222
223sort_and_count_them:
224 nr_range = clean_sort_range(range, count);
225
226 free_size = 0;
227 for (i = 0; i < nr_range; i++)
228 free_size += range[i].end - range[i].start;
229
230 return free_size << PAGE_SHIFT;
231}
232
233u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
234{
235 return __memblock_x86_memory_in_range(addr, limit, true);
236}
237
238u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
239{
240 return __memblock_x86_memory_in_range(addr, limit, false);
241}
242
243void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
244{
245 if (start == end)
246 return;
247
248 if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
249 return;
250
251 memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
252
253 memblock_reserve(start, end - start);
254}
255
256void __init memblock_x86_free_range(u64 start, u64 end)
257{
258 if (start == end)
259 return;
260
261 if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
262 return;
263
264 memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
265
266 memblock_free(start, end - start);
267}
268
269/*
270 * Need to call this function after memblock_x86_register_active_regions,
271 * so early_node_map[] is filled already.
272 */
273u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
274{
275 u64 addr;
276 addr = find_memory_core_early(nid, size, align, start, end);
277 if (addr != MEMBLOCK_ERROR)
278 return addr;
279
280 /* Fallback, should already have start end within node range */
281 return memblock_find_in_range(start, end, size, align);
282}
283
284/*
285 * Finds an active region in the address range from start_pfn to last_pfn and
286 * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
287 */
288static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
289 unsigned long start_pfn,
290 unsigned long last_pfn,
291 unsigned long *ei_startpfn,
292 unsigned long *ei_endpfn)
293{
294 u64 align = PAGE_SIZE;
295
296 *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
297 *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
298
299 /* Skip map entries smaller than a page */
300 if (*ei_startpfn >= *ei_endpfn)
301 return 0;
302
303 /* Skip if map is outside the node */
304 if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
305 return 0;
306
307 /* Check for overlaps */
308 if (*ei_startpfn < start_pfn)
309 *ei_startpfn = start_pfn;
310 if (*ei_endpfn > last_pfn)
311 *ei_endpfn = last_pfn;
312
313 return 1;
314}
315
316/* Walk the memblock.memory map and register active regions within a node */
317void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
318 unsigned long last_pfn)
319{
320 unsigned long ei_startpfn;
321 unsigned long ei_endpfn;
322 struct memblock_region *r;
323
324 for_each_memblock(memory, r)
325 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
326 &ei_startpfn, &ei_endpfn))
327 add_active_range(nid, ei_startpfn, ei_endpfn);
328}
329
330/*
331 * Find the hole size (in bytes) in the memory range.
332 * @start: starting address of the memory range to scan
333 * @end: ending address of the memory range to scan
334 */
335u64 __init memblock_x86_hole_size(u64 start, u64 end)
336{
337 unsigned long start_pfn = start >> PAGE_SHIFT;
338 unsigned long last_pfn = end >> PAGE_SHIFT;
339 unsigned long ei_startpfn, ei_endpfn, ram = 0;
340 struct memblock_region *r;
341
342 for_each_memblock(memory, r)
343 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
344 &ei_startpfn, &ei_endpfn))
345 ram += ei_endpfn - ei_startpfn;
346
347 return end - start - ((u64)ram << PAGE_SHIFT);
348}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 18d244f70205..92faf3a1c53e 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -6,8 +6,7 @@
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/pfn.h> 8#include <linux/pfn.h>
9 9#include <linux/memblock.h>
10#include <asm/e820.h>
11 10
12static u64 patterns[] __initdata = { 11static u64 patterns[] __initdata = {
13 0, 12 0,
@@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
35 (unsigned long long) pattern, 34 (unsigned long long) pattern,
36 (unsigned long long) start_bad, 35 (unsigned long long) start_bad,
37 (unsigned long long) end_bad); 36 (unsigned long long) end_bad);
38 reserve_early(start_bad, end_bad, "BAD RAM"); 37 memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
39} 38}
40 39
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 40static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74 u64 size = 0; 73 u64 size = 0;
75 74
76 while (start < end) { 75 while (start < end) {
77 start = find_e820_area_size(start, &size, 1); 76 start = memblock_x86_find_in_range_size(start, &size, 1);
78 77
79 /* done ? */ 78 /* done ? */
80 if (start >= end) 79 if (start >= end)
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 809baaaf48b1..84a3e4c9f277 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/bootmem.h> 26#include <linux/bootmem.h>
27#include <linux/memblock.h>
27#include <linux/mmzone.h> 28#include <linux/mmzone.h>
28#include <linux/highmem.h> 29#include <linux/highmem.h>
29#include <linux/initrd.h> 30#include <linux/initrd.h>
@@ -120,7 +121,7 @@ int __init get_memcfg_numa_flat(void)
120 121
121 node_start_pfn[0] = 0; 122 node_start_pfn[0] = 0;
122 node_end_pfn[0] = max_pfn; 123 node_end_pfn[0] = max_pfn;
123 e820_register_active_regions(0, 0, max_pfn); 124 memblock_x86_register_active_regions(0, 0, max_pfn);
124 memory_present(0, 0, max_pfn); 125 memory_present(0, 0, max_pfn);
125 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); 126 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
126 127
@@ -161,14 +162,14 @@ static void __init allocate_pgdat(int nid)
161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 162 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
162 else { 163 else {
163 unsigned long pgdat_phys; 164 unsigned long pgdat_phys;
164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, 165 pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
165 max_pfn_mapped<<PAGE_SHIFT, 166 max_pfn_mapped<<PAGE_SHIFT,
166 sizeof(pg_data_t), 167 sizeof(pg_data_t),
167 PAGE_SIZE); 168 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); 169 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 memset(buf, 0, sizeof(buf)); 170 memset(buf, 0, sizeof(buf));
170 sprintf(buf, "NODE_DATA %d", nid); 171 sprintf(buf, "NODE_DATA %d", nid);
171 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); 172 memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
172 } 173 }
173 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", 174 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
174 nid, (unsigned long)NODE_DATA(nid)); 175 nid, (unsigned long)NODE_DATA(nid));
@@ -291,15 +292,15 @@ static __init unsigned long calculate_numa_remap_pages(void)
291 PTRS_PER_PTE); 292 PTRS_PER_PTE);
292 node_kva_target <<= PAGE_SHIFT; 293 node_kva_target <<= PAGE_SHIFT;
293 do { 294 do {
294 node_kva_final = find_e820_area(node_kva_target, 295 node_kva_final = memblock_find_in_range(node_kva_target,
295 ((u64)node_end_pfn[nid])<<PAGE_SHIFT, 296 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
296 ((u64)size)<<PAGE_SHIFT, 297 ((u64)size)<<PAGE_SHIFT,
297 LARGE_PAGE_BYTES); 298 LARGE_PAGE_BYTES);
298 node_kva_target -= LARGE_PAGE_BYTES; 299 node_kva_target -= LARGE_PAGE_BYTES;
299 } while (node_kva_final == -1ULL && 300 } while (node_kva_final == MEMBLOCK_ERROR &&
300 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); 301 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
301 302
302 if (node_kva_final == -1ULL) 303 if (node_kva_final == MEMBLOCK_ERROR)
303 panic("Can not get kva ram\n"); 304 panic("Can not get kva ram\n");
304 305
305 node_remap_size[nid] = size; 306 node_remap_size[nid] = size;
@@ -318,15 +319,13 @@ static __init unsigned long calculate_numa_remap_pages(void)
318 * but we could have some hole in high memory, and it will only 319 * but we could have some hole in high memory, and it will only
319 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide 320 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
320 * to use it as free. 321 * to use it as free.
321 * So reserve_early here, hope we don't run out of that array 322 * So memblock_x86_reserve_range here, hope we don't run out of that array
322 */ 323 */
323 reserve_early(node_kva_final, 324 memblock_x86_reserve_range(node_kva_final,
324 node_kva_final+(((u64)size)<<PAGE_SHIFT), 325 node_kva_final+(((u64)size)<<PAGE_SHIFT),
325 "KVA RAM"); 326 "KVA RAM");
326 327
327 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; 328 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
328 remove_active_range(nid, node_remap_start_pfn[nid],
329 node_remap_start_pfn[nid] + size);
330 } 329 }
331 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", 330 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
332 reserve_pages); 331 reserve_pages);
@@ -367,14 +366,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
367 366
368 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); 367 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
369 do { 368 do {
370 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, 369 kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
371 max_low_pfn<<PAGE_SHIFT, 370 max_low_pfn<<PAGE_SHIFT,
372 kva_pages<<PAGE_SHIFT, 371 kva_pages<<PAGE_SHIFT,
373 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; 372 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
374 kva_target_pfn -= PTRS_PER_PTE; 373 kva_target_pfn -= PTRS_PER_PTE;
375 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); 374 } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
376 375
377 if (kva_start_pfn == -1UL) 376 if (kva_start_pfn == MEMBLOCK_ERROR)
378 panic("Can not get kva space\n"); 377 panic("Can not get kva space\n");
379 378
380 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", 379 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
@@ -382,7 +381,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
382 printk(KERN_INFO "max_pfn = %lx\n", max_pfn); 381 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
383 382
384 /* avoid clash with initrd */ 383 /* avoid clash with initrd */
385 reserve_early(kva_start_pfn<<PAGE_SHIFT, 384 memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
386 (kva_start_pfn + kva_pages)<<PAGE_SHIFT, 385 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
387 "KVA PG"); 386 "KVA PG");
388#ifdef CONFIG_HIGHMEM 387#ifdef CONFIG_HIGHMEM
@@ -419,9 +418,6 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
419 for_each_online_node(nid) { 418 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 419 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid; 420 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
425 } 421 }
426 422
427 setup_bootmem_allocator(); 423 setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..60f498511dd6 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/memblock.h>
10#include <linux/mmzone.h> 11#include <linux/mmzone.h>
11#include <linux/ctype.h> 12#include <linux/ctype.h>
12#include <linux/module.h> 13#include <linux/module.h>
@@ -18,7 +19,7 @@
18#include <asm/dma.h> 19#include <asm/dma.h>
19#include <asm/numa.h> 20#include <asm/numa.h>
20#include <asm/acpi.h> 21#include <asm/acpi.h>
21#include <asm/k8.h> 22#include <asm/amd_nb.h>
22 23
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 24struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 25EXPORT_SYMBOL(node_data);
@@ -86,16 +87,16 @@ static int __init allocate_cachealigned_memnodemap(void)
86 87
87 addr = 0x8000; 88 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 89 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, 90 nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT,
90 nodemap_size, L1_CACHE_BYTES); 91 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == -1UL) { 92 if (nodemap_addr == MEMBLOCK_ERROR) {
92 printk(KERN_ERR 93 printk(KERN_ERR
93 "NUMA: Unable to allocate Memory to Node hash map\n"); 94 "NUMA: Unable to allocate Memory to Node hash map\n");
94 nodemap_addr = nodemap_size = 0; 95 nodemap_addr = nodemap_size = 0;
95 return -1; 96 return -1;
96 } 97 }
97 memnodemap = phys_to_virt(nodemap_addr); 98 memnodemap = phys_to_virt(nodemap_addr);
98 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); 99 memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
99 100
100 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", 101 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
101 nodemap_addr, nodemap_addr + nodemap_size); 102 nodemap_addr, nodemap_addr + nodemap_size);
@@ -171,8 +172,8 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
171 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && 172 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
172 end > (MAX_DMA32_PFN<<PAGE_SHIFT)) 173 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
173 start = MAX_DMA32_PFN<<PAGE_SHIFT; 174 start = MAX_DMA32_PFN<<PAGE_SHIFT;
174 mem = find_e820_area(start, end, size, align); 175 mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
175 if (mem != -1L) 176 if (mem != MEMBLOCK_ERROR)
176 return __va(mem); 177 return __va(mem);
177 178
178 /* extend the search scope */ 179 /* extend the search scope */
@@ -181,8 +182,8 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
181 start = MAX_DMA32_PFN<<PAGE_SHIFT; 182 start = MAX_DMA32_PFN<<PAGE_SHIFT;
182 else 183 else
183 start = MAX_DMA_PFN<<PAGE_SHIFT; 184 start = MAX_DMA_PFN<<PAGE_SHIFT;
184 mem = find_e820_area(start, end, size, align); 185 mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
185 if (mem != -1L) 186 if (mem != MEMBLOCK_ERROR)
186 return __va(mem); 187 return __va(mem);
187 188
188 printk(KERN_ERR "Cannot find %lu bytes in node %d\n", 189 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
@@ -198,10 +199,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
198 unsigned long start_pfn, last_pfn, nodedata_phys; 199 unsigned long start_pfn, last_pfn, nodedata_phys;
199 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 200 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
200 int nid; 201 int nid;
201#ifndef CONFIG_NO_BOOTMEM
202 unsigned long bootmap_start, bootmap_pages, bootmap_size;
203 void *bootmap;
204#endif
205 202
206 if (!end) 203 if (!end)
207 return; 204 return;
@@ -226,7 +223,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
226 if (node_data[nodeid] == NULL) 223 if (node_data[nodeid] == NULL)
227 return; 224 return;
228 nodedata_phys = __pa(node_data[nodeid]); 225 nodedata_phys = __pa(node_data[nodeid]);
229 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); 226 memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
230 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, 227 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
231 nodedata_phys + pgdat_size - 1); 228 nodedata_phys + pgdat_size - 1);
232 nid = phys_to_nid(nodedata_phys); 229 nid = phys_to_nid(nodedata_phys);
@@ -238,47 +235,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
238 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 235 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
239 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 236 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
240 237
241#ifndef CONFIG_NO_BOOTMEM
242 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
243
244 /*
245 * Find a place for the bootmem map
246 * nodedata_phys could be on other nodes by alloc_bootmem,
247 * so need to sure bootmap_start not to be small, otherwise
248 * early_node_mem will get that with find_e820_area instead
249 * of alloc_bootmem, that could clash with reserved range
250 */
251 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
252 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
253 /*
254 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
255 * to use that to align to PAGE_SIZE
256 */
257 bootmap = early_node_mem(nodeid, bootmap_start, end,
258 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
259 if (bootmap == NULL) {
260 free_early(nodedata_phys, nodedata_phys + pgdat_size);
261 node_data[nodeid] = NULL;
262 return;
263 }
264 bootmap_start = __pa(bootmap);
265 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
266 "BOOTMAP");
267
268 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
269 bootmap_start >> PAGE_SHIFT,
270 start_pfn, last_pfn);
271
272 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
273 bootmap_start, bootmap_start + bootmap_size - 1,
274 bootmap_pages);
275 nid = phys_to_nid(bootmap_start);
276 if (nid != nodeid)
277 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
278
279 free_bootmem_with_active_regions(nodeid, end);
280#endif
281
282 node_set_online(nodeid); 238 node_set_online(nodeid);
283} 239}
284 240
@@ -416,7 +372,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
416 nr_nodes = MAX_NUMNODES; 372 nr_nodes = MAX_NUMNODES;
417 } 373 }
418 374
419 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; 375 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
420 /* 376 /*
421 * Calculate the number of big nodes that can be allocated as a result 377 * Calculate the number of big nodes that can be allocated as a result
422 * of consolidating the remainder. 378 * of consolidating the remainder.
@@ -452,7 +408,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
452 * non-reserved memory is less than the per-node size. 408 * non-reserved memory is less than the per-node size.
453 */ 409 */
454 while (end - physnodes[i].start - 410 while (end - physnodes[i].start -
455 e820_hole_size(physnodes[i].start, end) < size) { 411 memblock_x86_hole_size(physnodes[i].start, end) < size) {
456 end += FAKE_NODE_MIN_SIZE; 412 end += FAKE_NODE_MIN_SIZE;
457 if (end > physnodes[i].end) { 413 if (end > physnodes[i].end) {
458 end = physnodes[i].end; 414 end = physnodes[i].end;
@@ -466,7 +422,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
466 * this one must extend to the boundary. 422 * this one must extend to the boundary.
467 */ 423 */
468 if (end < dma32_end && dma32_end - end - 424 if (end < dma32_end && dma32_end - end -
469 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 425 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
470 end = dma32_end; 426 end = dma32_end;
471 427
472 /* 428 /*
@@ -475,7 +431,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
475 * physical node. 431 * physical node.
476 */ 432 */
477 if (physnodes[i].end - end - 433 if (physnodes[i].end - end -
478 e820_hole_size(end, physnodes[i].end) < size) 434 memblock_x86_hole_size(end, physnodes[i].end) < size)
479 end = physnodes[i].end; 435 end = physnodes[i].end;
480 436
481 /* 437 /*
@@ -503,7 +459,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
503{ 459{
504 u64 end = start + size; 460 u64 end = start + size;
505 461
506 while (end - start - e820_hole_size(start, end) < size) { 462 while (end - start - memblock_x86_hole_size(start, end) < size) {
507 end += FAKE_NODE_MIN_SIZE; 463 end += FAKE_NODE_MIN_SIZE;
508 if (end > max_addr) { 464 if (end > max_addr) {
509 end = max_addr; 465 end = max_addr;
@@ -532,7 +488,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
532 * creates a uniform distribution of node sizes across the entire 488 * creates a uniform distribution of node sizes across the entire
533 * machine (but not necessarily over physical nodes). 489 * machine (but not necessarily over physical nodes).
534 */ 490 */
535 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / 491 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
536 MAX_NUMNODES; 492 MAX_NUMNODES;
537 min_size = max(min_size, FAKE_NODE_MIN_SIZE); 493 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
538 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) 494 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
@@ -565,7 +521,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
565 * this one must extend to the boundary. 521 * this one must extend to the boundary.
566 */ 522 */
567 if (end < dma32_end && dma32_end - end - 523 if (end < dma32_end && dma32_end - end -
568 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 524 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
569 end = dma32_end; 525 end = dma32_end;
570 526
571 /* 527 /*
@@ -574,7 +530,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
574 * physical node. 530 * physical node.
575 */ 531 */
576 if (physnodes[i].end - end - 532 if (physnodes[i].end - end -
577 e820_hole_size(end, physnodes[i].end) < size) 533 memblock_x86_hole_size(end, physnodes[i].end) < size)
578 end = physnodes[i].end; 534 end = physnodes[i].end;
579 535
580 /* 536 /*
@@ -638,7 +594,7 @@ static int __init numa_emulation(unsigned long start_pfn,
638 */ 594 */
639 remove_all_active_ranges(); 595 remove_all_active_ranges();
640 for_each_node_mask(i, node_possible_map) { 596 for_each_node_mask(i, node_possible_map) {
641 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 597 memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
642 nodes[i].end >> PAGE_SHIFT); 598 nodes[i].end >> PAGE_SHIFT);
643 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 599 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
644 } 600 }
@@ -691,7 +647,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
691 node_set(0, node_possible_map); 647 node_set(0, node_possible_map);
692 for (i = 0; i < nr_cpu_ids; i++) 648 for (i = 0; i < nr_cpu_ids; i++)
693 numa_set_node(i, 0); 649 numa_set_node(i, 0);
694 e820_register_active_regions(0, start_pfn, last_pfn); 650 memblock_x86_register_active_regions(0, start_pfn, last_pfn);
695 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); 651 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
696} 652}
697 653
@@ -703,9 +659,7 @@ unsigned long __init numa_free_all_bootmem(void)
703 for_each_online_node(i) 659 for_each_online_node(i)
704 pages += free_all_bootmem_node(NODE_DATA(i)); 660 pages += free_all_bootmem_node(NODE_DATA(i));
705 661
706#ifdef CONFIG_NO_BOOTMEM
707 pages += free_all_memory_core_early(MAX_NUMNODES); 662 pages += free_all_memory_core_early(MAX_NUMNODES);
708#endif
709 663
710 return pages; 664 return pages;
711} 665}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8be8c7d7bc89 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
87#define UNSHARED_PTRS_PER_PGD \ 87#define UNSHARED_PTRS_PER_PGD \
88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89 89
90static void pgd_ctor(pgd_t *pgd) 90
91static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
92{
93 BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
94 virt_to_page(pgd)->index = (pgoff_t)mm;
95}
96
97struct mm_struct *pgd_page_get_mm(struct page *page)
98{
99 return (struct mm_struct *)page->index;
100}
101
102static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
91{ 103{
92 /* If the pgd points to a shared pagetable level (either the 104 /* If the pgd points to a shared pagetable level (either the
93 ptes in non-PAE, or shared PMD in PAE), then just copy the 105 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
98 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 110 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
99 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 111 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
100 KERNEL_PGD_PTRS); 112 KERNEL_PGD_PTRS);
101 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
102 __pa(swapper_pg_dir) >> PAGE_SHIFT,
103 KERNEL_PGD_BOUNDARY,
104 KERNEL_PGD_PTRS);
105 } 113 }
106 114
107 /* list required to sync kernel mapping updates */ 115 /* list required to sync kernel mapping updates */
108 if (!SHARED_KERNEL_PMD) 116 if (!SHARED_KERNEL_PMD) {
117 pgd_set_mm(pgd, mm);
109 pgd_list_add(pgd); 118 pgd_list_add(pgd);
119 }
110} 120}
111 121
112static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
272 */ 282 */
273 spin_lock_irqsave(&pgd_lock, flags); 283 spin_lock_irqsave(&pgd_lock, flags);
274 284
275 pgd_ctor(pgd); 285 pgd_ctor(mm, pgd);
276 pgd_prepopulate_pmd(mm, pgd, pmds); 286 pgd_prepopulate_pmd(mm, pgd, pmds);
277 287
278 spin_unlock_irqrestore(&pgd_lock, flags); 288 spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 9324f13492d5..a17dffd136c1 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -25,6 +25,7 @@
25 */ 25 */
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/memblock.h>
28#include <linux/mmzone.h> 29#include <linux/mmzone.h>
29#include <linux/acpi.h> 30#include <linux/acpi.h>
30#include <linux/nodemask.h> 31#include <linux/nodemask.h>
@@ -264,7 +265,7 @@ int __init get_memcfg_from_srat(void)
264 if (node_read_chunk(chunk->nid, chunk)) 265 if (node_read_chunk(chunk->nid, chunk))
265 continue; 266 continue;
266 267
267 e820_register_active_regions(chunk->nid, chunk->start_pfn, 268 memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn)); 269 min(chunk->end_pfn, max_pfn));
269 } 270 }
270 /* for out of order entries in SRAT */ 271 /* for out of order entries in SRAT */
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 9c0d0d399c30..a35cb9d8b060 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -16,6 +16,7 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/topology.h> 17#include <linux/topology.h>
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/memblock.h>
19#include <linux/mm.h> 20#include <linux/mm.h>
20#include <asm/proto.h> 21#include <asm/proto.h>
21#include <asm/numa.h> 22#include <asm/numa.h>
@@ -98,15 +99,15 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
98 unsigned long phys; 99 unsigned long phys;
99 100
100 length = slit->header.length; 101 length = slit->header.length;
101 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length, 102 phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
102 PAGE_SIZE); 103 PAGE_SIZE);
103 104
104 if (phys == -1L) 105 if (phys == MEMBLOCK_ERROR)
105 panic(" Can not save slit!\n"); 106 panic(" Can not save slit!\n");
106 107
107 acpi_slit = __va(phys); 108 acpi_slit = __va(phys);
108 memcpy(acpi_slit, slit, length); 109 memcpy(acpi_slit, slit, length);
109 reserve_early(phys, phys + length, "ACPI SLIT"); 110 memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT");
110} 111}
111 112
112/* Callback for Proximity Domain -> x2APIC mapping */ 113/* Callback for Proximity Domain -> x2APIC mapping */
@@ -324,7 +325,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
324 pxmram = 0; 325 pxmram = 0;
325 } 326 }
326 327
327 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); 328 e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
328 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ 329 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
329 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { 330 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
330 printk(KERN_ERR 331 printk(KERN_ERR
@@ -421,7 +422,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
421 } 422 }
422 423
423 for (i = 0; i < num_node_memblks; i++) 424 for (i = 0; i < num_node_memblks; i++)
424 e820_register_active_regions(memblk_nodeid[i], 425 memblock_x86_register_active_regions(memblk_nodeid[i],
425 node_memblk_range[i].start >> PAGE_SHIFT, 426 node_memblk_range[i].start >> PAGE_SHIFT,
426 node_memblk_range[i].end >> PAGE_SHIFT); 427 node_memblk_range[i].end >> PAGE_SHIFT);
427 428
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
5#include <linux/smp.h> 5#include <linux/smp.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cpu.h>
8 9
9#include <asm/tlbflush.h> 10#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
52 want false sharing in the per cpu data segment. */ 53 want false sharing in the per cpu data segment. */
53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; 54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54 55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57
55/* 58/*
56 * We cannot call mmdrop() because we are in interrupt context, 59 * We cannot call mmdrop() because we are in interrupt context,
57 * instead update mm->cpu_vm_mask. 60 * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173 union smp_flush_state *f; 176 union smp_flush_state *f;
174 177
175 /* Caller has disabled preemption */ 178 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 179 sender = this_cpu_read(tlb_vector_offset);
177 f = &flush_state[sender]; 180 f = &flush_state[sender];
178 181
179 /* 182 /*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
218 flush_tlb_others_ipi(cpumask, mm, va); 221 flush_tlb_others_ipi(cpumask, mm, va);
219} 222}
220 223
224static void __cpuinit calculate_tlb_offset(void)
225{
226 int cpu, node, nr_node_vecs;
227 /*
228 * we are changing tlb_vector_offset for each CPU in runtime, but this
229 * will not cause inconsistency, as the write is atomic under X86. we
230 * might see more lock contentions in a short time, but after all CPU's
231 * tlb_vector_offset are changed, everything should go normal
232 *
233 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
234 * waste some vectors.
235 **/
236 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
237 nr_node_vecs = 1;
238 else
239 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
240
241 for_each_online_node(node) {
242 int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
243 nr_node_vecs;
244 int cpu_offset = 0;
245 for_each_cpu(cpu, cpumask_of_node(node)) {
246 per_cpu(tlb_vector_offset, cpu) = node_offset +
247 cpu_offset;
248 cpu_offset++;
249 cpu_offset = cpu_offset % nr_node_vecs;
250 }
251 }
252}
253
254static int tlb_cpuhp_notify(struct notifier_block *n,
255 unsigned long action, void *hcpu)
256{
257 switch (action & 0xf) {
258 case CPU_ONLINE:
259 case CPU_DEAD:
260 calculate_tlb_offset();
261 }
262 return NOTIFY_OK;
263}
264
221static int __cpuinit init_smp_flush(void) 265static int __cpuinit init_smp_flush(void)
222{ 266{
223 int i; 267 int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 269 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 raw_spin_lock_init(&flush_state[i].tlbstate_lock); 270 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 271
272 calculate_tlb_offset();
273 hotcpu_notifier(tlb_cpuhp_notify, 0);
228 return 0; 274 return 0;
229} 275}
230core_initcall(init_smp_flush); 276core_initcall(init_smp_flush);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 3855096c59b8..2d49d4e19a36 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -14,6 +14,7 @@
14#include <asm/ptrace.h> 14#include <asm/ptrace.h>
15#include <asm/uaccess.h> 15#include <asm/uaccess.h>
16#include <asm/stacktrace.h> 16#include <asm/stacktrace.h>
17#include <linux/compat.h>
17 18
18static void backtrace_warning_symbol(void *data, char *msg, 19static void backtrace_warning_symbol(void *data, char *msg,
19 unsigned long symbol) 20 unsigned long symbol)
@@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = {
48 .walk_stack = print_context_stack, 49 .walk_stack = print_context_stack,
49}; 50};
50 51
51struct frame_head { 52#ifdef CONFIG_COMPAT
52 struct frame_head *bp; 53static struct stack_frame_ia32 *
53 unsigned long ret; 54dump_user_backtrace_32(struct stack_frame_ia32 *head)
54} __attribute__((packed));
55
56static struct frame_head *dump_user_backtrace(struct frame_head *head)
57{ 55{
58 struct frame_head bufhead[2]; 56 struct stack_frame_ia32 bufhead[2];
57 struct stack_frame_ia32 *fp;
59 58
60 /* Also check accessibility of one struct frame_head beyond */ 59 /* Also check accessibility of one struct frame_head beyond */
61 if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) 60 if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
@@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head)
63 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) 62 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
64 return NULL; 63 return NULL;
65 64
66 oprofile_add_trace(bufhead[0].ret); 65 fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
66
67 oprofile_add_trace(bufhead[0].return_address);
68
69 /* frame pointers should strictly progress back up the stack
70 * (towards higher addresses) */
71 if (head >= fp)
72 return NULL;
73
74 return fp;
75}
76
77static inline int
78x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
79{
80 struct stack_frame_ia32 *head;
81
82 /* User process is 32-bit */
83 if (!current || !test_thread_flag(TIF_IA32))
84 return 0;
85
86 head = (struct stack_frame_ia32 *) regs->bp;
87 while (depth-- && head)
88 head = dump_user_backtrace_32(head);
89
90 return 1;
91}
92
93#else
94static inline int
95x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
96{
97 return 0;
98}
99#endif /* CONFIG_COMPAT */
100
101static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
102{
103 struct stack_frame bufhead[2];
104
105 /* Also check accessibility of one struct stack_frame beyond */
106 if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
107 return NULL;
108 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
109 return NULL;
110
111 oprofile_add_trace(bufhead[0].return_address);
67 112
68 /* frame pointers should strictly progress back up the stack 113 /* frame pointers should strictly progress back up the stack
69 * (towards higher addresses) */ 114 * (towards higher addresses) */
70 if (head >= bufhead[0].bp) 115 if (head >= bufhead[0].next_frame)
71 return NULL; 116 return NULL;
72 117
73 return bufhead[0].bp; 118 return bufhead[0].next_frame;
74} 119}
75 120
76void 121void
77x86_backtrace(struct pt_regs * const regs, unsigned int depth) 122x86_backtrace(struct pt_regs * const regs, unsigned int depth)
78{ 123{
79 struct frame_head *head = (struct frame_head *)frame_pointer(regs); 124 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
80 125
81 if (!user_mode_vm(regs)) { 126 if (!user_mode_vm(regs)) {
82 unsigned long stack = kernel_stack_pointer(regs); 127 unsigned long stack = kernel_stack_pointer(regs);
@@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
86 return; 131 return;
87 } 132 }
88 133
134 if (x86_backtrace_32(regs, depth))
135 return;
136
89 while (depth-- && head) 137 while (depth-- && head)
90 head = dump_user_backtrace(head); 138 head = dump_user_backtrace(head);
91} 139}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f1575c9a2572..4e8baad36d37 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -695,9 +695,6 @@ static int __init ppro_init(char **cpu_type)
695 return 1; 695 return 1;
696} 696}
697 697
698/* in order to get sysfs right */
699static int using_nmi;
700
701int __init op_nmi_init(struct oprofile_operations *ops) 698int __init op_nmi_init(struct oprofile_operations *ops)
702{ 699{
703 __u8 vendor = boot_cpu_data.x86_vendor; 700 __u8 vendor = boot_cpu_data.x86_vendor;
@@ -705,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
705 char *cpu_type = NULL; 702 char *cpu_type = NULL;
706 int ret = 0; 703 int ret = 0;
707 704
708 using_nmi = 0;
709
710 if (!cpu_has_apic) 705 if (!cpu_has_apic)
711 return -ENODEV; 706 return -ENODEV;
712 707
@@ -731,6 +726,12 @@ int __init op_nmi_init(struct oprofile_operations *ops)
731 case 0x11: 726 case 0x11:
732 cpu_type = "x86-64/family11h"; 727 cpu_type = "x86-64/family11h";
733 break; 728 break;
729 case 0x12:
730 cpu_type = "x86-64/family12h";
731 break;
732 case 0x14:
733 cpu_type = "x86-64/family14h";
734 break;
734 default: 735 default:
735 return -ENODEV; 736 return -ENODEV;
736 } 737 }
@@ -790,13 +791,11 @@ int __init op_nmi_init(struct oprofile_operations *ops)
790 if (ret) 791 if (ret)
791 return ret; 792 return ret;
792 793
793 using_nmi = 1;
794 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 794 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
795 return 0; 795 return 0;
796} 796}
797 797
798void op_nmi_exit(void) 798void op_nmi_exit(void)
799{ 799{
800 if (using_nmi) 800 exit_sysfs();
801 exit_sysfs();
802} 801}
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b67a6b5aa8d4..a011bcc0f943 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -48,31 +48,53 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS];
48 48
49static u32 ibs_caps; 49static u32 ibs_caps;
50 50
51struct op_ibs_config { 51struct ibs_config {
52 unsigned long op_enabled; 52 unsigned long op_enabled;
53 unsigned long fetch_enabled; 53 unsigned long fetch_enabled;
54 unsigned long max_cnt_fetch; 54 unsigned long max_cnt_fetch;
55 unsigned long max_cnt_op; 55 unsigned long max_cnt_op;
56 unsigned long rand_en; 56 unsigned long rand_en;
57 unsigned long dispatched_ops; 57 unsigned long dispatched_ops;
58 unsigned long branch_target;
58}; 59};
59 60
60static struct op_ibs_config ibs_config; 61struct ibs_state {
61static u64 ibs_op_ctl; 62 u64 ibs_op_ctl;
63 int branch_target;
64 unsigned long sample_size;
65};
66
67static struct ibs_config ibs_config;
68static struct ibs_state ibs_state;
62 69
63/* 70/*
64 * IBS cpuid feature detection 71 * IBS cpuid feature detection
65 */ 72 */
66 73
67#define IBS_CPUID_FEATURES 0x8000001b 74#define IBS_CPUID_FEATURES 0x8000001b
68 75
69/* 76/*
70 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but 77 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
71 * bit 0 is used to indicate the existence of IBS. 78 * bit 0 is used to indicate the existence of IBS.
72 */ 79 */
73#define IBS_CAPS_AVAIL (1LL<<0) 80#define IBS_CAPS_AVAIL (1U<<0)
74#define IBS_CAPS_RDWROPCNT (1LL<<3) 81#define IBS_CAPS_FETCHSAM (1U<<1)
75#define IBS_CAPS_OPCNT (1LL<<4) 82#define IBS_CAPS_OPSAM (1U<<2)
83#define IBS_CAPS_RDWROPCNT (1U<<3)
84#define IBS_CAPS_OPCNT (1U<<4)
85#define IBS_CAPS_BRNTRGT (1U<<5)
86#define IBS_CAPS_OPCNTEXT (1U<<6)
87
88#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
89 | IBS_CAPS_FETCHSAM \
90 | IBS_CAPS_OPSAM)
91
92/*
93 * IBS APIC setup
94 */
95#define IBSCTL 0x1cc
96#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
97#define IBSCTL_LVT_OFFSET_MASK 0x0F
76 98
77/* 99/*
78 * IBS randomization macros 100 * IBS randomization macros
@@ -92,12 +114,12 @@ static u32 get_ibs_caps(void)
92 /* check IBS cpuid feature flags */ 114 /* check IBS cpuid feature flags */
93 max_level = cpuid_eax(0x80000000); 115 max_level = cpuid_eax(0x80000000);
94 if (max_level < IBS_CPUID_FEATURES) 116 if (max_level < IBS_CPUID_FEATURES)
95 return IBS_CAPS_AVAIL; 117 return IBS_CAPS_DEFAULT;
96 118
97 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); 119 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
98 if (!(ibs_caps & IBS_CAPS_AVAIL)) 120 if (!(ibs_caps & IBS_CAPS_AVAIL))
99 /* cpuid flags not valid */ 121 /* cpuid flags not valid */
100 return IBS_CAPS_AVAIL; 122 return IBS_CAPS_DEFAULT;
101 123
102 return ibs_caps; 124 return ibs_caps;
103} 125}
@@ -190,8 +212,8 @@ op_amd_handle_ibs(struct pt_regs * const regs,
190 rdmsrl(MSR_AMD64_IBSOPCTL, ctl); 212 rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
191 if (ctl & IBS_OP_VAL) { 213 if (ctl & IBS_OP_VAL) {
192 rdmsrl(MSR_AMD64_IBSOPRIP, val); 214 rdmsrl(MSR_AMD64_IBSOPRIP, val);
193 oprofile_write_reserve(&entry, regs, val, 215 oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE,
194 IBS_OP_CODE, IBS_OP_SIZE); 216 ibs_state.sample_size);
195 oprofile_add_data64(&entry, val); 217 oprofile_add_data64(&entry, val);
196 rdmsrl(MSR_AMD64_IBSOPDATA, val); 218 rdmsrl(MSR_AMD64_IBSOPDATA, val);
197 oprofile_add_data64(&entry, val); 219 oprofile_add_data64(&entry, val);
@@ -203,10 +225,14 @@ op_amd_handle_ibs(struct pt_regs * const regs,
203 oprofile_add_data64(&entry, val); 225 oprofile_add_data64(&entry, val);
204 rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); 226 rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
205 oprofile_add_data64(&entry, val); 227 oprofile_add_data64(&entry, val);
228 if (ibs_state.branch_target) {
229 rdmsrl(MSR_AMD64_IBSBRTARGET, val);
230 oprofile_add_data(&entry, (unsigned long)val);
231 }
206 oprofile_write_commit(&entry); 232 oprofile_write_commit(&entry);
207 233
208 /* reenable the IRQ */ 234 /* reenable the IRQ */
209 ctl = op_amd_randomize_ibs_op(ibs_op_ctl); 235 ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
210 wrmsrl(MSR_AMD64_IBSOPCTL, ctl); 236 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
211 } 237 }
212 } 238 }
@@ -219,21 +245,32 @@ static inline void op_amd_start_ibs(void)
219 if (!ibs_caps) 245 if (!ibs_caps)
220 return; 246 return;
221 247
248 memset(&ibs_state, 0, sizeof(ibs_state));
249
250 /*
251 * Note: Since the max count settings may out of range we
252 * write back the actual used values so that userland can read
253 * it.
254 */
255
222 if (ibs_config.fetch_enabled) { 256 if (ibs_config.fetch_enabled) {
223 val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; 257 val = ibs_config.max_cnt_fetch >> 4;
258 val = min(val, IBS_FETCH_MAX_CNT);
259 ibs_config.max_cnt_fetch = val << 4;
224 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; 260 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
225 val |= IBS_FETCH_ENABLE; 261 val |= IBS_FETCH_ENABLE;
226 wrmsrl(MSR_AMD64_IBSFETCHCTL, val); 262 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
227 } 263 }
228 264
229 if (ibs_config.op_enabled) { 265 if (ibs_config.op_enabled) {
230 ibs_op_ctl = ibs_config.max_cnt_op >> 4; 266 val = ibs_config.max_cnt_op >> 4;
231 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { 267 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
232 /* 268 /*
233 * IbsOpCurCnt not supported. See 269 * IbsOpCurCnt not supported. See
234 * op_amd_randomize_ibs_op() for details. 270 * op_amd_randomize_ibs_op() for details.
235 */ 271 */
236 ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); 272 val = clamp(val, 0x0081ULL, 0xFF80ULL);
273 ibs_config.max_cnt_op = val << 4;
237 } else { 274 } else {
238 /* 275 /*
239 * The start value is randomized with a 276 * The start value is randomized with a
@@ -241,13 +278,24 @@ static inline void op_amd_start_ibs(void)
241 * with the half of the randomized range. Also 278 * with the half of the randomized range. Also
242 * avoid underflows. 279 * avoid underflows.
243 */ 280 */
244 ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, 281 val += IBS_RANDOM_MAXCNT_OFFSET;
245 IBS_OP_MAX_CNT); 282 if (ibs_caps & IBS_CAPS_OPCNTEXT)
283 val = min(val, IBS_OP_MAX_CNT_EXT);
284 else
285 val = min(val, IBS_OP_MAX_CNT);
286 ibs_config.max_cnt_op =
287 (val - IBS_RANDOM_MAXCNT_OFFSET) << 4;
288 }
289 val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT);
290 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
291 val |= IBS_OP_ENABLE;
292 ibs_state.ibs_op_ctl = val;
293 ibs_state.sample_size = IBS_OP_SIZE;
294 if (ibs_config.branch_target) {
295 ibs_state.branch_target = 1;
296 ibs_state.sample_size++;
246 } 297 }
247 if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) 298 val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
248 ibs_op_ctl |= IBS_OP_CNT_CTL;
249 ibs_op_ctl |= IBS_OP_ENABLE;
250 val = op_amd_randomize_ibs_op(ibs_op_ctl);
251 wrmsrl(MSR_AMD64_IBSOPCTL, val); 299 wrmsrl(MSR_AMD64_IBSOPCTL, val);
252 } 300 }
253} 301}
@@ -266,6 +314,70 @@ static void op_amd_stop_ibs(void)
266 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 314 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
267} 315}
268 316
317static inline int eilvt_is_available(int offset)
318{
319 /* check if we may assign a vector */
320 return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
321}
322
323static inline int ibs_eilvt_valid(void)
324{
325 int offset;
326 u64 val;
327
328 rdmsrl(MSR_AMD64_IBSCTL, val);
329 offset = val & IBSCTL_LVT_OFFSET_MASK;
330
331 if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
332 pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
333 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
334 return 0;
335 }
336
337 if (!eilvt_is_available(offset)) {
338 pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
339 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
340 return 0;
341 }
342
343 return 1;
344}
345
346static inline int get_ibs_offset(void)
347{
348 u64 val;
349
350 rdmsrl(MSR_AMD64_IBSCTL, val);
351 if (!(val & IBSCTL_LVT_OFFSET_VALID))
352 return -EINVAL;
353
354 return val & IBSCTL_LVT_OFFSET_MASK;
355}
356
357static void setup_APIC_ibs(void)
358{
359 int offset;
360
361 offset = get_ibs_offset();
362 if (offset < 0)
363 goto failed;
364
365 if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
366 return;
367failed:
368 pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n",
369 smp_processor_id());
370}
371
372static void clear_APIC_ibs(void)
373{
374 int offset;
375
376 offset = get_ibs_offset();
377 if (offset >= 0)
378 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
379}
380
269#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 381#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
270 382
271static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, 383static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
@@ -376,13 +488,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
376 } 488 }
377 489
378 if (ibs_caps) 490 if (ibs_caps)
379 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); 491 setup_APIC_ibs();
380} 492}
381 493
382static void op_amd_cpu_shutdown(void) 494static void op_amd_cpu_shutdown(void)
383{ 495{
384 if (ibs_caps) 496 if (ibs_caps)
385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); 497 clear_APIC_ibs();
386} 498}
387 499
388static int op_amd_check_ctrs(struct pt_regs * const regs, 500static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -445,16 +557,11 @@ static void op_amd_stop(struct op_msrs const * const msrs)
445 op_amd_stop_ibs(); 557 op_amd_stop_ibs();
446} 558}
447 559
448static int __init_ibs_nmi(void) 560static int setup_ibs_ctl(int ibs_eilvt_off)
449{ 561{
450#define IBSCTL_LVTOFFSETVAL (1 << 8)
451#define IBSCTL 0x1cc
452 struct pci_dev *cpu_cfg; 562 struct pci_dev *cpu_cfg;
453 int nodes; 563 int nodes;
454 u32 value = 0; 564 u32 value = 0;
455 u8 ibs_eilvt_off;
456
457 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
458 565
459 nodes = 0; 566 nodes = 0;
460 cpu_cfg = NULL; 567 cpu_cfg = NULL;
@@ -466,24 +573,63 @@ static int __init_ibs_nmi(void)
466 break; 573 break;
467 ++nodes; 574 ++nodes;
468 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off 575 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
469 | IBSCTL_LVTOFFSETVAL); 576 | IBSCTL_LVT_OFFSET_VALID);
470 pci_read_config_dword(cpu_cfg, IBSCTL, &value); 577 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
471 if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { 578 if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
472 pci_dev_put(cpu_cfg); 579 pci_dev_put(cpu_cfg);
473 printk(KERN_DEBUG "Failed to setup IBS LVT offset, " 580 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
474 "IBSCTL = 0x%08x", value); 581 "IBSCTL = 0x%08x\n", value);
475 return 1; 582 return -EINVAL;
476 } 583 }
477 } while (1); 584 } while (1);
478 585
479 if (!nodes) { 586 if (!nodes) {
480 printk(KERN_DEBUG "No CPU node configured for IBS"); 587 printk(KERN_DEBUG "No CPU node configured for IBS\n");
481 return 1; 588 return -ENODEV;
482 } 589 }
483 590
484 return 0; 591 return 0;
485} 592}
486 593
594static int force_ibs_eilvt_setup(void)
595{
596 int i;
597 int ret;
598
599 /* find the next free available EILVT entry */
600 for (i = 1; i < 4; i++) {
601 if (!eilvt_is_available(i))
602 continue;
603 ret = setup_ibs_ctl(i);
604 if (ret)
605 return ret;
606 return 0;
607 }
608
609 printk(KERN_DEBUG "No EILVT entry available\n");
610
611 return -EBUSY;
612}
613
614static int __init_ibs_nmi(void)
615{
616 int ret;
617
618 if (ibs_eilvt_valid())
619 return 0;
620
621 ret = force_ibs_eilvt_setup();
622 if (ret)
623 return ret;
624
625 if (!ibs_eilvt_valid())
626 return -EFAULT;
627
628 pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
629
630 return 0;
631}
632
487/* initialize the APIC for the IBS interrupts if available */ 633/* initialize the APIC for the IBS interrupts if available */
488static void init_ibs(void) 634static void init_ibs(void)
489{ 635{
@@ -521,28 +667,33 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
521 /* model specific files */ 667 /* model specific files */
522 668
523 /* setup some reasonable defaults */ 669 /* setup some reasonable defaults */
670 memset(&ibs_config, 0, sizeof(ibs_config));
524 ibs_config.max_cnt_fetch = 250000; 671 ibs_config.max_cnt_fetch = 250000;
525 ibs_config.fetch_enabled = 0;
526 ibs_config.max_cnt_op = 250000; 672 ibs_config.max_cnt_op = 250000;
527 ibs_config.op_enabled = 0; 673
528 ibs_config.dispatched_ops = 0; 674 if (ibs_caps & IBS_CAPS_FETCHSAM) {
529 675 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
530 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 676 oprofilefs_create_ulong(sb, dir, "enable",
531 oprofilefs_create_ulong(sb, dir, "enable", 677 &ibs_config.fetch_enabled);
532 &ibs_config.fetch_enabled); 678 oprofilefs_create_ulong(sb, dir, "max_count",
533 oprofilefs_create_ulong(sb, dir, "max_count", 679 &ibs_config.max_cnt_fetch);
534 &ibs_config.max_cnt_fetch); 680 oprofilefs_create_ulong(sb, dir, "rand_enable",
535 oprofilefs_create_ulong(sb, dir, "rand_enable", 681 &ibs_config.rand_en);
536 &ibs_config.rand_en); 682 }
537 683
538 dir = oprofilefs_mkdir(sb, root, "ibs_op"); 684 if (ibs_caps & IBS_CAPS_OPSAM) {
539 oprofilefs_create_ulong(sb, dir, "enable", 685 dir = oprofilefs_mkdir(sb, root, "ibs_op");
540 &ibs_config.op_enabled); 686 oprofilefs_create_ulong(sb, dir, "enable",
541 oprofilefs_create_ulong(sb, dir, "max_count", 687 &ibs_config.op_enabled);
542 &ibs_config.max_cnt_op); 688 oprofilefs_create_ulong(sb, dir, "max_count",
543 if (ibs_caps & IBS_CAPS_OPCNT) 689 &ibs_config.max_cnt_op);
544 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 690 if (ibs_caps & IBS_CAPS_OPCNT)
545 &ibs_config.dispatched_ops); 691 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
692 &ibs_config.dispatched_ops);
693 if (ibs_caps & IBS_CAPS_BRNTRGT)
694 oprofilefs_create_ulong(sb, dir, "branch_target",
695 &ibs_config.branch_target);
696 }
546 697
547 return 0; 698 return 0;
548} 699}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 8379c2c3d076..c4bb261c106e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -65,16 +65,21 @@ pcibios_align_resource(void *data, const struct resource *res,
65 resource_size_t size, resource_size_t align) 65 resource_size_t size, resource_size_t align)
66{ 66{
67 struct pci_dev *dev = data; 67 struct pci_dev *dev = data;
68 resource_size_t start = res->start; 68 resource_size_t start = round_down(res->end - size + 1, align);
69 69
70 if (res->flags & IORESOURCE_IO) { 70 if (res->flags & IORESOURCE_IO) {
71 if (skip_isa_ioresource_align(dev)) 71
72 return start; 72 /*
73 if (start & 0x300) 73 * If we're avoiding ISA aliases, the largest contiguous I/O
74 start = (start + 0x3ff) & ~0x3ff; 74 * port space is 256 bytes. Clearing bits 9 and 10 preserves
75 * all 256-byte and smaller alignments, so the result will
76 * still be correctly aligned.
77 */
78 if (!skip_isa_ioresource_align(dev))
79 start &= ~0x300;
75 } else if (res->flags & IORESOURCE_MEM) { 80 } else if (res->flags & IORESOURCE_MEM) {
76 if (start < BIOS_END) 81 if (start < BIOS_END)
77 start = BIOS_END; 82 start = res->end; /* fail; no space */
78 } 83 }
79 return start; 84 return start;
80} 85}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index f547ee05f715..9f9bfb705cf9 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -584,27 +584,28 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
584 case PCI_DEVICE_ID_INTEL_ICH9_3: 584 case PCI_DEVICE_ID_INTEL_ICH9_3:
585 case PCI_DEVICE_ID_INTEL_ICH9_4: 585 case PCI_DEVICE_ID_INTEL_ICH9_4:
586 case PCI_DEVICE_ID_INTEL_ICH9_5: 586 case PCI_DEVICE_ID_INTEL_ICH9_5:
587 case PCI_DEVICE_ID_INTEL_TOLAPAI_0: 587 case PCI_DEVICE_ID_INTEL_EP80579_0:
588 case PCI_DEVICE_ID_INTEL_ICH10_0: 588 case PCI_DEVICE_ID_INTEL_ICH10_0:
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC:
592 r->name = "PIIX/ICH"; 593 r->name = "PIIX/ICH";
593 r->get = pirq_piix_get; 594 r->get = pirq_piix_get;
594 r->set = pirq_piix_set; 595 r->set = pirq_piix_set;
595 return 1; 596 return 1;
596 } 597 }
597 598
598 if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && 599 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) &&
599 (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { 600 (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) {
600 r->name = "PIIX/ICH"; 601 r->name = "PIIX/ICH";
601 r->get = pirq_piix_get; 602 r->get = pirq_piix_get;
602 r->set = pirq_piix_set; 603 r->set = pirq_piix_set;
603 return 1; 604 return 1;
604 } 605 }
605 606
606 if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && 607 if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) &&
607 (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) { 608 (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) {
608 r->name = "PIIX/ICH"; 609 r->name = "PIIX/ICH";
609 r->get = pirq_piix_get; 610 r->get = pirq_piix_get;
610 r->set = pirq_piix_set; 611 r->set = pirq_piix_set;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index a918553ebc75..e282886616a0 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -65,7 +65,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
65 int end, u64 addr) 65 int end, u64 addr)
66{ 66{
67 struct pci_mmcfg_region *new; 67 struct pci_mmcfg_region *new;
68 int num_buses;
69 struct resource *res; 68 struct resource *res;
70 69
71 if (addr == 0) 70 if (addr == 0)
@@ -82,10 +81,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
82 81
83 list_add_sorted(new); 82 list_add_sorted(new);
84 83
85 num_buses = end - start + 1;
86 res = &new->res; 84 res = &new->res;
87 res->start = addr + PCI_MMCFG_BUS_OFFSET(start); 85 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
88 res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; 86 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
89 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 87 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
90 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, 88 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
91 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); 89 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b34815408f58..13700ec8e2e4 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = {
304 304
305int __init pci_olpc_init(void) 305int __init pci_olpc_init(void)
306{ 306{
307 printk(KERN_INFO "PCI: Using configuration type OLPC\n"); 307 printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
308 raw_pci_ops = &pci_olpc_conf; 308 raw_pci_ops = &pci_olpc_conf;
309 is_lx = is_geode_lx(); 309 is_lx = is_geode_lx();
310 return 0; 310 return 0;
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
new file mode 100644
index 000000000000..7bf70b812fa2
--- /dev/null
+++ b/arch/x86/platform/Makefile
@@ -0,0 +1,8 @@
1# Platform specific code goes here
2obj-y += efi/
3obj-y += mrst/
4obj-y += olpc/
5obj-y += scx200/
6obj-y += sfi/
7obj-y += visws/
8obj-y += uv/
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
new file mode 100644
index 000000000000..73b8be0f3675
--- /dev/null
+++ b/arch/x86/platform/efi/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
diff --git a/arch/x86/kernel/efi.c b/arch/x86/platform/efi/efi.c
index c2fa9b8b497e..0fe27d7c6258 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -30,6 +30,7 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/efi.h> 31#include <linux/efi.h>
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/memblock.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34#include <linux/uaccess.h> 35#include <linux/uaccess.h>
35#include <linux/time.h> 36#include <linux/time.h>
@@ -275,7 +276,7 @@ static void __init do_add_efi_memmap(void)
275 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 276 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
276} 277}
277 278
278void __init efi_reserve_early(void) 279void __init efi_memblock_x86_reserve_range(void)
279{ 280{
280 unsigned long pmap; 281 unsigned long pmap;
281 282
@@ -290,7 +291,7 @@ void __init efi_reserve_early(void)
290 boot_params.efi_info.efi_memdesc_size; 291 boot_params.efi_info.efi_memdesc_size;
291 memmap.desc_version = boot_params.efi_info.efi_memdesc_version; 292 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
292 memmap.desc_size = boot_params.efi_info.efi_memdesc_size; 293 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
293 reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size, 294 memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size,
294 "EFI memmap"); 295 "EFI memmap");
295} 296}
296 297
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 5cab48ee61a4..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac0621a7ac3d..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index fbe66e626c09..fbe66e626c09 100644
--- a/arch/x86/kernel/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07ccab8146..4c07ccab8146 100644
--- a/arch/x86/kernel/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
new file mode 100644
index 000000000000..efbbc552fa95
--- /dev/null
+++ b/arch/x86/platform/mrst/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_MRST) += mrst.o
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/platform/mrst/mrst.c
index 79ae68154e87..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
new file mode 100644
index 000000000000..c31b8fcb5a86
--- /dev/null
+++ b/arch/x86/platform/olpc/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
new file mode 100644
index 000000000000..f5442c03abc3
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -0,0 +1,140 @@
1/*
2 * Support for features of the OLPC XO-1 laptop
3 *
4 * Copyright (C) 2010 One Laptop per Child
5 * Copyright (C) 2006 Red Hat, Inc.
6 * Copyright (C) 2006 Advanced Micro Devices, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/module.h>
15#include <linux/pci.h>
16#include <linux/pci_ids.h>
17#include <linux/platform_device.h>
18#include <linux/pm.h>
19
20#include <asm/io.h>
21#include <asm/olpc.h>
22
23#define DRV_NAME "olpc-xo1"
24
25#define PMS_BAR 4
26#define ACPI_BAR 5
27
28/* PMC registers (PMS block) */
29#define PM_SCLK 0x10
30#define PM_IN_SLPCTL 0x20
31#define PM_WKXD 0x34
32#define PM_WKD 0x30
33#define PM_SSC 0x54
34
35/* PM registers (ACPI block) */
36#define PM1_CNT 0x08
37#define PM_GPE0_STS 0x18
38
39static unsigned long acpi_base;
40static unsigned long pms_base;
41
42static void xo1_power_off(void)
43{
44 printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
45
46 /* Enable all of these controls with 0 delay */
47 outl(0x40000000, pms_base + PM_SCLK);
48 outl(0x40000000, pms_base + PM_IN_SLPCTL);
49 outl(0x40000000, pms_base + PM_WKXD);
50 outl(0x40000000, pms_base + PM_WKD);
51
52 /* Clear status bits (possibly unnecessary) */
53 outl(0x0002ffff, pms_base + PM_SSC);
54 outl(0xffffffff, acpi_base + PM_GPE0_STS);
55
56 /* Write SLP_EN bit to start the machinery */
57 outl(0x00002000, acpi_base + PM1_CNT);
58}
59
60/* Read the base addresses from the PCI BAR info */
61static int __devinit setup_bases(struct pci_dev *pdev)
62{
63 int r;
64
65 r = pci_enable_device_io(pdev);
66 if (r) {
67 dev_err(&pdev->dev, "can't enable device IO\n");
68 return r;
69 }
70
71 r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
72 if (r) {
73 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
74 return r;
75 }
76
77 r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
78 if (r) {
79 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
80 pci_release_region(pdev, ACPI_BAR);
81 return r;
82 }
83
84 acpi_base = pci_resource_start(pdev, ACPI_BAR);
85 pms_base = pci_resource_start(pdev, PMS_BAR);
86
87 return 0;
88}
89
90static int __devinit olpc_xo1_probe(struct platform_device *pdev)
91{
92 struct pci_dev *pcidev;
93 int r;
94
95 pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
96 NULL);
97 if (!pdev)
98 return -ENODEV;
99
100 r = setup_bases(pcidev);
101 if (r)
102 return r;
103
104 pm_power_off = xo1_power_off;
105
106 printk(KERN_INFO "OLPC XO-1 support registered\n");
107 return 0;
108}
109
110static int __devexit olpc_xo1_remove(struct platform_device *pdev)
111{
112 pm_power_off = NULL;
113 return 0;
114}
115
116static struct platform_driver olpc_xo1_driver = {
117 .driver = {
118 .name = DRV_NAME,
119 .owner = THIS_MODULE,
120 },
121 .probe = olpc_xo1_probe,
122 .remove = __devexit_p(olpc_xo1_remove),
123};
124
125static int __init olpc_xo1_init(void)
126{
127 return platform_driver_register(&olpc_xo1_driver);
128}
129
130static void __exit olpc_xo1_exit(void)
131{
132 platform_driver_unregister(&olpc_xo1_driver);
133}
134
135MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
136MODULE_LICENSE("GPL");
137MODULE_ALIAS("platform:olpc-xo1");
138
139module_init(olpc_xo1_init);
140module_exit(olpc_xo1_exit);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/platform/olpc/olpc.c
index 0e0cdde519be..edaf3fe8dc5e 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/platform_device.h>
20 21
21#include <asm/geode.h> 22#include <asm/geode.h>
22#include <asm/setup.h> 23#include <asm/setup.h>
@@ -114,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
114 unsigned long flags; 115 unsigned long flags;
115 int ret = -EIO; 116 int ret = -EIO;
116 int i; 117 int i;
118 int restarts = 0;
117 119
118 spin_lock_irqsave(&ec_lock, flags); 120 spin_lock_irqsave(&ec_lock, flags);
119 121
@@ -169,7 +171,9 @@ restart:
169 if (wait_on_obf(0x6c, 1)) { 171 if (wait_on_obf(0x6c, 1)) {
170 printk(KERN_ERR "olpc-ec: timeout waiting for" 172 printk(KERN_ERR "olpc-ec: timeout waiting for"
171 " EC to provide data!\n"); 173 " EC to provide data!\n");
172 goto restart; 174 if (restarts++ < 10)
175 goto restart;
176 goto err;
173 } 177 }
174 outbuf[i] = inb(0x68); 178 outbuf[i] = inb(0x68);
175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); 179 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
@@ -183,8 +187,21 @@ err:
183} 187}
184EXPORT_SYMBOL_GPL(olpc_ec_cmd); 188EXPORT_SYMBOL_GPL(olpc_ec_cmd);
185 189
186#ifdef CONFIG_OLPC_OPENFIRMWARE 190static bool __init check_ofw_architecture(void)
187static void __init platform_detect(void) 191{
192 size_t propsize;
193 char olpc_arch[5];
194 const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
195 void *res[] = { &propsize };
196
197 if (olpc_ofw("getprop", args, res)) {
198 printk(KERN_ERR "ofw: getprop call failed!\n");
199 return false;
200 }
201 return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
202}
203
204static u32 __init get_board_revision(void)
188{ 205{
189 size_t propsize; 206 size_t propsize;
190 __be32 rev; 207 __be32 rev;
@@ -193,45 +210,43 @@ static void __init platform_detect(void)
193 210
194 if (olpc_ofw("getprop", args, res) || propsize != 4) { 211 if (olpc_ofw("getprop", args, res) || propsize != 4) {
195 printk(KERN_ERR "ofw: getprop call failed!\n"); 212 printk(KERN_ERR "ofw: getprop call failed!\n");
196 rev = cpu_to_be32(0); 213 return cpu_to_be32(0);
197 } 214 }
198 olpc_platform_info.boardrev = be32_to_cpu(rev); 215 return be32_to_cpu(rev);
199} 216}
200#else 217
201static void __init platform_detect(void) 218static bool __init platform_detect(void)
202{ 219{
203 /* stopgap until OFW support is added to the kernel */ 220 if (!check_ofw_architecture())
204 olpc_platform_info.boardrev = olpc_board(0xc2); 221 return false;
222 olpc_platform_info.flags |= OLPC_F_PRESENT;
223 olpc_platform_info.boardrev = get_board_revision();
224 return true;
205} 225}
206#endif
207 226
208static int __init olpc_init(void) 227static int __init add_xo1_platform_devices(void)
209{ 228{
210 unsigned char *romsig; 229 struct platform_device *pdev;
211 230
212 /* The ioremap check is dangerous; limit what we run it on */ 231 pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
213 if (!is_geode() || cs5535_has_vsa2()) 232 if (IS_ERR(pdev))
214 return 0; 233 return PTR_ERR(pdev);
215 234
216 spin_lock_init(&ec_lock); 235 pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
236 if (IS_ERR(pdev))
237 return PTR_ERR(pdev);
217 238
218 romsig = ioremap(0xffffffc0, 16); 239 return 0;
219 if (!romsig) 240}
220 return 0;
221 241
222 if (strncmp(romsig, "CL1 Q", 7)) 242static int __init olpc_init(void)
223 goto unmap; 243{
224 if (strncmp(romsig+6, romsig+13, 3)) { 244 int r = 0;
225 printk(KERN_INFO "OLPC BIOS signature looks invalid. "
226 "Assuming not OLPC\n");
227 goto unmap;
228 }
229 245
230 printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig); 246 if (!olpc_ofw_present() || !platform_detect())
231 olpc_platform_info.flags |= OLPC_F_PRESENT; 247 return 0;
232 248
233 /* get the platform revision */ 249 spin_lock_init(&ec_lock);
234 platform_detect();
235 250
236 /* assume B1 and above models always have a DCON */ 251 /* assume B1 and above models always have a DCON */
237 if (olpc_board_at_least(olpc_board(0xb1))) 252 if (olpc_board_at_least(olpc_board(0xb1)))
@@ -242,8 +257,10 @@ static int __init olpc_init(void)
242 (unsigned char *) &olpc_platform_info.ecver, 1); 257 (unsigned char *) &olpc_platform_info.ecver, 1);
243 258
244#ifdef CONFIG_PCI_OLPC 259#ifdef CONFIG_PCI_OLPC
245 /* If the VSA exists let it emulate PCI, if not emulate in kernel */ 260 /* If the VSA exists let it emulate PCI, if not emulate in kernel.
246 if (!cs5535_has_vsa2()) 261 * XO-1 only. */
262 if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
263 !cs5535_has_vsa2())
247 x86_init.pci.arch_init = pci_olpc_init; 264 x86_init.pci.arch_init = pci_olpc_init;
248#endif 265#endif
249 266
@@ -252,8 +269,12 @@ static int __init olpc_init(void)
252 olpc_platform_info.boardrev >> 4, 269 olpc_platform_info.boardrev >> 4,
253 olpc_platform_info.ecver); 270 olpc_platform_info.ecver);
254 271
255unmap: 272 if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
256 iounmap(romsig); 273 r = add_xo1_platform_devices();
274 if (r)
275 return r;
276 }
277
257 return 0; 278 return 0;
258} 279}
259 280
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
index 3218aa71ab5e..787320464379 100644
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
74} 74}
75EXPORT_SYMBOL_GPL(__olpc_ofw); 75EXPORT_SYMBOL_GPL(__olpc_ofw);
76 76
77bool olpc_ofw_present(void)
78{
79 return olpc_ofw_cif != NULL;
80}
81EXPORT_SYMBOL_GPL(olpc_ofw_present);
82
77/* OFW cif _should_ be above this address */ 83/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000 84#define OFW_MIN 0xff000000
79 85
diff --git a/arch/x86/platform/scx200/Makefile b/arch/x86/platform/scx200/Makefile
new file mode 100644
index 000000000000..762b4c7f4314
--- /dev/null
+++ b/arch/x86/platform/scx200/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_SCx200) += scx200.o
2scx200-y += scx200_32.o
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c
index 7e004acbe526..7e004acbe526 100644
--- a/arch/x86/kernel/scx200_32.c
+++ b/arch/x86/platform/scx200/scx200_32.c
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile
new file mode 100644
index 000000000000..cc5db1168a5e
--- /dev/null
+++ b/arch/x86/platform/sfi/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_SFI) += sfi.o
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/platform/sfi/sfi.c
index cb22acf3ed09..dd4c281ffe57 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -34,7 +34,7 @@
34#ifdef CONFIG_X86_LOCAL_APIC 34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36 36
37void __init mp_sfi_register_lapic_address(unsigned long address) 37static void __init mp_sfi_register_lapic_address(unsigned long address)
38{ 38{
39 mp_lapic_addr = address; 39 mp_lapic_addr = address;
40 40
@@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address)
46} 46}
47 47
48/* All CPUs enumerated by SFI must be present and enabled */ 48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id) 49static void __cpuinit mp_sfi_register_lapic(u8 id)
50{ 50{
51 if (MAX_APICS - id <= 0) { 51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n", 52 pr_warning("Processor #%d invalid (max %d)\n",
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile
new file mode 100644
index 000000000000..6c40995fefb8
--- /dev/null
+++ b/arch/x86/platform/uv/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 8bc57baaa9ad..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 312ef0292815..20ea20a39e2a 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1001,10 +1001,10 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
1001static ssize_t tunables_read(struct file *file, char __user *userbuf, 1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002 size_t count, loff_t *ppos) 1002 size_t count, loff_t *ppos)
1003{ 1003{
1004 char buf[300]; 1004 char *buf;
1005 int ret; 1005 int ret;
1006 1006
1007 ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", 1007 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008 "max_bau_concurrent plugged_delay plugsb4reset", 1008 "max_bau_concurrent plugged_delay plugsb4reset",
1009 "timeoutsb4reset ipi_reset_limit complete_threshold", 1009 "timeoutsb4reset ipi_reset_limit complete_threshold",
1010 "congested_response_us congested_reps congested_period", 1010 "congested_response_us congested_reps congested_period",
@@ -1012,7 +1012,12 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
1012 timeoutsb4reset, ipi_reset_limit, complete_threshold, 1012 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013 congested_response_us, congested_reps, congested_period); 1013 congested_response_us, congested_reps, congested_period);
1014 1014
1015 return simple_read_from_buffer(userbuf, count, ppos, buf, ret); 1015 if (!buf)
1016 return -ENOMEM;
1017
1018 ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
1019 kfree(buf);
1020 return ret;
1016} 1021}
1017 1022
1018/* 1023/*
@@ -1285,6 +1290,7 @@ static const struct file_operations tunables_fops = {
1285 .open = tunables_open, 1290 .open = tunables_open,
1286 .read = tunables_read, 1291 .read = tunables_read,
1287 .write = tunables_write, 1292 .write = tunables_write,
1293 .llseek = default_llseek,
1288}; 1294};
1289 1295
1290static int __init uv_ptc_init(void) 1296static int __init uv_ptc_init(void)
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 7b24460917d5..7b24460917d5 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70fb7759..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 56e421bc379b..56e421bc379b 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile
new file mode 100644
index 000000000000..91bc17ab2fd5
--- /dev/null
+++ b/arch/x86/platform/visws/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 3371bd053b89..3371bd053b89 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index a234b9a71ab4..5b54892e4bc3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -29,15 +29,12 @@ config XEN_PVHVM
29 depends on X86_LOCAL_APIC 29 depends on X86_LOCAL_APIC
30 30
31config XEN_MAX_DOMAIN_MEMORY 31config XEN_MAX_DOMAIN_MEMORY
32 int "Maximum allowed size of a domain in gigabytes" 32 int
33 default 8 if X86_32 33 default 128
34 default 32 if X86_64
35 depends on XEN 34 depends on XEN
36 help 35 help
37 The pseudo-physical to machine address array is sized 36 This only affects the sizing of some bss arrays, the unused
38 according to the maximum possible memory size of a Xen 37 portions of which are freed.
39 domain. This array uses 1 page per gigabyte, so there's no
40 need to be too stingy here.
41 38
42config XEN_SAVE_RESTORE 39config XEN_SAVE_RESTORE
43 bool 40 bool
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 1304bcec8ee5..7c0fedd98ea0 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = {
106 .open = u32_array_open, 106 .open = u32_array_open,
107 .release= xen_array_release, 107 .release= xen_array_release,
108 .read = u32_array_read, 108 .read = u32_array_read,
109 .llseek = no_llseek,
109}; 110};
110 111
111struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, 112struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d48a32b10a3c..235c0f4d3861 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,6 +30,7 @@
30#include <linux/console.h> 30#include <linux/console.h>
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h>
33 34
34#include <xen/xen.h> 35#include <xen/xen.h>
35#include <xen/interface/xen.h> 36#include <xen/interface/xen.h>
@@ -59,7 +60,6 @@
59#include <asm/pgtable.h> 60#include <asm/pgtable.h>
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61#include <asm/reboot.h> 62#include <asm/reboot.h>
62#include <asm/setup.h>
63#include <asm/stackprotector.h> 63#include <asm/stackprotector.h>
64#include <asm/hypervisor.h> 64#include <asm/hypervisor.h>
65 65
@@ -136,9 +136,6 @@ static void xen_vcpu_setup(int cpu)
136 info.mfn = arbitrary_virt_to_mfn(vcpup); 136 info.mfn = arbitrary_virt_to_mfn(vcpup);
137 info.offset = offset_in_page(vcpup); 137 info.offset = offset_in_page(vcpup);
138 138
139 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
140 cpu, vcpup, info.mfn, info.offset);
141
142 /* Check to see if the hypervisor will put the vcpu_info 139 /* Check to see if the hypervisor will put the vcpu_info
143 structure where we want it, which allows direct access via 140 structure where we want it, which allows direct access via
144 a percpu-variable. */ 141 a percpu-variable. */
@@ -152,9 +149,6 @@ static void xen_vcpu_setup(int cpu)
152 /* This cpu is using the registered vcpu info, even if 149 /* This cpu is using the registered vcpu info, even if
153 later ones fail to. */ 150 later ones fail to. */
154 per_cpu(xen_vcpu, cpu) = vcpup; 151 per_cpu(xen_vcpu, cpu) = vcpup;
155
156 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
157 cpu, vcpup);
158 } 152 }
159} 153}
160 154
@@ -837,6 +831,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
837 Xen console noise. */ 831 Xen console noise. */
838 break; 832 break;
839 833
834 case MSR_IA32_CR_PAT:
835 if (smp_processor_id() == 0)
836 xen_set_pat(((u64)high << 32) | low);
837 break;
838
840 default: 839 default:
841 ret = native_write_msr_safe(msr, low, high); 840 ret = native_write_msr_safe(msr, low, high);
842 } 841 }
@@ -875,8 +874,6 @@ void xen_setup_vcpu_info_placement(void)
875 /* xen_vcpu_setup managed to place the vcpu_info within the 874 /* xen_vcpu_setup managed to place the vcpu_info within the
876 percpu area for all cpus, so make use of it */ 875 percpu area for all cpus, so make use of it */
877 if (have_vcpu_info_placement) { 876 if (have_vcpu_info_placement) {
878 printk(KERN_INFO "Xen: using vcpu_info placement\n");
879
880 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); 877 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
881 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); 878 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
882 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); 879 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1020,7 +1017,7 @@ static void xen_reboot(int reason)
1020 struct sched_shutdown r = { .reason = reason }; 1017 struct sched_shutdown r = { .reason = reason };
1021 1018
1022#ifdef CONFIG_SMP 1019#ifdef CONFIG_SMP
1023 smp_send_stop(); 1020 stop_other_cpus();
1024#endif 1021#endif
1025 1022
1026 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) 1023 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
@@ -1185,10 +1182,15 @@ asmlinkage void __init xen_start_kernel(void)
1185 local_irq_disable(); 1182 local_irq_disable();
1186 early_boot_irqs_off(); 1183 early_boot_irqs_off();
1187 1184
1185 memblock_init();
1186
1188 xen_raw_console_write("mapping kernel into physical memory\n"); 1187 xen_raw_console_write("mapping kernel into physical memory\n");
1189 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); 1188 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1190 xen_ident_map_ISA(); 1189 xen_ident_map_ISA();
1191 1190
1191 /* Allocate and initialize top and mid mfn levels for p2m structure */
1192 xen_build_mfn_list_list();
1193
1192 init_mm.pgd = pgd; 1194 init_mm.pgd = pgd;
1193 1195
1194 /* keep using Xen gdt for now; no urgent need to change it */ 1196 /* keep using Xen gdt for now; no urgent need to change it */
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index eed9c7cee4b7..c237b810b03f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -45,6 +45,7 @@
45#include <linux/vmalloc.h> 45#include <linux/vmalloc.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h>
48 49
49#include <asm/pgtable.h> 50#include <asm/pgtable.h>
50#include <asm/tlbflush.h> 51#include <asm/tlbflush.h>
@@ -55,6 +56,8 @@
55#include <asm/e820.h> 56#include <asm/e820.h>
56#include <asm/linkage.h> 57#include <asm/linkage.h>
57#include <asm/page.h> 58#include <asm/page.h>
59#include <asm/init.h>
60#include <asm/pat.h>
58 61
59#include <asm/xen/hypercall.h> 62#include <asm/xen/hypercall.h>
60#include <asm/xen/hypervisor.h> 63#include <asm/xen/hypervisor.h>
@@ -138,7 +141,8 @@ static inline void check_zero(void)
138 * large enough to allocate page table pages to allocate the rest. 141 * large enough to allocate page table pages to allocate the rest.
139 * Each page can map 2MB. 142 * Each page can map 2MB.
140 */ 143 */
141static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; 144#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
145static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
142 146
143#ifdef CONFIG_X86_64 147#ifdef CONFIG_X86_64
144/* l3 pud for userspace vsyscall mapping */ 148/* l3 pud for userspace vsyscall mapping */
@@ -169,49 +173,182 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
169 */ 173 */
170#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
171 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
172 204
173#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
174#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
175 208
176/* Placeholder for holes in the address space */ 209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
177static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
178 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
179 210
180 /* Array of pointers to pages containing p2m entries */ 211/* Placeholders for holes in the address space */
181static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = 212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
182 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
183 215
184/* Arrays of p2m arrays expressed in mfns used for save/restore */ 216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
185static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; 217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
186 219
187static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] 220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
188 __page_aligned_bss; 221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
189 222
190static inline unsigned p2m_top_index(unsigned long pfn) 223static inline unsigned p2m_top_index(unsigned long pfn)
191{ 224{
192 BUG_ON(pfn >= MAX_DOMAIN_PAGES); 225 BUG_ON(pfn >= MAX_P2M_PFN);
193 return pfn / P2M_ENTRIES_PER_PAGE; 226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
194} 232}
195 233
196static inline unsigned p2m_index(unsigned long pfn) 234static inline unsigned p2m_index(unsigned long pfn)
197{ 235{
198 return pfn % P2M_ENTRIES_PER_PAGE; 236 return pfn % P2M_PER_PAGE;
199} 237}
200 238
201/* Build the parallel p2m_top_mfn structures */ 239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
261}
262
263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
295 * tree should alreay be completely allocated.
296 */
202void xen_build_mfn_list_list(void) 297void xen_build_mfn_list_list(void)
203{ 298{
204 unsigned pfn, idx; 299 unsigned long pfn;
205 300
206 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { 301 /* Pre-initialize p2m_top_mfn to be completely missing */
207 unsigned topidx = p2m_top_index(pfn); 302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
208 308
209 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); 309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
210 } 314 }
211 315
212 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { 316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
213 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; 317 unsigned topidx = p2m_top_index(pfn);
214 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); 318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326 * they're just missing, just update the stored mfn,
327 * since all could have changed over a migrate.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * runtime. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
215 } 352 }
216} 353}
217 354
@@ -220,8 +357,8 @@ void xen_setup_mfn_list_list(void)
220 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
221 358
222 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = 359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
223 virt_to_mfn(p2m_top_mfn_list); 360 virt_to_mfn(p2m_top_mfn);
224 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; 361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
225} 362}
226 363
227/* Set up p2m_top to point to the domain-builder provided p2m pages */ 364/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -229,98 +366,176 @@ void __init xen_build_dynamic_phys_to_machine(void)
229{ 366{
230 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; 367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
231 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
232 unsigned pfn; 369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
233 372
234 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { 373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
381
382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
235 unsigned topidx = p2m_top_index(pfn); 388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
236 390
237 p2m_top[topidx] = &mfn_list[pfn]; 391 if (p2m_top[topidx] == p2m_mid_missing) {
238 } 392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
394
395 p2m_top[topidx] = mid;
396 }
239 397
240 xen_build_mfn_list_list(); 398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
241} 400}
242 401
243unsigned long get_phys_to_machine(unsigned long pfn) 402unsigned long get_phys_to_machine(unsigned long pfn)
244{ 403{
245 unsigned topidx, idx; 404 unsigned topidx, mididx, idx;
246 405
247 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) 406 if (unlikely(pfn >= MAX_P2M_PFN))
248 return INVALID_P2M_ENTRY; 407 return INVALID_P2M_ENTRY;
249 408
250 topidx = p2m_top_index(pfn); 409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
251 idx = p2m_index(pfn); 411 idx = p2m_index(pfn);
252 return p2m_top[topidx][idx]; 412
413 return p2m_top[topidx][mididx][idx];
253} 414}
254EXPORT_SYMBOL_GPL(get_phys_to_machine); 415EXPORT_SYMBOL_GPL(get_phys_to_machine);
255 416
256/* install a new p2m_top page */ 417static void *alloc_p2m_page(void)
257bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
258{ 418{
259 unsigned topidx = p2m_top_index(pfn); 419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
260 unsigned long **pfnp, *mfnp; 420}
261 unsigned i;
262 421
263 pfnp = &p2m_top[topidx]; 422static void free_p2m_page(void *p)
264 mfnp = &p2m_top_mfn[topidx]; 423{
424 free_page((unsigned long)p);
425}
265 426
266 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) 427/*
267 p[i] = INVALID_P2M_ENTRY; 428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
268 439
269 if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { 440 topidx = p2m_top_index(pfn);
270 *mfnp = virt_to_mfn(p); 441 mididx = p2m_mid_index(pfn);
271 return true; 442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
272 } 456 }
273 457
274 return false; 458 top_mfn_p = &p2m_top_mfn[topidx];
275} 459 mid_mfn = p2m_top_mfn_p[topidx];
276 460
277static void alloc_p2m(unsigned long pfn) 461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
278{ 462
279 unsigned long *p; 463 if (mid_mfn == p2m_mid_missing_mfn) {
464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
467
468 mid_mfn = alloc_p2m_page();
469 if (!mid_mfn)
470 return false;
280 471
281 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); 472 p2m_mid_mfn_init(mid_mfn);
282 BUG_ON(p == NULL);
283 473
284 if (!install_p2mtop_page(pfn, p)) 474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
285 free_page((unsigned long)p); 475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
286} 499}
287 500
288/* Try to install p2m mapping; fail if intermediate bits missing */ 501/* Try to install p2m mapping; fail if intermediate bits missing */
289bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
290{ 503{
291 unsigned topidx, idx; 504 unsigned topidx, mididx, idx;
292 505
293 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { 506 if (unlikely(pfn >= MAX_P2M_PFN)) {
294 BUG_ON(mfn != INVALID_P2M_ENTRY); 507 BUG_ON(mfn != INVALID_P2M_ENTRY);
295 return true; 508 return true;
296 } 509 }
297 510
298 topidx = p2m_top_index(pfn); 511 topidx = p2m_top_index(pfn);
299 if (p2m_top[topidx] == p2m_missing) { 512 mididx = p2m_mid_index(pfn);
300 if (mfn == INVALID_P2M_ENTRY)
301 return true;
302 return false;
303 }
304
305 idx = p2m_index(pfn); 513 idx = p2m_index(pfn);
306 p2m_top[topidx][idx] = mfn; 514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
307 519
308 return true; 520 return true;
309} 521}
310 522
311void set_phys_to_machine(unsigned long pfn, unsigned long mfn) 523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
312{ 524{
313 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { 525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
314 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); 526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
315 return; 527 return true;
316 } 528 }
317 529
318 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
319 alloc_p2m(pfn); 531 if (!alloc_p2m(pfn))
532 return false;
320 533
321 if (!__set_phys_to_machine(pfn, mfn)) 534 if (!__set_phys_to_machine(pfn, mfn))
322 BUG(); 535 return false;
323 } 536 }
537
538 return true;
324} 539}
325 540
326unsigned long arbitrary_virt_to_mfn(void *vaddr) 541unsigned long arbitrary_virt_to_mfn(void *vaddr)
@@ -359,7 +574,8 @@ void make_lowmem_page_readonly(void *vaddr)
359 unsigned int level; 574 unsigned int level;
360 575
361 pte = lookup_address(address, &level); 576 pte = lookup_address(address, &level);
362 BUG_ON(pte == NULL); 577 if (pte == NULL)
578 return; /* vaddr missing */
363 579
364 ptev = pte_wrprotect(*pte); 580 ptev = pte_wrprotect(*pte);
365 581
@@ -374,7 +590,8 @@ void make_lowmem_page_readwrite(void *vaddr)
374 unsigned int level; 590 unsigned int level;
375 591
376 pte = lookup_address(address, &level); 592 pte = lookup_address(address, &level);
377 BUG_ON(pte == NULL); 593 if (pte == NULL)
594 return; /* vaddr missing */
378 595
379 ptev = pte_mkwrite(*pte); 596 ptev = pte_mkwrite(*pte);
380 597
@@ -395,7 +612,7 @@ static bool xen_iomap_pte(pte_t pte)
395 return pte_flags(pte) & _PAGE_IOMAP; 612 return pte_flags(pte) & _PAGE_IOMAP;
396} 613}
397 614
398static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) 615void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
399{ 616{
400 struct multicall_space mcs; 617 struct multicall_space mcs;
401 struct mmu_update *u; 618 struct mmu_update *u;
@@ -407,10 +624,16 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
407 u->ptr = arbitrary_virt_to_machine(ptep).maddr; 624 u->ptr = arbitrary_virt_to_machine(ptep).maddr;
408 u->val = pte_val_ma(pteval); 625 u->val = pte_val_ma(pteval);
409 626
410 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); 627 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
411 628
412 xen_mc_issue(PARAVIRT_LAZY_MMU); 629 xen_mc_issue(PARAVIRT_LAZY_MMU);
413} 630}
631EXPORT_SYMBOL_GPL(xen_set_domain_pte);
632
633static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
634{
635 xen_set_domain_pte(ptep, pteval, DOMID_IO);
636}
414 637
415static void xen_extend_mmu_update(const struct mmu_update *update) 638static void xen_extend_mmu_update(const struct mmu_update *update)
416{ 639{
@@ -557,7 +780,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
557 if (val & _PAGE_PRESENT) { 780 if (val & _PAGE_PRESENT) {
558 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 781 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
559 pteval_t flags = val & PTE_FLAGS_MASK; 782 pteval_t flags = val & PTE_FLAGS_MASK;
560 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 783 unsigned long mfn = pfn_to_mfn(pfn);
784
785 /*
786 * If there's no mfn for the pfn, then just create an
787 * empty non-present pte. Unfortunately this loses
788 * information about the original pfn, so
789 * pte_mfn_to_pfn is asymmetric.
790 */
791 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
792 mfn = 0;
793 flags = 0;
794 }
795
796 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
561 } 797 }
562 798
563 return val; 799 return val;
@@ -579,10 +815,18 @@ static pteval_t iomap_pte(pteval_t val)
579 815
580pteval_t xen_pte_val(pte_t pte) 816pteval_t xen_pte_val(pte_t pte)
581{ 817{
582 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) 818 pteval_t pteval = pte.pte;
583 return pte.pte;
584 819
585 return pte_mfn_to_pfn(pte.pte); 820 /* If this is a WC pte, convert back from Xen WC to Linux WC */
821 if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
822 WARN_ON(!pat_enabled);
823 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
824 }
825
826 if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
827 return pteval;
828
829 return pte_mfn_to_pfn(pteval);
586} 830}
587PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 831PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
588 832
@@ -592,10 +836,48 @@ pgdval_t xen_pgd_val(pgd_t pgd)
592} 836}
593PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); 837PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
594 838
839/*
840 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
841 * are reserved for now, to correspond to the Intel-reserved PAT
842 * types.
843 *
844 * We expect Linux's PAT set as follows:
845 *
846 * Idx PTE flags Linux Xen Default
847 * 0 WB WB WB
848 * 1 PWT WC WT WT
849 * 2 PCD UC- UC- UC-
850 * 3 PCD PWT UC UC UC
851 * 4 PAT WB WC WB
852 * 5 PAT PWT WC WP WT
853 * 6 PAT PCD UC- UC UC-
854 * 7 PAT PCD PWT UC UC UC
855 */
856
857void xen_set_pat(u64 pat)
858{
859 /* We expect Linux to use a PAT setting of
860 * UC UC- WC WB (ignoring the PAT flag) */
861 WARN_ON(pat != 0x0007010600070106ull);
862}
863
595pte_t xen_make_pte(pteval_t pte) 864pte_t xen_make_pte(pteval_t pte)
596{ 865{
597 phys_addr_t addr = (pte & PTE_PFN_MASK); 866 phys_addr_t addr = (pte & PTE_PFN_MASK);
598 867
868 /* If Linux is trying to set a WC pte, then map to the Xen WC.
869 * If _PAGE_PAT is set, then it probably means it is really
870 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
871 * things work out OK...
872 *
873 * (We should never see kernel mappings with _PAGE_PSE set,
874 * but we could see hugetlbfs mappings, I think.).
875 */
876 if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
877 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
878 pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
879 }
880
599 /* 881 /*
600 * Unprivileged domains are allowed to do IOMAPpings for 882 * Unprivileged domains are allowed to do IOMAPpings for
601 * PCI passthrough, but not map ISA space. The ISA 883 * PCI passthrough, but not map ISA space. The ISA
@@ -1508,13 +1790,25 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1508#endif 1790#endif
1509} 1791}
1510 1792
1511#ifdef CONFIG_X86_32
1512static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1793static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1513{ 1794{
1795 unsigned long pfn = pte_pfn(pte);
1796
1797#ifdef CONFIG_X86_32
1514 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1798 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1515 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1799 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1516 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1800 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1517 pte_val_ma(pte)); 1801 pte_val_ma(pte));
1802#endif
1803
1804 /*
1805 * If the new pfn is within the range of the newly allocated
1806 * kernel pagetable, and it isn't being mapped into an
1807 * early_ioremap fixmap slot, make sure it is RO.
1808 */
1809 if (!is_early_ioremap_ptep(ptep) &&
1810 pfn >= e820_table_start && pfn < e820_table_end)
1811 pte = pte_wrprotect(pte);
1518 1812
1519 return pte; 1813 return pte;
1520} 1814}
@@ -1527,7 +1821,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1527 1821
1528 xen_set_pte(ptep, pte); 1822 xen_set_pte(ptep, pte);
1529} 1823}
1530#endif
1531 1824
1532static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1825static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1533{ 1826{
@@ -1698,6 +1991,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1698 unsigned ident_pte; 1991 unsigned ident_pte;
1699 unsigned long pfn; 1992 unsigned long pfn;
1700 1993
1994 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1995 PAGE_SIZE);
1996
1701 ident_pte = 0; 1997 ident_pte = 0;
1702 pfn = 0; 1998 pfn = 0;
1703 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1999 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
@@ -1708,7 +2004,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1708 pte_page = m2v(pmd[pmdidx].pmd); 2004 pte_page = m2v(pmd[pmdidx].pmd);
1709 else { 2005 else {
1710 /* Check for free pte pages */ 2006 /* Check for free pte pages */
1711 if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) 2007 if (ident_pte == LEVEL1_IDENT_ENTRIES)
1712 break; 2008 break;
1713 2009
1714 pte_page = &level1_ident_pgt[ident_pte]; 2010 pte_page = &level1_ident_pgt[ident_pte];
@@ -1815,7 +2111,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1815 __xen_write_cr3(true, __pa(pgd)); 2111 __xen_write_cr3(true, __pa(pgd));
1816 xen_mc_issue(PARAVIRT_LAZY_CPU); 2112 xen_mc_issue(PARAVIRT_LAZY_CPU);
1817 2113
1818 reserve_early(__pa(xen_start_info->pt_base), 2114 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1819 __pa(xen_start_info->pt_base + 2115 __pa(xen_start_info->pt_base +
1820 xen_start_info->nr_pt_frames * PAGE_SIZE), 2116 xen_start_info->nr_pt_frames * PAGE_SIZE),
1821 "XEN PAGETABLES"); 2117 "XEN PAGETABLES");
@@ -1823,13 +2119,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1823 return pgd; 2119 return pgd;
1824} 2120}
1825#else /* !CONFIG_X86_64 */ 2121#else /* !CONFIG_X86_64 */
1826static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; 2122static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
1827 2123
1828__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 2124__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1829 unsigned long max_pfn) 2125 unsigned long max_pfn)
1830{ 2126{
1831 pmd_t *kernel_pmd; 2127 pmd_t *kernel_pmd;
1832 2128
2129 level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE);
2130
1833 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 2131 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1834 xen_start_info->nr_pt_frames * PAGE_SIZE + 2132 xen_start_info->nr_pt_frames * PAGE_SIZE +
1835 512*1024); 2133 512*1024);
@@ -1853,7 +2151,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1853 2151
1854 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); 2152 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1855 2153
1856 reserve_early(__pa(xen_start_info->pt_base), 2154 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1857 __pa(xen_start_info->pt_base + 2155 __pa(xen_start_info->pt_base +
1858 xen_start_info->nr_pt_frames * PAGE_SIZE), 2156 xen_start_info->nr_pt_frames * PAGE_SIZE),
1859 "XEN PAGETABLES"); 2157 "XEN PAGETABLES");
@@ -2008,14 +2306,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2008 .alloc_pte = xen_alloc_pte_init, 2306 .alloc_pte = xen_alloc_pte_init,
2009 .release_pte = xen_release_pte_init, 2307 .release_pte = xen_release_pte_init,
2010 .alloc_pmd = xen_alloc_pmd_init, 2308 .alloc_pmd = xen_alloc_pmd_init,
2011 .alloc_pmd_clone = paravirt_nop,
2012 .release_pmd = xen_release_pmd_init, 2309 .release_pmd = xen_release_pmd_init,
2013 2310
2014#ifdef CONFIG_X86_64
2015 .set_pte = xen_set_pte,
2016#else
2017 .set_pte = xen_set_pte_init, 2311 .set_pte = xen_set_pte_init,
2018#endif
2019 .set_pte_at = xen_set_pte_at, 2312 .set_pte_at = xen_set_pte_at,
2020 .set_pmd = xen_set_pmd_hyper, 2313 .set_pmd = xen_set_pmd_hyper,
2021 2314
@@ -2300,6 +2593,72 @@ void __init xen_hvm_init_mmu_ops(void)
2300} 2593}
2301#endif 2594#endif
2302 2595
2596#define REMAP_BATCH_SIZE 16
2597
2598struct remap_data {
2599 unsigned long mfn;
2600 pgprot_t prot;
2601 struct mmu_update *mmu_update;
2602};
2603
2604static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2605 unsigned long addr, void *data)
2606{
2607 struct remap_data *rmd = data;
2608 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2609
2610 rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
2611 rmd->mmu_update->val = pte_val_ma(pte);
2612 rmd->mmu_update++;
2613
2614 return 0;
2615}
2616
2617int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2618 unsigned long addr,
2619 unsigned long mfn, int nr,
2620 pgprot_t prot, unsigned domid)
2621{
2622 struct remap_data rmd;
2623 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2624 int batch;
2625 unsigned long range;
2626 int err = 0;
2627
2628 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2629
2630 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2631
2632 rmd.mfn = mfn;
2633 rmd.prot = prot;
2634
2635 while (nr) {
2636 batch = min(REMAP_BATCH_SIZE, nr);
2637 range = (unsigned long)batch << PAGE_SHIFT;
2638
2639 rmd.mmu_update = mmu_update;
2640 err = apply_to_page_range(vma->vm_mm, addr, range,
2641 remap_area_mfn_pte_fn, &rmd);
2642 if (err)
2643 goto out;
2644
2645 err = -EFAULT;
2646 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2647 goto out;
2648
2649 nr -= batch;
2650 addr += range;
2651 }
2652
2653 err = 0;
2654out:
2655
2656 flush_tlb_all();
2657
2658 return err;
2659}
2660EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2661
2303#ifdef CONFIG_XEN_DEBUG_FS 2662#ifdef CONFIG_XEN_DEBUG_FS
2304 2663
2305static struct dentry *d_mmu_debug; 2664static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index fa938c4aa2f7..537bb9aab777 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -12,7 +12,6 @@ enum pt_level {
12 12
13 13
14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
15bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
16 15
17void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
18 17
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index be4d80a6fae9..bfd0632fe65e 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -6,6 +6,7 @@
6 6
7#include <asm/xen/hypervisor.h> 7#include <asm/xen/hypervisor.h>
8#include <xen/xen.h> 8#include <xen/xen.h>
9#include <asm/iommu_table.h>
9 10
10int xen_swiotlb __read_mostly; 11int xen_swiotlb __read_mostly;
11 12
@@ -60,3 +61,7 @@ void __init pci_xen_swiotlb_init(void)
60 pci_request_acs(); 61 pci_request_acs();
61 } 62 }
62} 63}
64IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
65 0,
66 pci_xen_swiotlb_init,
67 0);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 62ceb7864017..b1dbdaa23ecc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/elf.h> 13#include <asm/elf.h>
13#include <asm/vdso.h> 14#include <asm/vdso.h>
@@ -17,8 +18,10 @@
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
21#include <xen/xen.h>
20#include <xen/page.h> 22#include <xen/page.h>
21#include <xen/interface/callback.h> 23#include <xen/interface/callback.h>
24#include <xen/interface/memory.h>
22#include <xen/interface/physdev.h> 25#include <xen/interface/physdev.h>
23#include <xen/interface/memory.h> 26#include <xen/interface/memory.h>
24#include <xen/features.h> 27#include <xen/features.h>
@@ -33,6 +36,39 @@ extern void xen_sysenter_target(void);
33extern void xen_syscall_target(void); 36extern void xen_syscall_target(void);
34extern void xen_syscall32_target(void); 37extern void xen_syscall32_target(void);
35 38
39/* Amount of extra memory space we add to the e820 ranges */
40phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
41
42/*
43 * The maximum amount of extra memory compared to the base size. The
44 * main scaling factor is the size of struct page. At extreme ratios
45 * of base:extra, all the base memory can be filled with page
46 * structures for the extra memory, leaving no space for anything
47 * else.
48 *
49 * 10x seems like a reasonable balance between scaling flexibility and
50 * leaving a practically usable system.
51 */
52#define EXTRA_MEM_RATIO (10)
53
54static __init void xen_add_extra_mem(unsigned long pages)
55{
56 u64 size = (u64)pages * PAGE_SIZE;
57 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
58
59 if (!pages)
60 return;
61
62 e820_add_region(extra_start, size, E820_RAM);
63 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
64
65 memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
66
67 xen_extra_mem_size += size;
68
69 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
70}
71
36static unsigned long __init xen_release_chunk(phys_addr_t start_addr, 72static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
37 phys_addr_t end_addr) 73 phys_addr_t end_addr)
38{ 74{
@@ -104,16 +140,65 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
104/** 140/**
105 * machine_specific_memory_setup - Hook for machine specific memory setup. 141 * machine_specific_memory_setup - Hook for machine specific memory setup.
106 **/ 142 **/
107
108char * __init xen_memory_setup(void) 143char * __init xen_memory_setup(void)
109{ 144{
145 static struct e820entry map[E820MAX] __initdata;
146
110 unsigned long max_pfn = xen_start_info->nr_pages; 147 unsigned long max_pfn = xen_start_info->nr_pages;
148 unsigned long long mem_end;
149 int rc;
150 struct xen_memory_map memmap;
151 unsigned long extra_pages = 0;
152 unsigned long extra_limit;
153 int i;
154 int op;
111 155
112 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); 156 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
157 mem_end = PFN_PHYS(max_pfn);
158
159 memmap.nr_entries = E820MAX;
160 set_xen_guest_handle(memmap.buffer, map);
161
162 op = xen_initial_domain() ?
163 XENMEM_machine_memory_map :
164 XENMEM_memory_map;
165 rc = HYPERVISOR_memory_op(op, &memmap);
166 if (rc == -ENOSYS) {
167 memmap.nr_entries = 1;
168 map[0].addr = 0ULL;
169 map[0].size = mem_end;
170 /* 8MB slack (to balance backend allocations). */
171 map[0].size += 8ULL << 20;
172 map[0].type = E820_RAM;
173 rc = 0;
174 }
175 BUG_ON(rc);
113 176
114 e820.nr_map = 0; 177 e820.nr_map = 0;
178 xen_extra_mem_start = mem_end;
179 for (i = 0; i < memmap.nr_entries; i++) {
180 unsigned long long end = map[i].addr + map[i].size;
181
182 if (map[i].type == E820_RAM) {
183 if (map[i].addr < mem_end && end > mem_end) {
184 /* Truncate region to max_mem. */
185 u64 delta = end - mem_end;
115 186
116 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); 187 map[i].size -= delta;
188 extra_pages += PFN_DOWN(delta);
189
190 end = mem_end;
191 }
192 }
193
194 if (end > xen_extra_mem_start)
195 xen_extra_mem_start = end;
196
197 /* If region is non-RAM or below mem_end, add what remains */
198 if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
199 map[i].size > 0)
200 e820_add_region(map[i].addr, map[i].size, map[i].type);
201 }
117 202
118 /* 203 /*
119 * Even though this is normal, usable memory under Xen, reserve 204 * Even though this is normal, usable memory under Xen, reserve
@@ -132,13 +217,35 @@ char * __init xen_memory_setup(void)
132 * - xen_start_info 217 * - xen_start_info
133 * See comment above "struct start_info" in <xen/interface/xen.h> 218 * See comment above "struct start_info" in <xen/interface/xen.h>
134 */ 219 */
135 reserve_early(__pa(xen_start_info->mfn_list), 220 memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
136 __pa(xen_start_info->pt_base), 221 __pa(xen_start_info->pt_base),
137 "XEN START INFO"); 222 "XEN START INFO");
138 223
139 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 224 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
140 225
141 xen_return_unused_memory(xen_start_info->nr_pages, &e820); 226 extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
227
228 /*
229 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
230 * factor the base size. On non-highmem systems, the base
231 * size is the full initial memory allocation; on highmem it
232 * is limited to the max size of lowmem, so that it doesn't
233 * get completely filled.
234 *
235 * In principle there could be a problem in lowmem systems if
236 * the initial memory is also very large with respect to
237 * lowmem, but we won't try to deal with that here.
238 */
239 extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
240 max_pfn + extra_pages);
241
242 if (extra_limit >= max_pfn)
243 extra_pages = extra_limit - max_pfn;
244 else
245 extra_pages = 0;
246
247 if (!xen_initial_domain())
248 xen_add_extra_mem(extra_pages);
142 249
143 return "Xen"; 250 return "Xen";
144} 251}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 834dfeb54e31..72a4c7959045 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -426,9 +426,9 @@ static void stop_self(void *v)
426 BUG(); 426 BUG();
427} 427}
428 428
429static void xen_smp_send_stop(void) 429static void xen_stop_other_cpus(int wait)
430{ 430{
431 smp_call_function(stop_self, NULL, 0); 431 smp_call_function(stop_self, NULL, wait);
432} 432}
433 433
434static void xen_smp_send_reschedule(int cpu) 434static void xen_smp_send_reschedule(int cpu)
@@ -496,7 +496,7 @@ static const struct smp_ops xen_smp_ops __initdata = {
496 .cpu_disable = xen_cpu_disable, 496 .cpu_disable = xen_cpu_disable,
497 .play_dead = xen_play_dead, 497 .play_dead = xen_play_dead,
498 498
499 .smp_send_stop = xen_smp_send_stop, 499 .stop_other_cpus = xen_stop_other_cpus,
500 .smp_send_reschedule = xen_smp_send_reschedule, 500 .smp_send_reschedule = xen_smp_send_reschedule,
501 501
502 .send_call_func_ipi = xen_smp_send_call_function_ipi, 502 .send_call_func_ipi = xen_smp_send_call_function_ipi,
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index e0500646585d..23e061b9327b 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab
224 goto out; 224 goto out;
225 } 225 }
226 226
227 flags = __raw_local_save_flags(); 227 flags = arch_local_save_flags();
228 if (irq_enable) { 228 if (irq_enable) {
229 ADD_STATS(taken_slow_irqenable, 1); 229 ADD_STATS(taken_slow_irqenable, 1);
230 raw_local_irq_enable(); 230 raw_local_irq_enable();
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 7c8ab86163e9..64044747348e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void);
30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31void xen_ident_map_ISA(void); 31void xen_ident_map_ISA(void);
32void xen_reserve_top(void); 32void xen_reserve_top(void);
33extern unsigned long xen_max_p2m_pfn;
34
35void xen_set_pat(u64);
33 36
34char * __init xen_memory_setup(void); 37char * __init xen_memory_setup(void);
35void __init xen_arch_setup(void); 38void __init xen_arch_setup(void);