Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 57
-rw-r--r--  arch/x86/Kconfig.cpu | 5
-rw-r--r--  arch/x86/Kconfig.debug | 15
-rw-r--r--  arch/x86/boot/compressed/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 2
-rw-r--r--  arch/x86/boot/compressed/misc.c | 6
-rw-r--r--  arch/x86/boot/compressed/mkpiggy.c | 2
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 1832
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 540
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c | 1
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 1
-rw-r--r--  arch/x86/include/asm/acpi.h | 11
-rw-r--r--  arch/x86/include/asm/alternative.h | 8
-rw-r--r--  arch/x86/include/asm/amd_nb.h | 60
-rw-r--r--  arch/x86/include/asm/apic.h | 13
-rw-r--r--  arch/x86/include/asm/apicdef.h | 1
-rw-r--r--  arch/x86/include/asm/boot.h | 6
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 42
-rw-r--r--  arch/x86/include/asm/cpu.h | 1
-rw-r--r--  arch/x86/include/asm/debugreg.h | 2
-rw-r--r--  arch/x86/include/asm/e820.h | 3
-rw-r--r--  arch/x86/include/asm/fixmap.h | 8
-rw-r--r--  arch/x86/include/asm/gpio.h | 5
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 12
-rw-r--r--  arch/x86/include/asm/i387.h | 24
-rw-r--r--  arch/x86/include/asm/io_apic.h | 9
-rw-r--r--  arch/x86/include/asm/irq.h | 7
-rw-r--r--  arch/x86/include/asm/jump_label.h | 2
-rw-r--r--  arch/x86/include/asm/kdebug.h | 3
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 35
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 102
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 24
-rw-r--r--  arch/x86/include/asm/mach_traps.h | 12
-rw-r--r--  arch/x86/include/asm/mce.h | 3
-rw-r--r--  arch/x86/include/asm/microcode.h | 6
-rw-r--r--  arch/x86/include/asm/mpspec.h | 31
-rw-r--r--  arch/x86/include/asm/mpspec_def.h | 7
-rw-r--r--  arch/x86/include/asm/msr-index.h | 18
-rw-r--r--  arch/x86/include/asm/nmi.h | 71
-rw-r--r--  arch/x86/include/asm/numa_32.h | 2
-rw-r--r--  arch/x86/include/asm/numa_64.h | 3
-rw-r--r--  arch/x86/include/asm/olpc.h | 10
-rw-r--r--  arch/x86/include/asm/olpc_ofw.h | 9
-rw-r--r--  arch/x86/include/asm/paravirt.h | 36
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 6
-rw-r--r--  arch/x86/include/asm/pci.h | 1
-rw-r--r--  arch/x86/include/asm/percpu.h | 158
-rw-r--r--  arch/x86/include/asm/perf_event.h | 2
-rw-r--r--  arch/x86/include/asm/perf_event_p4.h | 66
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable.h | 143
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/processor.h | 10
-rw-r--r--  arch/x86/include/asm/prom.h | 1
-rw-r--r--  arch/x86/include/asm/pvclock.h | 1
-rw-r--r--  arch/x86/include/asm/smpboot_hooks.h | 1
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 33
-rw-r--r--  arch/x86/include/asm/svm.h | 57
-rw-r--r--  arch/x86/include/asm/system_64.h | 22
-rw-r--r--  arch/x86/include/asm/timer.h | 6
-rw-r--r--  arch/x86/include/asm/traps.h | 1
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 9
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 4
-rw-r--r--  arch/x86/include/asm/uv/uv_mmrs.h | 208
-rw-r--r--  arch/x86/include/asm/vmx.h | 15
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 35
-rw-r--r--  arch/x86/include/asm/xen/interface.h | 6
-rw-r--r--  arch/x86/include/asm/xen/interface_32.h | 5
-rw-r--r--  arch/x86/include/asm/xen/interface_64.h | 13
-rw-r--r--  arch/x86/include/asm/xen/page.h | 21
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 60
-rw-r--r--  arch/x86/kernel/alternative.c | 52
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 4
-rw-r--r--  arch/x86/kernel/amd_nb.c | 142
-rw-r--r--  arch/x86/kernel/apb_timer.c | 14
-rw-r--r--  arch/x86/kernel/aperture_64.c | 54
-rw-r--r--  arch/x86/kernel/apic/Makefile | 5
-rw-r--r--  arch/x86/kernel/apic/apic.c | 201
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 44
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 119
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 567
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 7
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 96
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 154
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 20
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 135
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 41
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 110
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 20
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 30
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 28
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 644
-rw-r--r--  arch/x86/kernel/cpuid.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack.c | 19
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 25
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 26
-rw-r--r--  arch/x86/kernel/e820.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 12
-rw-r--r--  arch/x86/kernel/entry_64.S | 41
-rw-r--r--  arch/x86/kernel/ftrace.c | 9
-rw-r--r--  arch/x86/kernel/head_32.S | 93
-rw-r--r--  arch/x86/kernel/hpet.c | 26
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 16
-rw-r--r--  arch/x86/kernel/i387.c | 1
-rw-r--r--  arch/x86/kernel/irq.c | 16
-rw-r--r--  arch/x86/kernel/irq_32.c | 11
-rw-r--r--  arch/x86/kernel/kgdb.c | 19
-rw-r--r--  arch/x86/kernel/kprobes.c | 127
-rw-r--r--  arch/x86/kernel/kvm.c | 317
-rw-r--r--  arch/x86/kernel/kvmclock.c | 13
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 36
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 16
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 71
-rw-r--r--  arch/x86/kernel/module.c | 17
-rw-r--r--  arch/x86/kernel/mpparse.c | 114
-rw-r--r--  arch/x86/kernel/msr.c | 1
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 34
-rw-r--r--  arch/x86/kernel/process.c | 45
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 4
-rw-r--r--  arch/x86/kernel/pvclock.c | 43
-rw-r--r--  arch/x86/kernel/reboot.c | 5
-rw-r--r--  arch/x86/kernel/resource.c | 48
-rw-r--r--  arch/x86/kernel/rtc.c | 2
-rw-r--r--  arch/x86/kernel/setup.c | 31
-rw-r--r--  arch/x86/kernel/smpboot.c | 55
-rw-r--r--  arch/x86/kernel/stacktrace.c | 8
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/time.c | 18
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 2
-rw-r--r--  arch/x86/kernel/traps.c | 131
-rw-r--r--  arch/x86/kernel/tsc.c | 100
-rw-r--r--  arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S) | 49
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 8
-rw-r--r--  arch/x86/kernel/xsave.c | 3
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 367
-rw-r--r--  arch/x86/kvm/i8259.c | 2
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 22
-rw-r--r--  arch/x86/kvm/lapic.c | 3
-rw-r--r--  arch/x86/kvm/mmu.c | 379
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 39
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 156
-rw-r--r--  arch/x86/kvm/svm.c | 861
-rw-r--r--  arch/x86/kvm/trace.h | 17
-rw-r--r--  arch/x86/kvm/vmx.c | 180
-rw-r--r--  arch/x86/kvm/x86.c | 487
-rw-r--r--  arch/x86/kvm/x86.h | 5
-rw-r--r--  arch/x86/lguest/Kconfig | 1
-rw-r--r--  arch/x86/lguest/boot.c | 18
-rw-r--r--  arch/x86/lib/delay.c | 2
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/amdtopology_64.c (renamed from arch/x86/mm/k8topology_64.c) | 97
-rw-r--r--  arch/x86/mm/gup.c | 28
-rw-r--r--  arch/x86/mm/init.c | 3
-rw-r--r--  arch/x86/mm/init_32.c | 22
-rw-r--r--  arch/x86/mm/kmemcheck/error.c | 2
-rw-r--r--  arch/x86/mm/numa.c | 22
-rw-r--r--  arch/x86/mm/numa_64.c | 199
-rw-r--r--  arch/x86/mm/pageattr.c | 33
-rw-r--r--  arch/x86/mm/pgtable.c | 66
-rw-r--r--  arch/x86/mm/setup_nx.c | 2
-rw-r--r--  arch/x86/mm/srat_32.c | 2
-rw-r--r--  arch/x86/mm/srat_64.c | 36
-rw-r--r--  arch/x86/mm/tlb.c | 7
-rw-r--r--  arch/x86/oprofile/backtrace.c | 2
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 8
-rw-r--r--  arch/x86/oprofile/nmi_timer_int.c | 5
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 79
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 2
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 8
-rw-r--r--  arch/x86/pci/acpi.c | 103
-rw-r--r--  arch/x86/pci/amd_bus.c | 33
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 11
-rw-r--r--  arch/x86/pci/common.c | 41
-rw-r--r--  arch/x86/pci/i386.c | 18
-rw-r--r--  arch/x86/pci/irq.c | 3
-rw-r--r--  arch/x86/pci/pcbios.c | 23
-rw-r--r--  arch/x86/pci/xen.c | 35
-rw-r--r--  arch/x86/platform/mrst/early_printk_mrst.c | 2
-rw-r--r--  arch/x86/platform/mrst/mrst.c | 30
-rw-r--r--  arch/x86/platform/olpc/Makefile | 1
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c | 101
-rw-r--r--  arch/x86/platform/olpc/olpc_dt.c | 183
-rw-r--r--  arch/x86/platform/olpc/olpc_ofw.c | 5
-rw-r--r--  arch/x86/platform/sfi/sfi.c | 17
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 37
-rw-r--r--  arch/x86/platform/uv/uv_time.c | 4
-rw-r--r--  arch/x86/platform/visws/visws_quirks.c | 2
-rw-r--r--  arch/x86/vdso/Makefile | 4
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 71
-rw-r--r--  arch/x86/xen/irq.c | 2
-rw-r--r--  arch/x86/xen/mmu.c | 454
-rw-r--r--  arch/x86/xen/multicalls.h | 2
-rw-r--r--  arch/x86/xen/p2m.c | 522
-rw-r--r--  arch/x86/xen/platform-pci-unplug.c | 2
-rw-r--r--  arch/x86/xen/setup.c | 79
-rw-r--r--  arch/x86/xen/spinlock.c | 8
-rw-r--r--  arch/x86/xen/suspend.c | 1
-rw-r--r--  arch/x86/xen/time.c | 10
-rw-r--r--  arch/x86/xen/xen-ops.h | 2
214 files changed, 8533 insertions, 4571 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6ab63107eeaf..d5ed94d30aad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
-	select HAVE_PERF_EVENTS if (!M386 && !M486)
+	select HAVE_PERF_EVENTS
 	select HAVE_IRQ_WORK
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
@@ -51,6 +51,7 @@ config X86
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
+	select HAVE_KERNEL_XZ
 	select HAVE_KERNEL_LZO
 	select HAVE_HW_BREAKPOINT
 	select HAVE_MIXED_BREAKPOINTS_REGS
@@ -65,6 +66,7 @@ config X86
 	select HAVE_SPARSE_IRQ
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
+	select USE_GENERIC_SMP_HELPERS if SMP
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -203,10 +205,6 @@ config HAVE_INTEL_TXT
 	def_bool y
 	depends on EXPERIMENTAL && DMAR && ACPI
 
-config USE_GENERIC_SMP_HELPERS
-	def_bool y
-	depends on SMP
-
 config X86_32_SMP
 	def_bool y
 	depends on X86_32 && SMP
@@ -629,11 +627,11 @@ config APB_TIMER
 	  as it is off-chip. APB timers are always running regardless of CPU
 	  C states, they are used as per CPU clockevent device when possible.
 
-# Mark as embedded because too many people got it wrong.
+# Mark as expert because too many people got it wrong.
 # The code disables itself when not needed.
 config DMI
 	default y
-	bool "Enable DMI scanning" if EMBEDDED
+	bool "Enable DMI scanning" if EXPERT
 	---help---
 	  Enabled scanning of DMI to identify machine quirks. Say Y
 	  here unless you have verified that your setup is not
@@ -641,7 +639,7 @@ config DMI
 	  BIOS code.
 
 config GART_IOMMU
-	bool "GART IOMMU support" if EMBEDDED
+	bool "GART IOMMU support" if EXPERT
 	default y
 	select SWIOTLB
 	depends on X86_64 && PCI && AMD_NB
@@ -891,7 +889,7 @@ config X86_THERMAL_VECTOR
 	depends on X86_MCE_INTEL
 
 config VM86
-	bool "Enable VM86 support" if EMBEDDED
+	bool "Enable VM86 support" if EXPERT
 	default y
 	depends on X86_32
 	---help---
@@ -1075,7 +1073,7 @@ endchoice
 
 choice
 	depends on EXPERIMENTAL
-	prompt "Memory split" if EMBEDDED
+	prompt "Memory split" if EXPERT
 	default VMSPLIT_3G
 	depends on X86_32
 	---help---
@@ -1137,7 +1135,7 @@ config ARCH_DMA_ADDR_T_64BIT
 	def_bool X86_64 || HIGHMEM64G
 
 config DIRECT_GBPAGES
-	bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+	bool "Enable 1GB pages for kernel pagetables" if EXPERT
 	default y
 	depends on X86_64
 	---help---
@@ -1170,16 +1168,16 @@ config NUMA
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
 	depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
 
-config K8_NUMA
+config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
 	depends on X86_64 && NUMA && PCI
 	---help---
-	  Enable K8 NUMA node topology detection. You should say Y here if
-	  you have a multi processor AMD K8 system. This uses an old
-	  method to read the NUMA configuration directly from the builtin
-	  Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-	  instead, which also takes priority if both are compiled in.
+	  Enable AMD NUMA node topology detection. You should say Y here if
+	  you have a multi processor AMD system. This uses an old method to
+	  read the NUMA configuration directly from the builtin Northbridge
+	  of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+	  which also takes priority if both are compiled in.
 
 config X86_64_ACPI_NUMA
 	def_bool y
@@ -1371,7 +1369,7 @@ config MATH_EMULATION
 
 config MTRR
 	def_bool y
-	prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
+	prompt "MTRR (Memory Type Range Register) support" if EXPERT
 	---help---
 	  On Intel P6 family processors (Pentium Pro, Pentium II and later)
 	  the Memory Type Range Registers (MTRRs) may be used to control
@@ -1437,7 +1435,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
 
 config X86_PAT
 	def_bool y
-	prompt "x86 PAT support" if EMBEDDED
+	prompt "x86 PAT support" if EXPERT
 	depends on MTRR
 	---help---
 	  Use PAT attributes to setup page level cache control.
@@ -1541,7 +1539,7 @@ config KEXEC_JUMP
 	  code in physical address mode via KEXEC
 
 config PHYSICAL_START
-	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+	hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
 	default "0x1000000"
 	---help---
 	  This gives the physical address where the kernel is loaded.
@@ -1936,13 +1934,19 @@ config PCI_MMCONFIG
 	depends on X86_64 && PCI && ACPI
 
 config PCI_CNB20LE_QUIRK
-	bool "Read CNB20LE Host Bridge Windows"
-	depends on PCI
+	bool "Read CNB20LE Host Bridge Windows" if EXPERT
+	default n
+	depends on PCI && EXPERIMENTAL
 	help
 	  Read the PCI windows out of the CNB20LE host bridge. This allows
 	  PCI hotplug to work on systems with the CNB20LE chipset which do
 	  not have ACPI.
 
+	  There's no public spec for this chipset, and this functionality
+	  is known to be incomplete.
+
+	  You should say N unless you know you need this.
+
 config DMAR
 	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
 	depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -2064,13 +2068,14 @@ config OLPC
 	bool "One Laptop Per Child support"
 	select GPIOLIB
 	select OLPC_OPENFIRMWARE
+	depends on !X86_64 && !X86_PAE
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
 
 config OLPC_XO1
 	tristate "OLPC XO-1 support"
-	depends on OLPC && PCI
+	depends on OLPC && MFD_CS5535
 	---help---
 	  Add support for non-essential features of the OLPC XO-1 laptop.
 
@@ -2078,11 +2083,17 @@ config OLPC_OPENFIRMWARE
 	bool "Support for OLPC's Open Firmware"
 	depends on !X86_64 && !X86_PAE
 	default n
+	select OF
 	help
 	  This option adds support for the implementation of Open Firmware
 	  that is used on the OLPC XO-1 Children's Machine.
 	  If unsure, say N here.
 
+config OLPC_OPENFIRMWARE_DT
+	bool
+	default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
+	select OF_PROMTREE
+
 endif # X86_32
 
 config AMD_NB
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ac9069890cd..283c5a6a03a6 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
+config CMPXCHG_LOCAL
+	def_bool X86_64 || (X86_32 && !M386)
+
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
@@ -421,7 +424,7 @@ config X86_DEBUGCTLMSR
 	depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
 
 menuconfig PROCESSOR_SELECT
-	bool "Supported processor vendors" if EMBEDDED
+	bool "Supported processor vendors" if EXPERT
 	---help---
 	  This lets you choose what x86 vendor support code your kernel
 	  will include.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index b59ee765414e..615e18810f48 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP
 	  see errors. Disable this if you want silent bootup.
 
 config EARLY_PRINTK
-	bool "Early printk" if EMBEDDED
+	bool "Early printk" if EXPERT
 	default y
 	---help---
 	  Write kernel log output directly into the VGA buffer or to a serial
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST
 	  feature as well as for the change_page_attr() infrastructure.
 	  If in doubt, say "N"
 
+config DEBUG_SET_MODULE_RONX
+	bool "Set loadable kernel module data as NX and text as RO"
+	depends on MODULES
+	---help---
+	  This option helps catch unintended modifications to loadable
+	  kernel module's text and read-only data. It also prevents execution
+	  of module data. Such protection may interfere with run-time code
+	  patching and dynamic kernel tracing - and they might also protect
+	  against certain classes of kernel exploits.
+	  If in doubt, say "N".
+
 config DEBUG_NX_TEST
 	tristate "Testcase for the NX non-executable stack feature"
 	depends on DEBUG_KERNEL && m
@@ -127,7 +138,7 @@ config DEBUG_NX_TEST
 
 config DOUBLEFAULT
 	default y
-	bool "Enable doublefault exception handler" if EMBEDDED
+	bool "Enable doublefault exception handler" if EXPERT
 	depends on X86_32
 	---help---
 	  This option allows trapping of rare doublefault exceptions that
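The DEBUG_SET_MODULE_RONX option added above amounts to the module loader applying page-protection calls over a module's sections once they are laid out. A minimal sketch of the idea, assuming the existing set_memory_ro()/set_memory_nx() helpers; this is an illustration, not the code the option actually enables:

#include <asm/cacheflush.h>

/* illustration: mark module text read-only and module data non-executable */
static void protect_module_pages(unsigned long text, int text_pages,
				 unsigned long data, int data_pages)
{
	set_memory_ro(text, text_pages);	/* writes to module text now fault */
	set_memory_nx(data, data_pages);	/* executing module data now faults */
}
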
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0c229551eead..09664efb9cee 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
 # create a compressed vmlinux image from the original vmlinux
 #
 
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
 
 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -49,12 +49,15 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,bzip2)
 $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzma)
+$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
+	$(call if_changed,xzkern)
 $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzo)
 
 suffix-$(CONFIG_KERNEL_GZIP) := gz
 suffix-$(CONFIG_KERNEL_BZIP2) := bz2
 suffix-$(CONFIG_KERNEL_LZMA) := lzma
+suffix-$(CONFIG_KERNEL_XZ) := xz
 suffix-$(CONFIG_KERNEL_LZO) := lzo
 
 quiet_cmd_mkpiggy = MKPIGGY $@
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
 	hlt
 	jmp     1b
 
-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"
 
 	/*
 	 * Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 23f315c9f215..3a19d04cebeb 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -139,6 +139,10 @@ static int lines, cols;
 #include "../../../../lib/decompress_unlzma.c"
 #endif
 
+#ifdef CONFIG_KERNEL_XZ
+#include "../../../../lib/decompress_unxz.c"
+#endif
+
 #ifdef CONFIG_KERNEL_LZO
 #include "../../../../lib/decompress_unlzo.c"
 #endif
@@ -355,7 +359,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	if (heap > 0x3fffffffffffUL)
 		error("Destination address too large");
 #else
-	if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
+	if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
 		error("Destination address too large");
 #endif
 #ifndef CONFIG_RELOCATABLE
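The 32-bit branch of that check caps how high the decompression heap may sit: roughly, it must fit within the address space left above __PAGE_OFFSET minus a reserve, and the patch shrinks that reserve from 512 MB to 128 MB. A rough standalone illustration (not part of the patch) of what the bound evaluates to, assuming the default 3G/1G split where __PAGE_OFFSET is 0xC0000000:

/* illustration only: evaluate the old and new 32-bit heap bounds */
#include <stdio.h>

int main(void)
{
	unsigned long page_offset = 0xC0000000UL;	/* assumed 3G/1G split */
	unsigned long old_limit = (-page_offset - (512UL << 20) - 1) & 0x7fffffff;
	unsigned long new_limit = (-page_offset - (128UL << 20) - 1) & 0x7fffffff;

	/* prints roughly 512 MB for the old bound and 896 MB for the new one */
	printf("old limit: %#lx, new limit: %#lx\n", old_limit, new_limit);
	return 0;
}
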
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 5c228129d175..646aa78ba5fd 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -74,7 +74,7 @@ int main(int argc, char *argv[])
 
 	offs = (olen > ilen) ? olen - ilen : 0;
 	offs += olen >> 12;	/* Add 8 bytes for each 32K block */
-	offs += 32*1024 + 18;	/* Add 32K + 18 bytes slack */
+	offs += 64*1024 + 128;	/* Add 64K + 128 bytes slack */
 	offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
 
 	printf(".section \".rodata..compressed\",\"a\",@progbits\n");
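Taken together with the surrounding lines, the slack computation in mkpiggy.c now works as in the sketch below (a restatement for clarity, not code from the patch); ilen and olen are the compressed and uncompressed image sizes:

/* sketch: how mkpiggy sizes the in-place decompression offset */
static unsigned long piggy_offset(unsigned long ilen, unsigned long olen)
{
	unsigned long offs;

	offs = (olen > ilen) ? olen - ilen : 0;	/* growth of the data itself */
	offs += olen >> 12;			/* ~8 bytes of bookkeeping per 32K block */
	offs += 64*1024 + 128;			/* decompressor slack, raised from 32K + 18 */
	return (offs + 4095) & ~4095UL;		/* round up to a 4K boundary */
}
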
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..8fe2a4966b7a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
9 * Vinodh Gopal <vinodh.gopal@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir 10 * Kahraman Akdemir
11 * 11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
12 * This program is free software; you can redistribute it and/or modify 26 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 27 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or 28 * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
18#include <linux/linkage.h> 32#include <linux/linkage.h>
19#include <asm/inst.h> 33#include <asm/inst.h>
20 34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
21.text 56.text
22 57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
88#endif
89
90
23#define STATE1 %xmm0 91#define STATE1 %xmm0
24#define STATE2 %xmm4 92#define STATE2 %xmm4
25#define STATE3 %xmm5 93#define STATE3 %xmm5
@@ -32,12 +100,16 @@
32#define IN IN1 100#define IN IN1
33#define KEY %xmm2 101#define KEY %xmm2
34#define IV %xmm3 102#define IV %xmm3
103
35#define BSWAP_MASK %xmm10 104#define BSWAP_MASK %xmm10
36#define CTR %xmm11 105#define CTR %xmm11
37#define INC %xmm12 106#define INC %xmm12
38 107
108#ifdef __x86_64__
109#define AREG %rax
39#define KEYP %rdi 110#define KEYP %rdi
40#define OUTP %rsi 111#define OUTP %rsi
112#define UKEYP OUTP
41#define INP %rdx 113#define INP %rdx
42#define LEN %rcx 114#define LEN %rcx
43#define IVP %r8 115#define IVP %r8
@@ -46,6 +118,1588 @@
46#define TKEYP T1 118#define TKEYP T1
47#define T2 %r11 119#define T2 %r11
48#define TCTR_LOW T2 120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154 pxor \GH, \TMP2
155 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
160 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
166 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167 # in in order to perform
168 # independent shifts
169 pslld $31, \TMP2 # packed right shift <<31
170 pslld $30, \TMP3 # packed right shift <<30
171 pslld $25, \TMP4 # packed right shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
181 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182 # in in order to perform
183 # independent shifts
184 movdqa \GH,\TMP3
185 movdqa \GH,\TMP4
186 psrld $1,\TMP2 # packed left shift >>1
187 psrld $2,\TMP3 # packed left shift >>2
188 psrld $7,\TMP4 # packed left shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
193 pxor \TMP1, \GH # result is in TMP1
194.endm
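The HashKey_*_k values stored above exist because GHASH_MUL splits each 128-bit carry-less multiplication Karatsuba-style into three 64x64 PCLMULQDQ operations instead of four. A minimal C sketch of that identity (illustration only, not kernel code; a bit-by-bit clmul stands in for the instruction):

struct u128 { unsigned long long hi, lo; };

/* software carry-less multiply: 64 x 64 -> 128 bits */
static struct u128 clmul64(unsigned long long a, unsigned long long b)
{
	struct u128 r = { 0, 0 };
	int i;

	for (i = 0; i < 64; i++)
		if ((b >> i) & 1) {
			r.lo ^= a << i;
			r.hi ^= i ? a >> (64 - i) : 0;
		}
	return r;
}

/* 128 x 128 -> 256-bit carry-less product with three multiplies:
 * H = a1*b1, L = a0*b0, M = (a1^a0)*(b1^b0) ^ H ^ L, result = H:L ^ (M << 64)
 */
static void clmul128_karatsuba(struct u128 a, struct u128 b, struct u128 res[2])
{
	struct u128 h = clmul64(a.hi, b.hi);
	struct u128 l = clmul64(a.lo, b.lo);
	struct u128 m = clmul64(a.hi ^ a.lo, b.hi ^ b.lo);

	m.hi ^= h.hi ^ l.hi;
	m.lo ^= h.lo ^ l.lo;

	res[0] = l;		/* low 128 bits of the product */
	res[1] = h;		/* high 128 bits of the product */
	res[0].hi ^= m.lo;	/* fold the middle term into bits 64..191 */
	res[1].lo ^= m.hi;
}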
195
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
205*/
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
264 AESENC \TMP1, %xmm\index # Round 2
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
268 AESENC \TMP1, %xmm\index # Round 2
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
272 AESENC \TMP1, %xmm\index # Round 2
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
276 AESENC \TMP1, %xmm\index # Round 2
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
280 AESENC \TMP1, %xmm\index # Round 2
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
284 AESENC \TMP1, %xmm\index # Round 2
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
288 AESENC \TMP1, %xmm\index # Round 2
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
333* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391# TMP5 = HashKey^3<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
442* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
502 AESENC \TMP1, %xmm\index # Round 2
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
506 AESENC \TMP1, %xmm\index # Round 2
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
510 AESENC \TMP1, %xmm\index # Round 2
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
514 AESENC \TMP1, %xmm\index # Round 2
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
518 AESENC \TMP1, %xmm\index # Round 2
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
522 AESENC \TMP1, %xmm\index # Round 2
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
526 AESENC \TMP1, %xmm\index # Round 2
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
570* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^3<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832 pslld $31, \TMP2 # packed right shift << 31
833 pslld $30, \TMP3 # packed right shift << 30
834 pslld $25, \TMP4 # packed right shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847 psrld $1, \TMP2 # packed left shift >>1
848 psrld $2, \TMP3 # packed left shift >>2
849 psrld $7, \TMP4 # packed left shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
854 pxor \TMP1, \XMM5 # result is in TMP1
855
856 pxor \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 1028	pslld	$31, \TMP2		      # packed left shift << 31
 1029	pslld	$30, \TMP3		      # packed left shift << 30
 1030	pslld	$25, \TMP4		      # packed left shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
 1043	psrld	$1, \TMP2		      # packed right shift >> 1
 1044	psrld	$2, \TMP3		      # packed right shift >> 2
 1045	psrld	$7, \TMP4		      # packed right shift >> 7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
 1050	pxor	\TMP1, \XMM5		      # result is in XMM5
1051
1052 pxor \XMM5, \XMM1
1053.endm
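
The "first phase"/"second phase" steps above reduce the 256-bit carry-less
product back to 128 bits. An informal sketch of the algebra (not spelled out
in the code itself), written against the field polynomial quoted in the
function headers below:

    g(x) = x^{128} + x^{127} + x^{126} + x^{121} + 1
    \;\Longrightarrow\; x^{128} \equiv x^{127} + x^{126} + x^{121} + 1 \pmod{g(x)}

    P(x) = H(x)\,x^{128} + L(x)
         \;\equiv\; H(x)\bigl(x^{127} + x^{126} + x^{121} + 1\bigr) + L(x) \pmod{g(x)}

The three fold terms sit 1, 2 and 7 bit positions below x^{128}, which is
where the shift distances 1/2/7 and their 32-bit-lane complements 31/30/25
in the two phases come from; which xmm register plays the role of H(x)
versus L(x) follows from the byte-reflected layout set up by the PSHUFB
shuffles.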
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
 1059	# Multiply XMM1 * HashKey_4 (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
 1072	# Multiply XMM2 * HashKey_3 (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
 1087	# Multiply XMM3 * HashKey_2 (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
 1101	# Multiply XMM4 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
 1127	pslld	$31, \TMP2		# packed left shifting << 31
 1128	pslld	$30, \TMP3		# packed left shifting << 30
 1129	pslld	$25, \TMP4		# packed left shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
 1142	psrld	$1, \TMP2		# packed right shift >> 1
 1143	psrld	$2, \TMP3		# packed right shift >> 2
 1144	psrld	$7, \TMP4		# packed right shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
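
A note on why four hash-key powers (HashKey_4 .. HashKey on the stack) are
used here, as an informal sketch rather than anything stated in the code:
unrolling the GHASH recurrence Y_i = (Y_{i-1} \oplus X_i) \cdot H over four
ciphertext blocks gives

    Y_4 \;=\; X_1 H^4 \,\oplus\, X_2 H^3 \,\oplus\, X_3 H^2 \,\oplus\, X_4 H

with the running accumulator already folded into X_1 by the callers above,
so the four products can be computed independently and simply XORed together
before the single reduction at the end of this macro.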
1151
1152/* Encrypt a single block. */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Decrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. We are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[2] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* per the spec, aadLen can only be 8 or 12 bytes.
1256* The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259* per the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
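
For readers coming at this from the C side, a hypothetical caller sketch
(every name below is invented for illustration; the caller actually added by
this patch is __driver_rfc4106_decrypt() in the aesni-intel_glue.c hunk
further down, and the prototype matches the asmlinkage declaration added
there):

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/byteorder.h>
#include <crypto/aes.h>

asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, const u8 *in,
			      unsigned long ciphertext_len, u8 *iv,
			      u8 *hash_subkey, const u8 *aad,
			      unsigned long aad_len,
			      u8 *auth_tag, unsigned long auth_tag_len);

/* Illustrative only: build j0 as drawn above and verify the tag ourselves. */
static int example_rfc4106_decrypt(struct crypto_aes_ctx *aes_ctx,
				   u8 *hash_subkey,
				   const u8 salt[4], const u8 esp_iv[8],
				   const u8 *aad, unsigned long aad_len,
				   const u8 *ct, unsigned long ct_len,
				   const u8 *expected_tag,
				   unsigned long tag_len, u8 *pt)
{
	u8 iv_buf[16 + 16];
	u8 *j0 = PTR_ALIGN(iv_buf, 16);	/* the iv pointer must be 16-byte aligned */
	u8 tag[16];

	/* Pre-counter block j0 = salt || ESP IV || 0x00000001 (see diagram) */
	memcpy(j0, salt, 4);
	memcpy(j0 + 4, esp_iv, 8);
	*(__be32 *)(j0 + 12) = cpu_to_be32(1);

	aesni_gcm_dec(aes_ctx, pt, ct, ct_len, j0, hash_subkey,
		      aad, aad_len, tag, tag_len);

	/* The asm routine only emits the tag; the caller does the compare. */
	return memcmp(tag, expected_tag, tag_len) ? -EBADMSG : 0;
}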
1265
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
 1349	# Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
 1358	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
 1364	PSHUFB_XMM %xmm2, %xmm1        # right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
 1397	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
 1400	shl	$3, %arg4		  # len(C) in bits (*8)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* keys are pre-expanded and aligned to 16 bytes. We are using the
1461* first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[2] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* per the spec, aadLen can only be 8 or 12 bytes.
1517* The code supports 16 too but for other sizes, the code will fail.
1518*
1519* TLen:
1520* per the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
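
The encrypt entry point is called the same way; a hypothetical sketch (names
invented, with j0 and hash_subkey prepared exactly as in the decrypt sketch
earlier; the real caller added by this patch is __driver_rfc4106_encrypt()
in the glue code below). The point worth noting is that RFC4106 places the
tag immediately after the ciphertext, so an in-place caller passes
buf + plaintext_len as auth_tag and must size the buffer for
plaintext_len + auth_tag_len bytes:

asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, const u8 *in,
			      unsigned long plaintext_len, u8 *iv,
			      u8 *hash_subkey, const u8 *aad,
			      unsigned long aad_len,
			      u8 *auth_tag, unsigned long auth_tag_len);

/* Illustrative only: in-place encryption with the tag appended. */
static void example_rfc4106_encrypt(struct crypto_aes_ctx *aes_ctx,
				    u8 *hash_subkey, u8 *j0,
				    const u8 *aad, unsigned long aad_len,
				    u8 *buf, unsigned long plaintext_len,
				    unsigned long tag_len)
{
	aesni_gcm_enc(aes_ctx, buf, buf, plaintext_len, j0, hash_subkey,
		      aad, aad_len, buf + plaintext_len, tag_len);
}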
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557 pxor %xmm2, %xmm13
 1558	movdqa	%xmm13, HashKey(%rsp)	  # store HashKey<<1 (mod poly)
 1559	mov	%arg4, %r13		  # save the number of bytes of plaintext
 1560	and	$-16, %r13		  # %r13 = %r13 - (%r13 mod 16)
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
 1610	# Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1616 sub $16, %r11
1617 add %r13, %r11
 1618	movdqu	(%arg3,%r11,1), %xmm1	  # receive the last <16 byte block
1619 lea SHIFT_MASK+16(%rip), %r12
1620 sub %r13, %r12
1621 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1622 # (%r13 is the number of bytes in plaintext mod 16)
1623 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
 1624	PSHUFB_XMM %xmm2, %xmm1		  # shift right 16-r13 bytes
1625 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1626 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1627 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1628 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1629 movdqa SHUF_MASK(%rip), %xmm10
1630 PSHUFB_XMM %xmm10,%xmm0
1631
1632 pxor %xmm0, %xmm8
1633 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1634 # GHASH computation for the last <16 byte block
1635 sub %r13, %r11
1636 add $16, %r11
1637 PSHUFB_XMM %xmm10, %xmm1
1638
1639 # shuffle xmm0 back to output as ciphertext
1640
1641 # Output %r13 bytes
1642 MOVQ_R64_XMM %xmm0, %rax
1643 cmp $8, %r13
1644 jle _less_than_8_bytes_left_encrypt
1645 mov %rax, (%arg2 , %r11, 1)
1646 add $8, %r11
1647 psrldq $8, %xmm0
1648 MOVQ_R64_XMM %xmm0, %rax
1649 sub $8, %r13
1650_less_than_8_bytes_left_encrypt:
1651 mov %al, (%arg2, %r11, 1)
1652 add $1, %r11
1653 shr $8, %rax
1654 sub $1, %r13
1655 jne _less_than_8_bytes_left_encrypt
1656_multiple_of_16_bytes_encrypt:
 1657	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
1658 shl $3, %r12
1659 movd %r12d, %xmm15 # len(A) in %xmm15
 1660	shl	$3, %arg4		# len(C) in bits (*8)
1661 MOVQ_R64_XMM %arg4, %xmm1
1662 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1663 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1664 pxor %xmm15, %xmm8
1665 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1666 # final GHASH computation
1667 movdqa SHUF_MASK(%rip), %xmm10
1668 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1669
1670 mov %arg5, %rax # %rax = *Y0
1671 movdqu (%rax), %xmm0 # %xmm0 = Y0
1672 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1673 pxor %xmm8, %xmm0
1674_return_T_encrypt:
1675 mov arg9, %r10 # %r10 = authTag
1676 mov arg10, %r11 # %r11 = auth_tag_len
1677 cmp $16, %r11
1678 je _T_16_encrypt
1679 cmp $12, %r11
1680 je _T_12_encrypt
1681_T_8_encrypt:
1682 MOVQ_R64_XMM %xmm0, %rax
1683 mov %rax, (%r10)
1684 jmp _return_T_done_encrypt
1685_T_12_encrypt:
1686 MOVQ_R64_XMM %xmm0, %rax
1687 mov %rax, (%r10)
1688 psrldq $8, %xmm0
1689 movd %xmm0, %eax
1690 mov %eax, 8(%r10)
1691 jmp _return_T_done_encrypt
1692_T_16_encrypt:
1693 movdqu %xmm0, (%r10)
1694_return_T_done_encrypt:
1695 mov %r14, %rsp
1696 pop %r14
1697 pop %r13
1698 pop %r12
1699 ret
1700
1701#endif
1702
49 1703
50_key_expansion_128: 1704_key_expansion_128:
51_key_expansion_256a: 1705_key_expansion_256a:
@@ -55,10 +1709,11 @@ _key_expansion_256a:
55 shufps $0b10001100, %xmm0, %xmm4 1709 shufps $0b10001100, %xmm0, %xmm4
56 pxor %xmm4, %xmm0 1710 pxor %xmm4, %xmm0
57 pxor %xmm1, %xmm0 1711 pxor %xmm1, %xmm0
58 movaps %xmm0, (%rcx) 1712 movaps %xmm0, (TKEYP)
59 add $0x10, %rcx 1713 add $0x10, TKEYP
60 ret 1714 ret
61 1715
1716.align 4
62_key_expansion_192a: 1717_key_expansion_192a:
63 pshufd $0b01010101, %xmm1, %xmm1 1718 pshufd $0b01010101, %xmm1, %xmm1
64 shufps $0b00010000, %xmm0, %xmm4 1719 shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1731,13 @@ _key_expansion_192a:
76 1731
77 movaps %xmm0, %xmm1 1732 movaps %xmm0, %xmm1
78 shufps $0b01000100, %xmm0, %xmm6 1733 shufps $0b01000100, %xmm0, %xmm6
79 movaps %xmm6, (%rcx) 1734 movaps %xmm6, (TKEYP)
80 shufps $0b01001110, %xmm2, %xmm1 1735 shufps $0b01001110, %xmm2, %xmm1
81 movaps %xmm1, 16(%rcx) 1736 movaps %xmm1, 0x10(TKEYP)
82 add $0x20, %rcx 1737 add $0x20, TKEYP
83 ret 1738 ret
84 1739
1740.align 4
85_key_expansion_192b: 1741_key_expansion_192b:
86 pshufd $0b01010101, %xmm1, %xmm1 1742 pshufd $0b01010101, %xmm1, %xmm1
87 shufps $0b00010000, %xmm0, %xmm4 1743 shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1752,11 @@ _key_expansion_192b:
96 pxor %xmm3, %xmm2 1752 pxor %xmm3, %xmm2
97 pxor %xmm5, %xmm2 1753 pxor %xmm5, %xmm2
98 1754
99 movaps %xmm0, (%rcx) 1755 movaps %xmm0, (TKEYP)
100 add $0x10, %rcx 1756 add $0x10, TKEYP
101 ret 1757 ret
102 1758
1759.align 4
103_key_expansion_256b: 1760_key_expansion_256b:
104 pshufd $0b10101010, %xmm1, %xmm1 1761 pshufd $0b10101010, %xmm1, %xmm1
105 shufps $0b00010000, %xmm2, %xmm4 1762 shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1764,8 @@ _key_expansion_256b:
107 shufps $0b10001100, %xmm2, %xmm4 1764 shufps $0b10001100, %xmm2, %xmm4
108 pxor %xmm4, %xmm2 1765 pxor %xmm4, %xmm2
109 pxor %xmm1, %xmm2 1766 pxor %xmm1, %xmm2
110 movaps %xmm2, (%rcx) 1767 movaps %xmm2, (TKEYP)
111 add $0x10, %rcx 1768 add $0x10, TKEYP
112 ret 1769 ret
113 1770
114/* 1771/*
@@ -116,17 +1773,23 @@ _key_expansion_256b:
116 * unsigned int key_len) 1773 * unsigned int key_len)
117 */ 1774 */
118ENTRY(aesni_set_key) 1775ENTRY(aesni_set_key)
119 movups (%rsi), %xmm0 # user key (first 16 bytes) 1776#ifndef __x86_64__
120 movaps %xmm0, (%rdi) 1777 pushl KEYP
121 lea 0x10(%rdi), %rcx # key addr 1778 movl 8(%esp), KEYP # ctx
122 movl %edx, 480(%rdi) 1779 movl 12(%esp), UKEYP # in_key
1780 movl 16(%esp), %edx # key_len
1781#endif
1782 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1783 movaps %xmm0, (KEYP)
1784 lea 0x10(KEYP), TKEYP # key addr
1785 movl %edx, 480(KEYP)
123 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1786 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
124 cmp $24, %dl 1787 cmp $24, %dl
125 jb .Lenc_key128 1788 jb .Lenc_key128
126 je .Lenc_key192 1789 je .Lenc_key192
127 movups 0x10(%rsi), %xmm2 # other user key 1790 movups 0x10(UKEYP), %xmm2 # other user key
128 movaps %xmm2, (%rcx) 1791 movaps %xmm2, (TKEYP)
129 add $0x10, %rcx 1792 add $0x10, TKEYP
130 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1793 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
131 call _key_expansion_256a 1794 call _key_expansion_256a
132 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1795 AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1818,7 @@ ENTRY(aesni_set_key)
155 call _key_expansion_256a 1818 call _key_expansion_256a
156 jmp .Ldec_key 1819 jmp .Ldec_key
157.Lenc_key192: 1820.Lenc_key192:
158 movq 0x10(%rsi), %xmm2 # other user key 1821 movq 0x10(UKEYP), %xmm2 # other user key
159 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1822 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
160 call _key_expansion_192a 1823 call _key_expansion_192a
161 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1824 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +1858,47 @@ ENTRY(aesni_set_key)
195 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1858 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
196 call _key_expansion_128 1859 call _key_expansion_128
197.Ldec_key: 1860.Ldec_key:
198 sub $0x10, %rcx 1861 sub $0x10, TKEYP
199 movaps (%rdi), %xmm0 1862 movaps (KEYP), %xmm0
200 movaps (%rcx), %xmm1 1863 movaps (TKEYP), %xmm1
201 movaps %xmm0, 240(%rcx) 1864 movaps %xmm0, 240(TKEYP)
202 movaps %xmm1, 240(%rdi) 1865 movaps %xmm1, 240(KEYP)
203 add $0x10, %rdi 1866 add $0x10, KEYP
204 lea 240-16(%rcx), %rsi 1867 lea 240-16(TKEYP), UKEYP
205.align 4 1868.align 4
206.Ldec_key_loop: 1869.Ldec_key_loop:
207 movaps (%rdi), %xmm0 1870 movaps (KEYP), %xmm0
208 AESIMC %xmm0 %xmm1 1871 AESIMC %xmm0 %xmm1
209 movaps %xmm1, (%rsi) 1872 movaps %xmm1, (UKEYP)
210 add $0x10, %rdi 1873 add $0x10, KEYP
211 sub $0x10, %rsi 1874 sub $0x10, UKEYP
212 cmp %rcx, %rdi 1875 cmp TKEYP, KEYP
213 jb .Ldec_key_loop 1876 jb .Ldec_key_loop
214 xor %rax, %rax 1877 xor AREG, AREG
1878#ifndef __x86_64__
1879 popl KEYP
1880#endif
215 ret 1881 ret
216 1882
217/* 1883/*
218 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1884 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
219 */ 1885 */
220ENTRY(aesni_enc) 1886ENTRY(aesni_enc)
1887#ifndef __x86_64__
1888 pushl KEYP
1889 pushl KLEN
1890 movl 12(%esp), KEYP
1891 movl 16(%esp), OUTP
1892 movl 20(%esp), INP
1893#endif
221 movl 480(KEYP), KLEN # key length 1894 movl 480(KEYP), KLEN # key length
222 movups (INP), STATE # input 1895 movups (INP), STATE # input
223 call _aesni_enc1 1896 call _aesni_enc1
224 movups STATE, (OUTP) # output 1897 movups STATE, (OUTP) # output
1898#ifndef __x86_64__
1899 popl KLEN
1900 popl KEYP
1901#endif
225 ret 1902 ret
226 1903
227/* 1904/*
@@ -236,6 +1913,7 @@ ENTRY(aesni_enc)
236 * KEY 1913 * KEY
237 * TKEYP (T1) 1914 * TKEYP (T1)
238 */ 1915 */
1916.align 4
239_aesni_enc1: 1917_aesni_enc1:
240 movaps (KEYP), KEY # key 1918 movaps (KEYP), KEY # key
241 mov KEYP, TKEYP 1919 mov KEYP, TKEYP
@@ -298,6 +1976,7 @@ _aesni_enc1:
298 * KEY 1976 * KEY
299 * TKEYP (T1) 1977 * TKEYP (T1)
300 */ 1978 */
1979.align 4
301_aesni_enc4: 1980_aesni_enc4:
302 movaps (KEYP), KEY # key 1981 movaps (KEYP), KEY # key
303 mov KEYP, TKEYP 1982 mov KEYP, TKEYP
@@ -391,11 +2070,22 @@ _aesni_enc4:
391 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2070 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
392 */ 2071 */
393ENTRY(aesni_dec) 2072ENTRY(aesni_dec)
2073#ifndef __x86_64__
2074 pushl KEYP
2075 pushl KLEN
2076 movl 12(%esp), KEYP
2077 movl 16(%esp), OUTP
2078 movl 20(%esp), INP
2079#endif
394 mov 480(KEYP), KLEN # key length 2080 mov 480(KEYP), KLEN # key length
395 add $240, KEYP 2081 add $240, KEYP
396 movups (INP), STATE # input 2082 movups (INP), STATE # input
397 call _aesni_dec1 2083 call _aesni_dec1
398 movups STATE, (OUTP) #output 2084 movups STATE, (OUTP) #output
2085#ifndef __x86_64__
2086 popl KLEN
2087 popl KEYP
2088#endif
399 ret 2089 ret
400 2090
401/* 2091/*
@@ -410,6 +2100,7 @@ ENTRY(aesni_dec)
410 * KEY 2100 * KEY
411 * TKEYP (T1) 2101 * TKEYP (T1)
412 */ 2102 */
2103.align 4
413_aesni_dec1: 2104_aesni_dec1:
414 movaps (KEYP), KEY # key 2105 movaps (KEYP), KEY # key
415 mov KEYP, TKEYP 2106 mov KEYP, TKEYP
@@ -472,6 +2163,7 @@ _aesni_dec1:
472 * KEY 2163 * KEY
473 * TKEYP (T1) 2164 * TKEYP (T1)
474 */ 2165 */
2166.align 4
475_aesni_dec4: 2167_aesni_dec4:
476 movaps (KEYP), KEY # key 2168 movaps (KEYP), KEY # key
477 mov KEYP, TKEYP 2169 mov KEYP, TKEYP
@@ -566,6 +2258,15 @@ _aesni_dec4:
566 * size_t len) 2258 * size_t len)
567 */ 2259 */
568ENTRY(aesni_ecb_enc) 2260ENTRY(aesni_ecb_enc)
2261#ifndef __x86_64__
2262 pushl LEN
2263 pushl KEYP
2264 pushl KLEN
2265 movl 16(%esp), KEYP
2266 movl 20(%esp), OUTP
2267 movl 24(%esp), INP
2268 movl 28(%esp), LEN
2269#endif
569 test LEN, LEN # check length 2270 test LEN, LEN # check length
570 jz .Lecb_enc_ret 2271 jz .Lecb_enc_ret
571 mov 480(KEYP), KLEN 2272 mov 480(KEYP), KLEN
@@ -602,6 +2303,11 @@ ENTRY(aesni_ecb_enc)
602 cmp $16, LEN 2303 cmp $16, LEN
603 jge .Lecb_enc_loop1 2304 jge .Lecb_enc_loop1
604.Lecb_enc_ret: 2305.Lecb_enc_ret:
2306#ifndef __x86_64__
2307 popl KLEN
2308 popl KEYP
2309 popl LEN
2310#endif
605 ret 2311 ret
606 2312
607/* 2313/*
@@ -609,6 +2315,15 @@ ENTRY(aesni_ecb_enc)
609 * size_t len); 2315 * size_t len);
610 */ 2316 */
611ENTRY(aesni_ecb_dec) 2317ENTRY(aesni_ecb_dec)
2318#ifndef __x86_64__
2319 pushl LEN
2320 pushl KEYP
2321 pushl KLEN
2322 movl 16(%esp), KEYP
2323 movl 20(%esp), OUTP
2324 movl 24(%esp), INP
2325 movl 28(%esp), LEN
2326#endif
612 test LEN, LEN 2327 test LEN, LEN
613 jz .Lecb_dec_ret 2328 jz .Lecb_dec_ret
614 mov 480(KEYP), KLEN 2329 mov 480(KEYP), KLEN
@@ -646,6 +2361,11 @@ ENTRY(aesni_ecb_dec)
646 cmp $16, LEN 2361 cmp $16, LEN
647 jge .Lecb_dec_loop1 2362 jge .Lecb_dec_loop1
648.Lecb_dec_ret: 2363.Lecb_dec_ret:
2364#ifndef __x86_64__
2365 popl KLEN
2366 popl KEYP
2367 popl LEN
2368#endif
649 ret 2369 ret
650 2370
651/* 2371/*
@@ -653,6 +2373,17 @@ ENTRY(aesni_ecb_dec)
653 * size_t len, u8 *iv) 2373 * size_t len, u8 *iv)
654 */ 2374 */
655ENTRY(aesni_cbc_enc) 2375ENTRY(aesni_cbc_enc)
2376#ifndef __x86_64__
2377 pushl IVP
2378 pushl LEN
2379 pushl KEYP
2380 pushl KLEN
2381 movl 20(%esp), KEYP
2382 movl 24(%esp), OUTP
2383 movl 28(%esp), INP
2384 movl 32(%esp), LEN
2385 movl 36(%esp), IVP
2386#endif
656 cmp $16, LEN 2387 cmp $16, LEN
657 jb .Lcbc_enc_ret 2388 jb .Lcbc_enc_ret
658 mov 480(KEYP), KLEN 2389 mov 480(KEYP), KLEN
@@ -670,6 +2401,12 @@ ENTRY(aesni_cbc_enc)
670 jge .Lcbc_enc_loop 2401 jge .Lcbc_enc_loop
671 movups STATE, (IVP) 2402 movups STATE, (IVP)
672.Lcbc_enc_ret: 2403.Lcbc_enc_ret:
2404#ifndef __x86_64__
2405 popl KLEN
2406 popl KEYP
2407 popl LEN
2408 popl IVP
2409#endif
673 ret 2410 ret
674 2411
675/* 2412/*
@@ -677,6 +2414,17 @@ ENTRY(aesni_cbc_enc)
677 * size_t len, u8 *iv) 2414 * size_t len, u8 *iv)
678 */ 2415 */
679ENTRY(aesni_cbc_dec) 2416ENTRY(aesni_cbc_dec)
2417#ifndef __x86_64__
2418 pushl IVP
2419 pushl LEN
2420 pushl KEYP
2421 pushl KLEN
2422 movl 20(%esp), KEYP
2423 movl 24(%esp), OUTP
2424 movl 28(%esp), INP
2425 movl 32(%esp), LEN
2426 movl 36(%esp), IVP
2427#endif
680 cmp $16, LEN 2428 cmp $16, LEN
681 jb .Lcbc_dec_just_ret 2429 jb .Lcbc_dec_just_ret
682 mov 480(KEYP), KLEN 2430 mov 480(KEYP), KLEN
@@ -690,16 +2438,30 @@ ENTRY(aesni_cbc_dec)
690 movaps IN1, STATE1 2438 movaps IN1, STATE1
691 movups 0x10(INP), IN2 2439 movups 0x10(INP), IN2
692 movaps IN2, STATE2 2440 movaps IN2, STATE2
2441#ifdef __x86_64__
693 movups 0x20(INP), IN3 2442 movups 0x20(INP), IN3
694 movaps IN3, STATE3 2443 movaps IN3, STATE3
695 movups 0x30(INP), IN4 2444 movups 0x30(INP), IN4
696 movaps IN4, STATE4 2445 movaps IN4, STATE4
2446#else
2447 movups 0x20(INP), IN1
2448 movaps IN1, STATE3
2449 movups 0x30(INP), IN2
2450 movaps IN2, STATE4
2451#endif
697 call _aesni_dec4 2452 call _aesni_dec4
698 pxor IV, STATE1 2453 pxor IV, STATE1
2454#ifdef __x86_64__
699 pxor IN1, STATE2 2455 pxor IN1, STATE2
700 pxor IN2, STATE3 2456 pxor IN2, STATE3
701 pxor IN3, STATE4 2457 pxor IN3, STATE4
702 movaps IN4, IV 2458 movaps IN4, IV
2459#else
2460 pxor (INP), STATE2
2461 pxor 0x10(INP), STATE3
2462 pxor IN1, STATE4
2463 movaps IN2, IV
2464#endif
703 movups STATE1, (OUTP) 2465 movups STATE1, (OUTP)
704 movups STATE2, 0x10(OUTP) 2466 movups STATE2, 0x10(OUTP)
705 movups STATE3, 0x20(OUTP) 2467 movups STATE3, 0x20(OUTP)
@@ -727,8 +2489,15 @@ ENTRY(aesni_cbc_dec)
727.Lcbc_dec_ret: 2489.Lcbc_dec_ret:
728 movups IV, (IVP) 2490 movups IV, (IVP)
729.Lcbc_dec_just_ret: 2491.Lcbc_dec_just_ret:
2492#ifndef __x86_64__
2493 popl KLEN
2494 popl KEYP
2495 popl LEN
2496 popl IVP
2497#endif
730 ret 2498 ret
731 2499
2500#ifdef __x86_64__
732.align 16 2501.align 16
733.Lbswap_mask: 2502.Lbswap_mask:
734 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2503 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2513,7 @@ ENTRY(aesni_cbc_dec)
744 * INC: == 1, in little endian 2513 * INC: == 1, in little endian
745 * BSWAP_MASK == endian swapping mask 2514 * BSWAP_MASK == endian swapping mask
746 */ 2515 */
2516.align 4
747_aesni_inc_init: 2517_aesni_inc_init:
748 movaps .Lbswap_mask, BSWAP_MASK 2518 movaps .Lbswap_mask, BSWAP_MASK
749 movaps IV, CTR 2519 movaps IV, CTR
@@ -768,6 +2538,7 @@ _aesni_inc_init:
768 * CTR: == output IV, in little endian 2538 * CTR: == output IV, in little endian
769 * TCTR_LOW: == lower qword of CTR 2539 * TCTR_LOW: == lower qword of CTR
770 */ 2540 */
2541.align 4
771_aesni_inc: 2542_aesni_inc:
772 paddq INC, CTR 2543 paddq INC, CTR
773 add $1, TCTR_LOW 2544 add $1, TCTR_LOW
@@ -839,3 +2610,4 @@ ENTRY(aesni_ctr_enc)
839 movups IV, (IVP) 2610 movups IV, (IVP)
840.Lctr_enc_just_ret: 2611.Lctr_enc_just_ret:
841 ret 2612 ret
2613#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..e1e60c7d5813 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
5 * Copyright (C) 2008, Intel Corp. 5 * Copyright (C) 2008, Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com> 6 * Author: Huang Ying <ying.huang@intel.com>
7 * 7 *
8 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
9 * interface for 64-bit kernels.
10 * Authors: Adrian Hoban <adrian.hoban@intel.com>
11 * Gabriele Paoloni <gabriele.paoloni@intel.com>
12 * Tadeusz Struk (tadeusz.struk@intel.com)
13 * Aidan O'Mahony (aidan.o.mahony@intel.com)
14 * Copyright (c) 2010, Intel Corporation.
15 *
8 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
21#include <crypto/ctr.h> 29#include <crypto/ctr.h>
22#include <asm/i387.h> 30#include <asm/i387.h>
23#include <asm/aes.h> 31#include <asm/aes.h>
32#include <crypto/scatterwalk.h>
33#include <crypto/internal/aead.h>
34#include <linux/workqueue.h>
35#include <linux/spinlock.h>
24 36
25#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) 37#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
26#define HAS_CTR 38#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
42 struct cryptd_ablkcipher *cryptd_tfm; 54 struct cryptd_ablkcipher *cryptd_tfm;
43}; 55};
44 56
45#define AESNI_ALIGN 16 57/* This data is stored at the end of the crypto_tfm struct.
58 * It's a type of per "session" data storage location.
59 * This needs to be 16 byte aligned.
60 */
61struct aesni_rfc4106_gcm_ctx {
62 u8 hash_subkey[16];
63 struct crypto_aes_ctx aes_key_expanded;
64 u8 nonce[4];
65 struct cryptd_aead *cryptd_tfm;
66};
67
68struct aesni_gcm_set_hash_subkey_result {
69 int err;
70 struct completion completion;
71};
72
73struct aesni_hash_subkey_req_data {
74 u8 iv[16];
75 struct aesni_gcm_set_hash_subkey_result result;
76 struct scatterlist sg;
77};
78
79#define AESNI_ALIGN (16)
46#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) 80#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
81#define RFC4106_HASH_SUBKEY_SIZE 16
47 82
48asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 83asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
49 unsigned int key_len); 84 unsigned int key_len);
@@ -59,9 +94,62 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
59 const u8 *in, unsigned int len, u8 *iv); 94 const u8 *in, unsigned int len, u8 *iv);
60asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
61 const u8 *in, unsigned int len, u8 *iv); 96 const u8 *in, unsigned int len, u8 *iv);
97#ifdef CONFIG_X86_64
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 98asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv); 99 const u8 *in, unsigned int len, u8 *iv);
64 100
101/* asmlinkage void aesni_gcm_enc()
102 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
103 * u8 *out, Ciphertext output. Encrypt in-place is allowed.
104 * const u8 *in, Plaintext input
105 * unsigned long plaintext_len, Length of data in bytes for encryption.
106 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
107 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
108 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
109 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
110 * const u8 *aad, Additional Authentication Data (AAD)
111 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
112 * is going to be 8 or 12 bytes
113 * u8 *auth_tag, Authenticated Tag output.
114 * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
115 * Valid values are 16 (most likely), 12 or 8.
116 */
117asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
118 const u8 *in, unsigned long plaintext_len, u8 *iv,
119 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
120 u8 *auth_tag, unsigned long auth_tag_len);
121
122/* asmlinkage void aesni_gcm_dec()
123 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
124 * u8 *out, Plaintext output. Decrypt in-place is allowed.
125 * const u8 *in, Ciphertext input
126 * unsigned long ciphertext_len, Length of data in bytes for decryption.
127 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
128 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
129 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
130 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
131 * const u8 *aad, Additional Authentication Data (AAD)
132 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
133 * to be 8 or 12 bytes
134 * u8 *auth_tag, Authenticated Tag output.
135 * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
136 * Valid values are 16 (most likely), 12 or 8.
137 */
138asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
139 const u8 *in, unsigned long ciphertext_len, u8 *iv,
140 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
141 u8 *auth_tag, unsigned long auth_tag_len);
142
143static inline struct
144aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
145{
146 return
147 (struct aesni_rfc4106_gcm_ctx *)
148 PTR_ALIGN((u8 *)
149 crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
150}
151#endif
152
65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 153static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
66{ 154{
67 unsigned long addr = (unsigned long)raw_ctx; 155 unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = {
324 }, 412 },
325}; 413};
326 414
415#ifdef CONFIG_X86_64
327static void ctr_crypt_final(struct crypto_aes_ctx *ctx, 416static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
328 struct blkcipher_walk *walk) 417 struct blkcipher_walk *walk)
329{ 418{
@@ -389,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = {
389 }, 478 },
390 }, 479 },
391}; 480};
481#endif
392 482
393static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, 483static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
394 unsigned int key_len) 484 unsigned int key_len)
@@ -536,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = {
536 }, 626 },
537}; 627};
538 628
629#ifdef CONFIG_X86_64
539static int ablk_ctr_init(struct crypto_tfm *tfm) 630static int ablk_ctr_init(struct crypto_tfm *tfm)
540{ 631{
541 struct cryptd_ablkcipher *cryptd_tfm; 632 struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
612 }, 703 },
613}; 704};
614#endif 705#endif
706#endif
615 707
616#ifdef HAS_LRW 708#ifdef HAS_LRW
617static int ablk_lrw_init(struct crypto_tfm *tfm) 709static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +822,424 @@ static struct crypto_alg ablk_xts_alg = {
730}; 822};
731#endif 823#endif
732 824
825#ifdef CONFIG_X86_64
826static int rfc4106_init(struct crypto_tfm *tfm)
827{
828 struct cryptd_aead *cryptd_tfm;
829 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
830 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
831 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
832 if (IS_ERR(cryptd_tfm))
833 return PTR_ERR(cryptd_tfm);
834 ctx->cryptd_tfm = cryptd_tfm;
835 tfm->crt_aead.reqsize = sizeof(struct aead_request)
836 + crypto_aead_reqsize(&cryptd_tfm->base);
837 return 0;
838}
839
840static void rfc4106_exit(struct crypto_tfm *tfm)
841{
842 struct aesni_rfc4106_gcm_ctx *ctx =
843 (struct aesni_rfc4106_gcm_ctx *)
844 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
845 if (!IS_ERR(ctx->cryptd_tfm))
846 cryptd_free_aead(ctx->cryptd_tfm);
847 return;
848}
849
850static void
851rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
852{
853 struct aesni_gcm_set_hash_subkey_result *result = req->data;
854
855 if (err == -EINPROGRESS)
856 return;
857 result->err = err;
858 complete(&result->completion);
859}
860
861static int
862rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
863{
864 struct crypto_ablkcipher *ctr_tfm;
865 struct ablkcipher_request *req;
866 int ret = -EINVAL;
867 struct aesni_hash_subkey_req_data *req_data;
868
869 ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
870 if (IS_ERR(ctr_tfm))
871 return PTR_ERR(ctr_tfm);
872
873 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
874
875 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
876 if (ret) {
877 crypto_free_ablkcipher(ctr_tfm);
878 return ret;
879 }
880
881 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
882 if (!req) {
883 crypto_free_ablkcipher(ctr_tfm);
884 return -EINVAL;
885 }
886
887 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
888 if (!req_data) {
889 crypto_free_ablkcipher(ctr_tfm);
890 return -ENOMEM;
891 }
892 memset(req_data->iv, 0, sizeof(req_data->iv));
893
894 /* Clear the data in the hash sub key container to zero.*/
895 /* We want to cipher all zeros to create the hash sub key. */
896 memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
897
898 init_completion(&req_data->result.completion);
899 sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
900 ablkcipher_request_set_tfm(req, ctr_tfm);
901 ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
902 CRYPTO_TFM_REQ_MAY_BACKLOG,
903 rfc4106_set_hash_subkey_done,
904 &req_data->result);
905
906 ablkcipher_request_set_crypt(req, &req_data->sg,
907 &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
908
909 ret = crypto_ablkcipher_encrypt(req);
910 if (ret == -EINPROGRESS || ret == -EBUSY) {
911 ret = wait_for_completion_interruptible
912 (&req_data->result.completion);
913 if (!ret)
914 ret = req_data->result.err;
915 }
916 ablkcipher_request_free(req);
917 kfree(req_data);
918 crypto_free_ablkcipher(ctr_tfm);
919 return ret;
920}
921
922static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
923 unsigned int key_len)
924{
925 int ret = 0;
926 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
927 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
928 u8 *new_key_mem = NULL;
929
930 if (key_len < 4) {
931 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
932 return -EINVAL;
933 }
934 /*Account for 4 byte nonce at the end.*/
935 key_len -= 4;
936 if (key_len != AES_KEYSIZE_128) {
937 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
938 return -EINVAL;
939 }
940
941 memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
942 /*This must be on a 16 byte boundary!*/
943 if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
944 return -EINVAL;
945
946 if ((unsigned long)key % AESNI_ALIGN) {
 947	/*key is not aligned: use an auxiliary aligned pointer*/
948 new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
949 if (!new_key_mem)
950 return -ENOMEM;
951
952 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
953 memcpy(new_key_mem, key, key_len);
954 key = new_key_mem;
955 }
956
957 if (!irq_fpu_usable())
958 ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
959 key, key_len);
960 else {
961 kernel_fpu_begin();
962 ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
963 kernel_fpu_end();
964 }
965 /*This must be on a 16 byte boundary!*/
966 if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
967 ret = -EINVAL;
968 goto exit;
969 }
970 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
971exit:
972 kfree(new_key_mem);
973 return ret;
974}
975
 976/* This is the Integrity Check Value (aka the authentication tag) length and can
 977 * be 8, 12 or 16 bytes long. */
978static int rfc4106_set_authsize(struct crypto_aead *parent,
979 unsigned int authsize)
980{
981 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
982 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
983
984 switch (authsize) {
985 case 8:
986 case 12:
987 case 16:
988 break;
989 default:
990 return -EINVAL;
991 }
992 crypto_aead_crt(parent)->authsize = authsize;
993 crypto_aead_crt(cryptd_child)->authsize = authsize;
994 return 0;
995}
996
997static int rfc4106_encrypt(struct aead_request *req)
998{
999 int ret;
1000 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1001 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1002 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1003
1004 if (!irq_fpu_usable()) {
1005 struct aead_request *cryptd_req =
1006 (struct aead_request *) aead_request_ctx(req);
1007 memcpy(cryptd_req, req, sizeof(*req));
1008 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1009 return crypto_aead_encrypt(cryptd_req);
1010 } else {
1011 kernel_fpu_begin();
1012 ret = cryptd_child->base.crt_aead.encrypt(req);
1013 kernel_fpu_end();
1014 return ret;
1015 }
1016}
1017
1018static int rfc4106_decrypt(struct aead_request *req)
1019{
1020 int ret;
1021 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1022 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1023 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1024
1025 if (!irq_fpu_usable()) {
1026 struct aead_request *cryptd_req =
1027 (struct aead_request *) aead_request_ctx(req);
1028 memcpy(cryptd_req, req, sizeof(*req));
1029 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1030 return crypto_aead_decrypt(cryptd_req);
1031 } else {
1032 kernel_fpu_begin();
1033 ret = cryptd_child->base.crt_aead.decrypt(req);
1034 kernel_fpu_end();
1035 return ret;
1036 }
1037}
1038
1039static struct crypto_alg rfc4106_alg = {
1040 .cra_name = "rfc4106(gcm(aes))",
1041 .cra_driver_name = "rfc4106-gcm-aesni",
1042 .cra_priority = 400,
1043 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1044 .cra_blocksize = 1,
1045 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1046 .cra_alignmask = 0,
1047 .cra_type = &crypto_nivaead_type,
1048 .cra_module = THIS_MODULE,
1049 .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
1050 .cra_init = rfc4106_init,
1051 .cra_exit = rfc4106_exit,
1052 .cra_u = {
1053 .aead = {
1054 .setkey = rfc4106_set_key,
1055 .setauthsize = rfc4106_set_authsize,
1056 .encrypt = rfc4106_encrypt,
1057 .decrypt = rfc4106_decrypt,
1058 .geniv = "seqiv",
1059 .ivsize = 8,
1060 .maxauthsize = 16,
1061 },
1062 },
1063};
1064
1065static int __driver_rfc4106_encrypt(struct aead_request *req)
1066{
1067 u8 one_entry_in_sg = 0;
1068 u8 *src, *dst, *assoc;
1069 __be32 counter = cpu_to_be32(1);
1070 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1071 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1072 void *aes_ctx = &(ctx->aes_key_expanded);
1073 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1074 u8 iv_tab[16+AESNI_ALIGN];
1075 u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
1076 struct scatter_walk src_sg_walk;
1077 struct scatter_walk assoc_sg_walk;
1078 struct scatter_walk dst_sg_walk;
1079 unsigned int i;
1080
1081	/* Assuming we are supporting rfc4106 64-bit extended */
1082	/* sequence numbers. We need to have the AAD length equal */
1083	/* to 8 or 12 bytes. */
1084 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
1085 return -EINVAL;
1086	/* Build the IV (pre-counter block j0) below */
1087 for (i = 0; i < 4; i++)
1088 *(iv+i) = ctx->nonce[i];
1089 for (i = 0; i < 8; i++)
1090 *(iv+4+i) = req->iv[i];
1091 *((__be32 *)(iv+12)) = counter;
1092
1093 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1094 one_entry_in_sg = 1;
1095 scatterwalk_start(&src_sg_walk, req->src);
1096 scatterwalk_start(&assoc_sg_walk, req->assoc);
1097 src = scatterwalk_map(&src_sg_walk, 0);
1098 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1099 dst = src;
1100 if (unlikely(req->src != req->dst)) {
1101 scatterwalk_start(&dst_sg_walk, req->dst);
1102 dst = scatterwalk_map(&dst_sg_walk, 0);
1103 }
1104
1105 } else {
1106 /* Allocate memory for src, dst, assoc */
1107 src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
1108 GFP_ATOMIC);
1109 if (unlikely(!src))
1110 return -ENOMEM;
1111 assoc = (src + req->cryptlen + auth_tag_len);
1112 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1113 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1114 req->assoclen, 0);
1115 dst = src;
1116 }
1117
1118 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
1119 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
1120 + ((unsigned long)req->cryptlen), auth_tag_len);
1121
1122 /* The authTag (aka the Integrity Check Value) needs to be written
1123 * back to the packet. */
1124 if (one_entry_in_sg) {
1125 if (unlikely(req->src != req->dst)) {
1126 scatterwalk_unmap(dst, 0);
1127 scatterwalk_done(&dst_sg_walk, 0, 0);
1128 }
1129 scatterwalk_unmap(src, 0);
1130 scatterwalk_unmap(assoc, 0);
1131 scatterwalk_done(&src_sg_walk, 0, 0);
1132 scatterwalk_done(&assoc_sg_walk, 0, 0);
1133 } else {
1134 scatterwalk_map_and_copy(dst, req->dst, 0,
1135 req->cryptlen + auth_tag_len, 1);
1136 kfree(src);
1137 }
1138 return 0;
1139}
1140
1141static int __driver_rfc4106_decrypt(struct aead_request *req)
1142{
1143 u8 one_entry_in_sg = 0;
1144 u8 *src, *dst, *assoc;
1145 unsigned long tempCipherLen = 0;
1146 __be32 counter = cpu_to_be32(1);
1147 int retval = 0;
1148 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1149 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1150 void *aes_ctx = &(ctx->aes_key_expanded);
1151 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1152 u8 iv_and_authTag[32+AESNI_ALIGN];
1153 u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
1154 u8 *authTag = iv + 16;
1155 struct scatter_walk src_sg_walk;
1156 struct scatter_walk assoc_sg_walk;
1157 struct scatter_walk dst_sg_walk;
1158 unsigned int i;
1159
1160 if (unlikely((req->cryptlen < auth_tag_len) ||
1161 (req->assoclen != 8 && req->assoclen != 12)))
1162 return -EINVAL;
1163	/* Assuming we are supporting rfc4106 64-bit extended */
1164	/* sequence numbers. We need to have the AAD length */
1165	/* equal to 8 or 12 bytes. */
1166
1167 tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
1168	/* Build the IV (pre-counter block j0) below */
1169 for (i = 0; i < 4; i++)
1170 *(iv+i) = ctx->nonce[i];
1171 for (i = 0; i < 8; i++)
1172 *(iv+4+i) = req->iv[i];
1173 *((__be32 *)(iv+12)) = counter;
1174
1175 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1176 one_entry_in_sg = 1;
1177 scatterwalk_start(&src_sg_walk, req->src);
1178 scatterwalk_start(&assoc_sg_walk, req->assoc);
1179 src = scatterwalk_map(&src_sg_walk, 0);
1180 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1181 dst = src;
1182 if (unlikely(req->src != req->dst)) {
1183 scatterwalk_start(&dst_sg_walk, req->dst);
1184 dst = scatterwalk_map(&dst_sg_walk, 0);
1185 }
1186
1187 } else {
1188 /* Allocate memory for src, dst, assoc */
1189 src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
1190 if (!src)
1191 return -ENOMEM;
1192		assoc = (src + req->cryptlen);
1193 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1194 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1195 req->assoclen, 0);
1196 dst = src;
1197 }
1198
1199 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
1200 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1201 authTag, auth_tag_len);
1202
1203 /* Compare generated tag with passed in tag. */
1204 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
1205 -EBADMSG : 0;
1206
1207 if (one_entry_in_sg) {
1208 if (unlikely(req->src != req->dst)) {
1209 scatterwalk_unmap(dst, 0);
1210 scatterwalk_done(&dst_sg_walk, 0, 0);
1211 }
1212 scatterwalk_unmap(src, 0);
1213 scatterwalk_unmap(assoc, 0);
1214 scatterwalk_done(&src_sg_walk, 0, 0);
1215 scatterwalk_done(&assoc_sg_walk, 0, 0);
1216 } else {
1217 scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
1218 kfree(src);
1219 }
1220 return retval;
1221}
1222
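One caveat about the decrypt path just above: memcmp() is not a fixed-time comparison, so its running time can leak how many leading tag bytes matched. A hedged sketch of a constant-time check that could be used instead (illustrative only, not part of this patch):

/* Illustrative sketch: compare two tags in time independent of contents. */
static int tags_equal(const u8 *a, const u8 *b, unsigned int len)
{
	u8 diff = 0;
	unsigned int i;

	for (i = 0; i < len; i++)
		diff |= a[i] ^ b[i];
	return diff == 0;
}
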
1223static struct crypto_alg __rfc4106_alg = {
1224 .cra_name = "__gcm-aes-aesni",
1225 .cra_driver_name = "__driver-gcm-aes-aesni",
1226 .cra_priority = 0,
1227 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
1228 .cra_blocksize = 1,
1229 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1230 .cra_alignmask = 0,
1231 .cra_type = &crypto_aead_type,
1232 .cra_module = THIS_MODULE,
1233 .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
1234 .cra_u = {
1235 .aead = {
1236 .encrypt = __driver_rfc4106_encrypt,
1237 .decrypt = __driver_rfc4106_decrypt,
1238 },
1239 },
1240};
1241#endif
1242
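For context, users are expected to reach this code through the outer rfc4106 wrapper registered below (conventionally named "rfc4106(gcm(aes))") rather than the bare "__gcm-aes-aesni" algorithm. A rough sketch of the setup side, assuming kernel context; the helper name is illustrative, and the key passed to setkey is the AES key with the 4-byte salt appended (the salt becomes ctx->nonce):

#include <linux/crypto.h>
#include <linux/err.h>

static struct crypto_aead *rfc4106_setup_sketch(const u8 *key,
						unsigned int aes_keylen)
{
	struct crypto_aead *tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);

	if (IS_ERR(tfm))
		return tfm;
	/* Key layout: AES key || 4-byte salt. */
	if (crypto_aead_setkey(tfm, key, aes_keylen + 4) ||
	    crypto_aead_setauthsize(tfm, 16)) {
		crypto_free_aead(tfm);
		return ERR_PTR(-EINVAL);
	}
	return tfm;
}
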
733static int __init aesni_init(void) 1243static int __init aesni_init(void)
734{ 1244{
735 int err; 1245 int err;
@@ -738,6 +1248,7 @@ static int __init aesni_init(void)
738 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); 1248 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
739 return -ENODEV; 1249 return -ENODEV;
740 } 1250 }
1251
741 if ((err = crypto_register_alg(&aesni_alg))) 1252 if ((err = crypto_register_alg(&aesni_alg)))
742 goto aes_err; 1253 goto aes_err;
743 if ((err = crypto_register_alg(&__aesni_alg))) 1254 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1257,24 @@ static int __init aesni_init(void)
746 goto blk_ecb_err; 1257 goto blk_ecb_err;
747 if ((err = crypto_register_alg(&blk_cbc_alg))) 1258 if ((err = crypto_register_alg(&blk_cbc_alg)))
748 goto blk_cbc_err; 1259 goto blk_cbc_err;
749 if ((err = crypto_register_alg(&blk_ctr_alg)))
750 goto blk_ctr_err;
751 if ((err = crypto_register_alg(&ablk_ecb_alg))) 1260 if ((err = crypto_register_alg(&ablk_ecb_alg)))
752 goto ablk_ecb_err; 1261 goto ablk_ecb_err;
753 if ((err = crypto_register_alg(&ablk_cbc_alg))) 1262 if ((err = crypto_register_alg(&ablk_cbc_alg)))
754 goto ablk_cbc_err; 1263 goto ablk_cbc_err;
1264#ifdef CONFIG_X86_64
1265 if ((err = crypto_register_alg(&blk_ctr_alg)))
1266 goto blk_ctr_err;
755 if ((err = crypto_register_alg(&ablk_ctr_alg))) 1267 if ((err = crypto_register_alg(&ablk_ctr_alg)))
756 goto ablk_ctr_err; 1268 goto ablk_ctr_err;
1269 if ((err = crypto_register_alg(&__rfc4106_alg)))
1270 goto __aead_gcm_err;
1271 if ((err = crypto_register_alg(&rfc4106_alg)))
1272 goto aead_gcm_err;
757#ifdef HAS_CTR 1273#ifdef HAS_CTR
758 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) 1274 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
759 goto ablk_rfc3686_ctr_err; 1275 goto ablk_rfc3686_ctr_err;
760#endif 1276#endif
1277#endif
761#ifdef HAS_LRW 1278#ifdef HAS_LRW
762 if ((err = crypto_register_alg(&ablk_lrw_alg))) 1279 if ((err = crypto_register_alg(&ablk_lrw_alg)))
763 goto ablk_lrw_err; 1280 goto ablk_lrw_err;
@@ -770,7 +1287,6 @@ static int __init aesni_init(void)
770 if ((err = crypto_register_alg(&ablk_xts_alg))) 1287 if ((err = crypto_register_alg(&ablk_xts_alg)))
771 goto ablk_xts_err; 1288 goto ablk_xts_err;
772#endif 1289#endif
773
774 return err; 1290 return err;
775 1291
776#ifdef HAS_XTS 1292#ifdef HAS_XTS
@@ -784,18 +1300,24 @@ ablk_pcbc_err:
784 crypto_unregister_alg(&ablk_lrw_alg); 1300 crypto_unregister_alg(&ablk_lrw_alg);
785ablk_lrw_err: 1301ablk_lrw_err:
786#endif 1302#endif
1303#ifdef CONFIG_X86_64
787#ifdef HAS_CTR 1304#ifdef HAS_CTR
788 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1305 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
789ablk_rfc3686_ctr_err: 1306ablk_rfc3686_ctr_err:
790#endif 1307#endif
1308 crypto_unregister_alg(&rfc4106_alg);
1309aead_gcm_err:
1310 crypto_unregister_alg(&__rfc4106_alg);
1311__aead_gcm_err:
791 crypto_unregister_alg(&ablk_ctr_alg); 1312 crypto_unregister_alg(&ablk_ctr_alg);
792ablk_ctr_err: 1313ablk_ctr_err:
1314 crypto_unregister_alg(&blk_ctr_alg);
1315blk_ctr_err:
1316#endif
793 crypto_unregister_alg(&ablk_cbc_alg); 1317 crypto_unregister_alg(&ablk_cbc_alg);
794ablk_cbc_err: 1318ablk_cbc_err:
795 crypto_unregister_alg(&ablk_ecb_alg); 1319 crypto_unregister_alg(&ablk_ecb_alg);
796ablk_ecb_err: 1320ablk_ecb_err:
797 crypto_unregister_alg(&blk_ctr_alg);
798blk_ctr_err:
799 crypto_unregister_alg(&blk_cbc_alg); 1321 crypto_unregister_alg(&blk_cbc_alg);
800blk_cbc_err: 1322blk_cbc_err:
801 crypto_unregister_alg(&blk_ecb_alg); 1323 crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +1340,17 @@ static void __exit aesni_exit(void)
818#ifdef HAS_LRW 1340#ifdef HAS_LRW
819 crypto_unregister_alg(&ablk_lrw_alg); 1341 crypto_unregister_alg(&ablk_lrw_alg);
820#endif 1342#endif
1343#ifdef CONFIG_X86_64
821#ifdef HAS_CTR 1344#ifdef HAS_CTR
822 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1345 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
823#endif 1346#endif
1347 crypto_unregister_alg(&rfc4106_alg);
1348 crypto_unregister_alg(&__rfc4106_alg);
824 crypto_unregister_alg(&ablk_ctr_alg); 1349 crypto_unregister_alg(&ablk_ctr_alg);
1350 crypto_unregister_alg(&blk_ctr_alg);
1351#endif
825 crypto_unregister_alg(&ablk_cbc_alg); 1352 crypto_unregister_alg(&ablk_cbc_alg);
826 crypto_unregister_alg(&ablk_ecb_alg); 1353 crypto_unregister_alg(&ablk_ecb_alg);
827 crypto_unregister_alg(&blk_ctr_alg);
828 crypto_unregister_alg(&blk_cbc_alg); 1354 crypto_unregister_alg(&blk_cbc_alg);
829 crypto_unregister_alg(&blk_ecb_alg); 1355 crypto_unregister_alg(&blk_ecb_alg);
830 crypto_unregister_alg(&__aesni_alg); 1356 crypto_unregister_alg(&__aesni_alg);
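The label shuffling above follows the usual goto-based unwind idiom: each failed crypto_register_alg() jumps to a label that unregisters, in reverse order, everything registered so far, which is why moving the CTR and GCM registrations under CONFIG_X86_64 also moves their unwind labels. The idiom in isolation, as a generic sketch with illustrative algorithm names:

#include <linux/crypto.h>

static struct crypto_alg alg_a, alg_b, alg_c;	/* placeholders */

static int register_three_sketch(void)
{
	int err;

	if ((err = crypto_register_alg(&alg_a)))
		goto a_err;
	if ((err = crypto_register_alg(&alg_b)))
		goto b_err;
	if ((err = crypto_register_alg(&alg_c)))
		goto c_err;
	return 0;

c_err:
	crypto_unregister_alg(&alg_b);
b_err:
	crypto_unregister_alg(&alg_a);
a_err:
	return err;
}
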
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index cbcc8d8ea93a..7a6e68e4f748 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -10,6 +10,7 @@
10 * by the Free Software Foundation. 10 * by the Free Software Foundation.
11 */ 11 */
12 12
13#include <linux/err.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/kernel.h> 16#include <linux/kernel.h>
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 849813f398e7..5852519b2d0f 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -28,7 +28,6 @@
28#include <linux/syscalls.h> 28#include <linux/syscalls.h>
29#include <linux/times.h> 29#include <linux/times.h>
30#include <linux/utsname.h> 30#include <linux/utsname.h>
31#include <linux/smp_lock.h>
32#include <linux/mm.h> 31#include <linux/mm.h>
33#include <linux/uio.h> 32#include <linux/uio.h>
34#include <linux/poll.h> 33#include <linux/poll.h>
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 55d106b5e31b..211ca3f7fd16 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,17 +185,16 @@ struct bootnode;
185 185
186#ifdef CONFIG_ACPI_NUMA 186#ifdef CONFIG_ACPI_NUMA
187extern int acpi_numa; 187extern int acpi_numa;
188extern int acpi_get_nodes(struct bootnode *physnodes); 188extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
189 unsigned long end);
189extern int acpi_scan_nodes(unsigned long start, unsigned long end); 190extern int acpi_scan_nodes(unsigned long start, unsigned long end);
190#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 191#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
192
193#ifdef CONFIG_NUMA_EMU
191extern void acpi_fake_nodes(const struct bootnode *fake_nodes, 194extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
192 int num_nodes); 195 int num_nodes);
193#else
194static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
195 int num_nodes)
196{
197}
198#endif 196#endif
197#endif /* CONFIG_ACPI_NUMA */
199 198
200#define acpi_unlazy_tlb(x) leave_mm(x) 199#define acpi_unlazy_tlb(x) leave_mm(x)
201 200
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 76561d20ea2f..13009d1af99a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
66extern void alternatives_smp_module_del(struct module *mod); 66extern void alternatives_smp_module_del(struct module *mod);
67extern void alternatives_smp_switch(int smp); 67extern void alternatives_smp_switch(int smp);
68extern int alternatives_text_reserved(void *start, void *end); 68extern int alternatives_text_reserved(void *start, void *end);
69extern bool skip_smp_alternatives;
69#else 70#else
70static inline void alternatives_smp_module_add(struct module *mod, char *name, 71static inline void alternatives_smp_module_add(struct module *mod, char *name,
71 void *locks, void *locks_end, 72 void *locks, void *locks_end,
@@ -180,8 +181,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
180 * On the local CPU you need to be protected against NMI or MCE handlers seeing an 181 * On the local CPU you need to be protected against NMI or MCE handlers seeing an
181 * inconsistent instruction while you patch. 182 * inconsistent instruction while you patch.
182 */ 183 */
184struct text_poke_param {
185 void *addr;
186 const void *opcode;
187 size_t len;
188};
189
183extern void *text_poke(void *addr, const void *opcode, size_t len); 190extern void *text_poke(void *addr, const void *opcode, size_t len);
184extern void *text_poke_smp(void *addr, const void *opcode, size_t len); 191extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
192extern void text_poke_smp_batch(struct text_poke_param *params, int n);
185 193
186#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) 194#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
187#define IDEAL_NOP_SIZE_5 5 195#define IDEAL_NOP_SIZE_5 5
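text_poke_smp_batch() takes an array of text_poke_param so that several instruction sites can be patched within a single synchronized rendezvous instead of paying the cost of one text_poke_smp() call per site. A hedged usage sketch, assuming kernel context and illustrative addresses:

#include <asm/alternative.h>

static void patch_two_sites_sketch(void *addr1, void *addr2,
				   const void *opcode, size_t len)
{
	struct text_poke_param params[2] = {
		{ .addr = addr1, .opcode = opcode, .len = len },
		{ .addr = addr2, .opcode = opcode, .len = len },
	};

	text_poke_smp_batch(params, 2);	/* both sites in one pass */
}
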
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index c8517f81b21e..64dc82ee19f0 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,36 +3,64 @@
3 3
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
6extern struct pci_device_id k8_nb_ids[]; 6struct amd_nb_bus_dev_range {
7 u8 bus;
8 u8 dev_base;
9 u8 dev_limit;
10};
11
12extern struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
7struct bootnode; 14struct bootnode;
8 15
9extern int early_is_k8_nb(u32 value); 16extern int early_is_amd_nb(u32 value);
10extern int cache_k8_northbridges(void); 17extern int amd_cache_northbridges(void);
11extern void k8_flush_garts(void); 18extern void amd_flush_garts(void);
12extern int k8_get_nodes(struct bootnode *nodes); 19extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
13extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); 20extern int amd_scan_nodes(void);
14extern int k8_scan_nodes(void); 21
22#ifdef CONFIG_NUMA_EMU
23extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
24extern void amd_get_nodes(struct bootnode *nodes);
25#endif
15 26
16struct k8_northbridge_info { 27struct amd_northbridge {
28 struct pci_dev *misc;
29};
30
31struct amd_northbridge_info {
17 u16 num; 32 u16 num;
18 u8 gart_supported; 33 u64 flags;
19 struct pci_dev **nb_misc; 34 struct amd_northbridge *nb;
20}; 35};
21extern struct k8_northbridge_info k8_northbridges; 36extern struct amd_northbridge_info amd_northbridges;
37
38#define AMD_NB_GART 0x1
39#define AMD_NB_L3_INDEX_DISABLE 0x2
22 40
23#ifdef CONFIG_AMD_NB 41#ifdef CONFIG_AMD_NB
24 42
25static inline struct pci_dev *node_to_k8_nb_misc(int node) 43static inline int amd_nb_num(void)
26{ 44{
27 return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; 45 return amd_northbridges.num;
28} 46}
29 47
30#else 48static inline int amd_nb_has_feature(int feature)
49{
50 return ((amd_northbridges.flags & feature) == feature);
51}
31 52
32static inline struct pci_dev *node_to_k8_nb_misc(int node) 53static inline struct amd_northbridge *node_to_amd_nb(int node)
33{ 54{
34 return NULL; 55 return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
35} 56}
57
58#else
59
60#define amd_nb_num(x) 0
61#define amd_nb_has_feature(x) false
62#define node_to_amd_nb(x) NULL
63
36#endif 64#endif
37 65
38 66
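With the k8_* to amd_* rename, callers no longer index nb_misc[] directly; they go through amd_nb_num(), node_to_amd_nb() and amd_nb_has_feature(). A hedged sketch of the intended usage (the register offset and the feature/flag pairing are illustrative):

#include <linux/pci.h>
#include <asm/amd_nb.h>

static u32 read_nb_misc_sketch(int node, int reg)
{
	struct amd_northbridge *nb = node_to_amd_nb(node);
	u32 val = 0;

	if (nb && amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
		pci_read_config_dword(nb->misc, reg, &val);
	return val;
}
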
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 286de34b0ed6..5e3969c36d7f 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -141,13 +141,13 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
141 141
142static inline u32 native_apic_msr_read(u32 reg) 142static inline u32 native_apic_msr_read(u32 reg)
143{ 143{
144 u32 low, high; 144 u64 msr;
145 145
146 if (reg == APIC_DFR) 146 if (reg == APIC_DFR)
147 return -1; 147 return -1;
148 148
149 rdmsr(APIC_BASE_MSR + (reg >> 4), low, high); 149 rdmsrl(APIC_BASE_MSR + (reg >> 4), msr);
150 return low; 150 return (u32)msr;
151} 151}
152 152
153static inline void native_x2apic_wait_icr_idle(void) 153static inline void native_x2apic_wait_icr_idle(void)
@@ -181,12 +181,12 @@ extern void enable_x2apic(void);
181extern void x2apic_icr_write(u32 low, u32 id); 181extern void x2apic_icr_write(u32 low, u32 id);
182static inline int x2apic_enabled(void) 182static inline int x2apic_enabled(void)
183{ 183{
184 int msr, msr2; 184 u64 msr;
185 185
186 if (!cpu_has_x2apic) 186 if (!cpu_has_x2apic)
187 return 0; 187 return 0;
188 188
189 rdmsr(MSR_IA32_APICBASE, msr, msr2); 189 rdmsrl(MSR_IA32_APICBASE, msr);
190 if (msr & X2APIC_ENABLE) 190 if (msr & X2APIC_ENABLE)
191 return 1; 191 return 1;
192 return 0; 192 return 0;
@@ -234,16 +234,17 @@ extern void init_bsp_APIC(void);
234extern void setup_local_APIC(void); 234extern void setup_local_APIC(void);
235extern void end_local_APIC_setup(void); 235extern void end_local_APIC_setup(void);
236extern void init_apic_mappings(void); 236extern void init_apic_mappings(void);
237void register_lapic_address(unsigned long address);
237extern void setup_boot_APIC_clock(void); 238extern void setup_boot_APIC_clock(void);
238extern void setup_secondary_APIC_clock(void); 239extern void setup_secondary_APIC_clock(void);
239extern int APIC_init_uniprocessor(void); 240extern int APIC_init_uniprocessor(void);
240extern void enable_NMI_through_LVT0(void); 241extern void enable_NMI_through_LVT0(void);
242extern int apic_force_enable(void);
241 243
242/* 244/*
243 * On 32bit this is mach-xxx local 245 * On 32bit this is mach-xxx local
244 */ 246 */
245#ifdef CONFIG_X86_64 247#ifdef CONFIG_X86_64
246extern void early_init_lapic_mapping(void);
247extern int apic_is_clustered_box(void); 248extern int apic_is_clustered_box(void);
248#else 249#else
249static inline int apic_is_clustered_box(void) 250static inline int apic_is_clustered_box(void)
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index a859ca461fb0..47a30ff8e517 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -145,6 +145,7 @@
145 145
146#ifdef CONFIG_X86_32 146#ifdef CONFIG_X86_32
147# define MAX_IO_APICS 64 147# define MAX_IO_APICS 64
148# define MAX_LOCAL_APIC 256
148#else 149#else
149# define MAX_IO_APICS 128 150# define MAX_IO_APICS 128
150# define MAX_LOCAL_APIC 32768 151# define MAX_LOCAL_APIC 32768
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3b62ab56c7a0..5e1a2eef3e7c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -32,11 +32,7 @@
32#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
33#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
34 34
35#ifdef CONFIG_X86_64 35#define BOOT_HEAP_SIZE 0x8000
36#define BOOT_HEAP_SIZE 0x7000
37#else
38#define BOOT_HEAP_SIZE 0x4000
39#endif
40 36
41#endif /* !CONFIG_KERNEL_BZIP2 */ 37#endif /* !CONFIG_KERNEL_BZIP2 */
42 38
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 63e35ec9075c..62f084478f7e 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -1,48 +1,8 @@
1#ifndef _ASM_X86_CACHEFLUSH_H 1#ifndef _ASM_X86_CACHEFLUSH_H
2#define _ASM_X86_CACHEFLUSH_H 2#define _ASM_X86_CACHEFLUSH_H
3 3
4/* Keep includes the same across arches. */
5#include <linux/mm.h>
6
7/* Caches aren't brain-dead on the intel. */ 4/* Caches aren't brain-dead on the intel. */
8static inline void flush_cache_all(void) { } 5#include <asm-generic/cacheflush.h>
9static inline void flush_cache_mm(struct mm_struct *mm) { }
10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
16static inline void flush_dcache_page(struct page *page) { }
17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
19static inline void flush_icache_range(unsigned long start,
20 unsigned long end) { }
21static inline void flush_icache_page(struct vm_area_struct *vma,
22 struct page *page) { }
23static inline void flush_icache_user_range(struct vm_area_struct *vma,
24 struct page *page,
25 unsigned long addr,
26 unsigned long len) { }
27static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
28static inline void flush_cache_vunmap(unsigned long start,
29 unsigned long end) { }
30
31static inline void copy_to_user_page(struct vm_area_struct *vma,
32 struct page *page, unsigned long vaddr,
33 void *dst, const void *src,
34 unsigned long len)
35{
36 memcpy(dst, src, len);
37}
38
39static inline void copy_from_user_page(struct vm_area_struct *vma,
40 struct page *page, unsigned long vaddr,
41 void *dst, const void *src,
42 unsigned long len)
43{
44 memcpy(dst, src, len);
45}
46 6
47#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
48/* 8/*
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 4fab24de26b1..6e6e7558e702 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,5 +32,6 @@ extern void arch_unregister_cpu(int);
32 32
33DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34 34
35int __cpuinit mwait_usable(const struct cpuinfo_x86 *);
35 36
36#endif /* _ASM_X86_CPU_H */ 37#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b81002f23614..078ad0caefc6 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -94,7 +94,7 @@ static inline void hw_breakpoint_disable(void)
94 94
95static inline int hw_breakpoint_active(void) 95static inline int hw_breakpoint_active(void)
96{ 96{
97 return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; 97 return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
98} 98}
99 99
100extern void aout_dump_debugregs(struct user *dump); 100extern void aout_dump_debugregs(struct user *dump);
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 5be1542fbfaf..e99d55d74df5 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,6 +72,9 @@ struct e820map {
72#define BIOS_BEGIN 0x000a0000 72#define BIOS_BEGIN 0x000a0000
73#define BIOS_END 0x00100000 73#define BIOS_END 0x00100000
74 74
75#define BIOS_ROM_BASE 0xffe00000
76#define BIOS_ROM_END 0xffffffff
77
75#ifdef __KERNEL__ 78#ifdef __KERNEL__
76/* see comment in arch/x86/kernel/e820.c */ 79/* see comment in arch/x86/kernel/e820.c */
77extern struct e820map e820; 80extern struct e820map e820;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 139591a933f6..4729b2b63117 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -116,11 +116,11 @@ enum fixed_addresses {
116#endif 116#endif
117 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ 117 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
118 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ 118 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
119 __end_of_permanent_fixed_addresses,
120
121#ifdef CONFIG_X86_MRST 119#ifdef CONFIG_X86_MRST
122 FIX_LNW_VRTC, 120 FIX_LNW_VRTC,
123#endif 121#endif
122 __end_of_permanent_fixed_addresses,
123
124 /* 124 /*
125 * 256 temporary boot-time mappings, used by early_ioremap(), 125 * 256 temporary boot-time mappings, used by early_ioremap(),
126 * before ioremap() is functional. 126 * before ioremap() is functional.
@@ -220,8 +220,8 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
220} 220}
221 221
222/* Return a pointer with offset calculated */ 222/* Return a pointer with offset calculated */
223static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx, 223static __always_inline unsigned long
224 phys_addr_t phys, pgprot_t flags) 224__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
225{ 225{
226 __set_fixmap(idx, phys, flags); 226 __set_fixmap(idx, phys, flags);
227 return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); 227 return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 49dbfdfa50f9..91d915a65259 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -38,12 +38,9 @@ static inline int gpio_cansleep(unsigned int gpio)
38 return __gpio_cansleep(gpio); 38 return __gpio_cansleep(gpio);
39} 39}
40 40
41/*
42 * Not implemented, yet.
43 */
44static inline int gpio_to_irq(unsigned int gpio) 41static inline int gpio_to_irq(unsigned int gpio)
45{ 42{
46 return -ENOSYS; 43 return __gpio_to_irq(gpio);
47} 44}
48 45
49static inline int irq_to_gpio(unsigned int irq) 46static inline int irq_to_gpio(unsigned int irq)
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index ff2546ce7178..7a15153c675d 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,6 +20,9 @@
20#ifndef _ASM_X86_HYPERVISOR_H 20#ifndef _ASM_X86_HYPERVISOR_H
21#define _ASM_X86_HYPERVISOR_H 21#define _ASM_X86_HYPERVISOR_H
22 22
23#include <asm/kvm_para.h>
24#include <asm/xen/hypervisor.h>
25
23extern void init_hypervisor(struct cpuinfo_x86 *c); 26extern void init_hypervisor(struct cpuinfo_x86 *c);
24extern void init_hypervisor_platform(void); 27extern void init_hypervisor_platform(void);
25 28
@@ -47,4 +50,13 @@ extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 50extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48extern const struct hypervisor_x86 x86_hyper_xen_hvm; 51extern const struct hypervisor_x86 x86_hyper_xen_hvm;
49 52
53static inline bool hypervisor_x2apic_available(void)
54{
55 if (kvm_para_available())
56 return true;
57 if (xen_x2apic_para_available())
58 return true;
59 return false;
60}
61
50#endif 62#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 4aa2bb3b242a..ef328901c802 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
93 int err; 93 int err;
94 94
95 /* See comment in fxsave() below. */ 95 /* See comment in fxsave() below. */
96#ifdef CONFIG_AS_FXSAVEQ
97 asm volatile("1: fxrstorq %[fx]\n\t"
98 "2:\n"
99 ".section .fixup,\"ax\"\n"
100 "3: movl $-1,%[err]\n"
101 " jmp 2b\n"
102 ".previous\n"
103 _ASM_EXTABLE(1b, 3b)
104 : [err] "=r" (err)
105 : [fx] "m" (*fx), "0" (0));
106#else
96 asm volatile("1: rex64/fxrstor (%[fx])\n\t" 107 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
97 "2:\n" 108 "2:\n"
98 ".section .fixup,\"ax\"\n" 109 ".section .fixup,\"ax\"\n"
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
102 _ASM_EXTABLE(1b, 3b) 113 _ASM_EXTABLE(1b, 3b)
103 : [err] "=r" (err) 114 : [err] "=r" (err)
104 : [fx] "R" (fx), "m" (*fx), "0" (0)); 115 : [fx] "R" (fx), "m" (*fx), "0" (0));
116#endif
105 return err; 117 return err;
106} 118}
107 119
@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
119 return -EFAULT; 131 return -EFAULT;
120 132
121 /* See comment in fxsave() below. */ 133 /* See comment in fxsave() below. */
134#ifdef CONFIG_AS_FXSAVEQ
135 asm volatile("1: fxsaveq %[fx]\n\t"
136 "2:\n"
137 ".section .fixup,\"ax\"\n"
138 "3: movl $-1,%[err]\n"
139 " jmp 2b\n"
140 ".previous\n"
141 _ASM_EXTABLE(1b, 3b)
142 : [err] "=r" (err), [fx] "=m" (*fx)
143 : "0" (0));
144#else
122 asm volatile("1: rex64/fxsave (%[fx])\n\t" 145 asm volatile("1: rex64/fxsave (%[fx])\n\t"
123 "2:\n" 146 "2:\n"
124 ".section .fixup,\"ax\"\n" 147 ".section .fixup,\"ax\"\n"
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
128 _ASM_EXTABLE(1b, 3b) 151 _ASM_EXTABLE(1b, 3b)
129 : [err] "=r" (err), "=m" (*fx) 152 : [err] "=r" (err), "=m" (*fx)
130 : [fx] "R" (fx), "0" (0)); 153 : [fx] "R" (fx), "0" (0));
154#endif
131 if (unlikely(err) && 155 if (unlikely(err) &&
132 __clear_user(fx, sizeof(struct i387_fxsave_struct))) 156 __clear_user(fx, sizeof(struct i387_fxsave_struct)))
133 err = -EFAULT; 157 err = -EFAULT;
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a6b28d017c2f..f327d386d6cc 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,7 @@ struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 159extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 160 struct io_apic_irq_attr *irq_attr);
161void setup_IO_APIC_irq_extra(u32 gsi); 161void setup_IO_APIC_irq_extra(u32 gsi);
162extern void ioapic_init_mappings(void); 162extern void ioapic_and_gsi_init(void);
163extern void ioapic_insert_resources(void); 163extern void ioapic_insert_resources(void);
164 164
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
@@ -168,10 +168,10 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
170 170
171extern void probe_nr_irqs_gsi(void);
172extern int get_nr_irqs_gsi(void); 171extern int get_nr_irqs_gsi(void);
173 172
174extern void setup_ioapic_ids_from_mpc(void); 173extern void setup_ioapic_ids_from_mpc(void);
174extern void setup_ioapic_ids_from_mpc_nocheck(void);
175 175
176struct mp_ioapic_gsi{ 176struct mp_ioapic_gsi{
177 u32 gsi_base; 177 u32 gsi_base;
@@ -184,14 +184,15 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi);
184void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 184void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
185extern void __init pre_init_apic_IRQ0(void); 185extern void __init pre_init_apic_IRQ0(void);
186 186
187extern void mp_save_irq(struct mpc_intsrc *m);
188
187#else /* !CONFIG_X86_IO_APIC */ 189#else /* !CONFIG_X86_IO_APIC */
188 190
189#define io_apic_assign_pci_irqs 0 191#define io_apic_assign_pci_irqs 0
190#define setup_ioapic_ids_from_mpc x86_init_noop 192#define setup_ioapic_ids_from_mpc x86_init_noop
191static const int timer_through_8259 = 0; 193static const int timer_through_8259 = 0;
192static inline void ioapic_init_mappings(void) { } 194static inline void ioapic_and_gsi_init(void) { }
193static inline void ioapic_insert_resources(void) { } 195static inline void ioapic_insert_resources(void) { }
194static inline void probe_nr_irqs_gsi(void) { }
195#define gsi_top (NR_IRQS_LEGACY) 196#define gsi_top (NR_IRQS_LEGACY)
196static inline int mp_find_ioapic(u32 gsi) { return 0; } 197static inline int mp_find_ioapic(u32 gsi) { return 0; }
197 198
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 13b0ebaa512f..c704b38c57a2 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,15 +10,14 @@
10#include <asm/apicdef.h> 10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h> 11#include <asm/irq_vectors.h>
12 12
13/* Even though we don't support this, supply it to appease OF */
14static inline void irq_dispose_mapping(unsigned int virq) { }
15
13static inline int irq_canonicalize(int irq) 16static inline int irq_canonicalize(int irq)
14{ 17{
15 return ((irq == 2) ? 9 : irq); 18 return ((irq == 2) ? 9 : irq);
16} 19}
17 20
18#ifdef CONFIG_X86_LOCAL_APIC
19# define ARCH_HAS_NMI_WATCHDOG
20#endif
21
22#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
23extern void irq_ctx_init(int cpu); 22extern void irq_ctx_init(int cpu);
24#else 23#else
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index f52d42e80585..574dbc22893a 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -14,7 +14,7 @@
14 do { \ 14 do { \
15 asm goto("1:" \ 15 asm goto("1:" \
16 JUMP_LABEL_INITIAL_NOP \ 16 JUMP_LABEL_INITIAL_NOP \
17 ".pushsection __jump_table, \"a\" \n\t"\ 17 ".pushsection __jump_table, \"aw\" \n\t"\
18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ 18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
19 ".popsection \n\t" \ 19 ".popsection \n\t" \
20 : : "i" (key) : : label); \ 20 : : "i" (key) : : label); \
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 5bdfca86581b..ca242d35e873 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -18,7 +18,6 @@ enum die_val {
18 DIE_TRAP, 18 DIE_TRAP,
19 DIE_GPF, 19 DIE_GPF,
20 DIE_CALL, 20 DIE_CALL,
21 DIE_NMI_IPI,
22 DIE_PAGE_FAULT, 21 DIE_PAGE_FAULT,
23 DIE_NMIUNKNOWN, 22 DIE_NMIUNKNOWN,
24}; 23};
@@ -28,7 +27,7 @@ extern void die(const char *, struct pt_regs *,long);
28extern int __must_check __die(const char *, struct pt_regs *, long); 27extern int __must_check __die(const char *, struct pt_regs *, long);
29extern void show_registers(struct pt_regs *regs); 28extern void show_registers(struct pt_regs *regs);
30extern void show_trace(struct task_struct *t, struct pt_regs *regs, 29extern void show_trace(struct task_struct *t, struct pt_regs *regs,
31 unsigned long *sp, unsigned long bp); 30 unsigned long *sp);
32extern void __show_regs(struct pt_regs *regs, int all); 31extern void __show_regs(struct pt_regs *regs, int all);
33extern void show_regs(struct pt_regs *regs); 32extern void show_regs(struct pt_regs *regs);
34extern unsigned long oops_begin(void); 33extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@
15 15
16struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
17 17
18struct x86_exception {
19 u8 vector;
20 bool error_code_valid;
21 u16 error_code;
22 bool nested_page_fault;
23 u64 address; /* cr2 or nested page fault gpa */
24};
25
18/* 26/*
19 * x86_emulate_ops: 27 * x86_emulate_ops:
20 * 28 *
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
64 * @bytes: [IN ] Number of bytes to read from memory. 72 * @bytes: [IN ] Number of bytes to read from memory.
65 */ 73 */
66 int (*read_std)(unsigned long addr, void *val, 74 int (*read_std)(unsigned long addr, void *val,
67 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 75 unsigned int bytes, struct kvm_vcpu *vcpu,
76 struct x86_exception *fault);
68 77
69 /* 78 /*
70 * write_std: Write bytes of standard (non-emulated/special) memory. 79 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
74 * @bytes: [IN ] Number of bytes to write to memory. 83 * @bytes: [IN ] Number of bytes to write to memory.
75 */ 84 */
76 int (*write_std)(unsigned long addr, void *val, 85 int (*write_std)(unsigned long addr, void *val,
77 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 86 unsigned int bytes, struct kvm_vcpu *vcpu,
87 struct x86_exception *fault);
78 /* 88 /*
79 * fetch: Read bytes of standard (non-emulated/special) memory. 89 * fetch: Read bytes of standard (non-emulated/special) memory.
80 * Used for instruction fetch. 90 * Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
83 * @bytes: [IN ] Number of bytes to read from memory. 93 * @bytes: [IN ] Number of bytes to read from memory.
84 */ 94 */
85 int (*fetch)(unsigned long addr, void *val, 95 int (*fetch)(unsigned long addr, void *val,
86 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 96 unsigned int bytes, struct kvm_vcpu *vcpu,
97 struct x86_exception *fault);
87 98
88 /* 99 /*
89 * read_emulated: Read bytes from emulated/special memory area. 100 * read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
94 int (*read_emulated)(unsigned long addr, 105 int (*read_emulated)(unsigned long addr,
95 void *val, 106 void *val,
96 unsigned int bytes, 107 unsigned int bytes,
97 unsigned int *error, 108 struct x86_exception *fault,
98 struct kvm_vcpu *vcpu); 109 struct kvm_vcpu *vcpu);
99 110
100 /* 111 /*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
107 int (*write_emulated)(unsigned long addr, 118 int (*write_emulated)(unsigned long addr,
108 const void *val, 119 const void *val,
109 unsigned int bytes, 120 unsigned int bytes,
110 unsigned int *error, 121 struct x86_exception *fault,
111 struct kvm_vcpu *vcpu); 122 struct kvm_vcpu *vcpu);
112 123
113 /* 124 /*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
122 const void *old, 133 const void *old,
123 const void *new, 134 const void *new,
124 unsigned int bytes, 135 unsigned int bytes,
125 unsigned int *error, 136 struct x86_exception *fault,
126 struct kvm_vcpu *vcpu); 137 struct kvm_vcpu *vcpu);
127 138
128 int (*pio_in_emulated)(int size, unsigned short port, void *val, 139 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
159 }; 170 };
160 union { 171 union {
161 unsigned long *reg; 172 unsigned long *reg;
162 unsigned long mem; 173 struct segmented_address {
174 ulong ea;
175 unsigned seg;
176 } mem;
163 } addr; 177 } addr;
164 union { 178 union {
165 unsigned long val; 179 unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
226 240
227 bool perm_ok; /* do not check permissions if true */ 241 bool perm_ok; /* do not check permissions if true */
228 242
229 int exception; /* exception that happens during emulation or -1 */ 243 bool have_exception;
230 u32 error_code; /* error code for exception */ 244 struct x86_exception exception;
231 bool error_code_valid;
232 245
233 /* decode cache */ 246 /* decode cache */
234 struct decode_cache decode; 247 struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
252#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 265#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
253#endif 266#endif
254 267
255int x86_decode_insn(struct x86_emulate_ctxt *ctxt); 268int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
256#define EMULATION_FAILED -1 269#define EMULATION_FAILED -1
257#define EMULATION_OK 0 270#define EMULATION_OK 0
258#define EMULATION_RESTART 1 271#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e6fe391094e..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,15 +79,18 @@
79#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) 79#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
80#define KVM_MIN_FREE_MMU_PAGES 5 80#define KVM_MIN_FREE_MMU_PAGES 5
81#define KVM_REFILL_PAGES 25 81#define KVM_REFILL_PAGES 25
82#define KVM_MAX_CPUID_ENTRIES 40 82#define KVM_MAX_CPUID_ENTRIES 80
83#define KVM_NR_FIXED_MTRR_REGION 88 83#define KVM_NR_FIXED_MTRR_REGION 88
84#define KVM_NR_VAR_MTRR 8 84#define KVM_NR_VAR_MTRR 8
85 85
86#define ASYNC_PF_PER_VCPU 64
87
86extern spinlock_t kvm_lock; 88extern spinlock_t kvm_lock;
87extern struct list_head vm_list; 89extern struct list_head vm_list;
88 90
89struct kvm_vcpu; 91struct kvm_vcpu;
90struct kvm; 92struct kvm;
93struct kvm_async_pf;
91 94
92enum kvm_reg { 95enum kvm_reg {
93 VCPU_REGS_RAX = 0, 96 VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
114 117
115enum kvm_reg_ex { 118enum kvm_reg_ex {
116 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 119 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
120 VCPU_EXREG_CR3,
117}; 121};
118 122
119enum { 123enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
238 void (*new_cr3)(struct kvm_vcpu *vcpu); 242 void (*new_cr3)(struct kvm_vcpu *vcpu);
239 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 243 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
240 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 244 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
241 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 245 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
242 void (*inject_page_fault)(struct kvm_vcpu *vcpu); 246 bool prefault);
247 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
248 struct x86_exception *fault);
243 void (*free)(struct kvm_vcpu *vcpu); 249 void (*free)(struct kvm_vcpu *vcpu);
244 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 250 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
245 u32 *error); 251 struct x86_exception *exception);
246 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 252 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
247 void (*prefetch_page)(struct kvm_vcpu *vcpu, 253 void (*prefetch_page)(struct kvm_vcpu *vcpu,
248 struct kvm_mmu_page *page); 254 struct kvm_mmu_page *page);
249 int (*sync_page)(struct kvm_vcpu *vcpu, 255 int (*sync_page)(struct kvm_vcpu *vcpu,
250 struct kvm_mmu_page *sp, bool clear_unsync); 256 struct kvm_mmu_page *sp);
251 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
252 hpa_t root_hpa; 258 hpa_t root_hpa;
253 int root_level; 259 int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
315 */ 321 */
316 struct kvm_mmu *walk_mmu; 322 struct kvm_mmu *walk_mmu;
317 323
318 /*
319 * This struct is filled with the necessary information to propagate a
320 * page fault into the guest
321 */
322 struct {
323 u64 address;
324 unsigned error_code;
325 bool nested;
326 } fault;
327
328 /* only needed in kvm_pv_mmu_op() path, but it's hot so 324 /* only needed in kvm_pv_mmu_op() path, but it's hot so
329 * put it here to avoid allocation */ 325 * put it here to avoid allocation */
330 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 326 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
412 u64 hv_vapic; 408 u64 hv_vapic;
413 409
414 cpumask_var_t wbinvd_dirty_mask; 410 cpumask_var_t wbinvd_dirty_mask;
411
412 struct {
413 bool halted;
414 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
415 struct gfn_to_hva_cache data;
416 u64 msr_val;
417 u32 id;
418 bool send_user_only;
419 } apf;
415}; 420};
416 421
417struct kvm_arch { 422struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
456 /* fields used by HYPER-V emulation */ 461 /* fields used by HYPER-V emulation */
457 u64 hv_guest_os_id; 462 u64 hv_guest_os_id;
458 u64 hv_hypercall; 463 u64 hv_hypercall;
464
465 #ifdef CONFIG_KVM_MMU_AUDIT
466 int audit_point;
467 #endif
459}; 468};
460 469
461struct kvm_vm_stat { 470struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
529 struct kvm_segment *var, int seg); 538 struct kvm_segment *var, int seg);
530 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 539 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
531 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); 540 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
541 void (*decache_cr3)(struct kvm_vcpu *vcpu);
532 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 542 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
533 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 543 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
534 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 544 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
582 592
583 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 593 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
584 594
595 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
585 const struct trace_print_flags *exit_reasons_str; 596 const struct trace_print_flags *exit_reasons_str;
586}; 597};
587 598
599struct kvm_arch_async_pf {
600 u32 token;
601 gfn_t gfn;
602 unsigned long cr3;
603 bool direct_map;
604};
605
588extern struct kvm_x86_ops *kvm_x86_ops; 606extern struct kvm_x86_ops *kvm_x86_ops;
589 607
590int kvm_mmu_module_init(void); 608int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
594int kvm_mmu_create(struct kvm_vcpu *vcpu); 612int kvm_mmu_create(struct kvm_vcpu *vcpu);
595int kvm_mmu_setup(struct kvm_vcpu *vcpu); 613int kvm_mmu_setup(struct kvm_vcpu *vcpu);
596void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 614void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
597void kvm_mmu_set_base_ptes(u64 base_pte);
598void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 615void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
599 u64 dirty_mask, u64 nx_mask, u64 x_mask); 616 u64 dirty_mask, u64 nx_mask, u64 x_mask);
600 617
@@ -623,8 +640,15 @@ enum emulation_result {
623#define EMULTYPE_NO_DECODE (1 << 0) 640#define EMULTYPE_NO_DECODE (1 << 0)
624#define EMULTYPE_TRAP_UD (1 << 1) 641#define EMULTYPE_TRAP_UD (1 << 1)
625#define EMULTYPE_SKIP (1 << 2) 642#define EMULTYPE_SKIP (1 << 2)
626int emulate_instruction(struct kvm_vcpu *vcpu, 643int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
627 unsigned long cr2, u16 error_code, int emulation_type); 644 int emulation_type, void *insn, int insn_len);
645
646static inline int emulate_instruction(struct kvm_vcpu *vcpu,
647 int emulation_type)
648{
649 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
650}
651
628void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 652void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
629void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 653void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
630 654
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
650int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 674int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
651int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 675int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
652int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 676int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
653void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 677int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
654int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 678int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
655int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 679int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
656unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 680unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
668void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 692void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
669void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); 693void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
670void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 694void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
671void kvm_inject_page_fault(struct kvm_vcpu *vcpu); 695void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
672int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 696int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
673 gfn_t gfn, void *data, int offset, int len, 697 gfn_t gfn, void *data, int offset, int len,
674 u32 access); 698 u32 access);
675void kvm_propagate_fault(struct kvm_vcpu *vcpu); 699void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
676bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 700bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
677 701
678int kvm_pic_set_irq(void *opaque, int irq, int level); 702int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
690int kvm_mmu_load(struct kvm_vcpu *vcpu); 714int kvm_mmu_load(struct kvm_vcpu *vcpu);
691void kvm_mmu_unload(struct kvm_vcpu *vcpu); 715void kvm_mmu_unload(struct kvm_vcpu *vcpu);
692void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 716void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
693gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 717gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
694gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 718 struct x86_exception *exception);
695gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 719gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
696gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 720 struct x86_exception *exception);
721gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
722 struct x86_exception *exception);
723gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
724 struct x86_exception *exception);
697 725
698int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 726int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
699 727
700int kvm_fix_hypercall(struct kvm_vcpu *vcpu); 728int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
701 729
702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); 730int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
731 void *insn, int insn_len);
703void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 732void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
704 733
705void kvm_enable_tdp(void); 734void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
766#define HF_VINTR_MASK (1 << 2) 795#define HF_VINTR_MASK (1 << 2)
767#define HF_NMI_MASK (1 << 3) 796#define HF_NMI_MASK (1 << 3)
768#define HF_IRET_MASK (1 << 4) 797#define HF_IRET_MASK (1 << 4)
798#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
769 799
770/* 800/*
771 * Hardware virtualization extension instructions may fault if a 801 * Hardware virtualization extension instructions may fault if a
772 * reboot turns off virtualization while processes are running. 802 * reboot turns off virtualization while processes are running.
773 * Trap the fault and ignore the instruction if that happens. 803 * Trap the fault and ignore the instruction if that happens.
774 */ 804 */
775asmlinkage void kvm_handle_fault_on_reboot(void); 805asmlinkage void kvm_spurious_fault(void);
806extern bool kvm_rebooting;
776 807
777#define __kvm_handle_fault_on_reboot(insn) \ 808#define __kvm_handle_fault_on_reboot(insn) \
778 "666: " insn "\n\t" \ 809 "666: " insn "\n\t" \
810 "668: \n\t" \
779 ".pushsection .fixup, \"ax\" \n" \ 811 ".pushsection .fixup, \"ax\" \n" \
780 "667: \n\t" \ 812 "667: \n\t" \
813 "cmpb $0, kvm_rebooting \n\t" \
814 "jne 668b \n\t" \
781 __ASM_SIZE(push) " $666b \n\t" \ 815 __ASM_SIZE(push) " $666b \n\t" \
782 "jmp kvm_handle_fault_on_reboot \n\t" \ 816 "call kvm_spurious_fault \n\t" \
783 ".popsection \n\t" \ 817 ".popsection \n\t" \
784 ".pushsection __ex_table, \"a\" \n\t" \ 818 ".pushsection __ex_table, \"a\" \n\t" \
785 _ASM_PTR " 666b, 667b \n\t" \ 819 _ASM_PTR " 666b, 667b \n\t" \
@@ -788,6 +822,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
788#define KVM_ARCH_WANT_MMU_NOTIFIER 822#define KVM_ARCH_WANT_MMU_NOTIFIER
789int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
790int kvm_age_hva(struct kvm *kvm, unsigned long hva); 824int kvm_age_hva(struct kvm *kvm, unsigned long hva);
825int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
791void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 826void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
792int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 827int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
793int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 828int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
@@ -799,4 +834,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
799 834
800bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 835bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
801 836
837void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
838 struct kvm_async_pf *work);
839void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
840 struct kvm_async_pf *work);
841void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
842 struct kvm_async_pf *work);
843bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
844extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
845
846void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
847
802#endif /* _ASM_X86_KVM_HOST_H */ 848#endif /* _ASM_X86_KVM_HOST_H */
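The new struct x86_exception replaces both the loose (exception, error_code, error_code_valid) fields in the emulator context and the per-vcpu fault block removed above: callbacks fill one descriptor that can then be handed to kvm_inject_page_fault() or kvm_propagate_fault(). A hedged sketch of how a failing translation might fill it (values are illustrative; PF_VECTOR is the #PF vector defined elsewhere in this header):

#include <asm/kvm_host.h>

/* Illustrative sketch: describe a guest page fault at 'gva'. */
static void report_page_fault_sketch(struct x86_exception *fault,
				     unsigned long gva, u16 error_code)
{
	fault->vector = PF_VECTOR;
	fault->error_code_valid = true;
	fault->error_code = error_code;
	fault->nested_page_fault = false;
	fault->address = gva;		/* becomes cr2 for an ordinary #PF */
}
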
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
20 * are available. The use of 0x11 and 0x12 is deprecated 20 * are available. The use of 0x11 and 0x12 is deprecated
21 */ 21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4
23 24
24/* The last 8 bits are used to indicate how to interpret the flags field 25/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored. 26 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
32/* Custom MSRs fall in the range 0x4b564d00-0x4b564dff */ 33/* Custom MSRs fall in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 34#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 35#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
36#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
35 37
36#define KVM_MAX_MMU_OP_BATCH 32 38#define KVM_MAX_MMU_OP_BATCH 32
37 39
40#define KVM_ASYNC_PF_ENABLED (1 << 0)
41#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
42
38/* Operations for KVM_HC_MMU_OP */ 43/* Operations for KVM_HC_MMU_OP */
39#define KVM_MMU_OP_WRITE_PTE 1 44#define KVM_MMU_OP_WRITE_PTE 1
40#define KVM_MMU_OP_FLUSH_TLB 2 45#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
61 __u64 pt_phys; 66 __u64 pt_phys;
62}; 67};
63 68
69#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
70#define KVM_PV_REASON_PAGE_READY 2
71
72struct kvm_vcpu_pv_apf_data {
73 __u32 reason;
74 __u8 pad[60];
75 __u32 enabled;
76};
77
64#ifdef __KERNEL__ 78#ifdef __KERNEL__
65#include <asm/processor.h> 79#include <asm/processor.h>
66 80
67extern void kvmclock_init(void); 81extern void kvmclock_init(void);
82extern int kvm_register_clock(char *txt);
68 83
69 84
70/* This instruction is vmcall. On non-VT architectures, it will generate a 85/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
160 175
161#ifdef CONFIG_KVM_GUEST 176#ifdef CONFIG_KVM_GUEST
162void __init kvm_guest_init(void); 177void __init kvm_guest_init(void);
178void kvm_async_pf_task_wait(u32 token);
179void kvm_async_pf_task_wake(u32 token);
180u32 kvm_read_and_reset_pf_reason(void);
163#else 181#else
164#define kvm_guest_init() do { } while (0) 182#define kvm_guest_init() do { } while (0)
183#define kvm_async_pf_task_wait(T) do {} while(0)
184#define kvm_async_pf_task_wake(T) do {} while(0)
185static inline u32 kvm_read_and_reset_pf_reason(void)
186{
187 return 0;
188}
165#endif 189#endif
166 190
167#endif /* __KERNEL__ */ 191#endif /* __KERNEL__ */
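The guest advertises its kvm_vcpu_pv_apf_data area to the host by writing the area's physical address, with the enable bit set, into MSR_KVM_ASYNC_PF_EN; the host then stores the page-fault "reason" there instead of delivering a synchronous fault. A rough sketch of that handshake, assuming guest kernel context (per-cpu plumbing and alignment handling omitted; the in-tree guest code is authoritative):

#include <linux/kvm_para.h>
#include <asm/msr.h>
#include <asm/page.h>

static struct kvm_vcpu_pv_apf_data apf_data;	/* illustrative, not per-cpu */

static void enable_async_pf_sketch(void)
{
	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		return;
	wrmsrl(MSR_KVM_ASYNC_PF_EN, __pa(&apf_data) | KVM_ASYNC_PF_ENABLED);
}
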
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..72a8b52e7dfd 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
7 7
8#include <asm/mc146818rtc.h> 8#include <asm/mc146818rtc.h>
9 9
10#define NMI_REASON_PORT 0x61
11
12#define NMI_REASON_SERR 0x80
13#define NMI_REASON_IOCHK 0x40
14#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
15
16#define NMI_REASON_CLEAR_SERR 0x04
17#define NMI_REASON_CLEAR_IOCHK 0x08
18#define NMI_REASON_CLEAR_MASK 0x0f
19
10static inline unsigned char get_nmi_reason(void) 20static inline unsigned char get_nmi_reason(void)
11{ 21{
12 return inb(0x61); 22 return inb(NMI_REASON_PORT);
13} 23}
14 24
15static inline void reassert_nmi(void) 25static inline void reassert_nmi(void)
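The NMI_REASON_* constants name the bits of system control port B (I/O port 0x61): bit 7 reports a system/PCI SERR, bit 6 an I/O channel check, and the CLEAR_* values are written back to the port to acknowledge and re-enable them. A small sketch of decoding the register with the new names (illustrative; the in-tree NMI handling is authoritative):

#include <linux/kernel.h>
#include <asm/mach_traps.h>

static void decode_nmi_reason_sketch(void)
{
	unsigned char reason = get_nmi_reason();

	if (reason & NMI_REASON_SERR)
		printk(KERN_WARNING "NMI: SERR (bus/parity error) asserted\n");
	if (reason & NMI_REASON_IOCHK)
		printk(KERN_WARNING "NMI: I/O channel check asserted\n");
}
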
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13cb9788..eb16e94ae04f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
223 223
224void mce_log_therm_throt_event(__u64 status); 224void mce_log_therm_throt_event(__u64 status);
225 225
226/* Interrupt Handler for core thermal thresholds */
227extern int (*platform_thermal_notify)(__u64 msr_val);
228
226#ifdef CONFIG_X86_THERMAL_VECTOR 229#ifdef CONFIG_X86_THERMAL_VECTOR
227extern void mcheck_intel_therm_init(void); 230extern void mcheck_intel_therm_init(void);
228#else 231#else
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..24215072d0e1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
48 48
49#ifdef CONFIG_MICROCODE_AMD 49#ifdef CONFIG_MICROCODE_AMD
50extern struct microcode_ops * __init init_amd_microcode(void); 50extern struct microcode_ops * __init init_amd_microcode(void);
51
52static inline void get_ucode_data(void *to, const u8 *from, size_t n)
53{
54 memcpy(to, from, n);
55}
56
51#else 57#else
52static inline struct microcode_ops * __init init_amd_microcode(void) 58static inline struct microcode_ops * __init init_amd_microcode(void)
53{ 59{
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c82868e9f905..0c90dd9f0505 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@
5 5
6#include <asm/mpspec_def.h> 6#include <asm/mpspec_def.h>
7#include <asm/x86_init.h> 7#include <asm/x86_init.h>
8#include <asm/apicdef.h>
8 9
9extern int apic_version[MAX_APICS]; 10extern int apic_version[];
10extern int pic_mode; 11extern int pic_mode;
11 12
12#ifdef CONFIG_X86_32 13#ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
107 int active_high_low); 108 int active_high_low);
108#endif /* CONFIG_ACPI */ 109#endif /* CONFIG_ACPI */
109 110
110#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) 111#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
111 112
112struct physid_mask { 113struct physid_mask {
113 unsigned long mask[PHYSID_ARRAY_SIZE]; 114 unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
122 test_and_set_bit(physid, (map).mask) 123 test_and_set_bit(physid, (map).mask)
123 124
124#define physids_and(dst, src1, src2) \ 125#define physids_and(dst, src1, src2) \
125 bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 126 bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
126 127
127#define physids_or(dst, src1, src2) \ 128#define physids_or(dst, src1, src2) \
128 bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 129 bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
129 130
130#define physids_clear(map) \ 131#define physids_clear(map) \
131 bitmap_zero((map).mask, MAX_APICS) 132 bitmap_zero((map).mask, MAX_LOCAL_APIC)
132 133
133#define physids_complement(dst, src) \ 134#define physids_complement(dst, src) \
134 bitmap_complement((dst).mask, (src).mask, MAX_APICS) 135 bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
135 136
136#define physids_empty(map) \ 137#define physids_empty(map) \
137 bitmap_empty((map).mask, MAX_APICS) 138 bitmap_empty((map).mask, MAX_LOCAL_APIC)
138 139
139#define physids_equal(map1, map2) \ 140#define physids_equal(map1, map2) \
140 bitmap_equal((map1).mask, (map2).mask, MAX_APICS) 141 bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
141 142
142#define physids_weight(map) \ 143#define physids_weight(map) \
143 bitmap_weight((map).mask, MAX_APICS) 144 bitmap_weight((map).mask, MAX_LOCAL_APIC)
144 145
145#define physids_shift_right(d, s, n) \ 146#define physids_shift_right(d, s, n) \
146 bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) 147 bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
147 148
148#define physids_shift_left(d, s, n) \ 149#define physids_shift_left(d, s, n) \
149 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) 150 bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
150 151
151static inline unsigned long physids_coerce(physid_mask_t *map) 152static inline unsigned long physids_coerce(physid_mask_t *map)
152{ 153{
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
159 map->mask[0] = physids; 160 map->mask[0] = physids;
160} 161}
161 162
162/* Note: will create very large stack frames if physid_mask_t is big */
163#define physid_mask_of_physid(physid) \
164 ({ \
165 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
166 physid_set(physid, __physid_mask); \
167 __physid_mask; \
168 })
169
170static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) 163static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
171{ 164{
172 physids_clear(*map); 165 physids_clear(*map);
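
With the stack-heavy physid_mask_of_physid() removed, callers fill a caller-provided mask in place instead; a sketch of the replacement pattern (function name hypothetical):

	static void example_mask_one_apic(int apicid)
	{
		physid_mask_t mask;

		/* was: mask = physid_mask_of_physid(apicid); */
		physid_set_mask_of_physid(apicid, &mask);

		if (physid_isset(apicid, mask))
			pr_debug("APIC id %d present in mask\n", apicid);
	}
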
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..c0a955a9a087 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@
15 15
16#ifdef CONFIG_X86_32 16#ifdef CONFIG_X86_32
17# define MAX_MPC_ENTRY 1024 17# define MAX_MPC_ENTRY 1024
18# define MAX_APICS 256
19#else
20# if NR_CPUS <= 255
21# define MAX_APICS 255
22# else
23# define MAX_APICS 32768
24# endif
25#endif 18#endif
26 19
27/* Intel MP Floating Pointer Structure */ 20/* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3ea3dc487047..4d0dfa0d998e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -123,12 +123,16 @@
123#define MSR_AMD64_IBSCTL 0xc001103a 123#define MSR_AMD64_IBSCTL 0xc001103a
124#define MSR_AMD64_IBSBRTARGET 0xc001103b 124#define MSR_AMD64_IBSBRTARGET 0xc001103b
125 125
126/* Fam 15h MSRs */
127#define MSR_F15H_PERF_CTL 0xc0010200
128#define MSR_F15H_PERF_CTR 0xc0010201
129
126/* Fam 10h MSRs */ 130/* Fam 10h MSRs */
127#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 131#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
128#define FAM10H_MMIO_CONF_ENABLE (1<<0) 132#define FAM10H_MMIO_CONF_ENABLE (1<<0)
129#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf 133#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
130#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 134#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
131#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff 135#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
132#define FAM10H_MMIO_CONF_BASE_SHIFT 20 136#define FAM10H_MMIO_CONF_BASE_SHIFT 20
133#define MSR_FAM10H_NODE_ID 0xc001100c 137#define MSR_FAM10H_NODE_ID 0xc001100c
134 138
@@ -253,6 +257,18 @@
253#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) 257#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
254#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) 258#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
255 259
260/* Thermal Thresholds Support */
261#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
262#define THERM_SHIFT_THRESHOLD0 8
263#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0)
264#define THERM_INT_THRESHOLD1_ENABLE (1 << 23)
265#define THERM_SHIFT_THRESHOLD1 16
266#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1)
267#define THERM_STATUS_THRESHOLD0 (1 << 6)
268#define THERM_LOG_THRESHOLD0 (1 << 7)
269#define THERM_STATUS_THRESHOLD1 (1 << 8)
270#define THERM_LOG_THRESHOLD1 (1 << 9)
271
256/* MISC_ENABLE bits: architectural */ 272/* MISC_ENABLE bits: architectural */
257#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 273#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
258#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 274#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
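
A hedged sketch of programming the new threshold fields in IA32_THERM_INTERRUPT; the chosen offset is arbitrary and the helper is hypothetical:

	static void example_arm_threshold0(u8 temp_offset)
	{
		u64 val;

		rdmsrl(MSR_IA32_THERM_INTERRUPT, val);
		val &= ~THERM_MASK_THRESHOLD0;
		val |= (u64)temp_offset << THERM_SHIFT_THRESHOLD0;
		val |= THERM_INT_THRESHOLD0_ENABLE;
		wrmsrl(MSR_IA32_THERM_INTERRUPT, val);
	}
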
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 932f0f86b4b7..c76f5b92b840 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -5,41 +5,15 @@
5#include <asm/irq.h> 5#include <asm/irq.h>
6#include <asm/io.h> 6#include <asm/io.h>
7 7
8#ifdef ARCH_HAS_NMI_WATCHDOG 8#ifdef CONFIG_X86_LOCAL_APIC
9
10/**
11 * do_nmi_callback
12 *
13 * Check to see if a callback exists and execute it. Return 1
14 * if the handler exists and was handled successfully.
15 */
16int do_nmi_callback(struct pt_regs *regs, int cpu);
17 9
18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); 10extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void);
20#if !defined(CONFIG_LOCKUP_DETECTOR)
21extern int nmi_watchdog_enabled;
22#endif
23extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 11extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
24extern int reserve_perfctr_nmi(unsigned int); 12extern int reserve_perfctr_nmi(unsigned int);
25extern void release_perfctr_nmi(unsigned int); 13extern void release_perfctr_nmi(unsigned int);
26extern int reserve_evntsel_nmi(unsigned int); 14extern int reserve_evntsel_nmi(unsigned int);
27extern void release_evntsel_nmi(unsigned int); 15extern void release_evntsel_nmi(unsigned int);
28 16
29extern void setup_apic_nmi_watchdog(void *);
30extern void stop_apic_nmi_watchdog(void *);
31extern void disable_timer_nmi_watchdog(void);
32extern void enable_timer_nmi_watchdog(void);
33extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
34extern void cpu_nmi_set_wd_enabled(void);
35
36extern atomic_t nmi_active;
37extern unsigned int nmi_watchdog;
38#define NMI_NONE 0
39#define NMI_IO_APIC 1
40#define NMI_LOCAL_APIC 2
41#define NMI_INVALID 3
42
43struct ctl_table; 17struct ctl_table;
44extern int proc_nmi_enabled(struct ctl_table *, int , 18extern int proc_nmi_enabled(struct ctl_table *, int ,
45 void __user *, size_t *, loff_t *); 19 void __user *, size_t *, loff_t *);
@@ -47,33 +21,28 @@ extern int unknown_nmi_panic;
47 21
48void arch_trigger_all_cpu_backtrace(void); 22void arch_trigger_all_cpu_backtrace(void);
49#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace 23#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
24#endif
50 25
51static inline void localise_nmi_watchdog(void) 26/*
52{ 27 * Define some priorities for the nmi notifier call chain.
53 if (nmi_watchdog == NMI_IO_APIC) 28 *
54 nmi_watchdog = NMI_LOCAL_APIC; 29 * Create a local nmi bit that has a higher priority than
55} 30 * external nmis, because the local ones are more frequent.
31 *
32 * Also set up some default high/normal/low settings for
33 * subsystems to register with. Using 4 bits to separate
34 * the priorities. This can go a lot higher if need be.
35 */
56 36
57/* check if nmi_watchdog is active (ie was specified at boot) */ 37#define NMI_LOCAL_SHIFT 16 /* randomly picked */
58static inline int nmi_watchdog_active(void) 38#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
59{ 39#define NMI_HIGH_PRIOR (1ULL << 8)
60 /* 40#define NMI_NORMAL_PRIOR (1ULL << 4)
61 * actually it should be: 41#define NMI_LOW_PRIOR (1ULL << 0)
62 * return (nmi_watchdog == NMI_LOCAL_APIC || 42#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
63 * nmi_watchdog == NMI_IO_APIC) 43#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
64 * but since they are power of two we could use a 44#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
65 * cheaper way --cvg
66 */
67 return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
68}
69#endif
70 45
71void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz);
75void disable_lapic_nmi_watchdog(void);
76void enable_lapic_nmi_watchdog(void);
77void stop_nmi(void); 46void stop_nmi(void);
78void restart_nmi(void); 47void restart_nmi(void);
79 48
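
The priorities order entries on the NMI (die) notifier chain; a hypothetical registration that wants to run before external-NMI handlers might look like this:

	static int example_nmi_notify(struct notifier_block *nb,
				      unsigned long cmd, void *data)
	{
		return NOTIFY_DONE;
	}

	static struct notifier_block example_nmi_nb = {
		.notifier_call	= example_nmi_notify,
		.priority	= NMI_LOCAL_HIGH_PRIOR,	/* local, high priority */
	};

	static int __init example_nmi_init(void)
	{
		return register_die_notifier(&example_nmi_nb);
	}
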
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index a37229011b56..b0ef2b449a9d 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int numa_off;
5
4extern int pxm_to_nid(int pxm); 6extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu); 7extern void numa_remove_cpu(int cpu);
6 8
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 823e070e7c26..0493be39607c 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -38,8 +38,9 @@ extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu); 38extern void __cpuinit numa_remove_cpu(int cpu);
39 39
40#ifdef CONFIG_NUMA_EMU 40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) 41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) 42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43void numa_emu_cmdline(char *);
43#endif /* CONFIG_NUMA_EMU */ 44#endif /* CONFIG_NUMA_EMU */
44#else 45#else
45static inline void init_cpu_to_node(void) { } 46static inline void init_cpu_to_node(void) { }
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 42a978c0c1b3..f482010350fb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -107,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits);
107/* GPIO assignments */ 107/* GPIO assignments */
108 108
109#define OLPC_GPIO_MIC_AC 1 109#define OLPC_GPIO_MIC_AC 1
110#define OLPC_GPIO_DCON_IRQ geode_gpio(7) 110#define OLPC_GPIO_DCON_STAT0 5
111#define OLPC_GPIO_DCON_STAT1 6
112#define OLPC_GPIO_DCON_IRQ 7
111#define OLPC_GPIO_THRM_ALRM geode_gpio(10) 113#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
112#define OLPC_GPIO_SMB_CLK geode_gpio(14) 114#define OLPC_GPIO_DCON_LOAD 11
113#define OLPC_GPIO_SMB_DATA geode_gpio(15) 115#define OLPC_GPIO_DCON_BLANK 12
116#define OLPC_GPIO_SMB_CLK 14
117#define OLPC_GPIO_SMB_DATA 15
114#define OLPC_GPIO_WORKAUX geode_gpio(24) 118#define OLPC_GPIO_WORKAUX geode_gpio(24)
115#define OLPC_GPIO_LID geode_gpio(26) 119#define OLPC_GPIO_LID geode_gpio(26)
116#define OLPC_GPIO_ECSCI geode_gpio(27) 120#define OLPC_GPIO_ECSCI geode_gpio(27)
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 2a8478140bb3..641988efe063 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -8,6 +8,8 @@
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC_OPENFIRMWARE
10 10
11extern bool olpc_ofw_is_installed(void);
12
11/* run an OFW command by calling into the firmware */ 13/* run an OFW command by calling into the firmware */
12#define olpc_ofw(name, args, res) \ 14#define olpc_ofw(name, args, res) \
13 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) 15 __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
@@ -26,10 +28,17 @@ extern bool olpc_ofw_present(void);
26 28
27#else /* !CONFIG_OLPC_OPENFIRMWARE */ 29#else /* !CONFIG_OLPC_OPENFIRMWARE */
28 30
31static inline bool olpc_ofw_is_installed(void) { return false; }
29static inline void olpc_ofw_detect(void) { } 32static inline void olpc_ofw_detect(void) { }
30static inline void setup_olpc_ofw_pgd(void) { } 33static inline void setup_olpc_ofw_pgd(void) { }
31static inline bool olpc_ofw_present(void) { return false; } 34static inline bool olpc_ofw_present(void) { return false; }
32 35
33#endif /* !CONFIG_OLPC_OPENFIRMWARE */ 36#endif /* !CONFIG_OLPC_OPENFIRMWARE */
34 37
38#ifdef CONFIG_OLPC_OPENFIRMWARE_DT
39extern void olpc_dt_build_devicetree(void);
40#else
41static inline void olpc_dt_build_devicetree(void) { }
42#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */
43
35#endif /* _ASM_X86_OLPC_OFW_H */ 44#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 18e3b8a8709f..ebbc4d8ab170 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void)
112 112
113static inline void halt(void) 113static inline void halt(void)
114{ 114{
115 PVOP_VCALL0(pv_irq_ops.safe_halt); 115 PVOP_VCALL0(pv_irq_ops.halt);
116} 116}
117 117
118static inline void wbinvd(void) 118static inline void wbinvd(void)
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
435{ 435{
436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); 436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
437} 437}
438static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
439 pmd_t *pmdp)
440{
441 PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
442}
438 443
439static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep) 445 pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
442 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
443} 448}
444 449
450static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
451 pmd_t *pmdp)
452{
453 PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
454}
455
445static inline pte_t __pte(pteval_t val) 456static inline pte_t __pte(pteval_t val)
446{ 457{
447 pteval_t ret; 458 pteval_t ret;
@@ -543,6 +554,19 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
543 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); 554 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
544} 555}
545 556
557#ifdef CONFIG_TRANSPARENT_HUGEPAGE
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd)
560{
561 if (sizeof(pmdval_t) > sizeof(long))
562 /* 5 arg words */
563 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
564 else
565 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
566 native_pmd_val(pmd));
567}
568#endif
569
546static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 570static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
547{ 571{
548 pmdval_t val = native_pmd_val(pmd); 572 pmdval_t val = native_pmd_val(pmd);
@@ -824,27 +848,27 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
824#define __PV_IS_CALLEE_SAVE(func) \ 848#define __PV_IS_CALLEE_SAVE(func) \
825 ((struct paravirt_callee_save) { func }) 849 ((struct paravirt_callee_save) { func })
826 850
827static inline unsigned long arch_local_save_flags(void) 851static inline notrace unsigned long arch_local_save_flags(void)
828{ 852{
829 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); 853 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
830} 854}
831 855
832static inline void arch_local_irq_restore(unsigned long f) 856static inline notrace void arch_local_irq_restore(unsigned long f)
833{ 857{
834 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); 858 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
835} 859}
836 860
837static inline void arch_local_irq_disable(void) 861static inline notrace void arch_local_irq_disable(void)
838{ 862{
839 PVOP_VCALLEE0(pv_irq_ops.irq_disable); 863 PVOP_VCALLEE0(pv_irq_ops.irq_disable);
840} 864}
841 865
842static inline void arch_local_irq_enable(void) 866static inline notrace void arch_local_irq_enable(void)
843{ 867{
844 PVOP_VCALLEE0(pv_irq_ops.irq_enable); 868 PVOP_VCALLEE0(pv_irq_ops.irq_enable);
845} 869}
846 870
847static inline unsigned long arch_local_irq_save(void) 871static inline notrace unsigned long arch_local_irq_save(void)
848{ 872{
849 unsigned long f; 873 unsigned long f;
850 874
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, 265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
266 pte_t *ptep, pte_t pteval); 266 pte_t *ptep, pte_t pteval);
267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
268 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
269 pmd_t *pmdp, pmd_t pmdval);
268 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 270 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep); 271 pte_t *ptep);
270 void (*pte_update_defer)(struct mm_struct *mm, 272 void (*pte_update_defer)(struct mm_struct *mm,
271 unsigned long addr, pte_t *ptep); 273 unsigned long addr, pte_t *ptep);
274 void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
275 pmd_t *pmdp);
276 void (*pmd_update_defer)(struct mm_struct *mm,
277 unsigned long addr, pmd_t *pmdp);
272 278
273 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 279 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
274 pte_t *ptep); 280 pte_t *ptep);
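
A hypothetical paravirt backend can now trap pmd updates the same way pte updates are trapped; the signature below matches the new fields, while the hook body and wiring are assumptions:

	static void example_pmd_update(struct mm_struct *mm, unsigned long addr,
				       pmd_t *pmdp)
	{
		/* e.g. queue a hypercall describing the changed huge-page entry */
		pr_debug("pmd update for mm %p at %lx\n", mm, addr);
	}

	/* wiring (sketch):
	 *	pv_mmu_ops.pmd_update       = example_pmd_update;
	 *	pv_mmu_ops.pmd_update_defer = example_pmd_update;
	 */
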
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ca0437c714b2..676129229630 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;
65 65
66#define PCIBIOS_MIN_CARDBUS_IO 0x4000 66#define PCIBIOS_MIN_CARDBUS_IO 0x4000
67 67
68extern int pcibios_enabled;
68void pcibios_config_init(void); 69void pcibios_config_init(void);
69struct pci_bus *pcibios_scan_root(int bus); 70struct pci_bus *pcibios_scan_root(int bus);
70 71
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index f899e01a8ac9..7e172955ee57 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,6 +230,125 @@ do { \
230}) 230})
231 231
232/* 232/*
233 * Add return operation
234 */
235#define percpu_add_return_op(var, val) \
236({ \
237 typeof(var) paro_ret__ = val; \
238 switch (sizeof(var)) { \
239 case 1: \
240 asm("xaddb %0, "__percpu_arg(1) \
241 : "+q" (paro_ret__), "+m" (var) \
242 : : "memory"); \
243 break; \
244 case 2: \
245 asm("xaddw %0, "__percpu_arg(1) \
246 : "+r" (paro_ret__), "+m" (var) \
247 : : "memory"); \
248 break; \
249 case 4: \
250 asm("xaddl %0, "__percpu_arg(1) \
251 : "+r" (paro_ret__), "+m" (var) \
252 : : "memory"); \
253 break; \
254 case 8: \
255 asm("xaddq %0, "__percpu_arg(1) \
256 : "+re" (paro_ret__), "+m" (var) \
257 : : "memory"); \
258 break; \
259 default: __bad_percpu_size(); \
260 } \
261 paro_ret__ += val; \
262 paro_ret__; \
263})
264
265/*
266 * xchg is implemented here with cmpxchg, which carries no lock prefix.
267 * A real xchg is expensive due to its implied lock prefix, and the
268 * processor cannot prefetch cachelines if xchg is used.
269 */
270#define percpu_xchg_op(var, nval) \
271({ \
272 typeof(var) pxo_ret__; \
273 typeof(var) pxo_new__ = (nval); \
274 switch (sizeof(var)) { \
275 case 1: \
276 asm("\n\tmov "__percpu_arg(1)",%%al" \
277 "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
278 "\n\tjnz 1b" \
279 : "=&a" (pxo_ret__), "+m" (var) \
280 : "q" (pxo_new__) \
281 : "memory"); \
282 break; \
283 case 2: \
284 asm("\n\tmov "__percpu_arg(1)",%%ax" \
285 "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
286 "\n\tjnz 1b" \
287 : "=&a" (pxo_ret__), "+m" (var) \
288 : "r" (pxo_new__) \
289 : "memory"); \
290 break; \
291 case 4: \
292 asm("\n\tmov "__percpu_arg(1)",%%eax" \
293 "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
294 "\n\tjnz 1b" \
295 : "=&a" (pxo_ret__), "+m" (var) \
296 : "r" (pxo_new__) \
297 : "memory"); \
298 break; \
299 case 8: \
300 asm("\n\tmov "__percpu_arg(1)",%%rax" \
301 "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
302 "\n\tjnz 1b" \
303 : "=&a" (pxo_ret__), "+m" (var) \
304 : "r" (pxo_new__) \
305 : "memory"); \
306 break; \
307 default: __bad_percpu_size(); \
308 } \
309 pxo_ret__; \
310})
311
312/*
313 * cmpxchg has no such implied lock semantics as a result it is much
314 * more efficient for cpu local operations.
315 */
316#define percpu_cmpxchg_op(var, oval, nval) \
317({ \
318 typeof(var) pco_ret__; \
319 typeof(var) pco_old__ = (oval); \
320 typeof(var) pco_new__ = (nval); \
321 switch (sizeof(var)) { \
322 case 1: \
323 asm("cmpxchgb %2, "__percpu_arg(1) \
324 : "=a" (pco_ret__), "+m" (var) \
325 : "q" (pco_new__), "0" (pco_old__) \
326 : "memory"); \
327 break; \
328 case 2: \
329 asm("cmpxchgw %2, "__percpu_arg(1) \
330 : "=a" (pco_ret__), "+m" (var) \
331 : "r" (pco_new__), "0" (pco_old__) \
332 : "memory"); \
333 break; \
334 case 4: \
335 asm("cmpxchgl %2, "__percpu_arg(1) \
336 : "=a" (pco_ret__), "+m" (var) \
337 : "r" (pco_new__), "0" (pco_old__) \
338 : "memory"); \
339 break; \
340 case 8: \
341 asm("cmpxchgq %2, "__percpu_arg(1) \
342 : "=a" (pco_ret__), "+m" (var) \
343 : "r" (pco_new__), "0" (pco_old__) \
344 : "memory"); \
345 break; \
346 default: __bad_percpu_size(); \
347 } \
348 pco_ret__; \
349})
350
351/*
233 * percpu_read() makes gcc load the percpu variable every time it is 352 * percpu_read() makes gcc load the percpu variable every time it is
234 * accessed while percpu_read_stable() allows the value to be cached. 353 * accessed while percpu_read_stable() allows the value to be cached.
235 * percpu_read_stable() is more efficient and can be used if its value 354 * percpu_read_stable() is more efficient and can be used if its value
@@ -267,6 +386,12 @@ do { \
267#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 386#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
268#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 387#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
269#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 388#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
389/*
390 * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
391 * faster than an xchg with forced lock semantics.
392 */
393#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
394#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
270 395
271#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 396#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
272#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 397#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -286,6 +411,9 @@ do { \
286#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 411#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
287#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 412#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
288#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 413#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
414#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
415#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
416#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
289 417
290#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) 418#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
291#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) 419#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -299,6 +427,29 @@ do { \
299#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 427#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
300#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 428#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
301#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 429#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
430#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
431#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
432#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
433
434#ifndef CONFIG_M386
435#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
436#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
437#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
438#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
439#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
440#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
441
442#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
443#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
444#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
445#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
446#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
447#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
448
449#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
450#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
451#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
452#endif /* !CONFIG_M386 */
302 453
303/* 454/*
304 * Per cpu atomic 64 bit operations are only available under 64 bit. 455 * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -311,6 +462,7 @@ do { \
311#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 462#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
312#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 463#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
313#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 464#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
465#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
314 466
315#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 467#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
316#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 468#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
@@ -318,12 +470,16 @@ do { \
318#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 470#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
319#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 471#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
320#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 472#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
473#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
474#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
475#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
321 476
322#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 477#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
323#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 478#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
324#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 479#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
325#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 480#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
326 481#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
482#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
327#endif 483#endif
328 484
329/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 485/* This is not atomic against other CPUs -- CPU preemption needs to be off */
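
A sketch of how the new add-return/xchg forms are used through the generic this_cpu_* wrappers (the counter is hypothetical):

	static DEFINE_PER_CPU(unsigned long, example_events);

	static unsigned long example_count_event(void)
	{
		/* single xadd on this cpu's copy, no lock prefix needed */
		return this_cpu_add_return(example_events, 1);
	}

	static unsigned long example_drain_events(void)
	{
		/* take the current count and reset it to zero in one step */
		return this_cpu_xchg(example_events, 0);
	}
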
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 550e26b1dbb3..d9d4dae305f6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -125,7 +125,6 @@ union cpuid10_edx {
125#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ 125#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
126 126
127#ifdef CONFIG_PERF_EVENTS 127#ifdef CONFIG_PERF_EVENTS
128extern void init_hw_perf_events(void);
129extern void perf_events_lapic_init(void); 128extern void perf_events_lapic_init(void);
130 129
131#define PERF_EVENT_INDEX_OFFSET 0 130#define PERF_EVENT_INDEX_OFFSET 0
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
156} 155}
157 156
158#else 157#else
159static inline void init_hw_perf_events(void) { }
160static inline void perf_events_lapic_init(void) { } 158static inline void perf_events_lapic_init(void) { }
161#endif 159#endif
162 160
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index a70cd216be5d..e2f6a99f14ab 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -20,6 +20,9 @@
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) 20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18) 21#define ARCH_P4_MAX_CCCR (18)
22 22
23#define ARCH_P4_CNTRVAL_BITS (40)
24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
25
23#define P4_ESCR_EVENT_MASK 0x7e000000U 26#define P4_ESCR_EVENT_MASK 0x7e000000U
24#define P4_ESCR_EVENT_SHIFT 25 27#define P4_ESCR_EVENT_SHIFT 25
25#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U 28#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
@@ -744,14 +747,6 @@ enum P4_ESCR_EMASKS {
744}; 747};
745 748
746/* 749/*
747 * P4 PEBS specifics (Replay Event only)
748 *
749 * Format (bits):
750 * 0-6: metric from P4_PEBS_METRIC enum
751 * 7 : reserved
752 * 8 : reserved
753 * 9-11 : reserved
754 *
755 * Note we have UOP and PEBS bits reserved for now 750 * Note we have UOP and PEBS bits reserved for now
756 * just in case if we will need them once 751 * just in case if we will need them once
757 */ 752 */
@@ -788,5 +783,60 @@ enum P4_PEBS_METRIC {
788 P4_PEBS_METRIC__max 783 P4_PEBS_METRIC__max
789}; 784};
790 785
786/*
787 * Notes on internal configuration of ESCR+CCCR tuples
788 *
789 * Since P4 has quite a different performance-register
790 * architecture compared with the "architectural" ones,
791 * and we only have 64 bits to keep the configuration
792 * of a performance event, the following trick is used.
793 *
794 * 1) Since both the ESCR and CCCR registers have only their low
795 * 32 bits valid, we pack them into a single 64 bit
796 * configuration. Low 32 bits of such config correspond
797 * to low 32 bits of CCCR register and high 32 bits
798 * correspond to low 32 bits of ESCR register.
799 *
800 * 2) The meaning of every bit of such config field can
801 * be found in Intel SDM but it should be noted that
802 * we "borrow" some reserved bits for our own usage and
803 * clear them or set them to a proper value when we do
804 * a real write to the hardware registers.
805 *
806 * 3) The format of bits of config is the following
807 * and should be either 0 or set to some predefined
808 * values:
809 *
810 * Low 32 bits
811 * -----------
812 * 0-6: P4_PEBS_METRIC enum
813 * 7-11: reserved
814 * 12: reserved (Enable)
815 * 13-15: reserved (ESCR select)
816 * 16-17: Active Thread
817 * 18: Compare
818 * 19: Complement
819 * 20-23: Threshold
820 * 24: Edge
821 * 25: reserved (FORCE_OVF)
822 * 26: reserved (OVF_PMI_T0)
823 * 27: reserved (OVF_PMI_T1)
824 * 28-29: reserved
825 * 30: reserved (Cascade)
826 * 31: reserved (OVF)
827 *
828 * High 32 bits
829 * ------------
830 * 0: reserved (T1_USR)
831 * 1: reserved (T1_OS)
832 * 2: reserved (T0_USR)
833 * 3: reserved (T0_OS)
834 * 4: Tag Enable
835 * 5-8: Tag Value
836 * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
837 * 25-30: enum P4_EVENTS
838 * 31: reserved (HT thread)
839 */
840
791#endif /* PERF_EVENT_P4_H */ 841#endif /* PERF_EVENT_P4_H */
792 842
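
Per the layout above, the 64-bit config is simply the two 32-bit register images glued together; a sketch of pack/unpack helpers consistent with that description (names are illustrative, the in-tree helpers may differ):

	static inline u64 example_p4_config_pack(u32 escr, u32 cccr)
	{
		/* low 32 bits mirror CCCR, high 32 bits mirror ESCR */
		return ((u64)escr << 32) | cccr;
	}

	static inline u32 example_p4_config_unpack_escr(u64 config)
	{
		return config >> 32;
	}

	static inline u32 example_p4_config_unpack_cccr(u64 config)
	{
		return (u32)config;
	}
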
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 271de94c3810..b4389a468fb6 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
93 93
94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, 94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
95 unsigned long adddress) 95 unsigned long address)
96{ 96{
97 ___pmd_free_tlb(tlb, pmd); 97 ___pmd_free_tlb(tlb, pmd);
98} 98}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
47#endif 47#endif
48 48
49#ifdef CONFIG_SMP
50static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
51{
52 return __pmd(xchg((pmdval_t *)xp, 0));
53}
54#else
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif
57
49/* 58/*
50 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
51 * split up the 29 bits of offset into this range: 60 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
105#endif 105#endif
106 106
107#ifdef CONFIG_SMP
108union split_pmd {
109 struct {
110 u32 pmd_low;
111 u32 pmd_high;
112 };
113 pmd_t pmd;
114};
115static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
116{
117 union split_pmd res, *orig = (union split_pmd *)pmdp;
118
119 /* xchg acts as a barrier before setting of the high bits */
120 res.pmd_low = xchg(&orig->pmd_low, 0);
121 res.pmd_high = orig->pmd_high;
122 orig->pmd_high = 0;
123
124 return res.pmd;
125}
126#else
127#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
128#endif
129
107/* 130/*
108 * Bits 0, 6 and 7 are taken in the low part of the pte, 131 * Bits 0, 6 and 7 are taken in the low part of the pte,
109 * put the 32 bits of offset into the high part. 132 * put the 32 bits of offset into the high part.
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
35#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
36#define set_pte(ptep, pte) native_set_pte(ptep, pte) 36#define set_pte(ptep, pte) native_set_pte(ptep, pte)
37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
38#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
38 39
39#define set_pte_atomic(ptep, pte) \ 40#define set_pte_atomic(ptep, pte) \
40 native_set_pte_atomic(ptep, pte) 41 native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
59 60
60#define pte_update(mm, addr, ptep) do { } while (0) 61#define pte_update(mm, addr, ptep) do { } while (0)
61#define pte_update_defer(mm, addr, ptep) do { } while (0) 62#define pte_update_defer(mm, addr, ptep) do { } while (0)
63#define pmd_update(mm, addr, ptep) do { } while (0)
64#define pmd_update_defer(mm, addr, ptep) do { } while (0)
62 65
63#define pgd_val(x) native_pgd_val(x) 66#define pgd_val(x) native_pgd_val(x)
64#define __pgd(x) native_make_pgd(x) 67#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
94 return pte_flags(pte) & _PAGE_ACCESSED; 97 return pte_flags(pte) & _PAGE_ACCESSED;
95} 98}
96 99
100static inline int pmd_young(pmd_t pmd)
101{
102 return pmd_flags(pmd) & _PAGE_ACCESSED;
103}
104
97static inline int pte_write(pte_t pte) 105static inline int pte_write(pte_t pte)
98{ 106{
99 return pte_flags(pte) & _PAGE_RW; 107 return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
142 (_PAGE_PSE | _PAGE_PRESENT); 150 (_PAGE_PSE | _PAGE_PRESENT);
143} 151}
144 152
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE
154static inline int pmd_trans_splitting(pmd_t pmd)
155{
156 return pmd_val(pmd) & _PAGE_SPLITTING;
157}
158
159static inline int pmd_trans_huge(pmd_t pmd)
160{
161 return pmd_val(pmd) & _PAGE_PSE;
162}
163
164static inline int has_transparent_hugepage(void)
165{
166 return cpu_has_pse;
167}
168#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
169
145static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 170static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
146{ 171{
147 pteval_t v = native_pte_val(pte); 172 pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
216 return pte_set_flags(pte, _PAGE_SPECIAL); 241 return pte_set_flags(pte, _PAGE_SPECIAL);
217} 242}
218 243
244static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
245{
246 pmdval_t v = native_pmd_val(pmd);
247
248 return __pmd(v | set);
249}
250
251static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
252{
253 pmdval_t v = native_pmd_val(pmd);
254
255 return __pmd(v & ~clear);
256}
257
258static inline pmd_t pmd_mkold(pmd_t pmd)
259{
260 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
261}
262
263static inline pmd_t pmd_wrprotect(pmd_t pmd)
264{
265 return pmd_clear_flags(pmd, _PAGE_RW);
266}
267
268static inline pmd_t pmd_mkdirty(pmd_t pmd)
269{
270 return pmd_set_flags(pmd, _PAGE_DIRTY);
271}
272
273static inline pmd_t pmd_mkhuge(pmd_t pmd)
274{
275 return pmd_set_flags(pmd, _PAGE_PSE);
276}
277
278static inline pmd_t pmd_mkyoung(pmd_t pmd)
279{
280 return pmd_set_flags(pmd, _PAGE_ACCESSED);
281}
282
283static inline pmd_t pmd_mkwrite(pmd_t pmd)
284{
285 return pmd_set_flags(pmd, _PAGE_RW);
286}
287
288static inline pmd_t pmd_mknotpresent(pmd_t pmd)
289{
290 return pmd_clear_flags(pmd, _PAGE_PRESENT);
291}
292
219/* 293/*
220 * Mask out unsupported bits in a present pgprot. Non-present pgprots 294 * Mask out unsupported bits in a present pgprot. Non-present pgprots
221 * can use those bits for other purposes, so leave them be. 295 * can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
256 return __pte(val); 330 return __pte(val);
257} 331}
258 332
333static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
334{
335 pmdval_t val = pmd_val(pmd);
336
337 val &= _HPAGE_CHG_MASK;
338 val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
339
340 return __pmd(val);
341}
342
259/* mprotect needs to preserve PAT bits when updating vm_page_prot */ 343/* mprotect needs to preserve PAT bits when updating vm_page_prot */
260#define pgprot_modify pgprot_modify 344#define pgprot_modify pgprot_modify
261static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) 345static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
350 * Currently stuck as a macro due to indirect forward reference to 434 * Currently stuck as a macro due to indirect forward reference to
351 * linux/mmzone.h's __section_mem_map_addr() definition: 435 * linux/mmzone.h's __section_mem_map_addr() definition:
352 */ 436 */
353#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 437#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
354 438
355/* 439/*
356 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 440 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
524 return res; 608 return res;
525} 609}
526 610
611static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
612{
613 pmd_t res = *pmdp;
614
615 native_pmd_clear(pmdp);
616 return res;
617}
618
527static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 619static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep , pte_t pte) 620 pte_t *ptep , pte_t pte)
529{ 621{
530 native_set_pte(ptep, pte); 622 native_set_pte(ptep, pte);
531} 623}
532 624
625static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
626 pmd_t *pmdp , pmd_t pmd)
627{
628 native_set_pmd(pmdp, pmd);
629}
630
533#ifndef CONFIG_PARAVIRT 631#ifndef CONFIG_PARAVIRT
534/* 632/*
535 * Rules for using pte_update - it must be called after any PTE update which 633 * Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
607 705
608#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address)
609 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709
710#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
711extern int pmdp_set_access_flags(struct vm_area_struct *vma,
712 unsigned long address, pmd_t *pmdp,
713 pmd_t entry, int dirty);
714
715#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
716extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
717 unsigned long addr, pmd_t *pmdp);
718
719#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
720extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
721 unsigned long address, pmd_t *pmdp);
722
723
724#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
725extern void pmdp_splitting_flush(struct vm_area_struct *vma,
726 unsigned long addr, pmd_t *pmdp);
727
728#define __HAVE_ARCH_PMD_WRITE
729static inline int pmd_write(pmd_t pmd)
730{
731 return pmd_flags(pmd) & _PAGE_RW;
732}
733
734#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
735static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
736 pmd_t *pmdp)
737{
738 pmd_t pmd = native_pmdp_get_and_clear(pmdp);
739 pmd_update(mm, addr, pmdp);
740 return pmd;
741}
742
743#define __HAVE_ARCH_PMDP_SET_WRPROTECT
744static inline void pmdp_set_wrprotect(struct mm_struct *mm,
745 unsigned long addr, pmd_t *pmdp)
746{
747 clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
748 pmd_update(mm, addr, pmdp);
749}
750
610/* 751/*
611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 752 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
612 * 753 *
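
The pmd helpers mirror the existing pte ones, so huge-pmd code reads the same way; an illustrative (hypothetical) step that write-protects a huge pmd while keeping its old value:

	static pmd_t example_wrprotect_huge_pmd(struct mm_struct *mm,
						unsigned long addr, pmd_t *pmdp)
	{
		pmd_t old = pmdp_get_and_clear(mm, addr, pmdp);

		/* reinstall the entry read-only; paravirt backends were already
		 * notified through pmd_update() inside pmdp_get_and_clear() */
		set_pmd_at(mm, addr, pmdp, pmd_wrprotect(old));
		return old;
	}
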
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
59 native_set_pte(ptep, pte); 59 native_set_pte(ptep, pte);
60} 60}
61 61
62static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
63{
64 *pmdp = pmd;
65}
66
67static inline void native_pmd_clear(pmd_t *pmd)
68{
69 native_set_pmd(pmd, native_make_pmd(0));
70}
71
62static inline pte_t native_ptep_get_and_clear(pte_t *xp) 72static inline pte_t native_ptep_get_and_clear(pte_t *xp)
63{ 73{
64#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
72#endif 82#endif
73} 83}
74 84
75static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 85static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
76{ 86{
77 *pmdp = pmd; 87#ifdef CONFIG_SMP
78} 88 return native_make_pmd(xchg(&xp->pmd, 0));
79 89#else
80static inline void native_pmd_clear(pmd_t *pmd) 90 /* native_local_pmdp_get_and_clear,
81{ 91 but duplicated because of cyclic dependency */
82 native_set_pmd(pmd, native_make_pmd(0)); 92 pmd_t ret = *xp;
93 native_pmd_clear(xp);
94 return ret;
95#endif
83} 96}
84 97
85static inline void native_set_pud(pud_t *pudp, pud_t pud) 98static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) 181#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 182
170#define __HAVE_ARCH_PTE_SAME 183#define __HAVE_ARCH_PTE_SAME
184
171#endif /* !__ASSEMBLY__ */ 185#endif /* !__ASSEMBLY__ */
172 186
173#endif /* _ASM_X86_PGTABLE_64_H */ 187#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26 27
27/* If _PAGE_BIT_PRESENT is clear, we use these: */ 28/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
48#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
49 51
50#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
70/* Set of bits not changed in pte_modify */ 72/* Set of bits not changed in pte_modify */
71#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 73#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
72 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 74 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
75#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
73 76
74#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 77#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
75#define _PAGE_CACHE_WB (0) 78#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cae9c3cb95cf..45636cefa186 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -141,10 +141,9 @@ extern __u32 cpu_caps_set[NCAPINTS];
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 142DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
143#define cpu_data(cpu) per_cpu(cpu_info, cpu) 143#define cpu_data(cpu) per_cpu(cpu_info, cpu)
144#define current_cpu_data __get_cpu_var(cpu_info)
145#else 144#else
145#define cpu_info boot_cpu_data
146#define cpu_data(cpu) boot_cpu_data 146#define cpu_data(cpu) boot_cpu_data
147#define current_cpu_data boot_cpu_data
148#endif 147#endif
149 148
150extern const struct seq_operations cpuinfo_op; 149extern const struct seq_operations cpuinfo_op;
@@ -762,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
762extern void init_c1e_mask(void); 761extern void init_c1e_mask(void);
763 762
764extern unsigned long boot_option_idle_override; 763extern unsigned long boot_option_idle_override;
765extern unsigned long idle_halt;
766extern unsigned long idle_nomwait;
767extern bool c1e_detected; 764extern bool c1e_detected;
768 765
766enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
767 IDLE_POLL, IDLE_FORCE_MWAIT};
768
769extern void enable_sep_cpu(void); 769extern void enable_sep_cpu(void);
770extern int sysenter_setup(void); 770extern int sysenter_setup(void);
771 771
@@ -902,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
902/* 902/*
903 * The below -8 is to reserve 8 bytes on top of the ring0 stack. 903 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
904 * This is necessary to guarantee that the entire "struct pt_regs" 904 * This is necessary to guarantee that the entire "struct pt_regs"
905 * is accessable even if the CPU haven't stored the SS/ESP registers 905 * is accessible even if the CPU haven't stored the SS/ESP registers
906 * on the stack (interrupt gate does not save these registers 906 * on the stack (interrupt gate does not save these registers
907 * when switching to the same priv ring). 907 * when switching to the same priv ring).
908 * Therefore beware: accessing the ss/esp fields of the 908 * Therefore beware: accessing the ss/esp fields of the
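
The two ad-hoc idle flags give way to a single enum, so consumers switch on one variable; roughly (helper name hypothetical):

	static int example_mwait_allowed(void)
	{
		switch (boot_option_idle_override) {
		case IDLE_HALT:
		case IDLE_NOMWAIT:
			return 0;	/* the user asked us to avoid mwait idling */
		default:
			return 1;
		}
	}
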
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000000..b4ec95f07518
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1 @@
/* dummy prom.h; here to make linux/of.h's #includes happy */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7f7e577a0e39..31d84acc1512 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,6 +11,7 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
11void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 11void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
12 struct pvclock_vcpu_time_info *vcpu, 12 struct pvclock_vcpu_time_info *vcpu,
13 struct timespec *ts); 13 struct timespec *ts);
14void pvclock_resume(void);
14 15
15/* 16/*
16 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, 17 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 1def60114906..6c22bf353f26 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
48 setup_IO_APIC(); 48 setup_IO_APIC();
49 else { 49 else {
50 nr_ioapics = 0; 50 nr_ioapics = 0;
51 localise_nmi_watchdog();
52 } 51 }
53#endif 52#endif
54} 53}
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2b16a2ad23dc..52b5c7ed3608 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -7,6 +7,7 @@
7#define _ASM_X86_STACKTRACE_H 7#define _ASM_X86_STACKTRACE_H
8 8
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/ptrace.h>
10 11
11extern int kstack_depth_to_print; 12extern int kstack_depth_to_print;
12 13
@@ -46,7 +47,7 @@ struct stacktrace_ops {
46}; 47};
47 48
48void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 49void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
49 unsigned long *stack, unsigned long bp, 50 unsigned long *stack,
50 const struct stacktrace_ops *ops, void *data); 51 const struct stacktrace_ops *ops, void *data);
51 52
52#ifdef CONFIG_X86_32 53#ifdef CONFIG_X86_32
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
57#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 58#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
58#endif 59#endif
59 60
61#ifdef CONFIG_FRAME_POINTER
62static inline unsigned long
63stack_frame(struct task_struct *task, struct pt_regs *regs)
64{
65 unsigned long bp;
66
67 if (regs)
68 return regs->bp;
69
70 if (task == current) {
71 /* Grab bp right from our regs */
72 get_bp(bp);
73 return bp;
74 }
75
76 /* bp is the last reg pushed by switch_to */
77 return *(unsigned long *)task->thread.sp;
78}
79#else
80static inline unsigned long
81stack_frame(struct task_struct *task, struct pt_regs *regs)
82{
83 return 0;
84}
85#endif
86
60extern void 87extern void
61show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 88show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
62 unsigned long *stack, unsigned long bp, char *log_lvl); 89 unsigned long *stack, char *log_lvl);
63 90
64extern void 91extern void
65show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 92show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
66 unsigned long *sp, unsigned long bp, char *log_lvl); 93 unsigned long *sp, char *log_lvl);
67 94
68extern unsigned int code_bytes; 95extern unsigned int code_bytes;
69 96
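
With the bp argument dropped, callers that still want a starting frame pointer fetch it through stack_frame(); a hypothetical call site:

	static void example_backtrace(struct task_struct *task, struct pt_regs *regs,
				      const struct stacktrace_ops *ops, void *data)
	{
		/* frame pointer is no longer threaded through every call */
		pr_debug("starting bp=%lx\n", stack_frame(task, regs));

		dump_trace(task, regs, NULL, ops, data);
	}
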
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
47 INTERCEPT_MONITOR, 47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT, 48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND, 49 INTERCEPT_MWAIT_COND,
50 INTERCEPT_XSETBV,
50}; 51};
51 52
52 53
53struct __attribute__ ((__packed__)) vmcb_control_area { 54struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read; 55 u32 intercept_cr;
55 u16 intercept_cr_write; 56 u32 intercept_dr;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 57 u32 intercept_exceptions;
59 u64 intercept; 58 u64 intercept;
60 u8 reserved_1[42]; 59 u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 80 u32 event_inj_err;
82 u64 nested_cr3; 81 u64 nested_cr3;
83 u64 lbr_ctl; 82 u64 lbr_ctl;
84 u64 reserved_5; 83 u32 clean;
84 u32 reserved_5;
85 u64 next_rip; 85 u64 next_rip;
86 u8 reserved_6[816]; 86 u8 insn_len;
87 u8 insn_bytes[15];
88 u8 reserved_6[800];
87}; 89};
88 90
89 91
90#define TLB_CONTROL_DO_NOTHING 0 92#define TLB_CONTROL_DO_NOTHING 0
91#define TLB_CONTROL_FLUSH_ALL_ASID 1 93#define TLB_CONTROL_FLUSH_ALL_ASID 1
94#define TLB_CONTROL_FLUSH_ASID 3
95#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
92 96
93#define V_TPR_MASK 0x0f 97#define V_TPR_MASK 0x0f
94 98
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
204#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK 208#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
205#define SVM_SELECTOR_CODE_MASK (1 << 3) 209#define SVM_SELECTOR_CODE_MASK (1 << 3)
206 210
207#define INTERCEPT_CR0_MASK 1 211#define INTERCEPT_CR0_READ 0
208#define INTERCEPT_CR3_MASK (1 << 3) 212#define INTERCEPT_CR3_READ 3
209#define INTERCEPT_CR4_MASK (1 << 4) 213#define INTERCEPT_CR4_READ 4
210#define INTERCEPT_CR8_MASK (1 << 8) 214#define INTERCEPT_CR8_READ 8
211 215#define INTERCEPT_CR0_WRITE (16 + 0)
212#define INTERCEPT_DR0_MASK 1 216#define INTERCEPT_CR3_WRITE (16 + 3)
213#define INTERCEPT_DR1_MASK (1 << 1) 217#define INTERCEPT_CR4_WRITE (16 + 4)
214#define INTERCEPT_DR2_MASK (1 << 2) 218#define INTERCEPT_CR8_WRITE (16 + 8)
215#define INTERCEPT_DR3_MASK (1 << 3) 219
216#define INTERCEPT_DR4_MASK (1 << 4) 220#define INTERCEPT_DR0_READ 0
217#define INTERCEPT_DR5_MASK (1 << 5) 221#define INTERCEPT_DR1_READ 1
218#define INTERCEPT_DR6_MASK (1 << 6) 222#define INTERCEPT_DR2_READ 2
219#define INTERCEPT_DR7_MASK (1 << 7) 223#define INTERCEPT_DR3_READ 3
224#define INTERCEPT_DR4_READ 4
225#define INTERCEPT_DR5_READ 5
226#define INTERCEPT_DR6_READ 6
227#define INTERCEPT_DR7_READ 7
228#define INTERCEPT_DR0_WRITE (16 + 0)
229#define INTERCEPT_DR1_WRITE (16 + 1)
230#define INTERCEPT_DR2_WRITE (16 + 2)
231#define INTERCEPT_DR3_WRITE (16 + 3)
232#define INTERCEPT_DR4_WRITE (16 + 4)
233#define INTERCEPT_DR5_WRITE (16 + 5)
234#define INTERCEPT_DR6_WRITE (16 + 6)
235#define INTERCEPT_DR7_WRITE (16 + 7)
220 236
221#define SVM_EVTINJ_VEC_MASK 0xff 237#define SVM_EVTINJ_VEC_MASK 0xff
222 238
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 262#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 263#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
248 264
265#define SVM_EXITINFO_REG_MASK 0x0F
266
249#define SVM_EXIT_READ_CR0 0x000 267#define SVM_EXIT_READ_CR0 0x000
250#define SVM_EXIT_READ_CR3 0x003 268#define SVM_EXIT_READ_CR3 0x003
251#define SVM_EXIT_READ_CR4 0x004 269#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
316#define SVM_EXIT_MONITOR 0x08a 334#define SVM_EXIT_MONITOR 0x08a
317#define SVM_EXIT_MWAIT 0x08b 335#define SVM_EXIT_MWAIT 0x08b
318#define SVM_EXIT_MWAIT_COND 0x08c 336#define SVM_EXIT_MWAIT_COND 0x08c
337#define SVM_EXIT_XSETBV 0x08d
319#define SVM_EXIT_NPF 0x400 338#define SVM_EXIT_NPF 0x400
320 339
321#define SVM_EXIT_ERR -1 340#define SVM_EXIT_ERR -1
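Under the old layout the INTERCEPT_*_MASK constants were ready-made bitmasks for the separate 16-bit read/write fields; the new constants are bit numbers into the merged 32-bit intercept_cr/intercept_dr words (reads in bits 0-15, writes in bits 16-31). A hedged sketch of how a consumer would set an intercept against the new definitions (the helper name is illustrative, not from the patch):

/* Illustrative sketch: the new constants are bit positions, so they must be
 * shifted into a mask before being OR'd into the merged 32-bit field. */
static void example_intercept_cr3_writes(struct vmcb_control_area *control)
{
	control->intercept_cr |= (1U << INTERCEPT_CR3_WRITE);	/* bit 16 + 3 = 19 */
	control->intercept_cr &= ~(1U << INTERCEPT_CR3_READ);	/* bit 3           */
}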
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
deleted file mode 100644
index 1159e091ad09..000000000000
--- a/arch/x86/include/asm/system_64.h
+++ /dev/null
@@ -1,22 +0,0 @@
1#ifndef _ASM_X86_SYSTEM_64_H
2#define _ASM_X86_SYSTEM_64_H
3
4#include <asm/segment.h>
5#include <asm/cmpxchg.h>
6
7
8static inline unsigned long read_cr8(void)
9{
10 unsigned long cr8;
11 asm volatile("movq %%cr8,%0" : "=r" (cr8));
12 return cr8;
13}
14
15static inline void write_cr8(unsigned long val)
16{
17 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
18}
19
20#include <linux/irqflags.h>
21
22#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 5469630b27f5..fa7b9176b76c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -10,12 +10,6 @@
10unsigned long long native_sched_clock(void); 10unsigned long long native_sched_clock(void);
11extern int recalibrate_cpu_khz(void); 11extern int recalibrate_cpu_khz(void);
12 12
13#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
14extern int timer_ack;
15#else
16# define timer_ack (0)
17#endif
18
19extern int no_timer_check; 13extern int no_timer_check;
20 14
21/* Accelerators for sched_clock() 15/* Accelerators for sched_clock()
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
30asmlinkage void stack_segment(void); 30asmlinkage void stack_segment(void);
31asmlinkage void general_protection(void); 31asmlinkage void general_protection(void);
32asmlinkage void page_fault(void); 32asmlinkage void page_fault(void);
33asmlinkage void async_page_fault(void);
33asmlinkage void spurious_interrupt_bug(void); 34asmlinkage void spurious_interrupt_bug(void);
34asmlinkage void coprocessor_error(void); 35asmlinkage void coprocessor_error(void);
35asmlinkage void alignment_check(void); 36asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 42d412fd8b02..ce1d54c8a433 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -26,20 +26,22 @@
26 * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, 26 * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. 27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
28 * 28 *
29 * We will use 31 sets, one for sending BAU messages from each of the 32 29 * We will use one set for sending BAU messages from each of the
30 * cpu's on the uvhub. 30 * cpu's on the uvhub.
31 * 31 *
32 * TLB shootdown will use the first of the 8 descriptors of each set. 32 * TLB shootdown will use the first of the 8 descriptors of each set.
33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). 33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
34 */ 34 */
35 35
36#define MAX_CPUS_PER_UVHUB 64
37#define MAX_CPUS_PER_SOCKET 32
38#define UV_ADP_SIZE 64 /* hardware-provided max. */
39#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */
36#define UV_ITEMS_PER_DESCRIPTOR 8 40#define UV_ITEMS_PER_DESCRIPTOR 8
37/* the 'throttle' to prevent the hardware stay-busy bug */ 41/* the 'throttle' to prevent the hardware stay-busy bug */
38#define MAX_BAU_CONCURRENT 3 42#define MAX_BAU_CONCURRENT 3
39#define UV_CPUS_PER_ACT_STATUS 32
40#define UV_ACT_STATUS_MASK 0x3 43#define UV_ACT_STATUS_MASK 0x3
41#define UV_ACT_STATUS_SIZE 2 44#define UV_ACT_STATUS_SIZE 2
42#define UV_ADP_SIZE 32
43#define UV_DISTRIBUTION_SIZE 256 45#define UV_DISTRIBUTION_SIZE 256
44#define UV_SW_ACK_NPENDING 8 46#define UV_SW_ACK_NPENDING 8
45#define UV_NET_ENDPOINT_INTD 0x38 47#define UV_NET_ENDPOINT_INTD 0x38
@@ -100,7 +102,6 @@
100 * number of destination side software ack resources 102 * number of destination side software ack resources
101 */ 103 */
102#define DEST_NUM_RESOURCES 8 104#define DEST_NUM_RESOURCES 8
103#define MAX_CPUS_PER_NODE 32
104/* 105/*
105 * completion statuses for sending a TLB flush message 106 * completion statuses for sending a TLB flush message
106 */ 107 */
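To make the descriptor-set layout in the header comment concrete: each cpu on a uvhub owns one 512-byte set (UV_ITEMS_PER_DESCRIPTOR = 8 descriptors of 64 bytes each), and with UV_ADP_SIZE raised to the hardware maximum of 64 the full activation descriptor table spans 64 * 512 = 32 KiB. A hedged helper sketch (name and use are illustrative only):

/* Illustrative sketch, not from the patch: byte offset of descriptor 'item'
 * in the set owned by 'cpu', per the BASE + cpu*512 + item*64 layout. */
static unsigned long example_bau_desc_offset(int cpu, int item)
{
	return (unsigned long)cpu * 512 + (unsigned long)item * 64;
}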
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index e969f691cbfd..a501741c2335 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,8 @@ union uvh_apicid {
199#define UVH_APICID 0x002D0E00L 199#define UVH_APICID 0x002D0E00L
200#define UV_APIC_PNODE_SHIFT 6 200#define UV_APIC_PNODE_SHIFT 6
201 201
202#define UV_APICID_HIBIT_MASK 0xffff0000
203
202/* Local Bus from cpu's perspective */ 204/* Local Bus from cpu's perspective */
203#define LOCAL_BUS_BASE 0x1c00000 205#define LOCAL_BUS_BASE 0x1c00000
204#define LOCAL_BUS_SIZE (4 * 1024 * 1024) 206#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
@@ -491,8 +493,10 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
491 } 493 }
492} 494}
493 495
496extern unsigned int uv_apicid_hibits;
494static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) 497static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
495{ 498{
499 apicid |= uv_apicid_hibits;
496 return (1UL << UVH_IPI_INT_SEND_SHFT) | 500 return (1UL << UVH_IPI_INT_SEND_SHFT) |
497 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | 501 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
498 (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | 502 (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index b2f2d2e05cec..20cafeac7455 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV MMR definitions 6 * SGI UV MMR definitions
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#ifndef _ASM_X86_UV_UV_MMRS_H 11#ifndef _ASM_X86_UV_UV_MMRS_H
@@ -754,6 +754,23 @@ union uvh_lb_bau_sb_descriptor_base_u {
754}; 754};
755 755
756/* ========================================================================= */ 756/* ========================================================================= */
757/* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */
758/* ========================================================================= */
759#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
760#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
761
762#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
763#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
764
765union uvh_lb_target_physical_apic_id_mask_u {
766 unsigned long v;
767 struct uvh_lb_target_physical_apic_id_mask_s {
768 unsigned long bit_enables : 32; /* RW */
769 unsigned long rsvd_32_63 : 32; /* */
770 } s;
771};
772
773/* ========================================================================= */
757/* UVH_NODE_ID */ 774/* UVH_NODE_ID */
758/* ========================================================================= */ 775/* ========================================================================= */
759#define UVH_NODE_ID 0x0UL 776#define UVH_NODE_ID 0x0UL
@@ -806,6 +823,78 @@ union uvh_node_present_table_u {
806}; 823};
807 824
808/* ========================================================================= */ 825/* ========================================================================= */
826/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
827/* ========================================================================= */
828#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
829
830#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
831#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
832#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
833#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
834#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
835#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
836
837union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
838 unsigned long v;
839 struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
840 unsigned long rsvd_0_23: 24; /* */
841 unsigned long base : 8; /* RW */
842 unsigned long rsvd_32_47: 16; /* */
843 unsigned long m_alias : 5; /* RW */
844 unsigned long rsvd_53_62: 10; /* */
845 unsigned long enable : 1; /* RW */
846 } s;
847};
848
849/* ========================================================================= */
850/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
851/* ========================================================================= */
852#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
853
854#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
855#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
856#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
857#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
858#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
859#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
860
861union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
862 unsigned long v;
863 struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
864 unsigned long rsvd_0_23: 24; /* */
865 unsigned long base : 8; /* RW */
866 unsigned long rsvd_32_47: 16; /* */
867 unsigned long m_alias : 5; /* RW */
868 unsigned long rsvd_53_62: 10; /* */
869 unsigned long enable : 1; /* RW */
870 } s;
871};
872
873/* ========================================================================= */
874/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
875/* ========================================================================= */
876#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
877
878#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
879#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
880#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
881#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
882#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
883#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
884
885union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
886 unsigned long v;
887 struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
888 unsigned long rsvd_0_23: 24; /* */
889 unsigned long base : 8; /* RW */
890 unsigned long rsvd_32_47: 16; /* */
891 unsigned long m_alias : 5; /* RW */
892 unsigned long rsvd_53_62: 10; /* */
893 unsigned long enable : 1; /* RW */
894 } s;
895};
896
897/* ========================================================================= */
809/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ 898/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
810/* ========================================================================= */ 899/* ========================================================================= */
811#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL 900#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
@@ -857,6 +946,29 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
857}; 946};
858 947
859/* ========================================================================= */ 948/* ========================================================================= */
949/* UVH_RH_GAM_CONFIG_MMR */
950/* ========================================================================= */
951#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
952
953#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
954#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
955#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
956#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
957#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
958#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
959
960union uvh_rh_gam_config_mmr_u {
961 unsigned long v;
962 struct uvh_rh_gam_config_mmr_s {
963 unsigned long m_skt : 6; /* RW */
964 unsigned long n_skt : 4; /* RW */
965 unsigned long rsvd_10_11: 2; /* */
966 unsigned long mmiol_cfg : 1; /* RW */
967 unsigned long rsvd_13_63: 51; /* */
968 } s;
969};
970
971/* ========================================================================= */
860/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ 972/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
861/* ========================================================================= */ 973/* ========================================================================= */
862#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL 974#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
@@ -987,97 +1099,5 @@ union uvh_rtc1_int_config_u {
987 } s; 1099 } s;
988}; 1100};
989 1101
990/* ========================================================================= */
991/* UVH_SI_ADDR_MAP_CONFIG */
992/* ========================================================================= */
993#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
994
995#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
996#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
997#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
998#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
999
1000union uvh_si_addr_map_config_u {
1001 unsigned long v;
1002 struct uvh_si_addr_map_config_s {
1003 unsigned long m_skt : 6; /* RW */
1004 unsigned long rsvd_6_7: 2; /* */
1005 unsigned long n_skt : 4; /* RW */
1006 unsigned long rsvd_12_63: 52; /* */
1007 } s;
1008};
1009
1010/* ========================================================================= */
1011/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
1012/* ========================================================================= */
1013#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
1014
1015#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
1016#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1017#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1018#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1019#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
1020#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1021
1022union uvh_si_alias0_overlay_config_u {
1023 unsigned long v;
1024 struct uvh_si_alias0_overlay_config_s {
1025 unsigned long rsvd_0_23: 24; /* */
1026 unsigned long base : 8; /* RW */
1027 unsigned long rsvd_32_47: 16; /* */
1028 unsigned long m_alias : 5; /* RW */
1029 unsigned long rsvd_53_62: 10; /* */
1030 unsigned long enable : 1; /* RW */
1031 } s;
1032};
1033
1034/* ========================================================================= */
1035/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
1036/* ========================================================================= */
1037#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
1038
1039#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
1040#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1041#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1042#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1043#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
1044#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1045
1046union uvh_si_alias1_overlay_config_u {
1047 unsigned long v;
1048 struct uvh_si_alias1_overlay_config_s {
1049 unsigned long rsvd_0_23: 24; /* */
1050 unsigned long base : 8; /* RW */
1051 unsigned long rsvd_32_47: 16; /* */
1052 unsigned long m_alias : 5; /* RW */
1053 unsigned long rsvd_53_62: 10; /* */
1054 unsigned long enable : 1; /* RW */
1055 } s;
1056};
1057
1058/* ========================================================================= */
1059/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
1060/* ========================================================================= */
1061#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
1062
1063#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
1064#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1065#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1066#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1067#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
1068#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1069
1070union uvh_si_alias2_overlay_config_u {
1071 unsigned long v;
1072 struct uvh_si_alias2_overlay_config_s {
1073 unsigned long rsvd_0_23: 24; /* */
1074 unsigned long base : 8; /* RW */
1075 unsigned long rsvd_32_47: 16; /* */
1076 unsigned long m_alias : 5; /* RW */
1077 unsigned long rsvd_53_62: 10; /* */
1078 unsigned long enable : 1; /* RW */
1079 } s;
1080};
1081
1082 1102
1083#endif /* _ASM_X86_UV_UV_MMRS_H */ 1103#endif /* __ASM_UV_MMRS_X86_H__ */
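The new UVH_RH_GAM_* and UVH_LB_TARGET_* blocks follow the file's established pattern of a raw-value/bitfield union per MMR, replacing the retired UVH_SI_* definitions. A hedged usage sketch (uv_read_local_mmr() is the usual accessor from uv_hub.h and is only assumed here; the function name is illustrative):

/* Illustrative sketch: read the GAM config MMR and decode it via the union. */
static void example_read_gam_config(void)
{
	union uvh_rh_gam_config_mmr_u cfg;

	cfg.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
	pr_debug("m_skt=%lu n_skt=%lu mmiol_cfg=%lu\n",
		 (unsigned long)cfg.s.m_skt,
		 (unsigned long)cfg.s.n_skt,
		 (unsigned long)cfg.s.mmiol_cfg);
}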
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
66#define PIN_BASED_NMI_EXITING 0x00000008 66#define PIN_BASED_NMI_EXITING 0x00000008
67#define PIN_BASED_VIRTUAL_NMIS 0x00000020 67#define PIN_BASED_VIRTUAL_NMIS 0x00000020
68 68
69#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
69#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 70#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
71#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
70#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 72#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
71#define VM_EXIT_SAVE_IA32_PAT 0x00040000 73#define VM_EXIT_SAVE_IA32_PAT 0x00040000
72#define VM_EXIT_LOAD_IA32_PAT 0x00080000 74#define VM_EXIT_LOAD_IA32_PAT 0x00080000
75#define VM_EXIT_SAVE_IA32_EFER 0x00100000
76#define VM_EXIT_LOAD_IA32_EFER 0x00200000
77#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
73 78
79#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
74#define VM_ENTRY_IA32E_MODE 0x00000200 80#define VM_ENTRY_IA32E_MODE 0x00000200
75#define VM_ENTRY_SMM 0x00000400 81#define VM_ENTRY_SMM 0x00000400
76#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 82#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
83#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
77#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 84#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
85#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
78 86
79/* VMCS Encodings */ 87/* VMCS Encodings */
80enum vmcs_field { 88enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
239#define EXIT_REASON_TASK_SWITCH 9 247#define EXIT_REASON_TASK_SWITCH 9
240#define EXIT_REASON_CPUID 10 248#define EXIT_REASON_CPUID 10
241#define EXIT_REASON_HLT 12 249#define EXIT_REASON_HLT 12
250#define EXIT_REASON_INVD 13
242#define EXIT_REASON_INVLPG 14 251#define EXIT_REASON_INVLPG 14
243#define EXIT_REASON_RDPMC 15 252#define EXIT_REASON_RDPMC 15
244#define EXIT_REASON_RDTSC 16 253#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
296#define GUEST_INTR_STATE_SMI 0x00000004 305#define GUEST_INTR_STATE_SMI 0x00000004
297#define GUEST_INTR_STATE_NMI 0x00000008 306#define GUEST_INTR_STATE_NMI 0x00000008
298 307
308/* GUEST_ACTIVITY_STATE flags */
309#define GUEST_ACTIVITY_ACTIVE 0
310#define GUEST_ACTIVITY_HLT 1
311#define GUEST_ACTIVITY_SHUTDOWN 2
312#define GUEST_ACTIVITY_WAIT_SIPI 3
313
299/* 314/*
300 * Exit Qualifications for MOV for Control Register Access 315 * Exit Qualifications for MOV for Control Register Access
301 */ 316 */
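The added control bits (EFER and PERF_GLOBAL_CTRL load/save, debug-control save/load, the preemption timer) extend the existing VM-exit/VM-entry control words, and the GUEST_ACTIVITY_* values name the activity-state field. A hedged sketch of a VMM enabling automatic EFER switching (vmcs_write32() is the writer used in kvm's vmx.c and is only assumed here; VM_EXIT_CONTROLS/VM_ENTRY_CONTROLS are vmcs_field encodings from this header):

/* Illustrative sketch, not from the patch: let hardware swap EFER on
 * VM entry/exit instead of switching it by hand in the run loop. */
static void example_enable_efer_switching(u32 exit_ctls, u32 entry_ctls)
{
	exit_ctls  |= VM_EXIT_SAVE_IA32_EFER | VM_EXIT_LOAD_IA32_EFER;
	entry_ctls |= VM_ENTRY_LOAD_IA32_EFER;

	vmcs_write32(VM_EXIT_CONTROLS, exit_ctls);
	vmcs_write32(VM_ENTRY_CONTROLS, entry_ctls);
}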
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 396ff4cc8ed4..66d0fff1ee84 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,4 +37,39 @@
37extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
38extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
39 39
40#include <asm/processor.h>
41
42static inline uint32_t xen_cpuid_base(void)
43{
44 uint32_t base, eax, ebx, ecx, edx;
45 char signature[13];
46
47 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
48 cpuid(base, &eax, &ebx, &ecx, &edx);
49 *(uint32_t *)(signature + 0) = ebx;
50 *(uint32_t *)(signature + 4) = ecx;
51 *(uint32_t *)(signature + 8) = edx;
52 signature[12] = 0;
53
54 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
55 return base;
56 }
57
58 return 0;
59}
60
61#ifdef CONFIG_XEN
62extern bool xen_hvm_need_lapic(void);
63
64static inline bool xen_x2apic_para_available(void)
65{
66 return xen_hvm_need_lapic();
67}
68#else
69static inline bool xen_x2apic_para_available(void)
70{
71 return (xen_cpuid_base() != 0);
72}
73#endif
74
40#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 75#endif /* _ASM_X86_XEN_HYPERVISOR_H */
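xen_cpuid_base() probes the hypervisor CPUID range 0x40000000-0x4000ff00 in 0x100 steps, matching the "XenVMMXenVMM" signature and requiring at least two leaves beyond the base. A hedged caller sketch (the function name is illustrative, not from the patch):

/* Illustrative sketch: report whether we are running as a Xen guest. */
static bool example_running_on_xen(void)
{
	uint32_t base = xen_cpuid_base();

	if (base)
		pr_info("Xen detected, CPUID base leaf 0x%x\n", base);
	return base != 0;
}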
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1f0c55..1c10c88ee4e1 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
61#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) 61#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
62#endif 62#endif
63 63
64#ifndef machine_to_phys_mapping 64#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
65#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) 65#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
66#endif 66#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
67 67
68/* Maximum number of virtual CPUs in multi-processor guests. */ 68/* Maximum number of virtual CPUs in multi-processor guests. */
69#define MAX_VIRT_CPUS 32 69#define MAX_VIRT_CPUS 32
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e004ae5c..8413688b2571 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
32/* And the trap vector is... */ 32/* And the trap vector is... */
33#define TRAP_INSTR "int $0x82" 33#define TRAP_INSTR "int $0x82"
34 34
35#define __MACH2PHYS_VIRT_START 0xF5800000
36#define __MACH2PHYS_VIRT_END 0xF6800000
37
38#define __MACH2PHYS_SHIFT 2
39
35/* 40/*
36 * Virtual addresses beyond this are not modifiable by guest OSes. The 41 * Virtual addresses beyond this are not modifiable by guest OSes. The
37 * machine->physical mapping table starts at this address, read-only. 42 * machine->physical mapping table starts at this address, read-only.
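Plugging the new 32-bit constants into the MACH2PHYS_NR_ENTRIES formula from interface.h gives (0xF6800000 - 0xF5800000) >> 2 = 0x01000000 / 4 = 0x00400000 entries; at one 4-byte entry per 4 KiB machine frame, the read-only machine-to-physical window therefore describes up to 16 GiB of machine memory.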
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d2662b97c..839a4811cf98 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
39#define __HYPERVISOR_VIRT_END 0xFFFF880000000000 39#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
40#define __MACH2PHYS_VIRT_START 0xFFFF800000000000 40#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
41#define __MACH2PHYS_VIRT_END 0xFFFF804000000000 41#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
42 42#define __MACH2PHYS_SHIFT 3
43#ifndef HYPERVISOR_VIRT_START
44#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
45#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
46#endif
47
48#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
49#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
50#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
51#ifndef machine_to_phys_mapping
52#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
53#endif
54 43
55/* 44/*
56 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) 45 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index dd8c1414b3d5..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/pfn.h> 7#include <linux/pfn.h>
8#include <linux/mm.h>
8 9
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10#include <asm/page.h> 11#include <asm/page.h>
@@ -35,10 +36,17 @@ typedef struct xpaddr {
35#define MAX_DOMAIN_PAGES \ 36#define MAX_DOMAIN_PAGES \
36 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) 37 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
37 38
39extern unsigned long *machine_to_phys_mapping;
40extern unsigned int machine_to_phys_order;
38 41
39extern unsigned long get_phys_to_machine(unsigned long pfn); 42extern unsigned long get_phys_to_machine(unsigned long pfn);
40extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
41 44
45extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49
42static inline unsigned long pfn_to_mfn(unsigned long pfn) 50static inline unsigned long pfn_to_mfn(unsigned long pfn)
43{ 51{
44 unsigned long mfn; 52 unsigned long mfn;
@@ -69,11 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
69 if (xen_feature(XENFEAT_auto_translated_physmap)) 77 if (xen_feature(XENFEAT_auto_translated_physmap))
70 return mfn; 78 return mfn;
71 79
72#if 0
73 if (unlikely((mfn >> machine_to_phys_order) != 0))
74 return max_mapnr;
75#endif
76
77 pfn = 0; 80 pfn = 0;
78 /* 81 /*
79 * The array access can fail (e.g., device space beyond end of RAM). 82 * The array access can fail (e.g., device space beyond end of RAM).
@@ -82,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
82 */ 85 */
83 __get_user(pfn, &machine_to_phys_mapping[mfn]); 86 __get_user(pfn, &machine_to_phys_mapping[mfn]);
84 87
88 /*
89 * If this appears to be a foreign mfn (because the pfn
90 * doesn't map back to the mfn), then check the local override
91 * table to see if there's a better pfn to use.
92 */
93 if (get_phys_to_machine(pfn) != mfn)
94 pfn = m2p_find_override_pfn(mfn, pfn);
95
85 return pfn; 96 return pfn;
86} 97}
87 98
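With the m2p override hooks in place, mfn_to_pfn() can resolve foreign frames (such as grant-mapped pages) whose m2p entry does not round-trip through the guest's own p2m. A hedged helper sketch built only from the declarations above (the function name is illustrative):

/* Illustrative sketch, not from the patch: an mfn is "local" when the p2m
 * and m2p views agree; otherwise mfn_to_pfn() fell back to the override. */
static bool example_mfn_is_local(unsigned long mfn)
{
	unsigned long pfn = mfn_to_pfn(mfn);

	return get_phys_to_machine(pfn) == mfn;
}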
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f60153d5de57..34244b2cd880 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -45,6 +45,7 @@ obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 46obj-y += tsc.o io_delay.o rtc.o
47obj-y += pci-iommu_table.o 47obj-y += pci-iommu_table.o
48obj-y += resource.o
48 49
49obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 50obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
50obj-y += process.o 51obj-y += process.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 71232b941b6c..b3a71137983a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
198{ 198{
199 unsigned int ver = 0; 199 unsigned int ver = 0;
200 200
201 if (id >= (MAX_LOCAL_APIC-1)) {
202 printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
203 return;
204 }
205
201 if (!enabled) { 206 if (!enabled) {
202 ++disabled_cpus; 207 ++disabled_cpus;
203 return; 208 return;
@@ -504,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
504 509
505 return 0; 510 return 0;
506} 511}
512EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
507 513
508int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) 514int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
509{ 515{
@@ -847,18 +853,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
847 * returns 0 on success, < 0 on error 853 * returns 0 on success, < 0 on error
848 */ 854 */
849 855
850static void __init acpi_register_lapic_address(unsigned long address)
851{
852 mp_lapic_addr = address;
853
854 set_fixmap_nocache(FIX_APIC_BASE, address);
855 if (boot_cpu_physical_apicid == -1U) {
856 boot_cpu_physical_apicid = read_apic_id();
857 apic_version[boot_cpu_physical_apicid] =
858 GET_APIC_VERSION(apic_read(APIC_LVR));
859 }
860}
861
862static int __init early_acpi_parse_madt_lapic_addr_ovr(void) 856static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
863{ 857{
864 int count; 858 int count;
@@ -880,7 +874,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
880 return count; 874 return count;
881 } 875 }
882 876
883 acpi_register_lapic_address(acpi_lapic_addr); 877 register_lapic_address(acpi_lapic_addr);
884 878
885 return count; 879 return count;
886} 880}
@@ -907,16 +901,16 @@ static int __init acpi_parse_madt_lapic_entries(void)
907 return count; 901 return count;
908 } 902 }
909 903
910 acpi_register_lapic_address(acpi_lapic_addr); 904 register_lapic_address(acpi_lapic_addr);
911 905
912 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, 906 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
913 acpi_parse_sapic, MAX_APICS); 907 acpi_parse_sapic, MAX_LOCAL_APIC);
914 908
915 if (!count) { 909 if (!count) {
916 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, 910 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
917 acpi_parse_x2apic, MAX_APICS); 911 acpi_parse_x2apic, MAX_LOCAL_APIC);
918 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, 912 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
919 acpi_parse_lapic, MAX_APICS); 913 acpi_parse_lapic, MAX_LOCAL_APIC);
920 } 914 }
921 if (!count && !x2count) { 915 if (!count && !x2count) {
922 printk(KERN_ERR PREFIX "No LAPIC entries present\n"); 916 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -949,32 +943,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
949extern int es7000_plat; 943extern int es7000_plat;
950#endif 944#endif
951 945
952static void assign_to_mp_irq(struct mpc_intsrc *m,
953 struct mpc_intsrc *mp_irq)
954{
955 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
956}
957
958static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
959 struct mpc_intsrc *m)
960{
961 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
962}
963
964static void save_mp_irq(struct mpc_intsrc *m)
965{
966 int i;
967
968 for (i = 0; i < mp_irq_entries; i++) {
969 if (!mp_irq_cmp(&mp_irqs[i], m))
970 return;
971 }
972
973 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
974 if (++mp_irq_entries == MAX_IRQ_SOURCES)
975 panic("Max # of irq sources exceeded!!\n");
976}
977
978void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) 946void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
979{ 947{
980 int ioapic; 948 int ioapic;
@@ -1005,7 +973,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1005 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ 973 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
1006 mp_irq.dstirq = pin; /* INTIN# */ 974 mp_irq.dstirq = pin; /* INTIN# */
1007 975
1008 save_mp_irq(&mp_irq); 976 mp_save_irq(&mp_irq);
1009 977
1010 isa_irq_to_gsi[bus_irq] = gsi; 978 isa_irq_to_gsi[bus_irq] = gsi;
1011} 979}
@@ -1080,7 +1048,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1080 mp_irq.srcbusirq = i; /* Identity mapped */ 1048 mp_irq.srcbusirq = i; /* Identity mapped */
1081 mp_irq.dstirq = pin; 1049 mp_irq.dstirq = pin;
1082 1050
1083 save_mp_irq(&mp_irq); 1051 mp_save_irq(&mp_irq);
1084 } 1052 }
1085} 1053}
1086 1054
@@ -1117,7 +1085,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1117 mp_irq.dstapic = mp_ioapics[ioapic].apicid; 1085 mp_irq.dstapic = mp_ioapics[ioapic].apicid;
1118 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); 1086 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1119 1087
1120 save_mp_irq(&mp_irq); 1088 mp_save_irq(&mp_irq);
1121#endif 1089#endif
1122 return 0; 1090 return 0;
1123} 1091}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5079f24c955a..123608531c8f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
353 mutex_unlock(&smp_alt); 353 mutex_unlock(&smp_alt);
354} 354}
355 355
356bool skip_smp_alternatives;
356void alternatives_smp_switch(int smp) 357void alternatives_smp_switch(int smp)
357{ 358{
358 struct smp_alt_module *mod; 359 struct smp_alt_module *mod;
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp)
368 printk("lockdep: fixing up alternatives.\n"); 369 printk("lockdep: fixing up alternatives.\n");
369#endif 370#endif
370 371
371 if (noreplace_smp || smp_alt_once) 372 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
372 return; 373 return;
373 BUG_ON(!smp && (num_online_cpus() > 1)); 374 BUG_ON(!smp && (num_online_cpus() > 1));
374 375
@@ -591,17 +592,21 @@ static atomic_t stop_machine_first;
591static int wrote_text; 592static int wrote_text;
592 593
593struct text_poke_params { 594struct text_poke_params {
594 void *addr; 595 struct text_poke_param *params;
595 const void *opcode; 596 int nparams;
596 size_t len;
597}; 597};
598 598
599static int __kprobes stop_machine_text_poke(void *data) 599static int __kprobes stop_machine_text_poke(void *data)
600{ 600{
601 struct text_poke_params *tpp = data; 601 struct text_poke_params *tpp = data;
602 struct text_poke_param *p;
603 int i;
602 604
603 if (atomic_dec_and_test(&stop_machine_first)) { 605 if (atomic_dec_and_test(&stop_machine_first)) {
604 text_poke(tpp->addr, tpp->opcode, tpp->len); 606 for (i = 0; i < tpp->nparams; i++) {
607 p = &tpp->params[i];
608 text_poke(p->addr, p->opcode, p->len);
609 }
605 smp_wmb(); /* Make sure other cpus see that this has run */ 610 smp_wmb(); /* Make sure other cpus see that this has run */
606 wrote_text = 1; 611 wrote_text = 1;
607 } else { 612 } else {
@@ -610,8 +615,12 @@ static int __kprobes stop_machine_text_poke(void *data)
610 smp_mb(); /* Load wrote_text before following execution */ 615 smp_mb(); /* Load wrote_text before following execution */
611 } 616 }
612 617
613 flush_icache_range((unsigned long)tpp->addr, 618 for (i = 0; i < tpp->nparams; i++) {
614 (unsigned long)tpp->addr + tpp->len); 619 p = &tpp->params[i];
620 flush_icache_range((unsigned long)p->addr,
621 (unsigned long)p->addr + p->len);
622 }
623
615 return 0; 624 return 0;
616} 625}
617 626
@@ -631,10 +640,13 @@ static int __kprobes stop_machine_text_poke(void *data)
631void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) 640void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
632{ 641{
633 struct text_poke_params tpp; 642 struct text_poke_params tpp;
643 struct text_poke_param p;
634 644
635 tpp.addr = addr; 645 p.addr = addr;
636 tpp.opcode = opcode; 646 p.opcode = opcode;
637 tpp.len = len; 647 p.len = len;
648 tpp.params = &p;
649 tpp.nparams = 1;
638 atomic_set(&stop_machine_first, 1); 650 atomic_set(&stop_machine_first, 1);
639 wrote_text = 0; 651 wrote_text = 0;
640 /* Use __stop_machine() because the caller already got online_cpus. */ 652 /* Use __stop_machine() because the caller already got online_cpus. */
@@ -642,6 +654,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
642 return addr; 654 return addr;
643} 655}
644 656
657/**
658 * text_poke_smp_batch - Update instructions on a live kernel on SMP
659 * @params: an array of text_poke parameters
660 * @n: the number of elements in params.
661 *
662 * Modify multi-byte instruction by using stop_machine() on SMP. Since the
663 * stop_machine() is heavy task, it is better to aggregate text_poke requests
664 * and do it once if possible.
665 *
666 * Note: Must be called under get_online_cpus() and text_mutex.
667 */
668void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
669{
670 struct text_poke_params tpp = {.params = params, .nparams = n};
671
672 atomic_set(&stop_machine_first, 1);
673 wrote_text = 0;
674 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
675}
676
645#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) 677#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
646 678
647#ifdef CONFIG_X86_64 679#ifdef CONFIG_X86_64
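text_poke_smp_batch() lets a user such as kprobes fold many instruction patches into one stop_machine() round instead of paying that cost per poke. A hedged caller sketch (addresses and opcode buffers are placeholders; the text_poke_param field names follow their use in stop_machine_text_poke() above, and the call must sit under get_online_cpus() and text_mutex as the kernel-doc requires):

/* Illustrative sketch: patch two sites in a single stop_machine() pass. */
static void example_batch_poke(void *addr1, const void *op1,
			       void *addr2, const void *op2, size_t len)
{
	struct text_poke_param params[2] = {
		{ .addr = addr1, .opcode = op1, .len = len },
		{ .addr = addr2, .opcode = op2, .len = len },
	};

	text_poke_smp_batch(params, ARRAY_SIZE(params));
}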
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d2fdb0826df2..57ca77787220 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1086,7 +1086,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1086 1086
1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1088 1088
1089 /* Intialize the exclusion range if necessary */ 1089 /* Initialize the exclusion range if necessary */
1090 for_each_iommu(iommu) { 1090 for_each_iommu(iommu) {
1091 if (iommu->exclusion_start && 1091 if (iommu->exclusion_start &&
1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset 1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1353,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1353 1353
1354/* 1354/*
1355 * Allocates a new protection domain usable for the dma_ops functions. 1355 * Allocates a new protection domain usable for the dma_ops functions.
1356 * It also intializes the page table and the address allocator data 1356 * It also initializes the page table and the address allocator data
1357 * structures required for the dma_ops interface 1357 * structures required for the dma_ops interface
1358 */ 1358 */
1359static struct dma_ops_domain *dma_ops_domain_alloc(void) 1359static struct dma_ops_domain *dma_ops_domain_alloc(void)
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 8f6463d8ed0d..0a99f7198bc3 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,95 +12,123 @@
12 12
13static u32 *flush_words; 13static u32 *flush_words;
14 14
15struct pci_device_id k8_nb_ids[] = { 15struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, 18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
19 {} 19 {}
20}; 20};
21EXPORT_SYMBOL(k8_nb_ids); 21EXPORT_SYMBOL(amd_nb_misc_ids);
22 22
23struct k8_northbridge_info k8_northbridges; 23const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
24EXPORT_SYMBOL(k8_northbridges); 24 { 0x00, 0x18, 0x20 },
25 { 0xff, 0x00, 0x20 },
26 { 0xfe, 0x00, 0x20 },
27 { }
28};
29
30struct amd_northbridge_info amd_northbridges;
31EXPORT_SYMBOL(amd_northbridges);
25 32
26static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) 33static struct pci_dev *next_northbridge(struct pci_dev *dev,
34 struct pci_device_id *ids)
27{ 35{
28 do { 36 do {
29 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); 37 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
30 if (!dev) 38 if (!dev)
31 break; 39 break;
32 } while (!pci_match_id(&k8_nb_ids[0], dev)); 40 } while (!pci_match_id(ids, dev));
33 return dev; 41 return dev;
34} 42}
35 43
36int cache_k8_northbridges(void) 44int amd_cache_northbridges(void)
37{ 45{
38 int i; 46 int i = 0;
39 struct pci_dev *dev; 47 struct amd_northbridge *nb;
48 struct pci_dev *misc;
40 49
41 if (k8_northbridges.num) 50 if (amd_nb_num())
42 return 0; 51 return 0;
43 52
44 dev = NULL; 53 misc = NULL;
45 while ((dev = next_k8_northbridge(dev)) != NULL) 54 while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
46 k8_northbridges.num++; 55 i++;
47 56
48 /* some CPU families (e.g. family 0x11) do not support GART */ 57 if (i == 0)
49 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || 58 return 0;
50 boot_cpu_data.x86 == 0x15)
51 k8_northbridges.gart_supported = 1;
52 59
53 k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * 60 nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
54 sizeof(void *), GFP_KERNEL); 61 if (!nb)
55 if (!k8_northbridges.nb_misc)
56 return -ENOMEM; 62 return -ENOMEM;
57 63
58 if (!k8_northbridges.num) { 64 amd_northbridges.nb = nb;
59 k8_northbridges.nb_misc[0] = NULL; 65 amd_northbridges.num = i;
60 return 0;
61 }
62 66
63 if (k8_northbridges.gart_supported) { 67 misc = NULL;
64 flush_words = kmalloc(k8_northbridges.num * sizeof(u32), 68 for (i = 0; i != amd_nb_num(); i++) {
65 GFP_KERNEL); 69 node_to_amd_nb(i)->misc = misc =
66 if (!flush_words) { 70 next_northbridge(misc, amd_nb_misc_ids);
67 kfree(k8_northbridges.nb_misc); 71 }
68 return -ENOMEM; 72
69 } 73 /* some CPU families (e.g. family 0x11) do not support GART */
70 } 74 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
75 boot_cpu_data.x86 == 0x15)
76 amd_northbridges.flags |= AMD_NB_GART;
77
78 /*
79 * Some CPU families support L3 Cache Index Disable. There are some
80 * limitations because of E382 and E388 on family 0x10.
81 */
82 if (boot_cpu_data.x86 == 0x10 &&
83 boot_cpu_data.x86_model >= 0x8 &&
84 (boot_cpu_data.x86_model > 0x9 ||
85 boot_cpu_data.x86_mask >= 0x1))
86 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
71 87
72 dev = NULL;
73 i = 0;
74 while ((dev = next_k8_northbridge(dev)) != NULL) {
75 k8_northbridges.nb_misc[i] = dev;
76 if (k8_northbridges.gart_supported)
77 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
78 }
79 k8_northbridges.nb_misc[i] = NULL;
80 return 0; 88 return 0;
81} 89}
82EXPORT_SYMBOL_GPL(cache_k8_northbridges); 90EXPORT_SYMBOL_GPL(amd_cache_northbridges);
83 91
84/* Ignores subdevice/subvendor but as far as I can figure out 92/* Ignores subdevice/subvendor but as far as I can figure out
85 they're useless anyways */ 93 they're useless anyways */
86int __init early_is_k8_nb(u32 device) 94int __init early_is_amd_nb(u32 device)
87{ 95{
88 struct pci_device_id *id; 96 struct pci_device_id *id;
89 u32 vendor = device & 0xffff; 97 u32 vendor = device & 0xffff;
90 device >>= 16; 98 device >>= 16;
91 for (id = k8_nb_ids; id->vendor; id++) 99 for (id = amd_nb_misc_ids; id->vendor; id++)
92 if (vendor == id->vendor && device == id->device) 100 if (vendor == id->vendor && device == id->device)
93 return 1; 101 return 1;
94 return 0; 102 return 0;
95} 103}
96 104
97void k8_flush_garts(void) 105int amd_cache_gart(void)
106{
107 int i;
108
109 if (!amd_nb_has_feature(AMD_NB_GART))
110 return 0;
111
112 flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
113 if (!flush_words) {
114 amd_northbridges.flags &= ~AMD_NB_GART;
115 return -ENOMEM;
116 }
117
118 for (i = 0; i != amd_nb_num(); i++)
119 pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
120 &flush_words[i]);
121
122 return 0;
123}
124
125void amd_flush_garts(void)
98{ 126{
99 int flushed, i; 127 int flushed, i;
100 unsigned long flags; 128 unsigned long flags;
101 static DEFINE_SPINLOCK(gart_lock); 129 static DEFINE_SPINLOCK(gart_lock);
102 130
103 if (!k8_northbridges.gart_supported) 131 if (!amd_nb_has_feature(AMD_NB_GART))
104 return; 132 return;
105 133
106 /* Avoid races between AGP and IOMMU. In theory it's not needed 134 /* Avoid races between AGP and IOMMU. In theory it's not needed
@@ -109,16 +137,16 @@ void k8_flush_garts(void)
109 that it doesn't matter to serialize more. -AK */ 137 that it doesn't matter to serialize more. -AK */
110 spin_lock_irqsave(&gart_lock, flags); 138 spin_lock_irqsave(&gart_lock, flags);
111 flushed = 0; 139 flushed = 0;
112 for (i = 0; i < k8_northbridges.num; i++) { 140 for (i = 0; i < amd_nb_num(); i++) {
113 pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, 141 pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
114 flush_words[i]|1); 142 flush_words[i] | 1);
115 flushed++; 143 flushed++;
116 } 144 }
117 for (i = 0; i < k8_northbridges.num; i++) { 145 for (i = 0; i < amd_nb_num(); i++) {
118 u32 w; 146 u32 w;
119 /* Make sure the hardware actually executed the flush*/ 147 /* Make sure the hardware actually executed the flush*/
120 for (;;) { 148 for (;;) {
121 pci_read_config_dword(k8_northbridges.nb_misc[i], 149 pci_read_config_dword(node_to_amd_nb(i)->misc,
122 0x9c, &w); 150 0x9c, &w);
123 if (!(w & 1)) 151 if (!(w & 1))
124 break; 152 break;
@@ -129,19 +157,23 @@ void k8_flush_garts(void)
129 if (!flushed) 157 if (!flushed)
130 printk("nothing to flush?\n"); 158 printk("nothing to flush?\n");
131} 159}
132EXPORT_SYMBOL_GPL(k8_flush_garts); 160EXPORT_SYMBOL_GPL(amd_flush_garts);
133 161
134static __init int init_k8_nbs(void) 162static __init int init_amd_nbs(void)
135{ 163{
136 int err = 0; 164 int err = 0;
137 165
138 err = cache_k8_northbridges(); 166 err = amd_cache_northbridges();
139 167
140 if (err < 0) 168 if (err < 0)
141 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); 169 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
170
171 if (amd_cache_gart() < 0)
172 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
173 "GART support disabled.\n");
142 174
143 return err; 175 return err;
144} 176}
145 177
146/* This has to go after the PCI subsystem */ 178/* This has to go after the PCI subsystem */
147fs_initcall(init_k8_nbs); 179fs_initcall(init_amd_nbs);
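The renamed interface replaces the open-coded k8_northbridges bookkeeping: amd_nb_num() reports how many northbridges were cached, node_to_amd_nb(i)->misc returns the per-node misc PCI device, and amd_nb_has_feature() replaces the old gart_supported flag. A hedged consumer sketch modelled on amd_flush_garts() above (the 0x9c register is the GART flush word; the function name is illustrative):

/* Illustrative sketch, not from the patch: walk the cached northbridges
 * with the new helpers and dump each node's GART flush word. */
static void example_walk_northbridges(void)
{
	int i;

	if (!amd_nb_has_feature(AMD_NB_GART))
		return;

	for (i = 0; i < amd_nb_num(); i++) {
		u32 w;

		pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w);
		pr_debug("AMD NB %d: GART flush word 0x%x\n", i, w);
	}
}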
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index afc406498c9d..671d5aad7a0c 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev)
313 if (adev->irq == 0) 313 if (adev->irq == 0)
314 return; 314 return;
315 315
316 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
317 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
318 /* APB timer irqs are set up as mp_irqs, timer is edge type */
319 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
320
316 if (system_state == SYSTEM_BOOTING) { 321 if (system_state == SYSTEM_BOOTING) {
317 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
318 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
319 /* APB timer irqs are set up as mp_irqs, timer is edge type */
320 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
321 if (request_irq(adev->irq, apbt_interrupt_handler, 322 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, 323 IRQF_TIMER | IRQF_DISABLED |
323 adev->name, adev)) { 324 IRQF_NOBALANCING,
325 adev->name, adev)) {
324 printk(KERN_ERR "Failed request IRQ for APBT%d\n", 326 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
325 adev->num); 327 adev->num);
326 } 328 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b3a16e8f0703..5955a7800a96 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -39,18 +39,6 @@ int fallback_aper_force __initdata;
39 39
40int fix_aperture __initdata = 1; 40int fix_aperture __initdata = 1;
41 41
42struct bus_dev_range {
43 int bus;
44 int dev_base;
45 int dev_limit;
46};
47
48static struct bus_dev_range bus_dev_ranges[] __initdata = {
49 { 0x00, 0x18, 0x20},
50 { 0xff, 0x00, 0x20},
51 { 0xfe, 0x00, 0x20}
52};
53
54static struct resource gart_resource = { 42static struct resource gart_resource = {
55 .name = "GART", 43 .name = "GART",
56 .flags = IORESOURCE_MEM, 44 .flags = IORESOURCE_MEM,
@@ -206,7 +194,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
206 * Do an PCI bus scan by hand because we're running before the PCI 194 * Do an PCI bus scan by hand because we're running before the PCI
207 * subsystem. 195 * subsystem.
208 * 196 *
209 * All K8 AGP bridges are AGPv3 compliant, so we can do this scan 197 * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
210 * generically. It's probably overkill to always scan all slots because 198 * generically. It's probably overkill to always scan all slots because
211 * the AGP bridges should be always an own bus on the HT hierarchy, 199 * the AGP bridges should be always an own bus on the HT hierarchy,
212 * but do it here for future safety. 200 * but do it here for future safety.
@@ -294,16 +282,16 @@ void __init early_gart_iommu_check(void)
294 search_agp_bridge(&agp_aper_order, &valid_agp); 282 search_agp_bridge(&agp_aper_order, &valid_agp);
295 283
296 fix = 0; 284 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 285 for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
298 int bus; 286 int bus;
299 int dev_base, dev_limit; 287 int dev_base, dev_limit;
300 288
301 bus = bus_dev_ranges[i].bus; 289 bus = amd_nb_bus_dev_ranges[i].bus;
302 dev_base = bus_dev_ranges[i].dev_base; 290 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
303 dev_limit = bus_dev_ranges[i].dev_limit; 291 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
304 292
305 for (slot = dev_base; slot < dev_limit; slot++) { 293 for (slot = dev_base; slot < dev_limit; slot++) {
306 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 294 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
307 continue; 295 continue;
308 296
309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 297 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -349,16 +337,16 @@ void __init early_gart_iommu_check(void)
349 return; 337 return;
350 338
351 /* disable them all at first */ 339 /* disable them all at first */
352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 340 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
353 int bus; 341 int bus;
354 int dev_base, dev_limit; 342 int dev_base, dev_limit;
355 343
356 bus = bus_dev_ranges[i].bus; 344 bus = amd_nb_bus_dev_ranges[i].bus;
357 dev_base = bus_dev_ranges[i].dev_base; 345 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
358 dev_limit = bus_dev_ranges[i].dev_limit; 346 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
359 347
360 for (slot = dev_base; slot < dev_limit; slot++) { 348 for (slot = dev_base; slot < dev_limit; slot++) {
361 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 349 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
362 continue; 350 continue;
363 351
364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 352 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -390,17 +378,17 @@ int __init gart_iommu_hole_init(void)
390 378
391 fix = 0; 379 fix = 0;
392 node = 0; 380 node = 0;
393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 381 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
394 int bus; 382 int bus;
395 int dev_base, dev_limit; 383 int dev_base, dev_limit;
396 u32 ctl; 384 u32 ctl;
397 385
398 bus = bus_dev_ranges[i].bus; 386 bus = amd_nb_bus_dev_ranges[i].bus;
399 dev_base = bus_dev_ranges[i].dev_base; 387 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
400 dev_limit = bus_dev_ranges[i].dev_limit; 388 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
401 389
402 for (slot = dev_base; slot < dev_limit; slot++) { 390 for (slot = dev_base; slot < dev_limit; slot++) {
403 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 391 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
404 continue; 392 continue;
405 393
406 iommu_detected = 1; 394 iommu_detected = 1;
@@ -505,7 +493,7 @@ out:
505 } 493 }
506 494
507 /* Fix up the north bridges */ 495 /* Fix up the north bridges */
508 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 496 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
509 int bus, dev_base, dev_limit; 497 int bus, dev_base, dev_limit;
510 498
511 /* 499 /*
@@ -514,11 +502,11 @@ out:
514 */ 502 */
515 u32 ctl = DISTLBWALKPRB | aper_order << 1; 503 u32 ctl = DISTLBWALKPRB | aper_order << 1;
516 504
517 bus = bus_dev_ranges[i].bus; 505 bus = amd_nb_bus_dev_ranges[i].bus;
518 dev_base = bus_dev_ranges[i].dev_base; 506 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
519 dev_limit = bus_dev_ranges[i].dev_limit; 507 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
520 for (slot = dev_base; slot < dev_limit; slot++) { 508 for (slot = dev_base; slot < dev_limit; slot++) {
521 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 509 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
522 continue; 510 continue;
523 511
524 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); 512 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 910f20b457c4..3966b564ea47 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,7 @@
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
6ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) 6obj-y += hw_nmi.o
7obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
8endif
9obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
10 7
11obj-$(CONFIG_X86_IO_APIC) += io_apic.o 8obj-$(CONFIG_X86_IO_APIC) += io_apic.o
12obj-$(CONFIG_SMP) += ipi.o 9obj-$(CONFIG_SMP) += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 850657d1b0ed..06c196d7e59c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -31,7 +31,6 @@
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/cpu.h> 32#include <linux/cpu.h>
33#include <linux/dmi.h> 33#include <linux/dmi.h>
34#include <linux/nmi.h>
35#include <linux/smp.h> 34#include <linux/smp.h>
36#include <linux/mm.h> 35#include <linux/mm.h>
37 36
@@ -50,9 +49,8 @@
50#include <asm/mtrr.h> 49#include <asm/mtrr.h>
51#include <asm/smp.h> 50#include <asm/smp.h>
52#include <asm/mce.h> 51#include <asm/mce.h>
53#include <asm/kvm_para.h>
54#include <asm/tsc.h> 52#include <asm/tsc.h>
55#include <asm/atomic.h> 53#include <asm/hypervisor.h>
56 54
57unsigned int num_processors; 55unsigned int num_processors;
58 56
@@ -433,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
433 reserved = reserve_eilvt_offset(offset, new); 431 reserved = reserve_eilvt_offset(offset, new);
434 432
435 if (reserved != new) { 433 if (reserved != new) {
436 pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " 434 pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
437 "vector 0x%x was already reserved by another core, " 435 "vector 0x%x, but the register is already in use for "
438 "APIC%lX=0x%x\n", 436 "vector 0x%x on another cpu\n",
439 smp_processor_id(), new, reserved, reg, old); 437 smp_processor_id(), reg, offset, new, reserved);
440 return -EINVAL; 438 return -EINVAL;
441 } 439 }
442 440
443 if (!eilvt_entry_is_changeable(old, new)) { 441 if (!eilvt_entry_is_changeable(old, new)) {
444 pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " 442 pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
445 "register already in use, APIC%lX=0x%x\n", 443 "vector 0x%x, but the register is already in use for "
446 smp_processor_id(), new, reg, old); 444 "vector 0x%x on this cpu\n",
445 smp_processor_id(), reg, offset, new, old);
447 return -EBUSY; 446 return -EBUSY;
448 } 447 }
449 448
@@ -517,7 +516,7 @@ static void __cpuinit setup_APIC_timer(void)
517{ 516{
518 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 517 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
519 518
520 if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) { 519 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
521 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 520 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
522 /* Make LAPIC timer preferrable over percpu HPET */ 521 /* Make LAPIC timer preferrable over percpu HPET */
523 lapic_clockevent.rating = 150; 522 lapic_clockevent.rating = 150;
@@ -685,7 +684,7 @@ static int __init calibrate_APIC_clock(void)
685 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 684 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
686 lapic_clockevent.shift); 685 lapic_clockevent.shift);
687 lapic_clockevent.max_delta_ns = 686 lapic_clockevent.max_delta_ns =
688 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); 687 clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
689 lapic_clockevent.min_delta_ns = 688 lapic_clockevent.min_delta_ns =
690 clockevent_delta2ns(0xF, &lapic_clockevent); 689 clockevent_delta2ns(0xF, &lapic_clockevent);
691 690
@@ -800,11 +799,7 @@ void __init setup_boot_APIC_clock(void)
800 * PIT/HPET going. Otherwise register lapic as a dummy 799 * PIT/HPET going. Otherwise register lapic as a dummy
801 * device. 800 * device.
802 */ 801 */
803 if (nmi_watchdog != NMI_IO_APIC) 802 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
804 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
805 else
806 pr_warning("APIC timer registered as dummy,"
807 " due to nmi_watchdog=%d!\n", nmi_watchdog);
808 803
809 /* Setup the lapic or request the broadcast */ 804 /* Setup the lapic or request the broadcast */
810 setup_APIC_timer(); 805 setup_APIC_timer();
@@ -1196,12 +1191,15 @@ static void __cpuinit lapic_setup_esr(void)
1196 oldvalue, value); 1191 oldvalue, value);
1197} 1192}
1198 1193
1199
1200/** 1194/**
1201 * setup_local_APIC - setup the local APIC 1195 * setup_local_APIC - setup the local APIC
1196 *
1197 * Used to setup local APIC while initializing BSP or bringing up APs.
1198 * Always called with preemption disabled.
1202 */ 1199 */
1203void __cpuinit setup_local_APIC(void) 1200void __cpuinit setup_local_APIC(void)
1204{ 1201{
1202 int cpu = smp_processor_id();
1205 unsigned int value, queued; 1203 unsigned int value, queued;
1206 int i, j, acked = 0; 1204 int i, j, acked = 0;
1207 unsigned long long tsc = 0, ntsc; 1205 unsigned long long tsc = 0, ntsc;
@@ -1226,8 +1224,6 @@ void __cpuinit setup_local_APIC(void)
1226#endif 1224#endif
1227 perf_events_lapic_init(); 1225 perf_events_lapic_init();
1228 1226
1229 preempt_disable();
1230
1231 /* 1227 /*
1232 * Double-check whether this APIC is really registered. 1228 * Double-check whether this APIC is really registered.
1233 * This is meaningless in clustered apic mode, so we skip it. 1229 * This is meaningless in clustered apic mode, so we skip it.
@@ -1343,21 +1339,19 @@ void __cpuinit setup_local_APIC(void)
1343 * TODO: set up through-local-APIC from through-I/O-APIC? --macro 1339 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
1344 */ 1340 */
1345 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; 1341 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
1346 if (!smp_processor_id() && (pic_mode || !value)) { 1342 if (!cpu && (pic_mode || !value)) {
1347 value = APIC_DM_EXTINT; 1343 value = APIC_DM_EXTINT;
1348 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", 1344 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
1349 smp_processor_id());
1350 } else { 1345 } else {
1351 value = APIC_DM_EXTINT | APIC_LVT_MASKED; 1346 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
1352 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1347 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
1353 smp_processor_id());
1354 } 1348 }
1355 apic_write(APIC_LVT0, value); 1349 apic_write(APIC_LVT0, value);
1356 1350
1357 /* 1351 /*
1358 * only the BP should see the LINT1 NMI signal, obviously. 1352 * only the BP should see the LINT1 NMI signal, obviously.
1359 */ 1353 */
1360 if (!smp_processor_id()) 1354 if (!cpu)
1361 value = APIC_DM_NMI; 1355 value = APIC_DM_NMI;
1362 else 1356 else
1363 value = APIC_DM_NMI | APIC_LVT_MASKED; 1357 value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -1365,11 +1359,9 @@ void __cpuinit setup_local_APIC(void)
1365 value |= APIC_LVT_LEVEL_TRIGGER; 1359 value |= APIC_LVT_LEVEL_TRIGGER;
1366 apic_write(APIC_LVT1, value); 1360 apic_write(APIC_LVT1, value);
1367 1361
1368 preempt_enable();
1369
1370#ifdef CONFIG_X86_MCE_INTEL 1362#ifdef CONFIG_X86_MCE_INTEL
1371 /* Recheck CMCI information after local APIC is up on CPU #0 */ 1363 /* Recheck CMCI information after local APIC is up on CPU #0 */
1372 if (smp_processor_id() == 0) 1364 if (!cpu)
1373 cmci_recheck(); 1365 cmci_recheck();
1374#endif 1366#endif
1375} 1367}
@@ -1388,8 +1380,15 @@ void __cpuinit end_local_APIC_setup(void)
1388 } 1380 }
1389#endif 1381#endif
1390 1382
1391 setup_apic_nmi_watchdog(NULL);
1392 apic_pm_activate(); 1383 apic_pm_activate();
1384
1385 /*
1386 * Now that local APIC setup is completed for BP, configure the fault
1387 * handling for interrupt remapping.
1388 */
1389 if (!smp_processor_id() && intr_remapping_enabled)
1390 enable_drhd_fault_handling();
1391
1393} 1392}
1394 1393
1395#ifdef CONFIG_X86_X2APIC 1394#ifdef CONFIG_X86_X2APIC
@@ -1477,7 +1476,8 @@ void __init enable_IR_x2apic(void)
1477 /* IR is required if there is APIC ID > 255 even when running 1476 /* IR is required if there is APIC ID > 255 even when running
1478 * under KVM 1477 * under KVM
1479 */ 1478 */
1480 if (max_physical_apicid > 255 || !kvm_para_available()) 1479 if (max_physical_apicid > 255 ||
1480 !hypervisor_x2apic_available())
1481 goto nox2apic; 1481 goto nox2apic;
1482 /* 1482 /*
1483 * without IR all CPUs can be addressed by IOAPIC/MSI 1483 * without IR all CPUs can be addressed by IOAPIC/MSI
@@ -1531,13 +1531,60 @@ static int __init detect_init_APIC(void)
1531 return 0; 1531 return 0;
1532} 1532}
1533#else 1533#else
1534
1535static int apic_verify(void)
1536{
1537 u32 features, h, l;
1538
1539 /*
1540 * The APIC feature bit should now be enabled
1541 * in `cpuid'
1542 */
1543 features = cpuid_edx(1);
1544 if (!(features & (1 << X86_FEATURE_APIC))) {
1545 pr_warning("Could not enable APIC!\n");
1546 return -1;
1547 }
1548 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1549 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1550
1551 /* The BIOS may have set up the APIC at some other address */
1552 rdmsr(MSR_IA32_APICBASE, l, h);
1553 if (l & MSR_IA32_APICBASE_ENABLE)
1554 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1555
1556 pr_info("Found and enabled local APIC!\n");
1557 return 0;
1558}
1559
1560int apic_force_enable(void)
1561{
1562 u32 h, l;
1563
1564 if (disable_apic)
1565 return -1;
1566
1567 /*
1568 * Some BIOSes disable the local APIC in the APIC_BASE
1569 * MSR. This can only be done in software for Intel P6 or later
1570 * and AMD K7 (Model > 1) or later.
1571 */
1572 rdmsr(MSR_IA32_APICBASE, l, h);
1573 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1574 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1575 l &= ~MSR_IA32_APICBASE_BASE;
1576 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1577 wrmsr(MSR_IA32_APICBASE, l, h);
1578 enabled_via_apicbase = 1;
1579 }
1580 return apic_verify();
1581}
1582
1534/* 1583/*
1535 * Detect and initialize APIC 1584 * Detect and initialize APIC
1536 */ 1585 */
1537static int __init detect_init_APIC(void) 1586static int __init detect_init_APIC(void)
1538{ 1587{
1539 u32 h, l, features;
1540
1541 /* Disabled by kernel option? */ 1588 /* Disabled by kernel option? */
1542 if (disable_apic) 1589 if (disable_apic)
1543 return -1; 1590 return -1;
@@ -1567,38 +1614,12 @@ static int __init detect_init_APIC(void)
1567 "you can enable it with \"lapic\"\n"); 1614 "you can enable it with \"lapic\"\n");
1568 return -1; 1615 return -1;
1569 } 1616 }
1570 /* 1617 if (apic_force_enable())
1571 * Some BIOSes disable the local APIC in the APIC_BASE 1618 return -1;
1572 * MSR. This can only be done in software for Intel P6 or later 1619 } else {
1573 * and AMD K7 (Model > 1) or later. 1620 if (apic_verify())
1574 */ 1621 return -1;
1575 rdmsr(MSR_IA32_APICBASE, l, h);
1576 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1577 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1578 l &= ~MSR_IA32_APICBASE_BASE;
1579 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1580 wrmsr(MSR_IA32_APICBASE, l, h);
1581 enabled_via_apicbase = 1;
1582 }
1583 }
1584 /*
1585 * The APIC feature bit should now be enabled
1586 * in `cpuid'
1587 */
1588 features = cpuid_edx(1);
1589 if (!(features & (1 << X86_FEATURE_APIC))) {
1590 pr_warning("Could not enable APIC!\n");
1591 return -1;
1592 } 1622 }
1593 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1594 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1595
1596 /* The BIOS may have set up the APIC at some other address */
1597 rdmsr(MSR_IA32_APICBASE, l, h);
1598 if (l & MSR_IA32_APICBASE_ENABLE)
1599 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1600
1601 pr_info("Found and enabled local APIC!\n");
1602 1623
1603 apic_pm_activate(); 1624 apic_pm_activate();
1604 1625
@@ -1610,28 +1631,6 @@ no_apic:
1610} 1631}
1611#endif 1632#endif
1612 1633
1613#ifdef CONFIG_X86_64
1614void __init early_init_lapic_mapping(void)
1615{
1616 /*
1617 * If no local APIC can be found then go out
1618 * : it means there is no mpatable and MADT
1619 */
1620 if (!smp_found_config)
1621 return;
1622
1623 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
1624 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1625 APIC_BASE, mp_lapic_addr);
1626
1627 /*
1628 * Fetch the APIC ID of the BSP in case we have a
1629 * default configuration (or the MP table is broken).
1630 */
1631 boot_cpu_physical_apicid = read_apic_id();
1632}
1633#endif
1634
1635/** 1634/**
1636 * init_apic_mappings - initialize APIC mappings 1635 * init_apic_mappings - initialize APIC mappings
1637 */ 1636 */
@@ -1657,10 +1656,7 @@ void __init init_apic_mappings(void)
1657 * acpi_register_lapic_address() 1656 * acpi_register_lapic_address()
1658 */ 1657 */
1659 if (!acpi_lapic && !smp_found_config) 1658 if (!acpi_lapic && !smp_found_config)
1660 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1659 register_lapic_address(apic_phys);
1661
1662 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1663 APIC_BASE, apic_phys);
1664 } 1660 }
1665 1661
1666 /* 1662 /*
@@ -1682,11 +1678,27 @@ void __init init_apic_mappings(void)
1682 } 1678 }
1683} 1679}
1684 1680
1681void __init register_lapic_address(unsigned long address)
1682{
1683 mp_lapic_addr = address;
1684
1685 if (!x2apic_mode) {
1686 set_fixmap_nocache(FIX_APIC_BASE, address);
1687 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1688 APIC_BASE, mp_lapic_addr);
1689 }
1690 if (boot_cpu_physical_apicid == -1U) {
1691 boot_cpu_physical_apicid = read_apic_id();
1692 apic_version[boot_cpu_physical_apicid] =
1693 GET_APIC_VERSION(apic_read(APIC_LVR));
1694 }
1695}
1696
1685/* 1697/*
1686 * This initializes the IO-APIC and APIC hardware if this is 1698 * This initializes the IO-APIC and APIC hardware if this is
1687 * a UP kernel. 1699 * a UP kernel.
1688 */ 1700 */
1689int apic_version[MAX_APICS]; 1701int apic_version[MAX_LOCAL_APIC];
1690 1702
1691int __init APIC_init_uniprocessor(void) 1703int __init APIC_init_uniprocessor(void)
1692{ 1704{
@@ -1751,17 +1763,10 @@ int __init APIC_init_uniprocessor(void)
1751 setup_IO_APIC(); 1763 setup_IO_APIC();
1752 else { 1764 else {
1753 nr_ioapics = 0; 1765 nr_ioapics = 0;
1754 localise_nmi_watchdog();
1755 } 1766 }
1756#else
1757 localise_nmi_watchdog();
1758#endif 1767#endif
1759 1768
1760 x86_init.timers.setup_percpu_clockev(); 1769 x86_init.timers.setup_percpu_clockev();
1761#ifdef CONFIG_X86_64
1762 check_nmi_watchdog();
1763#endif
1764
1765 return 0; 1770 return 0;
1766} 1771}
1767 1772
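Background for the detect_init_APIC() split above: apic_verify() and apic_force_enable() both revolve around the IA32_APIC_BASE MSR. A hedged, standalone C sketch of how those MSR fields are commonly interpreted follows (bit positions per the x86 architecture manuals: bit 8 = BSP, bit 10 = x2APIC enable, bit 11 = APIC global enable, bits 12 and up = base address); the address mask assumes a 36-bit physical address width purely for illustration, and the kernel of course reads the real value with rdmsr().

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define APICBASE_BSP		(1ULL << 8)
#define APICBASE_X2APIC_ENABLE	(1ULL << 10)
#define APICBASE_ENABLE		(1ULL << 11)
#define APICBASE_ADDR_MASK	0xFFFFFF000ULL	/* illustrative: 36-bit phys addr */

int main(void)
{
	/* typical BSP value: APIC enabled at the default base 0xFEE00000 */
	uint64_t msr = 0xFEE00900ULL;

	bool enabled = msr & APICBASE_ENABLE;
	uint64_t base = msr & APICBASE_ADDR_MASK;

	printf("APIC %s, base 0x%llx, %s\n",
	       enabled ? "enabled" : "disabled",
	       (unsigned long long)base,
	       (msr & APICBASE_BSP) ? "BSP" : "AP");
	return 0;
}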
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd6942f0e9..79fd43ca6f96 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -17,19 +17,31 @@
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19 19
20/* For reliability, we're prepared to waste bits here. */ 20#ifdef CONFIG_HARDLOCKUP_DETECTOR
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22
23u64 hw_nmi_get_sample_period(void) 21u64 hw_nmi_get_sample_period(void)
24{ 22{
25 return (u64)(cpu_khz) * 1000 * 60; 23 return (u64)(cpu_khz) * 1000 * 60;
26} 24}
25#endif
26
27#ifdef arch_trigger_all_cpu_backtrace
28/* For reliability, we're prepared to waste bits here. */
29static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
30
31/* "in progress" flag of arch_trigger_all_cpu_backtrace */
32static unsigned long backtrace_flag;
27 33
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void) 34void arch_trigger_all_cpu_backtrace(void)
30{ 35{
31 int i; 36 int i;
32 37
38 if (test_and_set_bit(0, &backtrace_flag))
39 /*
40 * If there is already a trigger_all_cpu_backtrace() in progress
41 * (backtrace_flag == 1), don't output double cpu dump infos.
42 */
43 return;
44
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); 45 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34 46
35 printk(KERN_INFO "sending NMI to all CPUs:\n"); 47 printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -41,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void)
41 break; 53 break;
42 mdelay(1); 54 mdelay(1);
43 } 55 }
56
57 clear_bit(0, &backtrace_flag);
58 smp_mb__after_clear_bit();
44} 59}
45 60
46static int __kprobes 61static int __kprobes
@@ -49,11 +64,10 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
49{ 64{
50 struct die_args *args = __args; 65 struct die_args *args = __args;
51 struct pt_regs *regs; 66 struct pt_regs *regs;
52 int cpu = smp_processor_id(); 67 int cpu;
53 68
54 switch (cmd) { 69 switch (cmd) {
55 case DIE_NMI: 70 case DIE_NMI:
56 case DIE_NMI_IPI:
57 break; 71 break;
58 72
59 default: 73 default:
@@ -61,6 +75,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
61 } 75 }
62 76
63 regs = args->regs; 77 regs = args->regs;
78 cpu = smp_processor_id();
64 79
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 80 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; 81 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -80,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
80static __read_mostly struct notifier_block backtrace_notifier = { 95static __read_mostly struct notifier_block backtrace_notifier = {
81 .notifier_call = arch_trigger_all_cpu_backtrace_handler, 96 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
82 .next = NULL, 97 .next = NULL,
83 .priority = 1 98 .priority = NMI_LOCAL_LOW_PRIOR,
84}; 99};
85 100
86static int __init register_trigger_all_cpu_backtrace(void) 101static int __init register_trigger_all_cpu_backtrace(void)
@@ -90,18 +105,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
90} 105}
91early_initcall(register_trigger_all_cpu_backtrace); 106early_initcall(register_trigger_all_cpu_backtrace);
92#endif 107#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7cc0a721f628..697dc34b7b87 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -54,7 +54,6 @@
54#include <asm/dma.h> 54#include <asm/dma.h>
55#include <asm/timer.h> 55#include <asm/timer.h>
56#include <asm/i8259.h> 56#include <asm/i8259.h>
57#include <asm/nmi.h>
58#include <asm/msidef.h> 57#include <asm/msidef.h>
59#include <asm/hypertransport.h> 58#include <asm/hypertransport.h>
60#include <asm/setup.h> 59#include <asm/setup.h>
@@ -126,6 +125,26 @@ static int __init parse_noapic(char *str)
126} 125}
127early_param("noapic", parse_noapic); 126early_param("noapic", parse_noapic);
128 127
128/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
129void mp_save_irq(struct mpc_intsrc *m)
130{
131 int i;
132
133 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
134 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
135 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
136 m->srcbusirq, m->dstapic, m->dstirq);
137
138 for (i = 0; i < mp_irq_entries; i++) {
139 if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
140 return;
141 }
142
143 memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
144 if (++mp_irq_entries == MAX_IRQ_SOURCES)
145 panic("Max # of irq sources exceeded!!\n");
146}
147
129struct irq_pin_list { 148struct irq_pin_list {
130 int apic, pin; 149 int apic, pin;
131 struct irq_pin_list *next; 150 struct irq_pin_list *next;
@@ -136,6 +155,7 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
136 return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); 155 return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
137} 156}
138 157
158
139/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 159/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
140#ifdef CONFIG_SPARSE_IRQ 160#ifdef CONFIG_SPARSE_IRQ
141static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; 161static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
@@ -1934,8 +1954,7 @@ void disable_IO_APIC(void)
1934 * 1954 *
1935 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 1955 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1936 */ 1956 */
1937 1957void __init setup_ioapic_ids_from_mpc_nocheck(void)
1938void __init setup_ioapic_ids_from_mpc(void)
1939{ 1958{
1940 union IO_APIC_reg_00 reg_00; 1959 union IO_APIC_reg_00 reg_00;
1941 physid_mask_t phys_id_present_map; 1960 physid_mask_t phys_id_present_map;
@@ -1944,15 +1963,6 @@ void __init setup_ioapic_ids_from_mpc(void)
1944 unsigned char old_id; 1963 unsigned char old_id;
1945 unsigned long flags; 1964 unsigned long flags;
1946 1965
1947 if (acpi_ioapic)
1948 return;
1949 /*
1950 * Don't check I/O APIC IDs for xAPIC systems. They have
1951 * no meaning without the serial APIC bus.
1952 */
1953 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1954 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1955 return;
1956 /* 1966 /*
1957 * This is broken; anything with a real cpu count has to 1967 * This is broken; anything with a real cpu count has to
1958 * circumvent this idiocy regardless. 1968 * circumvent this idiocy regardless.
@@ -2006,7 +2016,6 @@ void __init setup_ioapic_ids_from_mpc(void)
2006 physids_or(phys_id_present_map, phys_id_present_map, tmp); 2016 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2007 } 2017 }
2008 2018
2009
2010 /* 2019 /*
2011 * We need to adjust the IRQ routing table 2020 * We need to adjust the IRQ routing table
2012 * if the ID changed. 2021 * if the ID changed.
@@ -2018,9 +2027,12 @@ void __init setup_ioapic_ids_from_mpc(void)
2018 = mp_ioapics[apic_id].apicid; 2027 = mp_ioapics[apic_id].apicid;
2019 2028
2020 /* 2029 /*
2021 * Read the right value from the MPC table and 2030 * Update the ID register according to the right value
2022 * write it into the ID register. 2031 * from the MPC table if they are different.
2023 */ 2032 */
2033 if (mp_ioapics[apic_id].apicid == reg_00.bits.ID)
2034 continue;
2035
2024 apic_printk(APIC_VERBOSE, KERN_INFO 2036 apic_printk(APIC_VERBOSE, KERN_INFO
2025 "...changing IO-APIC physical APIC ID to %d ...", 2037 "...changing IO-APIC physical APIC ID to %d ...",
2026 mp_ioapics[apic_id].apicid); 2038 mp_ioapics[apic_id].apicid);
@@ -2042,6 +2054,21 @@ void __init setup_ioapic_ids_from_mpc(void)
2042 apic_printk(APIC_VERBOSE, " ok.\n"); 2054 apic_printk(APIC_VERBOSE, " ok.\n");
2043 } 2055 }
2044} 2056}
2057
2058void __init setup_ioapic_ids_from_mpc(void)
2059{
2060
2061 if (acpi_ioapic)
2062 return;
2063 /*
2064 * Don't check I/O APIC IDs for xAPIC systems. They have
2065 * no meaning without the serial APIC bus.
2066 */
2067 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2068 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
2069 return;
2070 setup_ioapic_ids_from_mpc_nocheck();
2071}
2045#endif 2072#endif
2046 2073
2047int no_timer_check __initdata; 2074int no_timer_check __initdata;
@@ -2302,7 +2329,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2302 unsigned int irr; 2329 unsigned int irr;
2303 struct irq_desc *desc; 2330 struct irq_desc *desc;
2304 struct irq_cfg *cfg; 2331 struct irq_cfg *cfg;
2305 irq = __get_cpu_var(vector_irq)[vector]; 2332 irq = __this_cpu_read(vector_irq[vector]);
2306 2333
2307 if (irq == -1) 2334 if (irq == -1)
2308 continue; 2335 continue;
@@ -2336,7 +2363,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2336 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); 2363 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2337 goto unlock; 2364 goto unlock;
2338 } 2365 }
2339 __get_cpu_var(vector_irq)[vector] = -1; 2366 __this_cpu_write(vector_irq[vector], -1);
2340unlock: 2367unlock:
2341 raw_spin_unlock(&desc->lock); 2368 raw_spin_unlock(&desc->lock);
2342 } 2369 }
@@ -2430,13 +2457,12 @@ static void ack_apic_level(struct irq_data *data)
2430{ 2457{
2431 struct irq_cfg *cfg = data->chip_data; 2458 struct irq_cfg *cfg = data->chip_data;
2432 int i, do_unmask_irq = 0, irq = data->irq; 2459 int i, do_unmask_irq = 0, irq = data->irq;
2433 struct irq_desc *desc = irq_to_desc(irq);
2434 unsigned long v; 2460 unsigned long v;
2435 2461
2436 irq_complete_move(cfg); 2462 irq_complete_move(cfg);
2437#ifdef CONFIG_GENERIC_PENDING_IRQ 2463#ifdef CONFIG_GENERIC_PENDING_IRQ
2438 /* If we are moving the irq we need to mask it */ 2464 /* If we are moving the irq we need to mask it */
2439 if (unlikely(desc->status & IRQ_MOVE_PENDING)) { 2465 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
2440 do_unmask_irq = 1; 2466 do_unmask_irq = 1;
2441 mask_ioapic(cfg); 2467 mask_ioapic(cfg);
2442 } 2468 }
@@ -2643,24 +2669,6 @@ static void lapic_register_intr(int irq)
2643 "edge"); 2669 "edge");
2644} 2670}
2645 2671
2646static void __init setup_nmi(void)
2647{
2648 /*
2649 * Dirty trick to enable the NMI watchdog ...
2650 * We put the 8259A master into AEOI mode and
2651 * unmask on all local APICs LVT0 as NMI.
2652 *
2653 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2654 * is from Maciej W. Rozycki - so we do not have to EOI from
2655 * the NMI handler or the timer interrupt.
2656 */
2657 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2658
2659 enable_NMI_through_LVT0();
2660
2661 apic_printk(APIC_VERBOSE, " done.\n");
2662}
2663
2664/* 2672/*
2665 * This looks a bit hackish but it's about the only one way of sending 2673 * This looks a bit hackish but it's about the only one way of sending
2666 * a few INTA cycles to 8259As and any associated glue logic. ICR does 2674 * a few INTA cycles to 8259As and any associated glue logic. ICR does
@@ -2766,15 +2774,6 @@ static inline void __init check_timer(void)
2766 */ 2774 */
2767 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2775 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2768 legacy_pic->init(1); 2776 legacy_pic->init(1);
2769#ifdef CONFIG_X86_32
2770 {
2771 unsigned int ver;
2772
2773 ver = apic_read(APIC_LVR);
2774 ver = GET_APIC_VERSION(ver);
2775 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2776 }
2777#endif
2778 2777
2779 pin1 = find_isa_irq_pin(0, mp_INT); 2778 pin1 = find_isa_irq_pin(0, mp_INT);
2780 apic1 = find_isa_irq_apic(0, mp_INT); 2779 apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2822,10 +2821,6 @@ static inline void __init check_timer(void)
2822 unmask_ioapic(cfg); 2821 unmask_ioapic(cfg);
2823 } 2822 }
2824 if (timer_irq_works()) { 2823 if (timer_irq_works()) {
2825 if (nmi_watchdog == NMI_IO_APIC) {
2826 setup_nmi();
2827 legacy_pic->unmask(0);
2828 }
2829 if (disable_timer_pin_1 > 0) 2824 if (disable_timer_pin_1 > 0)
2830 clear_IO_APIC_pin(0, pin1); 2825 clear_IO_APIC_pin(0, pin1);
2831 goto out; 2826 goto out;
@@ -2851,11 +2846,6 @@ static inline void __init check_timer(void)
2851 if (timer_irq_works()) { 2846 if (timer_irq_works()) {
2852 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2847 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2853 timer_through_8259 = 1; 2848 timer_through_8259 = 1;
2854 if (nmi_watchdog == NMI_IO_APIC) {
2855 legacy_pic->mask(0);
2856 setup_nmi();
2857 legacy_pic->unmask(0);
2858 }
2859 goto out; 2849 goto out;
2860 } 2850 }
2861 /* 2851 /*
@@ -2867,15 +2857,6 @@ static inline void __init check_timer(void)
2867 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2857 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2868 } 2858 }
2869 2859
2870 if (nmi_watchdog == NMI_IO_APIC) {
2871 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2872 "through the IO-APIC - disabling NMI Watchdog!\n");
2873 nmi_watchdog = NMI_NONE;
2874 }
2875#ifdef CONFIG_X86_32
2876 timer_ack = 0;
2877#endif
2878
2879 apic_printk(APIC_QUIET, KERN_INFO 2860 apic_printk(APIC_QUIET, KERN_INFO
2880 "...trying to set up timer as Virtual Wire IRQ...\n"); 2861 "...trying to set up timer as Virtual Wire IRQ...\n");
2881 2862
@@ -3413,6 +3394,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3413 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3394 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3414 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3395 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3415 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3396 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3397 msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
3416 3398
3417 dmar_msi_write(irq, &msg); 3399 dmar_msi_write(irq, &msg);
3418 3400
@@ -3639,7 +3621,7 @@ int __init io_apic_get_redir_entries (int ioapic)
3639 return reg_01.bits.entries + 1; 3621 return reg_01.bits.entries + 1;
3640} 3622}
3641 3623
3642void __init probe_nr_irqs_gsi(void) 3624static void __init probe_nr_irqs_gsi(void)
3643{ 3625{
3644 int nr; 3626 int nr;
3645 3627
@@ -3956,7 +3938,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
3956 return res; 3938 return res;
3957} 3939}
3958 3940
3959void __init ioapic_init_mappings(void) 3941void __init ioapic_and_gsi_init(void)
3960{ 3942{
3961 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 3943 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3962 struct resource *ioapic_res; 3944 struct resource *ioapic_res;
@@ -3994,6 +3976,8 @@ fake_ioapic_page:
3994 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; 3976 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
3995 ioapic_res++; 3977 ioapic_res++;
3996 } 3978 }
3979
3980 probe_nr_irqs_gsi();
3997} 3981}
3998 3982
3999void __init ioapic_insert_resources(void) 3983void __init ioapic_insert_resources(void)
@@ -4103,7 +4087,8 @@ void __init pre_init_apic_IRQ0(void)
4103 4087
4104 printk(KERN_INFO "Early APIC setup for system timer0\n"); 4088 printk(KERN_INFO "Early APIC setup for system timer0\n");
4105#ifndef CONFIG_SMP 4089#ifndef CONFIG_SMP
4106 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 4090 physid_set_mask_of_physid(boot_cpu_physical_apicid,
4091 &phys_cpu_present_map);
4107#endif 4092#endif
4108 /* Make sure the irq descriptor is set up */ 4093 /* Make sure the irq descriptor is set up */
4109 cfg = alloc_irq_and_cfg_at(0, 0); 4094 cfg = alloc_irq_and_cfg_at(0, 0);
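The new mp_save_irq() in io_apic.c above appends an interrupt-source record to a fixed-size table, skipping exact duplicates (memcmp against every stored entry) and refusing to grow past the limit. A hedged, self-contained sketch of that pattern follows; the record type and its fields are made up for illustration and do not mirror struct mpc_intsrc.

#include <string.h>
#include <stdio.h>

#define MAX_RECORDS 4

struct irq_record {			/* illustrative stand-in only */
	int bus, irq, apic, pin;
};

static struct irq_record records[MAX_RECORDS];
static int nr_records;

static int save_record(const struct irq_record *r)
{
	int i;

	for (i = 0; i < nr_records; i++)
		if (!memcmp(&records[i], r, sizeof(*r)))
			return 0;		/* already stored, nothing to do */

	if (nr_records == MAX_RECORDS)
		return -1;			/* table full (the kernel panics at this point) */

	records[nr_records++] = *r;
	return 0;
}

int main(void)
{
	struct irq_record r = { .bus = 0, .irq = 9, .apic = 2, .pin = 9 };

	save_record(&r);
	save_record(&r);			/* duplicate: table still holds one entry */
	printf("%d record(s)\n", nr_records);
	return 0;
}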
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index c90041ccb742..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
1/*
2 * NMI watchdog support on APIC systems
3 *
4 * Started by Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
10 * Pavel Machek and
11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
12 */
13
14#include <asm/apic.h>
15
16#include <linux/nmi.h>
17#include <linux/mm.h>
18#include <linux/delay.h>
19#include <linux/interrupt.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/sysdev.h>
23#include <linux/sysctl.h>
24#include <linux/percpu.h>
25#include <linux/kprobes.h>
26#include <linux/cpumask.h>
27#include <linux/kernel_stat.h>
28#include <linux/kdebug.h>
29#include <linux/smp.h>
30
31#include <asm/i8259.h>
32#include <asm/io_apic.h>
33#include <asm/proto.h>
34#include <asm/timer.h>
35
36#include <asm/mce.h>
37
38#include <asm/mach_traps.h>
39
40int unknown_nmi_panic;
41int nmi_watchdog_enabled;
42
43/* For reliability, we're prepared to waste bits here. */
44static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
45
46/* nmi_active:
47 * >0: the lapic NMI watchdog is active, but can be disabled
48 * <0: the lapic NMI watchdog has not been set up, and cannot
49 * be enabled
50 * 0: the lapic NMI watchdog is disabled, but can be enabled
51 */
52atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
53EXPORT_SYMBOL(nmi_active);
54
55unsigned int nmi_watchdog = NMI_NONE;
56EXPORT_SYMBOL(nmi_watchdog);
57
58static int panic_on_timeout;
59
60static unsigned int nmi_hz = HZ;
61static DEFINE_PER_CPU(short, wd_enabled);
62static int endflag __initdata;
63
64static inline unsigned int get_nmi_count(int cpu)
65{
66 return per_cpu(irq_stat, cpu).__nmi_count;
67}
68
69static inline int mce_in_progress(void)
70{
71#if defined(CONFIG_X86_MCE)
72 return atomic_read(&mce_entry) > 0;
73#endif
74 return 0;
75}
76
77/*
78 * Take the local apic timer and PIT/HPET into account. We don't
79 * know which one is active, when we have highres/dyntick on
80 */
81static inline unsigned int get_timer_irqs(int cpu)
82{
83 return per_cpu(irq_stat, cpu).apic_timer_irqs +
84 per_cpu(irq_stat, cpu).irq0_irqs;
85}
86
87#ifdef CONFIG_SMP
88/*
89 * The performance counters used by NMI_LOCAL_APIC don't trigger when
90 * the CPU is idle. To make sure the NMI watchdog really ticks on all
91 * CPUs during the test make them busy.
92 */
93static __init void nmi_cpu_busy(void *data)
94{
95 local_irq_enable_in_hardirq();
96 /*
97 * Intentionally don't use cpu_relax here. This is
98 * to make sure that the performance counter really ticks,
99 * even if there is a simulator or similar that catches the
100 * pause instruction. On a real HT machine this is fine because
101 * all other CPUs are busy with "useless" delay loops and don't
102 * care if they get somewhat less cycles.
103 */
104 while (endflag == 0)
105 mb();
106}
107#endif
108
109static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
110{
111 printk(KERN_CONT "\n");
112
113 printk(KERN_WARNING
114 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
116
117 printk(KERN_WARNING
118 "Please report this to bugzilla.kernel.org,\n");
119 printk(KERN_WARNING
120 "and attach the output of the 'dmesg' command.\n");
121
122 per_cpu(wd_enabled, cpu) = 0;
123 atomic_dec(&nmi_active);
124}
125
126static void __acpi_nmi_disable(void *__unused)
127{
128 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
129}
130
131int __init check_nmi_watchdog(void)
132{
133 unsigned int *prev_nmi_count;
134 int cpu;
135
136 if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
137 return 0;
138
139 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
140 if (!prev_nmi_count)
141 goto error;
142
143 printk(KERN_INFO "Testing NMI watchdog ... ");
144
145#ifdef CONFIG_SMP
146 if (nmi_watchdog == NMI_LOCAL_APIC)
147 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
148#endif
149
150 for_each_possible_cpu(cpu)
151 prev_nmi_count[cpu] = get_nmi_count(cpu);
152 local_irq_enable();
153 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
154
155 for_each_online_cpu(cpu) {
156 if (!per_cpu(wd_enabled, cpu))
157 continue;
158 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
159 report_broken_nmi(cpu, prev_nmi_count);
160 }
161 endflag = 1;
162 if (!atomic_read(&nmi_active)) {
163 kfree(prev_nmi_count);
164 atomic_set(&nmi_active, -1);
165 goto error;
166 }
167 printk("OK.\n");
168
169 /*
170 * now that we know it works we can reduce NMI frequency to
171 * something more reasonable; makes a difference in some configs
172 */
173 if (nmi_watchdog == NMI_LOCAL_APIC)
174 nmi_hz = lapic_adjust_nmi_hz(1);
175
176 kfree(prev_nmi_count);
177 return 0;
178error:
179 if (nmi_watchdog == NMI_IO_APIC) {
180 if (!timer_through_8259)
181 legacy_pic->mask(0);
182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
183 }
184
185#ifdef CONFIG_X86_32
186 timer_ack = 0;
187#endif
188 return -1;
189}
190
191static int __init setup_nmi_watchdog(char *str)
192{
193 unsigned int nmi;
194
195 if (!strncmp(str, "panic", 5)) {
196 panic_on_timeout = 1;
197 str = strchr(str, ',');
198 if (!str)
199 return 1;
200 ++str;
201 }
202
203 if (!strncmp(str, "lapic", 5))
204 nmi_watchdog = NMI_LOCAL_APIC;
205 else if (!strncmp(str, "ioapic", 6))
206 nmi_watchdog = NMI_IO_APIC;
207 else {
208 get_option(&str, &nmi);
209 if (nmi >= NMI_INVALID)
210 return 0;
211 nmi_watchdog = nmi;
212 }
213
214 return 1;
215}
216__setup("nmi_watchdog=", setup_nmi_watchdog);
217
218/*
219 * Suspend/resume support
220 */
221#ifdef CONFIG_PM
222
223static int nmi_pm_active; /* nmi_active before suspend */
224
225static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
226{
227 /* only CPU0 goes here, other CPUs should be offline */
228 nmi_pm_active = atomic_read(&nmi_active);
229 stop_apic_nmi_watchdog(NULL);
230 BUG_ON(atomic_read(&nmi_active) != 0);
231 return 0;
232}
233
234static int lapic_nmi_resume(struct sys_device *dev)
235{
236 /* only CPU0 goes here, other CPUs should be offline */
237 if (nmi_pm_active > 0) {
238 setup_apic_nmi_watchdog(NULL);
239 touch_nmi_watchdog();
240 }
241 return 0;
242}
243
244static struct sysdev_class nmi_sysclass = {
245 .name = "lapic_nmi",
246 .resume = lapic_nmi_resume,
247 .suspend = lapic_nmi_suspend,
248};
249
250static struct sys_device device_lapic_nmi = {
251 .id = 0,
252 .cls = &nmi_sysclass,
253};
254
255static int __init init_lapic_nmi_sysfs(void)
256{
257 int error;
258
259 /*
260 * should really be a BUG_ON but b/c this is an
261 * init call, it just doesn't work. -dcz
262 */
263 if (nmi_watchdog != NMI_LOCAL_APIC)
264 return 0;
265
266 if (atomic_read(&nmi_active) < 0)
267 return 0;
268
269 error = sysdev_class_register(&nmi_sysclass);
270 if (!error)
271 error = sysdev_register(&device_lapic_nmi);
272 return error;
273}
274
275/* must come after the local APIC's device_initcall() */
276late_initcall(init_lapic_nmi_sysfs);
277
278#endif /* CONFIG_PM */
279
280static void __acpi_nmi_enable(void *__unused)
281{
282 apic_write(APIC_LVT0, APIC_DM_NMI);
283}
284
285/*
286 * Enable timer based NMIs on all CPUs:
287 */
288void acpi_nmi_enable(void)
289{
290 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
291 on_each_cpu(__acpi_nmi_enable, NULL, 1);
292}
293
294/*
295 * Disable timer based NMIs on all CPUs:
296 */
297void acpi_nmi_disable(void)
298{
299 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
300 on_each_cpu(__acpi_nmi_disable, NULL, 1);
301}
302
303/*
304 * This function is called as soon the LAPIC NMI watchdog driver has everything
305 * in place and it's ready to check if the NMIs belong to the NMI watchdog
306 */
307void cpu_nmi_set_wd_enabled(void)
308{
309 __get_cpu_var(wd_enabled) = 1;
310}
311
312void setup_apic_nmi_watchdog(void *unused)
313{
314 if (__get_cpu_var(wd_enabled))
315 return;
316
317 /* cheap hack to support suspend/resume */
318 /* if cpu0 is not active neither should the other cpus */
319 if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
320 return;
321
322 switch (nmi_watchdog) {
323 case NMI_LOCAL_APIC:
324 if (lapic_watchdog_init(nmi_hz) < 0) {
325 __get_cpu_var(wd_enabled) = 0;
326 return;
327 }
328 /* FALL THROUGH */
329 case NMI_IO_APIC:
330 __get_cpu_var(wd_enabled) = 1;
331 atomic_inc(&nmi_active);
332 }
333}
334
335void stop_apic_nmi_watchdog(void *unused)
336{
337 /* only support LOCAL and IO APICs for now */
338 if (!nmi_watchdog_active())
339 return;
340 if (__get_cpu_var(wd_enabled) == 0)
341 return;
342 if (nmi_watchdog == NMI_LOCAL_APIC)
343 lapic_watchdog_stop();
344 else
345 __acpi_nmi_disable(NULL);
346 __get_cpu_var(wd_enabled) = 0;
347 atomic_dec(&nmi_active);
348}
349
350/*
351 * the best way to detect whether a CPU has a 'hard lockup' problem
352 * is to check it's local APIC timer IRQ counts. If they are not
353 * changing then that CPU has some problem.
354 *
355 * as these watchdog NMI IRQs are generated on every CPU, we only
356 * have to check the current processor.
357 *
358 * since NMIs don't listen to _any_ locks, we have to be extremely
359 * careful not to rely on unsafe variables. The printk might lock
360 * up though, so we have to break up any console locks first ...
361 * [when there will be more tty-related locks, break them up here too!]
362 */
363
364static DEFINE_PER_CPU(unsigned, last_irq_sum);
365static DEFINE_PER_CPU(long, alert_counter);
366static DEFINE_PER_CPU(int, nmi_touch);
367
368void touch_nmi_watchdog(void)
369{
370 if (nmi_watchdog_active()) {
371 unsigned cpu;
372
373 /*
374 * Tell other CPUs to reset their alert counters. We cannot
375 * do it ourselves because the alert count increase is not
376 * atomic.
377 */
378 for_each_present_cpu(cpu) {
379 if (per_cpu(nmi_touch, cpu) != 1)
380 per_cpu(nmi_touch, cpu) = 1;
381 }
382 }
383
384 /*
385 * Tickle the softlockup detector too:
386 */
387 touch_softlockup_watchdog();
388}
389EXPORT_SYMBOL(touch_nmi_watchdog);
390
391notrace __kprobes int
392nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
393{
394 /*
395 * Since current_thread_info()-> is always on the stack, and we
396 * always switch the stack NMI-atomically, it's safe to use
397 * smp_processor_id().
398 */
399 unsigned int sum;
400 int touched = 0;
401 int cpu = smp_processor_id();
402 int rc = 0;
403
404 sum = get_timer_irqs(cpu);
405
406 if (__get_cpu_var(nmi_touch)) {
407 __get_cpu_var(nmi_touch) = 0;
408 touched = 1;
409 }
410
411 /* We can be called before check_nmi_watchdog, hence NULL check. */
412 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
413 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
414
415 raw_spin_lock(&lock);
416 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
417 show_regs(regs);
418 dump_stack();
419 raw_spin_unlock(&lock);
420 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
421
422 rc = 1;
423 }
424
425 /* Could check oops_in_progress here too, but it's safer not to */
426 if (mce_in_progress())
427 touched = 1;
428
429 /* if the none of the timers isn't firing, this cpu isn't doing much */
430 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
431 /*
432 * Ayiee, looks like this CPU is stuck ...
433 * wait a few IRQs (5 seconds) before doing the oops ...
434 */
435 __this_cpu_inc(alert_counter);
436 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
437 /*
438 * die_nmi will return ONLY if NOTIFY_STOP happens..
439 */
440 die_nmi("BUG: NMI Watchdog detected LOCKUP",
441 regs, panic_on_timeout);
442 } else {
443 __get_cpu_var(last_irq_sum) = sum;
444 __this_cpu_write(alert_counter, 0);
445 }
446
447 /* see if the nmi watchdog went off */
448 if (!__get_cpu_var(wd_enabled))
449 return rc;
450 switch (nmi_watchdog) {
451 case NMI_LOCAL_APIC:
452 rc |= lapic_wd_event(nmi_hz);
453 break;
454 case NMI_IO_APIC:
455 /*
456 * don't know how to accurately check for this.
457 * just assume it was a watchdog timer interrupt
458 * This matches the old behaviour.
459 */
460 rc = 1;
461 break;
462 }
463 return rc;
464}
465
466#ifdef CONFIG_SYSCTL
467
468static void enable_ioapic_nmi_watchdog_single(void *unused)
469{
470 __get_cpu_var(wd_enabled) = 1;
471 atomic_inc(&nmi_active);
472 __acpi_nmi_enable(NULL);
473}
474
475static void enable_ioapic_nmi_watchdog(void)
476{
477 on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
478 touch_nmi_watchdog();
479}
480
481static void disable_ioapic_nmi_watchdog(void)
482{
483 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
484}
485
486static int __init setup_unknown_nmi_panic(char *str)
487{
488 unknown_nmi_panic = 1;
489 return 1;
490}
491__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
492
493static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
494{
495 unsigned char reason = get_nmi_reason();
496 char buf[64];
497
498 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
499 die_nmi(buf, regs, 1); /* Always panic here */
500 return 0;
501}
502
503/*
504 * proc handler for /proc/sys/kernel/nmi
505 */
506int proc_nmi_enabled(struct ctl_table *table, int write,
507 void __user *buffer, size_t *length, loff_t *ppos)
508{
509 int old_state;
510
511 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
512 old_state = nmi_watchdog_enabled;
513 proc_dointvec(table, write, buffer, length, ppos);
514 if (!!old_state == !!nmi_watchdog_enabled)
515 return 0;
516
517 if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
518 printk(KERN_WARNING
519 "NMI watchdog is permanently disabled\n");
520 return -EIO;
521 }
522
523 if (nmi_watchdog == NMI_LOCAL_APIC) {
524 if (nmi_watchdog_enabled)
525 enable_lapic_nmi_watchdog();
526 else
527 disable_lapic_nmi_watchdog();
528 } else if (nmi_watchdog == NMI_IO_APIC) {
529 if (nmi_watchdog_enabled)
530 enable_ioapic_nmi_watchdog();
531 else
532 disable_ioapic_nmi_watchdog();
533 } else {
534 printk(KERN_WARNING
535 "NMI watchdog doesn't know what hardware to touch\n");
536 return -EIO;
537 }
538 return 0;
539}
540
541#endif /* CONFIG_SYSCTL */
542
543int do_nmi_callback(struct pt_regs *regs, int cpu)
544{
545#ifdef CONFIG_SYSCTL
546 if (unknown_nmi_panic)
547 return unknown_nmi_panic_callback(regs, cpu);
548#endif
549 return 0;
550}
551
552void arch_trigger_all_cpu_backtrace(void)
553{
554 int i;
555
556 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
557
558 printk(KERN_INFO "sending NMI to all CPUs:\n");
559 apic->send_IPI_all(NMI_VECTOR);
560
561 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
562 for (i = 0; i < 10 * 1000; i++) {
563 if (cpumask_empty(to_cpumask(backtrace_mask)))
564 break;
565 mdelay(1);
566 }
567}
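The heart of the deleted watchdog (nmi_watchdog_tick() above) is a small heuristic: on every watchdog NMI, sample this CPU's timer-interrupt count; if it has not moved for roughly five seconds' worth of NMIs, assume a hard lockup. The hedged sketch below reproduces only that counter logic outside the kernel; NMI_HZ and the caller loop are illustrative stand-ins.

#include <stdio.h>

#define NMI_HZ 1		/* watchdog NMIs per second in this sketch */

static unsigned last_irq_sum;
static long alert_counter;

/* Returns 1 exactly when the lockup threshold is reached. */
static int watchdog_tick(unsigned timer_irqs_seen)
{
	if (timer_irqs_seen == last_irq_sum) {
		if (++alert_counter == 5 * NMI_HZ)
			return 1;	/* "BUG: NMI Watchdog detected LOCKUP" */
	} else {
		last_irq_sum = timer_irqs_seen;
		alert_counter = 0;
	}
	return 0;
}

int main(void)
{
	unsigned irqs = 100;		/* timer-interrupt counter that stops moving */
	int tick;

	for (tick = 0; tick < 10; tick++)
		if (watchdog_tick(irqs))
			printf("lockup detected after %d stuck ticks\n", tick + 1);
	return 0;
}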
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index f9e4e6a54073..d8c4a6feb286 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void)
79 /* need to update phys_pkg_id */ 79 /* need to update phys_pkg_id */
80 apic->phys_pkg_id = apicid_phys_pkg_id; 80 apic->phys_pkg_id = apicid_phys_pkg_id;
81 } 81 }
82
83 /*
84 * Now that apic routing model is selected, configure the
85 * fault handling for intr remapping.
86 */
87 if (intr_remapping_enabled)
88 enable_drhd_fault_handling();
89} 82}
90 83
91/* Same for both flat and physical. */ 84/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ed4118de249e..bd16b58b8850 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -44,8 +44,20 @@ static u64 gru_start_paddr, gru_end_paddr;
44static union uvh_apicid uvh_apicid; 44static union uvh_apicid uvh_apicid;
45int uv_min_hub_revision_id; 45int uv_min_hub_revision_id;
46EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); 46EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
47unsigned int uv_apicid_hibits;
48EXPORT_SYMBOL_GPL(uv_apicid_hibits);
47static DEFINE_SPINLOCK(uv_nmi_lock); 49static DEFINE_SPINLOCK(uv_nmi_lock);
48 50
51static unsigned long __init uv_early_read_mmr(unsigned long addr)
52{
53 unsigned long val, *mmr;
54
55 mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
56 val = *mmr;
57 early_iounmap(mmr, sizeof(*mmr));
58 return val;
59}
60
49static inline bool is_GRU_range(u64 start, u64 end) 61static inline bool is_GRU_range(u64 start, u64 end)
50{ 62{
51 return start >= gru_start_paddr && end <= gru_end_paddr; 63 return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -56,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
56 return is_ISA_range(start, end) || is_GRU_range(start, end); 68 return is_ISA_range(start, end) || is_GRU_range(start, end);
57} 69}
58 70
59static int early_get_nodeid(void) 71static int __init early_get_pnodeid(void)
60{ 72{
61 union uvh_node_id_u node_id; 73 union uvh_node_id_u node_id;
62 unsigned long *mmr; 74 union uvh_rh_gam_config_mmr_u m_n_config;
63 75 int pnode;
64 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
65 node_id.v = *mmr;
66 early_iounmap(mmr, sizeof(*mmr));
67 76
68 /* Currently, all blades have same revision number */ 77 /* Currently, all blades have same revision number */
78 node_id.v = uv_early_read_mmr(UVH_NODE_ID);
79 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
69 uv_min_hub_revision_id = node_id.s.revision; 80 uv_min_hub_revision_id = node_id.s.revision;
70 81
71 return node_id.s.node_id; 82 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
83 return pnode;
72} 84}
73 85
74static void __init early_get_apic_pnode_shift(void) 86static void __init early_get_apic_pnode_shift(void)
75{ 87{
76 unsigned long *mmr; 88 uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
77
78 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
79 uvh_apicid.v = *mmr;
80 early_iounmap(mmr, sizeof(*mmr));
81 if (!uvh_apicid.v) 89 if (!uvh_apicid.v)
82 /* 90 /*
83 * Old bios, use default value 91 * Old bios, use default value
@@ -85,12 +93,25 @@ static void __init early_get_apic_pnode_shift(void)
85 uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; 93 uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
86} 94}
87 95
96/*
97 * Add an extra bit as dictated by bios to the destination apicid of
98 * interrupts potentially passing through the UV HUB. This prevents
99 * a deadlock between interrupts and IO port operations.
100 */
101static void __init uv_set_apicid_hibit(void)
102{
103 union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
104
105 apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
106 uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
107}
108
88static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 109static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
89{ 110{
90 int nodeid; 111 int pnodeid;
91 112
92 if (!strcmp(oem_id, "SGI")) { 113 if (!strcmp(oem_id, "SGI")) {
93 nodeid = early_get_nodeid(); 114 pnodeid = early_get_pnodeid();
94 early_get_apic_pnode_shift(); 115 early_get_apic_pnode_shift();
95 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 116 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
96 x86_platform.nmi_init = uv_nmi_init; 117 x86_platform.nmi_init = uv_nmi_init;
@@ -99,9 +120,10 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
99 else if (!strcmp(oem_table_id, "UVX")) 120 else if (!strcmp(oem_table_id, "UVX"))
100 uv_system_type = UV_X2APIC; 121 uv_system_type = UV_X2APIC;
101 else if (!strcmp(oem_table_id, "UVH")) { 122 else if (!strcmp(oem_table_id, "UVH")) {
102 __get_cpu_var(x2apic_extra_bits) = 123 __this_cpu_write(x2apic_extra_bits,
103 nodeid << (uvh_apicid.s.pnode_shift - 1); 124 pnodeid << uvh_apicid.s.pnode_shift);
104 uv_system_type = UV_NON_UNIQUE_APIC; 125 uv_system_type = UV_NON_UNIQUE_APIC;
126 uv_set_apicid_hibit();
105 return 1; 127 return 1;
106 } 128 }
107 } 129 }
@@ -155,6 +177,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
155 int pnode; 177 int pnode;
156 178
157 pnode = uv_apicid_to_pnode(phys_apicid); 179 pnode = uv_apicid_to_pnode(phys_apicid);
180 phys_apicid |= uv_apicid_hibits;
158 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 181 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
159 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 182 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
160 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 183 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
@@ -236,7 +259,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
236 int cpu = cpumask_first(cpumask); 259 int cpu = cpumask_first(cpumask);
237 260
238 if ((unsigned)cpu < nr_cpu_ids) 261 if ((unsigned)cpu < nr_cpu_ids)
239 return per_cpu(x86_cpu_to_apicid, cpu); 262 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
240 else 263 else
241 return BAD_APICID; 264 return BAD_APICID;
242} 265}
@@ -255,7 +278,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
255 if (cpumask_test_cpu(cpu, cpu_online_mask)) 278 if (cpumask_test_cpu(cpu, cpu_online_mask))
256 break; 279 break;
257 } 280 }
258 return per_cpu(x86_cpu_to_apicid, cpu); 281 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
259} 282}
260 283
261static unsigned int x2apic_get_apic_id(unsigned long x) 284static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -263,7 +286,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
263 unsigned int id; 286 unsigned int id;
264 287
265 WARN_ON(preemptible() && num_online_cpus() > 1); 288 WARN_ON(preemptible() && num_online_cpus() > 1);
266 id = x | __get_cpu_var(x2apic_extra_bits); 289 id = x | __this_cpu_read(x2apic_extra_bits);
267 290
268 return id; 291 return id;
269} 292}
@@ -355,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = {
355 378
356static __cpuinit void set_x2apic_extra_bits(int pnode) 379static __cpuinit void set_x2apic_extra_bits(int pnode)
357{ 380{
358 __get_cpu_var(x2apic_extra_bits) = (pnode << 6); 381 __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
359} 382}
360 383
361/* 384/*
@@ -379,14 +402,14 @@ struct redir_addr {
379#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 402#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
380 403
381static __initdata struct redir_addr redir_addrs[] = { 404static __initdata struct redir_addr redir_addrs[] = {
382 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, 405 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR},
383 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, 406 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR},
384 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, 407 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
385}; 408};
386 409
387static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) 410static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
388{ 411{
389 union uvh_si_alias0_overlay_config_u alias; 412 union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
390 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; 413 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
391 int i; 414 int i;
392 415
@@ -618,7 +641,7 @@ void __cpuinit uv_cpu_init(void)
618 */ 641 */
619int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) 642int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
620{ 643{
621 if (reason != DIE_NMI_IPI) 644 if (reason != DIE_NMIUNKNOWN)
622 return NOTIFY_OK; 645 return NOTIFY_OK;
623 646
624 if (in_crash_kexec) 647 if (in_crash_kexec)
@@ -660,28 +683,33 @@ void uv_nmi_init(void)
660 683
661void __init uv_system_init(void) 684void __init uv_system_init(void)
662{ 685{
663 union uvh_si_addr_map_config_u m_n_config; 686 union uvh_rh_gam_config_mmr_u m_n_config;
687 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
664 union uvh_node_id_u node_id; 688 union uvh_node_id_u node_id;
665 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 689 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
666 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 690 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
667 int gnode_extra, max_pnode = 0; 691 int gnode_extra, max_pnode = 0;
668 unsigned long mmr_base, present, paddr; 692 unsigned long mmr_base, present, paddr;
669 unsigned short pnode_mask; 693 unsigned short pnode_mask, pnode_io_mask;
670 694
671 map_low_mmrs(); 695 map_low_mmrs();
672 696
673 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 697 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
674 m_val = m_n_config.s.m_skt; 698 m_val = m_n_config.s.m_skt;
675 n_val = m_n_config.s.n_skt; 699 n_val = m_n_config.s.n_skt;
700 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
701 n_io = mmioh.s.n_io;
676 mmr_base = 702 mmr_base =
677 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 703 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
678 ~UV_MMR_ENABLE; 704 ~UV_MMR_ENABLE;
679 pnode_mask = (1 << n_val) - 1; 705 pnode_mask = (1 << n_val) - 1;
706 pnode_io_mask = (1 << n_io) - 1;
707
680 node_id.v = uv_read_local_mmr(UVH_NODE_ID); 708 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
681 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; 709 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
682 gnode_upper = ((unsigned long)gnode_extra << m_val); 710 gnode_upper = ((unsigned long)gnode_extra << m_val);
683 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", 711 printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
684 n_val, m_val, gnode_upper, gnode_extra); 712 n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
685 713
686 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 714 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
687 715
@@ -714,7 +742,7 @@ void __init uv_system_init(void)
714 for (j = 0; j < 64; j++) { 742 for (j = 0; j < 64; j++) {
715 if (!test_bit(j, &present)) 743 if (!test_bit(j, &present))
716 continue; 744 continue;
717 pnode = (i * 64 + j); 745 pnode = (i * 64 + j) & pnode_mask;
718 uv_blade_info[blade].pnode = pnode; 746 uv_blade_info[blade].pnode = pnode;
719 uv_blade_info[blade].nr_possible_cpus = 0; 747 uv_blade_info[blade].nr_possible_cpus = 0;
720 uv_blade_info[blade].nr_online_cpus = 0; 748 uv_blade_info[blade].nr_online_cpus = 0;
@@ -735,6 +763,7 @@ void __init uv_system_init(void)
735 /* 763 /*
736 * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); 764 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
737 */ 765 */
766 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
738 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; 767 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
739 pnode = uv_apicid_to_pnode(apicid); 768 pnode = uv_apicid_to_pnode(apicid);
740 blade = boot_pnode_to_blade(pnode); 769 blade = boot_pnode_to_blade(pnode);
@@ -751,7 +780,6 @@ void __init uv_system_init(void)
751 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 780 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
752 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 781 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
753 uv_cpu_hub_info(cpu)->pnode = pnode; 782 uv_cpu_hub_info(cpu)->pnode = pnode;
754 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
755 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; 783 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
756 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 784 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
757 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 785 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -775,7 +803,7 @@ void __init uv_system_init(void)
775 803
776 map_gru_high(max_pnode); 804 map_gru_high(max_pnode);
777 map_mmr_high(max_pnode); 805 map_mmr_high(max_pnode);
778 map_mmioh_high(max_pnode); 806 map_mmioh_high(max_pnode & pnode_io_mask);
779 807
780 uv_cpu_init(); 808 uv_cpu_init();
781 uv_scir_register_cpu_notifier(); 809 uv_scir_register_cpu_notifier();
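
Editor's note: the UV hunks above derive pnode_mask from the node-number width (n_val) and a separate pnode_io_mask from the MMIOH routing width (n_io), then truncate raw pnode numbers and the map_mmioh_high() argument with those masks. A minimal user-space sketch of just the mask arithmetic, with field values invented for illustration rather than read from real UV hardware:

#include <stdio.h>

int main(void)
{
	/* illustrative values only; the kernel reads these from UVH_RH_GAM_* MMRs */
	unsigned int n_val = 9;		/* node-number bits */
	unsigned int n_io  = 7;		/* node bits used for MMIOH routing */

	unsigned short pnode_mask    = (1 << n_val) - 1;	/* 0x1ff */
	unsigned short pnode_io_mask = (1 << n_io)  - 1;	/* 0x07f */

	/* a raw blade index (i * 64 + j from the bitmap scan) is truncated */
	int raw   = 3 * 64 + 17;
	int pnode = raw & pnode_mask;

	printf("pnode_mask=0x%x pnode_io_mask=0x%x pnode=%d mmioh arg=0x%x\n",
	       pnode_mask, pnode_io_mask, pnode, pnode & pnode_io_mask);
	return 0;
}
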
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9e093f8fe78c..7c7bedb83c5a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -668,7 +668,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383);
668 668
669bool cpu_has_amd_erratum(const int *erratum) 669bool cpu_has_amd_erratum(const int *erratum)
670{ 670{
671 struct cpuinfo_x86 *cpu = &current_cpu_data; 671 struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
672 int osvw_id = *erratum++; 672 int osvw_id = *erratum++;
673 u32 range; 673 u32 range;
674 u32 ms; 674 u32 ms;
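
Editor's note: this hunk, and several later ones in mce.c, mce_intel.c and powernow-k8.c, replace the current_cpu_data macro (which dereferences the whole per-CPU cpu_info structure) with the lighter __this_cpu_ptr/__this_cpu_read accessors that fetch only what is needed from the current CPU's copy. A rough user-space analogue of the idea, with a made-up my_cpu_id() standing in for smp_processor_id(); none of these names are the kernel's:

#include <stdio.h>

struct cpuinfo { int vendor; int cache_size_kb; };

/* one copy per CPU; the kernel keeps these in per-CPU sections instead */
static struct cpuinfo cpu_info[4] = {
	{ 1, 512 }, { 1, 512 }, { 2, 1024 }, { 2, 1024 },
};

static int my_cpu_id(void) { return 2; }	/* stand-in for smp_processor_id() */

/* "current_cpu_data" style: take a pointer to the whole structure */
static struct cpuinfo *this_cpu_info(void) { return &cpu_info[my_cpu_id()]; }

/* "__this_cpu_read" style: fetch only the field that is needed */
#define this_cpu_read(field) (cpu_info[my_cpu_id()].field)

int main(void)
{
	printf("vendor=%d cache=%dKB\n",
	       this_cpu_info()->vendor, this_cpu_read(cache_size_kb));
	return 0;
}
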
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b68bda30938..1d59834396bd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -894,7 +894,6 @@ void __init identify_boot_cpu(void)
894#else 894#else
895 vgetcpu_set_mode(); 895 vgetcpu_set_mode();
896#endif 896#endif
897 init_hw_perf_events();
898} 897}
899 898
900void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 899void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 491977baf6c0..35c7e65e59be 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -521,7 +521,7 @@ static void check_supported_cpu(void *_rc)
521 521
522 *rc = -ENODEV; 522 *rc = -ENODEV;
523 523
524 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) 524 if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
525 return; 525 return;
526 526
527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
@@ -1377,7 +1377,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1377static void query_values_on_cpu(void *_err) 1377static void query_values_on_cpu(void *_err)
1378{ 1378{
1379 int *err = _err; 1379 int *err = _err;
1380 struct powernow_k8_data *data = __get_cpu_var(powernow_data); 1380 struct powernow_k8_data *data = __this_cpu_read(powernow_data);
1381 1381
1382 *err = query_current_values_with_pending_wait(data); 1382 *err = query_current_values_with_pending_wait(data);
1383} 1383}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 17ad03366211..ec2c19a7b8ef 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ 45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ 46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
48 { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 49 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 50 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ 67 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ 68 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ 69 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
70 { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ 71 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 89 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ 90 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 91 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
92 { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 94 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ 95 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
@@ -149,8 +152,7 @@ union _cpuid4_leaf_ecx {
149}; 152};
150 153
151struct amd_l3_cache { 154struct amd_l3_cache {
152 struct pci_dev *dev; 155 struct amd_northbridge *nb;
153 bool can_disable;
154 unsigned indices; 156 unsigned indices;
155 u8 subcaches[4]; 157 u8 subcaches[4];
156}; 158};
@@ -266,7 +268,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
266 line_size = l2.line_size; 268 line_size = l2.line_size;
267 lines_per_tag = l2.lines_per_tag; 269 lines_per_tag = l2.lines_per_tag;
268 /* cpu_data has errata corrections for K7 applied */ 270 /* cpu_data has errata corrections for K7 applied */
269 size_in_kb = current_cpu_data.x86_cache_size; 271 size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
270 break; 272 break;
271 case 3: 273 case 3:
272 if (!l3.val) 274 if (!l3.val)
@@ -288,7 +290,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
288 eax->split.type = types[leaf]; 290 eax->split.type = types[leaf];
289 eax->split.level = levels[leaf]; 291 eax->split.level = levels[leaf];
290 eax->split.num_threads_sharing = 0; 292 eax->split.num_threads_sharing = 0;
291 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 293 eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
292 294
293 295
294 if (assoc == 0xffff) 296 if (assoc == 0xffff)
@@ -311,14 +313,12 @@ struct _cache_attr {
311/* 313/*
312 * L3 cache descriptors 314 * L3 cache descriptors
313 */ 315 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) 316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
317{ 317{
318 unsigned int sc0, sc1, sc2, sc3; 318 unsigned int sc0, sc1, sc2, sc3;
319 u32 val = 0; 319 u32 val = 0;
320 320
321 pci_read_config_dword(l3->dev, 0x1C4, &val); 321 pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
322 322
323 /* calculate subcache sizes */ 323 /* calculate subcache sizes */
324 l3->subcaches[0] = sc0 = !(val & BIT(0)); 324 l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -330,47 +330,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; 330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
331} 331}
332 332
333static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) 333static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
334{ 334 int index)
335 struct amd_l3_cache *l3;
336 struct pci_dev *dev = node_to_k8_nb_misc(node);
337
338 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
339 if (!l3) {
340 printk(KERN_WARNING "Error allocating L3 struct\n");
341 return NULL;
342 }
343
344 l3->dev = dev;
345
346 amd_calc_l3_indices(l3);
347
348 return l3;
349}
350
351static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
352 int index)
353{ 335{
336 static struct amd_l3_cache *__cpuinitdata l3_caches;
354 int node; 337 int node;
355 338
356 if (boot_cpu_data.x86 != 0x10) 339 /* only for L3, and not in virtualized environments */
357 return; 340 if (index < 3 || amd_nb_num() == 0)
358
359 if (index < 3)
360 return;
361
362 /* see errata #382 and #388 */
363 if (boot_cpu_data.x86_model < 0x8)
364 return;
365
366 if ((boot_cpu_data.x86_model == 0x8 ||
367 boot_cpu_data.x86_model == 0x9)
368 &&
369 boot_cpu_data.x86_mask < 0x1)
370 return;
371
372 /* not in virtualized environments */
373 if (k8_northbridges.num == 0)
374 return; 341 return;
375 342
376 /* 343 /*
@@ -378,7 +345,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
378 * never freed but this is done only on shutdown so it doesn't matter. 345 * never freed but this is done only on shutdown so it doesn't matter.
379 */ 346 */
380 if (!l3_caches) { 347 if (!l3_caches) {
381 int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); 348 int size = amd_nb_num() * sizeof(struct amd_l3_cache);
382 349
383 l3_caches = kzalloc(size, GFP_ATOMIC); 350 l3_caches = kzalloc(size, GFP_ATOMIC);
384 if (!l3_caches) 351 if (!l3_caches)
@@ -387,14 +354,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
387 354
388 node = amd_get_nb_id(smp_processor_id()); 355 node = amd_get_nb_id(smp_processor_id());
389 356
390 if (!l3_caches[node]) { 357 if (!l3_caches[node].nb) {
391 l3_caches[node] = amd_init_l3_cache(node); 358 l3_caches[node].nb = node_to_amd_nb(node);
392 l3_caches[node]->can_disable = true; 359 amd_calc_l3_indices(&l3_caches[node]);
393 } 360 }
394 361
395 WARN_ON(!l3_caches[node]); 362 this_leaf->l3 = &l3_caches[node];
396
397 this_leaf->l3 = l3_caches[node];
398} 363}
399 364
400/* 365/*
@@ -408,7 +373,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
408{ 373{
409 unsigned int reg = 0; 374 unsigned int reg = 0;
410 375
411 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg); 376 pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
412 377
413 /* check whether this slot is activated already */ 378 /* check whether this slot is activated already */
414 if (reg & (3UL << 30)) 379 if (reg & (3UL << 30))
@@ -422,7 +387,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
422{ 387{
423 int index; 388 int index;
424 389
425 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 390 if (!this_leaf->l3 ||
391 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
426 return -EINVAL; 392 return -EINVAL;
427 393
428 index = amd_get_l3_disable_slot(this_leaf->l3, slot); 394 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -457,7 +423,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
457 if (!l3->subcaches[i]) 423 if (!l3->subcaches[i])
458 continue; 424 continue;
459 425
460 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 426 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
461 427
462 /* 428 /*
463 * We need to WBINVD on a core on the node containing the L3 429 * We need to WBINVD on a core on the node containing the L3
@@ -467,7 +433,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
467 wbinvd_on_cpu(cpu); 433 wbinvd_on_cpu(cpu);
468 434
469 reg |= BIT(31); 435 reg |= BIT(31);
470 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 436 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
471 } 437 }
472} 438}
473 439
@@ -524,7 +490,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
524 if (!capable(CAP_SYS_ADMIN)) 490 if (!capable(CAP_SYS_ADMIN))
525 return -EPERM; 491 return -EPERM;
526 492
527 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 493 if (!this_leaf->l3 ||
494 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
528 return -EINVAL; 495 return -EINVAL;
529 496
530 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 497 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -545,7 +512,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
545#define STORE_CACHE_DISABLE(slot) \ 512#define STORE_CACHE_DISABLE(slot) \
546static ssize_t \ 513static ssize_t \
547store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 514store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
548 const char *buf, size_t count) \ 515 const char *buf, size_t count) \
549{ \ 516{ \
550 return store_cache_disable(this_leaf, buf, count, slot); \ 517 return store_cache_disable(this_leaf, buf, count, slot); \
551} 518}
@@ -558,10 +525,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
558 show_cache_disable_1, store_cache_disable_1); 525 show_cache_disable_1, store_cache_disable_1);
559 526
560#else /* CONFIG_AMD_NB */ 527#else /* CONFIG_AMD_NB */
561static void __cpuinit 528#define amd_init_l3_cache(x, y)
562amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
563{
564};
565#endif /* CONFIG_AMD_NB */ 529#endif /* CONFIG_AMD_NB */
566 530
567static int 531static int
@@ -575,7 +539,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
575 539
576 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 540 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
577 amd_cpuid4(index, &eax, &ebx, &ecx); 541 amd_cpuid4(index, &eax, &ebx, &ecx);
578 amd_check_l3_disable(this_leaf, index); 542 amd_init_l3_cache(this_leaf, index);
579 } else { 543 } else {
580 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 544 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
581 } 545 }
@@ -983,30 +947,48 @@ define_one_ro(size);
983define_one_ro(shared_cpu_map); 947define_one_ro(shared_cpu_map);
984define_one_ro(shared_cpu_list); 948define_one_ro(shared_cpu_list);
985 949
986#define DEFAULT_SYSFS_CACHE_ATTRS \
987 &type.attr, \
988 &level.attr, \
989 &coherency_line_size.attr, \
990 &physical_line_partition.attr, \
991 &ways_of_associativity.attr, \
992 &number_of_sets.attr, \
993 &size.attr, \
994 &shared_cpu_map.attr, \
995 &shared_cpu_list.attr
996
997static struct attribute *default_attrs[] = { 950static struct attribute *default_attrs[] = {
998 DEFAULT_SYSFS_CACHE_ATTRS, 951 &type.attr,
952 &level.attr,
953 &coherency_line_size.attr,
954 &physical_line_partition.attr,
955 &ways_of_associativity.attr,
956 &number_of_sets.attr,
957 &size.attr,
958 &shared_cpu_map.attr,
959 &shared_cpu_list.attr,
999 NULL 960 NULL
1000}; 961};
1001 962
1002static struct attribute *default_l3_attrs[] = {
1003 DEFAULT_SYSFS_CACHE_ATTRS,
1004#ifdef CONFIG_AMD_NB 963#ifdef CONFIG_AMD_NB
1005 &cache_disable_0.attr, 964static struct attribute ** __cpuinit amd_l3_attrs(void)
1006 &cache_disable_1.attr, 965{
966 static struct attribute **attrs;
967 int n;
968
969 if (attrs)
970 return attrs;
971
972 n = sizeof (default_attrs) / sizeof (struct attribute *);
973
974 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
975 n += 2;
976
977 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
978 if (attrs == NULL)
979 return attrs = default_attrs;
980
981 for (n = 0; default_attrs[n]; n++)
982 attrs[n] = default_attrs[n];
983
984 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
985 attrs[n++] = &cache_disable_0.attr;
986 attrs[n++] = &cache_disable_1.attr;
987 }
988
989 return attrs;
990}
1007#endif 991#endif
1008 NULL
1009};
1010 992
1011static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 993static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1012{ 994{
@@ -1117,11 +1099,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1117 1099
1118 this_leaf = CPUID4_INFO_IDX(cpu, i); 1100 this_leaf = CPUID4_INFO_IDX(cpu, i);
1119 1101
1120 if (this_leaf->l3 && this_leaf->l3->can_disable) 1102 ktype_cache.default_attrs = default_attrs;
1121 ktype_cache.default_attrs = default_l3_attrs; 1103#ifdef CONFIG_AMD_NB
1122 else 1104 if (this_leaf->l3)
1123 ktype_cache.default_attrs = default_attrs; 1105 ktype_cache.default_attrs = amd_l3_attrs();
1124 1106#endif
1125 retval = kobject_init_and_add(&(this_object->kobj), 1107 retval = kobject_init_and_add(&(this_object->kobj),
1126 &ktype_cache, 1108 &ktype_cache,
1127 per_cpu(ici_cache_kobject, cpu), 1109 per_cpu(ici_cache_kobject, cpu),
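
Editor's note: amd_l3_attrs() above builds the sysfs attribute list at run time, copying the default entries and appending the two cache_disable attributes only when the northbridge reports AMD_NB_L3_INDEX_DISABLE, instead of keeping a second static default_l3_attrs array. A self-contained sketch of the same NULL-terminated-array construction; the attribute names here are illustrative strings, not kernel objects:

#include <stdio.h>
#include <stdlib.h>

static const char *default_attrs[] = {
	"type", "level", "size", "shared_cpu_map", NULL
};

static const char **build_attrs(int have_l3_index_disable)
{
	size_t n, i;
	const char **attrs;

	for (n = 0; default_attrs[n]; n++)
		;
	n += 1;				/* room for the NULL terminator */
	if (have_l3_index_disable)
		n += 2;			/* cache_disable_0/1 */

	attrs = calloc(n, sizeof(*attrs));
	if (!attrs)
		return default_attrs;	/* fall back to the static list */

	for (i = 0; default_attrs[i]; i++)
		attrs[i] = default_attrs[i];
	if (have_l3_index_disable) {
		attrs[i++] = "cache_disable_0";
		attrs[i++] = "cache_disable_1";
	}
	return attrs;			/* calloc() left the terminator NULL */
}

int main(void)
{
	const char **a = build_attrs(1);
	for (size_t i = 0; a[i]; i++)
		printf("%s\n", a[i]);
	return 0;
}
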
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..a77971979564 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,6 +25,7 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <asm/mce.h> 26#include <asm/mce.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
28#include <asm/nmi.h>
28 29
29/* Update fake mce registers on current CPU. */ 30/* Update fake mce registers on current CPU. */
30static void inject_mce(struct mce *m) 31static void inject_mce(struct mce *m)
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
83 struct die_args *args = (struct die_args *)data; 84 struct die_args *args = (struct die_args *)data;
84 int cpu = smp_processor_id(); 85 int cpu = smp_processor_id();
85 struct mce *m = &__get_cpu_var(injectm); 86 struct mce *m = &__get_cpu_var(injectm);
86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) 87 if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
87 return NOTIFY_DONE; 88 return NOTIFY_DONE;
88 cpumask_clear_cpu(cpu, mce_inject_cpumask); 89 cpumask_clear_cpu(cpu, mce_inject_cpumask);
89 if (m->inject_flags & MCJ_EXCEPTION) 90 if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
95 96
96static struct notifier_block mce_raise_nb = { 97static struct notifier_block mce_raise_nb = {
97 .notifier_call = mce_raise_notify, 98 .notifier_call = mce_raise_notify,
98 .priority = 1000, 99 .priority = NMI_LOCAL_NORMAL_PRIOR,
99}; 100};
100 101
101/* Inject mce on current CPU */ 102/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a35b72d7c03..d916183b7f9c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -326,7 +326,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
326 326
327static int msr_to_offset(u32 msr) 327static int msr_to_offset(u32 msr)
328{ 328{
329 unsigned bank = __get_cpu_var(injectm.bank); 329 unsigned bank = __this_cpu_read(injectm.bank);
330 330
331 if (msr == rip_msr) 331 if (msr == rip_msr)
332 return offsetof(struct mce, ip); 332 return offsetof(struct mce, ip);
@@ -346,7 +346,7 @@ static u64 mce_rdmsrl(u32 msr)
346{ 346{
347 u64 v; 347 u64 v;
348 348
349 if (__get_cpu_var(injectm).finished) { 349 if (__this_cpu_read(injectm.finished)) {
350 int offset = msr_to_offset(msr); 350 int offset = msr_to_offset(msr);
351 351
352 if (offset < 0) 352 if (offset < 0)
@@ -369,7 +369,7 @@ static u64 mce_rdmsrl(u32 msr)
369 369
370static void mce_wrmsrl(u32 msr, u64 v) 370static void mce_wrmsrl(u32 msr, u64 v)
371{ 371{
372 if (__get_cpu_var(injectm).finished) { 372 if (__this_cpu_read(injectm.finished)) {
373 int offset = msr_to_offset(msr); 373 int offset = msr_to_offset(msr);
374 374
375 if (offset >= 0) 375 if (offset >= 0)
@@ -1159,7 +1159,7 @@ static void mce_start_timer(unsigned long data)
1159 1159
1160 WARN_ON(smp_processor_id() != data); 1160 WARN_ON(smp_processor_id() != data);
1161 1161
1162 if (mce_available(&current_cpu_data)) { 1162 if (mce_available(__this_cpu_ptr(&cpu_info))) {
1163 machine_check_poll(MCP_TIMESTAMP, 1163 machine_check_poll(MCP_TIMESTAMP,
1164 &__get_cpu_var(mce_poll_banks)); 1164 &__get_cpu_var(mce_poll_banks));
1165 } 1165 }
@@ -1767,7 +1767,7 @@ static int mce_shutdown(struct sys_device *dev)
1767static int mce_resume(struct sys_device *dev) 1767static int mce_resume(struct sys_device *dev)
1768{ 1768{
1769 __mcheck_cpu_init_generic(); 1769 __mcheck_cpu_init_generic();
1770 __mcheck_cpu_init_vendor(&current_cpu_data); 1770 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1771 1771
1772 return 0; 1772 return 0;
1773} 1773}
@@ -1775,7 +1775,7 @@ static int mce_resume(struct sys_device *dev)
1775static void mce_cpu_restart(void *data) 1775static void mce_cpu_restart(void *data)
1776{ 1776{
1777 del_timer_sync(&__get_cpu_var(mce_timer)); 1777 del_timer_sync(&__get_cpu_var(mce_timer));
1778 if (!mce_available(&current_cpu_data)) 1778 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1779 return; 1779 return;
1780 __mcheck_cpu_init_generic(); 1780 __mcheck_cpu_init_generic();
1781 __mcheck_cpu_init_timer(); 1781 __mcheck_cpu_init_timer();
@@ -1790,7 +1790,7 @@ static void mce_restart(void)
1790/* Toggle features for corrected errors */ 1790/* Toggle features for corrected errors */
1791static void mce_disable_ce(void *all) 1791static void mce_disable_ce(void *all)
1792{ 1792{
1793 if (!mce_available(&current_cpu_data)) 1793 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1794 return; 1794 return;
1795 if (all) 1795 if (all)
1796 del_timer_sync(&__get_cpu_var(mce_timer)); 1796 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1799,7 +1799,7 @@ static void mce_disable_ce(void *all)
1799 1799
1800static void mce_enable_ce(void *all) 1800static void mce_enable_ce(void *all)
1801{ 1801{
1802 if (!mce_available(&current_cpu_data)) 1802 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1803 return; 1803 return;
1804 cmci_reenable(); 1804 cmci_reenable();
1805 cmci_recheck(); 1805 cmci_recheck();
@@ -2022,7 +2022,7 @@ static void __cpuinit mce_disable_cpu(void *h)
2022 unsigned long action = *(unsigned long *)h; 2022 unsigned long action = *(unsigned long *)h;
2023 int i; 2023 int i;
2024 2024
2025 if (!mce_available(&current_cpu_data)) 2025 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2026 return; 2026 return;
2027 2027
2028 if (!(action & CPU_TASKS_FROZEN)) 2028 if (!(action & CPU_TASKS_FROZEN))
@@ -2040,7 +2040,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2040 unsigned long action = *(unsigned long *)h; 2040 unsigned long action = *(unsigned long *)h;
2041 int i; 2041 int i;
2042 2042
2043 if (!mce_available(&current_cpu_data)) 2043 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2044 return; 2044 return;
2045 2045
2046 if (!(action & CPU_TASKS_FROZEN)) 2046 if (!(action & CPU_TASKS_FROZEN))
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 80c482382d5c..5bf2fac52aca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
31#include <asm/mce.h> 31#include <asm/mce.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33 33
34#define PFX "mce_threshold: "
35#define VERSION "version 1.1.1"
36#define NR_BANKS 6 34#define NR_BANKS 6
37#define NR_BLOCKS 9 35#define NR_BLOCKS 9
38#define THRESHOLD_MAX 0xFFF 36#define THRESHOLD_MAX 0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
59 struct list_head miscj; 57 struct list_head miscj;
60}; 58};
61 59
62/* defaults used early on boot */
63static struct threshold_block threshold_defaults = {
64 .interrupt_enable = 0,
65 .threshold_limit = THRESHOLD_MAX,
66};
67
68struct threshold_bank { 60struct threshold_bank {
69 struct kobject *kobj; 61 struct kobject *kobj;
70 struct threshold_block *blocks; 62 struct threshold_block *blocks;
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void);
89struct thresh_restart { 81struct thresh_restart {
90 struct threshold_block *b; 82 struct threshold_block *b;
91 int reset; 83 int reset;
84 int set_lvt_off;
85 int lvt_off;
92 u16 old_limit; 86 u16 old_limit;
93}; 87};
94 88
89static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
90{
91 int msr = (hi & MASK_LVTOFF_HI) >> 20;
92
93 if (apic < 0) {
94 pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
95 "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
96 b->bank, b->block, b->address, hi, lo);
97 return 0;
98 }
99
100 if (apic != msr) {
101 pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
102 "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
103 b->cpu, apic, b->bank, b->block, b->address, hi, lo);
104 return 0;
105 }
106
107 return 1;
108};
109
95/* must be called with correct cpu affinity */ 110/* must be called with correct cpu affinity */
96/* Called via smp_call_function_single() */ 111/* Called via smp_call_function_single() */
97static void threshold_restart_bank(void *_tr) 112static void threshold_restart_bank(void *_tr)
98{ 113{
99 struct thresh_restart *tr = _tr; 114 struct thresh_restart *tr = _tr;
100 u32 mci_misc_hi, mci_misc_lo; 115 u32 hi, lo;
101 116
102 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 117 rdmsr(tr->b->address, lo, hi);
103 118
104 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 119 if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
105 tr->reset = 1; /* limit cannot be lower than err count */ 120 tr->reset = 1; /* limit cannot be lower than err count */
106 121
107 if (tr->reset) { /* reset err count and overflow bit */ 122 if (tr->reset) { /* reset err count and overflow bit */
108 mci_misc_hi = 123 hi =
109 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 124 (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
110 (THRESHOLD_MAX - tr->b->threshold_limit); 125 (THRESHOLD_MAX - tr->b->threshold_limit);
111 } else if (tr->old_limit) { /* change limit w/o reset */ 126 } else if (tr->old_limit) { /* change limit w/o reset */
112 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 127 int new_count = (hi & THRESHOLD_MAX) +
113 (tr->old_limit - tr->b->threshold_limit); 128 (tr->old_limit - tr->b->threshold_limit);
114 129
115 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 130 hi = (hi & ~MASK_ERR_COUNT_HI) |
116 (new_count & THRESHOLD_MAX); 131 (new_count & THRESHOLD_MAX);
117 } 132 }
118 133
134 if (tr->set_lvt_off) {
135 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
136 /* set new lvt offset */
137 hi &= ~MASK_LVTOFF_HI;
138 hi |= tr->lvt_off << 20;
139 }
140 }
141
119 tr->b->interrupt_enable ? 142 tr->b->interrupt_enable ?
120 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 143 (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
121 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 144 (hi &= ~MASK_INT_TYPE_HI);
122 145
123 mci_misc_hi |= MASK_COUNT_EN_HI; 146 hi |= MASK_COUNT_EN_HI;
124 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 147 wrmsr(tr->b->address, lo, hi);
148}
149
150static void mce_threshold_block_init(struct threshold_block *b, int offset)
151{
152 struct thresh_restart tr = {
153 .b = b,
154 .set_lvt_off = 1,
155 .lvt_off = offset,
156 };
157
158 b->threshold_limit = THRESHOLD_MAX;
159 threshold_restart_bank(&tr);
160};
161
162static int setup_APIC_mce(int reserved, int new)
163{
164 if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
165 APIC_EILVT_MSG_FIX, 0))
166 return new;
167
168 return reserved;
125} 169}
126 170
127/* cpu init entry point, called from mce.c with preempt off */ 171/* cpu init entry point, called from mce.c with preempt off */
128void mce_amd_feature_init(struct cpuinfo_x86 *c) 172void mce_amd_feature_init(struct cpuinfo_x86 *c)
129{ 173{
174 struct threshold_block b;
130 unsigned int cpu = smp_processor_id(); 175 unsigned int cpu = smp_processor_id();
131 u32 low = 0, high = 0, address = 0; 176 u32 low = 0, high = 0, address = 0;
132 unsigned int bank, block; 177 unsigned int bank, block;
133 struct thresh_restart tr; 178 int offset = -1;
134 int lvt_off = -1;
135 u8 offset;
136 179
137 for (bank = 0; bank < NR_BANKS; ++bank) { 180 for (bank = 0; bank < NR_BANKS; ++bank) {
138 for (block = 0; block < NR_BLOCKS; ++block) { 181 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
163 if (shared_bank[bank] && c->cpu_core_id) 206 if (shared_bank[bank] && c->cpu_core_id)
164 break; 207 break;
165#endif 208#endif
166 offset = (high & MASK_LVTOFF_HI) >> 20; 209 offset = setup_APIC_mce(offset,
167 if (lvt_off < 0) { 210 (high & MASK_LVTOFF_HI) >> 20);
168 if (setup_APIC_eilvt(offset,
169 THRESHOLD_APIC_VECTOR,
170 APIC_EILVT_MSG_FIX, 0)) {
171 pr_err(FW_BUG "cpu %d, failed to "
172 "setup threshold interrupt "
173 "for bank %d, block %d "
174 "(MSR%08X=0x%x%08x)",
175 smp_processor_id(), bank, block,
176 address, high, low);
177 continue;
178 }
179 lvt_off = offset;
180 } else if (lvt_off != offset) {
181 pr_err(FW_BUG "cpu %d, invalid threshold "
182 "interrupt offset %d for bank %d,"
183 "block %d (MSR%08X=0x%x%08x)",
184 smp_processor_id(), lvt_off, bank,
185 block, address, high, low);
186 continue;
187 }
188
189 high &= ~MASK_LVTOFF_HI;
190 high |= lvt_off << 20;
191 wrmsr(address, low, high);
192 211
193 threshold_defaults.address = address; 212 memset(&b, 0, sizeof(b));
194 tr.b = &threshold_defaults; 213 b.cpu = cpu;
195 tr.reset = 0; 214 b.bank = bank;
196 tr.old_limit = 0; 215 b.block = block;
197 threshold_restart_bank(&tr); 216 b.address = address;
198 217
218 mce_threshold_block_init(&b, offset);
199 mce_threshold_vector = amd_threshold_interrupt; 219 mce_threshold_vector = amd_threshold_interrupt;
200 } 220 }
201 } 221 }
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
298 318
299 b->interrupt_enable = !!new; 319 b->interrupt_enable = !!new;
300 320
321 memset(&tr, 0, sizeof(tr));
301 tr.b = b; 322 tr.b = b;
302 tr.reset = 0;
303 tr.old_limit = 0;
304 323
305 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 324 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
306 325
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
321 if (new < 1) 340 if (new < 1)
322 new = 1; 341 new = 1;
323 342
343 memset(&tr, 0, sizeof(tr));
324 tr.old_limit = b->threshold_limit; 344 tr.old_limit = b->threshold_limit;
325 b->threshold_limit = new; 345 b->threshold_limit = new;
326 tr.b = b; 346 tr.b = b;
327 tr.reset = 0;
328 347
329 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 348 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
330 349
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
603 continue; 622 continue;
604 err = threshold_create_bank(cpu, bank); 623 err = threshold_create_bank(cpu, bank);
605 if (err) 624 if (err)
606 goto out; 625 return err;
607 } 626 }
608out: 627
609 return err; 628 return err;
610} 629}
611 630
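
Editor's note: in threshold_restart_bank() the "change limit w/o reset" branch preserves the errors already counted. The MSR field starts at THRESHOLD_MAX - limit and counts up, so moving to a new limit only requires shifting the stored value by old_limit - new_limit. A small arithmetic check of that adjustment, with the limits and error count invented for illustration:

#include <stdio.h>

#define THRESHOLD_MAX 0xFFF

int main(void)
{
	unsigned int old_limit = 100, new_limit = 60;
	unsigned int errors_seen = 25;

	/* field counts up from THRESHOLD_MAX - limit and interrupts at THRESHOLD_MAX */
	unsigned int stored = THRESHOLD_MAX - old_limit + errors_seen;

	/* "change limit w/o reset": shift by old_limit - new_limit */
	unsigned int new_count = (stored & THRESHOLD_MAX) + (old_limit - new_limit);

	printf("stored=0x%x new=0x%x errors still %u, %u left before interrupt\n",
	       stored, new_count & THRESHOLD_MAX,
	       (new_count & THRESHOLD_MAX) - (THRESHOLD_MAX - new_limit),
	       THRESHOLD_MAX - (new_count & THRESHOLD_MAX));
	return 0;
}
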
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 6fcd0936194f..8694ef56459d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -130,7 +130,7 @@ void cmci_recheck(void)
130 unsigned long flags; 130 unsigned long flags;
131 int banks; 131 int banks;
132 132
133 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks)) 133 if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
134 return; 134 return;
135 local_irq_save(flags); 135 local_irq_save(flags);
136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 4b683267eca5..6f8c5e9da97f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,14 @@ struct thermal_state {
53 struct _thermal_state core_power_limit; 53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle; 54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit; 55 struct _thermal_state package_power_limit;
56 struct _thermal_state core_thresh0;
57 struct _thermal_state core_thresh1;
56}; 58};
57 59
60/* Callback to handle core threshold interrupts */
61int (*platform_thermal_notify)(__u64 msr_val);
62EXPORT_SYMBOL(platform_thermal_notify);
63
58static DEFINE_PER_CPU(struct thermal_state, thermal_state); 64static DEFINE_PER_CPU(struct thermal_state, thermal_state);
59 65
60static atomic_t therm_throt_en = ATOMIC_INIT(0); 66static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -200,6 +206,22 @@ static int therm_throt_process(bool new_event, int event, int level)
200 return 0; 206 return 0;
201} 207}
202 208
209static int thresh_event_valid(int event)
210{
211 struct _thermal_state *state;
212 unsigned int this_cpu = smp_processor_id();
213 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
214 u64 now = get_jiffies_64();
215
216 state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
217
218 if (time_before64(now, state->next_check))
219 return 0;
220
221 state->next_check = now + CHECK_INTERVAL;
222 return 1;
223}
224
203#ifdef CONFIG_SYSFS 225#ifdef CONFIG_SYSFS
204/* Add/Remove thermal_throttle interface for CPU device: */ 226/* Add/Remove thermal_throttle interface for CPU device: */
205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 227static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
@@ -313,6 +335,22 @@ device_initcall(thermal_throttle_init_device);
313#define PACKAGE_THROTTLED ((__u64)2 << 62) 335#define PACKAGE_THROTTLED ((__u64)2 << 62)
314#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) 336#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
315 337
338static void notify_thresholds(__u64 msr_val)
339{
340 /* check whether the interrupt handler is defined;
341 * otherwise simply return
342 */
343 if (!platform_thermal_notify)
344 return;
345
346 /* lower threshold reached */
347 if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
348 platform_thermal_notify(msr_val);
349 /* higher threshold reached */
350 if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
351 platform_thermal_notify(msr_val);
352}
353
316/* Thermal transition interrupt handler */ 354/* Thermal transition interrupt handler */
317static void intel_thermal_interrupt(void) 355static void intel_thermal_interrupt(void)
318{ 356{
@@ -321,6 +359,9 @@ static void intel_thermal_interrupt(void)
321 359
322 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 360 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
323 361
362 /* Check for violation of core thermal thresholds*/
363 notify_thresholds(msr_val);
364
324 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 365 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
325 THERMAL_THROTTLING_EVENT, 366 THERMAL_THROTTLING_EVENT,
326 CORE_LEVEL) != 0) 367 CORE_LEVEL) != 0)
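
Editor's note: notify_thresholds() only invokes the registered platform_thermal_notify handler when the status MSR logs a threshold bit and thresh_event_valid() says enough time has passed since the last report for that threshold. A stand-alone sketch of the same time-based gating; jiffies are replaced by a plain counter and the interval value is invented:

#include <stdio.h>

#define CHECK_INTERVAL 300	/* illustrative; the kernel uses a jiffies-based interval */

struct thresh_state { unsigned long next_check; };

static struct thresh_state thresh[2];
static unsigned long now;	/* stand-in for get_jiffies_64() */

static int thresh_event_valid(int event)
{
	struct thresh_state *s = &thresh[event];

	if (now < s->next_check)
		return 0;	/* reported too recently, stay quiet */
	s->next_check = now + CHECK_INTERVAL;
	return 1;
}

int main(void)
{
	for (now = 0; now <= 900; now += 150)
		printf("t=%lu thresh0 %s\n", now,
		       thresh_event_valid(0) ? "notify" : "suppressed");
	return 0;
}
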
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed6310183efb..9d977a2ea693 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void)
330{ 330{
331 int i; 331 int i;
332 332
333 if (nmi_watchdog == NMI_LOCAL_APIC)
334 disable_lapic_nmi_watchdog();
335
336 for (i = 0; i < x86_pmu.num_counters; i++) { 333 for (i = 0; i < x86_pmu.num_counters; i++) {
337 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 334 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
338 goto perfctr_fail; 335 goto perfctr_fail;
@@ -355,9 +352,6 @@ perfctr_fail:
355 for (i--; i >= 0; i--) 352 for (i--; i >= 0; i--)
356 release_perfctr_nmi(x86_pmu.perfctr + i); 353 release_perfctr_nmi(x86_pmu.perfctr + i);
357 354
358 if (nmi_watchdog == NMI_LOCAL_APIC)
359 enable_lapic_nmi_watchdog();
360
361 return false; 355 return false;
362} 356}
363 357
@@ -369,9 +363,6 @@ static void release_pmc_hardware(void)
369 release_perfctr_nmi(x86_pmu.perfctr + i); 363 release_perfctr_nmi(x86_pmu.perfctr + i);
370 release_evntsel_nmi(x86_pmu.eventsel + i); 364 release_evntsel_nmi(x86_pmu.eventsel + i);
371 } 365 }
372
373 if (nmi_watchdog == NMI_LOCAL_APIC)
374 enable_lapic_nmi_watchdog();
375} 366}
376 367
377#else 368#else
@@ -381,6 +372,58 @@ static void release_pmc_hardware(void) {}
381 372
382#endif 373#endif
383 374
375static bool check_hw_exists(void)
376{
377 u64 val, val_new = 0;
378 int i, reg, ret = 0;
379
380 /*
381 * Check to see if the BIOS enabled any of the counters, if so
382 * complain and bail.
383 */
384 for (i = 0; i < x86_pmu.num_counters; i++) {
385 reg = x86_pmu.eventsel + i;
386 ret = rdmsrl_safe(reg, &val);
387 if (ret)
388 goto msr_fail;
389 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
390 goto bios_fail;
391 }
392
393 if (x86_pmu.num_counters_fixed) {
394 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
395 ret = rdmsrl_safe(reg, &val);
396 if (ret)
397 goto msr_fail;
398 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
399 if (val & (0x03 << i*4))
400 goto bios_fail;
401 }
402 }
403
404 /*
405 * Now write a value and read it back to see if it matches,
406 * this is needed to detect certain hardware emulators (qemu/kvm)
407 * that don't trap on the MSR access and always return 0s.
408 */
409 val = 0xabcdUL;
410 ret = checking_wrmsrl(x86_pmu.perfctr, val);
411 ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
412 if (ret || val != val_new)
413 goto msr_fail;
414
415 return true;
416
417bios_fail:
418 printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
419 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
420 return false;
421
422msr_fail:
423 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
424 return false;
425}
426
384static void reserve_ds_buffers(void); 427static void reserve_ds_buffers(void);
385static void release_ds_buffers(void); 428static void release_ds_buffers(void);
386 429
@@ -437,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event)
437 struct hw_perf_event *hwc = &event->hw; 480 struct hw_perf_event *hwc = &event->hw;
438 u64 config; 481 u64 config;
439 482
440 if (!hwc->sample_period) { 483 if (!is_sampling_event(event)) {
441 hwc->sample_period = x86_pmu.max_period; 484 hwc->sample_period = x86_pmu.max_period;
442 hwc->last_period = hwc->sample_period; 485 hwc->last_period = hwc->sample_period;
443 local64_set(&hwc->period_left, hwc->sample_period); 486 local64_set(&hwc->period_left, hwc->sample_period);
@@ -954,8 +997,7 @@ x86_perf_event_set_period(struct perf_event *event)
954 997
955static void x86_pmu_enable_event(struct perf_event *event) 998static void x86_pmu_enable_event(struct perf_event *event)
956{ 999{
957 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1000 if (__this_cpu_read(cpu_hw_events.enabled))
958 if (cpuc->enabled)
959 __x86_pmu_enable_event(&event->hw, 1001 __x86_pmu_enable_event(&event->hw,
960 ARCH_PERFMON_EVENTSEL_ENABLE); 1002 ARCH_PERFMON_EVENTSEL_ENABLE);
961} 1003}
@@ -1225,11 +1267,10 @@ perf_event_nmi_handler(struct notifier_block *self,
1225 1267
1226 switch (cmd) { 1268 switch (cmd) {
1227 case DIE_NMI: 1269 case DIE_NMI:
1228 case DIE_NMI_IPI:
1229 break; 1270 break;
1230 case DIE_NMIUNKNOWN: 1271 case DIE_NMIUNKNOWN:
1231 this_nmi = percpu_read(irq_stat.__nmi_count); 1272 this_nmi = percpu_read(irq_stat.__nmi_count);
1232 if (this_nmi != __get_cpu_var(pmu_nmi).marked) 1273 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1233 /* let the kernel handle the unknown nmi */ 1274 /* let the kernel handle the unknown nmi */
1234 return NOTIFY_DONE; 1275 return NOTIFY_DONE;
1235 /* 1276 /*
@@ -1253,8 +1294,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1253 this_nmi = percpu_read(irq_stat.__nmi_count); 1294 this_nmi = percpu_read(irq_stat.__nmi_count);
1254 if ((handled > 1) || 1295 if ((handled > 1) ||
1255 /* the next nmi could be a back-to-back nmi */ 1296 /* the next nmi could be a back-to-back nmi */
1256 ((__get_cpu_var(pmu_nmi).marked == this_nmi) && 1297 ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1257 (__get_cpu_var(pmu_nmi).handled > 1))) { 1298 (__this_cpu_read(pmu_nmi.handled) > 1))) {
1258 /* 1299 /*
1259 * We could have two subsequent back-to-back nmis: The 1300 * We could have two subsequent back-to-back nmis: The
1260 * first handles more than one counter, the 2nd 1301 * first handles more than one counter, the 2nd
@@ -1265,8 +1306,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1265 * handling more than one counter. We will mark the 1306 * handling more than one counter. We will mark the
1266 * next (3rd) and then drop it if unhandled. 1307 * next (3rd) and then drop it if unhandled.
1267 */ 1308 */
1268 __get_cpu_var(pmu_nmi).marked = this_nmi + 1; 1309 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1269 __get_cpu_var(pmu_nmi).handled = handled; 1310 __this_cpu_write(pmu_nmi.handled, handled);
1270 } 1311 }
1271 1312
1272 return NOTIFY_STOP; 1313 return NOTIFY_STOP;
@@ -1275,7 +1316,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1275static __read_mostly struct notifier_block perf_event_nmi_notifier = { 1316static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1276 .notifier_call = perf_event_nmi_handler, 1317 .notifier_call = perf_event_nmi_handler,
1277 .next = NULL, 1318 .next = NULL,
1278 .priority = 1 1319 .priority = NMI_LOCAL_LOW_PRIOR,
1279}; 1320};
1280 1321
1281static struct event_constraint unconstrained; 1322static struct event_constraint unconstrained;
@@ -1348,7 +1389,7 @@ static void __init pmu_check_apic(void)
1348 pr_info("no hardware sampling interrupt available.\n"); 1389 pr_info("no hardware sampling interrupt available.\n");
1349} 1390}
1350 1391
1351void __init init_hw_perf_events(void) 1392int __init init_hw_perf_events(void)
1352{ 1393{
1353 struct event_constraint *c; 1394 struct event_constraint *c;
1354 int err; 1395 int err;
@@ -1363,15 +1404,19 @@ void __init init_hw_perf_events(void)
1363 err = amd_pmu_init(); 1404 err = amd_pmu_init();
1364 break; 1405 break;
1365 default: 1406 default:
1366 return; 1407 return 0;
1367 } 1408 }
1368 if (err != 0) { 1409 if (err != 0) {
1369 pr_cont("no PMU driver, software events only.\n"); 1410 pr_cont("no PMU driver, software events only.\n");
1370 return; 1411 return 0;
1371 } 1412 }
1372 1413
1373 pmu_check_apic(); 1414 pmu_check_apic();
1374 1415
1416 /* sanity check that the hardware exists or is emulated */
1417 if (!check_hw_exists())
1418 return 0;
1419
1375 pr_cont("%s PMU driver.\n", x86_pmu.name); 1420 pr_cont("%s PMU driver.\n", x86_pmu.name);
1376 1421
1377 if (x86_pmu.quirks) 1422 if (x86_pmu.quirks)
@@ -1418,9 +1463,12 @@ void __init init_hw_perf_events(void)
1418 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1463 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1419 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 1464 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1420 1465
1421 perf_pmu_register(&pmu); 1466 perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1422 perf_cpu_notifier(x86_pmu_notifier); 1467 perf_cpu_notifier(x86_pmu_notifier);
1468
1469 return 0;
1423} 1470}
1471early_initcall(init_hw_perf_events);
1424 1472
1425static inline void x86_pmu_read(struct perf_event *event) 1473static inline void x86_pmu_read(struct perf_event *event)
1426{ 1474{
@@ -1434,11 +1482,9 @@ static inline void x86_pmu_read(struct perf_event *event)
1434 */ 1482 */
1435static void x86_pmu_start_txn(struct pmu *pmu) 1483static void x86_pmu_start_txn(struct pmu *pmu)
1436{ 1484{
1437 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1438
1439 perf_pmu_disable(pmu); 1485 perf_pmu_disable(pmu);
1440 cpuc->group_flag |= PERF_EVENT_TXN; 1486 __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1441 cpuc->n_txn = 0; 1487 __this_cpu_write(cpu_hw_events.n_txn, 0);
1442} 1488}
1443 1489
1444/* 1490/*
@@ -1448,14 +1494,12 @@ static void x86_pmu_start_txn(struct pmu *pmu)
1448 */ 1494 */
1449static void x86_pmu_cancel_txn(struct pmu *pmu) 1495static void x86_pmu_cancel_txn(struct pmu *pmu)
1450{ 1496{
1451 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1497 __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1452
1453 cpuc->group_flag &= ~PERF_EVENT_TXN;
1454 /* 1498 /*
1455 * Truncate the collected events. 1499 * Truncate the collected events.
1456 */ 1500 */
1457 cpuc->n_added -= cpuc->n_txn; 1501 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1458 cpuc->n_events -= cpuc->n_txn; 1502 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1459 perf_pmu_enable(pmu); 1503 perf_pmu_enable(pmu);
1460} 1504}
1461 1505
@@ -1666,7 +1710,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1666 1710
1667 perf_callchain_store(entry, regs->ip); 1711 perf_callchain_store(entry, regs->ip);
1668 1712
1669 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1713 dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
1670} 1714}
1671 1715
1672#ifdef CONFIG_COMPAT 1716#ifdef CONFIG_COMPAT
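
Editor's note: check_hw_exists() above guards against two failure modes, a BIOS that has already enabled counters and emulators that silently swallow MSR writes. For the latter it writes a known value to a counter and reads it back, falling back to software events on a mismatch. A tiny sketch of that write/read-back probe against a simulated register; the kernel of course uses checking_wrmsrl()/rdmsrl_safe() on the real perfctr MSR:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* simulated "counter register"; set EMULATED_BROKEN to 1 to mimic an emulator
 * that accepts the write but always reads back zero */
#define EMULATED_BROKEN 0

static uint64_t fake_counter;

static void wr(uint64_t v) { if (!EMULATED_BROKEN) fake_counter = v; }
static uint64_t rd(void)   { return fake_counter; }

static bool counter_works(void)
{
	uint64_t val = 0xabcdULL, val_new;

	wr(val);
	val_new = rd();
	return val == val_new;		/* mismatch => no usable hardware */
}

int main(void)
{
	printf("PMU counter %s\n", counter_works() ?
	       "present" : "missing or emulated, software events only");
	return 0;
}
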
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 46d58448c3af..67e2202a6039 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,7 +1,5 @@
1#ifdef CONFIG_CPU_SUP_AMD 1#ifdef CONFIG_CPU_SUP_AMD
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst const u64 amd_hw_cache_event_ids 3static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 4 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 5 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -275,17 +273,17 @@ done:
275 return &emptyconstraint; 273 return &emptyconstraint;
276} 274}
277 275
278static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) 276static struct amd_nb *amd_alloc_nb(int cpu)
279{ 277{
280 struct amd_nb *nb; 278 struct amd_nb *nb;
281 int i; 279 int i;
282 280
283 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); 281 nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
282 cpu_to_node(cpu));
284 if (!nb) 283 if (!nb)
285 return NULL; 284 return NULL;
286 285
287 memset(nb, 0, sizeof(*nb)); 286 nb->nb_id = -1;
288 nb->nb_id = nb_id;
289 287
290 /* 288 /*
291 * initialize all possible NB constraints 289 * initialize all possible NB constraints
@@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu)
306 if (boot_cpu_data.x86_max_cores < 2) 304 if (boot_cpu_data.x86_max_cores < 2)
307 return NOTIFY_OK; 305 return NOTIFY_OK;
308 306
309 cpuc->amd_nb = amd_alloc_nb(cpu, -1); 307 cpuc->amd_nb = amd_alloc_nb(cpu);
310 if (!cpuc->amd_nb) 308 if (!cpuc->amd_nb)
311 return NOTIFY_BAD; 309 return NOTIFY_BAD;
312 310
@@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu)
325 nb_id = amd_get_nb_id(cpu); 323 nb_id = amd_get_nb_id(cpu);
326 WARN_ON_ONCE(nb_id == BAD_APICID); 324 WARN_ON_ONCE(nb_id == BAD_APICID);
327 325
328 raw_spin_lock(&amd_nb_lock);
329
330 for_each_online_cpu(i) { 326 for_each_online_cpu(i) {
331 nb = per_cpu(cpu_hw_events, i).amd_nb; 327 nb = per_cpu(cpu_hw_events, i).amd_nb;
332 if (WARN_ON_ONCE(!nb)) 328 if (WARN_ON_ONCE(!nb))
@@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu)
341 337
342 cpuc->amd_nb->nb_id = nb_id; 338 cpuc->amd_nb->nb_id = nb_id;
343 cpuc->amd_nb->refcnt++; 339 cpuc->amd_nb->refcnt++;
344
345 raw_spin_unlock(&amd_nb_lock);
346} 340}
347 341
348static void amd_pmu_cpu_dead(int cpu) 342static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu)
354 348
355 cpuhw = &per_cpu(cpu_hw_events, cpu); 349 cpuhw = &per_cpu(cpu_hw_events, cpu);
356 350
357 raw_spin_lock(&amd_nb_lock);
358
359 if (cpuhw->amd_nb) { 351 if (cpuhw->amd_nb) {
360 struct amd_nb *nb = cpuhw->amd_nb; 352 struct amd_nb *nb = cpuhw->amd_nb;
361 353
@@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu)
364 356
365 cpuhw->amd_nb = NULL; 357 cpuhw->amd_nb = NULL;
366 } 358 }
367
368 raw_spin_unlock(&amd_nb_lock);
369} 359}
370 360
371static __initconst const struct x86_pmu amd_pmu = { 361static __initconst const struct x86_pmu amd_pmu = {
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c8f5c088cad1..008835c1d79c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -649,7 +649,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
649 struct hw_perf_event *hwc = &event->hw; 649 struct hw_perf_event *hwc = &event->hw;
650 650
651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
652 if (!__get_cpu_var(cpu_hw_events).enabled) 652 if (!__this_cpu_read(cpu_hw_events.enabled))
653 return; 653 return;
654 654
655 intel_pmu_enable_bts(hwc->config); 655 intel_pmu_enable_bts(hwc->config);
@@ -679,7 +679,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event)
679 679
680static void intel_pmu_reset(void) 680static void intel_pmu_reset(void)
681{ 681{
682 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; 682 struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
683 unsigned long flags; 683 unsigned long flags;
684 int idx; 684 int idx;
685 685
@@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
816 if (ret) 816 if (ret)
817 return ret; 817 return ret;
818 818
819 if (event->attr.precise_ip &&
820 (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
821 /*
822 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
823 * (0x003c) so that we can use it with PEBS.
824 *
825 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
826 * PEBS capable. However we can use INST_RETIRED.ANY_P
827 * (0x00c0), which is a PEBS capable event, to get the same
828 * count.
829 *
830 * INST_RETIRED.ANY_P counts the number of cycles that retires
831 * CNTMASK instructions. By setting CNTMASK to a value (16)
832 * larger than the maximum number of instructions that can be
833 * retired per cycle (4) and then inverting the condition, we
834 * count all cycles that retire 16 or less instructions, which
835 * is every cycle.
836 *
837 * Thereby we gain a PEBS capable cycle counter.
838 */
839 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
840
841 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
842 event->hw.config = alt_config;
843 }
844
819 if (event->attr.type != PERF_TYPE_RAW) 845 if (event->attr.type != PERF_TYPE_RAW)
820 return 0; 846 return 0;
821 847
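
Editor's note: the long comment in intel_pmu_hw_config() explains the substitution of INST_RETIRED.ANY_P (0xc0) with cmask=16 and the invert bit for the non-PEBS-capable CPU_CLK_UNHALTED.THREAD_P event. The magic constant 0x108000c0 is just those fields packed into the architectural PERFEVTSEL layout (event select in bits 0-7, INV at bit 23, CMASK in bits 24-31); a quick check of the arithmetic:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint64_t event = 0xc0;			/* INST_RETIRED.ANY_P */
	uint64_t inv   = 1ULL << 23;		/* invert the counter-mask comparison */
	uint64_t cmask = 16ULL << 24;		/* counter mask = 16 */

	uint64_t alt_config = cmask | inv | event;

	assert(alt_config == 0x108000c0ULL);	/* the constant used in the patch */
	printf("alt_config = 0x%llx\n", (unsigned long long)alt_config);
	return 0;
}
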
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 81400b93e694..e56b9bfbabd1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,21 @@ out:
753 753
754static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) 754static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
755{ 755{
756 int overflow = 0; 756 u64 v;
757 u32 low, high;
758 757
759 rdmsr(hwc->config_base + hwc->idx, low, high); 758 /* an official way for overflow indication */
760 759 rdmsrl(hwc->config_base + hwc->idx, v);
761 /* we need to check high bit for unflagged overflows */ 760 if (v & P4_CCCR_OVF) {
762 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { 761 wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF);
763 overflow = 1; 762 return 1;
764 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
765 ((u64)low) & ~P4_CCCR_OVF);
766 } 763 }
767 764
768 return overflow; 765 /* it might be unflagged overflow */
766 rdmsrl(hwc->event_base + hwc->idx, v);
767 if (!(v & ARCH_P4_CNTRVAL_MASK))
768 return 1;
769
770 return 0;
769} 771}
770 772
771static void p4_pmu_disable_pebs(void) 773static void p4_pmu_disable_pebs(void)
@@ -1152,9 +1154,9 @@ static __initconst const struct x86_pmu p4_pmu = {
1152 */ 1154 */
1153 .num_counters = ARCH_P4_MAX_CCCR, 1155 .num_counters = ARCH_P4_MAX_CCCR,
1154 .apic = 1, 1156 .apic = 1,
1155 .cntval_bits = 40, 1157 .cntval_bits = ARCH_P4_CNTRVAL_BITS,
1156 .cntval_mask = (1ULL << 40) - 1, 1158 .cntval_mask = ARCH_P4_CNTRVAL_MASK,
1157 .max_period = (1ULL << 39) - 1, 1159 .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
1158 .hw_config = p4_hw_config, 1160 .hw_config = p4_hw_config,
1159 .schedule_events = p4_pmu_schedule_events, 1161 .schedule_events = p4_pmu_schedule_events,
1160 /* 1162 /*
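
Editor's note: the p4_pmu hunk replaces the literal 40-bit counter constants with ARCH_P4_CNTRVAL_* macros. Assuming the header defines them the way the replaced literals suggest (40 bits, full-width mask), the mask and max_period follow by simple arithmetic:

#include <stdio.h>
#include <stdint.h>

#define ARCH_P4_CNTRVAL_BITS 40	/* matches the literal the patch replaces */

int main(void)
{
	uint64_t mask       = (1ULL << ARCH_P4_CNTRVAL_BITS) - 1;
	uint64_t max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1;

	printf("cntval_mask = 0x%llx\n", (unsigned long long)mask);
	printf("max_period  = 0x%llx\n", (unsigned long long)max_period);
	return 0;
}
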
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d9f4ff8fcd69..d5a236615501 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -16,32 +16,12 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <asm/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_event.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr;
27 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
28 unsigned int evntsel_msr; /* the MSR to select the events to handle */
29};
30
31/* Interface defining a CPU specific perfctr watchdog */
32struct wd_ops {
33 int (*reserve)(void);
34 void (*unreserve)(void);
35 int (*setup)(unsigned nmi_hz);
36 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
37 void (*stop)(void);
38 unsigned perfctr;
39 unsigned evntsel;
40 u64 checkbit;
41};
42
43static const struct wd_ops *wd_ops;
44
45/* 25/*
46 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 26 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
47 * offset from MSR_P4_BSU_ESCR0. 27 * offset from MSR_P4_BSU_ESCR0.
@@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops;
60static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); 40static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
61static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); 41static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
62 42
63static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
64
65/* converts an msr to an appropriate reservation bit */ 43/* converts an msr to an appropriate reservation bit */
66static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) 44static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
67{ 45{
@@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr)
172 clear_bit(counter, evntsel_nmi_owner); 150 clear_bit(counter, evntsel_nmi_owner);
173} 151}
174EXPORT_SYMBOL(release_evntsel_nmi); 152EXPORT_SYMBOL(release_evntsel_nmi);
175
176void disable_lapic_nmi_watchdog(void)
177{
178 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
179
180 if (atomic_read(&nmi_active) <= 0)
181 return;
182
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184
185 if (wd_ops)
186 wd_ops->unreserve();
187
188 BUG_ON(atomic_read(&nmi_active) != 0);
189}
190
191void enable_lapic_nmi_watchdog(void)
192{
193 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
194
195 /* are we already enabled */
196 if (atomic_read(&nmi_active) != 0)
197 return;
198
199 /* are we lapic aware */
200 if (!wd_ops)
201 return;
202 if (!wd_ops->reserve()) {
203 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
204 return;
205 }
206
207 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
208 touch_nmi_watchdog();
209}
210
211/*
212 * Activate the NMI watchdog via the local APIC.
213 */
214
215static unsigned int adjust_for_32bit_ctr(unsigned int hz)
216{
217 u64 counter_val;
218 unsigned int retval = hz;
219
220 /*
221 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
222 * are writable, with higher bits sign extending from bit 31.
223 * So, we can only program the counter with 31 bit values and
224 * 32nd bit should be 1, for 33.. to be 1.
225 * Find the appropriate nmi_hz
226 */
227 counter_val = (u64)cpu_khz * 1000;
228 do_div(counter_val, retval);
229 if (counter_val > 0x7fffffffULL) {
230 u64 count = (u64)cpu_khz * 1000;
231 do_div(count, 0x7fffffffUL);
232 retval = count + 1;
233 }
234 return retval;
235}
236
237static void write_watchdog_counter(unsigned int perfctr_msr,
238 const char *descr, unsigned nmi_hz)
239{
240 u64 count = (u64)cpu_khz * 1000;
241
242 do_div(count, nmi_hz);
243 if (descr)
244 pr_debug("setting %s to -0x%08Lx\n", descr, count);
245 wrmsrl(perfctr_msr, 0 - count);
246}
247
248static void write_watchdog_counter32(unsigned int perfctr_msr,
249 const char *descr, unsigned nmi_hz)
250{
251 u64 count = (u64)cpu_khz * 1000;
252
253 do_div(count, nmi_hz);
254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsr(perfctr_msr, (u32)(-count), 0);
257}
258
259/*
260 * AMD K7/K8/Family10h/Family11h support.
261 * AMD keeps this interface nicely stable so there is not much variety
262 */
263#define K7_EVNTSEL_ENABLE (1 << 22)
264#define K7_EVNTSEL_INT (1 << 20)
265#define K7_EVNTSEL_OS (1 << 17)
266#define K7_EVNTSEL_USR (1 << 16)
267#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
268#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
269
270static int setup_k7_watchdog(unsigned nmi_hz)
271{
272 unsigned int perfctr_msr, evntsel_msr;
273 unsigned int evntsel;
274 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
275
276 perfctr_msr = wd_ops->perfctr;
277 evntsel_msr = wd_ops->evntsel;
278
279 wrmsrl(perfctr_msr, 0UL);
280
281 evntsel = K7_EVNTSEL_INT
282 | K7_EVNTSEL_OS
283 | K7_EVNTSEL_USR
284 | K7_NMI_EVENT;
285
286 /* setup the timer */
287 wrmsr(evntsel_msr, evntsel, 0);
288 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
289
290 /* initialize the wd struct before enabling */
291 wd->perfctr_msr = perfctr_msr;
292 wd->evntsel_msr = evntsel_msr;
293 wd->cccr_msr = 0; /* unused */
294
295 /* ok, everything is initialized, announce that we're set */
296 cpu_nmi_set_wd_enabled();
297
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301
302 return 1;
303}
304
305static void single_msr_stop_watchdog(void)
306{
307 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
308
309 wrmsr(wd->evntsel_msr, 0, 0);
310}
311
312static int single_msr_reserve(void)
313{
314 if (!reserve_perfctr_nmi(wd_ops->perfctr))
315 return 0;
316
317 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
318 release_perfctr_nmi(wd_ops->perfctr);
319 return 0;
320 }
321 return 1;
322}
323
324static void single_msr_unreserve(void)
325{
326 release_evntsel_nmi(wd_ops->evntsel);
327 release_perfctr_nmi(wd_ops->perfctr);
328}
329
330static void __kprobes
331single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
332{
333 /* start the cycle over again */
334 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
335}
336
337static const struct wd_ops k7_wd_ops = {
338 .reserve = single_msr_reserve,
339 .unreserve = single_msr_unreserve,
340 .setup = setup_k7_watchdog,
341 .rearm = single_msr_rearm,
342 .stop = single_msr_stop_watchdog,
343 .perfctr = MSR_K7_PERFCTR0,
344 .evntsel = MSR_K7_EVNTSEL0,
345 .checkbit = 1ULL << 47,
346};
347
348/*
349 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
350 */
351#define P6_EVNTSEL0_ENABLE (1 << 22)
352#define P6_EVNTSEL_INT (1 << 20)
353#define P6_EVNTSEL_OS (1 << 17)
354#define P6_EVNTSEL_USR (1 << 16)
355#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
356#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
357
358static int setup_p6_watchdog(unsigned nmi_hz)
359{
360 unsigned int perfctr_msr, evntsel_msr;
361 unsigned int evntsel;
362 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
363
364 perfctr_msr = wd_ops->perfctr;
365 evntsel_msr = wd_ops->evntsel;
366
367 /* KVM doesn't implement this MSR */
368 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
369 return 0;
370
371 evntsel = P6_EVNTSEL_INT
372 | P6_EVNTSEL_OS
373 | P6_EVNTSEL_USR
374 | P6_NMI_EVENT;
375
376 /* setup the timer */
377 wrmsr(evntsel_msr, evntsel, 0);
378 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
379 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
380
381 /* initialize the wd struct before enabling */
382 wd->perfctr_msr = perfctr_msr;
383 wd->evntsel_msr = evntsel_msr;
384 wd->cccr_msr = 0; /* unused */
385
386 /* ok, everything is initialized, announce that we're set */
387 cpu_nmi_set_wd_enabled();
388
389 apic_write(APIC_LVTPC, APIC_DM_NMI);
390 evntsel |= P6_EVNTSEL0_ENABLE;
391 wrmsr(evntsel_msr, evntsel, 0);
392
393 return 1;
394}
395
396static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
397{
398 /*
399 	 * P6-based Pentium M needs to re-unmask
400 	 * the APIC vector, but it doesn't hurt
401 	 * other P6 variants.
402 	 * ArchPerfmon/Core Duo also needs this.
403 */
404 apic_write(APIC_LVTPC, APIC_DM_NMI);
405
406 /* P6/ARCH_PERFMON has 32 bit counter write */
407 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
408}
409
410static const struct wd_ops p6_wd_ops = {
411 .reserve = single_msr_reserve,
412 .unreserve = single_msr_unreserve,
413 .setup = setup_p6_watchdog,
414 .rearm = p6_rearm,
415 .stop = single_msr_stop_watchdog,
416 .perfctr = MSR_P6_PERFCTR0,
417 .evntsel = MSR_P6_EVNTSEL0,
418 .checkbit = 1ULL << 39,
419};
420
421/*
422 * Intel P4 performance counters.
423 * By far the most complicated of all.
424 */
425#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
426#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
427#define P4_ESCR_OS (1 << 3)
428#define P4_ESCR_USR (1 << 2)
429#define P4_CCCR_OVF_PMI0 (1 << 26)
430#define P4_CCCR_OVF_PMI1 (1 << 27)
431#define P4_CCCR_THRESHOLD(N) ((N) << 20)
432#define P4_CCCR_COMPLEMENT (1 << 19)
433#define P4_CCCR_COMPARE (1 << 18)
434#define P4_CCCR_REQUIRED (3 << 16)
435#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
436#define P4_CCCR_ENABLE (1 << 12)
437#define P4_CCCR_OVF (1 << 31)
438
439#define P4_CONTROLS 18
440static unsigned int p4_controls[18] = {
441 MSR_P4_BPU_CCCR0,
442 MSR_P4_BPU_CCCR1,
443 MSR_P4_BPU_CCCR2,
444 MSR_P4_BPU_CCCR3,
445 MSR_P4_MS_CCCR0,
446 MSR_P4_MS_CCCR1,
447 MSR_P4_MS_CCCR2,
448 MSR_P4_MS_CCCR3,
449 MSR_P4_FLAME_CCCR0,
450 MSR_P4_FLAME_CCCR1,
451 MSR_P4_FLAME_CCCR2,
452 MSR_P4_FLAME_CCCR3,
453 MSR_P4_IQ_CCCR0,
454 MSR_P4_IQ_CCCR1,
455 MSR_P4_IQ_CCCR2,
456 MSR_P4_IQ_CCCR3,
457 MSR_P4_IQ_CCCR4,
458 MSR_P4_IQ_CCCR5,
459};
460/*
461 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
462 * CRU_ESCR0 (with any non-null event selector) through a complemented
463 * max threshold. [IA32-Vol3, Section 14.9.9]
464 */
465static int setup_p4_watchdog(unsigned nmi_hz)
466{
467 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
468 unsigned int evntsel, cccr_val;
469 unsigned int misc_enable, dummy;
470 unsigned int ht_num;
471 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472
473 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
474 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
475 return 0;
476
477#ifdef CONFIG_SMP
478 /* detect which hyperthread we are on */
479 if (smp_num_siblings == 2) {
480 unsigned int ebx, apicid;
481
482 ebx = cpuid_ebx(1);
483 apicid = (ebx >> 24) & 0xff;
484 ht_num = apicid & 1;
485 } else
486#endif
487 ht_num = 0;
488
489 /*
490 * performance counters are shared resources
491 * assign each hyperthread its own set
492 * (re-use the ESCR0 register, seems safe
493 * and keeps the cccr_val the same)
494 */
495 if (!ht_num) {
496 /* logical cpu 0 */
497 perfctr_msr = MSR_P4_IQ_PERFCTR0;
498 evntsel_msr = MSR_P4_CRU_ESCR0;
499 cccr_msr = MSR_P4_IQ_CCCR0;
500 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
501
502 /*
503 * If we're on the kdump kernel or other situation, we may
504 * still have other performance counter registers set to
505 * interrupt and they'll keep interrupting forever because
506 * of the P4_CCCR_OVF quirk. So we need to ACK all the
507 * pending interrupts and disable all the registers here,
508 * before reenabling the NMI delivery. Refer to p4_rearm()
509 * about the P4_CCCR_OVF quirk.
510 */
511 if (reset_devices) {
512 unsigned int low, high;
513 int i;
514
515 for (i = 0; i < P4_CONTROLS; i++) {
516 rdmsr(p4_controls[i], low, high);
517 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
518 wrmsr(p4_controls[i], low, high);
519 }
520 }
521 } else {
522 /* logical cpu 1 */
523 perfctr_msr = MSR_P4_IQ_PERFCTR1;
524 evntsel_msr = MSR_P4_CRU_ESCR0;
525 cccr_msr = MSR_P4_IQ_CCCR1;
526
527 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
528 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
529 cccr_val = P4_CCCR_OVF_PMI0;
530 else
531 cccr_val = P4_CCCR_OVF_PMI1;
532 cccr_val |= P4_CCCR_ESCR_SELECT(4);
533 }
534
535 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
536 | P4_ESCR_OS
537 | P4_ESCR_USR;
538
539 cccr_val |= P4_CCCR_THRESHOLD(15)
540 | P4_CCCR_COMPLEMENT
541 | P4_CCCR_COMPARE
542 | P4_CCCR_REQUIRED;
543
544 wrmsr(evntsel_msr, evntsel, 0);
545 wrmsr(cccr_msr, cccr_val, 0);
546 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
547
548 wd->perfctr_msr = perfctr_msr;
549 wd->evntsel_msr = evntsel_msr;
550 wd->cccr_msr = cccr_msr;
551
552 /* ok, everything is initialized, announce that we're set */
553 cpu_nmi_set_wd_enabled();
554
555 apic_write(APIC_LVTPC, APIC_DM_NMI);
556 cccr_val |= P4_CCCR_ENABLE;
557 wrmsr(cccr_msr, cccr_val, 0);
558 return 1;
559}
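
For reference, the CCCR value assembled above for logical CPU 0 works out to 0x04ff8000 before the enable bit is set, and 0x04ff9000 after. The stand-alone check below only re-evaluates the bit macros defined earlier in this file; it is a sanity calculation, not kernel code.

/* Re-evaluate the CCCR bit macros from this file and print the combined value. */
#include <stdio.h>

#define P4_CCCR_OVF_PMI0	(1 << 26)
#define P4_CCCR_THRESHOLD(N)	((N) << 20)
#define P4_CCCR_COMPLEMENT	(1 << 19)
#define P4_CCCR_COMPARE		(1 << 18)
#define P4_CCCR_REQUIRED	(3 << 16)
#define P4_CCCR_ESCR_SELECT(N)	((N) << 13)
#define P4_CCCR_ENABLE		(1 << 12)

int main(void)
{
	unsigned int cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4)
			      | P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT
			      | P4_CCCR_COMPARE | P4_CCCR_REQUIRED;

	printf("before enable: 0x%08x\n", cccr_val);			/* 0x04ff8000 */
	printf("after enable:  0x%08x\n", cccr_val | P4_CCCR_ENABLE);	/* 0x04ff9000 */
	return 0;
}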
560
561static void stop_p4_watchdog(void)
562{
563 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
564 wrmsr(wd->cccr_msr, 0, 0);
565 wrmsr(wd->evntsel_msr, 0, 0);
566}
567
568static int p4_reserve(void)
569{
570 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
571 return 0;
572#ifdef CONFIG_SMP
573 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
574 goto fail1;
575#endif
576 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
577 goto fail2;
578 /* RED-PEN why is ESCR1 not reserved here? */
579 return 1;
580 fail2:
581#ifdef CONFIG_SMP
582 if (smp_num_siblings > 1)
583 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
584 fail1:
585#endif
586 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
587 return 0;
588}
589
590static void p4_unreserve(void)
591{
592#ifdef CONFIG_SMP
593 if (smp_num_siblings > 1)
594 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
595#endif
596 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
597 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
598}
599
600static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
601{
602 unsigned dummy;
603 /*
604 * P4 quirks:
605 * - An overflown perfctr will assert its interrupt
606 * until the OVF flag in its CCCR is cleared.
607 * - LVTPC is masked on interrupt and must be
608 * unmasked by the LVTPC handler.
609 */
610 rdmsrl(wd->cccr_msr, dummy);
611 dummy &= ~P4_CCCR_OVF;
612 wrmsrl(wd->cccr_msr, dummy);
613 apic_write(APIC_LVTPC, APIC_DM_NMI);
614 /* start the cycle over again */
615 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
616}
617
618static const struct wd_ops p4_wd_ops = {
619 .reserve = p4_reserve,
620 .unreserve = p4_unreserve,
621 .setup = setup_p4_watchdog,
622 .rearm = p4_rearm,
623 .stop = stop_p4_watchdog,
624 /* RED-PEN this is wrong for the other sibling */
625 .perfctr = MSR_P4_BPU_PERFCTR0,
626 .evntsel = MSR_P4_BSU_ESCR0,
627 .checkbit = 1ULL << 39,
628};
629
630/*
631 * Watchdog using the Intel architected PerfMon.
632 * Used for Core2 and hopefully all future Intel CPUs.
633 */
634#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
635#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
636
637static struct wd_ops intel_arch_wd_ops;
638
639static int setup_intel_arch_watchdog(unsigned nmi_hz)
640{
641 unsigned int ebx;
642 union cpuid10_eax eax;
643 unsigned int unused;
644 unsigned int perfctr_msr, evntsel_msr;
645 unsigned int evntsel;
646 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
647
648 /*
649 * Check whether the Architectural PerfMon supports
650 * Unhalted Core Cycles Event or not.
651 * NOTE: Corresponding bit = 0 in ebx indicates event present.
652 */
653 cpuid(10, &(eax.full), &ebx, &unused, &unused);
654 if ((eax.split.mask_length <
655 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
656 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
657 return 0;
658
659 perfctr_msr = wd_ops->perfctr;
660 evntsel_msr = wd_ops->evntsel;
661
662 wrmsrl(perfctr_msr, 0UL);
663
664 evntsel = ARCH_PERFMON_EVENTSEL_INT
665 | ARCH_PERFMON_EVENTSEL_OS
666 | ARCH_PERFMON_EVENTSEL_USR
667 | ARCH_PERFMON_NMI_EVENT_SEL
668 | ARCH_PERFMON_NMI_EVENT_UMASK;
669
670 /* setup the timer */
671 wrmsr(evntsel_msr, evntsel, 0);
672 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
673 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
674
675 wd->perfctr_msr = perfctr_msr;
676 wd->evntsel_msr = evntsel_msr;
677 wd->cccr_msr = 0; /* unused */
678
679 /* ok, everything is initialized, announce that we're set */
680 cpu_nmi_set_wd_enabled();
681
682 apic_write(APIC_LVTPC, APIC_DM_NMI);
683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
684 wrmsr(evntsel_msr, evntsel, 0);
685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
686 return 1;
687}
688
689static struct wd_ops intel_arch_wd_ops __read_mostly = {
690 .reserve = single_msr_reserve,
691 .unreserve = single_msr_unreserve,
692 .setup = setup_intel_arch_watchdog,
693 .rearm = p6_rearm,
694 .stop = single_msr_stop_watchdog,
695 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
696 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
697};
698
699static void probe_nmi_watchdog(void)
700{
701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 == 6 ||
704 (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
705 wd_ops = &k7_wd_ops;
706 return;
707 case X86_VENDOR_INTEL:
708 		/* Work around CPUs where perfctr1 doesn't have a working
709 		 * enable bit, as described in the following errata:
710 * AE49 Core Duo and Intel Core Solo 65 nm
711 * AN49 Intel Pentium Dual-Core
712 * AF49 Dual-Core Intel Xeon Processor LV
713 */
714 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
715 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
716 boot_cpu_data.x86_mask == 4))) {
717 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
718 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
719 }
720 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
721 wd_ops = &intel_arch_wd_ops;
722 break;
723 }
724 switch (boot_cpu_data.x86) {
725 case 6:
726 if (boot_cpu_data.x86_model > 13)
727 return;
728
729 wd_ops = &p6_wd_ops;
730 break;
731 case 15:
732 wd_ops = &p4_wd_ops;
733 break;
734 default:
735 return;
736 }
737 break;
738 }
739}
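
probe_nmi_watchdog() is the only place wd_ops is chosen; every other path dispatches through the ops table. A minimal user-space model of that dispatch pattern is sketched below; the demo_* names are invented and the vendor/family checks are collapsed into a single flag.

/* Toy model of the wd_ops dispatch; not the kernel structures. */
#include <stdio.h>

struct demo_wd_ops {
	int  (*setup)(unsigned int hz);
	void (*stop)(void);
};

static int  k7_demo_setup(unsigned int hz) { printf("K7 watchdog at %u Hz\n", hz); return 1; }
static void k7_demo_stop(void)             { puts("K7 watchdog stopped"); }

static const struct demo_wd_ops k7_demo_ops = {
	.setup = k7_demo_setup,
	.stop  = k7_demo_stop,
};

static const struct demo_wd_ops *demo_ops;	/* plays the role of wd_ops */

static void demo_probe(int vendor_is_amd)
{
	if (vendor_is_amd)			/* family/model checks omitted */
		demo_ops = &k7_demo_ops;
}

int main(void)
{
	demo_probe(1);
	if (demo_ops && demo_ops->setup(60))
		demo_ops->stop();
	return 0;
}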
740
741/* Interface to nmi.c */
742
743int lapic_watchdog_init(unsigned nmi_hz)
744{
745 if (!wd_ops) {
746 probe_nmi_watchdog();
747 if (!wd_ops) {
748 printk(KERN_INFO "NMI watchdog: CPU not supported\n");
749 return -1;
750 }
751
752 if (!wd_ops->reserve()) {
753 printk(KERN_ERR
754 "NMI watchdog: cannot reserve perfctrs\n");
755 return -1;
756 }
757 }
758
759 if (!(wd_ops->setup(nmi_hz))) {
760 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
761 raw_smp_processor_id());
762 return -1;
763 }
764
765 return 0;
766}
767
768void lapic_watchdog_stop(void)
769{
770 if (wd_ops)
771 wd_ops->stop();
772}
773
774unsigned lapic_adjust_nmi_hz(unsigned hz)
775{
776 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
777 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
778 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
779 hz = adjust_for_32bit_ctr(hz);
780 return hz;
781}
782
783int __kprobes lapic_wd_event(unsigned nmi_hz)
784{
785 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
786 u64 ctr;
787
788 rdmsrl(wd->perfctr_msr, ctr);
789 if (ctr & wd_ops->checkbit) /* perfctr still running? */
790 return 0;
791
792 wd_ops->rearm(wd, nmi_hz);
793 return 1;
794}
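
lapic_wd_event() decides whether the NMI came from the watchdog by testing the counter's high "check" bit: the perfctr is started at a large negative value, so that bit stays set until the counter overflows and wraps to a small value. A small illustration with assumed values (48-bit, K7-style counter, checkbit = bit 47):

/* Illustration of the checkbit test; the sample counter values are made up. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t checkbit = 1ULL << 47;

	uint64_t still_running = 0xffffa0000000ULL;	/* counting up, bit 47 still set */
	uint64_t wrapped       = 0x1f4ULL;		/* overflowed, restarted near 0  */

	printf("%s\n", (still_running & checkbit) ? "not our NMI" : "watchdog NMI");
	printf("%s\n", (wrapped & checkbit)       ? "not our NMI" : "watchdog NMI");
	return 0;
}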
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d86..212a6a42527c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,7 +33,6 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
37#include <linux/major.h> 36#include <linux/major.h>
38#include <linux/fs.h> 37#include <linux/fs.h>
39#include <linux/device.h> 38#include <linux/device.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..df20723a6a1b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
175 175
176void 176void
177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 177show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
178 unsigned long *stack, unsigned long bp, char *log_lvl) 178 unsigned long *stack, char *log_lvl)
179{ 179{
180 printk("%sCall Trace:\n", log_lvl); 180 printk("%sCall Trace:\n", log_lvl);
181 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 181 dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
182} 182}
183 183
184void show_trace(struct task_struct *task, struct pt_regs *regs, 184void show_trace(struct task_struct *task, struct pt_regs *regs,
185 unsigned long *stack, unsigned long bp) 185 unsigned long *stack)
186{ 186{
187 show_trace_log_lvl(task, regs, stack, bp, ""); 187 show_trace_log_lvl(task, regs, stack, "");
188} 188}
189 189
190void show_stack(struct task_struct *task, unsigned long *sp) 190void show_stack(struct task_struct *task, unsigned long *sp)
191{ 191{
192 show_stack_log_lvl(task, NULL, sp, 0, ""); 192 show_stack_log_lvl(task, NULL, sp, "");
193} 193}
194 194
195/* 195/*
@@ -197,20 +197,14 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 197 */
198void dump_stack(void) 198void dump_stack(void)
199{ 199{
200 unsigned long bp = 0;
201 unsigned long stack; 200 unsigned long stack;
202 201
203#ifdef CONFIG_FRAME_POINTER
204 if (!bp)
205 get_bp(bp);
206#endif
207
208 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 202 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
209 current->pid, current->comm, print_tainted(), 203 current->pid, current->comm, print_tainted(),
210 init_utsname()->release, 204 init_utsname()->release,
211 (int)strcspn(init_utsname()->version, " "), 205 (int)strcspn(init_utsname()->version, " "),
212 init_utsname()->version); 206 init_utsname()->version);
213 show_trace(NULL, NULL, &stack, bp); 207 show_trace(NULL, NULL, &stack);
214} 208}
215EXPORT_SYMBOL(dump_stack); 209EXPORT_SYMBOL(dump_stack);
216 210
@@ -240,6 +234,7 @@ unsigned __kprobes long oops_begin(void)
240 bust_spinlocks(1); 234 bust_spinlocks(1);
241 return flags; 235 return flags;
242} 236}
237EXPORT_SYMBOL_GPL(oops_begin);
243 238
244void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 239void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
245{ 240{
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 1bc7f75a5bda..74cc1eda384b 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,11 +17,12 @@
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19 19
20void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task,
21 unsigned long *stack, unsigned long bp, 21 struct pt_regs *regs, unsigned long *stack,
22 const struct stacktrace_ops *ops, void *data) 22 const struct stacktrace_ops *ops, void *data)
23{ 23{
24 int graph = 0; 24 int graph = 0;
25 unsigned long bp;
25 26
26 if (!task) 27 if (!task)
27 task = current; 28 task = current;
@@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
34 stack = (unsigned long *)task->thread.sp; 35 stack = (unsigned long *)task->thread.sp;
35 } 36 }
36 37
37#ifdef CONFIG_FRAME_POINTER 38 bp = stack_frame(task, regs);
38 if (!bp) {
39 if (task == current) {
40 /* Grab bp right from our regs */
41 get_bp(bp);
42 } else {
43 /* bp is the last reg pushed by switch_to */
44 bp = *(unsigned long *) task->thread.sp;
45 }
46 }
47#endif
48
49 for (;;) { 39 for (;;) {
50 struct thread_info *context; 40 struct thread_info *context;
51 41
@@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace);
65 55
66void 56void
67show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 57show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
68 unsigned long *sp, unsigned long bp, char *log_lvl) 58 unsigned long *sp, char *log_lvl)
69{ 59{
70 unsigned long *stack; 60 unsigned long *stack;
71 int i; 61 int i;
@@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
87 touch_nmi_watchdog(); 77 touch_nmi_watchdog();
88 } 78 }
89 printk(KERN_CONT "\n"); 79 printk(KERN_CONT "\n");
90 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 80 show_trace_log_lvl(task, regs, sp, log_lvl);
91} 81}
92 82
93 83
@@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs)
112 u8 *ip; 102 u8 *ip;
113 103
114 printk(KERN_EMERG "Stack:\n"); 104 printk(KERN_EMERG "Stack:\n");
115 show_stack_log_lvl(NULL, regs, &regs->sp, 105 show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
116 0, KERN_EMERG);
117 106
118 printk(KERN_EMERG "Code: "); 107 printk(KERN_EMERG "Code: ");
119 108
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6a340485249a..a6b6fcf7f0ae 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 139 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
140 */ 140 */
141 141
142void dump_trace(struct task_struct *task, struct pt_regs *regs, 142void dump_trace(struct task_struct *task,
143 unsigned long *stack, unsigned long bp, 143 struct pt_regs *regs, unsigned long *stack,
144 const struct stacktrace_ops *ops, void *data) 144 const struct stacktrace_ops *ops, void *data)
145{ 145{
146 const unsigned cpu = get_cpu(); 146 const unsigned cpu = get_cpu();
@@ -149,29 +149,19 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long dummy;
153 unsigned long bp;
152 154
153 if (!task) 155 if (!task)
154 task = current; 156 task = current;
155 157
156 if (!stack) { 158 if (!stack) {
157 unsigned long dummy;
158 stack = &dummy; 159 stack = &dummy;
159 if (task && task != current) 160 if (task && task != current)
160 stack = (unsigned long *)task->thread.sp; 161 stack = (unsigned long *)task->thread.sp;
161 } 162 }
162 163
163#ifdef CONFIG_FRAME_POINTER 164 bp = stack_frame(task, regs);
164 if (!bp) {
165 if (task == current) {
166 /* Grab bp right from our regs */
167 get_bp(bp);
168 } else {
169 /* bp is the last reg pushed by switch_to */
170 bp = *(unsigned long *) task->thread.sp;
171 }
172 }
173#endif
174
175 /* 165 /*
176 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
177 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
235 225
236void 226void
237show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 227show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
238 unsigned long *sp, unsigned long bp, char *log_lvl) 228 unsigned long *sp, char *log_lvl)
239{ 229{
240 unsigned long *irq_stack_end; 230 unsigned long *irq_stack_end;
241 unsigned long *irq_stack; 231 unsigned long *irq_stack;
@@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
279 preempt_enable(); 269 preempt_enable();
280 270
281 printk(KERN_CONT "\n"); 271 printk(KERN_CONT "\n");
282 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, log_lvl);
283} 273}
284 274
285void show_registers(struct pt_regs *regs) 275void show_registers(struct pt_regs *regs)
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
308 298
309 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
310 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
311 regs->bp, KERN_EMERG); 301 KERN_EMERG);
312 302
313 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
314 304
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c2b7ef7a34d..294f26da0c0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/pfn.h> 15#include <linux/pfn.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/acpi.h>
17#include <linux/firmware-map.h> 18#include <linux/firmware-map.h>
18#include <linux/memblock.h> 19#include <linux/memblock.h>
19 20
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 59e175e89599..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,7 +395,7 @@ sysenter_past_esp:
395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
396 * pushed above; +8 corresponds to copy_thread's esp0 setting. 396 * pushed above; +8 corresponds to copy_thread's esp0 setting.
397 */ 397 */
398 pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) 398 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp)
399 CFI_REL_OFFSET eip, 0 399 CFI_REL_OFFSET eip, 0
400 400
401 pushl_cfi %eax 401 pushl_cfi %eax
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
1406 CFI_ENDPROC 1406 CFI_ENDPROC
1407END(general_protection) 1407END(general_protection)
1408 1408
1409#ifdef CONFIG_KVM_GUEST
1410ENTRY(async_page_fault)
1411 RING0_EC_FRAME
1412 pushl $do_async_page_fault
1413 CFI_ADJUST_CFA_OFFSET 4
1414 jmp error_code
1415 CFI_ENDPROC
 1416END(async_page_fault)
1417#endif
1418
1409/* 1419/*
1410 * End of kprobes section 1420 * End of kprobes section
1411 */ 1421 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fe2690d71c0c..aed1ffbeb0c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,20 +295,25 @@ ENDPROC(native_usergs_sysret64)
295 .endm 295 .endm
296 296
297/* save partial stack frame */ 297/* save partial stack frame */
298 .pushsection .kprobes.text, "ax"
298ENTRY(save_args) 299ENTRY(save_args)
299 XCPT_FRAME 300 XCPT_FRAME
300 cld 301 cld
301 movq_cfi rdi, RDI+16-ARGOFFSET 302 /*
302 movq_cfi rsi, RSI+16-ARGOFFSET 303 * start from rbp in pt_regs and jump over
303 movq_cfi rdx, RDX+16-ARGOFFSET 304 * return address.
304 movq_cfi rcx, RCX+16-ARGOFFSET 305 */
305 movq_cfi rax, RAX+16-ARGOFFSET 306 movq_cfi rdi, RDI+8-RBP
306 movq_cfi r8, R8+16-ARGOFFSET 307 movq_cfi rsi, RSI+8-RBP
307 movq_cfi r9, R9+16-ARGOFFSET 308 movq_cfi rdx, RDX+8-RBP
308 movq_cfi r10, R10+16-ARGOFFSET 309 movq_cfi rcx, RCX+8-RBP
309 movq_cfi r11, R11+16-ARGOFFSET 310 movq_cfi rax, RAX+8-RBP
310 311 movq_cfi r8, R8+8-RBP
311 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ 312 movq_cfi r9, R9+8-RBP
313 movq_cfi r10, R10+8-RBP
314 movq_cfi r11, R11+8-RBP
315
316 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
312 movq_cfi rbp, 8 /* push %rbp */ 317 movq_cfi rbp, 8 /* push %rbp */
313 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 318 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
314 testl $3, CS(%rdi) 319 testl $3, CS(%rdi)
@@ -334,6 +339,7 @@ ENTRY(save_args)
334 ret 339 ret
335 CFI_ENDPROC 340 CFI_ENDPROC
336END(save_args) 341END(save_args)
342 .popsection
337 343
338ENTRY(save_rest) 344ENTRY(save_rest)
339 PARTIAL_FRAME 1 REST_SKIP+8 345 PARTIAL_FRAME 1 REST_SKIP+8
@@ -780,8 +786,9 @@ END(interrupt)
780 786
781/* 0(%rsp): ~(interrupt number) */ 787/* 0(%rsp): ~(interrupt number) */
782 .macro interrupt func 788 .macro interrupt func
783 subq $ORIG_RAX-ARGOFFSET+8, %rsp 789 /* reserve pt_regs for scratch regs and rbp */
784 CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 790 subq $ORIG_RAX-RBP, %rsp
791 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
785 call save_args 792 call save_args
786 PARTIAL_FRAME 0 793 PARTIAL_FRAME 0
787 call \func 794 call \func
@@ -806,9 +813,14 @@ ret_from_intr:
806 TRACE_IRQS_OFF 813 TRACE_IRQS_OFF
807 decl PER_CPU_VAR(irq_count) 814 decl PER_CPU_VAR(irq_count)
808 leaveq 815 leaveq
816
809 CFI_RESTORE rbp 817 CFI_RESTORE rbp
810 CFI_DEF_CFA_REGISTER rsp 818 CFI_DEF_CFA_REGISTER rsp
811 CFI_ADJUST_CFA_OFFSET -8 819 CFI_ADJUST_CFA_OFFSET -8
820
821 /* we did not save rbx, restore only from ARGOFFSET */
822 addq $8, %rsp
823 CFI_ADJUST_CFA_OFFSET -8
812exit_intr: 824exit_intr:
813 GET_THREAD_INFO(%rcx) 825 GET_THREAD_INFO(%rcx)
814 testl $3,CS-ARGOFFSET(%rsp) 826 testl $3,CS-ARGOFFSET(%rsp)
@@ -1317,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
1317#endif 1329#endif
1318errorentry general_protection do_general_protection 1330errorentry general_protection do_general_protection
1319errorentry page_fault do_page_fault 1331errorentry page_fault do_page_fault
1332#ifdef CONFIG_KVM_GUEST
1333errorentry async_page_fault do_async_page_fault
1334#endif
1320#ifdef CONFIG_X86_MCE 1335#ifdef CONFIG_X86_MCE
1321paranoidzeroentry machine_check *machine_check_vector(%rip) 1336paranoidzeroentry machine_check *machine_check_vector(%rip)
1322#endif 1337#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3afb33f14d2d..382eb2936d4d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/module.h>
22 23
23#include <trace/syscall.h> 24#include <trace/syscall.h>
24 25
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
49int ftrace_arch_code_modify_prepare(void) 50int ftrace_arch_code_modify_prepare(void)
50{ 51{
51 set_kernel_text_rw(); 52 set_kernel_text_rw();
53 set_all_modules_text_rw();
52 modifying_code = 1; 54 modifying_code = 1;
53 return 0; 55 return 0;
54} 56}
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
56int ftrace_arch_code_modify_post_process(void) 58int ftrace_arch_code_modify_post_process(void)
57{ 59{
58 modifying_code = 0; 60 modifying_code = 0;
61 set_all_modules_text_ro();
59 set_kernel_text_ro(); 62 set_kernel_text_ro();
60 return 0; 63 return 0;
61} 64}
@@ -167,9 +170,9 @@ static void ftrace_mod_code(void)
167 170
168void ftrace_nmi_enter(void) 171void ftrace_nmi_enter(void)
169{ 172{
170 __get_cpu_var(save_modifying_code) = modifying_code; 173 __this_cpu_write(save_modifying_code, modifying_code);
171 174
172 if (!__get_cpu_var(save_modifying_code)) 175 if (!__this_cpu_read(save_modifying_code))
173 return; 176 return;
174 177
175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
@@ -183,7 +186,7 @@ void ftrace_nmi_enter(void)
183 186
184void ftrace_nmi_exit(void) 187void ftrace_nmi_exit(void)
185{ 188{
186 if (!__get_cpu_var(save_modifying_code)) 189 if (!__this_cpu_read(save_modifying_code))
187 return; 190 return;
188 191
189 /* Finish all executions before clearing nmi_running */ 192 /* Finish all executions before clearing nmi_running */
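
Several hunks in this series convert __get_cpu_var() reads and assignments to __this_cpu_read()/__this_cpu_write(). The behaviour is unchanged; the newer accessors let the compiler emit a single segment-prefixed instruction instead of first computing this CPU's address. A toy user-space model of the semantics (emphatically not the kernel API):

/* Toy per-cpu model: an array indexed by a fake CPU id stands in for per-cpu data. */
#include <stdio.h>

#define NR_CPUS 4
static int save_modifying_code[NR_CPUS];	/* stands in for the per-cpu variable */
static int this_cpu;				/* stands in for "the CPU we run on"  */

#define this_cpu_write(var, val)	((var)[this_cpu] = (val))
#define this_cpu_read(var)		((var)[this_cpu])

int main(void)
{
	this_cpu = 1;
	this_cpu_write(save_modifying_code, 1);
	printf("cpu %d sees %d\n", this_cpu, this_cpu_read(save_modifying_code));
	return 0;
}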
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index bcece91dd311..fc293dc8dc35 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,16 +60,18 @@
60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) 60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
61#endif 61#endif
62 62
63/* Number of possible pages in the lowmem region */
64LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
65
63/* Enough space to fit pagetables for the low memory linear map */ 66/* Enough space to fit pagetables for the low memory linear map */
64MAPPING_BEYOND_END = \ 67MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
65 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
66 68
67/* 69/*
68 * Worst-case size of the kernel mapping we need to make: 70 * Worst-case size of the kernel mapping we need to make:
69 * the worst-case size of the kernel itself, plus the extra we need 71 * a relocatable kernel can live anywhere in lowmem, so we need to be able
70 * to map for the linear map. 72 * to map all of lowmem.
71 */ 73 */
72KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT 74KERNEL_PAGES = LOWMEM_PAGES
73 75
74INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
75RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
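
The LOWMEM_PAGES rework sizes the early pagetable reservation for all of lowmem rather than just the kernel image. With the usual 3G/1G split and non-PAE paging the cost is small; the stand-alone calculation below assumes __PAGE_OFFSET = 0xC0000000, 4 KiB pages and 1024 PGD entries.

/* Back-of-envelope check of the new pagetable reservation (values assumed above). */
#include <stdio.h>

int main(void)
{
	const unsigned long long page_offset  = 0xC0000000ULL;	/* __PAGE_OFFSET */
	const unsigned long long page_size    = 4096;
	const unsigned long long ptrs_per_pgd = 1024;		/* non-PAE */

	unsigned long long lowmem_pages = ((1ULL << 32) - page_offset) / page_size;
	unsigned long long pt_pages     = lowmem_pages / ptrs_per_pgd;

	printf("lowmem pages: %llu\n", lowmem_pages);			  /* 262144   */
	printf("brk reserved: %llu KiB\n", pt_pages * page_size / 1024); /* 1024 KiB */
	return 0;
}

So the worst case costs about 1 MiB of brk space here (roughly 2 MiB with PAE's 512-entry tables), in exchange for a relocatable kernel being able to live anywhere in lowmem.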
@@ -124,7 +126,7 @@ ENTRY(startup_32)
124 movsl 126 movsl
125 movl pa(boot_params) + NEW_CL_POINTER,%esi 127 movl pa(boot_params) + NEW_CL_POINTER,%esi
126 andl %esi,%esi 128 andl %esi,%esi
127 jz 1f # No comand line 129 jz 1f # No command line
128 movl $pa(boot_command_line),%edi 130 movl $pa(boot_command_line),%edi
129 movl $(COMMAND_LINE_SIZE/4),%ecx 131 movl $(COMMAND_LINE_SIZE/4),%ecx
130 rep 132 rep
@@ -137,39 +139,6 @@ ENTRY(startup_32)
137 movl %eax, pa(olpc_ofw_pgd) 139 movl %eax, pa(olpc_ofw_pgd)
138#endif 140#endif
139 141
140#ifdef CONFIG_PARAVIRT
141	/* This can only trip for a broken bootloader... */
142 cmpw $0x207, pa(boot_params + BP_version)
143 jb default_entry
144
145 /* Paravirt-compatible boot parameters. Look to see what architecture
146 we're booting under. */
147 movl pa(boot_params + BP_hardware_subarch), %eax
148 cmpl $num_subarch_entries, %eax
149 jae bad_subarch
150
151 movl pa(subarch_entries)(,%eax,4), %eax
152 subl $__PAGE_OFFSET, %eax
153 jmp *%eax
154
155bad_subarch:
156WEAK(lguest_entry)
157WEAK(xen_entry)
158 /* Unknown implementation; there's really
159 nothing we can do at this point. */
160 ud2a
161
162 __INITDATA
163
164subarch_entries:
165 .long default_entry /* normal x86/PC */
166 .long lguest_entry /* lguest hypervisor */
167 .long xen_entry /* Xen hypervisor */
168 .long default_entry /* Moorestown MID */
169num_subarch_entries = (. - subarch_entries) / 4
170.previous
171#endif /* CONFIG_PARAVIRT */
172
173/* 142/*
174 * Initialize page tables. This creates a PDE and a set of page 143 * Initialize page tables. This creates a PDE and a set of page
175 * tables, which are located immediately beyond __brk_base. The variable 144 * tables, which are located immediately beyond __brk_base. The variable
@@ -179,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4
179 * 148 *
180 * Note that the stack is not yet set up! 149 * Note that the stack is not yet set up!
181 */ 150 */
182default_entry:
183#ifdef CONFIG_X86_PAE 151#ifdef CONFIG_X86_PAE
184 152
185 /* 153 /*
@@ -259,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
259 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax 227 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
260 movl %eax,pa(initial_page_table+0xffc) 228 movl %eax,pa(initial_page_table+0xffc)
261#endif 229#endif
262 jmp 3f 230
231#ifdef CONFIG_PARAVIRT
 232	/* This can only trip for a broken bootloader... */
233 cmpw $0x207, pa(boot_params + BP_version)
234 jb default_entry
235
236 /* Paravirt-compatible boot parameters. Look to see what architecture
237 we're booting under. */
238 movl pa(boot_params + BP_hardware_subarch), %eax
239 cmpl $num_subarch_entries, %eax
240 jae bad_subarch
241
242 movl pa(subarch_entries)(,%eax,4), %eax
243 subl $__PAGE_OFFSET, %eax
244 jmp *%eax
245
246bad_subarch:
247WEAK(lguest_entry)
248WEAK(xen_entry)
249 /* Unknown implementation; there's really
250 nothing we can do at this point. */
251 ud2a
252
253 __INITDATA
254
255subarch_entries:
256 .long default_entry /* normal x86/PC */
257 .long lguest_entry /* lguest hypervisor */
258 .long xen_entry /* Xen hypervisor */
259 .long default_entry /* Moorestown MID */
260num_subarch_entries = (. - subarch_entries) / 4
261.previous
262#else
263 jmp default_entry
264#endif /* CONFIG_PARAVIRT */
265
263/* 266/*
264 * Non-boot CPU entry point; entered from trampoline.S 267 * Non-boot CPU entry point; entered from trampoline.S
265 * We can't lgdt here, because lgdt itself uses a data segment, but 268 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -280,7 +283,7 @@ ENTRY(startup_32_smp)
280 movl %eax,%fs 283 movl %eax,%fs
281 movl %eax,%gs 284 movl %eax,%gs
282#endif /* CONFIG_SMP */ 285#endif /* CONFIG_SMP */
2833: 286default_entry:
284 287
285/* 288/*
286 * New page tables may be in 4Mbyte page mode and may 289 * New page tables may be in 4Mbyte page mode and may
@@ -314,6 +317,10 @@ ENTRY(startup_32_smp)
314 subl $0x80000001, %eax 317 subl $0x80000001, %eax
315 cmpl $(0x8000ffff-0x80000001), %eax 318 cmpl $(0x8000ffff-0x80000001), %eax
316 ja 6f 319 ja 6f
320
321 /* Clear bogus XD_DISABLE bits */
322 call verify_cpu
323
317 mov $0x80000001, %eax 324 mov $0x80000001, %eax
318 cpuid 325 cpuid
319 /* Execute Disable bit supported? */ 326 /* Execute Disable bit supported? */
@@ -609,6 +616,8 @@ ignore_int:
609#endif 616#endif
610 iret 617 iret
611 618
619#include "verify_cpu.S"
620
612 __REFDATA 621 __REFDATA
613.align 4 622.align 4
614ENTRY(initial_code) 623ENTRY(initial_code)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ae03cab4352e..4ff5968f12d2 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -27,6 +27,9 @@
27#define HPET_DEV_FSB_CAP 0x1000 27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000 28#define HPET_DEV_PERI_CAP 0x2000
29 29
30#define HPET_MIN_CYCLES 128
31#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
32
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) 33#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
31 34
32/* 35/*
@@ -299,8 +302,9 @@ static void hpet_legacy_clockevent_register(void)
299 /* Calculate the min / max delta */ 302 /* Calculate the min / max delta */
300 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 303 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
301 &hpet_clockevent); 304 &hpet_clockevent);
302 /* 5 usec minimum reprogramming delta. */ 305 /* Setup minimum reprogramming delta. */
303 hpet_clockevent.min_delta_ns = 5000; 306 hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
307 &hpet_clockevent);
304 308
305 /* 309 /*
306 * Start hpet with the boot cpu mask and make it 310 * Start hpet with the boot cpu mask and make it
@@ -393,22 +397,24 @@ static int hpet_next_event(unsigned long delta,
393 * the wraparound into account) nor a simple count down event 397 * the wraparound into account) nor a simple count down event
394 * mode. Further the write to the comparator register is 398 * mode. Further the write to the comparator register is
395 * delayed internally up to two HPET clock cycles in certain 399 * delayed internally up to two HPET clock cycles in certain
396 * chipsets (ATI, ICH9,10). We worked around that by reading 400 * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
397 * back the compare register, but that required another 401 * longer delays. We worked around that by reading back the
398 * workaround for ICH9,10 chips where the first readout after 402 * compare register, but that required another workaround for
399 * write can return the old stale value. We already have a 403 * ICH9,10 chips where the first readout after write can
400 * minimum delta of 5us enforced, but a NMI or SMI hitting 404 * return the old stale value. We already had a minimum
405 * programming delta of 5us enforced, but a NMI or SMI hitting
401 * between the counter readout and the comparator write can 406 * between the counter readout and the comparator write can
402 * move us behind that point easily. Now instead of reading 407 * move us behind that point easily. Now instead of reading
403 * the compare register back several times, we make the ETIME 408 * the compare register back several times, we make the ETIME
404 * decision based on the following: Return ETIME if the 409 * decision based on the following: Return ETIME if the
405 * counter value after the write is less than 8 HPET cycles 410 * counter value after the write is less than HPET_MIN_CYCLES
406 * away from the event or if the counter is already ahead of 411 * away from the event or if the counter is already ahead of
407 * the event. 412 * the event. The minimum programming delta for the generic
413 * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
408 */ 414 */
409 res = (s32)(cnt - hpet_readl(HPET_COUNTER)); 415 res = (s32)(cnt - hpet_readl(HPET_COUNTER));
410 416
411 return res < 8 ? -ETIME : 0; 417 return res < HPET_MIN_CYCLES ? -ETIME : 0;
412} 418}
413 419
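
Worked numbers for the new HPET minimum delta: HPET_MIN_CYCLES is 128, so HPET_MIN_PROG_DELTA is 128 + 64 = 192 HPET cycles. At the common 14.318 MHz HPET rate (assumed below) that is roughly 13.4 us, replacing the old hard-coded 5 us that proved too tight for the slow chipsets described above.

/* Back-of-envelope check of the new minimum programming delta (14.318 MHz assumed). */
#include <stdio.h>

int main(void)
{
	const int min_cycles     = 128;				    /* HPET_MIN_CYCLES */
	const int min_prog_delta = min_cycles + (min_cycles >> 1); /* 192 cycles      */
	const double hpet_hz     = 14318180.0;			    /* typical LPC HPET */

	printf("min programming delta: %d cycles = %.1f us\n",
	       min_prog_delta, min_prog_delta / hpet_hz * 1e6);
	return 0;
}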
414static void hpet_legacy_set_mode(enum clock_event_mode mode, 420static void hpet_legacy_set_mode(enum clock_event_mode mode,
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff15c9dcc25d..02f07634d265 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
122 return -EBUSY; 122 return -EBUSY;
123 123
124 set_debugreg(info->address, i); 124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address; 125 __this_cpu_write(cpu_debugreg[i], info->address);
126 126
127 dr7 = &__get_cpu_var(cpu_dr7); 127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type); 128 *dr7 |= encode_dr7(i, info->len, info->type);
@@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
397 397
398void hw_breakpoint_restore(void) 398void hw_breakpoint_restore(void)
399{ 399{
400 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); 400 set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
401 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); 401 set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
402 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); 402 set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
403 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); 403 set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
404 set_debugreg(current->thread.debugreg6, 6); 404 set_debugreg(current->thread.debugreg6, 6);
405 set_debugreg(__get_cpu_var(cpu_dr7), 7); 405 set_debugreg(__this_cpu_read(cpu_dr7), 7);
406} 406}
407EXPORT_SYMBOL_GPL(hw_breakpoint_restore); 407EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
408 408
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
433 dr6_p = (unsigned long *)ERR_PTR(args->err); 433 dr6_p = (unsigned long *)ERR_PTR(args->err);
434 dr6 = *dr6_p; 434 dr6 = *dr6_p;
435 435
436 /* If it's a single step, TRAP bits are random */
437 if (dr6 & DR_STEP)
438 return NOTIFY_DONE;
439
436 /* Do an early return if no trap bits are set in DR6 */ 440 /* Do an early return if no trap bits are set in DR6 */
437 if ((dr6 & DR_TRAP_BITS) == 0) 441 if ((dr6 & DR_TRAP_BITS) == 0)
438 return NOTIFY_DONE; 442 return NOTIFY_DONE;
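
The new DR_STEP check filters out single-step traps before the breakpoint bits are examined, because DR6's B0-B3 are not meaningful for a step trap. A stand-alone sketch of the same filtering, with the DR6 bit values written out (bit layout per the architecture manuals):

/* Sketch of the DR6 filtering above; the sample dr6 values are made up. */
#include <stdio.h>

#define DR_TRAP_BITS	0x0f		/* B0-B3: which breakpoint fired */
#define DR_STEP		0x4000		/* BS: single-step trap          */

static int is_hw_breakpoint_exception(unsigned long dr6)
{
	if (dr6 & DR_STEP)		/* single step: B0-B3 may be stale */
		return 0;
	return (dr6 & DR_TRAP_BITS) != 0;
}

int main(void)
{
	printf("%d\n", is_hw_breakpoint_exception(0x4001));	/* step + stale B0 -> 0 */
	printf("%d\n", is_hw_breakpoint_exception(0x0002));	/* breakpoint 1 hit -> 1 */
	return 0;
}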
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239a2fd7..e60c38cc0eed 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
169 set_stopped_child_used_math(tsk); 169 set_stopped_child_used_math(tsk);
170 return 0; 170 return 0;
171} 171}
172EXPORT_SYMBOL_GPL(init_fpu);
172 173
173/* 174/*
174 * The xstateregs_active() routine is the same as the fpregs_active() routine, 175 * The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 83ec0175f986..52945da52a94 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/of.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
8#include <linux/smp.h> 9#include <linux/smp.h>
9#include <linux/ftrace.h> 10#include <linux/ftrace.h>
@@ -234,7 +235,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
234 exit_idle(); 235 exit_idle();
235 irq_enter(); 236 irq_enter();
236 237
237 irq = __get_cpu_var(vector_irq)[vector]; 238 irq = __this_cpu_read(vector_irq[vector]);
238 239
239 if (!handle_irq(irq, regs)) { 240 if (!handle_irq(irq, regs)) {
240 ack_APIC_irq(); 241 ack_APIC_irq();
@@ -275,6 +276,15 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
275 276
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 277EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277 278
279#ifdef CONFIG_OF
280unsigned int irq_create_of_mapping(struct device_node *controller,
281 const u32 *intspec, unsigned int intsize)
282{
283 return intspec[0];
284}
285EXPORT_SYMBOL_GPL(irq_create_of_mapping);
286#endif
287
278#ifdef CONFIG_HOTPLUG_CPU 288#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 289/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void) 290void fixup_irqs(void)
@@ -350,12 +360,12 @@ void fixup_irqs(void)
350 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 360 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
351 unsigned int irr; 361 unsigned int irr;
352 362
353 if (__get_cpu_var(vector_irq)[vector] < 0) 363 if (__this_cpu_read(vector_irq[vector]) < 0)
354 continue; 364 continue;
355 365
356 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 366 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
357 if (irr & (1 << (vector % 32))) { 367 if (irr & (1 << (vector % 32))) {
358 irq = __get_cpu_var(vector_irq)[vector]; 368 irq = __this_cpu_read(vector_irq[vector]);
359 369
360 data = irq_get_irq_data(irq); 370 data = irq_get_irq_data(irq);
361 raw_spin_lock(&desc->lock); 371 raw_spin_lock(&desc->lock);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 96656f207751..9974d21048fd 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -79,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
79 u32 *isp, arg1, arg2; 79 u32 *isp, arg1, arg2;
80 80
81 curctx = (union irq_ctx *) current_thread_info(); 81 curctx = (union irq_ctx *) current_thread_info();
82 irqctx = __get_cpu_var(hardirq_ctx); 82 irqctx = __this_cpu_read(hardirq_ctx);
83 83
84 /* 84 /*
85 * this is where we switch to the IRQ stack. However, if we are 85 * this is where we switch to the IRQ stack. However, if we are
@@ -129,8 +129,7 @@ void __cpuinit irq_ctx_init(int cpu)
129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
130 THREAD_FLAGS, 130 THREAD_FLAGS,
131 THREAD_ORDER)); 131 THREAD_ORDER));
132 irqctx->tinfo.task = NULL; 132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -140,10 +139,8 @@ void __cpuinit irq_ctx_init(int cpu)
140 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
141 THREAD_FLAGS, 140 THREAD_FLAGS,
142 THREAD_ORDER)); 141 THREAD_ORDER));
143 irqctx->tinfo.task = NULL; 142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
144 irqctx->tinfo.exec_domain = NULL;
145 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
146 irqctx->tinfo.preempt_count = 0;
147 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
148 145
149 per_cpu(softirq_ctx, cpu) = irqctx; 146 per_cpu(softirq_ctx, cpu) = irqctx;
@@ -166,7 +163,7 @@ asmlinkage void do_softirq(void)
166 163
167 if (local_softirq_pending()) { 164 if (local_softirq_pending()) {
168 curctx = current_thread_info(); 165 curctx = current_thread_info();
169 irqctx = __get_cpu_var(softirq_ctx); 166 irqctx = __this_cpu_read(softirq_ctx);
170 irqctx->tinfo.task = curctx->task; 167 irqctx->tinfo.task = curctx->task;
171 irqctx->tinfo.previous_esp = current_stack_pointer; 168 irqctx->tinfo.previous_esp = current_stack_pointer;
172 169
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ec592caac4b4..a4130005028a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/nmi.h>
51 52
52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = 53struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53{ 54{
@@ -315,14 +316,18 @@ static void kgdb_remove_all_hw_break(void)
315 if (!breakinfo[i].enabled) 316 if (!breakinfo[i].enabled)
316 continue; 317 continue;
317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 318 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
318 if (bp->attr.disabled == 1) 319 if (!bp->attr.disabled) {
320 arch_uninstall_hw_breakpoint(bp);
321 bp->attr.disabled = 1;
319 continue; 322 continue;
323 }
320 if (dbg_is_early) 324 if (dbg_is_early)
321 early_dr7 &= ~encode_dr7(i, breakinfo[i].len, 325 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
322 breakinfo[i].type); 326 breakinfo[i].type);
323 else 327 else if (hw_break_release_slot(i))
324 arch_uninstall_hw_breakpoint(bp); 328 printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
325 bp->attr.disabled = 1; 329 breakinfo[i].addr);
330 breakinfo[i].enabled = 0;
326 } 331 }
327} 332}
328 333
@@ -521,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
521 } 526 }
522 return NOTIFY_DONE; 527 return NOTIFY_DONE;
523 528
524 case DIE_NMI_IPI:
525 /* Just ignore, we will handle the roundup on DIE_NMI. */
526 return NOTIFY_DONE;
527
528 case DIE_NMIUNKNOWN: 529 case DIE_NMIUNKNOWN:
529 if (was_in_debug_nmi[raw_smp_processor_id()]) { 530 if (was_in_debug_nmi[raw_smp_processor_id()]) {
530 was_in_debug_nmi[raw_smp_processor_id()] = 0; 531 was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -602,7 +603,7 @@ static struct notifier_block kgdb_notifier = {
602 /* 603 /*
603 * Lowest-prio notifier priority, we want to be notified last: 604 * Lowest-prio notifier priority, we want to be notified last:
604 */ 605 */
605 .priority = -INT_MAX, 606 .priority = NMI_LOCAL_LOW_PRIOR,
606}; 607};
607 608
608/** 609/**
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1cbd54c0df99..d91c477b3f62 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -403,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
403 403
404static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) 404static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
405{ 405{
406 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; 406 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
407 kcb->kprobe_status = kcb->prev_kprobe.status; 407 kcb->kprobe_status = kcb->prev_kprobe.status;
408 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; 408 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
409 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; 409 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
@@ -412,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
412static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, 412static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
413 struct kprobe_ctlblk *kcb) 413 struct kprobe_ctlblk *kcb)
414{ 414{
415 __get_cpu_var(current_kprobe) = p; 415 __this_cpu_write(current_kprobe, p);
416 kcb->kprobe_saved_flags = kcb->kprobe_old_flags 416 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
417 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); 417 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
418 if (is_IF_modifier(p->ainsn.insn)) 418 if (is_IF_modifier(p->ainsn.insn))
@@ -586,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
586 preempt_enable_no_resched(); 586 preempt_enable_no_resched();
587 return 1; 587 return 1;
588 } else if (kprobe_running()) { 588 } else if (kprobe_running()) {
589 p = __get_cpu_var(current_kprobe); 589 p = __this_cpu_read(current_kprobe);
590 if (p->break_handler && p->break_handler(p, regs)) { 590 if (p->break_handler && p->break_handler(p, regs)) {
591 setup_singlestep(p, regs, kcb, 0); 591 setup_singlestep(p, regs, kcb, 0);
592 return 1; 592 return 1;
@@ -759,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
759 759
760 orig_ret_address = (unsigned long)ri->ret_addr; 760 orig_ret_address = (unsigned long)ri->ret_addr;
761 if (ri->rp && ri->rp->handler) { 761 if (ri->rp && ri->rp->handler) {
762 __get_cpu_var(current_kprobe) = &ri->rp->kp; 762 __this_cpu_write(current_kprobe, &ri->rp->kp);
763 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 763 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
764 ri->ret_addr = correct_ret_addr; 764 ri->ret_addr = correct_ret_addr;
765 ri->rp->handler(ri, regs); 765 ri->rp->handler(ri, regs);
766 __get_cpu_var(current_kprobe) = NULL; 766 __this_cpu_write(current_kprobe, NULL);
767 } 767 }
768 768
769 recycle_rp_inst(ri, &empty_rp); 769 recycle_rp_inst(ri, &empty_rp);
@@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1184{ 1184{
1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1186 1186
1187 /* This is possible if op is under delayed unoptimizing */
1188 if (kprobe_disabled(&op->kp))
1189 return;
1190
1187 preempt_disable(); 1191 preempt_disable();
1188 if (kprobe_running()) { 1192 if (kprobe_running()) {
1189 kprobes_inc_nmissed_count(&op->kp); 1193 kprobes_inc_nmissed_count(&op->kp);
@@ -1198,10 +1202,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1198 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; 1202 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1199 regs->orig_ax = ~0UL; 1203 regs->orig_ax = ~0UL;
1200 1204
1201 __get_cpu_var(current_kprobe) = &op->kp; 1205 __this_cpu_write(current_kprobe, &op->kp);
1202 kcb->kprobe_status = KPROBE_HIT_ACTIVE; 1206 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1203 opt_pre_handler(&op->kp, regs); 1207 opt_pre_handler(&op->kp, regs);
1204 __get_cpu_var(current_kprobe) = NULL; 1208 __this_cpu_write(current_kprobe, NULL);
1205 } 1209 }
1206 preempt_enable_no_resched(); 1210 preempt_enable_no_resched();
1207} 1211}
@@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1401 return 0; 1405 return 0;
1402} 1406}
1403 1407
1404/* Replace a breakpoint (int3) with a relative jump. */ 1408#define MAX_OPTIMIZE_PROBES 256
1405int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) 1409static struct text_poke_param *jump_poke_params;
1410static struct jump_poke_buffer {
1411 u8 buf[RELATIVEJUMP_SIZE];
1412} *jump_poke_bufs;
1413
1414static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1415 u8 *insn_buf,
1416 struct optimized_kprobe *op)
1406{ 1417{
1407 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1408 s32 rel = (s32)((long)op->optinsn.insn - 1418 s32 rel = (s32)((long)op->optinsn.insn -
1409 ((long)op->kp.addr + RELATIVEJUMP_SIZE)); 1419 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1410 1420
@@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1412 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, 1422 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1413 RELATIVE_ADDR_SIZE); 1423 RELATIVE_ADDR_SIZE);
1414 1424
1415 jmp_code[0] = RELATIVEJUMP_OPCODE; 1425 insn_buf[0] = RELATIVEJUMP_OPCODE;
1416 *(s32 *)(&jmp_code[1]) = rel; 1426 *(s32 *)(&insn_buf[1]) = rel;
1427
1428 tprm->addr = op->kp.addr;
1429 tprm->opcode = insn_buf;
1430 tprm->len = RELATIVEJUMP_SIZE;
1431}
1432
1433/*
1434 * Replace breakpoints (int3) with relative jumps.
1435 * Caller must call with locking kprobe_mutex and text_mutex.
1436 */
1437void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1438{
1439 struct optimized_kprobe *op, *tmp;
1440 int c = 0;
1441
1442 list_for_each_entry_safe(op, tmp, oplist, list) {
1443 WARN_ON(kprobe_disabled(&op->kp));
1444 /* Setup param */
1445 setup_optimize_kprobe(&jump_poke_params[c],
1446 jump_poke_bufs[c].buf, op);
1447 list_del_init(&op->list);
1448 if (++c >= MAX_OPTIMIZE_PROBES)
1449 break;
1450 }
1417 1451
1418 /* 1452 /*
1419 * text_poke_smp doesn't support NMI/MCE code modifying. 1453 * text_poke_smp doesn't support NMI/MCE code modifying.
1420 * However, since kprobes itself also doesn't support NMI/MCE 1454 * However, since kprobes itself also doesn't support NMI/MCE
1421 * code probing, it's not a problem. 1455 * code probing, it's not a problem.
1422 */ 1456 */
1423 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); 1457 text_poke_smp_batch(jump_poke_params, c);
1424 return 0; 1458}
1459
1460static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1461 u8 *insn_buf,
1462 struct optimized_kprobe *op)
1463{
1464 /* Set int3 to first byte for kprobes */
1465 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1466 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1467
1468 tprm->addr = op->kp.addr;
1469 tprm->opcode = insn_buf;
1470 tprm->len = RELATIVEJUMP_SIZE;
1471}
1472
1473/*
1474 * Recover original instructions and breakpoints from relative jumps.
 1475 * Caller must hold kprobe_mutex.
1476 */
1477extern void arch_unoptimize_kprobes(struct list_head *oplist,
1478 struct list_head *done_list)
1479{
1480 struct optimized_kprobe *op, *tmp;
1481 int c = 0;
1482
1483 list_for_each_entry_safe(op, tmp, oplist, list) {
1484 /* Setup param */
1485 setup_unoptimize_kprobe(&jump_poke_params[c],
1486 jump_poke_bufs[c].buf, op);
1487 list_move(&op->list, done_list);
1488 if (++c >= MAX_OPTIMIZE_PROBES)
1489 break;
1490 }
1491
1492 /*
1493 * text_poke_smp doesn't support NMI/MCE code modifying.
1494 * However, since kprobes itself also doesn't support NMI/MCE
1495 * code probing, it's not a problem.
1496 */
1497 text_poke_smp_batch(jump_poke_params, c);
1425} 1498}
1426 1499
1427/* Replace a relative jump with a breakpoint (int3). */ 1500/* Replace a relative jump with a breakpoint (int3). */
@@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p,
1453 } 1526 }
1454 return 0; 1527 return 0;
1455} 1528}
1529
1530static int __kprobes init_poke_params(void)
1531{
1532 /* Allocate code buffer and parameter array */
1533 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1534 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1535 if (!jump_poke_bufs)
1536 return -ENOMEM;
1537
1538 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1539 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1540 if (!jump_poke_params) {
1541 kfree(jump_poke_bufs);
1542 jump_poke_bufs = NULL;
1543 return -ENOMEM;
1544 }
1545
1546 return 0;
1547}
1548#else /* !CONFIG_OPTPROBES */
1549static int __kprobes init_poke_params(void)
1550{
1551 return 0;
1552}
1456#endif 1553#endif
1457 1554
1458int __init arch_init_kprobes(void) 1555int __init arch_init_kprobes(void)
1459{ 1556{
1460 return 0; 1557 return init_poke_params();
1461} 1558}
1462 1559
1463int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1560int __kprobes arch_trampoline_kprobe(struct kprobe *p)
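
The hunks above replace one text_poke_smp() call (and therefore one stop_machine() round) per probe with a single text_poke_smp_batch() over up to MAX_OPTIMIZE_PROBES parameter slots. The displacement written into each slot is an ordinary rel32 jump target; a standalone sketch of that math, with RELATIVEJUMP_SIZE assumed to be the usual 5 bytes (0xe9 opcode plus a 4-byte displacement):

    #include <stdint.h>

    /* Illustration only: the 32-bit displacement of "jmp rel32" is taken
     * relative to the address of the instruction *after* the jump. */
    static int32_t rel32_to(uintptr_t from, uintptr_t to)
    {
            return (int32_t)(to - (from + 5));  /* 5 == RELATIVEJUMP_SIZE */
    }

setup_optimize_kprobe() computes exactly this value with from = op->kp.addr and to = op->optinsn.insn.
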
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..8dc44662394b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/notifier.h>
31#include <linux/reboot.h>
32#include <linux/hash.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/kprobes.h>
30#include <asm/timer.h> 36#include <asm/timer.h>
37#include <asm/cpu.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
31 41
32#define MMU_QUEUE_SIZE 1024 42#define MMU_QUEUE_SIZE 1024
33 43
44static int kvmapf = 1;
45
46static int parse_no_kvmapf(char *arg)
47{
48 kvmapf = 0;
49 return 0;
50}
51
52early_param("no-kvmapf", parse_no_kvmapf);
53
34struct kvm_para_state { 54struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 55 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 56 int mmu_queue_len;
37}; 57};
38 58
39static DEFINE_PER_CPU(struct kvm_para_state, para_state); 59static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
40 61
41static struct kvm_para_state *kvm_para_state(void) 62static struct kvm_para_state *kvm_para_state(void)
42{ 63{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
50{ 71{
51} 72}
52 73
74#define KVM_TASK_SLEEP_HASHBITS 8
75#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
76
77struct kvm_task_sleep_node {
78 struct hlist_node link;
79 wait_queue_head_t wq;
80 u32 token;
81 int cpu;
82 bool halted;
83 struct mm_struct *mm;
84};
85
86static struct kvm_task_sleep_head {
87 spinlock_t lock;
88 struct hlist_head list;
89} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
90
91static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
92 u32 token)
93{
94 struct hlist_node *p;
95
96 hlist_for_each(p, &b->list) {
97 struct kvm_task_sleep_node *n =
98 hlist_entry(p, typeof(*n), link);
99 if (n->token == token)
100 return n;
101 }
102
103 return NULL;
104}
105
106void kvm_async_pf_task_wait(u32 token)
107{
108 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
109 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
110 struct kvm_task_sleep_node n, *e;
111 DEFINE_WAIT(wait);
112 int cpu, idle;
113
114 cpu = get_cpu();
115 idle = idle_cpu(cpu);
116 put_cpu();
117
118 spin_lock(&b->lock);
119 e = _find_apf_task(b, token);
120 if (e) {
 121 /* dummy entry exists -> wake-up was delivered ahead of the PF */
122 hlist_del(&e->link);
123 kfree(e);
124 spin_unlock(&b->lock);
125 return;
126 }
127
128 n.token = token;
129 n.cpu = smp_processor_id();
130 n.mm = current->active_mm;
131 n.halted = idle || preempt_count() > 1;
132 atomic_inc(&n.mm->mm_count);
133 init_waitqueue_head(&n.wq);
134 hlist_add_head(&n.link, &b->list);
135 spin_unlock(&b->lock);
136
137 for (;;) {
138 if (!n.halted)
139 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
140 if (hlist_unhashed(&n.link))
141 break;
142
143 if (!n.halted) {
144 local_irq_enable();
145 schedule();
146 local_irq_disable();
147 } else {
148 /*
149 * We cannot reschedule. So halt.
150 */
151 native_safe_halt();
152 local_irq_disable();
153 }
154 }
155 if (!n.halted)
156 finish_wait(&n.wq, &wait);
157
158 return;
159}
160EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161
162static void apf_task_wake_one(struct kvm_task_sleep_node *n)
163{
164 hlist_del_init(&n->link);
165 if (!n->mm)
166 return;
167 mmdrop(n->mm);
168 if (n->halted)
169 smp_send_reschedule(n->cpu);
170 else if (waitqueue_active(&n->wq))
171 wake_up(&n->wq);
172}
173
174static void apf_task_wake_all(void)
175{
176 int i;
177
178 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179 struct hlist_node *p, *next;
180 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
181 spin_lock(&b->lock);
182 hlist_for_each_safe(p, next, &b->list) {
183 struct kvm_task_sleep_node *n =
184 hlist_entry(p, typeof(*n), link);
185 if (n->cpu == smp_processor_id())
186 apf_task_wake_one(n);
187 }
188 spin_unlock(&b->lock);
189 }
190}
191
192void kvm_async_pf_task_wake(u32 token)
193{
194 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
195 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
196 struct kvm_task_sleep_node *n;
197
198 if (token == ~0) {
199 apf_task_wake_all();
200 return;
201 }
202
203again:
204 spin_lock(&b->lock);
205 n = _find_apf_task(b, token);
206 if (!n) {
207 /*
208 * async PF was not yet handled.
209 * Add dummy entry for the token.
210 */
211 n = kmalloc(sizeof(*n), GFP_ATOMIC);
212 if (!n) {
213 /*
214 * Allocation failed! Busy wait while other cpu
215 * handles async PF.
216 */
217 spin_unlock(&b->lock);
218 cpu_relax();
219 goto again;
220 }
221 n->token = token;
222 n->cpu = smp_processor_id();
223 n->mm = NULL;
224 init_waitqueue_head(&n->wq);
225 hlist_add_head(&n->link, &b->list);
226 } else
227 apf_task_wake_one(n);
228 spin_unlock(&b->lock);
229 return;
230}
231EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
232
233u32 kvm_read_and_reset_pf_reason(void)
234{
235 u32 reason = 0;
236
237 if (__get_cpu_var(apf_reason).enabled) {
238 reason = __get_cpu_var(apf_reason).reason;
239 __get_cpu_var(apf_reason).reason = 0;
240 }
241
242 return reason;
243}
244EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
245
246dotraplinkage void __kprobes
247do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
248{
249 switch (kvm_read_and_reset_pf_reason()) {
250 default:
251 do_page_fault(regs, error_code);
252 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */
255 kvm_async_pf_task_wait((u32)read_cr2());
256 break;
257 case KVM_PV_REASON_PAGE_READY:
258 kvm_async_pf_task_wake((u32)read_cr2());
259 break;
260 }
261}
262
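
Taken together, kvm_async_pf_task_wait() and kvm_async_pf_task_wake() key sleepers into a 256-bucket table via hash_32(token, 8); if the host's "page ready" wake-up races ahead of the faulting task, the wake side leaves a dummy node behind so the later waiter can return immediately. A standalone sketch of the bucket selection (the multiplicative constant here is an arbitrary stand-in; the kernel's hash_32() uses its own golden-ratio-derived constant):

    #include <stdint.h>

    #define SLEEP_HASHBITS 8
    #define SLEEP_HASHSIZE (1u << SLEEP_HASHBITS)

    static unsigned int bucket_for(uint32_t token)
    {
            /* multiplicative hash, keep the top SLEEP_HASHBITS bits */
            return (uint32_t)(token * 2654435769u) >> (32 - SLEEP_HASHBITS);
    }

Each bucket carries its own spinlock, so concurrent faults on unrelated tokens do not contend.
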
53static void kvm_mmu_op(void *buffer, unsigned len) 263static void kvm_mmu_op(void *buffer, unsigned len)
54{ 264{
55 int r; 265 int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
231#endif 441#endif
232} 442}
233 443
444void __cpuinit kvm_guest_cpu_init(void)
445{
446 if (!kvm_para_available())
447 return;
448
449 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
450 u64 pa = __pa(&__get_cpu_var(apf_reason));
451
452#ifdef CONFIG_PREEMPT
453 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
454#endif
455 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
456 __get_cpu_var(apf_reason).enabled = 1;
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id());
459 }
460}
461
462static void kvm_pv_disable_apf(void *unused)
463{
464 if (!__get_cpu_var(apf_reason).enabled)
465 return;
466
467 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
468 __get_cpu_var(apf_reason).enabled = 0;
469
470 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
471 smp_processor_id());
472}
473
474static int kvm_pv_reboot_notify(struct notifier_block *nb,
475 unsigned long code, void *unused)
476{
477 if (code == SYS_RESTART)
478 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify,
484};
485
486#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void)
488{
489#ifdef CONFIG_KVM_CLOCK
490 WARN_ON(kvm_register_clock("primary cpu clock"));
491#endif
492 kvm_guest_cpu_init();
493 native_smp_prepare_boot_cpu();
494}
495
496static void kvm_guest_cpu_online(void *dummy)
497{
498 kvm_guest_cpu_init();
499}
500
501static void kvm_guest_cpu_offline(void *dummy)
502{
503 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all();
505}
506
507static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
508 unsigned long action, void *hcpu)
509{
510 int cpu = (unsigned long)hcpu;
511 switch (action) {
512 case CPU_ONLINE:
513 case CPU_DOWN_FAILED:
514 case CPU_ONLINE_FROZEN:
515 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
516 break;
517 case CPU_DOWN_PREPARE:
518 case CPU_DOWN_PREPARE_FROZEN:
519 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
520 break;
521 default:
522 break;
523 }
524 return NOTIFY_OK;
525}
526
527static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
528 .notifier_call = kvm_cpu_notify,
529};
530#endif
531
532static void __init kvm_apf_trap_init(void)
533{
534 set_intr_gate(14, &async_page_fault);
535}
536
234void __init kvm_guest_init(void) 537void __init kvm_guest_init(void)
235{ 538{
539 int i;
540
236 if (!kvm_para_available()) 541 if (!kvm_para_available())
237 return; 542 return;
238 543
239 paravirt_ops_setup(); 544 paravirt_ops_setup();
545 register_reboot_notifier(&kvm_pv_reboot_nb);
546 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
547 spin_lock_init(&async_pf_sleepers[i].lock);
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init;
550
551#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier);
554#else
555 kvm_guest_cpu_init();
556#endif
240} 557}
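
kvm_para_has_feature() ultimately reads the KVM feature leaf with CPUID; a minimal sketch of the check that gates the async-PF setup above, assuming the conventional KVM CPUID layout (the leaf number and the feature-bit name come from asm/kvm_para.h, so treat this as illustrative):

    /* KVM publishes paravirt feature bits in EAX of CPUID leaf 0x40000001 */
    #define KVM_CPUID_FEATURES 0x40000001

    static bool guest_has_async_pf(void)
    {
            unsigned int eax, ebx, ecx, edx;

            cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
            return eax & (1u << KVM_FEATURE_ASYNC_PF);
    }

If the bit is clear (or "no-kvmapf" was passed), kvm_guest_cpu_init() never writes MSR_KVM_ASYNC_PF_EN and the guest keeps using the ordinary page-fault path.
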
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce31a19c..f98d3eafe07a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 125 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 126};
127 127
128static int kvm_register_clock(char *txt) 128int kvm_register_clock(char *txt)
129{ 129{
130 int cpu = smp_processor_id(); 130 int cpu = smp_processor_id();
131 int low, high, ret; 131 int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
152} 152}
153#endif 153#endif
154 154
155#ifdef CONFIG_SMP
156static void __init kvm_smp_prepare_boot_cpu(void)
157{
158 WARN_ON(kvm_register_clock("primary cpu clock"));
159 native_smp_prepare_boot_cpu();
160}
161#endif
162
163/* 155/*
164 * After the clock is registered, the host will keep writing to the 156 * After the clock is registered, the host will keep writing to the
165 * registered memory location. If the guest happens to shutdown, this memory 157 * registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
206 x86_cpuinit.setup_percpu_clockev = 198 x86_cpuinit.setup_percpu_clockev =
207 kvm_setup_secondary_clock; 199 kvm_setup_secondary_clock;
208#endif 200#endif
209#ifdef CONFIG_SMP
210 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
211#endif
212 machine_ops.shutdown = kvm_shutdown; 201 machine_ops.shutdown = kvm_shutdown;
213#ifdef CONFIG_KEXEC 202#ifdef CONFIG_KEXEC
214 machine_ops.crash_shutdown = kvm_crash_shutdown; 203 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index e1af7c055c7d..0fe6d1a66c38 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu)
155 return 0; 155 return 0;
156} 156}
157 157
158static int get_ucode_data(void *to, const u8 *from, size_t n)
159{
160 memcpy(to, from, n);
161 return 0;
162}
163
164static void * 158static void *
165get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) 159get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
166{ 160{
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
168 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 162 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
169 void *mc; 163 void *mc;
170 164
171 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) 165 get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
172 return NULL;
173 166
174 if (section_hdr[0] != UCODE_UCODE_TYPE) { 167 if (section_hdr[0] != UCODE_UCODE_TYPE) {
175 pr_err("error: invalid type field in container file section header\n"); 168 pr_err("error: invalid type field in container file section header\n");
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 mc = vmalloc(UCODE_MAX_SIZE); 179 mc = vzalloc(UCODE_MAX_SIZE);
187 if (mc) { 180 if (!mc)
188 memset(mc, 0, UCODE_MAX_SIZE); 181 return NULL;
189 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, 182
190 total_size)) { 183 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
191 vfree(mc); 184 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
192 mc = NULL; 185
193 } else
194 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
195 }
196 return mc; 186 return mc;
197} 187}
198 188
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf)
202 unsigned int *buf_pos = (unsigned int *)container_hdr; 192 unsigned int *buf_pos = (unsigned int *)container_hdr;
203 unsigned long size; 193 unsigned long size;
204 194
205 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) 195 get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
206 return 0;
207 196
208 size = buf_pos[2]; 197 size = buf_pos[2];
209 198
@@ -212,17 +201,14 @@ static int install_equiv_cpu_table(const u8 *buf)
212 return 0; 201 return 0;
213 } 202 }
214 203
215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 204 equiv_cpu_table = vmalloc(size);
216 if (!equiv_cpu_table) { 205 if (!equiv_cpu_table) {
217 pr_err("failed to allocate equivalent CPU table\n"); 206 pr_err("failed to allocate equivalent CPU table\n");
218 return 0; 207 return 0;
219 } 208 }
220 209
221 buf += UCODE_CONTAINER_HEADER_SIZE; 210 buf += UCODE_CONTAINER_HEADER_SIZE;
222 if (get_ucode_data(equiv_cpu_table, buf, size)) { 211 get_ucode_data(equiv_cpu_table, buf, size);
223 vfree(equiv_cpu_table);
224 return 0;
225 }
226 212
227 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 213 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
228} 214}
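
The allocation changes here are mechanical: get_ucode_data() was a memcpy() wrapper that could never fail, so its return-value checks disappear, and the vmalloc()+memset() pair collapses into vzalloc(). The idiom, side by side (taken straight from the hunk above):

    /* before: two calls plus nested error handling */
    mc = vmalloc(UCODE_MAX_SIZE);
    if (mc)
            memset(mc, 0, UCODE_MAX_SIZE);

    /* after: one call, early return on failure */
    mc = vzalloc(UCODE_MAX_SIZE);
    if (!mc)
            return NULL;
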
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index dcb65cc0a053..1a1b606d3e92 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
364 364
365 /* For performance reasons, reuse mc area when possible */ 365 /* For performance reasons, reuse mc area when possible */
366 if (!mc || mc_size > curr_mc_size) { 366 if (!mc || mc_size > curr_mc_size) {
367 if (mc) 367 vfree(mc);
368 vfree(mc);
369 mc = vmalloc(mc_size); 368 mc = vmalloc(mc_size);
370 if (!mc) 369 if (!mc)
371 break; 370 break;
@@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
374 373
375 if (get_ucode_data(mc, ucode_ptr, mc_size) || 374 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
376 microcode_sanity_check(mc) < 0) { 375 microcode_sanity_check(mc) < 0) {
377 vfree(mc);
378 break; 376 break;
379 } 377 }
380 378
381 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { 379 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
382 if (new_mc) 380 vfree(new_mc);
383 vfree(new_mc);
384 new_rev = mc_header.rev; 381 new_rev = mc_header.rev;
385 new_mc = mc; 382 new_mc = mc;
386 mc = NULL; /* trigger new vmalloc */ 383 mc = NULL; /* trigger new vmalloc */
@@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
390 leftover -= mc_size; 387 leftover -= mc_size;
391 } 388 }
392 389
393 if (mc) 390 vfree(mc);
394 vfree(mc);
395 391
396 if (leftover) { 392 if (leftover) {
397 if (new_mc) 393 vfree(new_mc);
398 vfree(new_mc);
399 state = UCODE_ERROR; 394 state = UCODE_ERROR;
400 goto out; 395 goto out;
401 } 396 }
@@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 goto out; 400 goto out;
406 } 401 }
407 402
408 if (uci->mc) 403 vfree(uci->mc);
409 vfree(uci->mc);
410 uci->mc = (struct microcode_intel *)new_mc; 404 uci->mc = (struct microcode_intel *)new_mc;
411 405
412 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", 406 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
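
These hunks lean on the fact that vfree(NULL) is a no-op, just like kfree(NULL), so the surrounding NULL checks add nothing:

    /* before */
    if (new_mc)
            vfree(new_mc);

    /* after -- identical behaviour, one line */
    vfree(new_mc);
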
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 71825806cd44..ac861b8348e2 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe {
25}; 25};
26 26
27static u64 __cpuinitdata fam10h_pci_mmconf_base; 27static u64 __cpuinitdata fam10h_pci_mmconf_base;
28static int __cpuinitdata fam10h_pci_mmconf_base_status;
29 28
30static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { 29static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
31 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
44 return start1 - start2; 43 return start1 - start2;
45} 44}
46 45
47/*[47:0] */ 46#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
48/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ 47#define MMCONF_MASK (~(MMCONF_UNIT - 1))
48#define MMCONF_SIZE (MMCONF_UNIT << 8)
49/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
49#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) 50#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
50#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) 51#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
51static void __cpuinit get_fam10h_pci_mmconf_base(void) 52static void __cpuinit get_fam10h_pci_mmconf_base(void)
52{ 53{
53 int i; 54 int i;
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
64 struct range range[8]; 65 struct range range[8];
65 66
66 /* only try to get setting from BSP */ 67 /* only try to get setting from BSP */
67 /* -1 or 1 */ 68 if (fam10h_pci_mmconf_base)
68 if (fam10h_pci_mmconf_base_status)
69 return; 69 return;
70 70
71 if (!early_pci_allowed()) 71 if (!early_pci_allowed())
72 goto fail; 72 return;
73 73
74 found = 0; 74 found = 0;
75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
91 } 91 }
92 92
93 if (!found) 93 if (!found)
94 goto fail; 94 return;
95 95
96 /* SYS_CFG */ 96 /* SYS_CFG */
97 address = MSR_K8_SYSCFG; 97 address = MSR_K8_SYSCFG;
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
99 99
100 /* TOP_MEM2 is not enabled? */ 100 /* TOP_MEM2 is not enabled? */
101 if (!(val & (1<<21))) { 101 if (!(val & (1<<21))) {
102 tom2 = 0; 102 tom2 = 1ULL << 32;
103 } else { 103 } else {
104 /* TOP_MEM2 */ 104 /* TOP_MEM2 */
105 address = MSR_K8_TOP_MEM2; 105 address = MSR_K8_TOP_MEM2;
106 rdmsrl(address, val); 106 rdmsrl(address, val);
107 tom2 = val & (0xffffULL<<32); 107 tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
108 } 108 }
109 109
110 if (base <= tom2) 110 if (base <= tom2)
111 base = tom2 + (1ULL<<32); 111 base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
112 112
113 /* 113 /*
114 * need to check if the range is in the high mmio range that is 114 * need to check if the range is in the high mmio range that is
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
123 if (!(reg & 3)) 123 if (!(reg & 3))
124 continue; 124 continue;
125 125
126 start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 126 start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); 127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
128 end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 128 end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
129 129
130 if (!end) 130 if (end < tom2)
131 continue; 131 continue;
132 132
133 range[hi_mmio_num].start = start; 133 range[hi_mmio_num].start = start;
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
143 143
144 if (range[hi_mmio_num - 1].end < base) 144 if (range[hi_mmio_num - 1].end < base)
145 goto out; 145 goto out;
146 if (range[0].start > base) 146 if (range[0].start > base + MMCONF_SIZE)
147 goto out; 147 goto out;
148 148
149 /* need to find one window */ 149 /* need to find one window */
150 base = range[0].start - (1ULL << 32); 150 base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
151 if ((base > tom2) && BASE_VALID(base)) 151 if ((base > tom2) && BASE_VALID(base))
152 goto out; 152 goto out;
153 base = range[hi_mmio_num - 1].end + (1ULL << 32); 153 base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
154 if ((base > tom2) && BASE_VALID(base)) 154 if (BASE_VALID(base))
155 goto out; 155 goto out;
156 /* need to find window between ranges */ 156 /* need to find window between ranges */
157 if (hi_mmio_num > 1) 157 for (i = 1; i < hi_mmio_num; i++) {
158 for (i = 0; i < hi_mmio_num - 1; i++) { 158 base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
159 if (range[i + 1].start > (range[i].end + (1ULL << 32))) { 159 val = range[i].start & MMCONF_MASK;
160 base = range[i].end + (1ULL << 32); 160 if (val >= base + MMCONF_SIZE && BASE_VALID(base))
161 if ((base > tom2) && BASE_VALID(base)) 161 goto out;
162 goto out;
163 }
164 } 162 }
165
166fail:
167 fam10h_pci_mmconf_base_status = -1;
168 return; 163 return;
164
169out: 165out:
170 fam10h_pci_mmconf_base = base; 166 fam10h_pci_mmconf_base = base;
171 fam10h_pci_mmconf_base_status = 1;
172} 167}
173 168
174void __cpuinit fam10h_check_enable_mmcfg(void) 169void __cpuinit fam10h_check_enable_mmcfg(void)
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
190 185
191 /* only trust the one handle 256 buses, if acpi=off */ 186 /* only trust the one handle 256 buses, if acpi=off */
192 if (!acpi_pci_disabled || busnbits >= 8) { 187 if (!acpi_pci_disabled || busnbits >= 8) {
193 u64 base; 188 u64 base = val & MMCONF_MASK;
194 base = val & (0xffffULL << 32); 189
195 if (fam10h_pci_mmconf_base_status <= 0) { 190 if (!fam10h_pci_mmconf_base) {
196 fam10h_pci_mmconf_base = base; 191 fam10h_pci_mmconf_base = base;
197 fam10h_pci_mmconf_base_status = 1;
198 return; 192 return;
199 } else if (fam10h_pci_mmconf_base == base) 193 } else if (fam10h_pci_mmconf_base == base)
200 return; 194 return;
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
206 * with 256 buses 200 * with 256 buses
207 */ 201 */
208 get_fam10h_pci_mmconf_base(); 202 get_fam10h_pci_mmconf_base();
209 if (fam10h_pci_mmconf_base_status <= 0) 203 if (!fam10h_pci_mmconf_base) {
204 pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
210 return; 205 return;
206 }
211 207
212 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); 208 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
213 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | 209 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
@@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
217 wrmsrl(address, val); 213 wrmsrl(address, val);
218} 214}
219 215
220static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) 216static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
221{ 217{
222 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; 218 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
223 return 0; 219 return 0;
224} 220}
225 221
226static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { 222static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
227 { 223 {
228 .callback = set_check_enable_amd_mmconf, 224 .callback = set_check_enable_amd_mmconf,
229 .ident = "Sun Microsystems Machine", 225 .ident = "Sun Microsystems Machine",
@@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
234 {} 230 {}
235}; 231};
236 232
237void __cpuinit check_enable_amd_mmconf_dmi(void) 233/* Called from a __cpuinit function, but only on the BSP. */
234void __ref check_enable_amd_mmconf_dmi(void)
238{ 235{
239 dmi_check_system(mmconf_dmi_table); 236 dmi_check_system(mmconf_dmi_table);
240} 237}
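
The new MMCONF_* macros spell out the geometry of the MMCONFIG window that the old (0xffffULL << 32)-style masks only hinted at: one bus segment of MMCONF_UNIT bytes and a 256-bus window of MMCONF_SIZE bytes, with MMCONF_MASK keeping candidate bases aligned. As a worked illustration, assuming FAM10H_MMIO_CONF_BASE_SHIFT is 20 (the real value lives in msr-index.h, so this is an assumption of the example):

    #define FAM10H_MMIO_CONF_BASE_SHIFT 20                       /* assumed for the example */
    #define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)    /* 1 MiB per bus */
    #define MMCONF_MASK (~(MMCONF_UNIT - 1))
    #define MMCONF_SIZE (MMCONF_UNIT << 8)                       /* 256 buses -> 256 MiB window */

    /* first aligned candidate above TOM2, mirroring the hunk above */
    static unsigned long long candidate_above(unsigned long long tom2)
    {
            return (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
    }

BASE_VALID() then rejects any window that would overlap the 0xfd-0xff << 32 HyperTransport hole.
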
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
37 37
38void *module_alloc(unsigned long size) 38void *module_alloc(unsigned long size)
39{ 39{
40 struct vm_struct *area; 40 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL; 41 return NULL;
47 42 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); 43 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
49 if (!area) 44 -1, __builtin_return_address(0));
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
53 PAGE_KERNEL_EXEC);
54} 45}
55 46
56/* Free memory returned from module_alloc */ 47/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 9af64d9c4b67..01b0f6d06451 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -118,21 +118,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
118 118
119static void __init MP_ioapic_info(struct mpc_ioapic *m) 119static void __init MP_ioapic_info(struct mpc_ioapic *m)
120{ 120{
121 if (!(m->flags & MPC_APIC_USABLE)) 121 if (m->flags & MPC_APIC_USABLE)
122 return; 122 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
123
124 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
125 m->apicid, m->apicver, m->apicaddr);
126
127 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
128}
129
130static void print_MP_intsrc_info(struct mpc_intsrc *m)
131{
132 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
133 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
134 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
135 m->srcbusirq, m->dstapic, m->dstirq);
136} 123}
137 124
138static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) 125static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
@@ -144,73 +131,11 @@ static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
144 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); 131 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
145} 132}
146 133
147static void __init assign_to_mp_irq(struct mpc_intsrc *m,
148 struct mpc_intsrc *mp_irq)
149{
150 mp_irq->dstapic = m->dstapic;
151 mp_irq->type = m->type;
152 mp_irq->irqtype = m->irqtype;
153 mp_irq->irqflag = m->irqflag;
154 mp_irq->srcbus = m->srcbus;
155 mp_irq->srcbusirq = m->srcbusirq;
156 mp_irq->dstirq = m->dstirq;
157}
158
159static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
160 struct mpc_intsrc *m)
161{
162 m->dstapic = mp_irq->dstapic;
163 m->type = mp_irq->type;
164 m->irqtype = mp_irq->irqtype;
165 m->irqflag = mp_irq->irqflag;
166 m->srcbus = mp_irq->srcbus;
167 m->srcbusirq = mp_irq->srcbusirq;
168 m->dstirq = mp_irq->dstirq;
169}
170
171static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
172 struct mpc_intsrc *m)
173{
174 if (mp_irq->dstapic != m->dstapic)
175 return 1;
176 if (mp_irq->type != m->type)
177 return 2;
178 if (mp_irq->irqtype != m->irqtype)
179 return 3;
180 if (mp_irq->irqflag != m->irqflag)
181 return 4;
182 if (mp_irq->srcbus != m->srcbus)
183 return 5;
184 if (mp_irq->srcbusirq != m->srcbusirq)
185 return 6;
186 if (mp_irq->dstirq != m->dstirq)
187 return 7;
188
189 return 0;
190}
191
192static void __init MP_intsrc_info(struct mpc_intsrc *m)
193{
194 int i;
195
196 print_MP_intsrc_info(m);
197
198 for (i = 0; i < mp_irq_entries; i++) {
199 if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
200 return;
201 }
202
203 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
204 if (++mp_irq_entries == MAX_IRQ_SOURCES)
205 panic("Max # of irq sources exceeded!!\n");
206}
207#else /* CONFIG_X86_IO_APIC */ 134#else /* CONFIG_X86_IO_APIC */
208static inline void __init MP_bus_info(struct mpc_bus *m) {} 135static inline void __init MP_bus_info(struct mpc_bus *m) {}
209static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} 136static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
210static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
211#endif /* CONFIG_X86_IO_APIC */ 137#endif /* CONFIG_X86_IO_APIC */
212 138
213
214static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 139static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
215{ 140{
216 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," 141 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
@@ -222,7 +147,6 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
222/* 147/*
223 * Read/parse the MPC 148 * Read/parse the MPC
224 */ 149 */
225
226static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) 150static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
227{ 151{
228 152
@@ -275,18 +199,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
275 199
276void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } 200void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
277 201
278static void __init smp_register_lapic_address(unsigned long address)
279{
280 mp_lapic_addr = address;
281
282 set_fixmap_nocache(FIX_APIC_BASE, address);
283 if (boot_cpu_physical_apicid == -1U) {
284 boot_cpu_physical_apicid = read_apic_id();
285 apic_version[boot_cpu_physical_apicid] =
286 GET_APIC_VERSION(apic_read(APIC_LVR));
287 }
288}
289
290static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 202static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
291{ 203{
292 char str[16]; 204 char str[16];
@@ -301,17 +213,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
301#ifdef CONFIG_X86_32 213#ifdef CONFIG_X86_32
302 generic_mps_oem_check(mpc, oem, str); 214 generic_mps_oem_check(mpc, oem, str);
303#endif 215#endif
304 /* save the local APIC address, it might be non-default */ 216 /* Initialize the lapic mapping */
305 if (!acpi_lapic) 217 if (!acpi_lapic)
306 mp_lapic_addr = mpc->lapic; 218 register_lapic_address(mpc->lapic);
307 219
308 if (early) 220 if (early)
309 return 1; 221 return 1;
310 222
311 /* Initialize the lapic mapping */
312 if (!acpi_lapic)
313 smp_register_lapic_address(mpc->lapic);
314
315 if (mpc->oemptr) 223 if (mpc->oemptr)
316 x86_init.mpparse.smp_read_mpc_oem(mpc); 224 x86_init.mpparse.smp_read_mpc_oem(mpc);
317 225
@@ -337,7 +245,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
337 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); 245 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
338 break; 246 break;
339 case MP_INTSRC: 247 case MP_INTSRC:
340 MP_intsrc_info((struct mpc_intsrc *)mpt); 248 mp_save_irq((struct mpc_intsrc *)mpt);
341 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); 249 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
342 break; 250 break;
343 case MP_LINTSRC: 251 case MP_LINTSRC:
@@ -429,13 +337,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
429 337
430 intsrc.srcbusirq = i; 338 intsrc.srcbusirq = i;
431 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ 339 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
432 MP_intsrc_info(&intsrc); 340 mp_save_irq(&intsrc);
433 } 341 }
434 342
435 intsrc.irqtype = mp_ExtINT; 343 intsrc.irqtype = mp_ExtINT;
436 intsrc.srcbusirq = 0; 344 intsrc.srcbusirq = 0;
437 intsrc.dstirq = 0; /* 8259A to INTIN0 */ 345 intsrc.dstirq = 0; /* 8259A to INTIN0 */
438 MP_intsrc_info(&intsrc); 346 mp_save_irq(&intsrc);
439} 347}
440 348
441 349
@@ -784,11 +692,11 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
784 int i; 692 int i;
785 693
786 apic_printk(APIC_VERBOSE, "OLD "); 694 apic_printk(APIC_VERBOSE, "OLD ");
787 print_MP_intsrc_info(m); 695 print_mp_irq_info(m);
788 696
789 i = get_MP_intsrc_index(m); 697 i = get_MP_intsrc_index(m);
790 if (i > 0) { 698 if (i > 0) {
791 assign_to_mpc_intsrc(&mp_irqs[i], m); 699 memcpy(m, &mp_irqs[i], sizeof(*m));
792 apic_printk(APIC_VERBOSE, "NEW "); 700 apic_printk(APIC_VERBOSE, "NEW ");
793 print_mp_irq_info(&mp_irqs[i]); 701 print_mp_irq_info(&mp_irqs[i]);
794 return; 702 return;
@@ -875,14 +783,14 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
875 if (nr_m_spare > 0) { 783 if (nr_m_spare > 0) {
876 apic_printk(APIC_VERBOSE, "*NEW* found\n"); 784 apic_printk(APIC_VERBOSE, "*NEW* found\n");
877 nr_m_spare--; 785 nr_m_spare--;
878 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 786 memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
879 m_spare[nr_m_spare] = NULL; 787 m_spare[nr_m_spare] = NULL;
880 } else { 788 } else {
881 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 789 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
882 count += sizeof(struct mpc_intsrc); 790 count += sizeof(struct mpc_intsrc);
883 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) 791 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
884 goto out; 792 goto out;
885 assign_to_mpc_intsrc(&mp_irqs[i], m); 793 memcpy(m, &mp_irqs[i], sizeof(*m));
886 mpc->length = count; 794 mpc->length = count;
887 mpt += sizeof(struct mpc_intsrc); 795 mpt += sizeof(struct mpc_intsrc);
888 } 796 }
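
Since mp_irqs[] entries and struct mpc_intsrc share one layout, the field-by-field assign_to_mp_irq()/assign_to_mpc_intsrc() helpers reduce to plain structure copies, and the duplicate-check-then-record logic moves into the shared mp_save_irq() helper. The replacement idiom, as used in the hunks above:

    /* before: seven explicit member assignments */
    m->dstapic = mp_irq->dstapic;
    m->type    = mp_irq->type;
    /* ...and so on for the remaining fields... */

    /* after: one structure copy */
    memcpy(m, mp_irq, sizeof(*m));
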
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f70..12fcbe2c143e 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -30,7 +30,6 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/smp.h> 32#include <linux/smp.h>
33#include <linux/smp_lock.h>
34#include <linux/major.h> 33#include <linux/major.h>
35#include <linux/fs.h> 34#include <linux/fs.h>
36#include <linux/device.h> 35#include <linux/device.h>
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
421 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
422 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
423 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
424 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
425 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
426 429
427 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
428 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index ba0f0ca9f280..c01ffa5b9b87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -143,7 +143,7 @@ static void flush_gart(void)
143 143
144 spin_lock_irqsave(&iommu_bitmap_lock, flags); 144 spin_lock_irqsave(&iommu_bitmap_lock, flags);
145 if (need_flush) { 145 if (need_flush) {
146 k8_flush_garts(); 146 amd_flush_garts();
147 need_flush = false; 147 need_flush = false;
148 } 148 }
149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -561,17 +561,17 @@ static void enable_gart_translations(void)
561{ 561{
562 int i; 562 int i;
563 563
564 if (!k8_northbridges.gart_supported) 564 if (!amd_nb_has_feature(AMD_NB_GART))
565 return; 565 return;
566 566
567 for (i = 0; i < k8_northbridges.num; i++) { 567 for (i = 0; i < amd_nb_num(); i++) {
568 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 568 struct pci_dev *dev = node_to_amd_nb(i)->misc;
569 569
570 enable_gart_translation(dev, __pa(agp_gatt_table)); 570 enable_gart_translation(dev, __pa(agp_gatt_table));
571 } 571 }
572 572
573 /* Flush the GART-TLB to remove stale entries */ 573 /* Flush the GART-TLB to remove stale entries */
574 k8_flush_garts(); 574 amd_flush_garts();
575} 575}
576 576
577/* 577/*
@@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev)
596 if (!fix_up_north_bridges) 596 if (!fix_up_north_bridges)
597 return; 597 return;
598 598
599 if (!k8_northbridges.gart_supported) 599 if (!amd_nb_has_feature(AMD_NB_GART))
600 return; 600 return;
601 601
602 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 602 pr_info("PCI-DMA: Restoring GART aperture settings\n");
603 603
604 for (i = 0; i < k8_northbridges.num; i++) { 604 for (i = 0; i < amd_nb_num(); i++) {
605 struct pci_dev *dev = k8_northbridges.nb_misc[i]; 605 struct pci_dev *dev = node_to_amd_nb(i)->misc;
606 606
607 /* 607 /*
608 * Don't enable translations just yet. That is the next 608 * Don't enable translations just yet. That is the next
@@ -644,7 +644,7 @@ static struct sys_device device_gart = {
644 * Private Northbridge GATT initialization in case we cannot use the 644 * Private Northbridge GATT initialization in case we cannot use the
645 * AGP driver for some reason. 645 * AGP driver for some reason.
646 */ 646 */
647static __init int init_k8_gatt(struct agp_kern_info *info) 647static __init int init_amd_gatt(struct agp_kern_info *info)
648{ 648{
649 unsigned aper_size, gatt_size, new_aper_size; 649 unsigned aper_size, gatt_size, new_aper_size;
650 unsigned aper_base, new_aper_base; 650 unsigned aper_base, new_aper_base;
@@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
656 656
657 aper_size = aper_base = info->aper_size = 0; 657 aper_size = aper_base = info->aper_size = 0;
658 dev = NULL; 658 dev = NULL;
659 for (i = 0; i < k8_northbridges.num; i++) { 659 for (i = 0; i < amd_nb_num(); i++) {
660 dev = k8_northbridges.nb_misc[i]; 660 dev = node_to_amd_nb(i)->misc;
661 new_aper_base = read_aperture(dev, &new_aper_size); 661 new_aper_base = read_aperture(dev, &new_aper_size);
662 if (!new_aper_base) 662 if (!new_aper_base)
663 goto nommu; 663 goto nommu;
@@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void)
725 if (!no_agp) 725 if (!no_agp)
726 return; 726 return;
727 727
728 if (!k8_northbridges.gart_supported) 728 if (!amd_nb_has_feature(AMD_NB_GART))
729 return; 729 return;
730 730
731 for (i = 0; i < k8_northbridges.num; i++) { 731 for (i = 0; i < amd_nb_num(); i++) {
732 u32 ctl; 732 u32 ctl;
733 733
734 dev = k8_northbridges.nb_misc[i]; 734 dev = node_to_amd_nb(i)->misc;
735 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 735 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
736 736
737 ctl &= ~GARTEN; 737 ctl &= ~GARTEN;
@@ -749,14 +749,14 @@ int __init gart_iommu_init(void)
749 unsigned long scratch; 749 unsigned long scratch;
750 long i; 750 long i;
751 751
752 if (!k8_northbridges.gart_supported) 752 if (!amd_nb_has_feature(AMD_NB_GART))
753 return 0; 753 return 0;
754 754
755#ifndef CONFIG_AGP_AMD64 755#ifndef CONFIG_AGP_AMD64
756 no_agp = 1; 756 no_agp = 1;
757#else 757#else
758 /* Makefile puts PCI initialization via subsys_initcall first. */ 758 /* Makefile puts PCI initialization via subsys_initcall first. */
759 /* Add other K8 AGP bridge drivers here */ 759 /* Add other AMD AGP bridge drivers here */
760 no_agp = no_agp || 760 no_agp = no_agp ||
761 (agp_amd64_init() < 0) || 761 (agp_amd64_init() < 0) ||
762 (agp_copy_info(agp_bridge, &info) < 0); 762 (agp_copy_info(agp_bridge, &info) < 0);
@@ -765,7 +765,7 @@ int __init gart_iommu_init(void)
765 if (no_iommu || 765 if (no_iommu ||
766 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 766 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
767 !gart_iommu_aperture || 767 !gart_iommu_aperture ||
768 (no_agp && init_k8_gatt(&info) < 0)) { 768 (no_agp && init_amd_gatt(&info) < 0)) {
769 if (max_pfn > MAX_DMA32_PFN) { 769 if (max_pfn > MAX_DMA32_PFN) {
770 pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); 770 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
771 pr_warning("falling back to iommu=soft.\n"); 771 pr_warning("falling back to iommu=soft.\n");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86aa..e764fc05d700 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
17#include <asm/cpu.h>
17#include <asm/system.h> 18#include <asm/system.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
19#include <asm/syscalls.h> 20#include <asm/syscalls.h>
@@ -22,11 +23,6 @@
22#include <asm/i387.h> 23#include <asm/i387.h>
23#include <asm/debugreg.h> 24#include <asm/debugreg.h>
24 25
25unsigned long idle_halt;
26EXPORT_SYMBOL(idle_halt);
27unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait);
29
30struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep); 27EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 28
@@ -91,8 +87,7 @@ void exit_thread(void)
91void show_regs(struct pt_regs *regs) 87void show_regs(struct pt_regs *regs)
92{ 88{
93 show_registers(regs); 89 show_registers(regs);
94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
95 regs->bp);
96} 91}
97 92
98void show_regs_common(void) 93void show_regs_common(void)
@@ -328,7 +323,7 @@ long sys_execve(const char __user *name,
328/* 323/*
329 * Idle related variables and functions 324 * Idle related variables and functions
330 */ 325 */
331unsigned long boot_option_idle_override = 0; 326unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
332EXPORT_SYMBOL(boot_option_idle_override); 327EXPORT_SYMBOL(boot_option_idle_override);
333 328
334/* 329/*
@@ -374,6 +369,7 @@ void default_idle(void)
374{ 369{
375 if (hlt_use_halt()) { 370 if (hlt_use_halt()) {
376 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 371 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
372 trace_cpu_idle(1, smp_processor_id());
377 current_thread_info()->status &= ~TS_POLLING; 373 current_thread_info()->status &= ~TS_POLLING;
378 /* 374 /*
379 * TS_POLLING-cleared state must be visible before we 375 * TS_POLLING-cleared state must be visible before we
@@ -386,6 +382,8 @@ void default_idle(void)
386 else 382 else
387 local_irq_enable(); 383 local_irq_enable();
388 current_thread_info()->status |= TS_POLLING; 384 current_thread_info()->status |= TS_POLLING;
385 trace_power_end(smp_processor_id());
386 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
389 } else { 387 } else {
390 local_irq_enable(); 388 local_irq_enable();
391 /* loop is done by the caller */ 389 /* loop is done by the caller */
@@ -443,9 +441,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
443 */ 441 */
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 442void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 443{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 if (!need_resched()) { 444 if (!need_resched()) {
448 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 445 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
449 clflush((void *)&current_thread_info()->flags); 446 clflush((void *)&current_thread_info()->flags);
450 447
451 __monitor((void *)&current_thread_info()->flags, 0, 0); 448 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -460,7 +457,8 @@ static void mwait_idle(void)
460{ 457{
461 if (!need_resched()) { 458 if (!need_resched()) {
462 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 459 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
463 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 460 trace_cpu_idle(1, smp_processor_id());
461 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
464 clflush((void *)&current_thread_info()->flags); 462 clflush((void *)&current_thread_info()->flags);
465 463
466 __monitor((void *)&current_thread_info()->flags, 0, 0); 464 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -469,6 +467,8 @@ static void mwait_idle(void)
469 __sti_mwait(0, 0); 467 __sti_mwait(0, 0);
470 else 468 else
471 local_irq_enable(); 469 local_irq_enable();
470 trace_power_end(smp_processor_id());
471 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
472 } else 472 } else
473 local_irq_enable(); 473 local_irq_enable();
474} 474}
@@ -481,10 +481,12 @@ static void mwait_idle(void)
481static void poll_idle(void) 481static void poll_idle(void)
482{ 482{
483 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 483 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
484 trace_cpu_idle(0, smp_processor_id());
484 local_irq_enable(); 485 local_irq_enable();
485 while (!need_resched()) 486 while (!need_resched())
486 cpu_relax(); 487 cpu_relax();
487 trace_power_end(0); 488 trace_power_end(smp_processor_id());
489 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
488} 490}
489 491
490/* 492/*
@@ -499,17 +501,16 @@ static void poll_idle(void)
499 * 501 *
500 * idle=mwait overrides this decision and forces the usage of mwait. 502 * idle=mwait overrides this decision and forces the usage of mwait.
501 */ 503 */
502static int __cpuinitdata force_mwait;
503 504
504#define MWAIT_INFO 0x05 505#define MWAIT_INFO 0x05
505#define MWAIT_ECX_EXTENDED_INFO 0x01 506#define MWAIT_ECX_EXTENDED_INFO 0x01
506#define MWAIT_EDX_C1 0xf0 507#define MWAIT_EDX_C1 0xf0
507 508
508static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 509int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
509{ 510{
510 u32 eax, ebx, ecx, edx; 511 u32 eax, ebx, ecx, edx;
511 512
512 if (force_mwait) 513 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
513 return 1; 514 return 1;
514 515
515 if (c->cpuid_level < MWAIT_INFO) 516 if (c->cpuid_level < MWAIT_INFO)
@@ -629,9 +630,10 @@ static int __init idle_setup(char *str)
629 if (!strcmp(str, "poll")) { 630 if (!strcmp(str, "poll")) {
630 printk("using polling idle threads.\n"); 631 printk("using polling idle threads.\n");
631 pm_idle = poll_idle; 632 pm_idle = poll_idle;
632 } else if (!strcmp(str, "mwait")) 633 boot_option_idle_override = IDLE_POLL;
633 force_mwait = 1; 634 } else if (!strcmp(str, "mwait")) {
634 else if (!strcmp(str, "halt")) { 635 boot_option_idle_override = IDLE_FORCE_MWAIT;
636 } else if (!strcmp(str, "halt")) {
635 /* 637 /*
636 * When the boot option of idle=halt is added, halt is 638 * When the boot option of idle=halt is added, halt is
637 * forced to be used for CPU idle. In such case CPU C2/C3 639 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -640,8 +642,7 @@ static int __init idle_setup(char *str)
640 * the boot_option_idle_override. 642 * the boot_option_idle_override.
641 */ 643 */
642 pm_idle = default_idle; 644 pm_idle = default_idle;
643 idle_halt = 1; 645 boot_option_idle_override = IDLE_HALT;
644 return 0;
645 } else if (!strcmp(str, "nomwait")) { 646 } else if (!strcmp(str, "nomwait")) {
646 /* 647 /*
647 * If the boot option of "idle=nomwait" is added, 648 * If the boot option of "idle=nomwait" is added,
@@ -649,12 +650,10 @@ static int __init idle_setup(char *str)
649 * states. In such case it won't touch the variable 650 * states. In such case it won't touch the variable
650 * of boot_option_idle_override. 651 * of boot_option_idle_override.
651 */ 652 */
652 idle_nomwait = 1; 653 boot_option_idle_override = IDLE_NOMWAIT;
653 return 0;
654 } else 654 } else
655 return -1; 655 return -1;
656 656
657 boot_option_idle_override = 1;
658 return 0; 657 return 0;
659} 658}
660early_param("idle", idle_setup); 659early_param("idle", idle_setup);
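
All the scattered idle flags (idle_halt, idle_nomwait, force_mwait, plus the old 0/1 boot_option_idle_override) collapse into one variable that records which, if any, idle= option was given. The hunk assumes an enumeration along these lines; the exact definition lives in asm/processor.h, so take the ordering here as a sketch:

    enum idle_boot_override {
            IDLE_NO_OVERRIDE = 0,
            IDLE_HALT,
            IDLE_NOMWAIT,
            IDLE_FORCE_MWAIT,
            IDLE_POLL,
    };

Consumers such as ACPI's C-state code can then compare boot_option_idle_override against a specific value instead of juggling several exported flags.
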
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3cbbbf..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 61
64/* 62/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
113 stop_critical_timings(); 111 stop_critical_timings();
114 pm_idle(); 112 pm_idle();
115 start_critical_timings(); 113 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
118 } 114 }
119 tick_nohz_restart_sched_tick(); 115 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 116 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3d7a3a04f38..bd387e8f73b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
56asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
57 55
58DEFINE_PER_CPU(unsigned long, old_rsp); 56DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,8 +139,6 @@ void cpu_idle(void)
141 pm_idle(); 139 pm_idle();
142 start_critical_timings(); 140 start_critical_timings();
143 141
144 trace_power_end(smp_processor_id());
145
146 /* In many cases the interrupt that ended idle 142 /* In many cases the interrupt that ended idle
147 has already called exit_idle. But some idle 143 has already called exit_idle. But some idle
148 loops can be woken up without interrupt. */ 144 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index bab3b9e6f66d..42eb3300dfc6 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -41,44 +41,6 @@ void pvclock_set_flags(u8 flags)
41 valid_flags = flags; 41 valid_flags = flags;
42} 42}
43 43
44/*
45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
46 * yielding a 64-bit result.
47 */
48static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
49{
50 u64 product;
51#ifdef __i386__
52 u32 tmp1, tmp2;
53#endif
54
55 if (shift < 0)
56 delta >>= -shift;
57 else
58 delta <<= shift;
59
60#ifdef __i386__
61 __asm__ (
62 "mul %5 ; "
63 "mov %4,%%eax ; "
64 "mov %%edx,%4 ; "
65 "mul %5 ; "
66 "xor %5,%5 ; "
67 "add %4,%%eax ; "
68 "adc %5,%%edx ; "
69 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
70 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
71#elif defined(__x86_64__)
72 __asm__ (
73 "mul %%rdx ; shrd $32,%%rdx,%%rax"
74 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
75#else
76#error implement me!
77#endif
78
79 return product;
80}
81
82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) 44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
83{ 45{
84 u64 delta = native_read_tsc() - shadow->tsc_timestamp; 46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
@@ -121,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
121 83
122static atomic64_t last_value = ATOMIC64_INIT(0); 84static atomic64_t last_value = ATOMIC64_INIT(0);
123 85
86void pvclock_resume(void)
87{
88 atomic64_set(&last_value, 0);
89}
90
124cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
125{ 92{
126 struct pvclock_shadow_time shadow; 93 struct pvclock_shadow_time shadow;
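
The deleted scale_delta() computed (delta << shift) * mul_frac / 2^32 while keeping the full intermediate width, which is why it needed per-architecture assembly. A portable sketch of the same arithmetic using a 128-bit intermediate (illustration only, not the kernel's implementation):

    #include <stdint.h>

    static uint64_t scale_delta_sketch(uint64_t delta, uint32_t mul_frac, int shift)
    {
            if (shift < 0)
                    delta >>= -shift;
            else
                    delta <<= shift;

            /* 64x32 multiply, then drop the low 32 fraction bits */
            return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
    }

pvclock_resume(), the only addition to this file, resets last_value so the monotonicity clamp in pvclock_clocksource_read() does not carry a stale timestamp across suspend/resume.
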
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c495aa8d4815..fc7aae1e2bc7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -18,6 +18,7 @@
18#include <asm/pci_x86.h> 18#include <asm/pci_x86.h>
19#include <asm/virtext.h> 19#include <asm/virtext.h>
20#include <asm/cpu.h> 20#include <asm/cpu.h>
21#include <asm/nmi.h>
21 22
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23# include <linux/ctype.h> 24# include <linux/ctype.h>
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self,
747{ 748{
748 int cpu; 749 int cpu;
749 750
750 if (val != DIE_NMI_IPI) 751 if (val != DIE_NMI)
751 return NOTIFY_OK; 752 return NOTIFY_OK;
752 753
753 cpu = raw_smp_processor_id(); 754 cpu = raw_smp_processor_id();
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void)
778 779
779static struct notifier_block crash_nmi_nb = { 780static struct notifier_block crash_nmi_nb = {
780 .notifier_call = crash_nmi_callback, 781 .notifier_call = crash_nmi_callback,
782 /* we want to be the first one called */
783 .priority = NMI_LOCAL_HIGH_PRIOR+1,
781}; 784};
782 785
783/* Halt all other CPUs, calling the specified function on each of them 786/* Halt all other CPUs, calling the specified function on each of them
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..2a26819bb6a8
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,48 @@
1#include <linux/ioport.h>
2#include <asm/e820.h>
3
4static void resource_clip(struct resource *res, resource_size_t start,
5 resource_size_t end)
6{
7 resource_size_t low = 0, high = 0;
8
9 if (res->end < start || res->start > end)
10 return; /* no conflict */
11
12 if (res->start < start)
13 low = start - res->start;
14
15 if (res->end > end)
16 high = res->end - end;
17
18 /* Keep the area above or below the conflict, whichever is larger */
19 if (low > high)
20 res->end = start - 1;
21 else
22 res->start = end + 1;
23}
24
25static void remove_e820_regions(struct resource *avail)
26{
27 int i;
28 struct e820entry *entry;
29
30 for (i = 0; i < e820.nr_map; i++) {
31 entry = &e820.map[i];
32
33 resource_clip(avail, entry->addr,
34 entry->addr + entry->size - 1);
35 }
36}
37
38void arch_remove_reservations(struct resource *avail)
39{
40 /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
41 if (avail->flags & IORESOURCE_MEM) {
42 if (avail->start < BIOS_END)
43 avail->start = BIOS_END;
44 resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
45
46 remove_e820_regions(avail);
47 }
48}
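
resource_clip() keeps whichever side of a conflict leaves more room. A worked example with hypothetical numbers:

    /* available window [0x90000, 0x1fffff], conflicting region [0xa0000, 0xfffff] */
    struct resource avail = { .start = 0x90000, .end = 0x1fffff };

    resource_clip(&avail, 0xa0000, 0xfffff);
    /*
     * low  = 0xa0000  - 0x90000 = 0x10000  (room below the conflict)
     * high = 0x1fffff - 0xfffff = 0x100000 (room above the conflict)
     * high wins, so avail becomes [0x100000, 0x1fffff].
     */

arch_remove_reservations() applies this clip once for the fixed BIOS ROM window and once per e820 entry (after bumping the start past BIOS_END), so the resulting range never lands on firmware-owned memory.
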
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..6f39cab052d5 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -76,7 +76,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 		CMOS_WRITE(real_seconds, RTC_SECONDS);
 		CMOS_WRITE(real_minutes, RTC_MINUTES);
 	} else {
-		printk(KERN_WARNING
+		printk_once(KERN_NOTICE
 		       "set_rtc_mmss: can't update from %d to %d\n",
 		       cmos_minutes, real_minutes);
 		retval = -1;
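printk_once() demotes this to a single KERN_NOTICE message per boot instead of a warning on every failed RTC update. Roughly how the helper works (paraphrased, not quoted from printk.h):

	#define printk_once(fmt, ...)				\
	({							\
		static bool __print_once;			\
								\
		if (!__print_once) {				\
			__print_once = true;			\
			printk(fmt, ##__VA_ARGS__);		\
		}						\
	})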
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 68d535a77df0..ca2f10622a79 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -501,7 +501,18 @@ static inline unsigned long long get_total_mem(void)
 	return total << PAGE_SHIFT;
 }
 
-#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF
+/*
+ * Keep the crash kernel below this limit. On 32 bits earlier kernels
+ * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
+ * limit once kexec-tools are fixed.
+ */
+#ifdef CONFIG_X86_32
+# define CRASH_KERNEL_ADDR_MAX	(512 << 20)
+#else
+# define CRASH_KERNEL_ADDR_MAX	(896 << 20)
+#endif
+
 static void __init reserve_crashkernel(void)
 {
 	unsigned long long total_mem;
@@ -520,10 +531,10 @@ static void __init reserve_crashkernel(void)
 		const unsigned long long alignment = 16<<20;	/* 16M */
 
 		/*
-		 * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX
+		 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
 		 */
 		crash_base = memblock_find_in_range(alignment,
-			       DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment);
+			       CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
 
 		if (crash_base == MEMBLOCK_ERROR) {
 			pr_info("crashkernel reservation failed - No suitable area found.\n");
@@ -694,7 +705,7 @@ static u64 __init get_max_mapped(void)
 void __init setup_arch(char **cmdline_p)
 {
 	int acpi = 0;
-	int k8 = 0;
+	int amd = 0;
 	unsigned long flags;
 
 #ifdef CONFIG_X86_32
@@ -769,7 +780,6 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_init.oem.arch_setup();
 
-	resource_alloc_from_bottom = 0;
 	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
 	setup_memory_map();
 	parse_setup_data();
@@ -981,12 +991,12 @@ void __init setup_arch(char **cmdline_p)
 	acpi = acpi_numa_init();
 #endif
 
-#ifdef CONFIG_K8_NUMA
+#ifdef CONFIG_AMD_NUMA
 	if (!acpi)
-		k8 = !k8_numa_init(0, max_pfn);
+		amd = !amd_numa_init(0, max_pfn);
 #endif
 
-	initmem_init(0, max_pfn, acpi, k8);
+	initmem_init(0, max_pfn, acpi, amd);
 	memblock_find_dma_reserve();
 	dma32_reserve_bootmem();
 
@@ -1035,10 +1045,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 	init_apic_mappings();
-	ioapic_init_mappings();
-
-	/* need to wait for io_apic is mapped */
-	probe_nr_irqs_gsi();
+	ioapic_and_gsi_init();
 
 	kvm_guest_init();
 
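The old and new 64-bit limits are the same value under two names: 0x37FFFFFF is 896 MiB - 1, so only the 32-bit 512 MiB case is actually new. Quick arithmetic check (any C compiler):

	#include <stdio.h>

	int main(void)
	{
		printf("%#x\n", 512 << 20);	/* 0x20000000 */
		printf("%#x\n", 896 << 20);	/* 0x38000000 == 0x37FFFFFF + 1 */
		return 0;
	}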
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 083e99d1b7df..0cbe8c0b35ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,12 +97,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
  */
 static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
 
-void cpu_hotplug_driver_lock()
+void cpu_hotplug_driver_lock(void)
 {
 	mutex_lock(&x86_cpu_hotplug_driver_mutex);
 }
 
-void cpu_hotplug_driver_unlock()
+void cpu_hotplug_driver_unlock(void)
 {
 	mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 }
@@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void)
 	 */
 	smp_store_cpu_info(cpuid);
 
+	/*
+	 * This must be done before setting cpu_online_mask
+	 * or calling notify_cpu_starting.
+	 */
+	set_cpu_sibling_map(raw_smp_processor_id());
+	wmb();
+
 	notify_cpu_starting(cpuid);
 
 	/*
@@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	 */
 	check_tsc_sync_target();
 
-	if (nmi_watchdog == NMI_IO_APIC) {
-		legacy_pic->mask(0);
-		enable_NMI_through_LVT0();
-		legacy_pic->unmask(0);
-	}
-
-	/* This must be done before setting cpu_online_mask */
-	set_cpu_sibling_map(raw_smp_processor_id());
-	wmb();
-
 	/*
 	 * We need to hold call_lock, so there is no inconsistency
 	 * between the time smp_call_function() determines number of
@@ -430,7 +427,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 
 	cpumask_set_cpu(cpu, c->llc_shared_map);
 
-	if (current_cpu_data.x86_max_cores == 1) {
+	if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
 		cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
 		c->booted_cores = 1;
 		return;
@@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
 		printk(KERN_INFO "SMP mode deactivated.\n");
 		smpboot_clear_io_apic();
 
-		localise_nmi_watchdog();
-
 		connect_bsp_APIC();
 		setup_local_APIC();
 		end_local_APIC_setup();
@@ -1094,7 +1089,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 	preempt_disable();
 	smp_cpu_index_default();
-	current_cpu_data = boot_cpu_data;
+	memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info));
 	cpumask_copy(cpu_callin_mask, cpumask_of(0));
 	mb();
 	/*
@@ -1166,6 +1161,20 @@ out:
 	preempt_enable();
 }
 
+void arch_disable_nonboot_cpus_begin(void)
+{
+	/*
+	 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
+	 * In the suspend path, we will be back in the SMP mode shortly anyways.
+	 */
+	skip_smp_alternatives = true;
+}
+
+void arch_disable_nonboot_cpus_end(void)
+{
+	skip_smp_alternatives = false;
+}
+
 void arch_enable_nonboot_cpus_begin(void)
 {
 	set_mtrr_aps_delayed_init();
@@ -1196,7 +1205,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
-	check_nmi_watchdog();
 	mtrr_aps_init();
 }
 
@@ -1341,8 +1349,6 @@ int native_cpu_disable(void)
 	if (cpu == 0)
 		return -EBUSY;
 
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		stop_apic_nmi_watchdog(NULL);
 	clear_local_APIC();
 
 	cpu_disable_common();
@@ -1377,7 +1383,7 @@ void play_dead_common(void)
 
 	mb();
 	/* Ack it */
-	__get_cpu_var(cpu_state) = CPU_DEAD;
+	__this_cpu_write(cpu_state, CPU_DEAD);
 
 	/*
 	 * With physical CPU hotplug, we should halt the cpu
@@ -1396,12 +1402,13 @@ static inline void mwait_play_dead(void)
 	unsigned int highest_subcstate = 0;
 	int i;
 	void *mwait_ptr;
+	struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
 
-	if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
+	if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
 		return;
-	if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
+	if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
 		return;
-	if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
 		return;
 
 	eax = CPUID_MWAIT_LEAF;
@@ -1452,7 +1459,7 @@ static inline void mwait_play_dead(void)
 
 static inline void hlt_play_dead(void)
 {
-	if (current_cpu_data.x86 >= 4)
+	if (__this_cpu_read(cpu_info.x86) >= 4)
 		wbinvd();
 
 	while (1) {
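The recurring substitution in this file swaps the current_cpu_data lvalue macro for explicit per-CPU accessors, which compile to single %gs-relative instructions. The idiom, sketched on a private variable (illustrative; cpu_info is the real per-CPU cpuinfo_x86):

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(int, my_counter);

	static void example(void)
	{
		int v = __this_cpu_read(my_counter);	/* one %gs-based load */

		__this_cpu_write(my_counter, v + 1);	/* one %gs-based store */
	}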
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b53c525368a7..938c8e10a19a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
  */
 void save_stack_trace(struct stack_trace *trace)
 {
-	dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
+	dump_trace(current, NULL, NULL, &save_stack_ops, trace);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
 {
-	dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+	dump_trace(current, regs, NULL, &save_stack_ops, trace);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-	dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
+	dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 	pmd = pmd_alloc(&tboot_mm, pud, vaddr);
 	if (!pmd)
 		return -1;
-	pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+	pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
 	if (!pte)
 		return -1;
 	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..25a28a245937 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,10 +22,6 @@
 #include <asm/hpet.h>
 #include <asm/time.h>
 
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-int timer_ack;
-#endif
-
 #ifdef CONFIG_X86_64
 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
 #endif
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
 	/* Keep nmi watchdog up to date */
 	inc_irq_stat(irq0_irqs);
 
-	/* Optimized out for !IO_APIC and x86_64 */
-	if (timer_ack) {
-		/*
-		 * Subtle, when I/O APICs are used we have to ack timer IRQ
-		 * manually to deassert NMI lines for the watchdog if run
-		 * on an 82489DX-based system.
-		 */
-		raw_spin_lock(&i8259A_lock);
-		outb(0x0c, PIC_MASTER_OCW3);
-		/* Ack the IRQ; AEOI will end it automatically. */
-		inb(PIC_MASTER_POLL);
-		raw_spin_unlock(&i8259A_lock);
-	}
-
 	global_clock_event->event_handler(global_clock_event);
 
 	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..075d130efcf9 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -127,7 +127,7 @@ startup_64:
 no_longmode:
 	hlt
 	jmp no_longmode
-#include "verify_cpu_64.S"
+#include "verify_cpu.S"
 
 	# Careful these need to be in the same 64K segment as the above;
 tidt:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index cb838ca42c96..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,13 @@ EXPORT_SYMBOL_GPL(used_vectors);
 
 static int ignore_nmis;
 
+int unknown_nmi_panic;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
 static inline void conditional_sti(struct pt_regs *regs)
 {
 	if (regs->flags & X86_EFLAGS_IF)
@@ -300,16 +307,23 @@ gp_in_kernel:
 	die("general protection fault", regs, error_code);
 }
 
-static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
+static int __init setup_unknown_nmi_panic(char *str)
 {
-	printk(KERN_EMERG
-		"Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-			reason, smp_processor_id());
+	unknown_nmi_panic = 1;
+	return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
 
-	printk(KERN_EMERG
-		"You have some hardware problem, likely on the PCI bus.\n");
+static notrace __kprobes void
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
+{
+	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+		 reason, smp_processor_id());
 
+	/*
+	 * On some machines, PCI SERR line is used to report memory
+	 * errors. EDAC makes use of it.
+	 */
 #if defined(CONFIG_EDAC)
 	if (edac_handler_set()) {
 		edac_atomic_assert_error();
@@ -320,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
 	if (panic_on_unrecovered_nmi)
 		panic("NMI: Not continuing");
 
-	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+	pr_emerg("Dazed and confused, but trying to continue\n");
 
-	/* Clear and disable the memory parity error line. */
-	reason = (reason & 0xf) | 4;
-	outb(reason, 0x61);
+	/* Clear and disable the PCI SERR error line. */
+	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+	outb(reason, NMI_REASON_PORT);
 }
 
 static notrace __kprobes void
@@ -332,22 +346,26 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 {
 	unsigned long i;
 
-	printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+	pr_emerg(
+	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+		 reason, smp_processor_id());
 	show_registers(regs);
 
 	if (panic_on_io_nmi)
 		panic("NMI IOCK error: Not continuing");
 
 	/* Re-enable the IOCK line, wait for a few seconds */
-	reason = (reason & 0xf) | 8;
-	outb(reason, 0x61);
+	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+	outb(reason, NMI_REASON_PORT);
 
-	i = 2000;
-	while (--i)
-		udelay(1000);
+	i = 20000;
+	while (--i) {
+		touch_nmi_watchdog();
+		udelay(100);
+	}
 
-	reason &= ~8;
-	outb(reason, 0x61);
+	reason &= ~NMI_REASON_CLEAR_IOCHK;
+	outb(reason, NMI_REASON_PORT);
 }
 
 static notrace __kprobes void
@@ -366,69 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 		return;
 	}
 #endif
-	printk(KERN_EMERG
-		"Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-			reason, smp_processor_id());
+	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+		 reason, smp_processor_id());
 
-	printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-	if (panic_on_unrecovered_nmi)
+	pr_emerg("Do you have a strange power saving mode enabled?\n");
+	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
 		panic("NMI: Not continuing");
 
-	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+	pr_emerg("Dazed and confused, but trying to continue\n");
 }
 
 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 {
 	unsigned char reason = 0;
-	int cpu;
 
-	cpu = smp_processor_id();
-
-	/* Only the BSP gets external NMIs from the system. */
-	if (!cpu)
-		reason = get_nmi_reason();
-
-	if (!(reason & 0xc0)) {
-		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
-								== NOTIFY_STOP)
-			return;
+	/*
+	 * CPU-specific NMI must be processed before non-CPU-specific
+	 * NMI, otherwise we may lose it, because the CPU-specific
+	 * NMI can not be detected/processed on other CPUs.
+	 */
+	if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
+		return;
 
-#ifdef CONFIG_X86_LOCAL_APIC
-		if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-							== NOTIFY_STOP)
-			return;
+	/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
+	raw_spin_lock(&nmi_reason_lock);
+	reason = get_nmi_reason();
 
-#ifndef CONFIG_LOCKUP_DETECTOR
+	if (reason & NMI_REASON_MASK) {
+		if (reason & NMI_REASON_SERR)
+			pci_serr_error(reason, regs);
+		else if (reason & NMI_REASON_IOCHK)
+			io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
 		/*
-		 * Ok, so this is none of the documented NMI sources,
-		 * so it must be the NMI watchdog.
+		 * Reassert NMI in case it became active
+		 * meanwhile as it's edge-triggered:
 		 */
-		if (nmi_watchdog_tick(regs, reason))
-			return;
-		if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_LOCKUP_DETECTOR */
-			unknown_nmi_error(reason, regs);
-#else
-		unknown_nmi_error(reason, regs);
+		reassert_nmi();
 #endif
-
+		raw_spin_unlock(&nmi_reason_lock);
 		return;
 	}
-	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-		return;
+	raw_spin_unlock(&nmi_reason_lock);
 
-	/* AK: following checks seem to be broken on modern chipsets. FIXME */
-	if (reason & 0x80)
-		mem_parity_error(reason, regs);
-	if (reason & 0x40)
-		io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
-	/*
-	 * Reassert NMI in case it became active meanwhile
-	 * as it's edge-triggered:
-	 */
-	reassert_nmi();
-#endif
+	unknown_nmi_error(reason, regs);
 }
 
 dotraplinkage notrace __kprobes void
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 void stop_nmi(void)
 {
-	acpi_nmi_disable();
 	ignore_nmis++;
 }
 
 void restart_nmi(void)
 {
 	ignore_nmis--;
-	acpi_nmi_enable();
 }
 
 /* May run on IST stack. */
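For reference, the NMI_REASON_* names replace the raw masks visible in the removed lines (port 0x61 status bits 0x80/0x40, clear bits 0x04/0x08). A sketch of the decode, with values taken from the old open-coded constants:

	#define NMI_REASON_PORT		0x61
	#define NMI_REASON_SERR		0x80	/* PCI SERR# asserted */
	#define NMI_REASON_IOCHK	0x40	/* ISA IOCHK# asserted */
	#define NMI_REASON_CLEAR_SERR	0x04
	#define NMI_REASON_CLEAR_IOCHK	0x08
	#define NMI_REASON_CLEAR_MASK	0x0f	/* low bits are writable */
	#define NMI_REASON_MASK		(NMI_REASON_SERR | NMI_REASON_IOCHK)

	unsigned char reason = inb(NMI_REASON_PORT);

	if (reason & NMI_REASON_SERR)	/* ack by pulsing the clear bit */
		outb((reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR,
		     NMI_REASON_PORT);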
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0c40d8b72416..ffe5755caa8b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
 		tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
 
 		/* hpet or pmtimer available ? */
-		if (!hpet && !ref1 && !ref2)
+		if (ref1 == ref2)
 			continue;
 
 		/* Check, whether the sampling was disturbed by an SMI */
@@ -659,7 +659,7 @@ void restore_sched_clock_state(void)
 
 	local_irq_save(flags);
 
-	__get_cpu_var(cyc2ns_offset) = 0;
+	__this_cpu_write(cyc2ns_offset, 0);
 	offset = cyc2ns_suspend - sched_clock();
 
 	for_each_possible_cpu(cpu)
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void)
 
 	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		return 0;
+
+	if (tsc_clocksource_reliable)
+		return 0;
 	/*
 	 * Intel systems are normally all synchronized.
 	 * Exceptions must mark TSC as unstable:
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void)
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
 		/* assume multi socket systems are not synchronized: */
 		if (num_possible_cpus() > 1)
-			tsc_unstable = 1;
+			return 1;
 	}
 
-	return tsc_unstable;
+	return 0;
+}
+
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work - ignored.
+ *
+ * This functions uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomolies (too many SMIs, etc),
+ * or the refined calibration is off by 1% of the fast early
+ * calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+	static u64 tsc_start = -1, ref_start;
+	static int hpet;
+	u64 tsc_stop, ref_stop, delta;
+	unsigned long freq;
+
+	/* Don't bother refining TSC on unstable systems */
+	if (check_tsc_unstable())
+		goto out;
+
+	/*
+	 * Since the work is started early in boot, we may be
+	 * delayed the first time we expire. So set the workqueue
+	 * again once we know timers are working.
+	 */
+	if (tsc_start == -1) {
+		/*
+		 * Only set hpet once, to avoid mixing hardware
+		 * if the hpet becomes enabled later.
+		 */
+		hpet = is_hpet_enabled();
+		schedule_delayed_work(&tsc_irqwork, HZ);
+		tsc_start = tsc_read_refs(&ref_start, hpet);
+		return;
+	}
+
+	tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+	/* hpet or pmtimer available ? */
+	if (ref_start == ref_stop)
+		goto out;
+
+	/* Check, whether the sampling was disturbed by an SMI */
+	if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+		goto out;
+
+	delta = tsc_stop - tsc_start;
+	delta *= 1000000LL;
+	if (hpet)
+		freq = calc_hpet_ref(delta, ref_start, ref_stop);
+	else
+		freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+	/* Make sure we're within 1% */
+	if (abs(tsc_khz - freq) > tsc_khz/100)
+		goto out;
+
+	tsc_khz = freq;
+	printk(KERN_INFO "Refined TSC clocksource calibration: "
+		"%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+				    (unsigned long)tsc_khz % 1000);
+
+out:
+	clocksource_register_khz(&clocksource_tsc, tsc_khz);
 }
 
-static void __init init_tsc_clocksource(void)
+
+static int __init init_tsc_clocksource(void)
 {
+	if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
+		return 0;
+
 	if (tsc_clocksource_reliable)
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
 	/* lower the rating if we already know its unstable: */
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void)
 		clocksource_tsc.rating = 0;
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
 	}
-	clocksource_register_khz(&clocksource_tsc, tsc_khz);
+	schedule_delayed_work(&tsc_irqwork, 0);
+	return 0;
 }
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
 
 void __init tsc_init(void)
 {
@@ -949,6 +1036,5 @@ void __init tsc_init(void)
 		mark_tsc_unstable("TSCs unsynchronized");
 
 	check_system_tsc_reliable();
-	init_tsc_clocksource();
 }
 
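The refinement math above reduces to cycles divided by reference-timer elapsed time, accepted only within 1% of the early estimate. Worked numbers (stand-alone sketch, invented sample values):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		uint64_t tsc_delta = 2400000000ULL;	/* cycles over ~1 s */
		uint64_t elapsed_us = 1000000;		/* from HPET/PM timer */
		unsigned long early_khz = 2394000;	/* fast boot estimate */
		unsigned long freq = tsc_delta / elapsed_us * 1000; /* kHz */

		/* same 1% acceptance band as tsc_refine_calibration_work() */
		if (labs((long)freq - (long)early_khz) > (long)(early_khz / 100))
			freq = early_khz;		/* keep early value */
		printf("%lu kHz\n", freq);		/* 2400000 kHz */
		return 0;
	}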
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..0edefc19a113 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
  * Copyright (c) 2007 Andi Kleen (ak@suse.de)
  * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
  * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
+ * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
  * This is a common code for verification whether CPU supports
  * long mode and SSE or not. It is not called directly instead this
  * file is included at various places and compiled in that context.
- * Following are the current usage.
+ * This file is expected to run in 32bit code.  Currently:
  *
- * This file is included by both 16bit and 32bit code.
+ * arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ * arch/x86/kernel/trampoline_64.S: secondary processor verfication
+ * arch/x86/kernel/head_32.S: processor startup
  *
- * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
- * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
- *
- * verify_cpu, returns the status of cpu check in register %eax.
+ * verify_cpu, returns the status of longmode and SSE in register %eax.
  *		0: Success    1: Failure
  *
+ * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
  * The caller needs to check for the error code and take the action
  * appropriately. Either display a message or halt.
  */
@@ -62,8 +62,41 @@ verify_cpu:
 	cmpl $0x444d4163,%ecx
 	jnz verify_cpu_noamd
 	mov $1,%di			# cpu is from AMD
+	jmp verify_cpu_check
 
 verify_cpu_noamd:
+	cmpl $0x756e6547,%ebx		# GenuineIntel?
+	jnz verify_cpu_check
+	cmpl $0x49656e69,%edx
+	jnz verify_cpu_check
+	cmpl $0x6c65746e,%ecx
+	jnz verify_cpu_check
+
+	# only call IA32_MISC_ENABLE when:
+	# family > 6 || (family == 6 && model >= 0xd)
+	movl $0x1, %eax		# check CPU family and model
+	cpuid
+	movl %eax, %ecx
+
+	andl $0x0ff00f00, %eax	# mask family and extended family
+	shrl $8, %eax
+	cmpl $6, %eax
+	ja verify_cpu_clear_xd	# family > 6, ok
+	jb verify_cpu_check	# family < 6, skip
+
+	andl $0x000f00f0, %ecx	# mask model and extended model
+	shrl $4, %ecx
+	cmpl $0xd, %ecx
+	jb verify_cpu_check	# family == 6, model < 0xd, skip
+
+verify_cpu_clear_xd:
+	movl $MSR_IA32_MISC_ENABLE, %ecx
+	rdmsr
+	btrl $2, %edx			# clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+	jnc verify_cpu_check		# only write MSR if bit was changed
+	wrmsr
+
+verify_cpu_check:
 	movl $0x1,%eax		# Does the cpu have what it takes
 	cpuid
 	andl $REQUIRED_MASK0,%edx
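The family/model gate above folds the extended CPUID fields into the compares: after masking with 0x0ff00f00 and shifting right 8, extended family occupies bits 12-19, so any extended-family CPU trivially passes "family > 6"; likewise for extended model after the 0x000f00f0/>>4 step. The same decode in C (hypothetical, given EAX from CPUID leaf 1):

	unsigned int eax = 0x000206d7;	/* e.g. family 6, ext model 2, model 0xd */
	unsigned int family = (eax & 0x0ff00f00) >> 8;	/* 6 */
	unsigned int model  = (eax & 0x000f00f0) >> 4;	/* 0x200d, >= 0xd */

	if (family > 6 || (family == 6 && model >= 0xd))
		;	/* safe to clear XD_DISABLE in IA32_MISC_ENABLE */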
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	if (pud_none_or_clear_bad(pud))
 		goto out;
 	pmd = pmd_offset(pud, 0xA0000);
+	split_huge_page_pmd(mm, pmd);
 	if (pmd_none_or_clear_bad(pmd))
 		goto out;
 	pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e03530aebfd0..bf4700755184 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
 
 PHDRS {
 	text PT_LOAD FLAGS(5);		/* R_E */
-	data PT_LOAD FLAGS(7);		/* RWE */
+	data PT_LOAD FLAGS(6);		/* RW_ */
 #ifdef CONFIG_X86_64
 	user PT_LOAD FLAGS(5);		/* R_E */
 #ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
 
 	EXCEPTION_TABLE(16) :text = 0x9090
 
+#if defined(CONFIG_DEBUG_RODATA)
+	/* .text should occupy whole number of pages */
+	. = ALIGN(PAGE_SIZE);
+#endif
 	X64_ALIGN_DEBUG_RODATA_BEGIN
 	RO_DATA(PAGE_SIZE)
 	X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
 	__bss_start = .;
 	*(.bss..page_aligned)
 	*(.bss)
-	. = ALIGN(4);
+	. = ALIGN(PAGE_SIZE);
 	__bss_stop = .;
 }
 
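FLAGS() takes the standard ELF program-header permission bits, so the data segment change from 7 to 6 drops execute permission while text stays read-execute:

	/* ELF p_flags bits (elf.h) */
	#define PF_X	0x1		/* execute */
	#define PF_W	0x2		/* write */
	#define PF_R	0x4		/* read */

	/* FLAGS(5) == PF_R | PF_X        -> R_E (text)
	 * FLAGS(6) == PF_R | PF_W        -> RW_ (data, now NX)
	 * FLAGS(7) == PF_R | PF_W | PF_X -> RWE (old data setting) */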
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9c253bd65e24..547128546cc3 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void)
394 * Setup init_xstate_buf to represent the init state of 394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave 395 * all the features managed by the xsave
396 */ 396 */
397 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem_align(xstate_size,
398 __alignof__(struct xsave_struct));
398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 399 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399 400
400 clts(); 401 clts();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
+	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
 	---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
 
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
 
 CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
 				assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
  * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
  */
 
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
 }
 
 static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+register_address(struct decode_cache *c, unsigned long reg)
 {
-	return base + address_mask(c, reg);
+	return address_mask(c, reg);
 }
 
 static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
 	return ops->get_cached_segment_base(seg, ctxt->vcpu);
 }
 
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
 			     struct x86_emulate_ops *ops,
 			     struct decode_cache *c)
 {
 	if (!c->has_seg_override)
 		return 0;
 
-	return seg_base(ctxt, ops, c->seg_override);
+	return c->seg_override;
 }
 
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
+static ulong linear(struct x86_emulate_ctxt *ctxt,
+		    struct segmented_address addr)
 {
-	return seg_base(ctxt, ops, VCPU_SREG_ES);
-}
-
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
-{
-	return seg_base(ctxt, ops, VCPU_SREG_SS);
-}
+	struct decode_cache *c = &ctxt->decode;
+	ulong la;
 
-static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
-			      u32 error, bool valid)
-{
-	ctxt->exception = vec;
-	ctxt->error_code = error;
-	ctxt->error_code_valid = valid;
+	la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+	if (c->ad_bytes != 8)
+		la &= (u32)-1;
+	return la;
 }
 
-static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+			     u32 error, bool valid)
 {
-	emulate_exception(ctxt, GP_VECTOR, err, true);
+	ctxt->exception.vector = vec;
+	ctxt->exception.error_code = error;
+	ctxt->exception.error_code_valid = valid;
+	return X86EMUL_PROPAGATE_FAULT;
 }
 
-static void emulate_pf(struct x86_emulate_ctxt *ctxt)
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
 {
-	emulate_exception(ctxt, PF_VECTOR, 0, true);
+	return emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
 {
-	emulate_exception(ctxt, UD_VECTOR, 0, false);
+	return emulate_exception(ctxt, UD_VECTOR, 0, false);
 }
 
-static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
 {
-	emulate_exception(ctxt, TS_VECTOR, err, true);
+	return emulate_exception(ctxt, TS_VECTOR, err, true);
 }
 
 static int emulate_de(struct x86_emulate_ctxt *ctxt)
 {
-	emulate_exception(ctxt, DE_VECTOR, 0, false);
-	return X86EMUL_PROPAGATE_FAULT;
+	return emulate_exception(ctxt, DE_VECTOR, 0, false);
 }
510 497
511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 498static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 	cur_size = fc->end - fc->start;
 	size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
 	rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
-			size, ctxt->vcpu, NULL);
+			size, ctxt->vcpu, &ctxt->exception);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
 
 static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 			   struct x86_emulate_ops *ops,
-			   ulong addr,
+			   struct segmented_address addr,
 			   u16 *size, unsigned long *address, int op_bytes)
 {
 	int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (op_bytes == 2)
 		op_bytes = 3;
 	*address = 0;
-	rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
+	rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
+			   ctxt->vcpu, &ctxt->exception);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
+	addr.ea += 2;
+	rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
+			   ctxt->vcpu, &ctxt->exception);
 	return rc;
 }
 
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			break;
 		}
 	}
-	op->addr.mem = modrm_ea;
+	op->addr.mem.ea = modrm_ea;
 done:
 	return rc;
 }
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
 	op->type = OP_MEM;
 	switch (c->ad_bytes) {
 	case 2:
-		op->addr.mem = insn_fetch(u16, 2, c->eip);
+		op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
 		break;
 	case 4:
-		op->addr.mem = insn_fetch(u32, 4, c->eip);
+		op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
 		break;
 	case 8:
-		op->addr.mem = insn_fetch(u64, 8, c->eip);
+		op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
 		break;
 	}
 done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
 		else if (c->src.bytes == 4)
 			sv = (s32)c->src.val & (s32)mask;
 
-		c->dst.addr.mem += (sv >> 3);
+		c->dst.addr.mem.ea += (sv >> 3);
 	}
 
 	/* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 	struct read_cache *mc = &ctxt->decode.mem_read;
-	u32 err;
 
 	while (size) {
 		int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		if (mc->pos < mc->end)
 			goto read_cached;
 
-		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
-					ctxt->vcpu);
-		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt);
+		rc = ops->read_emulated(addr, mc->data + mc->end, n,
+					&ctxt->exception, ctxt->vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
 	int ret;
-	u32 err;
 	ulong addr;
 
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-	if (dt.size < index * 8 + 7) {
-		emulate_gp(ctxt, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, selector & 0xfffc);
 	addr = dt.address + index * 8;
-	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt);
+	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
+			    &ctxt->exception);
 
 	return ret;
 }
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
-	u32 err;
 	ulong addr;
 	int ret;
 
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-	if (dt.size < index * 8 + 7) {
-		emulate_gp(ctxt, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, selector & 0xfffc);
 
 	addr = dt.address + index * 8;
-	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt);
+	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
+			     &ctxt->exception);
 
 	return ret;
 }
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 	struct decode_cache *c = &ctxt->decode;
-	u32 err;
 
 	switch (c->dst.type) {
 	case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	case OP_MEM:
 		if (c->lock_prefix)
 			rc = ops->cmpxchg_emulated(
-					c->dst.addr.mem,
+					linear(ctxt, c->dst.addr.mem),
 					&c->dst.orig_val,
 					&c->dst.val,
 					c->dst.bytes,
-					&err,
+					&ctxt->exception,
 					ctxt->vcpu);
 		else
 			rc = ops->write_emulated(
-					c->dst.addr.mem,
+					linear(ctxt, c->dst.addr.mem),
 					&c->dst.val,
 					c->dst.bytes,
-					&err,
+					&ctxt->exception,
 					ctxt->vcpu);
-		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
 	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
-					   c->regs[VCPU_REGS_RSP]);
+	c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	c->dst.addr.mem.seg = VCPU_SREG_SS;
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
+	struct segmented_address addr;
 
-	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
-						       c->regs[VCPU_REGS_RSP]),
-			   dest, len);
+	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	addr.seg = VCPU_SREG_SS;
+	rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 		change_mask |= EFLG_IF;
 		break;
 	case X86EMUL_MODE_VM86:
-		if (iopl < 3) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if (iopl < 3)
+			return emulate_gp(ctxt, 0);
 		change_mask |= EFLG_IF;
 		break;
 	default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	*(unsigned long *)dest =
 		(ctxt->eflags & ~change_mask) | (val & change_mask);
 
-	if (rc == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt);
-
 	return rc;
 }
 
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	gva_t cs_addr;
 	gva_t eip_addr;
 	u16 cs, eip;
-	u32 err;
 
 	/* TODO: Add limit checks */
 	c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	eip_addr = dt.address + (irq << 2);
 	cs_addr = dt.address + (irq << 2) + 2;
 
-	rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+	rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+	rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	if (temp_eip & ~0xffff) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (temp_eip & ~0xffff)
+		return emulate_gp(ctxt, 0);
 
 	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
 
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	/* syscall is not available in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
-	    ctxt->mode == X86EMUL_MODE_VM86) {
-		emulate_ud(ctxt);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	    ctxt->mode == X86EMUL_MODE_VM86)
+		return emulate_ud(ctxt);
 
 	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u16 cs_sel, ss_sel;
 
 	/* inject #GP if in real mode */
-	if (ctxt->mode == X86EMUL_MODE_REAL) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (ctxt->mode == X86EMUL_MODE_REAL)
+		return emulate_gp(ctxt, 0);
 
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	 * Therefore, we inject an #UD.
 	 */
-	if (ctxt->mode == X86EMUL_MODE_PROT64) {
-		emulate_ud(ctxt);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		return emulate_ud(ctxt);
 
 	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_PROT32:
-		if ((msr_data & 0xfffc) == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if ((msr_data & 0xfffc) == 0x0)
+			return emulate_gp(ctxt, 0);
 		break;
 	case X86EMUL_MODE_PROT64:
-		if (msr_data == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if (msr_data == 0x0)
+			return emulate_gp(ctxt, 0);
 		break;
 	}
 
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
-	    ctxt->mode == X86EMUL_MODE_VM86) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	    ctxt->mode == X86EMUL_MODE_VM86)
+		return emulate_gp(ctxt, 0);
 
 	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	switch (usermode) {
 	case X86EMUL_MODE_PROT32:
 		cs_sel = (u16)(msr_data + 16);
-		if ((msr_data & 0xfffc) == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if ((msr_data & 0xfffc) == 0x0)
+			return emulate_gp(ctxt, 0);
 		ss_sel = (u16)(msr_data + 24);
 		break;
 	case X86EMUL_MODE_PROT64:
 		cs_sel = (u16)(msr_data + 32);
-		if (msr_data == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if (msr_data == 0x0)
+			return emulate_gp(ctxt, 0);
 		ss_sel = cs_sel + 8;
 		cs.d = 0;
 		cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 {
 	struct tss_segment_16 tss_seg;
 	int ret;
-	u32 err, new_tss_base = get_desc_base(new_desc);
+	u32 new_tss_base = get_desc_base(new_desc);
 
 	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	save_state_to_tss16(ctxt, ops, &tss_seg);
 
 	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			     &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			     &ctxt->exception);
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	if (old_tss_sel != 0xffff) {
 		tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 		ret = ops->write_std(new_tss_base,
 				     &tss_seg.prev_task_link,
 				     sizeof tss_seg.prev_task_link,
-				     ctxt->vcpu, &err);
-		if (ret == X86EMUL_PROPAGATE_FAULT) {
+				     ctxt->vcpu, &ctxt->exception);
+		if (ret != X86EMUL_CONTINUE)
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt);
 			return ret;
-		}
 	}
 
 	return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+		return emulate_gp(ctxt, 0);
 	c->eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
 	c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 {
 	struct tss_segment_32 tss_seg;
 	int ret;
-	u32 err, new_tss_base = get_desc_base(new_desc);
+	u32 new_tss_base = get_desc_base(new_desc);
 
 	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	save_state_to_tss32(ctxt, ops, &tss_seg);
 
 	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			     &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			     &ctxt->exception);
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	if (old_tss_sel != 0xffff) {
 		tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 		ret = ops->write_std(new_tss_base,
 				     &tss_seg.prev_task_link,
 				     sizeof tss_seg.prev_task_link,
-				     ctxt->vcpu, &err);
-		if (ret == X86EMUL_PROPAGATE_FAULT) {
+				     ctxt->vcpu, &ctxt->exception);
+		if (ret != X86EMUL_CONTINUE)
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt);
 			return ret;
-		}
 	}
 
 	return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	if (reason != TASK_SWITCH_IRET) {
 		if ((tss_selector & 3) > next_tss_desc.dpl ||
-		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+			return emulate_gp(ctxt, 0);
 	}
 
 	desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
2234static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, 2167static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2235 int reg, struct operand *op) 2168 int reg, struct operand *op)
2236{ 2169{
2237 struct decode_cache *c = &ctxt->decode; 2170 struct decode_cache *c = &ctxt->decode;
2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2171 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2239 2172
2240 register_address_increment(c, &c->regs[reg], df * op->bytes); 2173 register_address_increment(c, &c->regs[reg], df * op->bytes);
2241 op->addr.mem = register_address(c, base, c->regs[reg]); 2174 op->addr.mem.ea = register_address(c, c->regs[reg]);
2175 op->addr.mem.seg = seg;
2242} 2176}
2243 2177
2244static int em_push(struct x86_emulate_ctxt *ctxt) 2178static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2369 struct decode_cache *c = &ctxt->decode; 2303 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0; 2304 u64 tsc = 0;
2371 2305
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { 2306 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
2373 emulate_gp(ctxt, 0); 2307 return emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); 2308 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2309 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2310 c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2647 2579
2648 op->type = OP_IMM; 2580 op->type = OP_IMM;
2649 op->bytes = size; 2581 op->bytes = size;
2650 op->addr.mem = c->eip; 2582 op->addr.mem.ea = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */ 2583 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) { 2584 switch (op->bytes) {
2653 case 1: 2585 case 1:
@@ -2678,7 +2610,7 @@ done:
2678} 2610}
2679 2611
2680int 2612int
2681x86_decode_insn(struct x86_emulate_ctxt *ctxt) 2613x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2682{ 2614{
2683 struct x86_emulate_ops *ops = ctxt->ops; 2615 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode; 2616 struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2689 struct operand memop = { .type = OP_NONE }; 2621 struct operand memop = { .type = OP_NONE };
2690 2622
2691 c->eip = ctxt->eip; 2623 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip; 2624 c->fetch.start = c->eip;
2625 c->fetch.end = c->fetch.start + insn_len;
2626 if (insn_len > 0)
2627 memcpy(c->fetch.data, insn, insn_len);
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); 2628 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694 2629
2695 switch (mode) { 2630 switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
2803 c->execute = opcode.u.execute; 2738 c->execute = opcode.u.execute;
2804 2739
2805 /* Unrecognised? */ 2740 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) { 2741 if (c->d == 0 || (c->d & Undefined))
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1; 2742 return -1;
2809 }
2810 2743
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 2744 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8; 2745 c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
2831 if (!c->has_seg_override) 2764 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS); 2765 set_seg_override(c, VCPU_SREG_DS);
2833 2766
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) 2767 memop.addr.mem.seg = seg_override(ctxt, ops, c);
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836 2768
2837 if (memop.type == OP_MEM && c->ad_bytes != 8) 2769 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem; 2770 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
2839 2771
2840 if (memop.type == OP_MEM && c->rip_relative) 2772 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip; 2773 memop.addr.mem.ea += c->eip;
2842 2774
2843 /* 2775 /*
2844 * Decode and fetch the source operand: register, memory 2776 * Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
2890 case SrcSI: 2822 case SrcSI:
2891 c->src.type = OP_MEM; 2823 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2824 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem = 2825 c->src.addr.mem.ea =
2894 register_address(c, seg_override_base(ctxt, ops, c), 2826 register_address(c, c->regs[VCPU_REGS_RSI]);
2895 c->regs[VCPU_REGS_RSI]); 2827 c->src.addr.mem.seg = seg_override(ctxt, ops, c),
2896 c->src.val = 0; 2828 c->src.val = 0;
2897 break; 2829 break;
2898 case SrcImmFAddr: 2830 case SrcImmFAddr:
2899 c->src.type = OP_IMM; 2831 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip; 2832 c->src.addr.mem.ea = c->eip;
2901 c->src.bytes = c->op_bytes + 2; 2833 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 2834 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break; 2835 break;
@@ -2944,7 +2876,7 @@ done_prefixes:
2944 break; 2876 break;
2945 case DstImmUByte: 2877 case DstImmUByte:
2946 c->dst.type = OP_IMM; 2878 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip; 2879 c->dst.addr.mem.ea = c->eip;
2948 c->dst.bytes = 1; 2880 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip); 2881 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break; 2882 break;
@@ -2969,9 +2901,9 @@ done_prefixes:
2969 case DstDI: 2901 case DstDI:
2970 c->dst.type = OP_MEM; 2902 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2903 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem = 2904 c->dst.addr.mem.ea =
2973 register_address(c, es_base(ctxt, ops), 2905 register_address(c, c->regs[VCPU_REGS_RDI]);
2974 c->regs[VCPU_REGS_RDI]); 2906 c->dst.addr.mem.seg = VCPU_SREG_ES;
2975 c->dst.val = 0; 2907 c->dst.val = 0;
2976 break; 2908 break;
2977 case ImplicitOps: 2909 case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3020 ctxt->decode.mem_read.pos = 0; 2952 ctxt->decode.mem_read.pos = 0;
3021 2953
3022 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2954 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
3023 emulate_ud(ctxt); 2955 rc = emulate_ud(ctxt);
3024 goto done; 2956 goto done;
3025 } 2957 }
3026 2958
3027 /* LOCK prefix is allowed only with some instructions */ 2959 /* LOCK prefix is allowed only with some instructions */
3028 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2960 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
3029 emulate_ud(ctxt); 2961 rc = emulate_ud(ctxt);
3030 goto done; 2962 goto done;
3031 } 2963 }
3032 2964
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 2965 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt); 2966 rc = emulate_ud(ctxt);
3035 goto done; 2967 goto done;
3036 } 2968 }
3037 2969
3038 /* Privileged instruction can be executed only in CPL=0 */ 2970 /* Privileged instruction can be executed only in CPL=0 */
3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2971 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
3040 emulate_gp(ctxt, 0); 2972 rc = emulate_gp(ctxt, 0);
3041 goto done; 2973 goto done;
3042 } 2974 }
3043 2975
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3050 } 2982 }
3051 2983
3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 2984 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
3053 rc = read_emulated(ctxt, ops, c->src.addr.mem, 2985 rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
3054 c->src.valptr, c->src.bytes); 2986 c->src.valptr, c->src.bytes);
3055 if (rc != X86EMUL_CONTINUE) 2987 if (rc != X86EMUL_CONTINUE)
3056 goto done; 2988 goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3058 } 2990 }
3059 2991
3060 if (c->src2.type == OP_MEM) { 2992 if (c->src2.type == OP_MEM) {
3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem, 2993 rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
3062 &c->src2.val, c->src2.bytes); 2994 &c->src2.val, c->src2.bytes);
3063 if (rc != X86EMUL_CONTINUE) 2995 if (rc != X86EMUL_CONTINUE)
3064 goto done; 2996 goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3070 3002
3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3003 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
3072 /* optimisation - avoid slow emulated read if Mov */ 3004 /* optimisation - avoid slow emulated read if Mov */
3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem, 3005 rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
3074 &c->dst.val, c->dst.bytes); 3006 &c->dst.val, c->dst.bytes);
3075 if (rc != X86EMUL_CONTINUE) 3007 if (rc != X86EMUL_CONTINUE)
3076 goto done; 3008 goto done;
@@ -3215,13 +3147,13 @@ special_insn:
3215 break; 3147 break;
3216 case 0x8c: /* mov r/m, sreg */ 3148 case 0x8c: /* mov r/m, sreg */
3217 if (c->modrm_reg > VCPU_SREG_GS) { 3149 if (c->modrm_reg > VCPU_SREG_GS) {
3218 emulate_ud(ctxt); 3150 rc = emulate_ud(ctxt);
3219 goto done; 3151 goto done;
3220 } 3152 }
3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3153 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
3222 break; 3154 break;
3223 case 0x8d: /* lea r16/r32, m */ 3155 case 0x8d: /* lea r16/r32, m */
3224 c->dst.val = c->src.addr.mem; 3156 c->dst.val = c->src.addr.mem.ea;
3225 break; 3157 break;
3226 case 0x8e: { /* mov seg, r/m16 */ 3158 case 0x8e: { /* mov seg, r/m16 */
3227 uint16_t sel; 3159 uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
3230 3162
3231 if (c->modrm_reg == VCPU_SREG_CS || 3163 if (c->modrm_reg == VCPU_SREG_CS ||
3232 c->modrm_reg > VCPU_SREG_GS) { 3164 c->modrm_reg > VCPU_SREG_GS) {
3233 emulate_ud(ctxt); 3165 rc = emulate_ud(ctxt);
3234 goto done; 3166 goto done;
3235 } 3167 }
3236 3168
@@ -3268,7 +3200,6 @@ special_insn:
3268 break; 3200 break;
3269 case 0xa6 ... 0xa7: /* cmps */ 3201 case 0xa6 ... 0xa7: /* cmps */
3270 c->dst.type = OP_NONE; /* Disable writeback. */ 3202 c->dst.type = OP_NONE; /* Disable writeback. */
3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
3272 goto cmp; 3203 goto cmp;
3273 case 0xa8 ... 0xa9: /* test ax, imm */ 3204 case 0xa8 ... 0xa9: /* test ax, imm */
3274 goto test; 3205 goto test;
@@ -3363,7 +3294,7 @@ special_insn:
3363 do_io_in: 3294 do_io_in:
3364 c->dst.bytes = min(c->dst.bytes, 4u); 3295 c->dst.bytes = min(c->dst.bytes, 4u);
3365 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3296 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
3366 emulate_gp(ctxt, 0); 3297 rc = emulate_gp(ctxt, 0);
3367 goto done; 3298 goto done;
3368 } 3299 }
3369 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3300 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
3377 c->src.bytes = min(c->src.bytes, 4u); 3308 c->src.bytes = min(c->src.bytes, 4u);
3378 if (!emulator_io_permited(ctxt, ops, c->dst.val, 3309 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) { 3310 c->src.bytes)) {
3380 emulate_gp(ctxt, 0); 3311 rc = emulate_gp(ctxt, 0);
3381 goto done; 3312 goto done;
3382 } 3313 }
3383 ops->pio_out_emulated(c->src.bytes, c->dst.val, 3314 ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
3402 break; 3333 break;
3403 case 0xfa: /* cli */ 3334 case 0xfa: /* cli */
3404 if (emulator_bad_iopl(ctxt, ops)) { 3335 if (emulator_bad_iopl(ctxt, ops)) {
3405 emulate_gp(ctxt, 0); 3336 rc = emulate_gp(ctxt, 0);
3406 goto done; 3337 goto done;
3407 } else 3338 } else
3408 ctxt->eflags &= ~X86_EFLAGS_IF; 3339 ctxt->eflags &= ~X86_EFLAGS_IF;
3409 break; 3340 break;
3410 case 0xfb: /* sti */ 3341 case 0xfb: /* sti */
3411 if (emulator_bad_iopl(ctxt, ops)) { 3342 if (emulator_bad_iopl(ctxt, ops)) {
3412 emulate_gp(ctxt, 0); 3343 rc = emulate_gp(ctxt, 0);
3413 goto done; 3344 goto done;
3414 } else { 3345 } else {
3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3346 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
3449 c->dst.type = saved_dst_type; 3380 c->dst.type = saved_dst_type;
3450 3381
3451 if ((c->d & SrcMask) == SrcSI) 3382 if ((c->d & SrcMask) == SrcSI)
3452 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 3383 string_addr_inc(ctxt, seg_override(ctxt, ops, c),
3453 VCPU_REGS_RSI, &c->src); 3384 VCPU_REGS_RSI, &c->src);
3454 3385
3455 if ((c->d & DstMask) == DstDI) 3386 if ((c->d & DstMask) == DstDI)
3456 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 3387 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
3457 &c->dst); 3388 &c->dst);
3458 3389
3459 if (c->rep_prefix && (c->d & String)) { 3390 if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
3482 ctxt->eip = c->eip; 3413 ctxt->eip = c->eip;
3483 3414
3484done: 3415done:
3416 if (rc == X86EMUL_PROPAGATE_FAULT)
3417 ctxt->have_exception = true;
3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3418 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3486 3419
3487twobyte_insn: 3420twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
3544 break; 3477 break;
3545 case 5: /* not defined */ 3478 case 5: /* not defined */
3546 emulate_ud(ctxt); 3479 emulate_ud(ctxt);
3480 rc = X86EMUL_PROPAGATE_FAULT;
3547 goto done; 3481 goto done;
3548 case 7: /* invlpg*/ 3482 case 7: /* invlpg*/
3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem); 3483 emulate_invlpg(ctxt->vcpu,
3484 linear(ctxt, c->src.addr.mem));
3550 /* Disable writeback. */ 3485 /* Disable writeback. */
3551 c->dst.type = OP_NONE; 3486 c->dst.type = OP_NONE;
3552 break; 3487 break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
3573 case 5 ... 7: 3508 case 5 ... 7:
3574 case 9 ... 15: 3509 case 9 ... 15:
3575 emulate_ud(ctxt); 3510 emulate_ud(ctxt);
3511 rc = X86EMUL_PROPAGATE_FAULT;
3576 goto done; 3512 goto done;
3577 } 3513 }
3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3514 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3517 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3582 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3518 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3583 emulate_ud(ctxt); 3519 emulate_ud(ctxt);
3520 rc = X86EMUL_PROPAGATE_FAULT;
3584 goto done; 3521 goto done;
3585 } 3522 }
3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); 3523 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
3588 case 0x22: /* mov reg, cr */ 3525 case 0x22: /* mov reg, cr */
3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { 3526 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3590 emulate_gp(ctxt, 0); 3527 emulate_gp(ctxt, 0);
3528 rc = X86EMUL_PROPAGATE_FAULT;
3591 goto done; 3529 goto done;
3592 } 3530 }
3593 c->dst.type = OP_NONE; 3531 c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
3596 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3534 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3597 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3535 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3598 emulate_ud(ctxt); 3536 emulate_ud(ctxt);
3537 rc = X86EMUL_PROPAGATE_FAULT;
3599 goto done; 3538 goto done;
3600 } 3539 }
3601 3540
@@ -3604,6 +3543,7 @@ twobyte_insn:
3604 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3543 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3605 /* #UD condition is already handled by the code above */ 3544 /* #UD condition is already handled by the code above */
3606 emulate_gp(ctxt, 0); 3545 emulate_gp(ctxt, 0);
3546 rc = X86EMUL_PROPAGATE_FAULT;
3607 goto done; 3547 goto done;
3608 } 3548 }
3609 3549
@@ -3615,6 +3555,7 @@ twobyte_insn:
3615 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3555 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3616 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3556 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3617 emulate_gp(ctxt, 0); 3557 emulate_gp(ctxt, 0);
3558 rc = X86EMUL_PROPAGATE_FAULT;
3618 goto done; 3559 goto done;
3619 } 3560 }
3620 rc = X86EMUL_CONTINUE; 3561 rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
3623 /* rdmsr */ 3564 /* rdmsr */
3624 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3565 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3625 emulate_gp(ctxt, 0); 3566 emulate_gp(ctxt, 0);
3567 rc = X86EMUL_PROPAGATE_FAULT;
3626 goto done; 3568 goto done;
3627 } else { 3569 } else {
3628 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3570 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
3785 goto writeback; 3727 goto writeback;
3786 3728
3787cannot_emulate: 3729cannot_emulate:
3788 DPRINTF("Cannot emulate %02x\n", c->b);
3789 return -1; 3730 return -1;
3790} 3731}
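
The emulate.c changes above stop storing a precomputed linear address in each memory operand: op->addr.mem becomes an (ea, seg) pair, linear() flattens it only at access time, and the emulate_gp()/emulate_ud() call sites now return the fault code through rc instead of open-coding X86EMUL_PROPAGATE_FAULT. A minimal standalone sketch of the late-linearization idea; the struct and helpers below are illustrative stand-ins, not the kernel's:

#include <stdint.h>

/* Segmented operand address, kept unflattened until the access happens. */
struct segmented_address {
	uint64_t ea;    /* effective address within the segment */
	unsigned seg;   /* segment register index (e.g. ES for string stores) */
};

/* Hypothetical segment-base lookup; the real emulator asks the vcpu. */
static uint64_t seg_base(unsigned seg)
{
	(void)seg;
	return 0;       /* flat segments for the demo */
}

/* Flatten only when the memory access is actually made. */
static uint64_t linear_addr(struct segmented_address addr)
{
	return seg_base(addr.seg) + addr.ea;
}

int main(void)
{
	struct segmented_address src = { .ea = 0x1000, .seg = 3 };
	return linear_addr(src) == 0x1000 ? 0 : 1;
}

Carrying the segment alongside the offset is what lets string_addr_inc() bump only the effective address each iteration while the operand's segment identity is preserved.
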
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index f628234fbeca..3cece05e4ac4 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
575 s->pics[1].elcr_mask = 0xde; 575 s->pics[1].elcr_mask = 0xde;
576 s->pics[0].pics_state = s; 576 s->pics[0].pics_state = s;
577 s->pics[1].pics_state = s; 577 s->pics[1].pics_state = s;
578 s->pics[0].isr_ack = 0xff;
579 s->pics[1].isr_ack = 0xff;
578 580
579 /* 581 /*
580 * Initialize PIO device 582 * Initialize PIO device
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
73 return vcpu->arch.cr4 & mask; 73 return vcpu->arch.cr4 & mask;
74} 74}
75 75
76static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
77{
78 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
79 kvm_x86_ops->decache_cr3(vcpu);
80 return vcpu->arch.cr3;
81}
82
76static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 83static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
77{ 84{
78 return kvm_read_cr4_bits(vcpu, ~0UL); 85 return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
84 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 91 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
85} 92}
86 93
94static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
95{
96 vcpu->arch.hflags |= HF_GUEST_MASK;
97}
98
99static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
100{
101 vcpu->arch.hflags &= ~HF_GUEST_MASK;
102}
103
104static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
105{
106 return vcpu->arch.hflags & HF_GUEST_MASK;
107}
108
87#endif 109#endif
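
kvm_cache_regs.h gains a lazy CR3 accessor plus the enter_guest_mode()/leave_guest_mode()/is_guest_mode() helpers over HF_GUEST_MASK. The accessor follows the existing register-cache pattern: a bit in regs_avail says whether the cached value is current, and a backend hook refreshes it on demand. A standalone sketch of that pattern (all names below are illustrative):

struct demo_vcpu {
	unsigned long regs_avail;                 /* bitmap of up-to-date cached registers */
	unsigned long cr3;                        /* cached value */
	void (*decache_cr3)(struct demo_vcpu *);  /* backend refresh hook */
};

#define DEMO_REG_CR3_BIT 0

static unsigned long demo_read_cr3(struct demo_vcpu *v)
{
	if (!(v->regs_avail & (1UL << DEMO_REG_CR3_BIT)))
		v->decache_cr3(v);  /* expected to update cr3 and set the bit */
	return v->cr3;
}

static void demo_decache_cr3(struct demo_vcpu *v)
{
	v->cr3 = 0x1000;        /* pretend the backend just fetched it */
	v->regs_avail |= 1UL << DEMO_REG_CR3_BIT;
}

int main(void)
{
	struct demo_vcpu v = { .decache_cr3 = demo_decache_cr3 };
	return demo_read_cr3(&v) == 0x1000 ? 0 : 1;
}

The mmu.c hunks below convert the remaining direct vcpu->arch.cr3 reads (paging_new_cr3(), get_cr3(), kvm_pv_mmu_flush_tlb()) to kvm_read_cr3().
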
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
277 277
278 if (old_ppr != ppr) { 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr); 279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 280 if (ppr < old_ppr)
281 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 } 282 }
282} 283}
283 284
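
The lapic.c change only raises KVM_REQ_EVENT when the new PPR is lower than the old one, which is enough because a pending IRR vector is deliverable only if its priority class is above PPR, so raising PPR can never unblock anything. A simplified form of the deliverability test (illustrative, class comparison only):

#include <stdbool.h>

/* An interrupt with vector v is deliverable only if its priority class
 * (the high nibble of the vector) is above the processor priority. */
static bool demo_deliverable(unsigned int highest_irr_vector, unsigned int ppr)
{
	return (highest_irr_vector & 0xF0) > ppr;
}
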
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb8b376bf28c..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
194 196
195static u64 __read_mostly shadow_trap_nonpresent_pte; 197static u64 __read_mostly shadow_trap_nonpresent_pte;
196static u64 __read_mostly shadow_notrap_nonpresent_pte; 198static u64 __read_mostly shadow_notrap_nonpresent_pte;
197static u64 __read_mostly shadow_base_present_pte;
198static u64 __read_mostly shadow_nx_mask; 199static u64 __read_mostly shadow_nx_mask;
199static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 200static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
200static u64 __read_mostly shadow_user_mask; 201static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
213} 214}
214EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 215EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
215 216
216void kvm_mmu_set_base_ptes(u64 base_pte)
217{
218 shadow_base_present_pte = base_pte;
219}
220EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
221
222void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 217void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223 u64 dirty_mask, u64 nx_mask, u64 x_mask) 218 u64 dirty_mask, u64 nx_mask, u64 x_mask)
224{ 219{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
482} 477}
483 478
484/* 479/*
485 * Return the pointer to the largepage write count for a given 480 * Return the pointer to the large page information for a given gfn,
486 * gfn, handling slots that are not large page aligned. 481 * handling slots that are not large page aligned.
487 */ 482 */
488static int *slot_largepage_idx(gfn_t gfn, 483static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
489 struct kvm_memory_slot *slot, 484 struct kvm_memory_slot *slot,
490 int level) 485 int level)
491{ 486{
492 unsigned long idx; 487 unsigned long idx;
493 488
494 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 489 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
495 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 490 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
496 return &slot->lpage_info[level - 2][idx].write_count; 491 return &slot->lpage_info[level - 2][idx];
497} 492}
498 493
499static void account_shadowed(struct kvm *kvm, gfn_t gfn) 494static void account_shadowed(struct kvm *kvm, gfn_t gfn)
500{ 495{
501 struct kvm_memory_slot *slot; 496 struct kvm_memory_slot *slot;
502 int *write_count; 497 struct kvm_lpage_info *linfo;
503 int i; 498 int i;
504 499
505 slot = gfn_to_memslot(kvm, gfn); 500 slot = gfn_to_memslot(kvm, gfn);
506 for (i = PT_DIRECTORY_LEVEL; 501 for (i = PT_DIRECTORY_LEVEL;
507 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 502 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
508 write_count = slot_largepage_idx(gfn, slot, i); 503 linfo = lpage_info_slot(gfn, slot, i);
509 *write_count += 1; 504 linfo->write_count += 1;
510 } 505 }
511} 506}
512 507
513static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 508static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
514{ 509{
515 struct kvm_memory_slot *slot; 510 struct kvm_memory_slot *slot;
516 int *write_count; 511 struct kvm_lpage_info *linfo;
517 int i; 512 int i;
518 513
519 slot = gfn_to_memslot(kvm, gfn); 514 slot = gfn_to_memslot(kvm, gfn);
520 for (i = PT_DIRECTORY_LEVEL; 515 for (i = PT_DIRECTORY_LEVEL;
521 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 516 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
522 write_count = slot_largepage_idx(gfn, slot, i); 517 linfo = lpage_info_slot(gfn, slot, i);
523 *write_count -= 1; 518 linfo->write_count -= 1;
524 WARN_ON(*write_count < 0); 519 WARN_ON(linfo->write_count < 0);
525 } 520 }
526} 521}
527 522
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
530 int level) 525 int level)
531{ 526{
532 struct kvm_memory_slot *slot; 527 struct kvm_memory_slot *slot;
533 int *largepage_idx; 528 struct kvm_lpage_info *linfo;
534 529
535 slot = gfn_to_memslot(kvm, gfn); 530 slot = gfn_to_memslot(kvm, gfn);
536 if (slot) { 531 if (slot) {
537 largepage_idx = slot_largepage_idx(gfn, slot, level); 532 linfo = lpage_info_slot(gfn, slot, level);
538 return *largepage_idx; 533 return linfo->write_count;
539 } 534 }
540 535
541 return 1; 536 return 1;
@@ -559,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
559 return ret; 554 return ret;
560} 555}
561 556
562static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
563{ 558{
564 struct kvm_memory_slot *slot; 559 struct kvm_memory_slot *slot;
565 int host_level, level, max_level;
566
567 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 560 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
568 if (slot && slot->dirty_bitmap) 561 if (slot && slot->dirty_bitmap)
569 return PT_PAGE_TABLE_LEVEL; 562 return true;
563 return false;
564}
565
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 int host_level, level, max_level;
570 569
571 host_level = host_mapping_level(vcpu->kvm, large_gfn); 570 host_level = host_mapping_level(vcpu->kvm, large_gfn);
572 571
@@ -590,16 +589,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
590static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 589static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
591{ 590{
592 struct kvm_memory_slot *slot; 591 struct kvm_memory_slot *slot;
593 unsigned long idx; 592 struct kvm_lpage_info *linfo;
594 593
595 slot = gfn_to_memslot(kvm, gfn); 594 slot = gfn_to_memslot(kvm, gfn);
596 if (likely(level == PT_PAGE_TABLE_LEVEL)) 595 if (likely(level == PT_PAGE_TABLE_LEVEL))
597 return &slot->rmap[gfn - slot->base_gfn]; 596 return &slot->rmap[gfn - slot->base_gfn];
598 597
599 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 598 linfo = lpage_info_slot(gfn, slot, level);
600 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
601 599
602 return &slot->lpage_info[level - 2][idx].rmap_pde; 600 return &linfo->rmap_pde;
603} 601}
604 602
605/* 603/*
@@ -887,19 +885,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
887 end = start + (memslot->npages << PAGE_SHIFT); 885 end = start + (memslot->npages << PAGE_SHIFT);
888 if (hva >= start && hva < end) { 886 if (hva >= start && hva < end) {
889 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 887 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
888 gfn_t gfn = memslot->base_gfn + gfn_offset;
890 889
891 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 890 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
892 891
893 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 892 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
894 unsigned long idx; 893 struct kvm_lpage_info *linfo;
895 int sh; 894
896 895 linfo = lpage_info_slot(gfn, memslot,
897 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 896 PT_DIRECTORY_LEVEL + j);
898 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 897 ret |= handler(kvm, &linfo->rmap_pde, data);
899 (memslot->base_gfn >> sh);
900 ret |= handler(kvm,
901 &memslot->lpage_info[j][idx].rmap_pde,
902 data);
903 } 898 }
904 trace_kvm_age_page(hva, memslot, ret); 899 trace_kvm_age_page(hva, memslot, ret);
905 retval |= ret; 900 retval |= ret;
@@ -950,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
950 return young; 945 return young;
951} 946}
952 947
948static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949 unsigned long data)
950{
951 u64 *spte;
952 int young = 0;
953
954 /*
955 * If there's no access bit in the secondary pte set by the
956 * hardware it's up to gup-fast/gup to set the access bit in
957 * the primary pte or in the page structure.
958 */
959 if (!shadow_accessed_mask)
960 goto out;
961
962 spte = rmap_next(kvm, rmapp, NULL);
963 while (spte) {
964 u64 _spte = *spte;
965 BUG_ON(!(_spte & PT_PRESENT_MASK));
966 young = _spte & PT_ACCESSED_MASK;
967 if (young) {
968 young = 1;
969 break;
970 }
971 spte = rmap_next(kvm, rmapp, spte);
972 }
973out:
974 return young;
975}
976
953#define RMAP_RECYCLE_THRESHOLD 1000 977#define RMAP_RECYCLE_THRESHOLD 1000
954 978
955static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 979static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -970,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
970 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
971} 995}
972 996
997int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998{
999 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000}
1001
973#ifdef MMU_DEBUG 1002#ifdef MMU_DEBUG
974static int is_empty_shadow_page(u64 *spt) 1003static int is_empty_shadow_page(u64 *spt)
975{ 1004{
@@ -1161,7 +1190,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1161} 1190}
1162 1191
1163static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1192static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1164 struct kvm_mmu_page *sp, bool clear_unsync) 1193 struct kvm_mmu_page *sp)
1165{ 1194{
1166 return 1; 1195 return 1;
1167} 1196}
@@ -1291,7 +1320,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1291 if (clear_unsync) 1320 if (clear_unsync)
1292 kvm_unlink_unsync_page(vcpu->kvm, sp); 1321 kvm_unlink_unsync_page(vcpu->kvm, sp);
1293 1322
1294 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1323 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1295 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1324 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1296 return 1; 1325 return 1;
1297 } 1326 }
@@ -1332,12 +1361,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1332 continue; 1361 continue;
1333 1362
1334 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1363 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1364 kvm_unlink_unsync_page(vcpu->kvm, s);
1335 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1365 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1336 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1366 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1337 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1367 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1338 continue; 1368 continue;
1339 } 1369 }
1340 kvm_unlink_unsync_page(vcpu->kvm, s);
1341 flush = true; 1370 flush = true;
1342 } 1371 }
1343 1372
@@ -1963,9 +1992,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1963 unsigned pte_access, int user_fault, 1992 unsigned pte_access, int user_fault,
1964 int write_fault, int dirty, int level, 1993 int write_fault, int dirty, int level,
1965 gfn_t gfn, pfn_t pfn, bool speculative, 1994 gfn_t gfn, pfn_t pfn, bool speculative,
1966 bool can_unsync, bool reset_host_protection) 1995 bool can_unsync, bool host_writable)
1967{ 1996{
1968 u64 spte; 1997 u64 spte, entry = *sptep;
1969 int ret = 0; 1998 int ret = 0;
1970 1999
1971 /* 2000 /*
@@ -1973,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 * whether the guest actually used the pte (in order to detect 2002 * whether the guest actually used the pte (in order to detect
1974 * demand paging). 2003 * demand paging).
1975 */ 2004 */
1976 spte = shadow_base_present_pte; 2005 spte = PT_PRESENT_MASK;
1977 if (!speculative) 2006 if (!speculative)
1978 spte |= shadow_accessed_mask; 2007 spte |= shadow_accessed_mask;
1979 if (!dirty) 2008 if (!dirty)
@@ -1990,8 +2019,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1990 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2019 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1991 kvm_is_mmio_pfn(pfn)); 2020 kvm_is_mmio_pfn(pfn));
1992 2021
1993 if (reset_host_protection) 2022 if (host_writable)
1994 spte |= SPTE_HOST_WRITEABLE; 2023 spte |= SPTE_HOST_WRITEABLE;
2024 else
2025 pte_access &= ~ACC_WRITE_MASK;
1995 2026
1996 spte |= (u64)pfn << PAGE_SHIFT; 2027 spte |= (u64)pfn << PAGE_SHIFT;
1997 2028
@@ -2036,6 +2067,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2036 2067
2037set_pte: 2068set_pte:
2038 update_spte(sptep, spte); 2069 update_spte(sptep, spte);
2070 /*
2071 * If we overwrite a writable spte with a read-only one we
2072 * should flush remote TLBs. Otherwise rmap_write_protect
2073 * will find a read-only spte, even though the writable spte
2074 * might be cached on a CPU's TLB.
2075 */
2076 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2077 kvm_flush_remote_tlbs(vcpu->kvm);
2039done: 2078done:
2040 return ret; 2079 return ret;
2041} 2080}
@@ -2045,7 +2084,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 int user_fault, int write_fault, int dirty, 2084 int user_fault, int write_fault, int dirty,
2046 int *ptwrite, int level, gfn_t gfn, 2085 int *ptwrite, int level, gfn_t gfn,
2047 pfn_t pfn, bool speculative, 2086 pfn_t pfn, bool speculative,
2048 bool reset_host_protection) 2087 bool host_writable)
2049{ 2088{
2050 int was_rmapped = 0; 2089 int was_rmapped = 0;
2051 int rmap_count; 2090 int rmap_count;
@@ -2080,7 +2119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2080 2119
2081 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2120 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2082 dirty, level, gfn, pfn, speculative, true, 2121 dirty, level, gfn, pfn, speculative, true,
2083 reset_host_protection)) { 2122 host_writable)) {
2084 if (write_fault) 2123 if (write_fault)
2085 *ptwrite = 1; 2124 *ptwrite = 1;
2086 kvm_mmu_flush_tlb(vcpu); 2125 kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2250,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2211} 2250}
2212 2251
2213static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2252static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2214 int level, gfn_t gfn, pfn_t pfn) 2253 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2254 bool prefault)
2215{ 2255{
2216 struct kvm_shadow_walk_iterator iterator; 2256 struct kvm_shadow_walk_iterator iterator;
2217 struct kvm_mmu_page *sp; 2257 struct kvm_mmu_page *sp;
@@ -2220,9 +2260,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2220 2260
2221 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2261 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2222 if (iterator.level == level) { 2262 if (iterator.level == level) {
2223 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2263 unsigned pte_access = ACC_ALL;
2264
2265 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2224 0, write, 1, &pt_write, 2266 0, write, 1, &pt_write,
2225 level, gfn, pfn, false, true); 2267 level, gfn, pfn, prefault, map_writable);
2226 direct_pte_prefetch(vcpu, iterator.sptep); 2268 direct_pte_prefetch(vcpu, iterator.sptep);
2227 ++vcpu->stat.pf_fixed; 2269 ++vcpu->stat.pf_fixed;
2228 break; 2270 break;
@@ -2277,27 +2319,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2277 return 1; 2319 return 1;
2278} 2320}
2279 2321
2280static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2322static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2323 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2324{
2325 pfn_t pfn = *pfnp;
2326 gfn_t gfn = *gfnp;
2327 int level = *levelp;
2328
2329 /*
2330 * Check if it's a transparent hugepage. If this would be an
2331 * hugetlbfs page, level wouldn't be set to
2332 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2333 * here.
2334 */
2335 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2336 level == PT_PAGE_TABLE_LEVEL &&
2337 PageTransCompound(pfn_to_page(pfn)) &&
2338 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2339 unsigned long mask;
2340 /*
2341 * mmu_notifier_retry was successful and we hold the
2342 * mmu_lock here, so the pmd can't become splitting
2343 * from under us, and in turn
2344 * __split_huge_page_refcount() can't run from under
2345 * us and we can safely transfer the refcount from
2346 * PG_tail to PG_head as we switch the pfn to tail to
2347 * head.
2348 */
2349 *levelp = level = PT_DIRECTORY_LEVEL;
2350 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2351 VM_BUG_ON((gfn & mask) != (pfn & mask));
2352 if (pfn & mask) {
2353 gfn &= ~mask;
2354 *gfnp = gfn;
2355 kvm_release_pfn_clean(pfn);
2356 pfn &= ~mask;
2357 if (!get_page_unless_zero(pfn_to_page(pfn)))
2358 BUG();
2359 *pfnp = pfn;
2360 }
2361 }
2362}
2363
2364static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2365 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2366
2367static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2368 bool prefault)
2281{ 2369{
2282 int r; 2370 int r;
2283 int level; 2371 int level;
2372 int force_pt_level;
2284 pfn_t pfn; 2373 pfn_t pfn;
2285 unsigned long mmu_seq; 2374 unsigned long mmu_seq;
2375 bool map_writable;
2286 2376
2287 level = mapping_level(vcpu, gfn); 2377 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2288 2378 if (likely(!force_pt_level)) {
2289 /* 2379 level = mapping_level(vcpu, gfn);
2290 * This path builds a PAE pagetable - so we can map 2mb pages at 2380 /*
2291 * maximum. Therefore check if the level is larger than that. 2381 * This path builds a PAE pagetable - so we can map
2292 */ 2382 * 2mb pages at maximum. Therefore check if the level
2293 if (level > PT_DIRECTORY_LEVEL) 2383 * is larger than that.
2294 level = PT_DIRECTORY_LEVEL; 2384 */
2385 if (level > PT_DIRECTORY_LEVEL)
2386 level = PT_DIRECTORY_LEVEL;
2295 2387
2296 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2388 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2389 } else
2390 level = PT_PAGE_TABLE_LEVEL;
2297 2391
2298 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2392 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2299 smp_rmb(); 2393 smp_rmb();
2300 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2394
2395 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2396 return 0;
2301 2397
2302 /* mmio */ 2398 /* mmio */
2303 if (is_error_pfn(pfn)) 2399 if (is_error_pfn(pfn))
@@ -2307,7 +2403,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2307 if (mmu_notifier_retry(vcpu, mmu_seq)) 2403 if (mmu_notifier_retry(vcpu, mmu_seq))
2308 goto out_unlock; 2404 goto out_unlock;
2309 kvm_mmu_free_some_pages(vcpu); 2405 kvm_mmu_free_some_pages(vcpu);
2310 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2406 if (likely(!force_pt_level))
2407 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2408 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2409 prefault);
2311 spin_unlock(&vcpu->kvm->mmu_lock); 2410 spin_unlock(&vcpu->kvm->mmu_lock);
2312 2411
2313 2412
@@ -2394,7 +2493,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2394 ASSERT(!VALID_PAGE(root)); 2493 ASSERT(!VALID_PAGE(root));
2395 spin_lock(&vcpu->kvm->mmu_lock); 2494 spin_lock(&vcpu->kvm->mmu_lock);
2396 kvm_mmu_free_some_pages(vcpu); 2495 kvm_mmu_free_some_pages(vcpu);
2397 sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, 2496 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2497 i << 30,
2398 PT32_ROOT_LEVEL, 1, ACC_ALL, 2498 PT32_ROOT_LEVEL, 1, ACC_ALL,
2399 NULL); 2499 NULL);
2400 root = __pa(sp->spt); 2500 root = __pa(sp->spt);
@@ -2529,6 +2629,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2529 hpa_t root = vcpu->arch.mmu.root_hpa; 2629 hpa_t root = vcpu->arch.mmu.root_hpa;
2530 sp = page_header(root); 2630 sp = page_header(root);
2531 mmu_sync_children(vcpu, sp); 2631 mmu_sync_children(vcpu, sp);
2632 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2532 return; 2633 return;
2533 } 2634 }
2534 for (i = 0; i < 4; ++i) { 2635 for (i = 0; i < 4; ++i) {
@@ -2551,23 +2652,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2551} 2652}
2552 2653
2553static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2654static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2554 u32 access, u32 *error) 2655 u32 access, struct x86_exception *exception)
2555{ 2656{
2556 if (error) 2657 if (exception)
2557 *error = 0; 2658 exception->error_code = 0;
2558 return vaddr; 2659 return vaddr;
2559} 2660}
2560 2661
2561static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 2662static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2562 u32 access, u32 *error) 2663 u32 access,
2664 struct x86_exception *exception)
2563{ 2665{
2564 if (error) 2666 if (exception)
2565 *error = 0; 2667 exception->error_code = 0;
2566 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2668 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2567} 2669}
2568 2670
2569static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2671static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2570 u32 error_code) 2672 u32 error_code, bool prefault)
2571{ 2673{
2572 gfn_t gfn; 2674 gfn_t gfn;
2573 int r; 2675 int r;
@@ -2583,17 +2685,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2583 gfn = gva >> PAGE_SHIFT; 2685 gfn = gva >> PAGE_SHIFT;
2584 2686
2585 return nonpaging_map(vcpu, gva & PAGE_MASK, 2687 return nonpaging_map(vcpu, gva & PAGE_MASK,
2586 error_code & PFERR_WRITE_MASK, gfn); 2688 error_code & PFERR_WRITE_MASK, gfn, prefault);
2689}
2690
2691static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2692{
2693 struct kvm_arch_async_pf arch;
2694
2695 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2696 arch.gfn = gfn;
2697 arch.direct_map = vcpu->arch.mmu.direct_map;
2698 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2699
2700 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2701}
2702
2703static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2704{
2705 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2706 kvm_event_needs_reinjection(vcpu)))
2707 return false;
2708
2709 return kvm_x86_ops->interrupt_allowed(vcpu);
2587} 2710}
2588 2711
2589static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2712static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2590 u32 error_code) 2713 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2714{
2715 bool async;
2716
2717 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2718
2719 if (!async)
2720 return false; /* *pfn has correct page already */
2721
2722 put_page(pfn_to_page(*pfn));
2723
2724 if (!prefault && can_do_async_pf(vcpu)) {
2725 trace_kvm_try_async_get_page(gva, gfn);
2726 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2727 trace_kvm_async_pf_doublefault(gva, gfn);
2728 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2729 return true;
2730 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2731 return true;
2732 }
2733
2734 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2735
2736 return false;
2737}
2738
2739static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2740 bool prefault)
2591{ 2741{
2592 pfn_t pfn; 2742 pfn_t pfn;
2593 int r; 2743 int r;
2594 int level; 2744 int level;
2745 int force_pt_level;
2595 gfn_t gfn = gpa >> PAGE_SHIFT; 2746 gfn_t gfn = gpa >> PAGE_SHIFT;
2596 unsigned long mmu_seq; 2747 unsigned long mmu_seq;
2748 int write = error_code & PFERR_WRITE_MASK;
2749 bool map_writable;
2597 2750
2598 ASSERT(vcpu); 2751 ASSERT(vcpu);
2599 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2752 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2602,21 +2755,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2602 if (r) 2755 if (r)
2603 return r; 2756 return r;
2604 2757
2605 level = mapping_level(vcpu, gfn); 2758 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2606 2759 if (likely(!force_pt_level)) {
2607 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2760 level = mapping_level(vcpu, gfn);
2761 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2762 } else
2763 level = PT_PAGE_TABLE_LEVEL;
2608 2764
2609 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2765 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2610 smp_rmb(); 2766 smp_rmb();
2611 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2767
2768 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2769 return 0;
2770
2771 /* mmio */
2612 if (is_error_pfn(pfn)) 2772 if (is_error_pfn(pfn))
2613 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2773 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2614 spin_lock(&vcpu->kvm->mmu_lock); 2774 spin_lock(&vcpu->kvm->mmu_lock);
2615 if (mmu_notifier_retry(vcpu, mmu_seq)) 2775 if (mmu_notifier_retry(vcpu, mmu_seq))
2616 goto out_unlock; 2776 goto out_unlock;
2617 kvm_mmu_free_some_pages(vcpu); 2777 kvm_mmu_free_some_pages(vcpu);
2618 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2778 if (likely(!force_pt_level))
2619 level, gfn, pfn); 2779 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2780 r = __direct_map(vcpu, gpa, write, map_writable,
2781 level, gfn, pfn, prefault);
2620 spin_unlock(&vcpu->kvm->mmu_lock); 2782 spin_unlock(&vcpu->kvm->mmu_lock);
2621 2783
2622 return r; 2784 return r;
@@ -2658,18 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2658 2820
2659static void paging_new_cr3(struct kvm_vcpu *vcpu) 2821static void paging_new_cr3(struct kvm_vcpu *vcpu)
2660{ 2822{
2661 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2823 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2662 mmu_free_roots(vcpu); 2824 mmu_free_roots(vcpu);
2663} 2825}
2664 2826
2665static unsigned long get_cr3(struct kvm_vcpu *vcpu) 2827static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2666{ 2828{
2667 return vcpu->arch.cr3; 2829 return kvm_read_cr3(vcpu);
2668} 2830}
2669 2831
2670static void inject_page_fault(struct kvm_vcpu *vcpu) 2832static void inject_page_fault(struct kvm_vcpu *vcpu,
2833 struct x86_exception *fault)
2671{ 2834{
2672 vcpu->arch.mmu.inject_page_fault(vcpu); 2835 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2673} 2836}
2674 2837
2675static void paging_free(struct kvm_vcpu *vcpu) 2838static void paging_free(struct kvm_vcpu *vcpu)
@@ -2815,6 +2978,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2815{ 2978{
2816 struct kvm_mmu *context = vcpu->arch.walk_mmu; 2979 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2817 2980
2981 context->base_role.word = 0;
2818 context->new_cr3 = nonpaging_new_cr3; 2982 context->new_cr3 = nonpaging_new_cr3;
2819 context->page_fault = tdp_page_fault; 2983 context->page_fault = tdp_page_fault;
2820 context->free = nonpaging_free; 2984 context->free = nonpaging_free;
@@ -3007,9 +3171,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3007 return; 3171 return;
3008 } 3172 }
3009 3173
3010 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
3011 return;
3012
3013 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
3014 if (!sp->role.cr4_pae) 3175 if (!sp->role.cr4_pae)
3015 paging32_update_pte(vcpu, sp, spte, new); 3176 paging32_update_pte(vcpu, sp, spte, new);
@@ -3263,12 +3424,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3263 } 3424 }
3264} 3425}
3265 3426
3266int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3427int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3428 void *insn, int insn_len)
3267{ 3429{
3268 int r; 3430 int r;
3269 enum emulation_result er; 3431 enum emulation_result er;
3270 3432
3271 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3433 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
3272 if (r < 0) 3434 if (r < 0)
3273 goto out; 3435 goto out;
3274 3436
@@ -3281,7 +3443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
3281 if (r) 3443 if (r)
3282 goto out; 3444 goto out;
3283 3445
3284 er = emulate_instruction(vcpu, cr2, error_code, 0); 3446 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
3285 3447
3286 switch (er) { 3448 switch (er) {
3287 case EMULATE_DONE: 3449 case EMULATE_DONE:
@@ -3376,11 +3538,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3376 if (!test_bit(slot, sp->slot_bitmap)) 3538 if (!test_bit(slot, sp->slot_bitmap))
3377 continue; 3539 continue;
3378 3540
3541 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3542 continue;
3543
3379 pt = sp->spt; 3544 pt = sp->spt;
3380 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3545 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
3381 /* avoid RMW */ 3546 /* avoid RMW */
3382 if (is_writable_pte(pt[i])) 3547 if (is_writable_pte(pt[i]))
3383 pt[i] &= ~PT_WRITABLE_MASK; 3548 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3384 } 3549 }
3385 kvm_flush_remote_tlbs(kvm); 3550 kvm_flush_remote_tlbs(kvm);
3386} 3551}
@@ -3462,13 +3627,6 @@ static void mmu_destroy_caches(void)
3462 kmem_cache_destroy(mmu_page_header_cache); 3627 kmem_cache_destroy(mmu_page_header_cache);
3463} 3628}
3464 3629
3465void kvm_mmu_module_exit(void)
3466{
3467 mmu_destroy_caches();
3468 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3469 unregister_shrinker(&mmu_shrinker);
3470}
3471
3472int kvm_mmu_module_init(void) 3630int kvm_mmu_module_init(void)
3473{ 3631{
3474 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3632 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3565,7 +3723,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3565 3723
3566static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3724static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3567{ 3725{
3568 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3726 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3569 return 1; 3727 return 1;
3570} 3728}
3571 3729
@@ -3661,12 +3819,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3661} 3819}
3662EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3820EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3663 3821
3664#ifdef CONFIG_KVM_MMU_AUDIT
3665#include "mmu_audit.c"
3666#else
3667static void mmu_audit_disable(void) { }
3668#endif
3669
3670void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 3822void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3671{ 3823{
3672 ASSERT(vcpu); 3824 ASSERT(vcpu);
@@ -3674,5 +3826,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3674 destroy_kvm_mmu(vcpu); 3826 destroy_kvm_mmu(vcpu);
3675 free_mmu_pages(vcpu); 3827 free_mmu_pages(vcpu);
3676 mmu_free_memory_caches(vcpu); 3828 mmu_free_memory_caches(vcpu);
3829}
3830
3831#ifdef CONFIG_KVM_MMU_AUDIT
3832#include "mmu_audit.c"
3833#else
3834static void mmu_audit_disable(void) { }
3835#endif
3836
3837void kvm_mmu_module_exit(void)
3838{
3839 mmu_destroy_caches();
3840 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3841 unregister_shrinker(&mmu_shrinker);
3677 mmu_audit_disable(); 3842 mmu_audit_disable();
3678} 3843}
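
Among the mmu.c changes, transparent_hugepage_adjust() promotes a 4K mapping to a 2MB one by rounding both gfn and pfn down to the head of the compound page; the VM_BUG_ON in that hunk checks that both have the same offset within the huge frame. A small worked example of the mask arithmetic (the values are made up):

#include <stdio.h>

int main(void)
{
	unsigned long pages_per_hpage = 512;          /* 2MB / 4KB small pages */
	unsigned long mask = pages_per_hpage - 1;
	unsigned long gfn = 0x12345, pfn = 0xabd45;   /* both end in offset 0x145 */

	if ((gfn & mask) != (pfn & mask))
		return 1;                             /* would trip the VM_BUG_ON */

	/* head of the 2MB frame on both the guest and host side */
	printf("head gfn %#lx, head pfn %#lx\n", gfn & ~mask, pfn & ~mask);
	return 0;
}
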
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
19 19
20#include <linux/ratelimit.h> 20#include <linux/ratelimit.h>
21 21
22static int audit_point; 22#define audit_printk(kvm, fmt, args...) \
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \ 23 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args) 24 fmt, audit_point_name[kvm->arch.audit_point], ##args)
27 25
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); 26typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29 27
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
97 95
98 if (sp->unsync) { 96 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) { 97 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level); 98 audit_printk(vcpu->kvm, "unsync sp: %p "
99 "level = %d\n", sp, level);
101 return; 100 return;
102 } 101 }
103 102
104 if (*sptep == shadow_notrap_nonpresent_pte) { 103 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp); 104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return; 106 return;
107 } 107 }
108 } 108 }
109 109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { 110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp); 111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
112 return; 113 return;
113 } 114 }
114 115
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
125 126
126 hpa = pfn << PAGE_SHIFT; 127 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 128 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn", 129 audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep); 130 "ent %llxn", vcpu->arch.mmu.root_level, pfn,
131 hpa, *sptep);
130} 132}
131 133
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 134static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
142 if (!gfn_to_memslot(kvm, gfn)) { 144 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit()) 145 if (!printk_ratelimit())
144 return; 146 return;
145 audit_printk("no memslot for gfn %llx\n", gfn); 147 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n", 148 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn); 149 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack(); 150 dump_stack();
149 return; 151 return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
153 if (!*rmapp) { 155 if (!*rmapp) {
154 if (!printk_ratelimit()) 156 if (!printk_ratelimit())
155 return; 157 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep); 158 audit_printk(kvm, "no rmap for writable spte %llx\n",
159 *sptep);
157 dump_stack(); 160 dump_stack();
158 } 161 }
159} 162}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{ 171{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 172 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170 173
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync) 174 if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp); 175 audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
176 "root.\n", sp);
173} 177}
174 178
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) 179static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
202 spte = rmap_next(kvm, rmapp, NULL); 206 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) { 207 while (spte) {
204 if (is_writable_pte(*spte)) 208 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn " 209 audit_printk(kvm, "shadow page has writable "
206 "%llx role %x\n", sp->gfn, sp->role.word); 210 "mappings: gfn %llx role %x\n",
211 sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte); 212 spte = rmap_next(kvm, rmapp, spte);
208 } 213 }
209} 214}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
238 if (!__ratelimit(&ratelimit_state)) 243 if (!__ratelimit(&ratelimit_state))
239 return; 244 return;
240 245
241 audit_point = point; 246 vcpu->kvm->arch.audit_point = point;
242 audit_all_active_sps(vcpu->kvm); 247 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu); 248 audit_vcpu_spte(vcpu);
244} 249}
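/*
 * Minimal sketch (not part of the patch above): the audit point now lives in
 * kvm->arch.audit_point instead of a file-local static, so two VMs being
 * audited concurrently each report their own phase.  AUDIT_PRE_PAGE_FAULT,
 * AUDIT_POST_SYNC and audit_printk() are taken from the surrounding mmu code;
 * the function below is purely illustrative.
 */
static void example_concurrent_audit(struct kvm *kvm_a, struct kvm *kvm_b)
{
	kvm_a->arch.audit_point = AUDIT_PRE_PAGE_FAULT;
	kvm_b->arch.audit_point = AUDIT_POST_SYNC;

	/* Each call expands against its own VM's audit point name. */
	audit_printk(kvm_a, "no rmap for writable spte %llx\n", 0ull);
	audit_printk(kvm_b, "meet unsync sp(%p) after sync root.\n", NULL);
}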
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
72 unsigned pt_access; 72 unsigned pt_access;
73 unsigned pte_access; 73 unsigned pte_access;
74 gfn_t gfn; 74 gfn_t gfn;
75 u32 error_code; 75 struct x86_exception fault;
76}; 76};
77 77
78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 78static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
266 return 1; 266 return 1;
267 267
268error: 268error:
269 walker->error_code = 0; 269 walker->fault.vector = PF_VECTOR;
270 walker->fault.error_code_valid = true;
271 walker->fault.error_code = 0;
270 if (present) 272 if (present)
271 walker->error_code |= PFERR_PRESENT_MASK; 273 walker->fault.error_code |= PFERR_PRESENT_MASK;
272 274
273 walker->error_code |= write_fault | user_fault; 275 walker->fault.error_code |= write_fault | user_fault;
274 276
275 if (fetch_fault && mmu->nx) 277 if (fetch_fault && mmu->nx)
276 walker->error_code |= PFERR_FETCH_MASK; 278 walker->fault.error_code |= PFERR_FETCH_MASK;
277 if (rsvd_fault) 279 if (rsvd_fault)
278 walker->error_code |= PFERR_RSVD_MASK; 280 walker->fault.error_code |= PFERR_RSVD_MASK;
279 281
280 vcpu->arch.fault.address = addr; 282 walker->fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code; 283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
282 284
283 trace_kvm_mmu_walker_error(walker->error_code); 285 trace_kvm_mmu_walker_error(walker->fault.error_code);
284 return 0; 286 return 0;
285} 287}
286 288
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
299 addr, access); 301 addr, access);
300} 302}
301 303
304static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
305 struct kvm_mmu_page *sp, u64 *spte,
306 pt_element_t gpte)
307{
308 u64 nonpresent = shadow_trap_nonpresent_pte;
309
310 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
311 goto no_present;
312
313 if (!is_present_gpte(gpte)) {
314 if (!sp->unsync)
315 nonpresent = shadow_notrap_nonpresent_pte;
316 goto no_present;
317 }
318
319 if (!(gpte & PT_ACCESSED_MASK))
320 goto no_present;
321
322 return false;
323
324no_present:
325 drop_spte(vcpu->kvm, spte, nonpresent);
326 return true;
327}
328
302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 329static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
303 u64 *spte, const void *pte) 330 u64 *spte, const void *pte)
304{ 331{
305 pt_element_t gpte; 332 pt_element_t gpte;
306 unsigned pte_access; 333 unsigned pte_access;
307 pfn_t pfn; 334 pfn_t pfn;
308 u64 new_spte;
309 335
310 gpte = *(const pt_element_t *)pte; 336 gpte = *(const pt_element_t *)pte;
311 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 337 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
312 if (!is_present_gpte(gpte)) {
313 if (sp->unsync)
314 new_spte = shadow_trap_nonpresent_pte;
315 else
316 new_spte = shadow_notrap_nonpresent_pte;
317 __set_spte(spte, new_spte);
318 }
319 return; 338 return;
320 } 339
321 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 340 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
322 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 341 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
323 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 342 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
329 return; 348 return;
330 kvm_get_pfn(pfn); 349 kvm_get_pfn(pfn);
331 /* 350 /*
332 * we call mmu_set_spte() with reset_host_protection = true because 351 * we call mmu_set_spte() with host_writable = true because
333 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 352 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
334 */ 353 */
335 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 354 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep) 383 u64 *sptep)
365{ 384{
366 struct kvm_mmu_page *sp; 385 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes; 386 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte; 387 u64 *spte;
370 int i; 388 int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
395 413
396 gpte = gptep[i]; 414 gpte = gptep[i];
397 415
398 if (!is_present_gpte(gpte) || 416 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue; 417 continue;
407 418
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 419 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
427static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 438static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
428 struct guest_walker *gw, 439 struct guest_walker *gw,
429 int user_fault, int write_fault, int hlevel, 440 int user_fault, int write_fault, int hlevel,
430 int *ptwrite, pfn_t pfn) 441 int *ptwrite, pfn_t pfn, bool map_writable,
442 bool prefault)
431{ 443{
432 unsigned access = gw->pt_access; 444 unsigned access = gw->pt_access;
433 struct kvm_mmu_page *sp = NULL; 445 struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
501 513
502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 514 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
503 user_fault, write_fault, dirty, ptwrite, it.level, 515 user_fault, write_fault, dirty, ptwrite, it.level,
504 gw->gfn, pfn, false, true); 516 gw->gfn, pfn, prefault, map_writable);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 517 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
506 518
507 return it.sptep; 519 return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
527 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 539 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
528 * a negative value on error. 540 * a negative value on error.
529 */ 541 */
530static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 542static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
531 u32 error_code) 543 bool prefault)
532{ 544{
533 int write_fault = error_code & PFERR_WRITE_MASK; 545 int write_fault = error_code & PFERR_WRITE_MASK;
534 int user_fault = error_code & PFERR_USER_MASK; 546 int user_fault = error_code & PFERR_USER_MASK;
@@ -538,7 +550,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
538 int r; 550 int r;
539 pfn_t pfn; 551 pfn_t pfn;
540 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
553 int force_pt_level;
541 unsigned long mmu_seq; 554 unsigned long mmu_seq;
555 bool map_writable;
542 556
543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 557 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
544 558
@@ -556,19 +570,29 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
556 */ 570 */
557 if (!r) { 571 if (!r) {
558 pgprintk("%s: guest page fault\n", __func__); 572 pgprintk("%s: guest page fault\n", __func__);
559 inject_page_fault(vcpu); 573 if (!prefault) {
560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 574 inject_page_fault(vcpu, &walker.fault);
575 /* reset fork detector */
576 vcpu->arch.last_pt_write_count = 0;
577 }
561 return 0; 578 return 0;
562 } 579 }
563 580
564 if (walker.level >= PT_DIRECTORY_LEVEL) { 581 if (walker.level >= PT_DIRECTORY_LEVEL)
582 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
583 else
584 force_pt_level = 1;
585 if (!force_pt_level) {
565 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 586 level = min(walker.level, mapping_level(vcpu, walker.gfn));
566 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 587 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
567 } 588 }
568 589
569 mmu_seq = vcpu->kvm->mmu_notifier_seq; 590 mmu_seq = vcpu->kvm->mmu_notifier_seq;
570 smp_rmb(); 591 smp_rmb();
571 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 592
593 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
594 &map_writable))
595 return 0;
572 596
573 /* mmio */ 597 /* mmio */
574 if (is_error_pfn(pfn)) 598 if (is_error_pfn(pfn))
@@ -580,8 +604,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
580 604
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 605 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
582 kvm_mmu_free_some_pages(vcpu); 606 kvm_mmu_free_some_pages(vcpu);
607 if (!force_pt_level)
608 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 609 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
584 level, &write_pt, pfn); 610 level, &write_pt, pfn, map_writable, prefault);
585 (void)sptep; 611 (void)sptep;
586 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 612 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
587 sptep, *sptep, write_pt); 613 sptep, *sptep, write_pt);
@@ -661,7 +687,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
661} 687}
662 688
663static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 689static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
664 u32 *error) 690 struct x86_exception *exception)
665{ 691{
666 struct guest_walker walker; 692 struct guest_walker walker;
667 gpa_t gpa = UNMAPPED_GVA; 693 gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +698,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
672 if (r) { 698 if (r) {
673 gpa = gfn_to_gpa(walker.gfn); 699 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK; 700 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error) 701 } else if (exception)
676 *error = walker.error_code; 702 *exception = walker.fault;
677 703
678 return gpa; 704 return gpa;
679} 705}
680 706
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 707static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error) 708 u32 access,
709 struct x86_exception *exception)
683{ 710{
684 struct guest_walker walker; 711 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA; 712 gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +717,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
690 if (r) { 717 if (r) {
691 gpa = gfn_to_gpa(walker.gfn); 718 gpa = gfn_to_gpa(walker.gfn);
692 gpa |= vaddr & ~PAGE_MASK; 719 gpa |= vaddr & ~PAGE_MASK;
693 } else if (error) 720 } else if (exception)
694 *error = walker.error_code; 721 *exception = walker.fault;
695 722
696 return gpa; 723 return gpa;
697} 724}
@@ -730,12 +757,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
730 * Using the cached information from sp->gfns is safe because: 757 * Using the cached information from sp->gfns is safe because:
731 * - The spte has a reference to the struct page, so the pfn for a given gfn 758 * - The spte has a reference to the struct page, so the pfn for a given gfn
732 * can't change unless all sptes pointing to it are nuked first. 759 * can't change unless all sptes pointing to it are nuked first.
760 *
761 * Note:
762 * We should flush all tlbs if an spte is dropped, even though the guest is
763 * responsible for it.  If we don't, kvm_mmu_notifier_invalidate_page and
764 * kvm_mmu_notifier_invalidate_range_start may see that the mapping page is
765 * no longer used by the guest and skip the tlb flush, so the guest would
766 * still be able to access the freed pages.
767 * We increase kvm->tlbs_dirty to delay the tlb flush in this case.
733 */ 768 */
734static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 769static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
735 bool clear_unsync)
736{ 770{
737 int i, offset, nr_present; 771 int i, offset, nr_present;
738 bool reset_host_protection; 772 bool host_writable;
739 gpa_t first_pte_gpa; 773 gpa_t first_pte_gpa;
740 774
741 offset = nr_present = 0; 775 offset = nr_present = 0;
@@ -764,31 +798,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
764 return -EINVAL; 798 return -EINVAL;
765 799
766 gfn = gpte_to_gfn(gpte); 800 gfn = gpte_to_gfn(gpte);
767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
769 || !(gpte & PT_ACCESSED_MASK)) {
770 u64 nonpresent;
771 801
772 if (is_present_gpte(gpte) || !clear_unsync) 802 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
773 nonpresent = shadow_trap_nonpresent_pte; 803 vcpu->kvm->tlbs_dirty++;
774 else 804 continue;
775 nonpresent = shadow_notrap_nonpresent_pte; 805 }
776 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 806
807 if (gfn != sp->gfns[i]) {
808 drop_spte(vcpu->kvm, &sp->spt[i],
809 shadow_trap_nonpresent_pte);
810 vcpu->kvm->tlbs_dirty++;
777 continue; 811 continue;
778 } 812 }
779 813
780 nr_present++; 814 nr_present++;
781 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 815 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
782 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { 816 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
783 pte_access &= ~ACC_WRITE_MASK; 817
784 reset_host_protection = 0;
785 } else {
786 reset_host_protection = 1;
787 }
788 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 818 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
789 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 819 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
790 spte_to_pfn(sp->spt[i]), true, false, 820 spte_to_pfn(sp->spt[i]), true, false,
791 reset_host_protection); 821 host_writable);
792 } 822 }
793 823
794 return !nr_present; 824 return !nr_present;
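/*
 * Minimal sketch (not part of the patch above): sync_page() now only bumps
 * kvm->tlbs_dirty when it drops an spte, deferring the remote TLB flush to a
 * later point.  A consumer of the counter could look roughly like this;
 * kvm_flush_remote_tlbs() is an existing helper, while the wrapper below and
 * its exact placement are assumptions for illustration only.
 */
static void example_commit_deferred_flush(struct kvm *kvm)
{
	if (kvm->tlbs_dirty) {
		kvm_flush_remote_tlbs(kvm);
		kvm->tlbs_dirty = 0;
	}
}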
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 82e144a4e514..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
31 31
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/desc.h> 33#include <asm/desc.h>
34#include <asm/kvm_para.h>
34 35
35#include <asm/virtext.h> 36#include <asm/virtext.h>
36#include "trace.h" 37#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 51#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_FEATURE_SVML (1 << 2) 52#define SVM_FEATURE_SVML (1 << 2)
52#define SVM_FEATURE_NRIP (1 << 3) 53#define SVM_FEATURE_NRIP (1 << 3)
54#define SVM_FEATURE_TSC_RATE (1 << 4)
55#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
56#define SVM_FEATURE_FLUSH_ASID (1 << 6)
57#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
53#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 58#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
54 59
55#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 60#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
97 unsigned long vmexit_rax; 102 unsigned long vmexit_rax;
98 103
99 /* cache for intercepts of the guest */ 104 /* cache for intercepts of the guest */
100 u16 intercept_cr_read; 105 u32 intercept_cr;
101 u16 intercept_cr_write; 106 u32 intercept_dr;
102 u16 intercept_dr_read;
103 u16 intercept_dr_write;
104 u32 intercept_exceptions; 107 u32 intercept_exceptions;
105 u64 intercept; 108 u64 intercept;
106 109
@@ -123,7 +126,12 @@ struct vcpu_svm {
123 u64 next_rip; 126 u64 next_rip;
124 127
125 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 128 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
126 u64 host_gs_base; 129 struct {
130 u16 fs;
131 u16 gs;
132 u16 ldt;
133 u64 gs_base;
134 } host;
127 135
128 u32 *msrpm; 136 u32 *msrpm;
129 137
@@ -133,6 +141,7 @@ struct vcpu_svm {
133 141
134 unsigned int3_injected; 142 unsigned int3_injected;
135 unsigned long int3_rip; 143 unsigned long int3_rip;
144 u32 apf_reason;
136}; 145};
137 146
138#define MSR_INVALID 0xffffffffU 147#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
180static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 189static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
181 bool has_error_code, u32 error_code); 190 bool has_error_code, u32 error_code);
182 191
192enum {
193 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
194 pause filter count */
195 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
196 VMCB_ASID, /* ASID */
197 VMCB_INTR, /* int_ctl, int_vector */
198 VMCB_NPT, /* npt_en, nCR3, gPAT */
199 VMCB_CR, /* CR0, CR3, CR4, EFER */
200 VMCB_DR, /* DR6, DR7 */
201 VMCB_DT, /* GDT, IDT */
202 VMCB_SEG, /* CS, DS, SS, ES, CPL */
203 VMCB_CR2, /* CR2 only */
204 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
205 VMCB_DIRTY_MAX,
206};
207
208/* TPR and CR2 are always written before VMRUN */
209#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
210
211static inline void mark_all_dirty(struct vmcb *vmcb)
212{
213 vmcb->control.clean = 0;
214}
215
216static inline void mark_all_clean(struct vmcb *vmcb)
217{
218 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
219 & ~VMCB_ALWAYS_DIRTY_MASK;
220}
221
222static inline void mark_dirty(struct vmcb *vmcb, int bit)
223{
224 vmcb->control.clean &= ~(1 << bit);
225}
226
183static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 227static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
184{ 228{
185 return container_of(vcpu, struct vcpu_svm, vcpu); 229 return container_of(vcpu, struct vcpu_svm, vcpu);
186} 230}
187 231
188static inline bool is_nested(struct vcpu_svm *svm) 232static void recalc_intercepts(struct vcpu_svm *svm)
233{
234 struct vmcb_control_area *c, *h;
235 struct nested_state *g;
236
237 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
238
239 if (!is_guest_mode(&svm->vcpu))
240 return;
241
242 c = &svm->vmcb->control;
243 h = &svm->nested.hsave->control;
244 g = &svm->nested;
245
246 c->intercept_cr = h->intercept_cr | g->intercept_cr;
247 c->intercept_dr = h->intercept_dr | g->intercept_dr;
248 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
249 c->intercept = h->intercept | g->intercept;
250}
251
252static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
253{
254 if (is_guest_mode(&svm->vcpu))
255 return svm->nested.hsave;
256 else
257 return svm->vmcb;
258}
259
260static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
261{
262 struct vmcb *vmcb = get_host_vmcb(svm);
263
264 vmcb->control.intercept_cr |= (1U << bit);
265
266 recalc_intercepts(svm);
267}
268
269static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
270{
271 struct vmcb *vmcb = get_host_vmcb(svm);
272
273 vmcb->control.intercept_cr &= ~(1U << bit);
274
275 recalc_intercepts(svm);
276}
277
278static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
279{
280 struct vmcb *vmcb = get_host_vmcb(svm);
281
282 return vmcb->control.intercept_cr & (1U << bit);
283}
284
285static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
286{
287 struct vmcb *vmcb = get_host_vmcb(svm);
288
289 vmcb->control.intercept_dr |= (1U << bit);
290
291 recalc_intercepts(svm);
292}
293
294static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
295{
296 struct vmcb *vmcb = get_host_vmcb(svm);
297
298 vmcb->control.intercept_dr &= ~(1U << bit);
299
300 recalc_intercepts(svm);
301}
302
303static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
304{
305 struct vmcb *vmcb = get_host_vmcb(svm);
306
307 vmcb->control.intercept_exceptions |= (1U << bit);
308
309 recalc_intercepts(svm);
310}
311
312static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
189{ 313{
190 return svm->nested.vmcb; 314 struct vmcb *vmcb = get_host_vmcb(svm);
315
316 vmcb->control.intercept_exceptions &= ~(1U << bit);
317
318 recalc_intercepts(svm);
319}
320
321static inline void set_intercept(struct vcpu_svm *svm, int bit)
322{
323 struct vmcb *vmcb = get_host_vmcb(svm);
324
325 vmcb->control.intercept |= (1ULL << bit);
326
327 recalc_intercepts(svm);
328}
329
330static inline void clr_intercept(struct vcpu_svm *svm, int bit)
331{
332 struct vmcb *vmcb = get_host_vmcb(svm);
333
334 vmcb->control.intercept &= ~(1ULL << bit);
335
336 recalc_intercepts(svm);
191} 337}
192 338
193static inline void enable_gif(struct vcpu_svm *svm) 339static inline void enable_gif(struct vcpu_svm *svm)
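/*
 * Minimal sketch (not part of the patch above) of how the VMCB clean bits
 * introduced here are meant to be used: software marks a group dirty whenever
 * it writes the corresponding VMCB fields, and marks everything clean again
 * after a successful VMRUN so hardware may skip reloading unchanged state.
 * The call sites below are illustrative only; X86_CR0_WP is a standard
 * kernel constant.
 */
static void example_vmrun_cycle(struct vcpu_svm *svm)
{
	/* Software touched a CR field, so invalidate the VMCB_CR group. */
	svm->vmcb->save.cr0 |= X86_CR0_WP;
	mark_dirty(svm->vmcb, VMCB_CR);

	/* ... VMRUN executes here ... */

	/*
	 * Everything hardware may have cached is in sync again, except the
	 * groups in VMCB_ALWAYS_DIRTY_MASK, which are rewritten before every
	 * VMRUN anyway.
	 */
	mark_all_clean(svm->vmcb);
}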
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
264 410
265#define MAX_INST_SIZE 15 411#define MAX_INST_SIZE 15
266 412
267static inline u32 svm_has(u32 feat)
268{
269 return svm_features & feat;
270}
271
272static inline void clgi(void) 413static inline void clgi(void)
273{ 414{
274 asm volatile (__ex(SVM_CLGI)); 415 asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
284 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 425 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
285} 426}
286 427
287static inline void force_new_asid(struct kvm_vcpu *vcpu)
288{
289 to_svm(vcpu)->asid_generation--;
290}
291
292static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
293{
294 force_new_asid(vcpu);
295}
296
297static int get_npt_level(void) 428static int get_npt_level(void)
298{ 429{
299#ifdef CONFIG_X86_64 430#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
310 efer &= ~EFER_LME; 441 efer &= ~EFER_LME;
311 442
312 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 443 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
444 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
313} 445}
314 446
315static int is_external_interrupt(u32 info) 447static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
347 svm->next_rip = svm->vmcb->control.next_rip; 479 svm->next_rip = svm->vmcb->control.next_rip;
348 480
349 if (!svm->next_rip) { 481 if (!svm->next_rip) {
350 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 482 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
351 EMULATE_DONE) 483 EMULATE_DONE)
352 printk(KERN_DEBUG "%s: NOP\n", __func__); 484 printk(KERN_DEBUG "%s: NOP\n", __func__);
353 return; 485 return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
374 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 506 nested_svm_check_exception(svm, nr, has_error_code, error_code))
375 return; 507 return;
376 508
377 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { 509 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
378 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 510 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
379 511
380 /* 512 /*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
670 802
671 svm_features = cpuid_edx(SVM_CPUID_FUNC); 803 svm_features = cpuid_edx(SVM_CPUID_FUNC);
672 804
673 if (!svm_has(SVM_FEATURE_NPT)) 805 if (!boot_cpu_has(X86_FEATURE_NPT))
674 npt_enabled = false; 806 npt_enabled = false;
675 807
676 if (npt_enabled && !npt) { 808 if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
725 struct vcpu_svm *svm = to_svm(vcpu); 857 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0; 858 u64 g_tsc_offset = 0;
727 859
728 if (is_nested(svm)) { 860 if (is_guest_mode(vcpu)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset - 861 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset; 862 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset; 863 svm->nested.hsave->control.tsc_offset = offset;
732 } 864 }
733 865
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 866 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
867
868 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
735} 869}
736 870
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 871static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
739 struct vcpu_svm *svm = to_svm(vcpu); 873 struct vcpu_svm *svm = to_svm(vcpu);
740 874
741 svm->vmcb->control.tsc_offset += adjustment; 875 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm)) 876 if (is_guest_mode(vcpu))
743 svm->nested.hsave->control.tsc_offset += adjustment; 877 svm->nested.hsave->control.tsc_offset += adjustment;
878 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
744} 879}
745 880
746static void init_vmcb(struct vcpu_svm *svm) 881static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
749 struct vmcb_save_area *save = &svm->vmcb->save; 884 struct vmcb_save_area *save = &svm->vmcb->save;
750 885
751 svm->vcpu.fpu_active = 1; 886 svm->vcpu.fpu_active = 1;
887 svm->vcpu.arch.hflags = 0;
752 888
753 control->intercept_cr_read = INTERCEPT_CR0_MASK | 889 set_cr_intercept(svm, INTERCEPT_CR0_READ);
754 INTERCEPT_CR3_MASK | 890 set_cr_intercept(svm, INTERCEPT_CR3_READ);
755 INTERCEPT_CR4_MASK; 891 set_cr_intercept(svm, INTERCEPT_CR4_READ);
756 892 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
757 control->intercept_cr_write = INTERCEPT_CR0_MASK | 893 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
758 INTERCEPT_CR3_MASK | 894 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
759 INTERCEPT_CR4_MASK | 895 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
760 INTERCEPT_CR8_MASK; 896
761 897 set_dr_intercept(svm, INTERCEPT_DR0_READ);
762 control->intercept_dr_read = INTERCEPT_DR0_MASK | 898 set_dr_intercept(svm, INTERCEPT_DR1_READ);
763 INTERCEPT_DR1_MASK | 899 set_dr_intercept(svm, INTERCEPT_DR2_READ);
764 INTERCEPT_DR2_MASK | 900 set_dr_intercept(svm, INTERCEPT_DR3_READ);
765 INTERCEPT_DR3_MASK | 901 set_dr_intercept(svm, INTERCEPT_DR4_READ);
766 INTERCEPT_DR4_MASK | 902 set_dr_intercept(svm, INTERCEPT_DR5_READ);
767 INTERCEPT_DR5_MASK | 903 set_dr_intercept(svm, INTERCEPT_DR6_READ);
768 INTERCEPT_DR6_MASK | 904 set_dr_intercept(svm, INTERCEPT_DR7_READ);
769 INTERCEPT_DR7_MASK; 905
770 906 set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
771 control->intercept_dr_write = INTERCEPT_DR0_MASK | 907 set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
772 INTERCEPT_DR1_MASK | 908 set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
773 INTERCEPT_DR2_MASK | 909 set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
774 INTERCEPT_DR3_MASK | 910 set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
775 INTERCEPT_DR4_MASK | 911 set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
776 INTERCEPT_DR5_MASK | 912 set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
777 INTERCEPT_DR6_MASK | 913 set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
778 INTERCEPT_DR7_MASK; 914
779 915 set_exception_intercept(svm, PF_VECTOR);
780 control->intercept_exceptions = (1 << PF_VECTOR) | 916 set_exception_intercept(svm, UD_VECTOR);
781 (1 << UD_VECTOR) | 917 set_exception_intercept(svm, MC_VECTOR);
782 (1 << MC_VECTOR); 918
783 919 set_intercept(svm, INTERCEPT_INTR);
784 920 set_intercept(svm, INTERCEPT_NMI);
785 control->intercept = (1ULL << INTERCEPT_INTR) | 921 set_intercept(svm, INTERCEPT_SMI);
786 (1ULL << INTERCEPT_NMI) | 922 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
787 (1ULL << INTERCEPT_SMI) | 923 set_intercept(svm, INTERCEPT_CPUID);
788 (1ULL << INTERCEPT_SELECTIVE_CR0) | 924 set_intercept(svm, INTERCEPT_INVD);
789 (1ULL << INTERCEPT_CPUID) | 925 set_intercept(svm, INTERCEPT_HLT);
790 (1ULL << INTERCEPT_INVD) | 926 set_intercept(svm, INTERCEPT_INVLPG);
791 (1ULL << INTERCEPT_HLT) | 927 set_intercept(svm, INTERCEPT_INVLPGA);
792 (1ULL << INTERCEPT_INVLPG) | 928 set_intercept(svm, INTERCEPT_IOIO_PROT);
793 (1ULL << INTERCEPT_INVLPGA) | 929 set_intercept(svm, INTERCEPT_MSR_PROT);
794 (1ULL << INTERCEPT_IOIO_PROT) | 930 set_intercept(svm, INTERCEPT_TASK_SWITCH);
795 (1ULL << INTERCEPT_MSR_PROT) | 931 set_intercept(svm, INTERCEPT_SHUTDOWN);
796 (1ULL << INTERCEPT_TASK_SWITCH) | 932 set_intercept(svm, INTERCEPT_VMRUN);
797 (1ULL << INTERCEPT_SHUTDOWN) | 933 set_intercept(svm, INTERCEPT_VMMCALL);
798 (1ULL << INTERCEPT_VMRUN) | 934 set_intercept(svm, INTERCEPT_VMLOAD);
799 (1ULL << INTERCEPT_VMMCALL) | 935 set_intercept(svm, INTERCEPT_VMSAVE);
800 (1ULL << INTERCEPT_VMLOAD) | 936 set_intercept(svm, INTERCEPT_STGI);
801 (1ULL << INTERCEPT_VMSAVE) | 937 set_intercept(svm, INTERCEPT_CLGI);
802 (1ULL << INTERCEPT_STGI) | 938 set_intercept(svm, INTERCEPT_SKINIT);
803 (1ULL << INTERCEPT_CLGI) | 939 set_intercept(svm, INTERCEPT_WBINVD);
804 (1ULL << INTERCEPT_SKINIT) | 940 set_intercept(svm, INTERCEPT_MONITOR);
805 (1ULL << INTERCEPT_WBINVD) | 941 set_intercept(svm, INTERCEPT_MWAIT);
806 (1ULL << INTERCEPT_MONITOR) | 942 set_intercept(svm, INTERCEPT_XSETBV);
807 (1ULL << INTERCEPT_MWAIT);
808 943
809 control->iopm_base_pa = iopm_base; 944 control->iopm_base_pa = iopm_base;
810 control->msrpm_base_pa = __pa(svm->msrpm); 945 control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
855 if (npt_enabled) { 990 if (npt_enabled) {
856 /* Setup VMCB for Nested Paging */ 991 /* Setup VMCB for Nested Paging */
857 control->nested_ctl = 1; 992 control->nested_ctl = 1;
858 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 993 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
859 (1ULL << INTERCEPT_INVLPG)); 994 clr_intercept(svm, INTERCEPT_INVLPG);
860 control->intercept_exceptions &= ~(1 << PF_VECTOR); 995 clr_exception_intercept(svm, PF_VECTOR);
861 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; 996 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
862 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; 997 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
863 save->g_pat = 0x0007040600070406ULL; 998 save->g_pat = 0x0007040600070406ULL;
864 save->cr3 = 0; 999 save->cr3 = 0;
865 save->cr4 = 0; 1000 save->cr4 = 0;
866 } 1001 }
867 force_new_asid(&svm->vcpu); 1002 svm->asid_generation = 0;
868 1003
869 svm->nested.vmcb = 0; 1004 svm->nested.vmcb = 0;
870 svm->vcpu.arch.hflags = 0; 1005 svm->vcpu.arch.hflags = 0;
871 1006
872 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { 1007 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
873 control->pause_filter_count = 3000; 1008 control->pause_filter_count = 3000;
874 control->intercept |= (1ULL << INTERCEPT_PAUSE); 1009 set_intercept(svm, INTERCEPT_PAUSE);
875 } 1010 }
876 1011
1012 mark_all_dirty(svm->vmcb);
1013
877 enable_gif(svm); 1014 enable_gif(svm);
878} 1015}
879 1016
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
990 1127
991 if (unlikely(cpu != vcpu->cpu)) { 1128 if (unlikely(cpu != vcpu->cpu)) {
992 svm->asid_generation = 0; 1129 svm->asid_generation = 0;
1130 mark_all_dirty(svm->vmcb);
993 } 1131 }
994 1132
1133#ifdef CONFIG_X86_64
1134 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1135#endif
1136 savesegment(fs, svm->host.fs);
1137 savesegment(gs, svm->host.gs);
1138 svm->host.ldt = kvm_read_ldt();
1139
995 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1140 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
996 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1141 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
997} 1142}
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1002 int i; 1147 int i;
1003 1148
1004 ++vcpu->stat.host_state_reload; 1149 ++vcpu->stat.host_state_reload;
1150 kvm_load_ldt(svm->host.ldt);
1151#ifdef CONFIG_X86_64
1152 loadsegment(fs, svm->host.fs);
1153 load_gs_index(svm->host.gs);
1154 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1155#else
1156 loadsegment(gs, svm->host.gs);
1157#endif
1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1158 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1159 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1007} 1160}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1021 switch (reg) { 1174 switch (reg) {
1022 case VCPU_EXREG_PDPTR: 1175 case VCPU_EXREG_PDPTR:
1023 BUG_ON(!npt_enabled); 1176 BUG_ON(!npt_enabled);
1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 1177 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1025 break; 1178 break;
1026 default: 1179 default:
1027 BUG(); 1180 BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1030 1183
1031static void svm_set_vintr(struct vcpu_svm *svm) 1184static void svm_set_vintr(struct vcpu_svm *svm)
1032{ 1185{
1033 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 1186 set_intercept(svm, INTERCEPT_VINTR);
1034} 1187}
1035 1188
1036static void svm_clear_vintr(struct vcpu_svm *svm) 1189static void svm_clear_vintr(struct vcpu_svm *svm)
1037{ 1190{
1038 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1191 clr_intercept(svm, INTERCEPT_VINTR);
1039} 1192}
1040 1193
1041static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1194static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1150 1303
1151 svm->vmcb->save.idtr.limit = dt->size; 1304 svm->vmcb->save.idtr.limit = dt->size;
1152 svm->vmcb->save.idtr.base = dt->address ; 1305 svm->vmcb->save.idtr.base = dt->address ;
1306 mark_dirty(svm->vmcb, VMCB_DT);
1153} 1307}
1154 1308
1155static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1309static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1166 1320
1167 svm->vmcb->save.gdtr.limit = dt->size; 1321 svm->vmcb->save.gdtr.limit = dt->size;
1168 svm->vmcb->save.gdtr.base = dt->address ; 1322 svm->vmcb->save.gdtr.base = dt->address ;
1323 mark_dirty(svm->vmcb, VMCB_DT);
1169} 1324}
1170 1325
1171static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1326static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1327{
1173} 1328}
1174 1329
1330static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1331{
1332}
1333
1175static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1334static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1176{ 1335{
1177} 1336}
1178 1337
1179static void update_cr0_intercept(struct vcpu_svm *svm) 1338static void update_cr0_intercept(struct vcpu_svm *svm)
1180{ 1339{
1181 struct vmcb *vmcb = svm->vmcb;
1182 ulong gcr0 = svm->vcpu.arch.cr0; 1340 ulong gcr0 = svm->vcpu.arch.cr0;
1183 u64 *hcr0 = &svm->vmcb->save.cr0; 1341 u64 *hcr0 = &svm->vmcb->save.cr0;
1184 1342
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1188 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 1346 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1189 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 1347 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1190 1348
1349 mark_dirty(svm->vmcb, VMCB_CR);
1191 1350
1192 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1351 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1193 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1352 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1194 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1353 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1195 if (is_nested(svm)) {
1196 struct vmcb *hsave = svm->nested.hsave;
1197
1198 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1199 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1200 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1201 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1202 }
1203 } else { 1354 } else {
1204 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1355 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1205 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1356 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1206 if (is_nested(svm)) {
1207 struct vmcb *hsave = svm->nested.hsave;
1208
1209 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1210 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1211 }
1212 } 1357 }
1213} 1358}
1214 1359
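/*
 * Minimal sketch (not part of the patch above): with set_cr_intercept() and
 * clr_cr_intercept() always operating on get_host_vmcb(), the intercepts seen
 * by hardware while in guest-mode are the OR of the L1 value and the nested
 * guest's cached value, which is exactly what recalc_intercepts() writes
 * back.  The helper below is illustrative only.
 */
static u32 example_effective_cr_intercepts(struct vcpu_svm *svm)
{
	u32 bits = get_host_vmcb(svm)->control.intercept_cr;

	if (is_guest_mode(&svm->vcpu))
		bits |= svm->nested.intercept_cr;

	return bits;
}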
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1216{ 1361{
1217 struct vcpu_svm *svm = to_svm(vcpu); 1362 struct vcpu_svm *svm = to_svm(vcpu);
1218 1363
1219 if (is_nested(svm)) { 1364 if (is_guest_mode(vcpu)) {
1220 /* 1365 /*
1221 * We are here because we run in nested mode, the host kvm 1366 * We are here because we run in nested mode, the host kvm
1222 * intercepts cr0 writes but the l1 hypervisor does not. 1367 * intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1268 */ 1413 */
1269 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1414 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1270 svm->vmcb->save.cr0 = cr0; 1415 svm->vmcb->save.cr0 = cr0;
1416 mark_dirty(svm->vmcb, VMCB_CR);
1271 update_cr0_intercept(svm); 1417 update_cr0_intercept(svm);
1272} 1418}
1273 1419
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1277 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1423 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1278 1424
1279 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1425 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1280 force_new_asid(vcpu); 1426 svm_flush_tlb(vcpu);
1281 1427
1282 vcpu->arch.cr4 = cr4; 1428 vcpu->arch.cr4 = cr4;
1283 if (!npt_enabled) 1429 if (!npt_enabled)
1284 cr4 |= X86_CR4_PAE; 1430 cr4 |= X86_CR4_PAE;
1285 cr4 |= host_cr4_mce; 1431 cr4 |= host_cr4_mce;
1286 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1432 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1433 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1287} 1434}
1288 1435
1289static void svm_set_segment(struct kvm_vcpu *vcpu, 1436static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1312 = (svm->vmcb->save.cs.attrib 1459 = (svm->vmcb->save.cs.attrib
1313 >> SVM_SELECTOR_DPL_SHIFT) & 3; 1460 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1314 1461
1462 mark_dirty(svm->vmcb, VMCB_SEG);
1315} 1463}
1316 1464
1317static void update_db_intercept(struct kvm_vcpu *vcpu) 1465static void update_db_intercept(struct kvm_vcpu *vcpu)
1318{ 1466{
1319 struct vcpu_svm *svm = to_svm(vcpu); 1467 struct vcpu_svm *svm = to_svm(vcpu);
1320 1468
1321 svm->vmcb->control.intercept_exceptions &= 1469 clr_exception_intercept(svm, DB_VECTOR);
1322 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1470 clr_exception_intercept(svm, BP_VECTOR);
1323 1471
1324 if (svm->nmi_singlestep) 1472 if (svm->nmi_singlestep)
1325 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1473 set_exception_intercept(svm, DB_VECTOR);
1326 1474
1327 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1475 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1328 if (vcpu->guest_debug & 1476 if (vcpu->guest_debug &
1329 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 1477 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1330 svm->vmcb->control.intercept_exceptions |= 1478 set_exception_intercept(svm, DB_VECTOR);
1331 1 << DB_VECTOR;
1332 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1479 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1333 svm->vmcb->control.intercept_exceptions |= 1480 set_exception_intercept(svm, BP_VECTOR);
1334 1 << BP_VECTOR;
1335 } else 1481 } else
1336 vcpu->guest_debug = 0; 1482 vcpu->guest_debug = 0;
1337} 1483}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1345 else 1491 else
1346 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1492 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1347 1493
1348 update_db_intercept(vcpu); 1494 mark_dirty(svm->vmcb, VMCB_DR);
1349}
1350
1351static void load_host_msrs(struct kvm_vcpu *vcpu)
1352{
1353#ifdef CONFIG_X86_64
1354 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1355#endif
1356}
1357 1495
1358static void save_host_msrs(struct kvm_vcpu *vcpu) 1496 update_db_intercept(vcpu);
1359{
1360#ifdef CONFIG_X86_64
1361 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1362#endif
1363} 1497}
1364 1498
1365static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1499static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1372 1506
1373 svm->asid_generation = sd->asid_generation; 1507 svm->asid_generation = sd->asid_generation;
1374 svm->vmcb->control.asid = sd->next_asid++; 1508 svm->vmcb->control.asid = sd->next_asid++;
1509
1510 mark_dirty(svm->vmcb, VMCB_ASID);
1375} 1511}
1376 1512
1377static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1513static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1379 struct vcpu_svm *svm = to_svm(vcpu); 1515 struct vcpu_svm *svm = to_svm(vcpu);
1380 1516
1381 svm->vmcb->save.dr7 = value; 1517 svm->vmcb->save.dr7 = value;
1518 mark_dirty(svm->vmcb, VMCB_DR);
1382} 1519}
1383 1520
1384static int pf_interception(struct vcpu_svm *svm) 1521static int pf_interception(struct vcpu_svm *svm)
1385{ 1522{
1386 u64 fault_address; 1523 u64 fault_address = svm->vmcb->control.exit_info_2;
1387 u32 error_code; 1524 u32 error_code;
1525 int r = 1;
1388 1526
1389 fault_address = svm->vmcb->control.exit_info_2; 1527 switch (svm->apf_reason) {
1390 error_code = svm->vmcb->control.exit_info_1; 1528 default:
1391 1529 error_code = svm->vmcb->control.exit_info_1;
1392 trace_kvm_page_fault(fault_address, error_code); 1530
1393 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1531 trace_kvm_page_fault(fault_address, error_code);
1394 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1532 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1395 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1533 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1534 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1535 svm->vmcb->control.insn_bytes,
1536 svm->vmcb->control.insn_len);
1537 break;
1538 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1539 svm->apf_reason = 0;
1540 local_irq_disable();
1541 kvm_async_pf_task_wait(fault_address);
1542 local_irq_enable();
1543 break;
1544 case KVM_PV_REASON_PAGE_READY:
1545 svm->apf_reason = 0;
1546 local_irq_disable();
1547 kvm_async_pf_task_wake(fault_address);
1548 local_irq_enable();
1549 break;
1550 }
1551 return r;
1396} 1552}
1397 1553
1398static int db_interception(struct vcpu_svm *svm) 1554static int db_interception(struct vcpu_svm *svm)
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
1440{ 1596{
1441 int er; 1597 int er;
1442 1598
1443 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); 1599 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1444 if (er != EMULATE_DONE) 1600 if (er != EMULATE_DONE)
1445 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1601 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1446 return 1; 1602 return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
1449static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1605static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1450{ 1606{
1451 struct vcpu_svm *svm = to_svm(vcpu); 1607 struct vcpu_svm *svm = to_svm(vcpu);
1452 u32 excp;
1453
1454 if (is_nested(svm)) {
1455 u32 h_excp, n_excp;
1456
1457 h_excp = svm->nested.hsave->control.intercept_exceptions;
1458 n_excp = svm->nested.intercept_exceptions;
1459 h_excp &= ~(1 << NM_VECTOR);
1460 excp = h_excp | n_excp;
1461 } else {
1462 excp = svm->vmcb->control.intercept_exceptions;
1463 excp &= ~(1 << NM_VECTOR);
1464 }
1465 1608
1466 svm->vmcb->control.intercept_exceptions = excp; 1609 clr_exception_intercept(svm, NM_VECTOR);
1467 1610
1468 svm->vcpu.fpu_active = 1; 1611 svm->vcpu.fpu_active = 1;
1469 update_cr0_intercept(svm); 1612 update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
1570 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1713 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1571 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1714 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1572 if (string || in) 1715 if (string || in)
1573 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1716 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1574 1717
1575 port = io_info >> 16; 1718 port = io_info >> 16;
1576 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1719 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1624 struct vcpu_svm *svm = to_svm(vcpu); 1767 struct vcpu_svm *svm = to_svm(vcpu);
1625 1768
1626 svm->vmcb->control.nested_cr3 = root; 1769 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu); 1770 mark_dirty(svm->vmcb, VMCB_NPT);
1771 svm_flush_tlb(vcpu);
1628} 1772}
1629 1773
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) 1774static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1775 struct x86_exception *fault)
1631{ 1776{
1632 struct vcpu_svm *svm = to_svm(vcpu); 1777 struct vcpu_svm *svm = to_svm(vcpu);
1633 1778
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF; 1779 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0; 1780 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; 1781 svm->vmcb->control.exit_info_1 = fault->error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; 1782 svm->vmcb->control.exit_info_2 = fault->address;
1638 1783
1639 nested_svm_vmexit(svm); 1784 nested_svm_vmexit(svm);
1640} 1785}
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1680{ 1825{
1681 int vmexit; 1826 int vmexit;
1682 1827
1683 if (!is_nested(svm)) 1828 if (!is_guest_mode(&svm->vcpu))
1684 return 0; 1829 return 0;
1685 1830
1686 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1831 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1698/* This function returns true if it is safe to enable the irq window */ 1843/* This function returns true if it is safe to enable the irq window */
1699static inline bool nested_svm_intr(struct vcpu_svm *svm) 1844static inline bool nested_svm_intr(struct vcpu_svm *svm)
1700{ 1845{
1701 if (!is_nested(svm)) 1846 if (!is_guest_mode(&svm->vcpu))
1702 return true; 1847 return true;
1703 1848
1704 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1849 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1737/* This function returns true if it is safe to enable the nmi window */ 1882/* This function returns true if it is safe to enable the nmi window */
1738static inline bool nested_svm_nmi(struct vcpu_svm *svm) 1883static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1739{ 1884{
1740 if (!is_nested(svm)) 1885 if (!is_guest_mode(&svm->vcpu))
1741 return true; 1886 return true;
1742 1887
1743 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 1888 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1836 return NESTED_EXIT_HOST; 1981 return NESTED_EXIT_HOST;
1837 break; 1982 break;
1838 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1983 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1839 /* When we're shadowing, trap PFs */ 1984 /* When we're shadowing, trap PFs, but not async PF */
1840 if (!npt_enabled) 1985 if (!npt_enabled && svm->apf_reason == 0)
1841 return NESTED_EXIT_HOST; 1986 return NESTED_EXIT_HOST;
1842 break; 1987 break;
1843 case SVM_EXIT_EXCP_BASE + NM_VECTOR: 1988 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1865 case SVM_EXIT_IOIO: 2010 case SVM_EXIT_IOIO:
1866 vmexit = nested_svm_intercept_ioio(svm); 2011 vmexit = nested_svm_intercept_ioio(svm);
1867 break; 2012 break;
1868 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 2013 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1869 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 2014 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
1870 if (svm->nested.intercept_cr_read & cr_bits) 2015 if (svm->nested.intercept_cr & bit)
1871 vmexit = NESTED_EXIT_DONE; 2016 vmexit = NESTED_EXIT_DONE;
1872 break; 2017 break;
1873 } 2018 }
1874 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { 2019 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1875 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); 2020 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
1876 if (svm->nested.intercept_cr_write & cr_bits) 2021 if (svm->nested.intercept_dr & bit)
1877 vmexit = NESTED_EXIT_DONE;
1878 break;
1879 }
1880 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1881 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1882 if (svm->nested.intercept_dr_read & dr_bits)
1883 vmexit = NESTED_EXIT_DONE;
1884 break;
1885 }
1886 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1887 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1888 if (svm->nested.intercept_dr_write & dr_bits)
1889 vmexit = NESTED_EXIT_DONE; 2022 vmexit = NESTED_EXIT_DONE;
1890 break; 2023 break;
1891 } 2024 }
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1893 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2026 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1894 if (svm->nested.intercept_exceptions & excp_bits) 2027 if (svm->nested.intercept_exceptions & excp_bits)
1895 vmexit = NESTED_EXIT_DONE; 2028 vmexit = NESTED_EXIT_DONE;
2029 /* an async page fault always causes a vmexit */
2030 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2031 svm->apf_reason != 0)
2032 vmexit = NESTED_EXIT_DONE;
1896 break; 2033 break;
1897 } 2034 }
1898 case SVM_EXIT_ERR: { 2035 case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
1926 struct vmcb_control_area *dst = &dst_vmcb->control; 2063 struct vmcb_control_area *dst = &dst_vmcb->control;
1927 struct vmcb_control_area *from = &from_vmcb->control; 2064 struct vmcb_control_area *from = &from_vmcb->control;
1928 2065
1929 dst->intercept_cr_read = from->intercept_cr_read; 2066 dst->intercept_cr = from->intercept_cr;
1930 dst->intercept_cr_write = from->intercept_cr_write; 2067 dst->intercept_dr = from->intercept_dr;
1931 dst->intercept_dr_read = from->intercept_dr_read;
1932 dst->intercept_dr_write = from->intercept_dr_write;
1933 dst->intercept_exceptions = from->intercept_exceptions; 2068 dst->intercept_exceptions = from->intercept_exceptions;
1934 dst->intercept = from->intercept; 2069 dst->intercept = from->intercept;
1935 dst->iopm_base_pa = from->iopm_base_pa; 2070 dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1970 if (!nested_vmcb) 2105 if (!nested_vmcb)
1971 return 1; 2106 return 1;
1972 2107
1973 /* Exit nested SVM mode */ 2108 /* Exit Guest-Mode */
2109 leave_guest_mode(&svm->vcpu);
1974 svm->nested.vmcb = 0; 2110 svm->nested.vmcb = 0;
1975 2111
1976 /* Give the current vmcb to the guest */ 2112 /* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1984 nested_vmcb->save.idtr = vmcb->save.idtr; 2120 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer; 2121 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 2122 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 2123 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
1988 nested_vmcb->save.cr2 = vmcb->save.cr2; 2124 nested_vmcb->save.cr2 = vmcb->save.cr2;
1989 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2125 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1990 nested_vmcb->save.rflags = vmcb->save.rflags; 2126 nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2061 svm->vmcb->save.cpl = 0; 2197 svm->vmcb->save.cpl = 0;
2062 svm->vmcb->control.exit_int_info = 0; 2198 svm->vmcb->control.exit_int_info = 0;
2063 2199
2200 mark_all_dirty(svm->vmcb);
2201
2064 nested_svm_unmap(page); 2202 nested_svm_unmap(page);
2065 2203
2066 nested_svm_uninit_mmu_context(&svm->vcpu); 2204 nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 			       nested_vmcb->control.event_inj,
 			       nested_vmcb->control.nested_ctl);
 
-	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
-				    nested_vmcb->control.intercept_cr_write,
+	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+				    nested_vmcb->control.intercept_cr >> 16,
 				    nested_vmcb->control.intercept_exceptions,
 				    nested_vmcb->control.intercept);
 
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (npt_enabled)
 		hsave->save.cr3    = vmcb->save.cr3;
 	else
-		hsave->save.cr3    = svm->vcpu.arch.cr3;
+		hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
 
 	copy_vmcb_control_area(hsave, vmcb);
 
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
 
 	/* cache intercepts */
-	svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read;
-	svm->nested.intercept_cr_write   = nested_vmcb->control.intercept_cr_write;
-	svm->nested.intercept_dr_read    = nested_vmcb->control.intercept_dr_read;
-	svm->nested.intercept_dr_write   = nested_vmcb->control.intercept_dr_write;
+	svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
+	svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
 	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
 	svm->nested.intercept            = nested_vmcb->control.intercept;
 
-	force_new_asid(&svm->vcpu);
+	svm_flush_tlb(&svm->vcpu);
 	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 		/* We only want the cr8 intercept bits of the guest */
-		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
-		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+		clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 	}
 
 	/* We don't want to see VMMCALLs from a nested guest */
-	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
-	/*
-	 * We don't want a nested guest to be more powerful than the guest, so
-	 * all intercepts are ORed
-	 */
-	svm->vmcb->control.intercept_cr_read |=
-		nested_vmcb->control.intercept_cr_read;
-	svm->vmcb->control.intercept_cr_write |=
-		nested_vmcb->control.intercept_cr_write;
-	svm->vmcb->control.intercept_dr_read |=
-		nested_vmcb->control.intercept_dr_read;
-	svm->vmcb->control.intercept_dr_write |=
-		nested_vmcb->control.intercept_dr_write;
-	svm->vmcb->control.intercept_exceptions |=
-		nested_vmcb->control.intercept_exceptions;
-
-	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+	clr_intercept(svm, INTERCEPT_VMMCALL);
 
 	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	nested_svm_unmap(page);
 
-	/* nested_vmcb is our indicator if nested SVM is activated */
+	/* Enter Guest-Mode */
+	enter_guest_mode(&svm->vcpu);
+
+	/*
+	 * Merge guest and host intercepts - must be called with vcpu in
+	 * guest-mode to take effect here
+	 */
+	recalc_intercepts(svm);
+
 	svm->nested.vmcb = vmcb_gpa;
 
 	enable_gif(svm);
 
+	mark_all_dirty(svm->vmcb);
+
 	return true;
 }
 
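recalc_intercepts() is what replaces the open-coded OR block deleted above: while the vcpu is in guest-mode, the effective intercept masks become the union of the host (hsave) and nested-guest intercepts, so the nested guest can never be less intercepted than its host allows. A sketch consistent with the accessors used in this diff (field names assumed from the surrounding code):

static void recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_control_area *c, *h;
	struct nested_state *g;

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->nested.hsave->control;
	g = &svm->nested;

	/* Never let the nested guest drop an intercept the host wants */
	c->intercept_cr         = h->intercept_cr | g->intercept_cr;
	c->intercept_dr         = h->intercept_dr | g->intercept_dr;
	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
	c->intercept            = h->intercept | g->intercept;
}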
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 
+	mark_dirty(svm->vmcb, VMCB_INTR);
+
 	return 1;
 }
 
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+	u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+	u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+	if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+		svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+		skip_emulated_instruction(&svm->vcpu);
+	}
+
+	return 1;
+}
+
 static int invalid_op_interception(struct vcpu_svm *svm)
 {
 	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
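The hard-coded "+ 3" in xsetbv_interception() works because XSETBV has a fixed three-byte encoding (0F 01 D1); ECX selects the XCR and EDX:EAX carry the 64-bit value, mirroring WRMSR. kvm_read_edx_eax() presumably assembles that register pair along these lines (a sketch, not quoted from the tree):

/* Assumed helper: glue EDX:EAX into one 64-bit value, as for WRMSR */
static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
{
	return (u64)(u32)kvm_register_read(vcpu, VCPU_REGS_RAX) |
	       ((u64)(u32)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32);
}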
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
 static int iret_interception(struct vcpu_svm *svm)
 {
 	++svm->vcpu.stat.nmi_window_exits;
-	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+	clr_intercept(svm, INTERCEPT_IRET);
 	svm->vcpu.arch.hflags |= HF_IRET_MASK;
 	return 1;
 }
 
 static int invlpg_interception(struct vcpu_svm *svm)
 {
-	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+		return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+	skip_emulated_instruction(&svm->vcpu);
+	return 1;
 }
 
 static int emulate_on_interception(struct vcpu_svm *svm)
 {
-	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+	int reg, cr;
+	unsigned long val;
+	int err;
+
+	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+		return emulate_on_interception(svm);
+
+	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+		return emulate_on_interception(svm);
+
+	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+	cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+	err = 0;
+	if (cr >= 16) { /* mov to cr */
+		cr -= 16;
+		val = kvm_register_read(&svm->vcpu, reg);
+		switch (cr) {
+		case 0:
+			err = kvm_set_cr0(&svm->vcpu, val);
+			break;
+		case 3:
+			err = kvm_set_cr3(&svm->vcpu, val);
+			break;
+		case 4:
+			err = kvm_set_cr4(&svm->vcpu, val);
+			break;
+		case 8:
+			err = kvm_set_cr8(&svm->vcpu, val);
+			break;
+		default:
+			WARN(1, "unhandled write to CR%d", cr);
+			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+			return 1;
+		}
+	} else { /* mov from cr */
+		switch (cr) {
+		case 0:
+			val = kvm_read_cr0(&svm->vcpu);
+			break;
+		case 2:
+			val = svm->vcpu.arch.cr2;
+			break;
+		case 3:
+			val = kvm_read_cr3(&svm->vcpu);
+			break;
+		case 4:
+			val = kvm_read_cr4(&svm->vcpu);
+			break;
+		case 8:
+			val = kvm_get_cr8(&svm->vcpu);
+			break;
+		default:
+			WARN(1, "unhandled read from CR%d", cr);
+			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+			return 1;
+		}
+		kvm_register_write(&svm->vcpu, reg, val);
+	}
+	kvm_complete_insn_gp(&svm->vcpu, err);
+
+	return 1;
 }
 
 static int cr0_write_interception(struct vcpu_svm *svm)
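cr_interception() leans on the Decode Assists feature: on a CR intercept the hardware deposits the moved GPR number in EXITINFO1 (bit 63, CR_VALID above, flags the field as valid), and the direction is encoded in the exit code itself, since the write exits sit sixteen entries above the read exits. An illustrative restatement of the decode performed above, under the same SVM_EXIT_READ_CR0 layout:

static void decode_cr_exit(u32 exit_code, u64 exit_info_1,
			   int *cr, int *reg, bool *write)
{
	*write = (exit_code - SVM_EXIT_READ_CR0) >= 16;	/* writes follow reads */
	*cr    = (exit_code - SVM_EXIT_READ_CR0) & 15;	/* CR number */
	*reg   = exit_info_1 & SVM_EXITINFO_REG_MASK;	/* GPR operand */
}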
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	int r;
 
-	r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+	r = cr_interception(svm);
 
 	if (svm->nested.vmexit_rip) {
 		kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
 		svm->nested.vmexit_rip = 0;
 	}
 
-	return r == EMULATE_DONE;
+	return r;
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+	int reg, dr;
+	unsigned long val;
+	int err;
+
+	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+		return emulate_on_interception(svm);
+
+	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+	if (dr >= 16) { /* mov to DRn */
+		val = kvm_register_read(&svm->vcpu, reg);
+		kvm_set_dr(&svm->vcpu, dr - 16, val);
+	} else {
+		err = kvm_get_dr(&svm->vcpu, dr, &val);
+		if (!err)
+			kvm_register_write(&svm->vcpu, reg, val);
+	}
+
+	return 1;
 }
 
 static int cr8_write_interception(struct vcpu_svm *svm)
 {
 	struct kvm_run *kvm_run = svm->vcpu.run;
+	int r;
 
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
-	emulate_instruction(&svm->vcpu, 0, 0, 0);
+	r = cr_interception(svm);
 	if (irqchip_in_kernel(svm->vcpu.kvm)) {
-		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
-		return 1;
+		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+		return r;
 	}
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
-		return 1;
+		return r;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
 	return 0;
 }
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
 	switch (ecx) {
 	case MSR_IA32_TSC: {
-		u64 tsc_offset;
+		struct vmcb *vmcb = get_host_vmcb(svm);
 
-		if (is_nested(svm))
-			tsc_offset = svm->nested.hsave->control.tsc_offset;
-		else
-			tsc_offset = svm->vmcb->control.tsc_offset;
-
-		*data = tsc_offset + native_read_tsc();
+		*data = vmcb->control.tsc_offset + native_read_tsc();
 		break;
 	}
 	case MSR_STAR:
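get_host_vmcb() hides the is_nested() special case the old code spelled out: while the vcpu is running a nested guest, L1 state such as the TSC offset lives in the hsave area rather than in the active VMCB. A sketch of the accessor as this series presumably defines it:

static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
{
	if (is_guest_mode(&svm->vcpu))
		return svm->nested.hsave;	/* L1 state while L2 runs */
	else
		return svm->vmcb;
}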
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		svm->vmcb->save.sysenter_esp = data;
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
-		if (!svm_has(SVM_FEATURE_LBRV)) {
+		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
 			pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
 				  __func__, data);
 			break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 			return 1;
 
 		svm->vmcb->save.dbgctl = data;
+		mark_dirty(svm->vmcb, VMCB_LBR);
 		if (data & (1ULL<<0))
 			svm_enable_lbrv(svm);
 		else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+	mark_dirty(svm->vmcb, VMCB_INTR);
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
 }
 
 static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
-	[SVM_EXIT_READ_CR0] = emulate_on_interception,
-	[SVM_EXIT_READ_CR3] = emulate_on_interception,
-	[SVM_EXIT_READ_CR4] = emulate_on_interception,
-	[SVM_EXIT_READ_CR8] = emulate_on_interception,
+	[SVM_EXIT_READ_CR0] = cr_interception,
+	[SVM_EXIT_READ_CR3] = cr_interception,
+	[SVM_EXIT_READ_CR4] = cr_interception,
+	[SVM_EXIT_READ_CR8] = cr_interception,
 	[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
 	[SVM_EXIT_WRITE_CR0] = cr0_write_interception,
-	[SVM_EXIT_WRITE_CR3] = emulate_on_interception,
-	[SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+	[SVM_EXIT_WRITE_CR3] = cr_interception,
+	[SVM_EXIT_WRITE_CR4] = cr_interception,
 	[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
-	[SVM_EXIT_READ_DR0] = emulate_on_interception,
-	[SVM_EXIT_READ_DR1] = emulate_on_interception,
-	[SVM_EXIT_READ_DR2] = emulate_on_interception,
-	[SVM_EXIT_READ_DR3] = emulate_on_interception,
-	[SVM_EXIT_READ_DR4] = emulate_on_interception,
-	[SVM_EXIT_READ_DR5] = emulate_on_interception,
-	[SVM_EXIT_READ_DR6] = emulate_on_interception,
-	[SVM_EXIT_READ_DR7] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR0] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR1] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR2] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR4] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR6] = emulate_on_interception,
-	[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
+	[SVM_EXIT_READ_DR0] = dr_interception,
+	[SVM_EXIT_READ_DR1] = dr_interception,
+	[SVM_EXIT_READ_DR2] = dr_interception,
+	[SVM_EXIT_READ_DR3] = dr_interception,
+	[SVM_EXIT_READ_DR4] = dr_interception,
+	[SVM_EXIT_READ_DR5] = dr_interception,
+	[SVM_EXIT_READ_DR6] = dr_interception,
+	[SVM_EXIT_READ_DR7] = dr_interception,
+	[SVM_EXIT_WRITE_DR0] = dr_interception,
+	[SVM_EXIT_WRITE_DR1] = dr_interception,
+	[SVM_EXIT_WRITE_DR2] = dr_interception,
+	[SVM_EXIT_WRITE_DR3] = dr_interception,
+	[SVM_EXIT_WRITE_DR4] = dr_interception,
+	[SVM_EXIT_WRITE_DR5] = dr_interception,
+	[SVM_EXIT_WRITE_DR6] = dr_interception,
+	[SVM_EXIT_WRITE_DR7] = dr_interception,
 	[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
 	[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_WBINVD] = emulate_on_interception,
 	[SVM_EXIT_MONITOR] = invalid_op_interception,
 	[SVM_EXIT_MWAIT] = invalid_op_interception,
+	[SVM_EXIT_XSETBV] = xsetbv_interception,
 	[SVM_EXIT_NPF] = pf_interception,
 };
 
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
 	struct vmcb_save_area *save = &svm->vmcb->save;
 
 	pr_err("VMCB Control Area:\n");
-	pr_err("cr_read: %04x\n", control->intercept_cr_read);
-	pr_err("cr_write: %04x\n", control->intercept_cr_write);
-	pr_err("dr_read: %04x\n", control->intercept_dr_read);
-	pr_err("dr_write: %04x\n", control->intercept_dr_write);
+	pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
+	pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
+	pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
+	pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
 	pr_err("exceptions: %08x\n", control->intercept_exceptions);
 	pr_err("intercepts: %016llx\n", control->intercept);
 	pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
 
 }
 
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+	*info1 = control->exit_info_1;
+	*info2 = control->exit_info_2;
+}
+
 static int handle_exit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_run *kvm_run = vcpu->run;
 	u32 exit_code = svm->vmcb->control.exit_code;
 
-	trace_kvm_exit(exit_code, vcpu);
+	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
 
-	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
 		vcpu->arch.cr0 = svm->vmcb->save.cr0;
 	if (npt_enabled)
 		vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (is_nested(svm)) {
+	if (is_guest_mode(vcpu)) {
 		int vmexit;
 
 		trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
 
 	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
-	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 	/* FIXME: handle wraparound of asid_generation */
 	if (svm->asid_generation != sd->asid_generation)
 		new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
 	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
 	vcpu->arch.hflags |= HF_NMI_MASK;
-	svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+	set_intercept(svm, INTERCEPT_IRET);
 	++vcpu->stat.nmi_injections;
 }
 
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 	control->int_ctl &= ~V_INTR_PRIO_MASK;
 	control->int_ctl |= V_IRQ_MASK |
 		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+	mark_dirty(svm->vmcb, VMCB_INTR);
 }
 
 static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
 	if (irr == -1)
 		return;
 
 	if (tpr >= irr)
-		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 	if (masked) {
 		svm->vcpu.arch.hflags |= HF_NMI_MASK;
-		svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+		set_intercept(svm, INTERCEPT_IRET);
 	} else {
 		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+		clr_intercept(svm, INTERCEPT_IRET);
 	}
 }
 
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 
 	ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
 
-	if (is_nested(svm))
+	if (is_guest_mode(vcpu))
 		return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
 
 	return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	force_new_asid(vcpu);
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+	else
+		svm->asid_generation--;
 }
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
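On hardware with FLUSHBYASID, a single tlb_ctl write flushes only this guest's ASID at the next VMRUN; without it, decrementing asid_generation makes pre_svm_run() (above) see a stale generation and call new_asid(), which hands out a fresh ASID or, when the space is exhausted, requests a full flush. A sketch of the existing allocator the fallback path relies on (details may differ from the tree):

static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
	if (sd->next_asid > sd->max_asid) {
		/* ASID space exhausted: recycle and flush everything */
		++sd->asid_generation;
		sd->next_asid = 1;
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
	}

	svm->asid_generation = sd->asid_generation;
	svm->vmcb->control.asid = sd->next_asid++;
}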
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
-	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+	if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
 		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
 		kvm_set_cr8(vcpu, cr8);
 	}
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 cr8;
 
-	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
 	cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	u16 fs_selector;
-	u16 gs_selector;
-	u16 ldt_selector;
 
 	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
 	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	sync_lapic_to_cr8(vcpu);
 
-	save_host_msrs(vcpu);
-	savesegment(fs, fs_selector);
-	savesegment(gs, gs_selector);
-	ldt_selector = kvm_read_ldt();
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
 	clgi();
@@ -3389,20 +3635,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	);
 
-	vcpu->arch.cr2 = svm->vmcb->save.cr2;
-	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
-	load_host_msrs(vcpu);
-	loadsegment(fs, fs_selector);
 #ifdef CONFIG_X86_64
-	load_gs_index(gs_selector);
-	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
 #else
-	loadsegment(gs, gs_selector);
+	loadsegment(fs, svm->host.fs);
 #endif
-	kvm_load_ldt(ldt_selector);
 
 	reload_tss(vcpu);
 
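The fs/gs/ldt locals removed in these hunks move into a per-vcpu cache that is presumably filled once on the heavyweight vcpu-load path instead of around every VMRUN. The svm->host.* uses above imply a member shaped roughly like this hypothetical struct (names taken from the diff, layout assumed):

/* Assumed per-vcpu cache of host segment state, saved in svm_vcpu_load() */
struct svm_host_state {
	u16 fs;		/* host %fs selector (restored on 32-bit) */
	u16 gs;		/* host %gs selector */
	u16 ldt;	/* host LDT selector */
	u64 gs_base;	/* host MSR_GS_BASE (restored on 64-bit) */
};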
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	stgi();
 
+	vcpu->arch.cr2 = svm->vmcb->save.cr2;
+	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
 
+	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+	/* if exit due to PF check for async PF */
+	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+		svm->apf_reason = kvm_read_and_reset_pf_reason();
+
 	if (npt_enabled) {
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
 		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->vmcb->control.exit_code ==
 		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
 		svm_handle_mce(svm);
+
+	mark_all_clean(svm->vmcb);
 }
 
 #undef R
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->save.cr3 = root;
-	force_new_asid(vcpu);
+	mark_dirty(svm->vmcb, VMCB_CR);
+	svm_flush_tlb(vcpu);
 }
 
 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.nested_cr3 = root;
+	mark_dirty(svm->vmcb, VMCB_NPT);
 
 	/* Also sync guest cr3 here in case we live migrate */
-	svm->vmcb->save.cr3 = vcpu->arch.cr3;
+	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+	mark_dirty(svm->vmcb, VMCB_CR);
 
-	force_new_asid(vcpu);
+	svm_flush_tlb(vcpu);
 }
 
 static int is_disabled(void)
@@ -3507,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 	   additional features */
 
 	/* Support next_rip if host supports it */
-	if (svm_has(SVM_FEATURE_NRIP))
+	if (boot_cpu_has(X86_FEATURE_NRIPS))
 		entry->edx |= SVM_FEATURE_NRIP;
 
 	/* Support NPT for the guest if enabled */
@@ -3567,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
 	{ SVM_EXIT_WBINVD, "wbinvd" },
 	{ SVM_EXIT_MONITOR, "monitor" },
 	{ SVM_EXIT_MWAIT, "mwait" },
+	{ SVM_EXIT_XSETBV, "xsetbv" },
 	{ SVM_EXIT_NPF, "npf" },
 	{ -1, NULL }
 };
@@ -3590,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
-	if (is_nested(svm))
-		svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+	set_exception_intercept(svm, NM_VECTOR);
 	update_cr0_intercept(svm);
 }
 
@@ -3623,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_cpl = svm_get_cpl,
 	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
 	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+	.decache_cr3 = svm_decache_cr3,
 	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
 	.set_cr0 = svm_set_cr0,
 	.set_cr3 = svm_set_cr3,
@@ -3663,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_tdp_level = get_npt_level,
 	.get_mt_mask = svm_get_mt_mask,
 
+	.get_exit_info = svm_get_exit_info,
 	.exit_reasons_str = svm_exit_reasons_str,
+
 	.get_lpage_level = svm_get_lpage_level,
 
 	.cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
 #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
 #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
 
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
 /*
  * Tracepoint for kvm guest exit:
  */
 TRACE_EVENT(kvm_exit,
-	TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
-	TP_ARGS(exit_reason, vcpu),
+	TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+	TP_ARGS(exit_reason, vcpu, isa),
 
 	TP_STRUCT__entry(
 		__field( unsigned int, exit_reason )
 		__field( unsigned long, guest_rip )
+		__field( u32, isa )
+		__field( u64, info1 )
+		__field( u64, info2 )
 	),
 
 	TP_fast_assign(
 		__entry->exit_reason = exit_reason;
 		__entry->guest_rip = kvm_rip_read(vcpu);
+		__entry->isa = isa;
+		kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+					   &__entry->info2);
 	),
 
-	TP_printk("reason %s rip 0x%lx",
+	TP_printk("reason %s rip 0x%lx info %llx %llx",
 		  ftrace_print_symbols_seq(p, __entry->exit_reason,
 					   kvm_x86_ops->exit_reasons_str),
-		  __entry->guest_rip)
+		  __entry->guest_rip, __entry->info1, __entry->info2)
 );
 
 /*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8da0e45ff7c9..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static int __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 
+static bool cpu_has_load_ia32_efer;
+
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
 
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
 	u8 error;
 
 	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
-		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 		      : "cc", "memory");
 	if (error)
 		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
 	u8 error;
 
 	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 			: "cc", "memory");
 	if (error)
 		printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 
 static unsigned long vmcs_readl(unsigned long field)
 {
-	unsigned long value;
+	unsigned long value = 0;
 
 	asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-		      : "=a"(value) : "d"(field) : "cc");
+		      : "+a"(value) : "d"(field) : "cc");
 	return value;
 }
 
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 	unsigned i;
 	struct msr_autoload *m = &vmx->msr_autoload;
 
+	if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+		vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+		vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+		return;
+	}
+
 	for (i = 0; i < m->nr; ++i)
 		if (m->guest[i].index == msr)
 			break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 	unsigned i;
 	struct msr_autoload *m = &vmx->msr_autoload;
 
+	if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+		vmcs_write64(GUEST_IA32_EFER, guest_val);
+		vmcs_write64(HOST_IA32_EFER, host_val);
+		vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+		vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+		return;
+	}
+
 	for (i = 0; i < m->nr; ++i)
 		if (m->guest[i].index == msr)
 			break;
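Both functions walk the same bookkeeping for the VMCS MSR-load/store areas; when the CPU can load IA32_EFER natively at entry/exit, the dedicated VMCS fields replace an autoload slot entirely. Based on the fields touched above, the structure is presumably shaped like this (NR_AUTOLOAD_MSRS and the entry layout are assumptions):

/* Layout implied by the m->nr / m->guest[i] / m->host[i] accesses above */
struct vmx_msr_entry {
	u32 index;
	u32 reserved;
	u64 value;
} __aligned(16);

struct msr_autoload {
	unsigned nr;
	struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
	struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
};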
@@ -821,10 +841,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 #endif
 
 #ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu)) {
-		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+	if (is_long_mode(&vmx->vcpu))
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-	}
 #endif
 	for (i = 0; i < vmx->save_nmsrs; ++i)
 		kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -839,23 +858,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 
 	++vmx->vcpu.stat.host_state_reload;
 	vmx->host_state.loaded = 0;
-	if (vmx->host_state.fs_reload_needed)
-		loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+	if (is_long_mode(&vmx->vcpu))
+		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
 	if (vmx->host_state.gs_ldt_reload_needed) {
 		kvm_load_ldt(vmx->host_state.ldt_sel);
 #ifdef CONFIG_X86_64
 		load_gs_index(vmx->host_state.gs_sel);
-		wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
 #else
 		loadsegment(gs, vmx->host_state.gs_sel);
 #endif
 	}
+	if (vmx->host_state.fs_reload_needed)
+		loadsegment(fs, vmx->host_state.fs_sel);
 	reload_tss();
 #ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu)) {
-		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
-	}
+	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
 	if (current_thread_info()->status & TS_USEDFPU)
 		clts();
@@ -1010,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+	/* Ensure that we clear the HLT state in the VMCS. We don't need to
+	 * explicitly skip the instruction because if the HLT state is set, then
+	 * the instruction is already executing and RIP has already been
+	 * advanced. */
+	if (!yield_on_hlt &&
+	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code,
 				bool reinject)
@@ -1036,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+	vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -1306,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
 		    && tboot_enabled())
 			return 1;
 		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
-		    && !tboot_enabled())
+		    && !tboot_enabled()) {
+			printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+			       "activate TXT before enabling KVM\n");
 			return 1;
+		}
 	}
 
 	return 0;
@@ -1401,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
 	return 0;
 }
 
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+	u32 vmx_msr_low, vmx_msr_high;
+
+	rdmsr(msr, vmx_msr_low, vmx_msr_high);
+	return vmx_msr_high & ctl;
+}
+
 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 {
 	u32 vmx_msr_low, vmx_msr_high;
@@ -1417,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			     &_pin_based_exec_control) < 0)
 		return -EIO;
 
-	min = CPU_BASED_HLT_EXITING |
+	min =
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
 	      CPU_BASED_CR8_STORE_EXITING |
@@ -1430,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_MWAIT_EXITING |
 	      CPU_BASED_MONITOR_EXITING |
 	      CPU_BASED_INVLPG_EXITING;
+
+	if (yield_on_hlt)
+		min |= CPU_BASED_HLT_EXITING;
+
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1511,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	vmcs_conf->vmexit_ctrl = _vmexit_control;
 	vmcs_conf->vmentry_ctrl = _vmentry_control;
 
+	cpu_has_load_ia32_efer =
+		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+				VM_ENTRY_LOAD_IA32_EFER)
+		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+				   VM_EXIT_LOAD_IA32_EFER);
+
 	return 0;
 }
 
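allow_1_setting() works because each VMX capability MSR packs the allowed 0-settings into its low 32 bits and the allowed 1-settings into its high 32 bits; a control may be enabled only if its bit is set in the high half. The same computation as above, open-coded for the EFER case purely for illustration:

static __init bool cpu_can_autoload_efer(void)
{
	u32 lo, hi;

	rdmsr(MSR_IA32_VMX_ENTRY_CTLS, lo, hi);
	if (!(hi & VM_ENTRY_LOAD_IA32_EFER))	/* entry control settable? */
		return false;

	rdmsr(MSR_IA32_VMX_EXIT_CTLS, lo, hi);
	return hi & VM_EXIT_LOAD_IA32_EFER;	/* exit control settable? */
}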
@@ -1684,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
 	save->limit = vmcs_read32(sf->limit);
 	save->ar = vmcs_read32(sf->ar_bytes);
 	vmcs_write16(sf->selector, save->base >> 4);
-	vmcs_write32(sf->base, save->base & 0xfffff);
+	vmcs_write32(sf->base, save->base & 0xffff0);
 	vmcs_write32(sf->limit, 0xffff);
 	vmcs_write32(sf->ar_bytes, 0xf3);
+	if (save->base & 0xf)
+		printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+			    " aligned when entering protected mode (seg=%d)\n",
+			    seg);
 }
 
 static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1815,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
 }
 
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+	if (enable_ept && is_paging(vcpu))
+		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
 	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
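decache_cr3 makes CR3 a lazily read register: vmx_vcpu_run() clears VCPU_EXREG_CR3 in regs_avail, and the first kvm_read_cr3() after an exit calls back into vendor code to fetch GUEST_CR3, replacing the unconditional sync removed from vmx_handle_exit() below. The common accessor presumably looks like this sketch:

/* Assumed accessor pairing with the .decache_cr3 callback registered below */
static inline unsigned long kvm_read_cr3(struct kvm_vcpu *vcpu)
{
	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
		kvm_x86_ops->decache_cr3(vcpu);
	return vcpu->arch.cr3;
}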
@@ -1858,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 					unsigned long cr0,
 					struct kvm_vcpu *vcpu)
 {
+	vmx_decache_cr3(vcpu);
 	if (!(cr0 & X86_CR0_PG)) {
 		/* From paging/starting to nonpaging */
 		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1938,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (enable_ept) {
 		eptp = construct_eptp(cr3);
 		vmcs_write64(EPT_POINTER, eptp);
-		guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+		guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
 			vcpu->kvm->arch.ept_identity_map_addr;
 		ept_load_pdptrs(vcpu);
 	}
@@ -2726,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_IDTR_BASE, 0);
 	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
 
-	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
@@ -2788,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 		return;
 	}
 
+	if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+		enable_irq_window(vcpu);
+		return;
+	}
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2815,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	} else
 		intr |= INTR_TYPE_EXT_INTR;
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+	vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2842,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+	vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2850,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 		return 0;
 
 	return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-		 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
+		 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+		  | GUEST_INTR_STATE_NMI));
 }
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2911,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 	 * Cause the #SS fault with 0 error code in VM86 mode.
 	 */
 	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-		if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+		if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
 			return 1;
 	/*
 	 * Forward all other exceptions that are valid in real mode.
@@ -3008,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	}
 
 	if (is_invalid_opcode(intr_info)) {
-		er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
 		if (er != EMULATE_DONE)
 			kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
@@ -3027,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 
 		if (kvm_event_needs_reinjection(vcpu))
 			kvm_mmu_unprotect_page_virt(vcpu, cr2);
-		return kvm_mmu_page_fault(vcpu, cr2, error_code);
+		return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
 	}
 
 	if (vmx->rmode.vm86_active &&
@@ -3099,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	++vcpu->stat.io_exits;
 
 	if (string || in)
-		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
 	port = exit_qualification >> 16;
 	size = (exit_qualification & 7) + 1;
@@ -3119,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
-static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
-	if (err)
-		kvm_inject_gp(vcpu, 0);
-	else
-		skip_emulated_instruction(vcpu);
-}
-
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -3144,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		switch (cr) {
 		case 0:
 			err = kvm_set_cr0(vcpu, val);
-			complete_insn_gp(vcpu, err);
+			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
 			err = kvm_set_cr3(vcpu, val);
-			complete_insn_gp(vcpu, err);
+			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 4:
 			err = kvm_set_cr4(vcpu, val);
-			complete_insn_gp(vcpu, err);
+			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 8: {
 			u8 cr8_prev = kvm_get_cr8(vcpu);
 			u8 cr8 = kvm_register_read(vcpu, reg);
-			kvm_set_cr8(vcpu, cr8);
-			skip_emulated_instruction(vcpu);
+			err = kvm_set_cr8(vcpu, cr8);
+			kvm_complete_insn_gp(vcpu, err);
 			if (irqchip_in_kernel(vcpu->kvm))
 				return 1;
 			if (cr8_prev <= cr8)
@@ -3177,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	case 1: /*mov from cr*/
 		switch (cr) {
 		case 3:
-			kvm_register_write(vcpu, reg, vcpu->arch.cr3);
-			trace_kvm_cr_read(cr, vcpu->arch.cr3);
+			val = kvm_read_cr3(vcpu);
+			kvm_register_write(vcpu, reg, val);
+			trace_kvm_cr_read(cr, val);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
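For comparison with the SVM decode-assist path earlier, handle_cr() gets everything from the exit qualification: the Intel SDM defines the CR number in bits 3:0, the access type in bits 5:4, and the source/destination GPR in bits 11:8. An illustrative decode matching the switches above:

static void decode_cr_qualification(unsigned long qual,
				    int *cr, int *access_type, int *reg)
{
	*cr = qual & 15;			/* which control register */
	*access_type = (qual >> 4) & 3;		/* 0 = mov to CR, 1 = mov from CR,
						 * 2 = CLTS, 3 = LMSW */
	*reg = (qual >> 8) & 15;		/* GPR operand */
}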
@@ -3350,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3378,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
-	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
 static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3477,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
-	return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
 }
 
 static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3593,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 		       && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
 			return handle_interrupt_window(&vmx->vcpu);
 
-		err = emulate_instruction(vcpu, 0, 0, 0);
+		err = emulate_instruction(vcpu, 0);
 
 		if (err == EMULATE_DO_MMIO) {
 			ret = 0;
@@ -3650,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
 	[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
 	[EXIT_REASON_HLT] = handle_halt,
+	[EXIT_REASON_INVD] = handle_invd,
 	[EXIT_REASON_INVLPG] = handle_invlpg,
 	[EXIT_REASON_VMCALL] = handle_vmcall,
 	[EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3677,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+	*info1 = vmcs_readl(EXIT_QUALIFICATION);
+	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
 /*
  * The guest has exited. See if we can fix it or if we need userspace
  * assistance.
@@ -3687,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, vcpu);
+	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
 
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
-	/* Access CR3 don't cause VMExit in paging mode, so we need
-	 * to sync with guest real CR3. */
-	if (enable_ept && is_paging(vcpu))
-		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4014,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	      );
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-				  | (1 << VCPU_EXREG_PDPTR));
+				  | (1 << VCPU_EXREG_PDPTR)
+				  | (1 << VCPU_EXREG_CR3));
 	vcpu->arch.regs_dirty = 0;
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4228,11 +4300,6 @@ static int vmx_get_lpage_level(void)
 		return PT_PDPE_LEVEL;
 }
 
-static inline u32 bit(int bitno)
-{
-	return 1 << (bitno & 31);
-}
-
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -4286,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_cpl = vmx_get_cpl,
 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
 	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+	.decache_cr3 = vmx_decache_cr3,
 	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
 	.set_cr0 = vmx_set_cr0,
 	.set_cr3 = vmx_set_cr3,
@@ -4326,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_tdp_level = get_ept_level,
 	.get_mt_mask = vmx_get_mt_mask,
 
+	.get_exit_info = vmx_get_exit_info,
 	.exit_reasons_str = vmx_exit_reasons_str,
+
 	.get_lpage_level = vmx_get_lpage_level,
 
 	.cpuid_update = vmx_cpuid_update,
@@ -4402,8 +4472,6 @@ static int __init vmx_init(void)
 
 	if (enable_ept) {
 		bypass_guest_pf = 0;
-		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
-			VMX_EPT_WRITABLE_MASK);
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
 		kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdac9e592aa5..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -155,9 +156,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
155 156
156u64 __read_mostly host_xcr0; 157u64 __read_mostly host_xcr0;
157 158
158static inline u32 bit(int bitno) 159static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
159{ 160{
160 return 1 << (bitno & 31); 161 int i;
162 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
163 vcpu->arch.apf.gfns[i] = ~0;
161} 164}
162 165
163static void kvm_on_user_return(struct user_return_notifier *urn) 166static void kvm_on_user_return(struct user_return_notifier *urn)
@@ -331,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
331} 334}
332EXPORT_SYMBOL_GPL(kvm_requeue_exception); 335EXPORT_SYMBOL_GPL(kvm_requeue_exception);
333 336
334void kvm_inject_page_fault(struct kvm_vcpu *vcpu) 337void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
335{ 338{
336 unsigned error_code = vcpu->arch.fault.error_code; 339 if (err)
340 kvm_inject_gp(vcpu, 0);
341 else
342 kvm_x86_ops->skip_emulated_instruction(vcpu);
343}
344EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
337 345
346void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347{
338 ++vcpu->stat.pf_guest; 348 ++vcpu->stat.pf_guest;
339 vcpu->arch.cr2 = vcpu->arch.fault.address; 349 vcpu->arch.cr2 = fault->address;
340 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 350 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
341} 351}
342 352
343void kvm_propagate_fault(struct kvm_vcpu *vcpu) 353void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
344{ 354{
345 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) 355 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
346 vcpu->arch.nested_mmu.inject_page_fault(vcpu); 356 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
347 else 357 else
348 vcpu->arch.mmu.inject_page_fault(vcpu); 358 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
349
350 vcpu->arch.fault.nested = false;
351} 359}
352 360
353void kvm_inject_nmi(struct kvm_vcpu *vcpu) 361void kvm_inject_nmi(struct kvm_vcpu *vcpu)
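A likely usage pattern for the new kvm_complete_insn_gp() helper introduced above, sketched as a hypothetical control-register intercept handler. The handler name is illustrative and the real call sites are wired up in later patches of this series; only the two declarations mirror signatures visible in this diff.

/* Kernel-context sketch; declarations mirror the patch above. */
struct kvm_vcpu;
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);

/* Hypothetical intercept handler (not part of this patch): pass the
 * result of the CR3 write on, so the helper either injects #GP(0) on
 * failure or skips the emulated instruction on success. */
static int handle_cr3_write_sketch(struct kvm_vcpu *vcpu, unsigned long val)
{
	kvm_complete_insn_gp(vcpu, kvm_set_cr3(vcpu, val));
	return 1;	/* resume the guest */
}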
@@ -465,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
465 (unsigned long *)&vcpu->arch.regs_avail)) 473 (unsigned long *)&vcpu->arch.regs_avail))
466 return true; 474 return true;
467 475
468 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; 476 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
469 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); 477 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
470 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), 478 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
471 PFERR_USER_MASK | PFERR_WRITE_MASK); 479 PFERR_USER_MASK | PFERR_WRITE_MASK);
472 if (r < 0) 480 if (r < 0)
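For context, the gfn/offset split above works because the PAE PDPT is 32-byte aligned (four 8-byte entries), so the low five CR3 bits are masked off before the address is divided into a frame number and an in-page offset. A minimal stand-alone sketch of that arithmetic, assuming 4 KiB pages and an arbitrary example CR3 value:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t cr3 = 0x12345020;                  /* illustrative value   */
	uint64_t pdpt = cr3 & ~(uint64_t)31;        /* 32-byte aligned PDPT */
	uint64_t gfn = pdpt >> PAGE_SHIFT;          /* guest frame number   */
	uint64_t offset = pdpt & (PAGE_SIZE - 1);   /* offset within page   */

	printf("gfn=0x%llx offset=0x%llx\n",
	       (unsigned long long)gfn, (unsigned long long)offset);
	return 0;
}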
@@ -511,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
511 } else 519 } else
512#endif 520#endif
513 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 521 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
514 vcpu->arch.cr3)) 522 kvm_read_cr3(vcpu)))
515 return 1; 523 return 1;
516 } 524 }
517 525
518 kvm_x86_ops->set_cr0(vcpu, cr0); 526 kvm_x86_ops->set_cr0(vcpu, cr0);
519 527
528 if ((cr0 ^ old_cr0) & X86_CR0_PG)
529 kvm_clear_async_pf_completion_queue(vcpu);
530
520 if ((cr0 ^ old_cr0) & update_bits) 531 if ((cr0 ^ old_cr0) & update_bits)
521 kvm_mmu_reset_context(vcpu); 532 kvm_mmu_reset_context(vcpu);
522 return 0; 533 return 0;
@@ -600,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
600 return 1; 611 return 1;
601 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
602 && ((cr4 ^ old_cr4) & pdptr_bits) 613 && ((cr4 ^ old_cr4) & pdptr_bits)
603 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
615 kvm_read_cr3(vcpu)))
604 return 1; 616 return 1;
605 617
606 if (cr4 & X86_CR4_VMXE) 618 if (cr4 & X86_CR4_VMXE)
@@ -620,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
620 632
621int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 633int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
622{ 634{
623 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
624 kvm_mmu_sync_roots(vcpu); 636 kvm_mmu_sync_roots(vcpu);
625 kvm_mmu_flush_tlb(vcpu); 637 kvm_mmu_flush_tlb(vcpu);
626 return 0; 638 return 0;
@@ -655,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
655 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
656 return 1; 668 return 1;
657 vcpu->arch.cr3 = cr3; 669 vcpu->arch.cr3 = cr3;
670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
658 vcpu->arch.mmu.new_cr3(vcpu); 671 vcpu->arch.mmu.new_cr3(vcpu);
659 return 0; 672 return 0;
660} 673}
661EXPORT_SYMBOL_GPL(kvm_set_cr3); 674EXPORT_SYMBOL_GPL(kvm_set_cr3);
662 675
663int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 676int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
664{ 677{
665 if (cr8 & CR8_RESERVED_BITS) 678 if (cr8 & CR8_RESERVED_BITS)
666 return 1; 679 return 1;
@@ -670,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
670 vcpu->arch.cr8 = cr8; 683 vcpu->arch.cr8 = cr8;
671 return 0; 684 return 0;
672} 685}
673
674void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
675{
676 if (__kvm_set_cr8(vcpu, cr8))
677 kvm_inject_gp(vcpu, 0);
678}
679EXPORT_SYMBOL_GPL(kvm_set_cr8); 686EXPORT_SYMBOL_GPL(kvm_set_cr8);
680 687
681unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 688unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -780,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
780 * kvm-specific. Those are put in the beginning of the list. 787 * kvm-specific. Those are put in the beginning of the list.
781 */ 788 */
782 789
783#define KVM_SAVE_MSRS_BEGIN 7 790#define KVM_SAVE_MSRS_BEGIN 8
784static u32 msrs_to_save[] = { 791static u32 msrs_to_save[] = {
785 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
786 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
787 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
788 HV_X64_MSR_APIC_ASSIST_PAGE, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
789 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
790 MSR_STAR, 797 MSR_STAR,
791#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
@@ -835,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
835 kvm_x86_ops->set_efer(vcpu, efer); 842 kvm_x86_ops->set_efer(vcpu, efer);
836 843
837 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
838 kvm_mmu_reset_context(vcpu);
839 845
840 /* Update reserved bits */ 846 /* Update reserved bits */
841 if ((efer ^ old_efer) & EFER_NX) 847 if ((efer ^ old_efer) & EFER_NX)
@@ -981,7 +987,7 @@ static inline u64 nsec_to_cycles(u64 nsec)
981 if (kvm_tsc_changes_freq()) 987 if (kvm_tsc_changes_freq())
982 printk_once(KERN_WARNING 988 printk_once(KERN_WARNING
983 "kvm: unreliable cycle conversion on adjustable rate TSC\n"); 989 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
984 ret = nsec * __get_cpu_var(cpu_tsc_khz); 990 ret = nsec * __this_cpu_read(cpu_tsc_khz);
985 do_div(ret, USEC_PER_SEC); 991 do_div(ret, USEC_PER_SEC);
986 return ret; 992 return ret;
987} 993}
@@ -1066,7 +1072,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1066 local_irq_save(flags); 1072 local_irq_save(flags);
1067 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); 1073 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1068 kernel_ns = get_kernel_ns(); 1074 kernel_ns = get_kernel_ns();
1069 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1075 this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
1070 1076
1071 if (unlikely(this_tsc_khz == 0)) { 1077 if (unlikely(this_tsc_khz == 0)) {
1072 local_irq_restore(flags); 1078 local_irq_restore(flags);
@@ -1423,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1423 return 0; 1429 return 0;
1424} 1430}
1425 1431
1432static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1433{
1434 gpa_t gpa = data & ~0x3f;
1435
1436 /* Bits 2:5 are reserved, should be zero */
1437 if (data & 0x3c)
1438 return 1;
1439
1440 vcpu->arch.apf.msr_val = data;
1441
1442 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1443 kvm_clear_async_pf_completion_queue(vcpu);
1444 kvm_async_pf_hash_reset(vcpu);
1445 return 0;
1446 }
1447
1448 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1449 return 1;
1450
1451 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1452 kvm_async_pf_wakeup_all(vcpu);
1453 return 0;
1454}
1455
1426int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1427{ 1457{
1428 switch (msr) { 1458 switch (msr) {
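As a rough guest-side illustration of the MSR layout that kvm_pv_enable_async_pf() validates above: bits 63:6 hold the 64-byte-aligned guest physical address of the shared async-PF word, bit 0 enables the feature, bit 1 asks for delivery even while the guest runs in kernel mode, and bits 5:2 must stay zero (hence the data & 0x3c check). A hedged sketch of composing such a value, with the two flag constants assumed to match KVM_ASYNC_PF_ENABLED and KVM_ASYNC_PF_SEND_ALWAYS from this series:

#include <stdint.h>

/* Assumed flag values; defined in asm/kvm_para.h by this patch series. */
#define KVM_ASYNC_PF_ENABLED     (1ULL << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS (1ULL << 1)

/* Compose the value a guest would write to MSR_KVM_ASYNC_PF_EN. */
uint64_t async_pf_msr_value(uint64_t apf_area_gpa, int send_always)
{
	uint64_t val = apf_area_gpa & ~0x3fULL;    /* 64-byte aligned area */

	val |= KVM_ASYNC_PF_ENABLED;
	if (send_always)
		val |= KVM_ASYNC_PF_SEND_ALWAYS;
	return val;                                /* bits 5:2 remain zero */
}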
@@ -1504,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1504 } 1534 }
1505 break; 1535 break;
1506 } 1536 }
1537 case MSR_KVM_ASYNC_PF_EN:
1538 if (kvm_pv_enable_async_pf(vcpu, data))
1539 return 1;
1540 break;
1507 case MSR_IA32_MCG_CTL: 1541 case MSR_IA32_MCG_CTL:
1508 case MSR_IA32_MCG_STATUS: 1542 case MSR_IA32_MCG_STATUS:
1509 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1543 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1780,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1780 case MSR_KVM_SYSTEM_TIME_NEW: 1814 case MSR_KVM_SYSTEM_TIME_NEW:
1781 data = vcpu->arch.time; 1815 data = vcpu->arch.time;
1782 break; 1816 break;
1817 case MSR_KVM_ASYNC_PF_EN:
1818 data = vcpu->arch.apf.msr_val;
1819 break;
1783 case MSR_IA32_P5_MC_ADDR: 1820 case MSR_IA32_P5_MC_ADDR:
1784 case MSR_IA32_P5_MC_TYPE: 1821 case MSR_IA32_P5_MC_TYPE:
1785 case MSR_IA32_MCG_CAP: 1822 case MSR_IA32_MCG_CAP:
@@ -1909,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1909 case KVM_CAP_NOP_IO_DELAY: 1946 case KVM_CAP_NOP_IO_DELAY:
1910 case KVM_CAP_MP_STATE: 1947 case KVM_CAP_MP_STATE:
1911 case KVM_CAP_SYNC_MMU: 1948 case KVM_CAP_SYNC_MMU:
1949 case KVM_CAP_USER_NMI:
1912 case KVM_CAP_REINJECT_CONTROL: 1950 case KVM_CAP_REINJECT_CONTROL:
1913 case KVM_CAP_IRQ_INJECT_STATUS: 1951 case KVM_CAP_IRQ_INJECT_STATUS:
1914 case KVM_CAP_ASSIGN_DEV_IRQ: 1952 case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1927,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1927 case KVM_CAP_DEBUGREGS: 1965 case KVM_CAP_DEBUGREGS:
1928 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1966 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1929 case KVM_CAP_XSAVE: 1967 case KVM_CAP_XSAVE:
1968 case KVM_CAP_ASYNC_PF:
1930 r = 1; 1969 r = 1;
1931 break; 1970 break;
1932 case KVM_CAP_COALESCED_MMIO: 1971 case KVM_CAP_COALESCED_MMIO:
@@ -2190,6 +2229,11 @@ out:
2190 return r; 2229 return r;
2191} 2230}
2192 2231
2232static void cpuid_mask(u32 *word, int wordnum)
2233{
2234 *word &= boot_cpu_data.x86_capability[wordnum];
2235}
2236
2193static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2237static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2194 u32 index) 2238 u32 index)
2195{ 2239{
@@ -2264,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2264 break; 2308 break;
2265 case 1: 2309 case 1:
2266 entry->edx &= kvm_supported_word0_x86_features; 2310 entry->edx &= kvm_supported_word0_x86_features;
2311 cpuid_mask(&entry->edx, 0);
2267 entry->ecx &= kvm_supported_word4_x86_features; 2312 entry->ecx &= kvm_supported_word4_x86_features;
2313 cpuid_mask(&entry->ecx, 4);
2268 /* we support x2apic emulation even if host does not support 2314 /* we support x2apic emulation even if host does not support
2269 * it since we emulate x2apic in software */ 2315 * it since we emulate x2apic in software */
2270 entry->ecx |= F(X2APIC); 2316 entry->ecx |= F(X2APIC);
@@ -2355,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2355 break; 2401 break;
2356 case 0x80000001: 2402 case 0x80000001:
2357 entry->edx &= kvm_supported_word1_x86_features; 2403 entry->edx &= kvm_supported_word1_x86_features;
2404 cpuid_mask(&entry->edx, 1);
2358 entry->ecx &= kvm_supported_word6_x86_features; 2405 entry->ecx &= kvm_supported_word6_x86_features;
2406 cpuid_mask(&entry->ecx, 6);
2359 break; 2407 break;
2360 } 2408 }
2361 2409
@@ -3174,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3174 struct kvm_memslots *slots, *old_slots; 3222 struct kvm_memslots *slots, *old_slots;
3175 unsigned long *dirty_bitmap; 3223 unsigned long *dirty_bitmap;
3176 3224
3177 r = -ENOMEM; 3225 dirty_bitmap = memslot->dirty_bitmap_head;
3178 dirty_bitmap = vmalloc(n); 3226 if (memslot->dirty_bitmap == dirty_bitmap)
3179 if (!dirty_bitmap) 3227 dirty_bitmap += n / sizeof(long);
3180 goto out;
3181 memset(dirty_bitmap, 0, n); 3228 memset(dirty_bitmap, 0, n);
3182 3229
3183 r = -ENOMEM; 3230 r = -ENOMEM;
3184 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3231 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
3185 if (!slots) { 3232 if (!slots)
3186 vfree(dirty_bitmap);
3187 goto out; 3233 goto out;
3188 }
3189 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3234 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
3190 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3235 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3236 slots->generation++;
3191 3237
3192 old_slots = kvm->memslots; 3238 old_slots = kvm->memslots;
3193 rcu_assign_pointer(kvm->memslots, slots); 3239 rcu_assign_pointer(kvm->memslots, slots);
@@ -3200,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3200 spin_unlock(&kvm->mmu_lock); 3246 spin_unlock(&kvm->mmu_lock);
3201 3247
3202 r = -EFAULT; 3248 r = -EFAULT;
3203 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 3249 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
3204 vfree(dirty_bitmap);
3205 goto out; 3250 goto out;
3206 }
3207 vfree(dirty_bitmap);
3208 } else { 3251 } else {
3209 r = -EFAULT; 3252 r = -EFAULT;
3210 if (clear_user(log->dirty_bitmap, n)) 3253 if (clear_user(log->dirty_bitmap, n))
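The dirty-log change above stops allocating a scratch bitmap with vmalloc() on every KVM_GET_DIRTY_LOG call: dirty_bitmap_head now points at a double-sized buffer and the ioctl flips between its two halves, zeroing the half that is about to become the live bitmap before publishing it in the new memslots copy. A simplified user-space sketch of that ping-pong selection (structure and names are illustrative, not the kernel's):

#include <stddef.h>
#include <string.h>

struct slot_sketch {
	unsigned long *dirty_bitmap;       /* half currently in use       */
	unsigned long *dirty_bitmap_head;  /* start of the 2*n allocation */
};

/* Pick the idle half, clear it, and return it as the next live bitmap. */
unsigned long *flip_dirty_bitmap(struct slot_sketch *s, size_t n)
{
	unsigned long *next = s->dirty_bitmap_head;

	if (s->dirty_bitmap == next)                /* first half is live,    */
		next += n / sizeof(unsigned long);  /* so hand out the second */
	memset(next, 0, n);
	return next;
}

Switching halves lets the ioctl hand the old bitmap to userspace while the guest immediately starts dirtying pages in the freshly zeroed half.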
@@ -3271,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3271 if (vpic) { 3314 if (vpic) {
3272 r = kvm_ioapic_init(kvm); 3315 r = kvm_ioapic_init(kvm);
3273 if (r) { 3316 if (r) {
3317 mutex_lock(&kvm->slots_lock);
3274 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3318 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3275 &vpic->dev); 3319 &vpic->dev);
3320 mutex_unlock(&kvm->slots_lock);
3276 kfree(vpic); 3321 kfree(vpic);
3277 goto create_irqchip_unlock; 3322 goto create_irqchip_unlock;
3278 } 3323 }
@@ -3283,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
3283 smp_wmb(); 3328 smp_wmb();
3284 r = kvm_setup_default_irq_routing(kvm); 3329 r = kvm_setup_default_irq_routing(kvm);
3285 if (r) { 3330 if (r) {
3331 mutex_lock(&kvm->slots_lock);
3286 mutex_lock(&kvm->irq_lock); 3332 mutex_lock(&kvm->irq_lock);
3287 kvm_ioapic_destroy(kvm); 3333 kvm_ioapic_destroy(kvm);
3288 kvm_destroy_pic(kvm); 3334 kvm_destroy_pic(kvm);
3289 mutex_unlock(&kvm->irq_lock); 3335 mutex_unlock(&kvm->irq_lock);
3336 mutex_unlock(&kvm->slots_lock);
3290 } 3337 }
3291 create_irqchip_unlock: 3338 create_irqchip_unlock:
3292 mutex_unlock(&kvm->lock); 3339 mutex_unlock(&kvm->lock);
@@ -3562,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3562static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 3609static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3563{ 3610{
3564 gpa_t t_gpa; 3611 gpa_t t_gpa;
3565 u32 error; 3612 struct x86_exception exception;
3566 3613
3567 BUG_ON(!mmu_is_nested(vcpu)); 3614 BUG_ON(!mmu_is_nested(vcpu));
3568 3615
3569 /* NPT walks are always user-walks */ 3616 /* NPT walks are always user-walks */
3570 access |= PFERR_USER_MASK; 3617 access |= PFERR_USER_MASK;
3571 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); 3618 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3572 if (t_gpa == UNMAPPED_GVA)
3573 vcpu->arch.fault.nested = true;
3574 3619
3575 return t_gpa; 3620 return t_gpa;
3576} 3621}
3577 3622
3578gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3623gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3624 struct x86_exception *exception)
3579{ 3625{
3580 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3626 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3581 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3627 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3582} 3628}
3583 3629
3584 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3630 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3631 struct x86_exception *exception)
3585{ 3632{
3586 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3633 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3587 access |= PFERR_FETCH_MASK; 3634 access |= PFERR_FETCH_MASK;
3588 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3635 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3589} 3636}
3590 3637
3591gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3638gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3639 struct x86_exception *exception)
3592{ 3640{
3593 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3641 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3594 access |= PFERR_WRITE_MASK; 3642 access |= PFERR_WRITE_MASK;
3595 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); 3643 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3596} 3644}
3597 3645
3598/* uses this to access any guest's mapped memory without checking CPL */ 3646/* uses this to access any guest's mapped memory without checking CPL */
3599gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3647gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3648 struct x86_exception *exception)
3600{ 3649{
3601 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); 3650 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3602} 3651}
3603 3652
3604static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3653static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3605 struct kvm_vcpu *vcpu, u32 access, 3654 struct kvm_vcpu *vcpu, u32 access,
3606 u32 *error) 3655 struct x86_exception *exception)
3607{ 3656{
3608 void *data = val; 3657 void *data = val;
3609 int r = X86EMUL_CONTINUE; 3658 int r = X86EMUL_CONTINUE;
3610 3659
3611 while (bytes) { 3660 while (bytes) {
3612 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, 3661 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3613 error); 3662 exception);
3614 unsigned offset = addr & (PAGE_SIZE-1); 3663 unsigned offset = addr & (PAGE_SIZE-1);
3615 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3664 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3616 int ret; 3665 int ret;
3617 3666
3618 if (gpa == UNMAPPED_GVA) { 3667 if (gpa == UNMAPPED_GVA)
3619 r = X86EMUL_PROPAGATE_FAULT; 3668 return X86EMUL_PROPAGATE_FAULT;
3620 goto out;
3621 }
3622 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3669 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3623 if (ret < 0) { 3670 if (ret < 0) {
3624 r = X86EMUL_IO_NEEDED; 3671 r = X86EMUL_IO_NEEDED;
@@ -3635,31 +3682,35 @@ out:
3635 3682
3636/* used for instruction fetching */ 3683/* used for instruction fetching */
3637static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3684static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3638 struct kvm_vcpu *vcpu, u32 *error) 3685 struct kvm_vcpu *vcpu,
3686 struct x86_exception *exception)
3639{ 3687{
3640 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3641 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3689 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3642 access | PFERR_FETCH_MASK, error); 3690 access | PFERR_FETCH_MASK,
3691 exception);
3643} 3692}
3644 3693
3645static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3694static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3646 struct kvm_vcpu *vcpu, u32 *error) 3695 struct kvm_vcpu *vcpu,
3696 struct x86_exception *exception)
3647{ 3697{
3648 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3698 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3649 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3699 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3650 error); 3700 exception);
3651} 3701}
3652 3702
3653static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3703static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3654 struct kvm_vcpu *vcpu, u32 *error) 3704 struct kvm_vcpu *vcpu,
3705 struct x86_exception *exception)
3655{ 3706{
3656 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3707 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3657} 3708}
3658 3709
3659static int kvm_write_guest_virt_system(gva_t addr, void *val, 3710static int kvm_write_guest_virt_system(gva_t addr, void *val,
3660 unsigned int bytes, 3711 unsigned int bytes,
3661 struct kvm_vcpu *vcpu, 3712 struct kvm_vcpu *vcpu,
3662 u32 *error) 3713 struct x86_exception *exception)
3663{ 3714{
3664 void *data = val; 3715 void *data = val;
3665 int r = X86EMUL_CONTINUE; 3716 int r = X86EMUL_CONTINUE;
@@ -3667,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3667 while (bytes) { 3718 while (bytes) {
3668 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 3719 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3669 PFERR_WRITE_MASK, 3720 PFERR_WRITE_MASK,
3670 error); 3721 exception);
3671 unsigned offset = addr & (PAGE_SIZE-1); 3722 unsigned offset = addr & (PAGE_SIZE-1);
3672 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3723 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3673 int ret; 3724 int ret;
3674 3725
3675 if (gpa == UNMAPPED_GVA) { 3726 if (gpa == UNMAPPED_GVA)
3676 r = X86EMUL_PROPAGATE_FAULT; 3727 return X86EMUL_PROPAGATE_FAULT;
3677 goto out;
3678 }
3679 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3728 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3680 if (ret < 0) { 3729 if (ret < 0) {
3681 r = X86EMUL_IO_NEEDED; 3730 r = X86EMUL_IO_NEEDED;
@@ -3693,7 +3742,7 @@ out:
3693static int emulator_read_emulated(unsigned long addr, 3742static int emulator_read_emulated(unsigned long addr,
3694 void *val, 3743 void *val,
3695 unsigned int bytes, 3744 unsigned int bytes,
3696 unsigned int *error_code, 3745 struct x86_exception *exception,
3697 struct kvm_vcpu *vcpu) 3746 struct kvm_vcpu *vcpu)
3698{ 3747{
3699 gpa_t gpa; 3748 gpa_t gpa;
@@ -3706,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
3706 return X86EMUL_CONTINUE; 3755 return X86EMUL_CONTINUE;
3707 } 3756 }
3708 3757
3709 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3758 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
3710 3759
3711 if (gpa == UNMAPPED_GVA) 3760 if (gpa == UNMAPPED_GVA)
3712 return X86EMUL_PROPAGATE_FAULT; 3761 return X86EMUL_PROPAGATE_FAULT;
@@ -3715,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
3715 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3764 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3716 goto mmio; 3765 goto mmio;
3717 3766
3718 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3767 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
3719 == X86EMUL_CONTINUE) 3768 == X86EMUL_CONTINUE)
3720 return X86EMUL_CONTINUE; 3769 return X86EMUL_CONTINUE;
3721 3770
3722mmio: 3771mmio:
@@ -3740,7 +3789,7 @@ mmio:
3740} 3789}
3741 3790
3742int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3791int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3743 const void *val, int bytes) 3792 const void *val, int bytes)
3744{ 3793{
3745 int ret; 3794 int ret;
3746 3795
@@ -3754,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3754static int emulator_write_emulated_onepage(unsigned long addr, 3803static int emulator_write_emulated_onepage(unsigned long addr,
3755 const void *val, 3804 const void *val,
3756 unsigned int bytes, 3805 unsigned int bytes,
3757 unsigned int *error_code, 3806 struct x86_exception *exception,
3758 struct kvm_vcpu *vcpu) 3807 struct kvm_vcpu *vcpu)
3759{ 3808{
3760 gpa_t gpa; 3809 gpa_t gpa;
3761 3810
3762 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3811 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3763 3812
3764 if (gpa == UNMAPPED_GVA) 3813 if (gpa == UNMAPPED_GVA)
3765 return X86EMUL_PROPAGATE_FAULT; 3814 return X86EMUL_PROPAGATE_FAULT;
@@ -3792,7 +3841,7 @@ mmio:
3792int emulator_write_emulated(unsigned long addr, 3841int emulator_write_emulated(unsigned long addr,
3793 const void *val, 3842 const void *val,
3794 unsigned int bytes, 3843 unsigned int bytes,
3795 unsigned int *error_code, 3844 struct x86_exception *exception,
3796 struct kvm_vcpu *vcpu) 3845 struct kvm_vcpu *vcpu)
3797{ 3846{
3798 /* Crossing a page boundary? */ 3847 /* Crossing a page boundary? */
@@ -3800,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
3800 int rc, now; 3849 int rc, now;
3801 3850
3802 now = -addr & ~PAGE_MASK; 3851 now = -addr & ~PAGE_MASK;
3803 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 3852 rc = emulator_write_emulated_onepage(addr, val, now, exception,
3804 vcpu); 3853 vcpu);
3805 if (rc != X86EMUL_CONTINUE) 3854 if (rc != X86EMUL_CONTINUE)
3806 return rc; 3855 return rc;
@@ -3808,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
3808 val += now; 3857 val += now;
3809 bytes -= now; 3858 bytes -= now;
3810 } 3859 }
3811 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 3860 return emulator_write_emulated_onepage(addr, val, bytes, exception,
3812 vcpu); 3861 vcpu);
3813} 3862}
3814 3863
@@ -3826,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3826 const void *old, 3875 const void *old,
3827 const void *new, 3876 const void *new,
3828 unsigned int bytes, 3877 unsigned int bytes,
3829 unsigned int *error_code, 3878 struct x86_exception *exception,
3830 struct kvm_vcpu *vcpu) 3879 struct kvm_vcpu *vcpu)
3831{ 3880{
3832 gpa_t gpa; 3881 gpa_t gpa;
@@ -3884,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3884emul_write: 3933emul_write:
3885 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3934 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3886 3935
3887 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 3936 return emulator_write_emulated(addr, new, bytes, exception, vcpu);
3888} 3937}
3889 3938
3890static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3939static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3909,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3909 if (vcpu->arch.pio.count) 3958 if (vcpu->arch.pio.count)
3910 goto data_avail; 3959 goto data_avail;
3911 3960
3912 trace_kvm_pio(0, port, size, 1); 3961 trace_kvm_pio(0, port, size, count);
3913 3962
3914 vcpu->arch.pio.port = port; 3963 vcpu->arch.pio.port = port;
3915 vcpu->arch.pio.in = 1; 3964 vcpu->arch.pio.in = 1;
@@ -3937,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3937 const void *val, unsigned int count, 3986 const void *val, unsigned int count,
3938 struct kvm_vcpu *vcpu) 3987 struct kvm_vcpu *vcpu)
3939{ 3988{
3940 trace_kvm_pio(1, port, size, 1); 3989 trace_kvm_pio(1, port, size, count);
3941 3990
3942 vcpu->arch.pio.port = port; 3991 vcpu->arch.pio.port = port;
3943 vcpu->arch.pio.in = 0; 3992 vcpu->arch.pio.in = 0;
@@ -3978,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3978 return X86EMUL_CONTINUE; 4027 return X86EMUL_CONTINUE;
3979 4028
3980 if (kvm_x86_ops->has_wbinvd_exit()) { 4029 if (kvm_x86_ops->has_wbinvd_exit()) {
3981 preempt_disable(); 4030 int cpu = get_cpu();
4031
4032 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3982 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4033 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3983 wbinvd_ipi, NULL, 1); 4034 wbinvd_ipi, NULL, 1);
3984 preempt_enable(); 4035 put_cpu();
3985 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4036 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3986 } 4037 } else
3987 wbinvd(); 4038 wbinvd();
3988 return X86EMUL_CONTINUE; 4039 return X86EMUL_CONTINUE;
3989} 4040}
3990EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4041EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4024,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4024 value = vcpu->arch.cr2; 4075 value = vcpu->arch.cr2;
4025 break; 4076 break;
4026 case 3: 4077 case 3:
4027 value = vcpu->arch.cr3; 4078 value = kvm_read_cr3(vcpu);
4028 break; 4079 break;
4029 case 4: 4080 case 4:
4030 value = kvm_read_cr4(vcpu); 4081 value = kvm_read_cr4(vcpu);
@@ -4058,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
4058 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4109 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4059 break; 4110 break;
4060 case 8: 4111 case 8:
4061 res = __kvm_set_cr8(vcpu, val & 0xfUL); 4112 res = kvm_set_cr8(vcpu, val);
4062 break; 4113 break;
4063 default: 4114 default:
4064 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4115 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4211,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4211static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4262static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4212{ 4263{
4213 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4264 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4214 if (ctxt->exception == PF_VECTOR) 4265 if (ctxt->exception.vector == PF_VECTOR)
4215 kvm_propagate_fault(vcpu); 4266 kvm_propagate_fault(vcpu, &ctxt->exception);
4216 else if (ctxt->error_code_valid) 4267 else if (ctxt->exception.error_code_valid)
4217 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4268 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4269 ctxt->exception.error_code);
4218 else 4270 else
4219 kvm_queue_exception(vcpu, ctxt->exception); 4271 kvm_queue_exception(vcpu, ctxt->exception.vector);
4220} 4272}
4221 4273
4222static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4274static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4272,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4272 4324
4273static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4325static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4274{ 4326{
4327 int r = EMULATE_DONE;
4328
4275 ++vcpu->stat.insn_emulation_fail; 4329 ++vcpu->stat.insn_emulation_fail;
4276 trace_kvm_emulate_insn_failed(vcpu); 4330 trace_kvm_emulate_insn_failed(vcpu);
4277 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4331 if (!is_guest_mode(vcpu)) {
4278 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4332 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4279 vcpu->run->internal.ndata = 0; 4333 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4334 vcpu->run->internal.ndata = 0;
4335 r = EMULATE_FAIL;
4336 }
4280 kvm_queue_exception(vcpu, UD_VECTOR); 4337 kvm_queue_exception(vcpu, UD_VECTOR);
4281 return EMULATE_FAIL; 4338
4339 return r;
4282} 4340}
4283 4341
4284static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4342static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4307,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4307 return false; 4365 return false;
4308} 4366}
4309 4367
4310int emulate_instruction(struct kvm_vcpu *vcpu, 4368int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4311 unsigned long cr2, 4369 unsigned long cr2,
4312 u16 error_code, 4370 int emulation_type,
4313 int emulation_type) 4371 void *insn,
4372 int insn_len)
4314{ 4373{
4315 int r; 4374 int r;
4316 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4375 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4328,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4328 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4387 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4329 init_emulate_ctxt(vcpu); 4388 init_emulate_ctxt(vcpu);
4330 vcpu->arch.emulate_ctxt.interruptibility = 0; 4389 vcpu->arch.emulate_ctxt.interruptibility = 0;
4331 vcpu->arch.emulate_ctxt.exception = -1; 4390 vcpu->arch.emulate_ctxt.have_exception = false;
4332 vcpu->arch.emulate_ctxt.perm_ok = false; 4391 vcpu->arch.emulate_ctxt.perm_ok = false;
4333 4392
4334 r = x86_decode_insn(&vcpu->arch.emulate_ctxt); 4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4335 if (r == X86EMUL_PROPAGATE_FAULT) 4394 if (r == X86EMUL_PROPAGATE_FAULT)
4336 goto done; 4395 goto done;
4337 4396
@@ -4394,7 +4453,7 @@ restart:
4394 } 4453 }
4395 4454
4396done: 4455done:
4397 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4456 if (vcpu->arch.emulate_ctxt.have_exception) {
4398 inject_emulated_exception(vcpu); 4457 inject_emulated_exception(vcpu);
4399 r = EMULATE_DONE; 4458 r = EMULATE_DONE;
4400 } else if (vcpu->arch.pio.count) { 4459 } else if (vcpu->arch.pio.count) {
@@ -4418,7 +4477,7 @@ done:
4418 4477
4419 return r; 4478 return r;
4420} 4479}
4421EXPORT_SYMBOL_GPL(emulate_instruction); 4480EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4422 4481
4423int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4482int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4424{ 4483{
@@ -4432,7 +4491,7 @@ EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4432 4491
4433static void tsc_bad(void *info) 4492static void tsc_bad(void *info)
4434{ 4493{
4435 __get_cpu_var(cpu_tsc_khz) = 0; 4494 __this_cpu_write(cpu_tsc_khz, 0);
4436} 4495}
4437 4496
4438static void tsc_khz_changed(void *data) 4497static void tsc_khz_changed(void *data)
@@ -4446,7 +4505,7 @@ static void tsc_khz_changed(void *data)
4446 khz = cpufreq_quick_get(raw_smp_processor_id()); 4505 khz = cpufreq_quick_get(raw_smp_processor_id());
4447 if (!khz) 4506 if (!khz)
4448 khz = tsc_khz; 4507 khz = tsc_khz;
4449 __get_cpu_var(cpu_tsc_khz) = khz; 4508 __this_cpu_write(cpu_tsc_khz, khz);
4450} 4509}
4451 4510
4452static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4511static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4569,9 +4628,11 @@ static void kvm_timer_init(void)
4569#ifdef CONFIG_CPU_FREQ 4628#ifdef CONFIG_CPU_FREQ
4570 struct cpufreq_policy policy; 4629 struct cpufreq_policy policy;
4571 memset(&policy, 0, sizeof(policy)); 4630 memset(&policy, 0, sizeof(policy));
4572 cpufreq_get_policy(&policy, get_cpu()); 4631 cpu = get_cpu();
4632 cpufreq_get_policy(&policy, cpu);
4573 if (policy.cpuinfo.max_freq) 4633 if (policy.cpuinfo.max_freq)
4574 max_tsc_khz = policy.cpuinfo.max_freq; 4634 max_tsc_khz = policy.cpuinfo.max_freq;
4635 put_cpu();
4575#endif 4636#endif
4576 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4637 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4577 CPUFREQ_TRANSITION_NOTIFIER); 4638 CPUFREQ_TRANSITION_NOTIFIER);
@@ -4656,7 +4717,6 @@ int kvm_arch_init(void *opaque)
4656 4717
4657 kvm_x86_ops = ops; 4718 kvm_x86_ops = ops;
4658 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4719 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4659 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4660 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4720 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4661 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4721 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4662 4722
@@ -5119,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5119 vcpu->fpu_active = 0; 5179 vcpu->fpu_active = 0;
5120 kvm_x86_ops->fpu_deactivate(vcpu); 5180 kvm_x86_ops->fpu_deactivate(vcpu);
5121 } 5181 }
5182 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5183 /* Page is swapped out. Do synthetic halt */
5184 vcpu->arch.apf.halted = true;
5185 r = 1;
5186 goto out;
5187 }
5122 } 5188 }
5123 5189
5124 r = kvm_mmu_reload(vcpu); 5190 r = kvm_mmu_reload(vcpu);
@@ -5247,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5247 5313
5248 r = 1; 5314 r = 1;
5249 while (r > 0) { 5315 while (r > 0) {
5250 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5316 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5317 !vcpu->arch.apf.halted)
5251 r = vcpu_enter_guest(vcpu); 5318 r = vcpu_enter_guest(vcpu);
5252 else { 5319 else {
5253 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5260,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5260 vcpu->arch.mp_state = 5327 vcpu->arch.mp_state =
5261 KVM_MP_STATE_RUNNABLE; 5328 KVM_MP_STATE_RUNNABLE;
5262 case KVM_MP_STATE_RUNNABLE: 5329 case KVM_MP_STATE_RUNNABLE:
5330 vcpu->arch.apf.halted = false;
5263 break; 5331 break;
5264 case KVM_MP_STATE_SIPI_RECEIVED: 5332 case KVM_MP_STATE_SIPI_RECEIVED:
5265 default: 5333 default:
@@ -5281,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5281 vcpu->run->exit_reason = KVM_EXIT_INTR; 5349 vcpu->run->exit_reason = KVM_EXIT_INTR;
5282 ++vcpu->stat.request_irq_exits; 5350 ++vcpu->stat.request_irq_exits;
5283 } 5351 }
5352
5353 kvm_check_async_pf_completion(vcpu);
5354
5284 if (signal_pending(current)) { 5355 if (signal_pending(current)) {
5285 r = -EINTR; 5356 r = -EINTR;
5286 vcpu->run->exit_reason = KVM_EXIT_INTR; 5357 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5305,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5305 int r; 5376 int r;
5306 sigset_t sigsaved; 5377 sigset_t sigsaved;
5307 5378
5379 if (!tsk_used_math(current) && init_fpu(current))
5380 return -ENOMEM;
5381
5308 if (vcpu->sigset_active) 5382 if (vcpu->sigset_active)
5309 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5383 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5310 5384
@@ -5316,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5316 } 5390 }
5317 5391
5318 /* re-sync apic's tpr */ 5392 /* re-sync apic's tpr */
5319 if (!irqchip_in_kernel(vcpu->kvm)) 5393 if (!irqchip_in_kernel(vcpu->kvm)) {
5320 kvm_set_cr8(vcpu, kvm_run->cr8); 5394 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5395 r = -EINVAL;
5396 goto out;
5397 }
5398 }
5321 5399
5322 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5400 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
5323 if (vcpu->mmio_needed) { 5401 if (vcpu->mmio_needed) {
@@ -5326,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5326 vcpu->mmio_needed = 0; 5404 vcpu->mmio_needed = 0;
5327 } 5405 }
5328 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5329 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 5407 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5330 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5331 if (r != EMULATE_DONE) { 5409 if (r != EMULATE_DONE) {
5332 r = 0; 5410 r = 0;
@@ -5439,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5439 5517
5440 sregs->cr0 = kvm_read_cr0(vcpu); 5518 sregs->cr0 = kvm_read_cr0(vcpu);
5441 sregs->cr2 = vcpu->arch.cr2; 5519 sregs->cr2 = vcpu->arch.cr2;
5442 sregs->cr3 = vcpu->arch.cr3; 5520 sregs->cr3 = kvm_read_cr3(vcpu);
5443 sregs->cr4 = kvm_read_cr4(vcpu); 5521 sregs->cr4 = kvm_read_cr4(vcpu);
5444 sregs->cr8 = kvm_get_cr8(vcpu); 5522 sregs->cr8 = kvm_get_cr8(vcpu);
5445 sregs->efer = vcpu->arch.efer; 5523 sregs->efer = vcpu->arch.efer;
@@ -5507,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5507 kvm_x86_ops->set_gdt(vcpu, &dt); 5585 kvm_x86_ops->set_gdt(vcpu, &dt);
5508 5586
5509 vcpu->arch.cr2 = sregs->cr2; 5587 vcpu->arch.cr2 = sregs->cr2;
5510 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5588 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
5511 vcpu->arch.cr3 = sregs->cr3; 5589 vcpu->arch.cr3 = sregs->cr3;
5590 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5512 5591
5513 kvm_set_cr8(vcpu, sregs->cr8); 5592 kvm_set_cr8(vcpu, sregs->cr8);
5514 5593
@@ -5522,8 +5601,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5522 5601
5523 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5602 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5524 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5604 if (sregs->cr4 & X86_CR4_OSXSAVE)
5605 update_cpuid(vcpu);
5525 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5526 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); 5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5527 mmu_reset_needed = 1; 5608 mmu_reset_needed = 1;
5528 } 5609 }
5529 5610
@@ -5774,6 +5855,8 @@ free_vcpu:
5774 5855
5775void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5856void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5776{ 5857{
5858 vcpu->arch.apf.msr_val = 0;
5859
5777 vcpu_load(vcpu); 5860 vcpu_load(vcpu);
5778 kvm_mmu_unload(vcpu); 5861 kvm_mmu_unload(vcpu);
5779 vcpu_put(vcpu); 5862 vcpu_put(vcpu);
@@ -5793,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5793 vcpu->arch.dr7 = DR7_FIXED_1; 5876 vcpu->arch.dr7 = DR7_FIXED_1;
5794 5877
5795 kvm_make_request(KVM_REQ_EVENT, vcpu); 5878 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0;
5880
5881 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false;
5796 5884
5797 return kvm_x86_ops->vcpu_reset(vcpu); 5885 return kvm_x86_ops->vcpu_reset(vcpu);
5798} 5886}
@@ -5882,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5882 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5970 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5883 goto fail_free_mce_banks; 5971 goto fail_free_mce_banks;
5884 5972
5973 kvm_async_pf_hash_reset(vcpu);
5974
5885 return 0; 5975 return 0;
5886fail_free_mce_banks: 5976fail_free_mce_banks:
5887 kfree(vcpu->arch.mce_banks); 5977 kfree(vcpu->arch.mce_banks);
@@ -5907,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5907 free_page((unsigned long)vcpu->arch.pio_data); 5997 free_page((unsigned long)vcpu->arch.pio_data);
5908} 5998}
5909 5999
5910struct kvm *kvm_arch_create_vm(void) 6000int kvm_arch_init_vm(struct kvm *kvm)
5911{ 6001{
5912 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5913
5914 if (!kvm)
5915 return ERR_PTR(-ENOMEM);
5916
5917 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6002 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5918 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6003 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5919 6004
@@ -5922,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
5922 6007
5923 spin_lock_init(&kvm->arch.tsc_write_lock); 6008 spin_lock_init(&kvm->arch.tsc_write_lock);
5924 6009
5925 return kvm; 6010 return 0;
5926} 6011}
5927 6012
5928static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6013static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5940,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5940 /* 6025 /*
5941 * Unpin any mmu pages first. 6026 * Unpin any mmu pages first.
5942 */ 6027 */
5943 kvm_for_each_vcpu(i, vcpu, kvm) 6028 kvm_for_each_vcpu(i, vcpu, kvm) {
6029 kvm_clear_async_pf_completion_queue(vcpu);
5944 kvm_unload_vcpu_mmu(vcpu); 6030 kvm_unload_vcpu_mmu(vcpu);
6031 }
5945 kvm_for_each_vcpu(i, vcpu, kvm) 6032 kvm_for_each_vcpu(i, vcpu, kvm)
5946 kvm_arch_vcpu_free(vcpu); 6033 kvm_arch_vcpu_free(vcpu);
5947 6034
@@ -5965,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5965 kfree(kvm->arch.vpic); 6052 kfree(kvm->arch.vpic);
5966 kfree(kvm->arch.vioapic); 6053 kfree(kvm->arch.vioapic);
5967 kvm_free_vcpus(kvm); 6054 kvm_free_vcpus(kvm);
5968 kvm_free_physmem(kvm);
5969 if (kvm->arch.apic_access_page) 6055 if (kvm->arch.apic_access_page)
5970 put_page(kvm->arch.apic_access_page); 6056 put_page(kvm->arch.apic_access_page);
5971 if (kvm->arch.ept_identity_pagetable) 6057 if (kvm->arch.ept_identity_pagetable)
5972 put_page(kvm->arch.ept_identity_pagetable); 6058 put_page(kvm->arch.ept_identity_pagetable);
5973 cleanup_srcu_struct(&kvm->srcu);
5974 kfree(kvm);
5975} 6059}
5976 6060
5977int kvm_arch_prepare_memory_region(struct kvm *kvm, 6061int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6052,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
6052 6136
6053int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6137int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6054{ 6138{
6055 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6139 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6140 !vcpu->arch.apf.halted)
6141 || !list_empty_careful(&vcpu->async_pf.done)
6056 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6142 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6057 || vcpu->arch.nmi_pending || 6143 || vcpu->arch.nmi_pending ||
6058 (kvm_arch_interrupt_allowed(vcpu) && 6144 (kvm_arch_interrupt_allowed(vcpu) &&
@@ -6111,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6111} 6197}
6112EXPORT_SYMBOL_GPL(kvm_set_rflags); 6198EXPORT_SYMBOL_GPL(kvm_set_rflags);
6113 6199
6200void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6201{
6202 int r;
6203
6204 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6205 is_error_page(work->page))
6206 return;
6207
6208 r = kvm_mmu_reload(vcpu);
6209 if (unlikely(r))
6210 return;
6211
6212 if (!vcpu->arch.mmu.direct_map &&
6213 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6214 return;
6215
6216 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6217}
6218
6219static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6220{
6221 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6222}
6223
6224static inline u32 kvm_async_pf_next_probe(u32 key)
6225{
6226 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6227}
6228
6229static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6230{
6231 u32 key = kvm_async_pf_hash_fn(gfn);
6232
6233 while (vcpu->arch.apf.gfns[key] != ~0)
6234 key = kvm_async_pf_next_probe(key);
6235
6236 vcpu->arch.apf.gfns[key] = gfn;
6237}
6238
6239static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6240{
6241 int i;
6242 u32 key = kvm_async_pf_hash_fn(gfn);
6243
6244 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6245 (vcpu->arch.apf.gfns[key] != gfn &&
6246 vcpu->arch.apf.gfns[key] != ~0); i++)
6247 key = kvm_async_pf_next_probe(key);
6248
6249 return key;
6250}
6251
6252bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6253{
6254 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6255}
6256
6257static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6258{
6259 u32 i, j, k;
6260
6261 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6262 while (true) {
6263 vcpu->arch.apf.gfns[i] = ~0;
6264 do {
6265 j = kvm_async_pf_next_probe(j);
6266 if (vcpu->arch.apf.gfns[j] == ~0)
6267 return;
6268 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6269 /*
6270 * k lies cyclically in ]i,j]
6271 * | i.k.j |
6272 * |....j i.k.| or |.k..j i...|
6273 */
6274 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6275 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6276 i = j;
6277 }
6278}
6279
6280static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
6281{
6282
6283 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
6284 sizeof(val));
6285}
6286
6287void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6288 struct kvm_async_pf *work)
6289{
6290 struct x86_exception fault;
6291
6292 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6293 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6294
6295 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6296 (vcpu->arch.apf.send_user_only &&
6297 kvm_x86_ops->get_cpl(vcpu) == 0))
6298 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6299 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6300 fault.vector = PF_VECTOR;
6301 fault.error_code_valid = true;
6302 fault.error_code = 0;
6303 fault.nested_page_fault = false;
6304 fault.address = work->arch.token;
6305 kvm_inject_page_fault(vcpu, &fault);
6306 }
6307}
6308
6309void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6310 struct kvm_async_pf *work)
6311{
6312 struct x86_exception fault;
6313
6314 trace_kvm_async_pf_ready(work->arch.token, work->gva);
6315 if (is_error_page(work->page))
6316 work->arch.token = ~0; /* broadcast wakeup */
6317 else
6318 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6319
6320 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6321 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6322 fault.vector = PF_VECTOR;
6323 fault.error_code_valid = true;
6324 fault.error_code = 0;
6325 fault.nested_page_fault = false;
6326 fault.address = work->arch.token;
6327 kvm_inject_page_fault(vcpu, &fault);
6328 }
6329 vcpu->arch.apf.halted = false;
6330}
6331
6332bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6333{
6334 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6335 return true;
6336 else
6337 return !kvm_event_needs_reinjection(vcpu) &&
6338 kvm_x86_ops->interrupt_allowed(vcpu);
6339}
6340
6114EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6341EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
6115EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6342EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
6116EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6343EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
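The kvm_{add,find,del}_async_pf_gfn() helpers added above keep outstanding async-PF gfns in a small open-addressed hash with linear probing; deletion uses the classic re-placement rule, pulling a later entry back into the freed slot unless its home bucket lies cyclically in (i, j]. A stand-alone sketch of the same scheme, assuming a 64-entry table and a simple multiplicative hash in place of hash_32() (both are assumptions for illustration, not the kernel's choices):

#include <stdint.h>

#define TABLE_SIZE 64U                      /* assumed power of two */
#define EMPTY      (~(uint64_t)0)

static uint64_t table[TABLE_SIZE];

void table_init(void)
{
	for (unsigned i = 0; i < TABLE_SIZE; i++)
		table[i] = EMPTY;
}

static unsigned hash_fn(uint64_t gfn)
{
	/* stand-in for hash_32(); any reasonable mix works for the sketch */
	return (unsigned)((gfn * 0x9e3779b97f4a7c15ULL) >> 32) & (TABLE_SIZE - 1);
}

static unsigned next_probe(unsigned key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

void add_gfn(uint64_t gfn)
{
	unsigned key = hash_fn(gfn);

	while (table[key] != EMPTY)
		key = next_probe(key);
	table[key] = gfn;
}

static unsigned gfn_slot(uint64_t gfn)
{
	unsigned key = hash_fn(gfn);

	for (unsigned i = 0; i < TABLE_SIZE &&
	     table[key] != gfn && table[key] != EMPTY; i++)
		key = next_probe(key);
	return key;
}

void del_gfn(uint64_t gfn)
{
	unsigned i, j, k;

	i = j = gfn_slot(gfn);
	while (1) {
		table[i] = EMPTY;
		do {
			j = next_probe(j);
			if (table[j] == EMPTY)
				return;
			k = hash_fn(table[j]);
			/* keep scanning while the entry at j has its home
			 * slot cyclically in (i, j] and so need not move */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		table[i] = table[j];      /* re-place entry into freed slot */
		i = j;
	}
}

The key invariant is that lookups stop at the first empty slot, which is why deletion cannot simply clear a slot: any entry that probed past the freed slot must be re-placed or it would become unreachable.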
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414489f3..c600da830ce0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -70,6 +70,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
71} 71}
72 72
73static inline u32 bit(int bitno)
74{
75 return 1 << (bitno & 31);
76}
77
73void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
74void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
75int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); 80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc3..6e121a2a49e1 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 select VIRTUALIZATION
5 select VIRTIO 6 select VIRTIO
6 select VIRTIO_RING 7 select VIRTIO_RING
7 select VIRTIO_CONSOLE 8 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 73b1e1a1f489..eba687f0cc0c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3)
531{ 531{
532 lguest_data.pgdir = cr3; 532 lguest_data.pgdir = cr3;
533 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); 533 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
534 cr3_changed = true; 534
535 /* These two page tables are simple, linear, and used during boot */
536 if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
537 cr3_changed = true;
535} 538}
536 539
537static unsigned long lguest_read_cr3(void) 540static unsigned long lguest_read_cr3(void)
@@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
703 * to forget all of them. Fortunately, this is very rare. 706 * to forget all of them. Fortunately, this is very rare.
704 * 707 *
705 * ... except in early boot when the kernel sets up the initial pagetables, 708 * ... except in early boot when the kernel sets up the initial pagetables,
706 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 709 * which makes booting astonishingly slow: 48 seconds! So we don't even tell
707 * the Host anything changed until we've done the first page table switch, 710 * the Host anything changed until we've done the first real page table switch,
708 * which brings boot back to 0.25 seconds. 711 * which brings boot back to 4.3 seconds.
709 */ 712 */
710static void lguest_set_pte(pte_t *ptep, pte_t pteval) 713static void lguest_set_pte(pte_t *ptep, pte_t pteval)
711{ 714{
@@ -821,7 +824,7 @@ static void __init lguest_init_IRQ(void)
821 824
822 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
823 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 826 /* Some systems map "vectors" to interrupts weirdly. Not us! */
824 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 827 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
825 if (i != SYSCALL_VECTOR) 828 if (i != SYSCALL_VECTOR)
826 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
827 } 830 }
@@ -1002,7 +1005,7 @@ static void lguest_time_init(void)
1002 clockevents_register_device(&lguest_clockevent); 1005 clockevents_register_device(&lguest_clockevent);
1003 1006
1004 /* Finally, we unblock the timer interrupt. */ 1007 /* Finally, we unblock the timer interrupt. */
1005 enable_lguest_irq(0); 1008 clear_bit(0, lguest_data.blocked_interrupts);
1006} 1009}
1007 1010
1008/* 1011/*
@@ -1349,9 +1352,6 @@ __init void lguest_init(void)
1349 */ 1352 */
1350 switch_to_new_gdt(0); 1353 switch_to_new_gdt(0);
1351 1354
1352 /* We actually boot with all memory mapped, but let's say 128MB. */
1353 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1354
1355 /* 1355 /*
1356 * The Host<->Guest Switcher lives at the top of our address space, and 1356 * The Host<->Guest Switcher lives at the top of our address space, and
1357 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1357 * the Host told us how big it is when we made LGUEST_INIT hypercall:
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d361182..fc45ba887d05 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops)
121 asm("mull %%edx" 121 asm("mull %%edx"
122 :"=d" (xloops), "=&a" (d0) 122 :"=d" (xloops), "=&a" (d0)
123 :"1" (xloops), "0" 123 :"1" (xloops), "0"
124 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); 124 (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
125 125
126 __delay(++xloops); 126 __delay(++xloops);
127} 127}
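The mull instruction above, with its "=d" output, yields the high 32 bits of a 32x32-bit product; the change only swaps how loops_per_jiffy is fetched (this_cpu_read() instead of indexing cpu_data by raw_smp_processor_id()). A minimal userspace sketch of that arithmetic, with made-up numbers (the real xloops argument is pre-scaled by the caller so the high half comes out as delay-loop iterations):

#include <stdio.h>

/* high 32 bits of a 32x32 -> 64-bit multiply, as mull with %edx output does */
static unsigned int mul_hi32(unsigned int a, unsigned int b)
{
	return (unsigned int)(((unsigned long long)a * b) >> 32);
}

int main(void)
{
	/* purely illustrative values for xloops and loops_per_jiffy * (HZ/4) */
	printf("%u delay loops\n", mul_hi32(123456789u, 4000000u * (1000u / 4)));
	return 0;
}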
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 55543397a8a7..09df2f9a3d69 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
24 24
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_K8_NUMA) += k8topology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28 28
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c
index 804a3b6c6e14..f21962c435ed 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * AMD K8 NUMA support. 2 * AMD NUMA support.
3 * Discover the memory map and associated nodes. 3 * Discover the memory map and associated nodes.
4 * 4 *
5 * This version reads it directly from the K8 northbridge. 5 * This version reads it directly from the AMD northbridge.
6 * 6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */ 8 */
@@ -27,6 +27,7 @@
27#include <asm/amd_nb.h> 27#include <asm/amd_nb.h>
28 28
29static struct bootnode __initdata nodes[8]; 29static struct bootnode __initdata nodes[8];
30static unsigned char __initdata nodeids[8];
30static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; 31static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
31 32
32static __init int find_northbridge(void) 33static __init int find_northbridge(void)
@@ -57,7 +58,7 @@ static __init void early_get_boot_cpu_id(void)
57{ 58{
58 /* 59 /*
59 * need to get the APIC ID of the BSP so can use that to 60 * need to get the APIC ID of the BSP so can use that to
60 * create apicid_to_node in k8_scan_nodes() 61 * create apicid_to_node in amd_scan_nodes()
61 */ 62 */
62#ifdef CONFIG_X86_MPPARSE 63#ifdef CONFIG_X86_MPPARSE
63 /* 64 /*
@@ -66,23 +67,9 @@ static __init void early_get_boot_cpu_id(void)
66 if (smp_found_config) 67 if (smp_found_config)
67 early_get_smp_config(); 68 early_get_smp_config();
68#endif 69#endif
69 early_init_lapic_mapping();
70} 70}
71 71
72int __init k8_get_nodes(struct bootnode *physnodes) 72int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
73{
74 int i;
75 int ret = 0;
76
77 for_each_node_mask(i, nodes_parsed) {
78 physnodes[ret].start = nodes[i].start;
79 physnodes[ret].end = nodes[i].end;
80 ret++;
81 }
82 return ret;
83}
84
85int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
86{ 73{
87 unsigned long start = PFN_PHYS(start_pfn); 74 unsigned long start = PFN_PHYS(start_pfn);
88 unsigned long end = PFN_PHYS(end_pfn); 75 unsigned long end = PFN_PHYS(end_pfn);
@@ -114,7 +101,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
114 base = read_pci_config(0, nb, 1, 0x40 + i*8); 101 base = read_pci_config(0, nb, 1, 0x40 + i*8);
115 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 102 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
116 103
117 nodeid = limit & 7; 104 nodeids[i] = nodeid = limit & 7;
118 if ((base & 3) == 0) { 105 if ((base & 3) == 0) {
119 if (i < numnodes) 106 if (i < numnodes)
120 pr_info("Skipping disabled node %d\n", i); 107 pr_info("Skipping disabled node %d\n", i);
@@ -194,7 +181,77 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
194 return 0; 181 return 0;
195} 182}
196 183
197int __init k8_scan_nodes(void) 184#ifdef CONFIG_NUMA_EMU
185static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
186 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
187};
188
189void __init amd_get_nodes(struct bootnode *physnodes)
190{
191 int i;
192
193 for_each_node_mask(i, nodes_parsed) {
194 physnodes[i].start = nodes[i].start;
195 physnodes[i].end = nodes[i].end;
196 }
197}
198
199static int __init find_node_by_addr(unsigned long addr)
200{
201 int ret = NUMA_NO_NODE;
202 int i;
203
204 for (i = 0; i < 8; i++)
205 if (addr >= nodes[i].start && addr < nodes[i].end) {
206 ret = i;
207 break;
208 }
209 return ret;
210}
211
212/*
213 * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
 214 * set up to represent the physical topology but reflect the emulated
215 * environment. For each emulated node, the real node which it appears on is
216 * found and a fake pxm to nid mapping is created which mirrors the actual
217 * locality. node_distance() then represents the correct distances between
218 * emulated nodes by using the fake acpi mappings to pxms.
219 */
220void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
221{
222 unsigned int bits;
223 unsigned int cores;
224 unsigned int apicid_base = 0;
225 int i;
226
227 bits = boot_cpu_data.x86_coreid_bits;
228 cores = 1 << bits;
229 early_get_boot_cpu_id();
230 if (boot_cpu_physical_apicid > 0)
231 apicid_base = boot_cpu_physical_apicid;
232
233 for (i = 0; i < nr_nodes; i++) {
234 int index;
235 int nid;
236 int j;
237
238 nid = find_node_by_addr(nodes[i].start);
239 if (nid == NUMA_NO_NODE)
240 continue;
241
242 index = nodeids[nid] << bits;
243 if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
244 for (j = apicid_base; j < cores + apicid_base; j++)
245 fake_apicid_to_node[index + j] = i;
246#ifdef CONFIG_ACPI_NUMA
247 __acpi_map_pxm_to_node(nid, i);
248#endif
249 }
250 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
251}
252#endif /* CONFIG_NUMA_EMU */
253
254int __init amd_scan_nodes(void)
198{ 255{
199 unsigned int bits; 256 unsigned int bits;
200 unsigned int cores; 257 unsigned int cores;
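A userspace sketch of the apicid fill-in that amd_fake_nodes() performs above (values are made up; the real code also offsets by the boot CPU's APIC ID): the APIC IDs of one physical node form a contiguous block of 2^x86_coreid_bits entries starting at nodeid << x86_coreid_bits, and every entry in that block is pointed at the emulated node sitting on it:

#include <stdio.h>

#define MAX_APIC 64

int main(void)
{
	int fake_apicid_to_node[MAX_APIC];
	int bits = 2;             /* pretend 4 cores per physical node */
	int cores = 1 << bits;
	int nodeid = 1;           /* physical node backing the fake node */
	int fake_nid = 3;         /* emulated node number */
	int index = nodeid << bits;
	int i, j;

	for (i = 0; i < MAX_APIC; i++)
		fake_apicid_to_node[i] = -1;      /* NUMA_NO_NODE stand-in */
	for (j = 0; j < cores; j++)
		fake_apicid_to_node[index + j] = fake_nid;

	for (i = index; i < index + cores; i++)
		printf("apicid %d -> emulated node %d\n", i, fake_apicid_to_node[i]);
	return 0;
}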
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
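A small illustration (userspace, flag values invented) of the bail-out pattern the gup.c hunk adds: gup-fast runs with interrupts disabled, so it can never wait for a huge-page split to finish; if the pmd is empty or carries the splitting marker it simply reports failure and the caller falls back to the sleeping slow path:

#include <stdbool.h>
#include <stdint.h>

#define PMD_PRESENT	(1u << 0)	/* stand-in for a populated pmd */
#define PMD_SPLITTING	(1u << 1)	/* stand-in for the splitting bit */

/* returns false when the fast walker must give up and take the slow path */
static bool gup_fast_can_walk(uint32_t pmd_flags)
{
	if (!(pmd_flags & PMD_PRESENT) || (pmd_flags & PMD_SPLITTING))
		return false;
	return true;
}

int main(void)
{
	return gup_fast_can_walk(PMD_PRESENT) &&
	       !gup_fast_can_walk(PMD_PRESENT | PMD_SPLITTING) ? 0 : 1;
}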
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..947f42abe820 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
364 /* 364 /*
365 * We just marked the kernel text read only above, now that 365 * We just marked the kernel text read only above, now that
366 * we are going to free part of that, we need to make that 366 * we are going to free part of that, we need to make that
367 * writeable first. 367 * writeable and non-executable first.
368 */ 368 */
369 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
369 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 370 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
370 371
371 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 372 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..c821074b7f0b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
45#include <asm/bugs.h> 45#include <asm/bugs.h>
46#include <asm/tlb.h> 46#include <asm/tlb.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/olpc_ofw.h>
48#include <asm/pgalloc.h> 49#include <asm/pgalloc.h>
49#include <asm/sections.h> 50#include <asm/sections.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
@@ -226,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
226 227
227static inline int is_kernel_text(unsigned long addr) 228static inline int is_kernel_text(unsigned long addr)
228{ 229{
229 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 230 if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
230 return 1; 231 return 1;
231 return 0; 232 return 0;
232} 233}
@@ -715,6 +716,7 @@ void __init paging_init(void)
715 /* 716 /*
716 * NOTE: at this point the bootmem allocator is fully available. 717 * NOTE: at this point the bootmem allocator is fully available.
717 */ 718 */
719 olpc_dt_build_devicetree();
718 sparse_init(); 720 sparse_init();
719 zone_sizes_init(); 721 zone_sizes_init();
720} 722}
@@ -912,6 +914,23 @@ void set_kernel_text_ro(void)
912 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 914 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
913} 915}
914 916
917static void mark_nxdata_nx(void)
918{
919 /*
920 * When this called, init has already been executed and released,
921 * so everything past _etext sould be NX.
922 */
923 unsigned long start = PFN_ALIGN(_etext);
924 /*
 925 * This comes from the is_kernel_text() upper limit, rounded up to HPAGE where huge pages are used:
926 */
927 unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
928
929 if (__supported_pte_mask & _PAGE_NX)
930 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
931 set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
932}
933
915void mark_rodata_ro(void) 934void mark_rodata_ro(void)
916{ 935{
917 unsigned long start = PFN_ALIGN(_text); 936 unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +965,7 @@ void mark_rodata_ro(void)
946 printk(KERN_INFO "Testing CPA: write protecting again\n"); 965 printk(KERN_INFO "Testing CPA: write protecting again\n");
947 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 966 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
948#endif 967#endif
968 mark_nxdata_nx();
949} 969}
950#endif 970#endif
951 971
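A userspace sketch of the range that mark_nxdata_nx() computes above, with made-up addresses: the start is _etext rounded up to a page, and the end is __init_end rounded up past the next huge-page boundary, matching the is_kernel_text() limit and the huge-page granularity used for the kernel mapping:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define HPAGE_SIZE	(2UL * 1024 * 1024)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
#define PFN_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long etext    = 0xc05f1234UL;	/* hypothetical _etext */
	unsigned long init_end = 0xc08a0000UL;	/* hypothetical __init_end */
	unsigned long start = PFN_ALIGN(etext);
	unsigned long size  = ((init_end + HPAGE_SIZE) & HPAGE_MASK) - start;

	printf("NX-protecting the kernel data: %luk (from %#lx)\n",
	       size >> 10, start);
	return 0;
}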
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
185 e->trace.entries = e->trace_entries; 185 e->trace.entries = e->trace_entries;
186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries); 186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
187 e->trace.skip = 0; 187 e->trace.skip = 0;
188 save_stack_trace_bp(&e->trace, regs->bp); 188 save_stack_trace_regs(&e->trace, regs);
189 189
190 /* Round address down to nearest 16 bytes */ 190 /* Round address down to nearest 16 bytes */
191 shadow_copy = kmemcheck_shadow_lookup(address 191 shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..ebf6d7887a38 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
2#include <linux/topology.h> 2#include <linux/topology.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <asm/numa.h>
6#include <asm/acpi.h>
7
8int __initdata numa_off;
9
10static __init int numa_setup(char *opt)
11{
12 if (!opt)
13 return -EINVAL;
14 if (!strncmp(opt, "off", 3))
15 numa_off = 1;
16#ifdef CONFIG_NUMA_EMU
17 if (!strncmp(opt, "fake=", 5))
18 numa_emu_cmdline(opt + 5);
19#endif
20#ifdef CONFIG_ACPI_NUMA
21 if (!strncmp(opt, "noacpi", 6))
22 acpi_numa = -1;
23#endif
24 return 0;
25}
26early_param("numa", numa_setup);
5 27
6/* 28/*
7 * Which logical CPUs are on which nodes 29 * Which logical CPUs are on which nodes
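For reference, the consolidated numa= parser above accepts the following forms on the kernel command line (illustrative; each form is only honoured when the corresponding config option is built in, and the fixed-size variant follows the 'M'/'G' convention described in numa_emulation() below):

numa=off          disable NUMA discovery
numa=fake=8       CONFIG_NUMA_EMU: split memory into 8 emulated nodes
numa=fake=512M    CONFIG_NUMA_EMU: emulated nodes of a fixed 512M size
numa=noacpi       CONFIG_ACPI_NUMA: ignore SRAT-provided affinity data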
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7ffc9b727efd..95ea1551eebc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31}; 31};
32 32
33int numa_off __initdata;
34static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
35static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
36 35
@@ -260,30 +259,35 @@ void __init numa_init_array(void)
260#ifdef CONFIG_NUMA_EMU 259#ifdef CONFIG_NUMA_EMU
261/* Numa emulation */ 260/* Numa emulation */
262static struct bootnode nodes[MAX_NUMNODES] __initdata; 261static struct bootnode nodes[MAX_NUMNODES] __initdata;
263static struct bootnode physnodes[MAX_NUMNODES] __initdata; 262static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
264static char *cmdline __initdata; 263static char *cmdline __initdata;
265 264
265void __init numa_emu_cmdline(char *str)
266{
267 cmdline = str;
268}
269
266static int __init setup_physnodes(unsigned long start, unsigned long end, 270static int __init setup_physnodes(unsigned long start, unsigned long end,
267 int acpi, int k8) 271 int acpi, int amd)
268{ 272{
269 int nr_nodes = 0;
270 int ret = 0; 273 int ret = 0;
271 int i; 274 int i;
272 275
276 memset(physnodes, 0, sizeof(physnodes));
273#ifdef CONFIG_ACPI_NUMA 277#ifdef CONFIG_ACPI_NUMA
274 if (acpi) 278 if (acpi)
275 nr_nodes = acpi_get_nodes(physnodes); 279 acpi_get_nodes(physnodes, start, end);
276#endif 280#endif
277#ifdef CONFIG_K8_NUMA 281#ifdef CONFIG_AMD_NUMA
278 if (k8) 282 if (amd)
279 nr_nodes = k8_get_nodes(physnodes); 283 amd_get_nodes(physnodes);
280#endif 284#endif
281 /* 285 /*
282 * Basic sanity checking on the physical node map: there may be errors 286 * Basic sanity checking on the physical node map: there may be errors
283 * if the SRAT or K8 incorrectly reported the topology or the mem= 287 * if the SRAT or AMD code incorrectly reported the topology or the mem=
284 * kernel parameter is used. 288 * kernel parameter is used.
285 */ 289 */
286 for (i = 0; i < nr_nodes; i++) { 290 for (i = 0; i < MAX_NUMNODES; i++) {
287 if (physnodes[i].start == physnodes[i].end) 291 if (physnodes[i].start == physnodes[i].end)
288 continue; 292 continue;
289 if (physnodes[i].start > end) { 293 if (physnodes[i].start > end) {
@@ -298,17 +302,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
298 physnodes[i].start = start; 302 physnodes[i].start = start;
299 if (physnodes[i].end > end) 303 if (physnodes[i].end > end)
300 physnodes[i].end = end; 304 physnodes[i].end = end;
301 }
302
303 /*
304 * Remove all nodes that have no memory or were truncated because of the
305 * limited address range.
306 */
307 for (i = 0; i < nr_nodes; i++) {
308 if (physnodes[i].start == physnodes[i].end)
309 continue;
310 physnodes[ret].start = physnodes[i].start;
311 physnodes[ret].end = physnodes[i].end;
312 ret++; 305 ret++;
313 } 306 }
314 307
@@ -324,6 +317,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
324 return ret; 317 return ret;
325} 318}
326 319
320static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
321{
322 int i;
323
324 BUG_ON(acpi && amd);
325#ifdef CONFIG_ACPI_NUMA
326 if (acpi)
327 acpi_fake_nodes(nodes, nr_nodes);
328#endif
329#ifdef CONFIG_AMD_NUMA
330 if (amd)
331 amd_fake_nodes(nodes, nr_nodes);
332#endif
333 if (!acpi && !amd)
334 for (i = 0; i < nr_cpu_ids; i++)
335 numa_set_node(i, 0);
336}
337
327/* 338/*
328 * Sets up nid to range from addr to addr + size. If the end 339 * Sets up nid to range from addr to addr + size. If the end
329 * boundary is greater than max_addr, then max_addr is used instead. 340 * boundary is greater than max_addr, then max_addr is used instead.
@@ -352,8 +363,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
352 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 363 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
353 * to max_addr. The return value is the number of nodes allocated. 364 * to max_addr. The return value is the number of nodes allocated.
354 */ 365 */
355static int __init split_nodes_interleave(u64 addr, u64 max_addr, 366static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
356 int nr_phys_nodes, int nr_nodes)
357{ 367{
358 nodemask_t physnode_mask = NODE_MASK_NONE; 368 nodemask_t physnode_mask = NODE_MASK_NONE;
359 u64 size; 369 u64 size;
@@ -384,7 +394,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
384 return -1; 394 return -1;
385 } 395 }
386 396
387 for (i = 0; i < nr_phys_nodes; i++) 397 for (i = 0; i < MAX_NUMNODES; i++)
388 if (physnodes[i].start != physnodes[i].end) 398 if (physnodes[i].start != physnodes[i].end)
389 node_set(i, physnode_mask); 399 node_set(i, physnode_mask);
390 400
@@ -549,15 +559,13 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
549 * numa=fake command-line option. 559 * numa=fake command-line option.
550 */ 560 */
551static int __init numa_emulation(unsigned long start_pfn, 561static int __init numa_emulation(unsigned long start_pfn,
552 unsigned long last_pfn, int acpi, int k8) 562 unsigned long last_pfn, int acpi, int amd)
553{ 563{
554 u64 addr = start_pfn << PAGE_SHIFT; 564 u64 addr = start_pfn << PAGE_SHIFT;
555 u64 max_addr = last_pfn << PAGE_SHIFT; 565 u64 max_addr = last_pfn << PAGE_SHIFT;
556 int num_phys_nodes;
557 int num_nodes; 566 int num_nodes;
558 int i; 567 int i;
559 568
560 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
561 /* 569 /*
562 * If the numa=fake command-line contains a 'M' or 'G', it represents 570 * If the numa=fake command-line contains a 'M' or 'G', it represents
563 * the fixed node size. Otherwise, if it is just a single number N, 571 * the fixed node size. Otherwise, if it is just a single number N,
@@ -572,7 +580,7 @@ static int __init numa_emulation(unsigned long start_pfn,
572 unsigned long n; 580 unsigned long n;
573 581
574 n = simple_strtoul(cmdline, NULL, 0); 582 n = simple_strtoul(cmdline, NULL, 0);
575 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); 583 num_nodes = split_nodes_interleave(addr, max_addr, n);
576 } 584 }
577 585
578 if (num_nodes < 0) 586 if (num_nodes < 0)
@@ -595,14 +603,15 @@ static int __init numa_emulation(unsigned long start_pfn,
595 nodes[i].end >> PAGE_SHIFT); 603 nodes[i].end >> PAGE_SHIFT);
596 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 604 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
597 } 605 }
598 acpi_fake_nodes(nodes, num_nodes); 606 setup_physnodes(addr, max_addr, acpi, amd);
607 fake_physnodes(acpi, amd, num_nodes);
599 numa_init_array(); 608 numa_init_array();
600 return 0; 609 return 0;
601} 610}
602#endif /* CONFIG_NUMA_EMU */ 611#endif /* CONFIG_NUMA_EMU */
603 612
604void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 613void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
605 int acpi, int k8) 614 int acpi, int amd)
606{ 615{
607 int i; 616 int i;
608 617
@@ -610,8 +619,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
610 nodes_clear(node_online_map); 619 nodes_clear(node_online_map);
611 620
612#ifdef CONFIG_NUMA_EMU 621#ifdef CONFIG_NUMA_EMU
613 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) 622 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
623 acpi, amd);
624 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
614 return; 625 return;
626 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
627 acpi, amd);
615 nodes_clear(node_possible_map); 628 nodes_clear(node_possible_map);
616 nodes_clear(node_online_map); 629 nodes_clear(node_online_map);
617#endif 630#endif
@@ -624,8 +637,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
624 nodes_clear(node_online_map); 637 nodes_clear(node_online_map);
625#endif 638#endif
626 639
627#ifdef CONFIG_K8_NUMA 640#ifdef CONFIG_AMD_NUMA
628 if (!numa_off && k8 && !k8_scan_nodes()) 641 if (!numa_off && amd && !amd_scan_nodes())
629 return; 642 return;
630 nodes_clear(node_possible_map); 643 nodes_clear(node_possible_map);
631 nodes_clear(node_online_map); 644 nodes_clear(node_online_map);
@@ -661,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void)
661 return pages; 674 return pages;
662} 675}
663 676
664static __init int numa_setup(char *opt)
665{
666 if (!opt)
667 return -EINVAL;
668 if (!strncmp(opt, "off", 3))
669 numa_off = 1;
670#ifdef CONFIG_NUMA_EMU
671 if (!strncmp(opt, "fake=", 5))
672 cmdline = opt + 5;
673#endif
674#ifdef CONFIG_ACPI_NUMA
675 if (!strncmp(opt, "noacpi", 6))
676 acpi_numa = -1;
677#endif
678 return 0;
679}
680early_param("numa", numa_setup);
681
682#ifdef CONFIG_NUMA 677#ifdef CONFIG_NUMA
683 678
684static __init int find_near_online_node(int node) 679static __init int find_near_online_node(int node)
@@ -767,6 +762,7 @@ void __cpuinit numa_clear_node(int cpu)
767 762
768#ifndef CONFIG_DEBUG_PER_CPU_MAPS 763#ifndef CONFIG_DEBUG_PER_CPU_MAPS
769 764
765#ifndef CONFIG_NUMA_EMU
770void __cpuinit numa_add_cpu(int cpu) 766void __cpuinit numa_add_cpu(int cpu)
771{ 767{
772 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 768 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -776,34 +772,115 @@ void __cpuinit numa_remove_cpu(int cpu)
776{ 772{
777 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 773 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
778} 774}
775#else
776void __cpuinit numa_add_cpu(int cpu)
777{
778 unsigned long addr;
779 u16 apicid;
780 int physnid;
781 int nid = NUMA_NO_NODE;
782
783 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
784 if (apicid != BAD_APICID)
785 nid = apicid_to_node[apicid];
786 if (nid == NUMA_NO_NODE)
787 nid = early_cpu_to_node(cpu);
788 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
789
790 /*
791 * Use the starting address of the emulated node to find which physical
792 * node it is allocated on.
793 */
794 addr = node_start_pfn(nid) << PAGE_SHIFT;
795 for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
796 if (addr >= physnodes[physnid].start &&
797 addr < physnodes[physnid].end)
798 break;
799
800 /*
801 * Map the cpu to each emulated node that is allocated on the physical
802 * node of the cpu's apic id.
803 */
804 for_each_online_node(nid) {
805 addr = node_start_pfn(nid) << PAGE_SHIFT;
806 if (addr >= physnodes[physnid].start &&
807 addr < physnodes[physnid].end)
808 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
809 }
810}
811
812void __cpuinit numa_remove_cpu(int cpu)
813{
814 int i;
815
816 for_each_online_node(i)
817 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
818}
819#endif /* !CONFIG_NUMA_EMU */
779 820
780#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 821#else /* CONFIG_DEBUG_PER_CPU_MAPS */
822static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
823{
824 int node = early_cpu_to_node(cpu);
825 struct cpumask *mask;
826 char buf[64];
827
828 mask = node_to_cpumask_map[node];
829 if (!mask) {
830 pr_err("node_to_cpumask_map[%i] NULL\n", node);
831 dump_stack();
832 return NULL;
833 }
834
835 cpulist_scnprintf(buf, sizeof(buf), mask);
836 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
837 enable ? "numa_add_cpu" : "numa_remove_cpu",
838 cpu, node, buf);
839 return mask;
840}
781 841
782/* 842/*
783 * --------- debug versions of the numa functions --------- 843 * --------- debug versions of the numa functions ---------
784 */ 844 */
845#ifndef CONFIG_NUMA_EMU
785static void __cpuinit numa_set_cpumask(int cpu, int enable) 846static void __cpuinit numa_set_cpumask(int cpu, int enable)
786{ 847{
787 int node = early_cpu_to_node(cpu);
788 struct cpumask *mask; 848 struct cpumask *mask;
789 char buf[64];
790 849
791 mask = node_to_cpumask_map[node]; 850 mask = debug_cpumask_set_cpu(cpu, enable);
792 if (mask == NULL) { 851 if (!mask)
793 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
794 dump_stack();
795 return; 852 return;
796 }
797 853
798 if (enable) 854 if (enable)
799 cpumask_set_cpu(cpu, mask); 855 cpumask_set_cpu(cpu, mask);
800 else 856 else
801 cpumask_clear_cpu(cpu, mask); 857 cpumask_clear_cpu(cpu, mask);
858}
859#else
860static void __cpuinit numa_set_cpumask(int cpu, int enable)
861{
862 int node = early_cpu_to_node(cpu);
863 struct cpumask *mask;
864 int i;
802 865
803 cpulist_scnprintf(buf, sizeof(buf), mask); 866 for_each_online_node(i) {
804 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 867 unsigned long addr;
805 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); 868
869 addr = node_start_pfn(i) << PAGE_SHIFT;
870 if (addr < physnodes[node].start ||
871 addr >= physnodes[node].end)
872 continue;
873 mask = debug_cpumask_set_cpu(cpu, enable);
874 if (!mask)
875 return;
876
877 if (enable)
878 cpumask_set_cpu(cpu, mask);
879 else
880 cpumask_clear_cpu(cpu, mask);
881 }
806} 882}
883#endif /* CONFIG_NUMA_EMU */
807 884
808void __cpuinit numa_add_cpu(int cpu) 885void __cpuinit numa_add_cpu(int cpu)
809{ 886{
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..8b830ca14ac4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
13#include <linux/pfn.h> 13#include <linux/pfn.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/pci.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
255 unsigned long pfn) 256 unsigned long pfn)
256{ 257{
257 pgprot_t forbidden = __pgprot(0); 258 pgprot_t forbidden = __pgprot(0);
259 pgprot_t required = __pgprot(0);
258 260
259 /* 261 /*
260 * The BIOS area between 640k and 1Mb needs to be executable for 262 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 263 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 264 */
263 if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 265#ifdef CONFIG_PCI_BIOS
266 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
264 pgprot_val(forbidden) |= _PAGE_NX; 267 pgprot_val(forbidden) |= _PAGE_NX;
268#endif
265 269
266 /* 270 /*
267 * The kernel text needs to be executable for obvious reasons 271 * The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
278 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 282 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 283 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 284 pgprot_val(forbidden) |= _PAGE_RW;
285 /*
286 * .data and .bss should always be writable.
287 */
288 if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
289 within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
290 pgprot_val(required) |= _PAGE_RW;
281 291
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 292#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /* 293 /*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
317#endif 327#endif
318 328
319 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 329 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
330 prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
320 331
321 return prot; 332 return prot;
322} 333}
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
393{ 404{
394 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 405 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
395 pte_t new_pte, old_pte, *tmp; 406 pte_t new_pte, old_pte, *tmp;
396 pgprot_t old_prot, new_prot; 407 pgprot_t old_prot, new_prot, req_prot;
397 int i, do_split = 1; 408 int i, do_split = 1;
398 unsigned int level; 409 unsigned int level;
399 410
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
438 * We are safe now. Check whether the new pgprot is the same: 449 * We are safe now. Check whether the new pgprot is the same:
439 */ 450 */
440 old_pte = *kpte; 451 old_pte = *kpte;
441 old_prot = new_prot = pte_pgprot(old_pte); 452 old_prot = new_prot = req_prot = pte_pgprot(old_pte);
442 453
443 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 454 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
444 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 455 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
445 456
446 /* 457 /*
447 * old_pte points to the large page base address. So we need 458 * old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
450 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 461 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
451 cpa->pfn = pfn; 462 cpa->pfn = pfn;
452 463
453 new_prot = static_protections(new_prot, address, pfn); 464 new_prot = static_protections(req_prot, address, pfn);
454 465
455 /* 466 /*
456 * We need to check the full range, whether 467 * We need to check the full range, whether
457 * static_protection() requires a different pgprot for one of 468 * static_protection() requires a different pgprot for one of
458 * the pages in the range we try to preserve: 469 * the pages in the range we try to preserve:
459 */ 470 */
460 addr = address + PAGE_SIZE; 471 addr = address & pmask;
461 pfn++; 472 pfn = pte_pfn(old_pte);
462 for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { 473 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
463 pgprot_t chk_prot = static_protections(new_prot, addr, pfn); 474 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
464 475
465 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 476 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
466 goto out_unlock; 477 goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
483 * that we limited the number of possible pages already to 494 * that we limited the number of possible pages already to
484 * the number of pages in the large page. 495 * the number of pages in the large page.
485 */ 496 */
486 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { 497 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
487 /* 498 /*
488 * The address is aligned and the number of pages 499 * The address is aligned and the number of pages
489 * covers the full page. 500 * covers the full page.
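A minimal sketch of the two-mask policy static_protections() applies after this change (userspace, bit values invented): "forbidden" bits are cleared from the requested pgprot and "required" bits are forced back on, so text/rodata can never gain the forbidden bits while .data/.bss can never lose _PAGE_RW:

#include <stdint.h>
#include <stdio.h>

#define PROT_RW	(1u << 1)	/* stand-in for _PAGE_RW */
#define PROT_NX	(1u << 2)	/* stand-in for _PAGE_NX */

static uint32_t apply_static_protections(uint32_t prot,
					 uint32_t forbidden,
					 uint32_t required)
{
	prot &= ~forbidden;	/* e.g. strip RW over .rodata */
	prot |= required;	/* e.g. force RW back on over .data/.bss */
	return prot;
}

int main(void)
{
	/* a request that tried to drop RW from a .data page gets it back */
	printf("%#x\n", apply_static_protections(PROT_NX, 0, PROT_RW));
	return 0;
}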
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
320 return changed; 320 return changed;
321} 321}
322 322
323#ifdef CONFIG_TRANSPARENT_HUGEPAGE
324int pmdp_set_access_flags(struct vm_area_struct *vma,
325 unsigned long address, pmd_t *pmdp,
326 pmd_t entry, int dirty)
327{
328 int changed = !pmd_same(*pmdp, entry);
329
330 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
331
332 if (changed && dirty) {
333 *pmdp = entry;
334 pmd_update_defer(vma->vm_mm, address, pmdp);
335 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
336 }
337
338 return changed;
339}
340#endif
341
323int ptep_test_and_clear_young(struct vm_area_struct *vma, 342int ptep_test_and_clear_young(struct vm_area_struct *vma,
324 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
325{ 344{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
335 return ret; 354 return ret;
336} 355}
337 356
357#ifdef CONFIG_TRANSPARENT_HUGEPAGE
358int pmdp_test_and_clear_young(struct vm_area_struct *vma,
359 unsigned long addr, pmd_t *pmdp)
360{
361 int ret = 0;
362
363 if (pmd_young(*pmdp))
364 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
365 (unsigned long *)pmdp);
366
367 if (ret)
368 pmd_update(vma->vm_mm, addr, pmdp);
369
370 return ret;
371}
372#endif
373
338int ptep_clear_flush_young(struct vm_area_struct *vma, 374int ptep_clear_flush_young(struct vm_area_struct *vma,
339 unsigned long address, pte_t *ptep) 375 unsigned long address, pte_t *ptep)
340{ 376{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
347 return young; 383 return young;
348} 384}
349 385
386#ifdef CONFIG_TRANSPARENT_HUGEPAGE
387int pmdp_clear_flush_young(struct vm_area_struct *vma,
388 unsigned long address, pmd_t *pmdp)
389{
390 int young;
391
392 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
393
394 young = pmdp_test_and_clear_young(vma, address, pmdp);
395 if (young)
396 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
397
398 return young;
399}
400
401void pmdp_splitting_flush(struct vm_area_struct *vma,
402 unsigned long address, pmd_t *pmdp)
403{
404 int set;
405 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
406 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
407 (unsigned long *)pmdp);
408 if (set) {
409 pmd_update(vma->vm_mm, address, pmdp);
410 /* need tlb flush only to serialize against gup-fast */
411 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
412 }
413}
414#endif
415
350/** 416/**
351 * reserve_top_address - reserves a hole in the top of kernel address space 417 * reserve_top_address - reserves a hole in the top of kernel address space
352 * @reserve - size of hole to reserve 418 * @reserve - size of hole to reserve
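A simplified userspace sketch of the set-then-flush ordering used by pmdp_splitting_flush() above (bit value invented): exactly one caller wins the test-and-set and is therefore the one that issues the TLB flush, whose IPI cannot complete while any CPU is still inside gup-fast with interrupts off, so every later fast walker observes the splitting bit:

#include <stdatomic.h>
#include <stdbool.h>

#define SPLITTING_BIT (1u << 9)	/* stand-in for the pmd splitting flag */

/* mirrors !test_and_set_bit(): true only for the first caller */
static bool mark_splitting(_Atomic unsigned int *pmd)
{
	unsigned int old = atomic_fetch_or(pmd, SPLITTING_BIT);
	return !(old & SPLITTING_BIT);
}

int main(void)
{
	_Atomic unsigned int pmd = 0;
	/* first call wins (and would flush the TLB); the second sees the bit set */
	return mark_splitting(&pmd) && !mark_splitting(&pmd) ? 0 : 1;
}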
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
41{ 41{
42 if (!cpu_has_nx) { 42 if (!cpu_has_nx) {
43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
44 "missing in CPU or disabled in BIOS!\n"); 44 "missing in CPU!\n");
45 } else { 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) { 47 if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..ae96e7b8051d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_APICID];
61 61
62int numa_off __initdata;
63int acpi_numa __initdata; 62int acpi_numa __initdata;
64 63
65static __init void bad_srat(void) 64static __init void bad_srat(void)
@@ -92,6 +91,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
92 /* mark this node as "seen" in node bitmap */ 91 /* mark this node as "seen" in node bitmap */
93 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); 92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
94 93
94 /* don't need to check apic_id here, because it is always 8 bits */
95 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; 95 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
96 96
97 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", 97 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..603d285d1daa 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
134 } 134 }
135 135
136 apic_id = pa->apic_id; 136 apic_id = pa->apic_id;
137 if (apic_id >= MAX_LOCAL_APIC) {
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return;
140 }
137 apicid_to_node[apic_id] = node; 141 apicid_to_node[apic_id] = node;
138 node_set(node, cpu_nodes_parsed); 142 node_set(node, cpu_nodes_parsed);
139 acpi_numa = 1; 143 acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
168 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 172 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
169 else 173 else
170 apic_id = pa->apic_id; 174 apic_id = pa->apic_id;
175
176 if (apic_id >= MAX_LOCAL_APIC) {
177 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
178 return;
179 }
180
171 apicid_to_node[apic_id] = node; 181 apicid_to_node[apic_id] = node;
172 node_set(node, cpu_nodes_parsed); 182 node_set(node, cpu_nodes_parsed);
173 acpi_numa = 1; 183 acpi_numa = 1;
@@ -339,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
339 349
340void __init acpi_numa_arch_fixup(void) {} 350void __init acpi_numa_arch_fixup(void) {}
341 351
342int __init acpi_get_nodes(struct bootnode *physnodes) 352#ifdef CONFIG_NUMA_EMU
353void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
354 unsigned long end)
343{ 355{
344 int i; 356 int i;
345 int ret = 0;
346 357
347 for_each_node_mask(i, nodes_parsed) { 358 for_each_node_mask(i, nodes_parsed) {
348 physnodes[ret].start = nodes[i].start; 359 cutoff_node(i, start, end);
349 physnodes[ret].end = nodes[i].end; 360 physnodes[i].start = nodes[i].start;
350 ret++; 361 physnodes[i].end = nodes[i].end;
351 } 362 }
352 return ret;
353} 363}
364#endif /* CONFIG_NUMA_EMU */
354 365
355/* Use the information discovered above to actually set up the nodes. */ 366/* Use the information discovered above to actually set up the nodes. */
356int __init acpi_scan_nodes(unsigned long start, unsigned long end) 367int __init acpi_scan_nodes(unsigned long start, unsigned long end)
@@ -495,8 +506,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
495{ 506{
496 int i, j; 507 int i, j;
497 508
498 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
499 "topology.\n");
500 for (i = 0; i < num_nodes; i++) { 509 for (i = 0; i < num_nodes; i++) {
501 int nid, pxm; 510 int nid, pxm;
502 511
@@ -516,6 +525,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
516 fake_apicid_to_node[j] == NUMA_NO_NODE) 525 fake_apicid_to_node[j] == NUMA_NO_NODE)
517 fake_apicid_to_node[j] = i; 526 fake_apicid_to_node[j] = i;
518 } 527 }
528
529 /*
530 * If there are apicid-to-node mappings for physical nodes that do not
 531 * have a corresponding emulated node, they should default to a guaranteed
532 * value.
533 */
534 for (i = 0; i < MAX_LOCAL_APIC; i++)
535 if (apicid_to_node[i] != NUMA_NO_NODE &&
536 fake_apicid_to_node[i] == NUMA_NO_NODE)
537 fake_apicid_to_node[i] = 0;
538
519 for (i = 0; i < num_nodes; i++) 539 for (i = 0; i < num_nodes; i++)
520 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); 540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
521 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); 541 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49358481c733..6acc724d5d8f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
223 223
224static void __cpuinit calculate_tlb_offset(void) 224static void __cpuinit calculate_tlb_offset(void)
225{ 225{
226 int cpu, node, nr_node_vecs; 226 int cpu, node, nr_node_vecs, idx = 0;
227 /* 227 /*
228 * we are changing tlb_vector_offset for each CPU in runtime, but this 228 * we are changing tlb_vector_offset for each CPU in runtime, but this
229 * will not cause inconsistency, as the write is atomic under X86. we 229 * will not cause inconsistency, as the write is atomic under X86. we
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void)
239 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; 239 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
240 240
241 for_each_online_node(node) { 241 for_each_online_node(node) {
242 int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * 242 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
243 nr_node_vecs; 243 nr_node_vecs;
244 int cpu_offset = 0; 244 int cpu_offset = 0;
245 for_each_cpu(cpu, cpumask_of_node(node)) { 245 for_each_cpu(cpu, cpumask_of_node(node)) {
@@ -248,10 +248,11 @@ static void __cpuinit calculate_tlb_offset(void)
248 cpu_offset++; 248 cpu_offset++;
249 cpu_offset = cpu_offset % nr_node_vecs; 249 cpu_offset = cpu_offset % nr_node_vecs;
250 } 250 }
251 idx++;
251 } 252 }
252} 253}
253 254
254static int tlb_cpuhp_notify(struct notifier_block *n, 255static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
255 unsigned long action, void *hcpu) 256 unsigned long action, void *hcpu)
256{ 257{
257 switch (action & 0xf) { 258 switch (action & 0xf) {
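A small userspace illustration of why calculate_tlb_offset() now uses a running idx instead of the node number (numbers are made up): online node ids need not be contiguous, so deriving the vector offset from the node id can collide with other nodes or leave vectors unused, whereas a dense index packs each online node into its own slice:

#include <stdio.h>

#define NUM_VECS 8

int main(void)
{
	int online_nodes[] = { 0, 2 };	/* hypothetical sparse node ids */
	int nr_nodes = 2;
	int per_node = NUM_VECS / nr_nodes;
	int idx;

	for (idx = 0; idx < nr_nodes; idx++)
		printf("node %d -> vectors [%d..%d]\n",
		       online_nodes[idx],
		       (idx % NUM_VECS) * per_node,
		       (idx % NUM_VECS) * per_node + per_node - 1);
	return 0;
}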
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a36..72cbec14d783 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
126 if (!user_mode_vm(regs)) { 126 if (!user_mode_vm(regs)) {
127 unsigned long stack = kernel_stack_pointer(regs); 127 unsigned long stack = kernel_stack_pointer(regs);
128 if (depth) 128 if (depth)
129 dump_trace(NULL, regs, (unsigned long *)stack, 0, 129 dump_trace(NULL, regs, (unsigned long *)stack,
130 &backtrace_ops, &depth); 130 &backtrace_ops, &depth);
131 return; 131 return;
132 } 132 }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 4e8baad36d37..e2b7b0c06cdf 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -65,7 +65,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
65 65
66 switch (val) { 66 switch (val) {
67 case DIE_NMI: 67 case DIE_NMI:
68 case DIE_NMI_IPI:
69 if (ctr_running) 68 if (ctr_running)
70 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); 69 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
71 else if (!nmi_enabled) 70 else if (!nmi_enabled)
@@ -143,7 +142,7 @@ static inline int has_mux(void)
143 142
144inline int op_x86_phys_to_virt(int phys) 143inline int op_x86_phys_to_virt(int phys)
145{ 144{
146 return __get_cpu_var(switch_index) + phys; 145 return __this_cpu_read(switch_index) + phys;
147} 146}
148 147
149inline int op_x86_virt_to_phys(int virt) 148inline int op_x86_virt_to_phys(int virt)
@@ -361,7 +360,7 @@ static void nmi_cpu_setup(void *dummy)
361static struct notifier_block profile_exceptions_nb = { 360static struct notifier_block profile_exceptions_nb = {
362 .notifier_call = profile_exceptions_notify, 361 .notifier_call = profile_exceptions_notify,
363 .next = NULL, 362 .next = NULL,
364 .priority = 2 363 .priority = NMI_LOCAL_LOW_PRIOR,
365}; 364};
366 365
367static void nmi_cpu_restore_registers(struct op_msrs *msrs) 366static void nmi_cpu_restore_registers(struct op_msrs *msrs)
@@ -732,6 +731,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
732 case 0x14: 731 case 0x14:
733 cpu_type = "x86-64/family14h"; 732 cpu_type = "x86-64/family14h";
734 break; 733 break;
734 case 0x15:
735 cpu_type = "x86-64/family15h";
736 break;
735 default: 737 default:
736 return -ENODEV; 738 return -ENODEV;
737 } 739 }
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index e3ecb71b5790..720bf5a53c51 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
38static struct notifier_block profile_timer_exceptions_nb = { 38static struct notifier_block profile_timer_exceptions_nb = {
39 .notifier_call = profile_timer_exceptions_notify, 39 .notifier_call = profile_timer_exceptions_notify,
40 .next = NULL, 40 .next = NULL,
41 .priority = 0 41 .priority = NMI_LOW_PRIOR,
42}; 42};
43 43
44static int timer_start(void) 44static int timer_start(void)
@@ -58,9 +58,6 @@ static void timer_stop(void)
58 58
59int __init op_nmi_timer_init(struct oprofile_operations *ops) 59int __init op_nmi_timer_init(struct oprofile_operations *ops)
60{ 60{
61 if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
62 return -ENODEV;
63
64 ops->start = timer_start; 61 ops->start = timer_start;
65 ops->stop = timer_stop; 62 ops->stop = timer_stop;
66 ops->cpu_type = "timer"; 63 ops->cpu_type = "timer";
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index a011bcc0f943..c3b8e24f2b16 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -29,11 +29,12 @@
29#include "op_x86_model.h" 29#include "op_x86_model.h"
30#include "op_counter.h" 30#include "op_counter.h"
31 31
32#define NUM_COUNTERS 4 32#define NUM_COUNTERS 4
33#define NUM_COUNTERS_F15H 6
33#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 34#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
34#define NUM_VIRT_COUNTERS 32 35#define NUM_VIRT_COUNTERS 32
35#else 36#else
36#define NUM_VIRT_COUNTERS NUM_COUNTERS 37#define NUM_VIRT_COUNTERS 0
37#endif 38#endif
38 39
39#define OP_EVENT_MASK 0x0FFF 40#define OP_EVENT_MASK 0x0FFF
@@ -41,7 +42,8 @@
41 42
42#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) 43#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
43 44
44static unsigned long reset_value[NUM_VIRT_COUNTERS]; 45static int num_counters;
46static unsigned long reset_value[OP_MAX_COUNTER];
45 47
46#define IBS_FETCH_SIZE 6 48#define IBS_FETCH_SIZE 6
47#define IBS_OP_SIZE 12 49#define IBS_OP_SIZE 12
@@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
387 int i; 389 int i;
388 390
389 /* enable active counters */ 391 /* enable active counters */
390 for (i = 0; i < NUM_COUNTERS; ++i) { 392 for (i = 0; i < num_counters; ++i) {
391 int virt = op_x86_phys_to_virt(i); 393 int virt = op_x86_phys_to_virt(i);
392 if (!reset_value[virt]) 394 if (!reset_value[virt])
393 continue; 395 continue;
@@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
406{ 408{
407 int i; 409 int i;
408 410
409 for (i = 0; i < NUM_COUNTERS; ++i) { 411 for (i = 0; i < num_counters; ++i) {
410 if (!msrs->counters[i].addr) 412 if (!msrs->counters[i].addr)
411 continue; 413 continue;
412 release_perfctr_nmi(MSR_K7_PERFCTR0 + i); 414 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
@@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
418{ 420{
419 int i; 421 int i;
420 422
421 for (i = 0; i < NUM_COUNTERS; i++) { 423 for (i = 0; i < num_counters; i++) {
422 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 424 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
423 goto fail; 425 goto fail;
424 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { 426 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
@@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
426 goto fail; 428 goto fail;
427 } 429 }
428 /* both registers must be reserved */ 430 /* both registers must be reserved */
429 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; 431 if (num_counters == NUM_COUNTERS_F15H) {
430 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; 432 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
433 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
434 } else {
435 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
436 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
437 }
431 continue; 438 continue;
432 fail: 439 fail:
433 if (!counter_config[i].enabled) 440 if (!counter_config[i].enabled)
@@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
447 int i; 454 int i;
448 455
449 /* setup reset_value */ 456 /* setup reset_value */
450 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { 457 for (i = 0; i < OP_MAX_COUNTER; ++i) {
451 if (counter_config[i].enabled 458 if (counter_config[i].enabled
452 && msrs->counters[op_x86_virt_to_phys(i)].addr) 459 && msrs->counters[op_x86_virt_to_phys(i)].addr)
453 reset_value[i] = counter_config[i].count; 460 reset_value[i] = counter_config[i].count;
@@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
456 } 463 }
457 464
458 /* clear all counters */ 465 /* clear all counters */
459 for (i = 0; i < NUM_COUNTERS; ++i) { 466 for (i = 0; i < num_counters; ++i) {
460 if (!msrs->controls[i].addr) 467 if (!msrs->controls[i].addr)
461 continue; 468 continue;
462 rdmsrl(msrs->controls[i].addr, val); 469 rdmsrl(msrs->controls[i].addr, val);
@@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
472 } 479 }
473 480
474 /* enable active counters */ 481 /* enable active counters */
475 for (i = 0; i < NUM_COUNTERS; ++i) { 482 for (i = 0; i < num_counters; ++i) {
476 int virt = op_x86_phys_to_virt(i); 483 int virt = op_x86_phys_to_virt(i);
477 if (!reset_value[virt]) 484 if (!reset_value[virt])
478 continue; 485 continue;
@@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
503 u64 val; 510 u64 val;
504 int i; 511 int i;
505 512
506 for (i = 0; i < NUM_COUNTERS; ++i) { 513 for (i = 0; i < num_counters; ++i) {
507 int virt = op_x86_phys_to_virt(i); 514 int virt = op_x86_phys_to_virt(i);
508 if (!reset_value[virt]) 515 if (!reset_value[virt])
509 continue; 516 continue;
@@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
526 u64 val; 533 u64 val;
527 int i; 534 int i;
528 535
529 for (i = 0; i < NUM_COUNTERS; ++i) { 536 for (i = 0; i < num_counters; ++i) {
530 if (!reset_value[op_x86_phys_to_virt(i)]) 537 if (!reset_value[op_x86_phys_to_virt(i)])
531 continue; 538 continue;
532 rdmsrl(msrs->controls[i].addr, val); 539 rdmsrl(msrs->controls[i].addr, val);
@@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
546 * Subtle: stop on all counters to avoid race with setting our 553 * Subtle: stop on all counters to avoid race with setting our
547 * pm callback 554 * pm callback
548 */ 555 */
549 for (i = 0; i < NUM_COUNTERS; ++i) { 556 for (i = 0; i < num_counters; ++i) {
550 if (!reset_value[op_x86_phys_to_virt(i)]) 557 if (!reset_value[op_x86_phys_to_virt(i)])
551 continue; 558 continue;
552 rdmsrl(msrs->controls[i].addr, val); 559 rdmsrl(msrs->controls[i].addr, val);
@@ -603,6 +610,7 @@ static int force_ibs_eilvt_setup(void)
603 ret = setup_ibs_ctl(i); 610 ret = setup_ibs_ctl(i);
604 if (ret) 611 if (ret)
605 return ret; 612 return ret;
613 pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
606 return 0; 614 return 0;
607 } 615 }
608 616
@@ -630,21 +638,29 @@ static int __init_ibs_nmi(void)
630 return 0; 638 return 0;
631} 639}
632 640
633/* initialize the APIC for the IBS interrupts if available */ 641/*
642 * check and reserve APIC extended interrupt LVT offset for IBS if
643 * available
644 *
 645 * init_ibs() performs implicitly cpu-local operations, so pin this
646 * thread to its current CPU
647 */
648
634static void init_ibs(void) 649static void init_ibs(void)
635{ 650{
636 ibs_caps = get_ibs_caps(); 651 preempt_disable();
637 652
653 ibs_caps = get_ibs_caps();
638 if (!ibs_caps) 654 if (!ibs_caps)
639 return; 655 goto out;
640 656
641 if (__init_ibs_nmi()) { 657 if (__init_ibs_nmi() < 0)
642 ibs_caps = 0; 658 ibs_caps = 0;
643 return; 659 else
644 } 660 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
645 661
646 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", 662out:
647 (unsigned)ibs_caps); 663 preempt_enable();
648} 664}
649 665
650static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 666static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -698,18 +714,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
698 return 0; 714 return 0;
699} 715}
700 716
717struct op_x86_model_spec op_amd_spec;
718
701static int op_amd_init(struct oprofile_operations *ops) 719static int op_amd_init(struct oprofile_operations *ops)
702{ 720{
703 init_ibs(); 721 init_ibs();
704 create_arch_files = ops->create_files; 722 create_arch_files = ops->create_files;
705 ops->create_files = setup_ibs_files; 723 ops->create_files = setup_ibs_files;
724
725 if (boot_cpu_data.x86 == 0x15) {
726 num_counters = NUM_COUNTERS_F15H;
727 } else {
728 num_counters = NUM_COUNTERS;
729 }
730
731 op_amd_spec.num_counters = num_counters;
732 op_amd_spec.num_controls = num_counters;
733 op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
734
706 return 0; 735 return 0;
707} 736}
708 737
709struct op_x86_model_spec op_amd_spec = { 738struct op_x86_model_spec op_amd_spec = {
710 .num_counters = NUM_COUNTERS, 739 /* num_counters/num_controls filled in at runtime */
711 .num_controls = NUM_COUNTERS,
712 .num_virt_counters = NUM_VIRT_COUNTERS,
713 .reserved = MSR_AMD_EVENTSEL_RESERVED, 740 .reserved = MSR_AMD_EVENTSEL_RESERVED,
714 .event_mask = OP_EVENT_MASK, 741 .event_mask = OP_EVENT_MASK,
715 .init = op_amd_init, 742 .init = op_amd_init,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 182558dd5515..9fadec074142 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -11,7 +11,7 @@
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/nmi.h> 14#include <asm/nmi.h>
15#include <asm/msr.h> 15#include <asm/msr.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/apic.h> 17#include <asm/apic.h>
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index d769cda54082..94b745045e45 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -95,8 +95,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
95 * counter width: 95 * counter width:
96 */ 96 */
97 if (!(eax.split.version_id == 0 && 97 if (!(eax.split.version_id == 0 &&
98 current_cpu_data.x86 == 6 && 98 __this_cpu_read(cpu_info.x86) == 6 &&
99 current_cpu_data.x86_model == 15)) { 99 __this_cpu_read(cpu_info.x86_model) == 15)) {
100 100
101 if (counter_width < eax.split.bit_width) 101 if (counter_width < eax.split.bit_width)
102 counter_width = eax.split.bit_width; 102 counter_width = eax.split.bit_width;
@@ -235,8 +235,8 @@ static void arch_perfmon_setup_counters(void)
235 eax.full = cpuid_eax(0xa); 235 eax.full = cpuid_eax(0xa);
236 236
237 /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ 237 /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
238 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && 238 if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 &&
239 current_cpu_data.x86_model == 15) { 239 __this_cpu_read(cpu_info.x86_model) == 15) {
240 eax.split.version_id = 2; 240 eax.split.version_id = 2;
241 eax.split.num_counters = 2; 241 eax.split.num_counters = 2;
242 eax.split.bit_width = 40; 242 eax.split.bit_width = 40;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 15466c096ba5..0972315c3860 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -138,7 +138,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
138 struct acpi_resource_address64 addr; 138 struct acpi_resource_address64 addr;
139 acpi_status status; 139 acpi_status status;
140 unsigned long flags; 140 unsigned long flags;
141 struct resource *root, *conflict;
142 u64 start, end; 141 u64 start, end;
143 142
144 status = resource_to_addr(acpi_res, &addr); 143 status = resource_to_addr(acpi_res, &addr);
@@ -146,12 +145,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
146 return AE_OK; 145 return AE_OK;
147 146
148 if (addr.resource_type == ACPI_MEMORY_RANGE) { 147 if (addr.resource_type == ACPI_MEMORY_RANGE) {
149 root = &iomem_resource;
150 flags = IORESOURCE_MEM; 148 flags = IORESOURCE_MEM;
151 if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) 149 if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY)
152 flags |= IORESOURCE_PREFETCH; 150 flags |= IORESOURCE_PREFETCH;
153 } else if (addr.resource_type == ACPI_IO_RANGE) { 151 } else if (addr.resource_type == ACPI_IO_RANGE) {
154 root = &ioport_resource;
155 flags = IORESOURCE_IO; 152 flags = IORESOURCE_IO;
156 } else 153 } else
157 return AE_OK; 154 return AE_OK;
@@ -172,25 +169,90 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
172 return AE_OK; 169 return AE_OK;
173 } 170 }
174 171
175 conflict = insert_resource_conflict(root, res); 172 info->res_num++;
176 if (conflict) { 173 if (addr.translation_offset)
177 dev_err(&info->bridge->dev, 174 dev_info(&info->bridge->dev, "host bridge window %pR "
178 "address space collision: host bridge window %pR " 175 "(PCI address [%#llx-%#llx])\n",
179 "conflicts with %s %pR\n", 176 res, res->start - addr.translation_offset,
180 res, conflict->name, conflict); 177 res->end - addr.translation_offset);
181 } else { 178 else
182 pci_bus_add_resource(info->bus, res, 0); 179 dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
183 info->res_num++; 180
184 if (addr.translation_offset) 181 return AE_OK;
185 dev_info(&info->bridge->dev, "host bridge window %pR " 182}
186 "(PCI address [%#llx-%#llx])\n", 183
187 res, res->start - addr.translation_offset, 184static bool resource_contains(struct resource *res, resource_size_t point)
188 res->end - addr.translation_offset); 185{
186 if (res->start <= point && point <= res->end)
187 return true;
188 return false;
189}
190
191static void coalesce_windows(struct pci_root_info *info, int type)
192{
193 int i, j;
194 struct resource *res1, *res2;
195
196 for (i = 0; i < info->res_num; i++) {
197 res1 = &info->res[i];
198 if (!(res1->flags & type))
199 continue;
200
201 for (j = i + 1; j < info->res_num; j++) {
202 res2 = &info->res[j];
203 if (!(res2->flags & type))
204 continue;
205
206 /*
207 * I don't like throwing away windows because then
208 * our resources no longer match the ACPI _CRS, but
209 * the kernel resource tree doesn't allow overlaps.
210 */
211 if (resource_contains(res1, res2->start) ||
212 resource_contains(res1, res2->end) ||
213 resource_contains(res2, res1->start) ||
214 resource_contains(res2, res1->end)) {
215 res1->start = min(res1->start, res2->start);
216 res1->end = max(res1->end, res2->end);
217 dev_info(&info->bridge->dev,
218 "host bridge window expanded to %pR; %pR ignored\n",
219 res1, res2);
220 res2->flags = 0;
221 }
222 }
223 }
224}
225
226static void add_resources(struct pci_root_info *info)
227{
228 int i;
229 struct resource *res, *root, *conflict;
230
231 if (!pci_use_crs)
232 return;
233
234 coalesce_windows(info, IORESOURCE_MEM);
235 coalesce_windows(info, IORESOURCE_IO);
236
237 for (i = 0; i < info->res_num; i++) {
238 res = &info->res[i];
239
240 if (res->flags & IORESOURCE_MEM)
241 root = &iomem_resource;
242 else if (res->flags & IORESOURCE_IO)
243 root = &ioport_resource;
189 else 244 else
190 dev_info(&info->bridge->dev, 245 continue;
191 "host bridge window %pR\n", res); 246
247 conflict = insert_resource_conflict(root, res);
248 if (conflict)
249 dev_err(&info->bridge->dev,
250 "address space collision: host bridge window %pR "
251 "conflicts with %s %pR\n",
252 res, conflict->name, conflict);
253 else
254 pci_bus_add_resource(info->bus, res, 0);
192 } 255 }
193 return AE_OK;
194} 256}
195 257
196static void 258static void
@@ -224,6 +286,7 @@ get_current_resources(struct acpi_device *device, int busnum,
224 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 286 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
225 &info); 287 &info);
226 288
289 add_resources(&info);
227 return; 290 return;
228 291
229name_alloc_fail: 292name_alloc_fail:
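
A minimal stand-alone sketch of the window test and merge step used by coalesce_windows() above (not part of the patch; the struct and the sample ranges are made up for illustration). The four resource_contains() checks amount to the usual closed-interval overlap test, and overlapping windows are merged by taking the minimum start and maximum end:

/* overlap.c: user-space illustration of the coalescing logic */
#include <stdio.h>

struct window { unsigned long start, end; };

/* Equivalent to the four resource_contains() checks: closed ranges
 * overlap iff each one starts no later than the other one ends. */
static int windows_overlap(const struct window *a, const struct window *b)
{
	return a->start <= b->end && b->start <= a->end;
}

int main(void)
{
	struct window w1 = { 0x1000, 0x1fff };	/* sample host bridge windows */
	struct window w2 = { 0x1800, 0x2fff };

	if (windows_overlap(&w1, &w2)) {
		/* expand w1 to cover both and ignore w2, as the patch does */
		if (w2.start < w1.start)
			w1.start = w2.start;
		if (w2.end > w1.end)
			w1.end = w2.end;
		printf("merged window: [%#lx-%#lx]\n", w1.start, w1.end);
	}
	return 0;
}
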
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index fc1e8fe07e5c..e27dffbbb1a7 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/range.h> 5#include <linux/range.h>
6 6
7#include <asm/amd_nb.h>
7#include <asm/pci_x86.h> 8#include <asm/pci_x86.h>
8 9
9#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
@@ -378,6 +379,34 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = {
378 .notifier_call = amd_cpu_notify, 379 .notifier_call = amd_cpu_notify,
379}; 380};
380 381
382static void __init pci_enable_pci_io_ecs(void)
383{
384#ifdef CONFIG_AMD_NB
385 unsigned int i, n;
386
387 for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
388 u8 bus = amd_nb_bus_dev_ranges[i].bus;
389 u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
390 u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
391
392 for (; slot < limit; ++slot) {
393 u32 val = read_pci_config(bus, slot, 3, 0);
394
395 if (!early_is_amd_nb(val))
396 continue;
397
398 val = read_pci_config(bus, slot, 3, 0x8c);
399 if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) {
400 val |= ENABLE_CF8_EXT_CFG >> 32;
401 write_pci_config(bus, slot, 3, 0x8c, val);
402 }
403 ++n;
404 }
405 }
406 pr_info("Extended Config Space enabled on %u nodes\n", n);
407#endif
408}
409
381static int __init pci_io_ecs_init(void) 410static int __init pci_io_ecs_init(void)
382{ 411{
383 int cpu; 412 int cpu;
@@ -386,6 +415,10 @@ static int __init pci_io_ecs_init(void)
386 if (boot_cpu_data.x86 < 0x10) 415 if (boot_cpu_data.x86 < 0x10)
387 return 0; 416 return 0;
388 417
418 /* Try the PCI method first. */
419 if (early_pci_allowed())
420 pci_enable_pci_io_ecs();
421
389 register_cpu_notifier(&amd_cpu_notifier); 422 register_cpu_notifier(&amd_cpu_notifier);
390 for_each_online_cpu(cpu) 423 for_each_online_cpu(cpu)
391 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, 424 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 0846a5bbbfbd..ab8269b0da29 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -9,6 +9,7 @@
9 * option) any later version. 9 * option) any later version.
10 */ 10 */
11 11
12#include <linux/acpi.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/dmi.h> 14#include <linux/dmi.h>
14#include <linux/pci.h> 15#include <linux/pci.h>
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
25 u8 fbus, lbus; 26 u8 fbus, lbus;
26 int i; 27 int i;
27 28
29#ifdef CONFIG_ACPI
28 /* 30 /*
29 * The x86_pci_root_bus_res_quirks() function already refuses to use 31 * We should get host bridge information from ACPI unless the BIOS
30 * this information if ACPI _CRS was used. Therefore, we don't bother 32 * doesn't support it.
31 * checking if ACPI is enabled, and just generate the information
32 * for both the ACPI _CRS and no ACPI cases.
33 */ 33 */
34 if (acpi_os_get_root_pointer())
35 return;
36#endif
34 37
35 info = &pci_root_info[pci_root_num]; 38 info = &pci_root_info[pci_root_num];
36 pci_root_num++; 39 pci_root_num++;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f7c8a399978c..5fe75026ecc2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
22 22
23unsigned int pci_early_dump_regs; 23unsigned int pci_early_dump_regs;
24static int pci_bf_sort; 24static int pci_bf_sort;
25static int smbios_type_b1_flag;
25int pci_routeirq; 26int pci_routeirq;
26int noioapicquirk; 27int noioapicquirk;
27#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS 28#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d)
185 return 0; 186 return 0;
186} 187}
187 188
189static void __devinit read_dmi_type_b1(const struct dmi_header *dm,
190 void *private_data)
191{
192 u8 *d = (u8 *)dm + 4;
193
194 if (dm->type != 0xB1)
195 return;
196 switch (((*(u32 *)d) >> 9) & 0x03) {
197 case 0x00:
198 printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
199 break;
200 case 0x01: /* set pci=bfsort */
201 smbios_type_b1_flag = 1;
202 break;
203 case 0x02: /* do not set pci=bfsort */
204 smbios_type_b1_flag = 2;
205 break;
206 default:
207 break;
208 }
209}
210
211static int __devinit find_sort_method(const struct dmi_system_id *d)
212{
213 dmi_walk(read_dmi_type_b1, NULL);
214
215 if (smbios_type_b1_flag == 1) {
216 set_bf_sort(d);
217 return 0;
218 }
219 return -1;
220}
221
188/* 222/*
189 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) 223 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
190 */ 224 */
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
213 }, 247 },
214#endif /* __i386__ */ 248#endif /* __i386__ */
215 { 249 {
250 .callback = find_sort_method,
251 .ident = "Dell System",
252 .matches = {
253 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
254 },
255 },
256 {
216 .callback = set_bf_sort, 257 .callback = set_bf_sort,
217 .ident = "Dell PowerEdge 1950", 258 .ident = "Dell PowerEdge 1950",
218 .matches = { 259 .matches = {
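
For reference, a stand-alone sketch of the SMBIOS type 0xB1 decode performed by read_dmi_type_b1() above (not part of the patch): bits 9-10 of the dword that follows the 4-byte record header carry Dell's bfsort hint. The sample payload below is fabricated.

#include <stdio.h>
#include <stdint.h>

static const char *decode_b1_flag(uint32_t dword)
{
	/* same bit extraction as the patch: (dword >> 9) & 0x03 */
	switch ((dword >> 9) & 0x03) {
	case 0x00: return "unknown flag";
	case 0x01: return "set pci=bfsort";
	case 0x02: return "do not set pci=bfsort";
	default:   return "reserved";
	}
}

int main(void)
{
	uint32_t payload = 1u << 9;	/* fabricated record payload */

	printf("type 0xB1 hint: %s\n", decode_b1_flag(payload));
	return 0;
}
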
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index c4bb261c106e..b1805b78842f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -65,21 +65,13 @@ pcibios_align_resource(void *data, const struct resource *res,
65 resource_size_t size, resource_size_t align) 65 resource_size_t size, resource_size_t align)
66{ 66{
67 struct pci_dev *dev = data; 67 struct pci_dev *dev = data;
68 resource_size_t start = round_down(res->end - size + 1, align); 68 resource_size_t start = res->start;
69 69
70 if (res->flags & IORESOURCE_IO) { 70 if (res->flags & IORESOURCE_IO) {
71 71 if (skip_isa_ioresource_align(dev))
72 /* 72 return start;
73 * If we're avoiding ISA aliases, the largest contiguous I/O 73 if (start & 0x300)
74 * port space is 256 bytes. Clearing bits 9 and 10 preserves 74 start = (start + 0x3ff) & ~0x3ff;
75 * all 256-byte and smaller alignments, so the result will
76 * still be correctly aligned.
77 */
78 if (!skip_isa_ioresource_align(dev))
79 start &= ~0x300;
80 } else if (res->flags & IORESOURCE_MEM) {
81 if (start < BIOS_END)
82 start = res->end; /* fail; no space */
83 } 75 }
84 return start; 76 return start;
85} 77}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9f9bfb705cf9..87e6c8323117 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: 592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
593 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
593 r->name = "PIIX/ICH"; 594 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 595 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 596 r->set = pirq_piix_set;
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d165096a..a5f7d0d63de0 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11#include <asm/pci-functions.h> 11#include <asm/pci-functions.h>
12#include <asm/cacheflush.h>
12 13
13/* BIOS32 signature: "_32_" */ 14/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) 15#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
25#define PCIBIOS_HW_TYPE1_SPEC 0x10 26#define PCIBIOS_HW_TYPE1_SPEC 0x10
26#define PCIBIOS_HW_TYPE2_SPEC 0x20 27#define PCIBIOS_HW_TYPE2_SPEC 0x20
27 28
29int pcibios_enabled;
30
31/* According to the BIOS specification at:
32 * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
33 * restrict the x zone to some pages and make it ro. But this may be
 34 * broken on some BIOSes, and complex to handle with static_protections.
35 * We could make the 0xe0000-0x100000 range rox, but this can break
36 * some ISA mapping.
37 *
 38 * So we leave an rw and x hole when pcibios is used. This shouldn't
 39 * happen on modern systems with mmconfig, and if you don't want it
40 * you could disable pcibios...
41 */
42static inline void set_bios_x(void)
43{
44 pcibios_enabled = 1;
45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
46 if (__supported_pte_mask & _PAGE_NX)
 47 printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
48}
49
28/* 50/*
29 * This is the standard structure used to identify the entry point 51 * This is the standard structure used to identify the entry point
30 * to the BIOS32 Service Directory, as documented in 52 * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
332 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", 354 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
333 bios32_entry); 355 bios32_entry);
334 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 356 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
357 set_bios_x();
335 if (check_pcibios()) 358 if (check_pcibios())
336 return &pci_bios_access; 359 return &pci_bios_access;
337 } 360 }
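
As a rough user-space analogue of what set_bios_x() does (not part of the patch), mprotect() below stands in for set_memory_x() and flips an anonymous mapping to rw+x; the mapping itself is made up, and hardened systems may refuse the request:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* add execute permission, analogous to marking the BIOS range x */
	if (mprotect(p, len, PROT_READ | PROT_WRITE | PROT_EXEC))
		perror("mprotect");
	else
		printf("region %p is now rw and x\n", p);

	munmap(p, len);
	return 0;
}
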
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 117f5b8daf75..25cd4a07d09f 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -70,6 +70,9 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
70struct xen_pci_frontend_ops *xen_pci_frontend; 70struct xen_pci_frontend_ops *xen_pci_frontend;
71EXPORT_SYMBOL_GPL(xen_pci_frontend); 71EXPORT_SYMBOL_GPL(xen_pci_frontend);
72 72
73#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
74 MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
75
73static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, 76static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
74 struct msi_msg *msg) 77 struct msi_msg *msg)
75{ 78{
@@ -83,12 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
83 MSI_ADDR_REDIRECTION_CPU | 86 MSI_ADDR_REDIRECTION_CPU |
84 MSI_ADDR_DEST_ID(pirq); 87 MSI_ADDR_DEST_ID(pirq);
85 88
86 msg->data = 89 msg->data = XEN_PIRQ_MSI_DATA;
87 MSI_DATA_TRIGGER_EDGE |
88 MSI_DATA_LEVEL_ASSERT |
89 /* delivery mode reserved */
90 (3 << 8) |
91 MSI_DATA_VECTOR(0);
92} 90}
93 91
94static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 92static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
@@ -98,8 +96,23 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
98 struct msi_msg msg; 96 struct msi_msg msg;
99 97
100 list_for_each_entry(msidesc, &dev->msi_list, list) { 98 list_for_each_entry(msidesc, &dev->msi_list, list) {
99 __read_msi_msg(msidesc, &msg);
100 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
101 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
102 if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
103 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
104 "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
105 if (irq < 0)
106 goto error;
107 ret = set_irq_msi(irq, msidesc);
108 if (ret < 0)
109 goto error_while;
110 printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
111 " pirq=%d\n", irq, pirq);
112 return 0;
113 }
101 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 114 xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
102 "msi-x" : "msi", &irq, &pirq); 115 "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ));
103 if (irq < 0 || pirq < 0) 116 if (irq < 0 || pirq < 0)
104 goto error; 117 goto error;
105 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); 118 printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
@@ -147,8 +160,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
147 irq = xen_allocate_pirq(v[i], 0, /* not sharable */ 160 irq = xen_allocate_pirq(v[i], 0, /* not sharable */
148 (type == PCI_CAP_ID_MSIX) ? 161 (type == PCI_CAP_ID_MSIX) ?
149 "pcifront-msi-x" : "pcifront-msi"); 162 "pcifront-msi-x" : "pcifront-msi");
150 if (irq < 0) 163 if (irq < 0) {
151 return -1; 164 ret = -1;
165 goto free;
166 }
152 167
153 ret = set_irq_msi(irq, msidesc); 168 ret = set_irq_msi(irq, msidesc);
154 if (ret) 169 if (ret)
@@ -164,7 +179,7 @@ error:
164 if (ret == -ENODEV) 179 if (ret == -ENODEV)
165 dev_err(&dev->dev, "Xen PCI frontend has not registered" \ 180 dev_err(&dev->dev, "Xen PCI frontend has not registered" \
166 " MSI/MSI-X support!\n"); 181 " MSI/MSI-X support!\n");
167 182free:
168 kfree(v); 183 kfree(v);
169 return ret; 184 return ret;
170} 185}
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 65df603622b2..25bfdbb5b130 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -103,7 +103,7 @@ struct dw_spi_reg {
103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; 103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
104 104
105static u32 *pclk_spi0; 105static u32 *pclk_spi0;
106/* Always contains an accessable address, start with 0 */ 106/* Always contains an accessible address, start with 0 */
107static struct dw_spi_reg *pspi; 107static struct dw_spi_reg *pspi;
108 108
109static struct kmsg_dumper dw_dumper; 109static struct kmsg_dumper dw_dumper;
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 4c542c757cb4..5c0207bf959b 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -72,32 +72,6 @@ struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
72EXPORT_SYMBOL_GPL(sfi_mrtc_array); 72EXPORT_SYMBOL_GPL(sfi_mrtc_array);
73int sfi_mrtc_num; 73int sfi_mrtc_num;
74 74
75static inline void assign_to_mp_irq(struct mpc_intsrc *m,
76 struct mpc_intsrc *mp_irq)
77{
78 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
79}
80
81static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
82 struct mpc_intsrc *m)
83{
84 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
85}
86
87static void save_mp_irq(struct mpc_intsrc *m)
88{
89 int i;
90
91 for (i = 0; i < mp_irq_entries; i++) {
92 if (!mp_irq_cmp(&mp_irqs[i], m))
93 return;
94 }
95
96 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
97 if (++mp_irq_entries == MAX_IRQ_SOURCES)
98 panic("Max # of irq sources exceeded!!\n");
99}
100
101/* parse all the mtimer info to a static mtimer array */ 75/* parse all the mtimer info to a static mtimer array */
102static int __init sfi_parse_mtmr(struct sfi_table_header *table) 76static int __init sfi_parse_mtmr(struct sfi_table_header *table)
103{ 77{
@@ -131,7 +105,7 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
131 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 105 mp_irq.srcbusirq = pentry->irq; /* IRQ */
132 mp_irq.dstapic = MP_APIC_ALL; 106 mp_irq.dstapic = MP_APIC_ALL;
133 mp_irq.dstirq = pentry->irq; 107 mp_irq.dstirq = pentry->irq;
134 save_mp_irq(&mp_irq); 108 mp_save_irq(&mp_irq);
135 } 109 }
136 110
137 return 0; 111 return 0;
@@ -201,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
201 mp_irq.srcbusirq = pentry->irq; /* IRQ */ 175 mp_irq.srcbusirq = pentry->irq; /* IRQ */
202 mp_irq.dstapic = MP_APIC_ALL; 176 mp_irq.dstapic = MP_APIC_ALL;
203 mp_irq.dstirq = pentry->irq; 177 mp_irq.dstirq = pentry->irq;
204 save_mp_irq(&mp_irq); 178 mp_save_irq(&mp_irq);
205 } 179 }
206 return 0; 180 return 0;
207} 181}
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c31b8fcb5a86..e797428b163b 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 3obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
4obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index f5442c03abc3..127775696d6c 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Support for features of the OLPC XO-1 laptop 2 * Support for features of the OLPC XO-1 laptop
3 * 3 *
4 * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
4 * Copyright (C) 2010 One Laptop per Child 5 * Copyright (C) 2010 One Laptop per Child
5 * Copyright (C) 2006 Red Hat, Inc. 6 * Copyright (C) 2006 Red Hat, Inc.
6 * Copyright (C) 2006 Advanced Micro Devices, Inc. 7 * Copyright (C) 2006 Advanced Micro Devices, Inc.
@@ -12,8 +13,6 @@
12 */ 13 */
13 14
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/pci.h>
16#include <linux/pci_ids.h>
17#include <linux/platform_device.h> 16#include <linux/platform_device.h>
18#include <linux/pm.h> 17#include <linux/pm.h>
19 18
@@ -22,9 +21,6 @@
22 21
23#define DRV_NAME "olpc-xo1" 22#define DRV_NAME "olpc-xo1"
24 23
25#define PMS_BAR 4
26#define ACPI_BAR 5
27
28/* PMC registers (PMS block) */ 24/* PMC registers (PMS block) */
29#define PM_SCLK 0x10 25#define PM_SCLK 0x10
30#define PM_IN_SLPCTL 0x20 26#define PM_IN_SLPCTL 0x20
@@ -57,65 +53,67 @@ static void xo1_power_off(void)
57 outl(0x00002000, acpi_base + PM1_CNT); 53 outl(0x00002000, acpi_base + PM1_CNT);
58} 54}
59 55
60/* Read the base addresses from the PCI BAR info */ 56static int __devinit olpc_xo1_probe(struct platform_device *pdev)
61static int __devinit setup_bases(struct pci_dev *pdev)
62{ 57{
63 int r; 58 struct resource *res;
64 59
65 r = pci_enable_device_io(pdev); 60 /* don't run on non-XOs */
66 if (r) { 61 if (!machine_is_olpc())
67 dev_err(&pdev->dev, "can't enable device IO\n"); 62 return -ENODEV;
68 return r;
69 }
70 63
71 r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); 64 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
72 if (r) { 65 if (!res) {
73 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); 66 dev_err(&pdev->dev, "can't fetch device resource info\n");
74 return r; 67 return -EIO;
75 } 68 }
76 69
77 r = pci_request_region(pdev, PMS_BAR, DRV_NAME); 70 if (!request_region(res->start, resource_size(res), DRV_NAME)) {
78 if (r) { 71 dev_err(&pdev->dev, "can't request region\n");
79 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); 72 return -EIO;
80 pci_release_region(pdev, ACPI_BAR);
81 return r;
82 } 73 }
83 74
84 acpi_base = pci_resource_start(pdev, ACPI_BAR); 75 if (strcmp(pdev->name, "cs5535-pms") == 0)
85 pms_base = pci_resource_start(pdev, PMS_BAR); 76 pms_base = res->start;
77 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
78 acpi_base = res->start;
79
80 /* If we have both addresses, we can override the poweroff hook */
81 if (pms_base && acpi_base) {
82 pm_power_off = xo1_power_off;
83 printk(KERN_INFO "OLPC XO-1 support registered\n");
84 }
86 85
87 return 0; 86 return 0;
88} 87}
89 88
90static int __devinit olpc_xo1_probe(struct platform_device *pdev) 89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
91{ 90{
92 struct pci_dev *pcidev; 91 struct resource *r;
93 int r;
94
95 pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
96 NULL);
97 if (!pdev)
98 return -ENODEV;
99
100 r = setup_bases(pcidev);
101 if (r)
102 return r;
103 92
104 pm_power_off = xo1_power_off; 93 r = platform_get_resource(pdev, IORESOURCE_IO, 0);
94 release_region(r->start, resource_size(r));
105 95
106 printk(KERN_INFO "OLPC XO-1 support registered\n"); 96 if (strcmp(pdev->name, "cs5535-pms") == 0)
107 return 0; 97 pms_base = 0;
108} 98 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
99 acpi_base = 0;
109 100
110static int __devexit olpc_xo1_remove(struct platform_device *pdev)
111{
112 pm_power_off = NULL; 101 pm_power_off = NULL;
113 return 0; 102 return 0;
114} 103}
115 104
116static struct platform_driver olpc_xo1_driver = { 105static struct platform_driver cs5535_pms_drv = {
106 .driver = {
107 .name = "cs5535-pms",
108 .owner = THIS_MODULE,
109 },
110 .probe = olpc_xo1_probe,
111 .remove = __devexit_p(olpc_xo1_remove),
112};
113
114static struct platform_driver cs5535_acpi_drv = {
117 .driver = { 115 .driver = {
118 .name = DRV_NAME, 116 .name = "cs5535-acpi",
119 .owner = THIS_MODULE, 117 .owner = THIS_MODULE,
120 }, 118 },
121 .probe = olpc_xo1_probe, 119 .probe = olpc_xo1_probe,
@@ -124,12 +122,23 @@ static struct platform_driver olpc_xo1_driver = {
124 122
125static int __init olpc_xo1_init(void) 123static int __init olpc_xo1_init(void)
126{ 124{
127 return platform_driver_register(&olpc_xo1_driver); 125 int r;
126
127 r = platform_driver_register(&cs5535_pms_drv);
128 if (r)
129 return r;
130
131 r = platform_driver_register(&cs5535_acpi_drv);
132 if (r)
133 platform_driver_unregister(&cs5535_pms_drv);
134
135 return r;
128} 136}
129 137
130static void __exit olpc_xo1_exit(void) 138static void __exit olpc_xo1_exit(void)
131{ 139{
132 platform_driver_unregister(&olpc_xo1_driver); 140 platform_driver_unregister(&cs5535_acpi_drv);
141 platform_driver_unregister(&cs5535_pms_drv);
133} 142}
134 143
135MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); 144MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
new file mode 100644
index 000000000000..dab874647530
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -0,0 +1,183 @@
1/*
2 * OLPC-specific OFW device tree support code.
3 *
4 * Paul Mackerras August 1996.
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
8 * {engebret|bergner}@us.ibm.com
9 *
10 * Adapted for sparc by David S. Miller davem@davemloft.net
11 * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bootmem.h>
21#include <linux/of.h>
22#include <linux/of_pdt.h>
23#include <asm/olpc_ofw.h>
24
25static phandle __init olpc_dt_getsibling(phandle node)
26{
27 const void *args[] = { (void *)node };
28 void *res[] = { &node };
29
30 if ((s32)node == -1)
31 return 0;
32
33 if (olpc_ofw("peer", args, res) || (s32)node == -1)
34 return 0;
35
36 return node;
37}
38
39static phandle __init olpc_dt_getchild(phandle node)
40{
41 const void *args[] = { (void *)node };
42 void *res[] = { &node };
43
44 if ((s32)node == -1)
45 return 0;
46
47 if (olpc_ofw("child", args, res) || (s32)node == -1) {
48 pr_err("PROM: %s: fetching child failed!\n", __func__);
49 return 0;
50 }
51
52 return node;
53}
54
55static int __init olpc_dt_getproplen(phandle node, const char *prop)
56{
57 const void *args[] = { (void *)node, prop };
58 int len;
59 void *res[] = { &len };
60
61 if ((s32)node == -1)
62 return -1;
63
64 if (olpc_ofw("getproplen", args, res)) {
65 pr_err("PROM: %s: getproplen failed!\n", __func__);
66 return -1;
67 }
68
69 return len;
70}
71
72static int __init olpc_dt_getproperty(phandle node, const char *prop,
73 char *buf, int bufsize)
74{
75 int plen;
76
77 plen = olpc_dt_getproplen(node, prop);
78 if (plen > bufsize || plen < 1) {
79 return -1;
80 } else {
81 const void *args[] = { (void *)node, prop, buf, (void *)plen };
82 void *res[] = { &plen };
83
84 if (olpc_ofw("getprop", args, res)) {
85 pr_err("PROM: %s: getprop failed!\n", __func__);
86 return -1;
87 }
88 }
89
90 return plen;
91}
92
93static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf)
94{
95 const void *args[] = { (void *)node, prev, buf };
96 int success;
97 void *res[] = { &success };
98
99 buf[0] = '\0';
100
101 if ((s32)node == -1)
102 return -1;
103
104 if (olpc_ofw("nextprop", args, res) || success != 1)
105 return -1;
106
107 return 0;
108}
109
110static int __init olpc_dt_pkg2path(phandle node, char *buf,
111 const int buflen, int *len)
112{
113 const void *args[] = { (void *)node, buf, (void *)buflen };
114 void *res[] = { len };
115
116 if ((s32)node == -1)
117 return -1;
118
119 if (olpc_ofw("package-to-path", args, res) || *len < 1)
120 return -1;
121
122 return 0;
123}
124
125static unsigned int prom_early_allocated __initdata;
126
127void * __init prom_early_alloc(unsigned long size)
128{
129 static u8 *mem;
130 static size_t free_mem;
131 void *res;
132
133 if (free_mem < size) {
134 const size_t chunk_size = max(PAGE_SIZE, size);
135
136 /*
 137 * To minimize the number of allocations, grab at least
138 * PAGE_SIZE of memory (that's an arbitrary choice that's
139 * fast enough on the platforms we care about while minimizing
140 * wasted bootmem) and hand off chunks of it to callers.
141 */
142 res = alloc_bootmem(chunk_size);
143 if (!res)
144 return NULL;
145 prom_early_allocated += chunk_size;
146 memset(res, 0, chunk_size);
147 free_mem = chunk_size;
148 mem = res;
149 }
150
151 /* allocate from the local cache */
152 free_mem -= size;
153 res = mem;
154 mem += size;
155 return res;
156}
157
158static struct of_pdt_ops prom_olpc_ops __initdata = {
159 .nextprop = olpc_dt_nextprop,
160 .getproplen = olpc_dt_getproplen,
161 .getproperty = olpc_dt_getproperty,
162 .getchild = olpc_dt_getchild,
163 .getsibling = olpc_dt_getsibling,
164 .pkg2path = olpc_dt_pkg2path,
165};
166
167void __init olpc_dt_build_devicetree(void)
168{
169 phandle root;
170
171 if (!olpc_ofw_is_installed())
172 return;
173
174 root = olpc_dt_getsibling(0);
175 if (!root) {
176 pr_err("PROM: unable to get root node from OFW!\n");
177 return;
178 }
179 of_pdt_build_devicetree(root, &prom_olpc_ops);
180
181 pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
182 prom_early_allocated);
183}
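
prom_early_alloc() above is a simple bump allocator: it refills a local cache one chunk at a time and hands out pieces of it. A stand-alone sketch of the same pattern (not part of the patch), with malloc() standing in for alloc_bootmem() and an arbitrary 4 KiB chunk size:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define CHUNK_SIZE 4096UL

static void *bump_alloc(size_t size)
{
	static unsigned char *mem;
	static size_t free_mem;
	void *res;

	if (free_mem < size) {
		/* refill the local cache from the backing allocator */
		size_t chunk = size > CHUNK_SIZE ? size : CHUNK_SIZE;

		res = malloc(chunk);
		if (!res)
			return NULL;
		memset(res, 0, chunk);
		mem = res;
		free_mem = chunk;
	}

	/* hand out the next piece of the current chunk */
	res = mem;
	mem += size;
	free_mem -= size;
	return res;
}

int main(void)
{
	char *a = bump_alloc(16);
	char *b = bump_alloc(32);

	printf("a=%p b=%p\n", (void *)a, (void *)b);
	return 0;
}
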
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
index 787320464379..e7604f62870d 100644
--- a/arch/x86/platform/olpc/olpc_ofw.c
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -110,3 +110,8 @@ void __init olpc_ofw_detect(void)
110 (unsigned long)olpc_ofw_cif, (-start) >> 20); 110 (unsigned long)olpc_ofw_cif, (-start) >> 20);
111 reserve_top_address(-start); 111 reserve_top_address(-start);
112} 112}
113
114bool __init olpc_ofw_is_installed(void)
115{
116 return olpc_ofw_cif != NULL;
117}
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index dd4c281ffe57..7785b72ecc3a 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -34,23 +34,12 @@
34#ifdef CONFIG_X86_LOCAL_APIC 34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36 36
37static void __init mp_sfi_register_lapic_address(unsigned long address)
38{
39 mp_lapic_addr = address;
40
41 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
42 if (boot_cpu_physical_apicid == -1U)
43 boot_cpu_physical_apicid = read_apic_id();
44
45 pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
46}
47
48/* All CPUs enumerated by SFI must be present and enabled */ 37/* All CPUs enumerated by SFI must be present and enabled */
49static void __cpuinit mp_sfi_register_lapic(u8 id) 38static void __cpuinit mp_sfi_register_lapic(u8 id)
50{ 39{
51 if (MAX_APICS - id <= 0) { 40 if (MAX_LOCAL_APIC - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n", 41 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS); 42 id, MAX_LOCAL_APIC);
54 return; 43 return;
55 } 44 }
56 45
@@ -110,7 +99,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
110int __init sfi_platform_init(void) 99int __init sfi_platform_init(void)
111{ 100{
112#ifdef CONFIG_X86_LOCAL_APIC 101#ifdef CONFIG_X86_LOCAL_APIC
113 mp_sfi_register_lapic_address(sfi_lapic_addr); 102 register_lapic_address(sfi_lapic_addr);
114 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); 103 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
115#endif 104#endif
116#ifdef CONFIG_X86_IO_APIC 105#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 20ea20a39e2a..df58e9cad96a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1341,10 +1341,10 @@ uv_activation_descriptor_init(int node, int pnode)
1341 1341
1342 /* 1342 /*
1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1343 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1344 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub 1344 * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
1345 */ 1345 */
1346 bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* 1346 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
1347 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1347 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
1348 BUG_ON(!bau_desc); 1348 BUG_ON(!bau_desc);
1349 1349
1350 pa = uv_gpa(bau_desc); /* need the real nasid*/ 1350 pa = uv_gpa(bau_desc); /* need the real nasid*/
@@ -1402,9 +1402,9 @@ uv_payload_queue_init(int node, int pnode)
1402 struct bau_payload_queue_entry *pqp_malloc; 1402 struct bau_payload_queue_entry *pqp_malloc;
1403 struct bau_control *bcp; 1403 struct bau_control *bcp;
1404 1404
1405 pqp = (struct bau_payload_queue_entry *) kmalloc_node( 1405 pqp = kmalloc_node((DEST_Q_SIZE + 1)
1406 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), 1406 * sizeof(struct bau_payload_queue_entry),
1407 GFP_KERNEL, node); 1407 GFP_KERNEL, node);
1408 BUG_ON(!pqp); 1408 BUG_ON(!pqp);
1409 pqp_malloc = pqp; 1409 pqp_malloc = pqp;
1410 1410
@@ -1455,7 +1455,7 @@ static void __init uv_init_uvhub(int uvhub, int vector)
1455 * the below initialization can't be in firmware because the 1455 * the below initialization can't be in firmware because the
1456 * messaging IRQ will be determined by the OS 1456 * messaging IRQ will be determined by the OS
1457 */ 1457 */
1458 apicid = uvhub_to_first_apicid(uvhub); 1458 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
1459 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1459 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
1460 ((apicid << 32) | vector)); 1460 ((apicid << 32) | vector));
1461} 1461}
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void)
1490/* 1490/*
1491 * initialize the bau_control structure for each cpu 1491 * initialize the bau_control structure for each cpu
1492 */ 1492 */
1493static void __init uv_init_per_cpu(int nuvhubs) 1493static int __init uv_init_per_cpu(int nuvhubs)
1494{ 1494{
1495 int i; 1495 int i;
1496 int cpu; 1496 int cpu;
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
1507 struct bau_control *smaster = NULL; 1507 struct bau_control *smaster = NULL;
1508 struct socket_desc { 1508 struct socket_desc {
1509 short num_cpus; 1509 short num_cpus;
1510 short cpu_number[16]; 1510 short cpu_number[MAX_CPUS_PER_SOCKET];
1511 }; 1511 };
1512 struct uvhub_desc { 1512 struct uvhub_desc {
1513 unsigned short socket_mask; 1513 unsigned short socket_mask;
@@ -1520,8 +1520,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
1520 1520
1521 timeout_us = calculate_destination_timeout(); 1521 timeout_us = calculate_destination_timeout();
1522 1522
1523 uvhub_descs = (struct uvhub_desc *) 1523 uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1524 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1525 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); 1524 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1526 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); 1525 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1527 for_each_present_cpu(cpu) { 1526 for_each_present_cpu(cpu) {
@@ -1541,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs)
1541 sdp = &bdp->socket[socket]; 1540 sdp = &bdp->socket[socket];
1542 sdp->cpu_number[sdp->num_cpus] = cpu; 1541 sdp->cpu_number[sdp->num_cpus] = cpu;
1543 sdp->num_cpus++; 1542 sdp->num_cpus++;
1543 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
1544 printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
1545 return 1;
1546 }
1544 } 1547 }
1545 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1548 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1546 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) 1549 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
@@ -1571,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs)
1571 bcp->uvhub_master = hmaster; 1574 bcp->uvhub_master = hmaster;
1572 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> 1575 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1573 blade_processor_id; 1576 blade_processor_id;
1577 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1578 printk(KERN_EMERG
1579 "%d cpus per uvhub invalid\n",
1580 bcp->uvhub_cpu);
1581 return 1;
1582 }
1574 } 1583 }
1575nextsocket: 1584nextsocket:
1576 socket++; 1585 socket++;
@@ -1596,6 +1605,7 @@ nextsocket:
1596 bcp->congested_reps = congested_reps; 1605 bcp->congested_reps = congested_reps;
1597 bcp->congested_period = congested_period; 1606 bcp->congested_period = congested_period;
1598 } 1607 }
1608 return 0;
1599} 1609}
1600 1610
1601/* 1611/*
@@ -1626,7 +1636,10 @@ static int __init uv_bau_init(void)
1626 spin_lock_init(&disable_lock); 1636 spin_lock_init(&disable_lock);
1627 congested_cycles = microsec_2_cycles(congested_response_us); 1637 congested_cycles = microsec_2_cycles(congested_response_us);
1628 1638
1629 uv_init_per_cpu(nuvhubs); 1639 if (uv_init_per_cpu(nuvhubs)) {
1640 nobau = 1;
1641 return 0;
1642 }
1630 1643
1631 uv_partition_base_pnode = 0x7fffffff; 1644 uv_partition_base_pnode = 0x7fffffff;
1632 for (uvhub = 0; uvhub < nuvhubs; uvhub++) 1645 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 56e421bc379b..9daf5d1af9f1 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -89,6 +89,7 @@ static void uv_rtc_send_IPI(int cpu)
89 89
90 apicid = cpu_physical_id(cpu); 90 apicid = cpu_physical_id(cpu);
91 pnode = uv_apicid_to_pnode(apicid); 91 pnode = uv_apicid_to_pnode(apicid);
92 apicid |= uv_apicid_hibits;
92 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 93 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 94 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 95 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
@@ -107,6 +108,7 @@ static int uv_intr_pending(int pnode)
107static int uv_setup_intr(int cpu, u64 expires) 108static int uv_setup_intr(int cpu, u64 expires)
108{ 109{
109 u64 val; 110 u64 val;
111 unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits;
110 int pnode = uv_cpu_to_pnode(cpu); 112 int pnode = uv_cpu_to_pnode(cpu);
111 113
112 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, 114 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
@@ -117,7 +119,7 @@ static int uv_setup_intr(int cpu, u64 expires)
117 UVH_EVENT_OCCURRED0_RTC1_MASK); 119 UVH_EVENT_OCCURRED0_RTC1_MASK);
118 120
119 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 121 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
120 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 122 ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
121 123
122 /* Set configuration */ 124 /* Set configuration */
123 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); 125 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 3371bd053b89..632037671746 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
171 ver = m->apicver; 171 ver = m->apicver;
172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { 172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", 173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
174 m->apicid, MAX_APICS); 174 m->apicid, MAX_LOCAL_APIC);
175 return; 175 return;
176 } 176 }
177 177
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4a2afa1bac51..b6552b189bcd 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
25 25
26export CPPFLAGS_vdso.lds += -P -C 26export CPPFLAGS_vdso.lds += -P -C
27 27
28VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ 28VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
30 30
31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so 31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
@@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter
69vdso32-images = $(vdso32.so-y:%=vdso32-%.so) 69vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
70 70
71CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) 71CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
72VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 72VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
73 73
74# This makes sure the $(obj) subdirectory exists even though vdso32/ 74# This makes sure the $(obj) subdirectory exists even though vdso32/
75# is not a kbuild sub-make subdirectory. 75# is not a kbuild sub-make subdirectory.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o
16 17
17obj-$(CONFIG_SMP) += smp.o 18obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 235c0f4d3861..50542efe45fb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
75enum xen_domain_type xen_domain_type = XEN_NATIVE; 75enum xen_domain_type xen_domain_type = XEN_NATIVE;
76EXPORT_SYMBOL_GPL(xen_domain_type); 76EXPORT_SYMBOL_GPL(xen_domain_type);
77 77
78unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
79EXPORT_SYMBOL(machine_to_phys_mapping);
80unsigned int machine_to_phys_order;
81EXPORT_SYMBOL(machine_to_phys_order);
82
78struct start_info *xen_start_info; 83struct start_info *xen_start_info;
79EXPORT_SYMBOL_GPL(xen_start_info); 84EXPORT_SYMBOL_GPL(xen_start_info);
80 85
@@ -569,8 +574,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
569 574
570 preempt_disable(); 575 preempt_disable();
571 576
572 start = __get_cpu_var(idt_desc).address; 577 start = __this_cpu_read(idt_desc.address);
573 end = start + __get_cpu_var(idt_desc).size + 1; 578 end = start + __this_cpu_read(idt_desc.size) + 1;
574 579
575 xen_mc_flush(); 580 xen_mc_flush();
576 581
@@ -1016,10 +1021,6 @@ static void xen_reboot(int reason)
1016{ 1021{
1017 struct sched_shutdown r = { .reason = reason }; 1022 struct sched_shutdown r = { .reason = reason };
1018 1023
1019#ifdef CONFIG_SMP
1020 stop_other_cpus();
1021#endif
1022
1023 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) 1024 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1024 BUG(); 1025 BUG();
1025} 1026}
@@ -1090,6 +1091,8 @@ static void __init xen_setup_stackprotector(void)
1090/* First C function to be called on Xen boot */ 1091/* First C function to be called on Xen boot */
1091asmlinkage void __init xen_start_kernel(void) 1092asmlinkage void __init xen_start_kernel(void)
1092{ 1093{
1094 struct physdev_set_iopl set_iopl;
1095 int rc;
1093 pgd_t *pgd; 1096 pgd_t *pgd;
1094 1097
1095 if (!xen_start_info) 1098 if (!xen_start_info)
@@ -1097,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void)
1097 1100
1098 xen_domain_type = XEN_PV_DOMAIN; 1101 xen_domain_type = XEN_PV_DOMAIN;
1099 1102
1103 xen_setup_machphys_mapping();
1104
1100 /* Install Xen paravirt ops */ 1105 /* Install Xen paravirt ops */
1101 pv_info = xen_info; 1106 pv_info = xen_info;
1102 pv_init_ops = xen_init_ops; 1107 pv_init_ops = xen_init_ops;
@@ -1169,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void)
1169 1174
1170 xen_smp_init(); 1175 xen_smp_init();
1171 1176
1177#ifdef CONFIG_ACPI_NUMA
1178 /*
 1179 * The pages we get from Xen are not related to machine pages, so
1180 * any NUMA information the kernel tries to get from ACPI will
1181 * be meaningless. Prevent it from trying.
1182 */
1183 acpi_numa = -1;
1184#endif
1185
1172 pgd = (pgd_t *)xen_start_info->pt_base; 1186 pgd = (pgd_t *)xen_start_info->pt_base;
1173 1187
1174 if (!xen_initial_domain()) 1188 if (!xen_initial_domain())
@@ -1180,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void)
1180 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1194 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1181 1195
1182 local_irq_disable(); 1196 local_irq_disable();
1183 early_boot_irqs_off(); 1197 early_boot_irqs_disabled = true;
1184 1198
1185 memblock_init(); 1199 memblock_init();
1186 1200
@@ -1191,8 +1205,6 @@ asmlinkage void __init xen_start_kernel(void)
1191 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1205 /* Allocate and initialize top and mid mfn levels for p2m structure */
1192 xen_build_mfn_list_list(); 1206 xen_build_mfn_list_list();
1193 1207
1194 init_mm.pgd = pgd;
1195
1196 /* keep using Xen gdt for now; no urgent need to change it */ 1208 /* keep using Xen gdt for now; no urgent need to change it */
1197 1209
1198#ifdef CONFIG_X86_32 1210#ifdef CONFIG_X86_32
@@ -1202,10 +1214,18 @@ asmlinkage void __init xen_start_kernel(void)
1202#else 1214#else
1203 pv_info.kernel_rpl = 0; 1215 pv_info.kernel_rpl = 0;
1204#endif 1216#endif
1205
1206 /* set the limit of our address space */ 1217 /* set the limit of our address space */
1207 xen_reserve_top(); 1218 xen_reserve_top();
1208 1219
1220 /* We used to do this in xen_arch_setup, but that is too late on AMD
 1221 * where early_cpu_init (run before ->arch_setup()) calls early_amd_init,
 1222 * which pokes the 0xcf8 port.
1223 */
1224 set_iopl.iopl = 1;
1225 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1226 if (rc != 0)
1227 xen_raw_printk("physdev_op failed %d\n", rc);
1228
1209#ifdef CONFIG_X86_32 1229#ifdef CONFIG_X86_32
1210 /* set up basic CPUID stuff */ 1230 /* set up basic CPUID stuff */
1211 cpu_detect(&new_cpu_data); 1231 cpu_detect(&new_cpu_data);
@@ -1245,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void)
1245#endif 1265#endif
1246} 1266}
1247 1267
1248static uint32_t xen_cpuid_base(void)
1249{
1250 uint32_t base, eax, ebx, ecx, edx;
1251 char signature[13];
1252
1253 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1254 cpuid(base, &eax, &ebx, &ecx, &edx);
1255 *(uint32_t *)(signature + 0) = ebx;
1256 *(uint32_t *)(signature + 4) = ecx;
1257 *(uint32_t *)(signature + 8) = edx;
1258 signature[12] = 0;
1259
1260 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
1261 return base;
1262 }
1263
1264 return 0;
1265}
1266
1267static int init_hvm_pv_info(int *major, int *minor) 1268static int init_hvm_pv_info(int *major, int *minor)
1268{ 1269{
1269 uint32_t eax, ebx, ecx, edx, pages, msr, base; 1270 uint32_t eax, ebx, ecx, edx, pages, msr, base;
@@ -1373,6 +1374,18 @@ static bool __init xen_hvm_platform(void)
1373 return true; 1374 return true;
1374} 1375}
1375 1376
1377bool xen_hvm_need_lapic(void)
1378{
1379 if (xen_pv_domain())
1380 return false;
1381 if (!xen_hvm_domain())
1382 return false;
1383 if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
1384 return false;
1385 return true;
1386}
1387EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
1388
1376const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { 1389const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
1377 .name = "Xen HVM", 1390 .name = "Xen HVM",
1378 .detect = xen_hvm_platform, 1391 .detect = xen_hvm_platform,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4a..6a6fe8939645 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
126#endif 126#endif
127}; 127};
128 128
129void __init xen_init_irq_ops() 129void __init xen_init_irq_ops(void)
130{ 130{
131 pv_irq_ops = xen_irq_ops; 131 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ; 132 x86_init.irqs.intr_init = xen_init_IRQ;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c237b810b03f..5e92b61ad574 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
173 */ 173 */
174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
204
205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
208
209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
210
211/* Placeholders for holes in the address space */
212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
215
216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
219
220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
222
223static inline unsigned p2m_top_index(unsigned long pfn)
224{
225 BUG_ON(pfn >= MAX_P2M_PFN);
226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
232}
233
234static inline unsigned p2m_index(unsigned long pfn)
235{
236 return pfn % P2M_PER_PAGE;
237}
238
239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
261}
262
263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
 295 * tree should already be completely allocated.
296 */
297void xen_build_mfn_list_list(void)
298{
299 unsigned long pfn;
300
301 /* Pre-initialize p2m_top_mfn to be completely missing */
302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
308
309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
314 }
315
316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
317 unsigned topidx = p2m_top_index(pfn);
318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326  * they're just missing; just update the stored mfn,
327  * since all could have changed over a migration.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * runtime. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
352 }
353}
354
355void xen_setup_mfn_list_list(void)
356{
357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
358
359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
360 virt_to_mfn(p2m_top_mfn);
361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
362}
363
364/* Set up p2m_top to point to the domain-builder provided p2m pages */
365void __init xen_build_dynamic_phys_to_machine(void)
366{
367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
372
373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
381
382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
390
391 if (p2m_top[topidx] == p2m_mid_missing) {
392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
394
395 p2m_top[topidx] = mid;
396 }
397
398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
400}
401
402unsigned long get_phys_to_machine(unsigned long pfn)
403{
404 unsigned topidx, mididx, idx;
405
406 if (unlikely(pfn >= MAX_P2M_PFN))
407 return INVALID_P2M_ENTRY;
408
409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
411 idx = p2m_index(pfn);
412
413 return p2m_top[topidx][mididx][idx];
414}
415EXPORT_SYMBOL_GPL(get_phys_to_machine);
416
417static void *alloc_p2m_page(void)
418{
419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
420}
421
422static void free_p2m_page(void *p)
423{
424 free_page((unsigned long)p);
425}
426
427/*
428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
439
440 topidx = p2m_top_index(pfn);
441 mididx = p2m_mid_index(pfn);
442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
456 }
457
458 top_mfn_p = &p2m_top_mfn[topidx];
459 mid_mfn = p2m_top_mfn_p[topidx];
460
461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
462
463 if (mid_mfn == p2m_mid_missing_mfn) {
464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
467
468 mid_mfn = alloc_p2m_page();
469 if (!mid_mfn)
470 return false;
471
472 p2m_mid_mfn_init(mid_mfn);
473
474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
499}
500
501/* Try to install p2m mapping; fail if intermediate bits missing */
502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
503{
504 unsigned topidx, mididx, idx;
505
506 if (unlikely(pfn >= MAX_P2M_PFN)) {
507 BUG_ON(mfn != INVALID_P2M_ENTRY);
508 return true;
509 }
510
511 topidx = p2m_top_index(pfn);
512 mididx = p2m_mid_index(pfn);
513 idx = p2m_index(pfn);
514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
519
520 return true;
521}
522
523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
524{
525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
527 return true;
528 }
529
530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
531 if (!alloc_p2m(pfn))
532 return false;
533
534 if (!__set_phys_to_machine(pfn, mfn))
535 return false;
536 }
537
538 return true;
539}
540
541unsigned long arbitrary_virt_to_mfn(void *vaddr) 176unsigned long arbitrary_virt_to_mfn(void *vaddr)
542{ 177{
543 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 178 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
566 offset = address & ~PAGE_MASK; 201 offset = address & ~PAGE_MASK;
567 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); 202 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
568} 203}
204EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
569 205
570void make_lowmem_page_readonly(void *vaddr) 206void make_lowmem_page_readonly(void *vaddr)
571{ 207{
@@ -2034,6 +1670,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
2034 set_page_prot(pmd, PAGE_KERNEL_RO); 1670 set_page_prot(pmd, PAGE_KERNEL_RO);
2035} 1671}
2036 1672
1673void __init xen_setup_machphys_mapping(void)
1674{
1675 struct xen_machphys_mapping mapping;
1676 unsigned long machine_to_phys_nr_ents;
1677
1678 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1679 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1680 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1681 } else {
1682 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1683 }
1684 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1685}
1686
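As a quick check of the fls() computation just above (hypothetical numbers, not taken from the patch): if the hypervisor reports mapping.max_mfn = 0xfffff, then machine_to_phys_nr_ents = 0x100000 and fls(0x100000 - 1) = fls(0xfffff) = 20, so machine_to_phys_order becomes 20 and the machine-to-phys table is treated as covering 2^20 machine frames.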
2037#ifdef CONFIG_X86_64 1687#ifdef CONFIG_X86_64
2038static void convert_pfn_mfn(void *v) 1688static void convert_pfn_mfn(void *v)
2039{ 1689{
@@ -2119,44 +1769,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
2119 return pgd; 1769 return pgd;
2120} 1770}
2121#else /* !CONFIG_X86_64 */ 1771#else /* !CONFIG_X86_64 */
2122static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); 1772static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1773static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1774
1775static __init void xen_write_cr3_init(unsigned long cr3)
1776{
1777 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1778
1779 BUG_ON(read_cr3() != __pa(initial_page_table));
1780 BUG_ON(cr3 != __pa(swapper_pg_dir));
1781
1782 /*
1783 * We are switching to swapper_pg_dir for the first time (from
1784 * initial_page_table) and therefore need to mark that page
1785 * read-only and then pin it.
1786 *
1787 * Xen disallows sharing of kernel PMDs for PAE
1788 * guests. Therefore we must copy the kernel PMD from
1789 * initial_page_table into a new kernel PMD to be used in
1790 * swapper_pg_dir.
1791 */
1792 swapper_kernel_pmd =
1793 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1794 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1795 sizeof(pmd_t) * PTRS_PER_PMD);
1796 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1797 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1798 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1799
1800 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1801 xen_write_cr3(cr3);
1802 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1803
1804 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1805 PFN_DOWN(__pa(initial_page_table)));
1806 set_page_prot(initial_page_table, PAGE_KERNEL);
1807 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1808
1809 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1810}
2123 1811
2124__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1812__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
2125 unsigned long max_pfn) 1813 unsigned long max_pfn)
2126{ 1814{
2127 pmd_t *kernel_pmd; 1815 pmd_t *kernel_pmd;
2128 1816
2129 level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); 1817 initial_kernel_pmd =
1818 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2130 1819
2131 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1820 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
2132 xen_start_info->nr_pt_frames * PAGE_SIZE + 1821 xen_start_info->nr_pt_frames * PAGE_SIZE +
2133 512*1024); 1822 512*1024);
2134 1823
2135 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1824 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2136 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1825 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
2137 1826
2138 xen_map_identity_early(level2_kernel_pgt, max_pfn); 1827 xen_map_identity_early(initial_kernel_pmd, max_pfn);
2139 1828
2140 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); 1829 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
2141 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], 1830 initial_page_table[KERNEL_PGD_BOUNDARY] =
2142 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); 1831 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2143 1832
2144 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1833 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2145 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 1834 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2146 set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 1835 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2147 1836
2148 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1837 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2149 1838
2150 xen_write_cr3(__pa(swapper_pg_dir)); 1839 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2151 1840 PFN_DOWN(__pa(initial_page_table)));
2152 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); 1841 xen_write_cr3(__pa(initial_page_table));
2153 1842
2154 memblock_x86_reserve_range(__pa(xen_start_info->pt_base), 1843 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
2155 __pa(xen_start_info->pt_base + 1844 __pa(xen_start_info->pt_base +
2156 xen_start_info->nr_pt_frames * PAGE_SIZE), 1845 xen_start_info->nr_pt_frames * PAGE_SIZE),
2157 "XEN PAGETABLES"); 1846 "XEN PAGETABLES");
2158 1847
2159 return swapper_pg_dir; 1848 return initial_page_table;
2160} 1849}
2161#endif /* CONFIG_X86_64 */ 1850#endif /* CONFIG_X86_64 */
2162 1851
@@ -2290,7 +1979,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2290 .write_cr2 = xen_write_cr2, 1979 .write_cr2 = xen_write_cr2,
2291 1980
2292 .read_cr3 = xen_read_cr3, 1981 .read_cr3 = xen_read_cr3,
1982#ifdef CONFIG_X86_32
1983 .write_cr3 = xen_write_cr3_init,
1984#else
2293 .write_cr3 = xen_write_cr3, 1985 .write_cr3 = xen_write_cr3,
1986#endif
2294 1987
2295 .flush_tlb_user = xen_flush_tlb, 1988 .flush_tlb_user = xen_flush_tlb,
2296 .flush_tlb_kernel = xen_flush_tlb, 1989 .flush_tlb_kernel = xen_flush_tlb,
@@ -2358,8 +2051,6 @@ void __init xen_init_mmu_ops(void)
2358 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2051 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2359 pv_mmu_ops = xen_mmu_ops; 2052 pv_mmu_ops = xen_mmu_ops;
2360 2053
2361 vmap_lazy_unmap = false;
2362
2363 memset(dummy_mapping, 0xff, PAGE_SIZE); 2054 memset(dummy_mapping, 0xff, PAGE_SIZE);
2364} 2055}
2365 2056
@@ -2627,7 +2318,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2627 2318
2628 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); 2319 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2629 2320
2630 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 2321 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2322 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2631 2323
2632 rmd.mfn = mfn; 2324 rmd.mfn = mfn;
2633 rmd.prot = prot; 2325 rmd.prot = prot;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 9e565da5d1f7..4ec8035e3216 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void)
22 unsigned long flags; 22 unsigned long flags;
23 /* need to disable interrupts until this entry is complete */ 23 /* need to disable interrupts until this entry is complete */
24 local_irq_save(flags); 24 local_irq_save(flags);
25 __get_cpu_var(xen_mc_irq_flags) = flags; 25 __this_cpu_write(xen_mc_irq_flags, flags);
26} 26}
27 27
28static inline struct multicall_space xen_mc_entry(size_t args) 28static inline struct multicall_space xen_mc_entry(size_t args)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..fd12d7ce7ff9
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,522 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as an mfn is always an
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
25 * 512 and 1024 entries respectively.
26 */
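As a standalone illustration of the sizing described in the comment above (a hypothetical userspace sketch, not part of this patch), the per-level fanout and the resulting MAX_P2M_PFN work out as follows on a typical 64-bit build:

/*
 * Hypothetical userspace sketch, not part of the patch: with 4 KiB
 * pages and 8-byte entries each p2m level holds 512 slots, so the
 * three-level tree can name 512 * 512 * 512 = 134,217,728 pfns,
 * i.e. 512 GiB of pseudo-physical address space.
 */
#include <stdio.h>

int main(void)
{
	unsigned long per_page = 4096 / sizeof(unsigned long);	/* 512 on 64-bit */
	unsigned long max_p2m_pfn = per_page * per_page * per_page;

	printf("P2M_PER_PAGE=%lu MAX_P2M_PFN=%lu (%lu GiB)\n",
	       per_page, max_p2m_pfn, max_p2m_pfn >> (30 - 12));
	return 0;
}

On 32-bit the same arithmetic with 4-byte entries gives 1024 slots per level and 1024^3 representable pfns.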
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/list.h>
31#include <linux/hash.h>
32#include <linux/sched.h>
33
34#include <asm/cache.h>
35#include <asm/setup.h>
36
37#include <asm/xen/page.h>
38#include <asm/xen/hypercall.h>
39#include <asm/xen/hypervisor.h>
40
41#include "xen-ops.h"
42
43static void __init m2p_override_init(void);
44
45unsigned long xen_max_p2m_pfn __read_mostly;
46
47#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
48#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
49#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
50
51#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
52
53/* Placeholders for holes in the address space */
54static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
55static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
56static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
57
58static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64
65static inline unsigned p2m_top_index(unsigned long pfn)
66{
67 BUG_ON(pfn >= MAX_P2M_PFN);
68 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
69}
70
71static inline unsigned p2m_mid_index(unsigned long pfn)
72{
73 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
74}
75
76static inline unsigned p2m_index(unsigned long pfn)
77{
78 return pfn % P2M_PER_PAGE;
79}
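To make the split performed by these three helpers concrete, here is a hypothetical, standalone check of the same index arithmetic (assuming the 64-bit fanout of 512 entries per level; not part of the patch): pfn 74565 lands at topidx 0, mididx 145, idx 325.

#include <assert.h>

int main(void)
{
	const unsigned long per_page = 512, mid_per_page = 512;
	unsigned long pfn = 74565;	/* arbitrary example pfn */
	unsigned topidx = pfn / (mid_per_page * per_page);	/* 0 */
	unsigned mididx = (pfn / per_page) % mid_per_page;	/* 145 */
	unsigned idx = pfn % per_page;				/* 325 */

	/* Recombining the three indices must reproduce the pfn. */
	assert((topidx * mid_per_page + mididx) * per_page + idx == pfn);
	return 0;
}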
80
81static void p2m_top_init(unsigned long ***top)
82{
83 unsigned i;
84
85 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
86 top[i] = p2m_mid_missing;
87}
88
89static void p2m_top_mfn_init(unsigned long *top)
90{
91 unsigned i;
92
93 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
94 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
95}
96
97static void p2m_top_mfn_p_init(unsigned long **top)
98{
99 unsigned i;
100
101 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
102 top[i] = p2m_mid_missing_mfn;
103}
104
105static void p2m_mid_init(unsigned long **mid)
106{
107 unsigned i;
108
109 for (i = 0; i < P2M_MID_PER_PAGE; i++)
110 mid[i] = p2m_missing;
111}
112
113static void p2m_mid_mfn_init(unsigned long *mid)
114{
115 unsigned i;
116
117 for (i = 0; i < P2M_MID_PER_PAGE; i++)
118 mid[i] = virt_to_mfn(p2m_missing);
119}
120
121static void p2m_init(unsigned long *p2m)
122{
123 unsigned i;
124
125 for (i = 0; i < P2M_MID_PER_PAGE; i++)
126 p2m[i] = INVALID_P2M_ENTRY;
127}
128
129/*
130 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
131 *
132 * This is called both at boot time, and after resuming from suspend:
133 * - At boot time we're called very early, and must use extend_brk()
134 * to allocate memory.
135 *
136 * - After resume we're called from within stop_machine, but the mfn
137 * tree should already be completely allocated.
138 */
139void xen_build_mfn_list_list(void)
140{
141 unsigned long pfn;
142
143 /* Pre-initialize p2m_top_mfn to be completely missing */
144 if (p2m_top_mfn == NULL) {
145 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
146 p2m_mid_mfn_init(p2m_mid_missing_mfn);
147
148 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
149 p2m_top_mfn_p_init(p2m_top_mfn_p);
150
151 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
152 p2m_top_mfn_init(p2m_top_mfn);
153 } else {
154  /* Reinitialise: mfns all change after migration */
155 p2m_mid_mfn_init(p2m_mid_missing_mfn);
156 }
157
158 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
159 unsigned topidx = p2m_top_index(pfn);
160 unsigned mididx = p2m_mid_index(pfn);
161 unsigned long **mid;
162 unsigned long *mid_mfn_p;
163
164 mid = p2m_top[topidx];
165 mid_mfn_p = p2m_top_mfn_p[topidx];
166
167 /* Don't bother allocating any mfn mid levels if
168  * they're just missing; just update the stored mfn,
169  * since all could have changed over a migration.
170 */
171 if (mid == p2m_mid_missing) {
172 BUG_ON(mididx);
173 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
174 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
175 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
176 continue;
177 }
178
179 if (mid_mfn_p == p2m_mid_missing_mfn) {
180 /*
181 * XXX boot-time only! We should never find
182 * missing parts of the mfn tree after
183 * runtime. extend_brk() will BUG if we call
184 * it too late.
185 */
186 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
187 p2m_mid_mfn_init(mid_mfn_p);
188
189 p2m_top_mfn_p[topidx] = mid_mfn_p;
190 }
191
192 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
193 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
194 }
195}
196
197void xen_setup_mfn_list_list(void)
198{
199 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
200
201 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
202 virt_to_mfn(p2m_top_mfn);
203 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
204}
205
206/* Set up p2m_top to point to the domain-builder provided p2m pages */
207void __init xen_build_dynamic_phys_to_machine(void)
208{
209 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
210 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
211 unsigned long pfn;
212
213 xen_max_p2m_pfn = max_pfn;
214
215 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
216 p2m_init(p2m_missing);
217
218 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
219 p2m_mid_init(p2m_mid_missing);
220
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top);
223
224 /*
225 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just
227 * need to graft that into our tree structure.
228 */
229 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
230 unsigned topidx = p2m_top_index(pfn);
231 unsigned mididx = p2m_mid_index(pfn);
232
233 if (p2m_top[topidx] == p2m_mid_missing) {
234 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
235 p2m_mid_init(mid);
236
237 p2m_top[topidx] = mid;
238 }
239
240 /*
241 * As long as the mfn_list has enough entries to completely
242 * fill a p2m page, pointing into the array is ok. But if
243  * not, the entries beyond the last pfn will be undefined.
244 */
245 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
246 unsigned long p2midx;
247
248 p2midx = max_pfn % P2M_PER_PAGE;
249 for ( ; p2midx < P2M_PER_PAGE; p2midx++)
250 mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
251 }
252 p2m_top[topidx][mididx] = &mfn_list[pfn];
253 }
254
255 m2p_override_init();
256}
257
258unsigned long get_phys_to_machine(unsigned long pfn)
259{
260 unsigned topidx, mididx, idx;
261
262 if (unlikely(pfn >= MAX_P2M_PFN))
263 return INVALID_P2M_ENTRY;
264
265 topidx = p2m_top_index(pfn);
266 mididx = p2m_mid_index(pfn);
267 idx = p2m_index(pfn);
268
269 return p2m_top[topidx][mididx][idx];
270}
271EXPORT_SYMBOL_GPL(get_phys_to_machine);
272
273static void *alloc_p2m_page(void)
274{
275 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
276}
277
278static void free_p2m_page(void *p)
279{
280 free_page((unsigned long)p);
281}
282
283/*
284 * Fully allocate the p2m structure for a given pfn. We need to check
285 * that both the top and mid levels are allocated, and make sure the
286 * parallel mfn tree is kept in sync. We may race with other cpus, so
287 * the new pages are installed with cmpxchg; if we lose the race then
288 * simply free the page we allocated and use the one that's there.
289 */
290static bool alloc_p2m(unsigned long pfn)
291{
292 unsigned topidx, mididx;
293 unsigned long ***top_p, **mid;
294 unsigned long *top_mfn_p, *mid_mfn;
295
296 topidx = p2m_top_index(pfn);
297 mididx = p2m_mid_index(pfn);
298
299 top_p = &p2m_top[topidx];
300 mid = *top_p;
301
302 if (mid == p2m_mid_missing) {
303 /* Mid level is missing, allocate a new one */
304 mid = alloc_p2m_page();
305 if (!mid)
306 return false;
307
308 p2m_mid_init(mid);
309
310 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
311 free_p2m_page(mid);
312 }
313
314 top_mfn_p = &p2m_top_mfn[topidx];
315 mid_mfn = p2m_top_mfn_p[topidx];
316
317 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
318
319 if (mid_mfn == p2m_mid_missing_mfn) {
320 /* Separately check the mid mfn level */
321 unsigned long missing_mfn;
322 unsigned long mid_mfn_mfn;
323
324 mid_mfn = alloc_p2m_page();
325 if (!mid_mfn)
326 return false;
327
328 p2m_mid_mfn_init(mid_mfn);
329
330 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
331 mid_mfn_mfn = virt_to_mfn(mid_mfn);
332 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
333 free_p2m_page(mid_mfn);
334 else
335 p2m_top_mfn_p[topidx] = mid_mfn;
336 }
337
338 if (p2m_top[topidx][mididx] == p2m_missing) {
339 /* p2m leaf page is missing */
340 unsigned long *p2m;
341
342 p2m = alloc_p2m_page();
343 if (!p2m)
344 return false;
345
346 p2m_init(p2m);
347
348 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
349 free_p2m_page(p2m);
350 else
351 mid_mfn[mididx] = virt_to_mfn(p2m);
352 }
353
354 return true;
355}
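The allocate/cmpxchg/free-on-loss idiom that alloc_p2m() applies at each level can be seen in isolation in this hypothetical userspace sketch (using the GCC __sync builtin in place of the kernel's cmpxchg(); not part of the patch): the loser of the race simply frees its page and continues with the winner's.

#include <stdlib.h>

static long *shared_slot;	/* NULL plays the role of the "missing" placeholder */

/* Install a freshly allocated page, or adopt the one a racing thread installed. */
static long *install_page(void)
{
	long *mine = calloc(512, sizeof(*mine));

	if (!mine)
		return NULL;

	/* Atomically fill the slot only if it is still empty. */
	if (!__sync_bool_compare_and_swap(&shared_slot, NULL, mine)) {
		free(mine);		/* lost the race; use the winner's page */
		mine = shared_slot;
	}
	return mine;
}

int main(void)
{
	return install_page() ? 0 : 1;
}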
356
357/* Try to install p2m mapping; fail if intermediate bits missing */
358bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
359{
360 unsigned topidx, mididx, idx;
361
362 if (unlikely(pfn >= MAX_P2M_PFN)) {
363 BUG_ON(mfn != INVALID_P2M_ENTRY);
364 return true;
365 }
366
367 topidx = p2m_top_index(pfn);
368 mididx = p2m_mid_index(pfn);
369 idx = p2m_index(pfn);
370
371 if (p2m_top[topidx][mididx] == p2m_missing)
372 return mfn == INVALID_P2M_ENTRY;
373
374 p2m_top[topidx][mididx][idx] = mfn;
375
376 return true;
377}
378
379bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
380{
381 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
382 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
383 return true;
384 }
385
386 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
387 if (!alloc_p2m(pfn))
388 return false;
389
390 if (!__set_phys_to_machine(pfn, mfn))
391 return false;
392 }
393
394 return true;
395}
396
397#define M2P_OVERRIDE_HASH_SHIFT 10
398#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
399
400static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
401static DEFINE_SPINLOCK(m2p_override_lock);
402
403static void __init m2p_override_init(void)
404{
405 unsigned i;
406
407 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
408 sizeof(unsigned long));
409
410 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
411 INIT_LIST_HEAD(&m2p_overrides[i]);
412}
413
414static unsigned long mfn_hash(unsigned long mfn)
415{
416 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
417}
418
419/* Add an MFN override for a particular page */
420int m2p_add_override(unsigned long mfn, struct page *page)
421{
422 unsigned long flags;
423 unsigned long pfn;
424 unsigned long address;
425 unsigned level;
426 pte_t *ptep = NULL;
427
428 pfn = page_to_pfn(page);
429 if (!PageHighMem(page)) {
430 address = (unsigned long)__va(pfn << PAGE_SHIFT);
431 ptep = lookup_address(address, &level);
432
433 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
434 "m2p_add_override: pfn %lx not mapped", pfn))
435 return -EINVAL;
436 }
437
438 page->private = mfn;
439 page->index = pfn_to_mfn(pfn);
440
441 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
442 if (!PageHighMem(page))
443 /* Just zap old mapping for now */
444 pte_clear(&init_mm, address, ptep);
445
446 spin_lock_irqsave(&m2p_override_lock, flags);
447 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
448 spin_unlock_irqrestore(&m2p_override_lock, flags);
449
450 return 0;
451}
452
453int m2p_remove_override(struct page *page)
454{
455 unsigned long flags;
456 unsigned long mfn;
457 unsigned long pfn;
458 unsigned long address;
459 unsigned level;
460 pte_t *ptep = NULL;
461
462 pfn = page_to_pfn(page);
463 mfn = get_phys_to_machine(pfn);
464 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
465 return -EINVAL;
466
467 if (!PageHighMem(page)) {
468 address = (unsigned long)__va(pfn << PAGE_SHIFT);
469 ptep = lookup_address(address, &level);
470
471 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
472 "m2p_remove_override: pfn %lx not mapped", pfn))
473 return -EINVAL;
474 }
475
476 spin_lock_irqsave(&m2p_override_lock, flags);
477 list_del(&page->lru);
478 spin_unlock_irqrestore(&m2p_override_lock, flags);
479 __set_phys_to_machine(pfn, page->index);
480
481 if (!PageHighMem(page))
482 set_pte_at(&init_mm, address, ptep,
483 pfn_pte(pfn, PAGE_KERNEL));
484 /* No tlb flush necessary because the caller already
485 * left the pte unmapped. */
486
487 return 0;
488}
489
490struct page *m2p_find_override(unsigned long mfn)
491{
492 unsigned long flags;
493 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
494 struct page *p, *ret;
495
496 ret = NULL;
497
498 spin_lock_irqsave(&m2p_override_lock, flags);
499
500 list_for_each_entry(p, bucket, lru) {
501 if (p->private == mfn) {
502 ret = p;
503 break;
504 }
505 }
506
507 spin_unlock_irqrestore(&m2p_override_lock, flags);
508
509 return ret;
510}
511
512unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
513{
514 struct page *p = m2p_find_override(mfn);
515 unsigned long ret = pfn;
516
517 if (p)
518 ret = page_to_pfn(p);
519
520 return ret;
521}
522EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
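A hedged sketch of how a caller might use the override API introduced above -- hypothetical, with the grant-mapping step that actually produces foreign_mfn elided; it is not taken from this patch:

/* Hypothetical caller: redirect m2p lookups for a foreign frame to a local page. */
static int track_foreign_frame(struct page *page, unsigned long foreign_mfn)
{
	int rc;

	rc = m2p_add_override(foreign_mfn, page);
	if (rc)
		return rc;

	/*
	 * While the override is in place, m2p_find_override_pfn(foreign_mfn, pfn)
	 * resolves to page_to_pfn(page) rather than the unrelated pfn the
	 * hypervisor's m2p table would suggest.
	 */

	return m2p_remove_override(page);
}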
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0f456386cce5..25c52f94a27c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void)
68 return 0; 68 return 0;
69} 69}
70 70
71void __init xen_unplug_emulated_devices(void) 71void xen_unplug_emulated_devices(void)
72{ 72{
73 int r; 73 int r;
74 74
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b1dbdaa23ecc..a8a66a50d446 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -23,7 +23,6 @@
23#include <xen/interface/callback.h> 23#include <xen/interface/callback.h>
24#include <xen/interface/memory.h> 24#include <xen/interface/memory.h>
25#include <xen/interface/physdev.h> 25#include <xen/interface/physdev.h>
26#include <xen/interface/memory.h>
27#include <xen/features.h> 26#include <xen/features.h>
28 27
29#include "xen-ops.h" 28#include "xen-ops.h"
@@ -118,16 +117,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
118 const struct e820map *e820) 117 const struct e820map *e820)
119{ 118{
120 phys_addr_t max_addr = PFN_PHYS(max_pfn); 119 phys_addr_t max_addr = PFN_PHYS(max_pfn);
121 phys_addr_t last_end = 0; 120 phys_addr_t last_end = ISA_END_ADDRESS;
122 unsigned long released = 0; 121 unsigned long released = 0;
123 int i; 122 int i;
124 123
124 /* Free any unused memory above the low 1Mbyte. */
125 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { 125 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
126 phys_addr_t end = e820->map[i].addr; 126 phys_addr_t end = e820->map[i].addr;
127 end = min(max_addr, end); 127 end = min(max_addr, end);
128 128
129 released += xen_release_chunk(last_end, end); 129 if (last_end < end)
130 last_end = e820->map[i].addr + e820->map[i].size; 130 released += xen_release_chunk(last_end, end);
131 last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
131 } 132 }
132 133
133 if (last_end < max_addr) 134 if (last_end < max_addr)
@@ -164,6 +165,7 @@ char * __init xen_memory_setup(void)
164 XENMEM_memory_map; 165 XENMEM_memory_map;
165 rc = HYPERVISOR_memory_op(op, &memmap); 166 rc = HYPERVISOR_memory_op(op, &memmap);
166 if (rc == -ENOSYS) { 167 if (rc == -ENOSYS) {
168 BUG_ON(xen_initial_domain());
167 memmap.nr_entries = 1; 169 memmap.nr_entries = 1;
168 map[0].addr = 0ULL; 170 map[0].addr = 0ULL;
169 map[0].size = mem_end; 171 map[0].size = mem_end;
@@ -177,36 +179,39 @@ char * __init xen_memory_setup(void)
177 e820.nr_map = 0; 179 e820.nr_map = 0;
178 xen_extra_mem_start = mem_end; 180 xen_extra_mem_start = mem_end;
179 for (i = 0; i < memmap.nr_entries; i++) { 181 for (i = 0; i < memmap.nr_entries; i++) {
180 unsigned long long end = map[i].addr + map[i].size; 182 unsigned long long end;
183
184 /* Guard against non-page aligned E820 entries. */
185 if (map[i].type == E820_RAM)
186 map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
181 187
182 if (map[i].type == E820_RAM) { 188 end = map[i].addr + map[i].size;
183 if (map[i].addr < mem_end && end > mem_end) { 189 if (map[i].type == E820_RAM && end > mem_end) {
184 /* Truncate region to max_mem. */ 190 /* RAM off the end - may be partially included */
185 u64 delta = end - mem_end; 191 u64 delta = min(map[i].size, end - mem_end);
186 192
187 map[i].size -= delta; 193 map[i].size -= delta;
188 extra_pages += PFN_DOWN(delta); 194 end -= delta;
189 195
190 end = mem_end; 196 extra_pages += PFN_DOWN(delta);
191 }
192 } 197 }
193 198
194 if (end > xen_extra_mem_start) 199 if (map[i].size > 0 && end > xen_extra_mem_start)
195 xen_extra_mem_start = end; 200 xen_extra_mem_start = end;
196 201
197 /* If region is non-RAM or below mem_end, add what remains */ 202 /* Add region if any remains */
198 if ((map[i].type != E820_RAM || map[i].addr < mem_end) && 203 if (map[i].size > 0)
199 map[i].size > 0)
200 e820_add_region(map[i].addr, map[i].size, map[i].type); 204 e820_add_region(map[i].addr, map[i].size, map[i].type);
201 } 205 }
202 206
203 /* 207 /*
204 * Even though this is normal, usable memory under Xen, reserve 208 * In domU, the ISA region is normal, usable memory, but we
205 * ISA memory anyway because too many things think they can poke 209 * reserve ISA memory anyway because too many things poke
206 * about in there. 210 * about in there.
207 * 211 *
208 * In a dom0 kernel, this region is identity mapped with the 212 * In Dom0, the host E820 information can leave gaps in the
209 * hardware ISA area, so it really is out of bounds. 213 * ISA range, which would cause us to release those pages. To
214 * avoid this, we unconditionally reserve them here.
210 */ 215 */
211 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, 216 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
212 E820_RESERVED); 217 E820_RESERVED);
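As a worked example of the page-alignment guard added earlier in this hunk (hypothetical numbers): a RAM entry with addr = 0x9f000 and size = 0x60c00 ends at 0xffc00, so (size + addr) % PAGE_SIZE = 0xc00 and the size is trimmed to 0x60000, making the region end at 0xff000, a page boundary.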
@@ -244,26 +249,11 @@ char * __init xen_memory_setup(void)
244 else 249 else
245 extra_pages = 0; 250 extra_pages = 0;
246 251
247 if (!xen_initial_domain()) 252 xen_add_extra_mem(extra_pages);
248 xen_add_extra_mem(extra_pages);
249 253
250 return "Xen"; 254 return "Xen";
251} 255}
252 256
253static void xen_idle(void)
254{
255 local_irq_disable();
256
257 if (need_resched())
258 local_irq_enable();
259 else {
260 current_thread_info()->status &= ~TS_POLLING;
261 smp_mb__after_clear_bit();
262 safe_halt();
263 current_thread_info()->status |= TS_POLLING;
264 }
265}
266
267/* 257/*
268 * Set the bit indicating "nosegneg" library variants should be used. 258 * Set the bit indicating "nosegneg" library variants should be used.
269 * We only need to bother in pure 32-bit mode; compat 32-bit processes 259 * We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -333,9 +323,6 @@ void __cpuinit xen_enable_syscall(void)
333 323
334void __init xen_arch_setup(void) 324void __init xen_arch_setup(void)
335{ 325{
336 struct physdev_set_iopl set_iopl;
337 int rc;
338
339 xen_panic_handler_init(); 326 xen_panic_handler_init();
340 327
341 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 328 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -352,11 +339,6 @@ void __init xen_arch_setup(void)
352 xen_enable_sysenter(); 339 xen_enable_sysenter();
353 xen_enable_syscall(); 340 xen_enable_syscall();
354 341
355 set_iopl.iopl = 1;
356 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
357 if (rc != 0)
358 printk(KERN_INFO "physdev_op failed %d\n", rc);
359
360#ifdef CONFIG_ACPI 342#ifdef CONFIG_ACPI
361 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 343 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
362 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 344 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
@@ -368,7 +350,12 @@ void __init xen_arch_setup(void)
368 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? 350 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
369 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); 351 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
370 352
371 pm_idle = xen_idle; 353 /* Set up idle, making sure it calls safe_halt() pvop */
354#ifdef CONFIG_X86_32
355 boot_cpu_data.hlt_works_ok = 1;
356#endif
357 pm_idle = default_idle;
358 boot_option_idle_override = IDLE_HALT;
372 359
373 fiddle_vdso(); 360 fiddle_vdso();
374} 361}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 23e061b9327b..cc9b1e182fcf 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
159{ 159{
160 struct xen_spinlock *prev; 160 struct xen_spinlock *prev;
161 161
162 prev = __get_cpu_var(lock_spinners); 162 prev = __this_cpu_read(lock_spinners);
163 __get_cpu_var(lock_spinners) = xl; 163 __this_cpu_write(lock_spinners, xl);
164 164
165 wmb(); /* set lock of interest before count */ 165 wmb(); /* set lock of interest before count */
166 166
@@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
179 asm(LOCK_PREFIX " decw %0" 179 asm(LOCK_PREFIX " decw %0"
180 : "+m" (xl->spinners) : : "memory"); 180 : "+m" (xl->spinners) : : "memory");
181 wmb(); /* decrement count before restoring lock */ 181 wmb(); /* decrement count before restoring lock */
182 __get_cpu_var(lock_spinners) = prev; 182 __this_cpu_write(lock_spinners, prev);
183} 183}
184 184
185static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) 185static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
186{ 186{
187 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 187 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
188 struct xen_spinlock *prev; 188 struct xen_spinlock *prev;
189 int irq = __get_cpu_var(lock_kicker_irq); 189 int irq = __this_cpu_read(lock_kicker_irq);
190 int ret; 190 int ret;
191 u64 start; 191 u64 start;
192 192
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d789d56877c..9bbd63a129b5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -31,6 +31,7 @@ void xen_hvm_post_suspend(int suspend_cancelled)
31 int cpu; 31 int cpu;
32 xen_hvm_init_shared_info(); 32 xen_hvm_init_shared_info();
33 xen_callback_vector(); 33 xen_callback_vector();
34 xen_unplug_emulated_devices();
34 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 35 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
35 for_each_online_cpu(cpu) { 36 for_each_online_cpu(cpu) {
36 xen_setup_runstate_info(cpu); 37 xen_setup_runstate_info(cpu);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b2bb5aa3b054..067759e3d6a5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -135,24 +135,24 @@ static void do_stolen_accounting(void)
135 135
136 /* Add the appropriate number of ticks of stolen time, 136 /* Add the appropriate number of ticks of stolen time,
137 including any left-overs from last time. */ 137 including any left-overs from last time. */
138 stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); 138 stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
139 139
140 if (stolen < 0) 140 if (stolen < 0)
141 stolen = 0; 141 stolen = 0;
142 142
143 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 143 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
144 __get_cpu_var(xen_residual_stolen) = stolen; 144 __this_cpu_write(xen_residual_stolen, stolen);
145 account_steal_ticks(ticks); 145 account_steal_ticks(ticks);
146 146
147 /* Add the appropriate number of ticks of blocked time, 147 /* Add the appropriate number of ticks of blocked time,
148 including any left-overs from last time. */ 148 including any left-overs from last time. */
149 blocked += __get_cpu_var(xen_residual_blocked); 149 blocked += __this_cpu_read(xen_residual_blocked);
150 150
151 if (blocked < 0) 151 if (blocked < 0)
152 blocked = 0; 152 blocked = 0;
153 153
154 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 154 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
155 __get_cpu_var(xen_residual_blocked) = blocked; 155 __this_cpu_write(xen_residual_blocked, blocked);
156 account_idle_ticks(ticks); 156 account_idle_ticks(ticks);
157} 157}
158 158
@@ -426,6 +426,8 @@ void xen_timer_resume(void)
426{ 426{
427 int cpu; 427 int cpu;
428 428
429 pvclock_resume();
430
429 if (xen_clockevent != &xen_vcpuop_clockevent) 431 if (xen_clockevent != &xen_vcpuop_clockevent)
430 return; 432 return;
431 433
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 64044747348e..9d41bf985757 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -43,7 +43,7 @@ void xen_vcpu_restore(void);
43 43
44void xen_callback_vector(void); 44void xen_callback_vector(void);
45void xen_hvm_init_shared_info(void); 45void xen_hvm_init_shared_info(void);
46void __init xen_unplug_emulated_devices(void); 46void xen_unplug_emulated_devices(void);
47 47
48void __init xen_build_dynamic_phys_to_machine(void); 48void __init xen_build_dynamic_phys_to_machine(void);
49 49